[PATCH] drm/amdgpu: enable support error injection broadcast to all instances
when the address is -1, TA will do error injection for all instances of the specail sram. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 885a78301bbf..c828ce9525d4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -402,8 +402,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * ret = amdgpu_ras_feature_enable(adev, , 1); break; case 2: - if ((data.inject.address >= adev->gmc.mc_vram_size) || - (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) { + if ((data.inject.address != (uint64_t)-1) && + ((data.inject.address >= adev->gmc.mc_vram_size) || + (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT))) { dev_warn(adev->dev, "RAS WARN: input address " "0x%llx is invalid.", data.inject.address); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdkfd: fix a resource leakage issue
The function kfd_lookup_process_by_pasid will increase the reference count of kfd_process object, its caller should call kfd_unref_process to decrease the reference count. Otherwise resource leakage will happen. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 3c9fe078334a..6cc6afb96a45 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -1100,4 +1100,6 @@ void kfd_signal_poison_consumed_event(struct kfd_dev *dev, u32 pasid) /* user application will handle SIGBUS signal */ send_sig(SIGBUS, p->lead_thread, 0); + + kfd_unref_process(p); } -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdkfd: refine the poison data consumption handling
The user applications maybe register the KFD_EVENT_TYPE_HW_EXCEPTION and KFD_EVENT_TYPE_MEMORY events, driver could notify them when poison data consumed. Beside that, some applications maybe register SIGBUS signal hander. These applications will handle poison data by themselves, exit or re-create context to re-dispatch works. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index ba2c2ce0c55a..4d210f23c33c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -1050,3 +1050,42 @@ void kfd_signal_reset_event(struct kfd_dev *dev) } srcu_read_unlock(_processes_srcu, idx); } + +void kfd_signal_poison_consumed_event(struct kfd_dev *dev, u32 pasid) +{ + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_hsa_memory_exception_data memory_exception_data; + struct kfd_hsa_hw_exception_data hw_exception_data; + struct kfd_event *ev; + uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID; + + if (!p) + return; /* Presumably process exited. */ + + memset(_exception_data, 0, sizeof(hw_exception_data)); + hw_exception_data.gpu_id = dev->id; + hw_exception_data.memory_lost = 1; + hw_exception_data.reset_cause = KFD_HW_EXCEPTION_ECC; + + memset(_exception_data, 0, sizeof(memory_exception_data)); + memory_exception_data.ErrorType = KFD_MEM_ERR_POISON_CONSUMED; + memory_exception_data.gpu_id = dev->id; + memory_exception_data.failure.imprecise = true; + + mutex_lock(>event_mutex); + idr_for_each_entry_continue(>event_idr, ev, id) { + if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) { + ev->hw_exception_data = hw_exception_data; + set_event(ev); + } + + if (ev->type == KFD_EVENT_TYPE_MEMORY) { + ev->memory_exception_data = memory_exception_data; + set_event(ev); + } + } + mutex_unlock(>event_mutex); + + /* user application will handle SIGBUS signal */ + send_sig(SIGBUS, p->lead_thread, 0); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 97c36e3c8c80..9f9b1dfb9c37 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -230,7 +230,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, sq_intr_err); if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) { - kfd_signal_hw_exception_event(pasid); + kfd_signal_poison_consumed_event(dev, pasid); amdgpu_amdkfd_gpu_reset(dev->kgd); return; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 64552f6b8ba4..daa9d47514c6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -1144,6 +1144,8 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid, void kfd_signal_reset_event(struct kfd_dev *dev); +void kfd_signal_poison_consumed_event(struct kfd_dev *dev, u32 pasid); + void kfd_flush_tlb(struct kfd_process_device *pdd); int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: add synchronization among waves in the same threadgroup
It is possible that the previous waves have exited before others are created, so the other waves maybe reuse pyhsical resouces left by previous ones. Therefore add barrier instruction to synchronize waves within the same threadgroup. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c index fdd65589f06b..dbad9ef002d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c @@ -93,98 +93,99 @@ static const struct soc15_reg_golden golden_settings_gc_9_4_2_alde[] = { static const u32 vgpr_init_compute_shader_aldebaran[] = { 0xb8840904, 0xb8851a04, 0xb8861344, 0xb8831804, 0x9208ff06, 0x0280, 0x9209a805, 0x920a8a04, 0x81080908, 0x81080a08, 0x81080308, 0x8e078208, - 0x81078407, 0xc0410080, 0x0007, 0xbf8c, 0xd3d94000, 0x1880, - 0xd3d94001, 0x1880, 0xd3d94002, 0x1880, 0xd3d94003, 0x1880, - 0xd3d94004, 0x1880, 0xd3d94005, 0x1880, 0xd3d94006, 0x1880, - 0xd3d94007, 0x1880, 0xd3d94008, 0x1880, 0xd3d94009, 0x1880, - 0xd3d9400a, 0x1880, 0xd3d9400b, 0x1880, 0xd3d9400c, 0x1880, - 0xd3d9400d, 0x1880, 0xd3d9400e, 0x1880, 0xd3d9400f, 0x1880, - 0xd3d94010, 0x1880, 0xd3d94011, 0x1880, 0xd3d94012, 0x1880, - 0xd3d94013, 0x1880, 0xd3d94014, 0x1880, 0xd3d94015, 0x1880, - 0xd3d94016, 0x1880, 0xd3d94017, 0x1880, 0xd3d94018, 0x1880, - 0xd3d94019, 0x1880, 0xd3d9401a, 0x1880, 0xd3d9401b, 0x1880, - 0xd3d9401c, 0x1880, 0xd3d9401d, 0x1880, 0xd3d9401e, 0x1880, - 0xd3d9401f, 0x1880, 0xd3d94020, 0x1880, 0xd3d94021, 0x1880, - 0xd3d94022, 0x1880, 0xd3d94023, 0x1880, 0xd3d94024, 0x1880, - 0xd3d94025, 0x1880, 0xd3d94026, 0x1880, 0xd3d94027, 0x1880, - 0xd3d94028, 0x1880, 0xd3d94029, 0x1880, 0xd3d9402a, 0x1880, - 0xd3d9402b, 0x1880, 0xd3d9402c, 0x1880, 0xd3d9402d, 0x1880, - 0xd3d9402e, 0x1880, 0xd3d9402f, 0x1880, 0xd3d94030, 0x1880, - 0xd3d94031, 0x1880, 0xd3d94032, 0x1880, 0xd3d94033, 0x1880, - 0xd3d94034, 0x1880, 0xd3d94035, 0x1880, 0xd3d94036, 0x1880, - 0xd3d94037, 0x1880, 0xd3d94038, 0x1880, 0xd3d94039, 0x1880, - 0xd3d9403a, 0x1880, 0xd3d9403b, 0x1880, 0xd3d9403c, 0x1880, - 0xd3d9403d, 0x1880, 0xd3d9403e, 0x1880, 0xd3d9403f, 0x1880, - 0xd3d94040, 0x1880, 0xd3d94041, 0x1880, 0xd3d94042, 0x1880, - 0xd3d94043, 0x1880, 0xd3d94044, 0x1880, 0xd3d94045, 0x1880, - 0xd3d94046, 0x1880, 0xd3d94047, 0x1880, 0xd3d94048, 0x1880, - 0xd3d94049, 0x1880, 0xd3d9404a, 0x1880, 0xd3d9404b, 0x1880, - 0xd3d9404c, 0x1880, 0xd3d9404d, 0x1880, 0xd3d9404e, 0x1880, - 0xd3d9404f, 0x1880, 0xd3d94050, 0x1880, 0xd3d94051, 0x1880, - 0xd3d94052, 0x1880, 0xd3d94053, 0x1880, 0xd3d94054, 0x1880, - 0xd3d94055, 0x1880, 0xd3d94056, 0x1880, 0xd3d94057, 0x1880, - 0xd3d94058, 0x1880, 0xd3d94059, 0x1880, 0xd3d9405a, 0x1880, - 0xd3d9405b, 0x1880, 0xd3d9405c, 0x1880, 0xd3d9405d, 0x1880, - 0xd3d9405e, 0x1880, 0xd3d9405f, 0x1880, 0xd3d94060, 0x1880, - 0xd3d94061, 0x1880, 0xd3d94062, 0x1880, 0xd3d94063, 0x1880, - 0xd3d94064, 0x1880, 0xd3d94065, 0x1880, 0xd3d94066, 0x1880, - 0xd3d94067, 0x1880, 0xd3d94068, 0x1880, 0xd3d94069, 0x1880, - 0xd3d9406a, 0x1880, 0xd3d9406b, 0x1880, 0xd3d9406c, 0x1880, - 0xd3d9406d, 0x1880, 0xd3d9406e, 0x1880, 0xd3d9406f, 0x1880, - 0xd3d94070, 0x1880, 0xd3d94071, 0x1880, 0xd3d94072, 0x1880, - 0xd3d94073, 0x1880, 0xd3d94074, 0x1880, 0xd3d94075, 0x1880, - 0xd3d94076, 0x1880, 0xd3d94077, 0x1880, 0xd3d94078, 0x1880, - 0xd3d94079, 0x1880, 0xd3d9407a, 0x1880, 0xd3d9407b, 0x1880, - 0xd3d9407c, 0x1880, 0xd3d9407d, 0x1880, 0xd3d9407e, 0x1880, - 0xd3d9407f, 0x1880, 0xd3d94080, 0x1880, 0xd3d94081, 0x1880, - 0xd3d94082, 0x1880, 0xd3d94083, 0x1880, 0xd3d94084, 0x1880, - 0xd3d94085, 0x1880, 0xd3d94086, 0x1880, 0xd3d94087, 0x1880, - 0xd3d94088, 0x1880, 0xd3d94089, 0x1880, 0xd3d9408a, 0x1880, - 0xd3d9408b, 0x1880, 0xd3d9408c, 0x1880, 0xd3d9408d, 0x1880, - 0xd3d9408e, 0x1880, 0xd3d9408f, 0x1880, 0xd3d94090, 0x1880, - 0xd3d94091, 0x1880, 0xd3d94092, 0x1880, 0xd3d94093, 0x1880, - 0xd3d94094, 0x1880, 0xd3d94095, 0x1880, 0xd3d94096, 0x1880, - 0xd3d94097, 0x1880, 0xd3d94098, 0x1880, 0xd3d94099, 0x1880, - 0xd3d9409a, 0x1880
[PATCH] drm/amdgpu: add function to clear MMEA error status for aldebaran
For aldebaran, hardware will not clear error status automatically when reading error status register, insteadly driver should set clear bit of the error status register explicitly to clear error status. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h index 11aa29933c1f..b27fcbccce2b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h @@ -28,6 +28,7 @@ struct amdgpu_mmhub_ras_funcs { void *ras_error_status); void (*query_ras_error_status)(struct amdgpu_device *adev); void (*reset_ras_error_count)(struct amdgpu_device *adev); + void (*reset_ras_error_status)(struct amdgpu_device *adev); }; struct amdgpu_mmhub_funcs { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 4eebb97994d6..a324dc2da101 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -938,6 +938,10 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, if (adev->mmhub.ras_funcs && adev->mmhub.ras_funcs->reset_ras_error_count) adev->mmhub.ras_funcs->reset_ras_error_count(adev); + + if (adev->mmhub.ras_funcs && + adev->mmhub.ras_funcs->reset_ras_error_status) + adev->mmhub.ras_funcs->reset_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__SDMA: if (adev->sdma.funcs->reset_ras_error_count) diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c index e1500be4a208..998e674f9369 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c @@ -1314,12 +1314,31 @@ static void mmhub_v1_7_query_ras_error_status(struct amdgpu_device *adev) } } +static void mmhub_v1_7_reset_ras_error_status(struct amdgpu_device *adev) +{ + int i; + uint32_t reg_value; + + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB)) + return; + + for (i = 0; i < ARRAY_SIZE(mmhub_v1_7_ea_err_status_regs); i++) { + reg_value = RREG32(SOC15_REG_ENTRY_OFFSET( + mmhub_v1_7_ea_err_status_regs[i])); + reg_value = REG_SET_FIELD(reg_value, MMEA0_ERR_STATUS, + CLEAR_ERROR_STATUS, 0x01); + WREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v1_7_ea_err_status_regs[i]), + reg_value); + } +} + const struct amdgpu_mmhub_ras_funcs mmhub_v1_7_ras_funcs = { .ras_late_init = amdgpu_mmhub_ras_late_init, .ras_fini = amdgpu_mmhub_ras_fini, .query_ras_error_count = mmhub_v1_7_query_ras_error_count, .reset_ras_error_count = mmhub_v1_7_reset_ras_error_count, .query_ras_error_status = mmhub_v1_7_query_ras_error_status, + .reset_ras_error_status = mmhub_v1_7_reset_ras_error_status, }; const struct amdgpu_mmhub_funcs mmhub_v1_7_funcs = { -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: correct the funtion to clear GCEA error status
The bit 11 of GCEA_ERR_STATUS register is used to clear GCEA error status. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c index e943cd2923ac..c63599686708 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c @@ -1674,13 +1674,16 @@ static void gfx_v9_4_2_reset_utc_err_status(struct amdgpu_device *adev) static void gfx_v9_4_2_reset_ea_err_status(struct amdgpu_device *adev) { uint32_t i, j; + uint32_t value; + + value = REG_SET_FIELD(0, GCEA_ERR_STATUS, CLEAR_ERROR_STATUS, 0x1); mutex_lock(>grbm_idx_mutex); for (i = 0; i < gfx_v9_4_2_ea_err_status_regs.se_num; i++) { for (j = 0; j < gfx_v9_4_2_ea_err_status_regs.instance; j++) { gfx_v9_4_2_select_se_sh(adev, i, 0, j); - WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_ea_err_status_regs), 0x10); + WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_ea_err_status_regs), value); } } gfx_v9_4_2_select_se_sh(adev, 0x, 0x, 0x); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: covert ras status to kernel errno
The original codes use ras status and kernl errno together in the same function, which is a wrong code style. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 17b728d2c1f2..231479b67b33 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1114,6 +1114,28 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id) return ret; } +static int psp_ras_status_to_errno(struct amdgpu_device *adev, +enum ta_ras_status ras_status) +{ + int ret = -EINVAL; + + switch (ras_status) { + case TA_RAS_STATUS__SUCCESS: + ret = 0; + break; + case TA_RAS_STATUS__RESET_NEEDED: + ret = -EAGAIN; + break; + case TA_RAS_STATUS__ERROR_RAS_NOT_AVAILABLE: + dev_warn(adev->dev, "RAS WARN: ras function unavailable\n"); + break; + default: + dev_err(adev->dev, "RAS ERROR: ras function failed ret 0x%X\n", ret); + } + + return ret; +} + int psp_ras_enable_features(struct psp_context *psp, union ta_ras_cmd_input *info, bool enable) { @@ -1137,7 +1159,7 @@ int psp_ras_enable_features(struct psp_context *psp, if (ret) return -EINVAL; - return ras_cmd->ras_status; + return psp_ras_status_to_errno(psp->adev, ras_cmd->ras_status); } static int psp_ras_terminate(struct psp_context *psp) @@ -1220,7 +1242,7 @@ int psp_ras_trigger_error(struct psp_context *psp, if (amdgpu_ras_intr_triggered()) return 0; - return ras_cmd->ras_status; + return psp_ras_status_to_errno(psp->adev, ras_cmd->ras_status); } // ras end diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ebbe2c5190c4..9b06cb58cff2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -586,29 +586,6 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, } /* obj end */ -static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev, -const char* invoke_type, -const char* block_name, -enum ta_ras_status ret) -{ - switch (ret) { - case TA_RAS_STATUS__SUCCESS: - return; - case TA_RAS_STATUS__ERROR_RAS_NOT_AVAILABLE: - dev_warn(adev->dev, - "RAS WARN: %s %s currently unavailable\n", - invoke_type, - block_name); - break; - default: - dev_err(adev->dev, - "RAS ERROR: %s %s error failed ret 0x%X\n", - invoke_type, - block_name, - ret); - } -} - /* feature ctl begin */ static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, struct ras_common_if *head) @@ -705,15 +682,10 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, if (!amdgpu_ras_intr_triggered()) { ret = psp_ras_enable_features(>psp, info, enable); if (ret) { - amdgpu_ras_parse_status_code(adev, -enable ? "enable":"disable", -ras_block_str(head->block), - (enum ta_ras_status)ret); - if (ret == TA_RAS_STATUS__RESET_NEEDED) - ret = -EAGAIN; - else - ret = -EINVAL; - + dev_err(adev->dev, "ras %s %s failed %d\n", + enable ? "enable":"disable", + ras_block_str(head->block), + ret); goto out; } } @@ -1058,10 +1030,9 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, ret = -EINVAL; } - amdgpu_ras_parse_status_code(adev, -"inject", -ras_block_str(info->head.block), -(enum ta_ras_status)ret); + if (ret) + dev_err(adev->dev, "ras inject %s failed %d\n", + ras_block_str(info->head.block), ret); return ret; } -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: update the shader to clear specific SGPRs
Add shader codes to explicitly clear specific SGPRs, such as flat_scratch_lo, flat_scratch_hi and so on. And also correct the allocation size of SGPRs in PGM_RSRC1. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c index 025b1e42e31b..8ad6717e67d2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c @@ -220,23 +220,24 @@ static const u32 sgpr112_init_compute_shader_aldebaran[] = { 0x9209a805, 0x920a8a04, 0x81080908, 0x81080a08, 0x81080308, 0x8e078208, 0x81078407, 0xc0410080, 0x0007, 0xbf8c, 0xbf8e003f, 0xc0030200, 0x, 0xbf8c, 0xbf06ff08, 0xdeadbeaf, 0xbf84fff9, 0x81028102, - 0xc0410080, 0x0007, 0xbf8c, 0xbefc0080, 0xbe880080, 0xbe890080, - 0xbe8a0080, 0xbe8b0080, 0xbe8c0080, 0xbe8d0080, 0xbe8e0080, 0xbe8f0080, - 0xbe900080, 0xbe910080, 0xbe920080, 0xbe930080, 0xbe940080, 0xbe950080, - 0xbe960080, 0xbe970080, 0xbe980080, 0xbe990080, 0xbe9a0080, 0xbe9b0080, - 0xbe9c0080, 0xbe9d0080, 0xbe9e0080, 0xbe9f0080, 0xbea00080, 0xbea10080, - 0xbea20080, 0xbea30080, 0xbea40080, 0xbea50080, 0xbea60080, 0xbea70080, - 0xbea80080, 0xbea90080, 0xbeaa0080, 0xbeab0080, 0xbeac0080, 0xbead0080, - 0xbeae0080, 0xbeaf0080, 0xbeb00080, 0xbeb10080, 0xbeb20080, 0xbeb30080, - 0xbeb40080, 0xbeb50080, 0xbeb60080, 0xbeb70080, 0xbeb80080, 0xbeb90080, - 0xbeba0080, 0xbebb0080, 0xbebc0080, 0xbebd0080, 0xbebe0080, 0xbebf0080, - 0xbec00080, 0xbec10080, 0xbec20080, 0xbec30080, 0xbec40080, 0xbec50080, - 0xbec60080, 0xbec70080, 0xbec80080, 0xbec90080, 0xbeca0080, 0xbecb0080, - 0xbecc0080, 0xbecd0080, 0xbece0080, 0xbecf0080, 0xbed00080, 0xbed10080, - 0xbed20080, 0xbed30080, 0xbed40080, 0xbed50080, 0xbed60080, 0xbed70080, - 0xbed80080, 0xbed90080, 0xbeda0080, 0xbedb0080, 0xbedc0080, 0xbedd0080, - 0xbede0080, 0xbedf0080, 0xbee00080, 0xbee10080, 0xbee20080, 0xbee30080, - 0xbee40080, 0xbee50080, 0xbf81 + 0xc0410080, 0x0007, 0xbf8c, 0xbefc0080, 0xbeea0080, 0xbeeb0080, + 0xbf00f280, 0xbee60080, 0xbee70080, 0xbee80080, 0xbee90080, 0xbefe0080, + 0xbeff0080, 0xbe880080, 0xbe890080, 0xbe8a0080, 0xbe8b0080, 0xbe8c0080, + 0xbe8d0080, 0xbe8e0080, 0xbe8f0080, 0xbe900080, 0xbe910080, 0xbe920080, + 0xbe930080, 0xbe940080, 0xbe950080, 0xbe960080, 0xbe970080, 0xbe980080, + 0xbe990080, 0xbe9a0080, 0xbe9b0080, 0xbe9c0080, 0xbe9d0080, 0xbe9e0080, + 0xbe9f0080, 0xbea00080, 0xbea10080, 0xbea20080, 0xbea30080, 0xbea40080, + 0xbea50080, 0xbea60080, 0xbea70080, 0xbea80080, 0xbea90080, 0xbeaa0080, + 0xbeab0080, 0xbeac0080, 0xbead0080, 0xbeae0080, 0xbeaf0080, 0xbeb00080, + 0xbeb10080, 0xbeb20080, 0xbeb30080, 0xbeb40080, 0xbeb50080, 0xbeb60080, + 0xbeb70080, 0xbeb80080, 0xbeb90080, 0xbeba0080, 0xbebb0080, 0xbebc0080, + 0xbebd0080, 0xbebe0080, 0xbebf0080, 0xbec00080, 0xbec10080, 0xbec20080, + 0xbec30080, 0xbec40080, 0xbec50080, 0xbec60080, 0xbec70080, 0xbec80080, + 0xbec90080, 0xbeca0080, 0xbecb0080, 0xbecc0080, 0xbecd0080, 0xbece0080, + 0xbecf0080, 0xbed00080, 0xbed10080, 0xbed20080, 0xbed30080, 0xbed40080, + 0xbed50080, 0xbed60080, 0xbed70080, 0xbed80080, 0xbed90080, 0xbeda0080, + 0xbedb0080, 0xbedc0080, 0xbedd0080, 0xbede0080, 0xbedf0080, 0xbee00080, + 0xbee10080, 0xbee20080, 0xbee30080, 0xbee40080, 0xbee50080, 0xbf81, }; const struct soc15_reg_entry sgpr112_init_regs_aldebaran[] = { @@ -244,7 +245,7 @@ const struct soc15_reg_entry sgpr112_init_regs_aldebaran[] = { { SOC15_REG_ENTRY(GC, 0, regCOMPUTE_NUM_THREAD_X), 0x40 }, { SOC15_REG_ENTRY(GC, 0, regCOMPUTE_NUM_THREAD_Y), 8 }, { SOC15_REG_ENTRY(GC, 0, regCOMPUTE_NUM_THREAD_Z), 1 }, - { SOC15_REG_ENTRY(GC, 0, regCOMPUTE_PGM_RSRC1), 0x2c0 }, + { SOC15_REG_ENTRY(GC, 0, regCOMPUTE_PGM_RSRC1), 0x340 }, { SOC15_REG_ENTRY(GC, 0, regCOMPUTE_PGM_RSRC2), 0x6 }, { SOC15_REG_ENTRY(GC, 0, regCOMPUTE_PGM_RSRC3), 0x0 }, { SOC15_REG_ENTRY(GC, 0, regCOMPUTE_STATIC_THREAD_MGMT_SE0), 0x }, @@ -262,21 +263,22 @@ static const u32 sgpr96_init_compute_shader_aldebaran[] = { 0x9209a805, 0x920a8a04, 0x81080908, 0x81080a08, 0x81080308, 0x8e078208, 0x81078407, 0xc0410080, 0x0007, 0xbf8c, 0xbf8e003f, 0xc0030200, 0x, 0xbf8c, 0xbf06ff08, 0xdeadbeaf, 0xbf84fff9, 0x81028102, - 0xc0410080, 0x0007, 0xbf8c, 0xbefc0080, 0xbe880080, 0xbe890080, - 0xbe8a0080, 0xbe8b0080, 0xbe8c0080, 0xbe8d0080, 0xbe8e0080, 0xbe8f0080, - 0xbe900080, 0xbe910080, 0xbe920080, 0xbe930080, 0xbe940080, 0xbe950080, - 0xbe960080, 0xbe970080, 0xbe980080, 0xbe990080, 0xbe9a0080, 0xbe9b0080, - 0xbe9c0080, 0xbe9d0080, 0xbe9e0080, 0xbe9f0080, 0xbea00080, 0xbea10080, - 0xbea20080, 0xbea30080, 0xbea40080, 0xbea50080, 0xbea60080, 0xbea70080
[PATCH] drm/amdgpu: fix no full coverage issue for gprs initialization
The number of waves is changed to 8, so it is impossible to use old solution to cover all sgprs. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c index a2fe2dac32c1..2e6789a7dc46 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c @@ -328,7 +328,7 @@ int amdgpu_ib_pool_init(struct amdgpu_device *adev) for (i = 0; i < AMDGPU_IB_POOL_MAX; i++) { if (i == AMDGPU_IB_POOL_DIRECT) - size = PAGE_SIZE * 2; + size = PAGE_SIZE * 6; else size = AMDGPU_IB_POOL_SIZE; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c index d17e57dea178..77948c033c45 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c @@ -32,6 +32,11 @@ #include "amdgpu_ras.h" #include "amdgpu_gfx.h" +#define SE_ID_MAX 8 +#define CU_ID_MAX 16 +#define SIMD_ID_MAX 4 +#define WAVE_ID_MAX 10 + enum gfx_v9_4_2_utc_type { VML2_MEM, VML2_WALKER_MEM, @@ -81,100 +86,100 @@ static const struct soc15_reg_golden golden_settings_gc_9_4_2_alde[] = { }; static const u32 vgpr_init_compute_shader_aldebaran[] = { - 0xb8840904, 0xb8851a04, 0xb8861344, 0x9207c006, 0x92088405, 0x81070807, - 0x81070407, 0x8e078207, 0xbe88008f, 0xc0410200, 0x0007, 0xd3d94000, - 0x1880, 0xd3d94001, 0x1880, 0xd3d94002, 0x1880, 0xd3d94003, - 0x1880, 0xd3d94004, 0x1880, 0xd3d94005, 0x1880, 0xd3d94006, - 0x1880, 0xd3d94007, 0x1880, 0xd3d94008, 0x1880, 0xd3d94009, - 0x1880, 0xd3d9400a, 0x1880, 0xd3d9400b, 0x1880, 0xd3d9400c, - 0x1880, 0xd3d9400d, 0x1880, 0xd3d9400e, 0x1880, 0xd3d9400f, - 0x1880, 0xd3d94010, 0x1880, 0xd3d94011, 0x1880, 0xd3d94012, - 0x1880, 0xd3d94013, 0x1880, 0xd3d94014, 0x1880, 0xd3d94015, - 0x1880, 0xd3d94016, 0x1880, 0xd3d94017, 0x1880, 0xd3d94018, - 0x1880, 0xd3d94019, 0x1880, 0xd3d9401a, 0x1880, 0xd3d9401b, - 0x1880, 0xd3d9401c, 0x1880, 0xd3d9401d, 0x1880, 0xd3d9401e, - 0x1880, 0xd3d9401f, 0x1880, 0xd3d94020, 0x1880, 0xd3d94021, - 0x1880, 0xd3d94022, 0x1880, 0xd3d94023, 0x1880, 0xd3d94024, - 0x1880, 0xd3d94025, 0x1880, 0xd3d94026, 0x1880, 0xd3d94027, - 0x1880, 0xd3d94028, 0x1880, 0xd3d94029, 0x1880, 0xd3d9402a, - 0x1880, 0xd3d9402b, 0x1880, 0xd3d9402c, 0x1880, 0xd3d9402d, - 0x1880, 0xd3d9402e, 0x1880, 0xd3d9402f, 0x1880, 0xd3d94030, - 0x1880, 0xd3d94031, 0x1880, 0xd3d94032, 0x1880, 0xd3d94033, - 0x1880, 0xd3d94034, 0x1880, 0xd3d94035, 0x1880, 0xd3d94036, - 0x1880, 0xd3d94037, 0x1880, 0xd3d94038, 0x1880, 0xd3d94039, - 0x1880, 0xd3d9403a, 0x1880, 0xd3d9403b, 0x1880, 0xd3d9403c, - 0x1880, 0xd3d9403d, 0x1880, 0xd3d9403e, 0x1880, 0xd3d9403f, - 0x1880, 0xd3d94040, 0x1880, 0xd3d94041, 0x1880, 0xd3d94042, - 0x1880, 0xd3d94043, 0x1880, 0xd3d94044, 0x1880, 0xd3d94045, - 0x1880, 0xd3d94046, 0x1880, 0xd3d94047, 0x1880, 0xd3d94048, - 0x1880, 0xd3d94049, 0x1880, 0xd3d9404a, 0x1880, 0xd3d9404b, - 0x1880, 0xd3d9404c, 0x1880, 0xd3d9404d, 0x1880, 0xd3d9404e, - 0x1880, 0xd3d9404f, 0x1880, 0xd3d94050, 0x1880, 0xd3d94051, - 0x1880, 0xd3d94052, 0x1880, 0xd3d94053, 0x1880, 0xd3d94054, - 0x1880, 0xd3d94055, 0x1880, 0xd3d94056, 0x1880, 0xd3d94057, - 0x1880, 0xd3d94058, 0x1880, 0xd3d94059, 0x1880, 0xd3d9405a, - 0x1880, 0xd3d9405b, 0x1880, 0xd3d9405c, 0x1880, 0xd3d9405d, - 0x1880, 0xd3d9405e, 0x1880, 0xd3d9405f, 0x1880, 0xd3d94060, - 0x1880, 0xd3d94061, 0x1880, 0xd3d94062, 0x1880, 0xd3d94063, - 0x1880, 0xd3d94064, 0x1880, 0xd3d94065, 0x1880, 0xd3d94066, - 0x1880, 0xd3d94067, 0x1880, 0xd3d94068, 0x1880, 0xd3d94069, - 0x1880, 0xd3d9406a, 0x1880, 0xd3d9406b, 0x1880, 0xd3d9406c, - 0x1880, 0xd3d9406d, 0x1880, 0xd3d9406e, 0x1880, 0xd3d9406f, - 0x1880, 0xd3d94070, 0x1880, 0xd3d94071, 0x1880, 0xd3d94072, - 0x1880, 0xd3d94073, 0x1880, 0xd3d94074, 0x1880, 0xd3d94075, - 0x1880, 0xd3d94076, 0x1880, 0xd3d94077, 0x1880, 0xd3d94078, - 0x1880, 0xd3d94079, 0x1880, 0xd3d9407a, 0x1880, 0xd3d9407b, - 0x1880, 0xd3d9407c, 0x1880, 0xd3d9407d, 0x1880, 0xd3d9407e, - 0x1880, 0xd3d9407f, 0x1880, 0xd3d94080, 0x1880, 0xd3d94081, - 0x1880, 0xd3d94082, 0x1880, 0xd3d94083,
[PATCH] drm/amdgpu: refine gprs init shaders to check coverage
Add codes to check whether all SIMDs are covered, make sure that all GPRs are initialized. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 9889bd495ba5..9e629f239288 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4656,8 +4656,7 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev) if (!ring->sched.ready) return 0; - if (adev->asic_type == CHIP_ARCTURUS || - adev->asic_type == CHIP_ALDEBARAN) { + if (adev->asic_type == CHIP_ARCTURUS) { vgpr_init_shader_ptr = vgpr_init_compute_shader_arcturus; vgpr_init_shader_size = sizeof(vgpr_init_compute_shader_arcturus); vgpr_init_regs_ptr = vgpr_init_regs_arcturus; @@ -4924,7 +4923,11 @@ static int gfx_v9_0_ecc_late_init(void *handle) } /* requires IBs so do in late init after IB pool is initialized */ - r = gfx_v9_0_do_edc_gpr_workarounds(adev); + if (adev->asic_type == CHIP_ALDEBARAN) + r = gfx_v9_4_2_do_edc_gpr_workarounds(adev); + else + r = gfx_v9_0_do_edc_gpr_workarounds(adev); + if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c index 9ca76a3ac38c..798c0e178201 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c @@ -22,6 +22,7 @@ */ #include "amdgpu.h" #include "soc15.h" +#include "soc15d.h" #include "gc/gc_9_4_2_offset.h" #include "gc/gc_9_4_2_sh_mask.h" @@ -79,6 +80,377 @@ static const struct soc15_reg_golden golden_settings_gc_9_4_2_alde[] = { SOC15_REG_GOLDEN_VALUE(GC, 0, regTCI_CNTL_3, 0xff, 0x20), }; +static const u32 vgpr_init_compute_shader_aldebaran[] = { + 0xb8840904, 0xb8851a04, 0xb8861344, 0x9207c006, 0x92088405, 0x81070807, + 0x81070407, 0x8e078207, 0xbe88008f, 0xc0410200, 0x0007, 0xd3d94000, + 0x1880, 0xd3d94001, 0x1880, 0xd3d94002, 0x1880, 0xd3d94003, + 0x1880, 0xd3d94004, 0x1880, 0xd3d94005, 0x1880, 0xd3d94006, + 0x1880, 0xd3d94007, 0x1880, 0xd3d94008, 0x1880, 0xd3d94009, + 0x1880, 0xd3d9400a, 0x1880, 0xd3d9400b, 0x1880, 0xd3d9400c, + 0x1880, 0xd3d9400d, 0x1880, 0xd3d9400e, 0x1880, 0xd3d9400f, + 0x1880, 0xd3d94010, 0x1880, 0xd3d94011, 0x1880, 0xd3d94012, + 0x1880, 0xd3d94013, 0x1880, 0xd3d94014, 0x1880, 0xd3d94015, + 0x1880, 0xd3d94016, 0x1880, 0xd3d94017, 0x1880, 0xd3d94018, + 0x1880, 0xd3d94019, 0x1880, 0xd3d9401a, 0x1880, 0xd3d9401b, + 0x1880, 0xd3d9401c, 0x1880, 0xd3d9401d, 0x1880, 0xd3d9401e, + 0x1880, 0xd3d9401f, 0x1880, 0xd3d94020, 0x1880, 0xd3d94021, + 0x1880, 0xd3d94022, 0x1880, 0xd3d94023, 0x1880, 0xd3d94024, + 0x1880, 0xd3d94025, 0x1880, 0xd3d94026, 0x1880, 0xd3d94027, + 0x1880, 0xd3d94028, 0x1880, 0xd3d94029, 0x1880, 0xd3d9402a, + 0x1880, 0xd3d9402b, 0x1880, 0xd3d9402c, 0x1880, 0xd3d9402d, + 0x1880, 0xd3d9402e, 0x1880, 0xd3d9402f, 0x1880, 0xd3d94030, + 0x1880, 0xd3d94031, 0x1880, 0xd3d94032, 0x1880, 0xd3d94033, + 0x1880, 0xd3d94034, 0x1880, 0xd3d94035, 0x1880, 0xd3d94036, + 0x1880, 0xd3d94037, 0x1880, 0xd3d94038, 0x1880, 0xd3d94039, + 0x1880, 0xd3d9403a, 0x1880, 0xd3d9403b, 0x1880, 0xd3d9403c, + 0x1880, 0xd3d9403d, 0x1880, 0xd3d9403e, 0x1880, 0xd3d9403f, + 0x1880, 0xd3d94040, 0x1880, 0xd3d94041, 0x1880, 0xd3d94042, + 0x1880, 0xd3d94043, 0x1880, 0xd3d94044, 0x1880, 0xd3d94045, + 0x1880, 0xd3d94046, 0x1880, 0xd3d94047, 0x1880, 0xd3d94048, + 0x1880, 0xd3d94049, 0x1880, 0xd3d9404a, 0x1880, 0xd3d9404b, + 0x1880, 0xd3d9404c, 0x1880, 0xd3d9404d, 0x1880, 0xd3d9404e, + 0x1880, 0xd3d9404f, 0x1880, 0xd3d94050, 0x1880, 0xd3d94051, + 0x1880, 0xd3d94052, 0x1880, 0xd3d94053, 0x1880, 0xd3d94054, + 0x1880, 0xd3d94055, 0x1880, 0xd3d94056, 0x1880, 0xd3d94057, + 0x1880, 0xd3d94058, 0x1880, 0xd3d94059, 0x1880, 0xd3d9405a, + 0x1880, 0xd3d9405b, 0x1880, 0xd3d9405c, 0x1880, 0xd3d9405d, + 0x1880, 0xd3d9405e, 0x1880, 0xd3d9405f, 0x1880, 0xd3d94060, + 0x1880, 0xd3d94061, 0x1880, 0xd3d94062, 0x1880, 0xd3d94063, + 0x1880, 0xd3d94064, 0x1880, 0xd3d94065, 0x1880, 0xd3d94066, + 0x1880, 0xd3d94067, 0x1880, 0xd3d94068, 0x1880, 0xd3d94069, + 0x1880, 0xd3d9406a, 0x1880, 0xd3d9406b, 0x1880, 0
[PATCH] drm/amdgpu: fix a error injection failed issue
because "sscanf(str, "retire_page")" always return 0, if application use the raw data for error injection, it always wrongly falls into "op == 3". Change to use strstr instead. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 38a691a3b600..7438d4e84776 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -221,7 +221,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, op = 1; else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) op = 2; - else if (sscanf(str, "retire_page") == 0) + else if (strstr(str, "retire_page") != NULL) op = 3; else if (str[0] && str[1] && str[2] && str[3]) /* ascii string, but commands are not matched. */ -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdkfd: add edc error interrupt handle for poison propogate mode
In poison progogate mode, when driver receive the edc error interrupt from SQ, driver should kill the process by pasid which is using the poison data, and then trigger GPU reset. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 1c20458f3962..696944fa0177 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -25,6 +25,70 @@ #include "soc15_int.h" #include "kfd_device_queue_manager.h" #include "kfd_smi_events.h" +#include "amdgpu.h" + +enum SQ_INTERRUPT_WORD_ENCODING { + SQ_INTERRUPT_WORD_ENCODING_AUTO = 0x0, + SQ_INTERRUPT_WORD_ENCODING_INST, + SQ_INTERRUPT_WORD_ENCODING_ERROR, +}; + +enum SQ_INTERRUPT_ERROR_TYPE { + SQ_INTERRUPT_ERROR_TYPE_EDC_FUE = 0x0, + SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST, + SQ_INTERRUPT_ERROR_TYPE_MEMVIOL, + SQ_INTERRUPT_ERROR_TYPE_EDC_FED, +}; + +/* SQ_INTERRUPT_WORD_AUTO_CTXID */ +#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE__SHIFT 0 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__WLT__SHIFT 1 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_BUF_FULL__SHIFT 2 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__REG_TIMESTAMP__SHIFT 3 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__CMD_TIMESTAMP__SHIFT 4 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_CMD_OVERFLOW__SHIFT 5 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_REG_OVERFLOW__SHIFT 6 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__IMMED_OVERFLOW__SHIFT 7 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_UTC_ERROR__SHIFT 8 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__SE_ID__SHIFT 24 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__ENCODING__SHIFT 26 + +#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_MASK 0x0001 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__WLT_MASK 0x0002 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_BUF_FULL_MASK 0x0004 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__REG_TIMESTAMP_MASK 0x0008 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__CMD_TIMESTAMP_MASK 0x0010 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_CMD_OVERFLOW_MASK 0x0020 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_REG_OVERFLOW_MASK 0x0040 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__IMMED_OVERFLOW_MASK 0x0080 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_UTC_ERROR_MASK 0x0100 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__SE_ID_MASK 0x0300 +#define SQ_INTERRUPT_WORD_AUTO_CTXID__ENCODING_MASK 0x0c00 + +/* SQ_INTERRUPT_WORD_WAVE_CTXID */ +#define SQ_INTERRUPT_WORD_WAVE_CTXID__DATA__SHIFT 0 +#define SQ_INTERRUPT_WORD_WAVE_CTXID__SH_ID__SHIFT 12 +#define SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV__SHIFT 13 +#define SQ_INTERRUPT_WORD_WAVE_CTXID__WAVE_ID__SHIFT 14 +#define SQ_INTERRUPT_WORD_WAVE_CTXID__SIMD_ID__SHIFT 18 +#define SQ_INTERRUPT_WORD_WAVE_CTXID__CU_ID__SHIFT 20 +#define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID__SHIFT 24 +#define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING__SHIFT 26 + +#define SQ_INTERRUPT_WORD_WAVE_CTXID__DATA_MASK 0x0fff +#define SQ_INTERRUPT_WORD_WAVE_CTXID__SH_ID_MASK 0x1000 +#define SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV_MASK 0x2000 +#define SQ_INTERRUPT_WORD_WAVE_CTXID__WAVE_ID_MASK 0x0003c000 +#define SQ_INTERRUPT_WORD_WAVE_CTXID__SIMD_ID_MASK 0x000c +#define SQ_INTERRUPT_WORD_WAVE_CTXID__CU_ID_MASK 0x00f0 +#define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID_MASK 0x0300 +#define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING_MASK 0x0c00 + +#define KFD_CONTEXT_ID_GET_SQ_INT_DATA(ctx0, ctx1) \ + ((ctx0 & 0xfff) | ((ctx0 >> 16) & 0xf000) | ((ctx1 << 16) & 0xff)) + +#define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF0 +#define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20 static bool event_interrupt_isr_v9(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -108,13 +172,15 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, const uint32_t *ih_ring_entry) { uint16_t source_id, client_id, pasid, vmid; - uint32_t context_id; + uint32_t context_id0, context_id1; + uint32_t sq_intr_err, sq_int_data, encoding; source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); - context_id = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry); + context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry); + context_id1 = SOC15_CONTEXT_ID1_FROM_IH_ENTRY(ih_ring_entry); if (client_id == SOC15_IH_CLIENTID_GRBM_CP || client_id == SOC15_IH_CLIENTID_SE0SH || @@ -122,10 +188,59 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, client_id == SOC15_IH_CLIENTID_SE2SH || client_id == SOC15_IH_CLIENTID_SE3SH
[PATCH 4/4] drm/amdkfd: add reset lock protection for kfd entry functions
When doing GPU reset, try to block all kfd functions including kfd ioctls and file close function, which maybe access hardware. v2: fix a potential recursive locking issue kfd_ioctl_dbg_register has chance called into pqm_create_queue, which will cause recursive locking. So remove locking read_lock from process queue manager, and add read_lock into related ioctls instead. v3: put pqm_query_dev_by_qid under the protection of p->mutex Signed-off-by: Dennis Li Acked-by: Christian König diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 6802c616e10e..283ba9435233 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -40,6 +40,7 @@ #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" #include "kfd_smi_events.h" +#include "amdgpu.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -298,6 +299,9 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, } mutex_lock(>mutex); + err = amdgpu_read_lock(dev->ddev, true); + if (err) + goto err_read_lock; pdd = kfd_bind_process_to_device(dev, p); if (IS_ERR(pdd)) { @@ -326,6 +330,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, */ args->doorbell_offset |= doorbell_offset_in_process; + amdgpu_read_unlock(dev->ddev); mutex_unlock(>mutex); pr_debug("Queue id %d was created successfully\n", args->queue_id); @@ -343,6 +348,8 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, err_create_queue: err_bind_process: + amdgpu_read_unlock(dev->ddev); +err_read_lock: mutex_unlock(>mutex); return err; } @@ -352,6 +359,7 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p, { int retval; struct kfd_ioctl_destroy_queue_args *args = data; + struct kfd_dev *dev; pr_debug("Destroying queue id %d for pasid 0x%x\n", args->queue_id, @@ -359,8 +367,20 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p, mutex_lock(>mutex); + dev = pqm_query_dev_by_qid(>pqm, args->queue_id); + if (!dev) { + retval = -EINVAL; + goto err_query_dev; + } + + retval = amdgpu_read_lock(dev->ddev, true); + if (retval) + goto err_read_lock; retval = pqm_destroy_queue(>pqm, args->queue_id); + amdgpu_read_unlock(dev->ddev); +err_read_lock: +err_query_dev: mutex_unlock(>mutex); return retval; } @@ -371,6 +391,7 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, int retval; struct kfd_ioctl_update_queue_args *args = data; struct queue_properties properties; + struct kfd_dev *dev; if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) { pr_err("Queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); @@ -404,10 +425,21 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, mutex_lock(>mutex); + dev = pqm_query_dev_by_qid(>pqm, args->queue_id); + if (!dev) { + retval = -EINVAL; + goto err_query_dev; + } + + retval = amdgpu_read_lock(dev->ddev, true); + if (retval) + goto err_read_lock; retval = pqm_update_queue(>pqm, args->queue_id, ); + amdgpu_read_unlock(dev->ddev); +err_read_lock: +err_query_dev: mutex_unlock(>mutex); - return retval; } @@ -420,6 +452,7 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p, struct queue_properties properties; uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr; size_t cu_mask_size = sizeof(uint32_t) * (args->num_cu_mask / 32); + struct kfd_dev *dev; if ((args->num_cu_mask % 32) != 0) { pr_debug("num_cu_mask 0x%x must be a multiple of 32", @@ -456,8 +489,20 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p, mutex_lock(>mutex); + dev = pqm_query_dev_by_qid(>pqm, args->queue_id); + if (!dev) { + retval = -EINVAL; + goto err_query_dev; + } + + retval = amdgpu_read_lock(dev->ddev, true); + if (retval) + goto err_read_lock; retval = pqm_set_cu_mask(>pqm, args->queue_id, ); + amdgpu_read_unlock(dev->ddev); +err_read_lock: +err_query_dev: mutex_unlock(>mutex); if (retval
[PATCH 3/4] drm/amdgpu: instead of using down/up_read directly
change to use amdgpu_read_lock/unlock which could handle more cases Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index bcaf271b39bf..66dec0f49c4a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -59,11 +59,12 @@ int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file) { struct amdgpu_device *adev = inode->i_private; + struct drm_device *dev = adev_to_drm(adev); int ret; file->private_data = adev; - ret = down_read_killable(>reset_sem); + ret = amdgpu_read_lock(dev, true); if (ret) return ret; @@ -74,7 +75,7 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file) ret = -EBUSY; } - up_read(>reset_sem); + amdgpu_read_unlock(dev); return ret; } @@ -1206,7 +1207,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused) } /* Avoid accidently unparking the sched thread during GPU reset */ - r = down_read_killable(>reset_sem); + r = amdgpu_read_lock(dev, true); if (r) return r; @@ -1235,7 +1236,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused) kthread_unpark(ring->sched.thread); } - up_read(>reset_sem); + amdgpu_read_unlock(dev); pm_runtime_mark_last_busy(dev->dev); pm_runtime_put_autosuspend(dev->dev); @@ -1427,6 +1428,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) struct amdgpu_ring *ring; struct dma_fence **fences = NULL; struct amdgpu_device *adev = (struct amdgpu_device *)data; + struct drm_device *dev = adev_to_drm(adev); if (val >= AMDGPU_MAX_RINGS) return -EINVAL; @@ -1446,7 +1448,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) return -ENOMEM; /* Avoid accidently unparking the sched thread during GPU reset */ - r = down_read_killable(>reset_sem); + r = amdgpu_read_lock(dev, true); if (r) goto pro_end; @@ -1489,7 +1491,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) /* restart the scheduler */ kthread_unpark(ring->sched.thread); - up_read(>reset_sem); + amdgpu_read_unlock(dev); ttm_bo_unlock_delayed_workqueue(>mman.bdev, resched); diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 3ee481557fc9..113c63bf187f 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -247,12 +247,13 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; + struct drm_device *dev = adev_to_drm(adev); /* block amdgpu_gpu_recover till msg FLR COMPLETE received, * otherwise the mailbox msg will be ruined/reseted by * the VF FLR. */ - if (!down_read_trylock(>reset_sem)) + if (amdgpu_read_lock(dev, true)) return; amdgpu_virt_fini_data_exchange(adev); @@ -268,7 +269,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) flr_done: atomic_set(>in_gpu_reset, 0); - up_read(>reset_sem); + amdgpu_read_unlock(dev); /* Trigger recovery for world switch failure if no TDR */ if (amdgpu_device_should_recover_gpu(adev) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 48e588d3c409..2cd910e5caa7 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -268,12 +268,13 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; + struct drm_device *dev = adev_to_drm(adev); /* block amdgpu_gpu_recover till msg FLR COMPLETE received, * otherwise the mailbox msg will be ruined/reseted by * the VF FLR. */ - if (!down_read_trylock(>reset_sem)) + if (amdgpu_read_lock(dev, true)) return; amdgpu_virt_fini_data_exchange(adev); @@ -289,7 +290,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) flr_done: atomic_set(>in_gpu_reset, 0); - up_read(>reset_sem); + amdgpu_read_unlock(
[PATCH 2/4] drm/amdgpu: refine the GPU recovery sequence
Changed to only set in_gpu_reset as 1 when the recovery thread begin, and delay hold reset_sem after pre-reset but before reset. It make sure that other threads have exited or been blocked before doing GPU reset. Compared with the old codes, it could make some threads exit more early without waiting for timeout. Introduce a event recovery_fini_event which is used to block new threads when recovery thread has begun. These threads are only waked up when recovery thread exit. v2: remove codes to check the usage of adev->reset_sem, because lockdep will show all locks held in the system, when system detect hung timeout in the recovery thread. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 02a34f9a26aa..67c716e5ee8d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1044,6 +1044,8 @@ struct amdgpu_device { atomic_tin_gpu_reset; enum pp_mp1_state mp1_state; struct rw_semaphore reset_sem; + wait_queue_head_t recovery_fini_event; + struct amdgpu_doorbell_index doorbell_index; struct mutexnotifier_lock; @@ -1406,4 +1408,8 @@ static inline int amdgpu_in_reset(struct amdgpu_device *adev) { return atomic_read(>in_gpu_reset); } + +int amdgpu_read_lock(struct drm_device *dev, bool interruptible); +void amdgpu_read_unlock(struct drm_device *dev); + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 24ff5992cb02..15235610cc54 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -211,6 +211,60 @@ static ssize_t amdgpu_device_get_serial_number(struct device *dev, static DEVICE_ATTR(serial_number, S_IRUGO, amdgpu_device_get_serial_number, NULL); +int amdgpu_read_lock(struct drm_device *dev, bool interruptible) +{ + struct amdgpu_device *adev = drm_to_adev(dev); + int ret = 0; + + /** +* if a thread hold the read lock, but recovery thread has started, +* it should release the read lock and wait for recovery thread finished +* Because pre-reset functions have begun, which stops old threads but no +* include the current thread. + */ + if (interruptible) { + while (!(ret = down_read_killable(>reset_sem)) && + amdgpu_in_reset(adev)) { + up_read(>reset_sem); + ret = wait_event_interruptible(adev->recovery_fini_event, + !amdgpu_in_reset(adev)); + if (ret) + break; + } + } else { + down_read(>reset_sem); + while (amdgpu_in_reset(adev)) { + up_read(>reset_sem); + wait_event(adev->recovery_fini_event, + !amdgpu_in_reset(adev)); + down_read(>reset_sem); + } + } + + return ret; +} + +void amdgpu_read_unlock(struct drm_device *dev) +{ + struct amdgpu_device *adev = drm_to_adev(dev); + + up_read(>reset_sem); +} + +static void amdgpu_write_lock(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) +{ + if (hive) { + down_write_nest_lock(>reset_sem, >hive_lock); + } else { + down_write(>reset_sem); + } +} + +static void amdgpu_write_unlock(struct amdgpu_device *adev) +{ + up_write(>reset_sem); +} + /** * amdgpu_device_supports_atpx - Is the device a dGPU with HG/PX power control * @@ -3280,6 +3334,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, hash_init(adev->mn_hash); atomic_set(>in_gpu_reset, 0); init_rwsem(>reset_sem); + init_waitqueue_head(>recovery_fini_event); mutex_init(>psp.mutex); mutex_init(>notifier_lock); @@ -4509,39 +4564,18 @@ int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, return r; } -static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, - struct amdgpu_hive_info *hive) +static bool amdgpu_device_recovery_enter(struct amdgpu_device *adev) { if (atomic_cmpxchg(>in_gpu_reset, 0, 1) != 0) return false; - if (hive) { - down_write_nest_lock(>reset_sem, >hive_lock); - } else { - down_write(>reset_sem); - } - - switch (amdgpu_asic_reset_method(adev)) { - case AMD_RESET_METHOD_MODE1: - adev->mp1_state = PP_MP1_STATE_SHUTDOWN; - break; - case AMD_RESET_METHOD_MODE2: - adev->mp1_state = PP_MP1_STATE_RESET; - break; -
[PATCH 1/4] drm/amdgpu: remove reset lock from low level functions
It is easy to cause performance drop issue when using lock in low level functions. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0b1e0127056f..24ff5992cb02 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -374,13 +374,10 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, if ((reg * 4) < adev->rmmio_size) { if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && - amdgpu_sriov_runtime(adev) && - down_read_trylock(>reset_sem)) { + amdgpu_sriov_runtime(adev)) ret = amdgpu_kiq_rreg(adev, reg); - up_read(>reset_sem); - } else { + else ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); - } } else { ret = adev->pcie_rreg(adev, reg * 4); } @@ -459,13 +456,10 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, if ((reg * 4) < adev->rmmio_size) { if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && - amdgpu_sriov_runtime(adev) && - down_read_trylock(>reset_sem)) { + amdgpu_sriov_runtime(adev)) amdgpu_kiq_wreg(adev, reg, v); - up_read(>reset_sem); - } else { + else writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); - } } else { adev->pcie_wreg(adev, reg * 4, v); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index a05dbbbd9803..9f6eaca107ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -155,11 +155,7 @@ static int __update_table_header(struct amdgpu_ras_eeprom_control *control, msg.addr = control->i2c_address; - /* i2c may be unstable in gpu reset */ - down_read(>reset_sem); ret = i2c_transfer(>pm.smu_i2c, , 1); - up_read(>reset_sem); - if (ret < 1) DRM_ERROR("Failed to write EEPROM table header, ret:%d", ret); @@ -546,11 +542,7 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, control->next_addr += EEPROM_TABLE_RECORD_SIZE; } - /* i2c may be unstable in gpu reset */ - down_read(>reset_sem); ret = i2c_transfer(>pm.smu_i2c, msgs, num); - up_read(>reset_sem); - if (ret < 1) { DRM_ERROR("Failed to process EEPROM table records, ret:%d", ret); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 33e54eed2eec..690f368ce378 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -317,8 +317,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, * Directly use kiq to do the vm invalidation instead */ if (adev->gfx.kiq.ring.sched.ready && - (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) && - down_read_trylock(>reset_sem)) { + (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev))) { struct amdgpu_vmhub *hub = >vmhub[vmhub]; const unsigned eng = 17; u32 inv_req = hub->vmhub_funcs->get_invalidate_req(vmid, flush_type); @@ -328,7 +327,6 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req, 1 << vmid); - up_read(>reset_sem); return; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 1567dd227f51..ec3c05360776 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -757,14 +757,12 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, * as GFXOFF under bare metal */ if (adev->gfx.kiq.ring.sched.ready && - (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) && - down_read_trylock(>reset_sem)) { + (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev))) { uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng; uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng; amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req, 1 << vmid); - up
[PATCH 0/4] Refine GPU recovery sequence to enhance its stability
We have defined two variables in_gpu_reset and reset_sem in adev object. The atomic type variable in_gpu_reset is used to avoid recovery thread reenter and make lower functions return more earlier when recovery start, but couldn't block recovery thread when it access hardware. The r/w semaphore reset_sem is used to solve these synchronization issues between recovery thread and other threads. The original solution locked registers' access in lower functions, which will introduce following issues: 1) many lower functions are used in both recovery thread and others. Firstly we must harvest these functions, it is easy to miss someones. Secondly these functions need select which lock (read lock or write lock) will be used, according to the thread it is running in. If the thread context isn't considered, the added lock will easily introduce deadlock. Besides that, in most time, developer easily forget to add locks for new functions. 2) performance drop. More lower functions are more frequently called. 3) easily introduce false positive lockdep complaint, because write lock has big range in recovery thread, but low level functions will hold read lock may be protected by other locks in other threads. Therefore the new solution will try to add lock protection for ioctls of kfd. Its goal is that there are no threads except for recovery thread or its children (for xgmi) to access hardware when doing GPU reset and resume. So refine recovery thread as the following: Step 0: atomic_cmpxchg(>in_gpu_reset, 0, 1) 1). if failed, it means system had a recovery thread running, current thread exit directly; 2). if success, enter recovery thread; Step 1: cancel all delay works, stop drm schedule, complete all unreceived fences and so on. It try to stop or pause other threads. Step 2: call down_write(>reset_sem) to hold write lock, which will block recovery thread until other threads release read locks. Step 3: normally, there is only recovery threads running to access hardware, it is safe to do gpu reset now. Step 4: do post gpu reset, such as call all ips' resume functions; Step 5: atomic set adev->in_gpu_reset as 0, wake up other threads and release write lock. Recovery thread exit normally. Other threads call the amdgpu_read_lock to synchronize with recovery thread. If it finds that in_gpu_reset is 1, it should release read lock if it has holden one, and then blocks itself to wait for recovery finished event. If thread successfully hold read lock and in_gpu_reset is 0, it continues. It will exit normally or be stopped by recovery thread in step 1. Dennis Li (4): drm/amdgpu: remove reset lock from low level functions drm/amdgpu: refine the GPU recovery sequence drm/amdgpu: instead of using down/up_read directly drm/amdkfd: add reset lock protection for kfd entry functions drivers/gpu/drm/amd/amdgpu/amdgpu.h | 6 + drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 14 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 173 +- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 8 - drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c| 4 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 9 +- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 5 +- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 5 +- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 172 - drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 + .../amd/amdkfd/kfd_process_queue_manager.c| 17 ++ 12 files changed, 345 insertions(+), 75 deletions(-) -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: block hardware accessed by other threads when doing gpu recovery
When GPU recovery thread is doing GPU reset, it is unsafe that other threads access hardware concurrently, which could cause GPU reset randomly hang. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 1624c2bc8285..c71d3bba5f69 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1033,6 +1033,7 @@ struct amdgpu_device { atomic_tin_gpu_reset; enum pp_mp1_state mp1_state; struct rw_semaphore reset_sem; + struct thread_info *recovery_thread; struct amdgpu_doorbell_index doorbell_index; struct mutexnotifier_lock; @@ -1385,4 +1386,13 @@ static inline int amdgpu_in_reset(struct amdgpu_device *adev) { return atomic_read(>in_gpu_reset); } + +static inline bool amdgpu_in_recovery_thread(struct amdgpu_device *adev) +{ + if (unlikely(adev->recovery_thread != NULL) && + adev->recovery_thread == current_thread_info()) + return true; + + return false; +} #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 71805dfd9e25..7c17a5468d43 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -401,13 +401,22 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) */ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) { + bool locked; + if (adev->in_pci_err_recovery) return; + locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq(); + if (locked) + down_read(>reset_sem); + if (offset < adev->rmmio_size) writeb(value, adev->rmmio + offset); else BUG(); + + if (locked) + up_read(>reset_sem); } /** @@ -424,15 +433,19 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { + bool locked; + if (adev->in_pci_err_recovery) return; + locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq(); + if (locked) + down_read(>reset_sem); + if ((reg * 4) < adev->rmmio_size) { if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && - amdgpu_sriov_runtime(adev) && - down_read_trylock(>reset_sem)) { + amdgpu_sriov_runtime(adev)) { amdgpu_kiq_wreg(adev, reg, v); - up_read(>reset_sem); } else { writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); } @@ -440,6 +453,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, adev->pcie_wreg(adev, reg * 4, v); } + if (locked) + up_read(>reset_sem); + trace_amdgpu_device_wreg(adev->pdev->device, reg, v); } @@ -451,9 +467,15 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v) { + bool locked; + if (adev->in_pci_err_recovery) return; + locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq(); + if (locked) + down_read(>reset_sem); + if (amdgpu_sriov_fullaccess(adev) && adev->gfx.rlc.funcs && adev->gfx.rlc.funcs->is_rlcg_access_range) { @@ -462,6 +484,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, } else { writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); } + + if (locked) + up_read(>reset_sem); } /** @@ -496,15 +521,24 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) */ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) { + bool locked; + if (adev->in_pci_err_recovery) return; + locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq(); + if (locked) + down_read(>reset_sem); + if ((reg * 4) < adev->rio_mem_size) iowrite32(v, adev->rio_mem + (reg * 4)); else { iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); iowrite32(v, adev->rio_mem + (mmMM_DATA * 4)); } + + if (locked) + up_read(>reset_sem); } /** @@ -679,6 +713,11 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, unsigned long flags; void __iomem *pcie_index_offset; void __iomem *pcie_data_offset; + bool locked; + + locked
[PATCH v2] drm/amdgpu: remove unnecessary reading for epprom header
If the number of badpage records exceed the threshold, driver has updated both epprom header and control->tbl_hdr.header before gpu reset, therefore GPU recovery thread no need to read epprom header directly. v2: merge amdgpu_ras_check_err_threshold into amdgpu_ras_eeprom_check_err_threshold Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f0f7ed42ee7f..f2ff10403d93 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4397,7 +4397,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, * bad_page_threshold value to fix this once * probing driver again. */ - if (!amdgpu_ras_check_err_threshold(tmp_adev)) { + if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { /* must succeed. */ amdgpu_ras_resume(tmp_adev); } else { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 09546dec40ff..c669435ccc74 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2189,19 +2189,3 @@ bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) return false; } - -bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev) -{ - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - bool exc_err_limit = false; - - if (con && (amdgpu_bad_page_threshold != 0)) - amdgpu_ras_eeprom_check_err_threshold(>eeprom_control, - _err_limit); - - /* -* We are only interested in variable exc_err_limit, -* as it says if GPU is in bad state or not. -*/ - return exc_err_limit; -} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index aed0716efa5a..42aab9adc263 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -491,8 +491,6 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev); unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, bool is_ce); -bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev); - /* error handling functions */ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, struct eeprom_table_record *bps, int pages); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 19d9aa76cfbf..7f527f8bbdb1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -434,47 +434,21 @@ static uint32_t __correct_eeprom_dest_address(uint32_t curr_address) return curr_address; } -int amdgpu_ras_eeprom_check_err_threshold( - struct amdgpu_ras_eeprom_control *control, - bool *exceed_err_limit) +bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev) { - struct amdgpu_device *adev = to_amdgpu_device(control); - unsigned char buff[EEPROM_ADDRESS_SIZE + - EEPROM_TABLE_HEADER_SIZE] = { 0 }; - struct amdgpu_ras_eeprom_table_header *hdr = >tbl_hdr; - struct i2c_msg msg = { - .addr = control->i2c_address, - .flags = I2C_M_RD, - .len = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE, - .buf = buff, - }; - int ret; - - *exceed_err_limit = false; + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); if (!__is_ras_eeprom_supported(adev)) - return 0; - - /* read EEPROM table header */ - mutex_lock(>tbl_mutex); - ret = i2c_transfer(>pm.smu_i2c, , 1); - if (ret < 1) { - dev_err(adev->dev, "Failed to read EEPROM table header.\n"); - goto err; - } - - __decode_table_header_from_buff(hdr, [2]); + return false; - if (hdr->header == EEPROM_TABLE_HDR_BAD) { + if (con->eeprom_control.tbl_hdr.header == EEPROM_TABLE_HDR_BAD) { dev_warn(adev->dev, "This GPU is in BAD status."); dev_warn(adev->dev, "Please retire it or setting one bigger " "threshold value when reloading driver.\n"); - *exceed_err_limit = true; + return true; } -err: - mutex_unlock(>tbl_mutex); - return 0; + return false; } int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, di
[PATCH] drm/amdgpu: remove unnecessary reading for epprom header
If the number of badpage records exceed the threshold, driver has updated both epprom header and control->tbl_hdr.header before gpu reset, therefore GPU recovery thread no need to read epprom header directly. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 19d9aa76cfbf..4310ad63890c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -439,41 +439,19 @@ int amdgpu_ras_eeprom_check_err_threshold( bool *exceed_err_limit) { struct amdgpu_device *adev = to_amdgpu_device(control); - unsigned char buff[EEPROM_ADDRESS_SIZE + - EEPROM_TABLE_HEADER_SIZE] = { 0 }; - struct amdgpu_ras_eeprom_table_header *hdr = >tbl_hdr; - struct i2c_msg msg = { - .addr = control->i2c_address, - .flags = I2C_M_RD, - .len = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE, - .buf = buff, - }; - int ret; *exceed_err_limit = false; if (!__is_ras_eeprom_supported(adev)) return 0; - /* read EEPROM table header */ - mutex_lock(>tbl_mutex); - ret = i2c_transfer(>pm.smu_i2c, , 1); - if (ret < 1) { - dev_err(adev->dev, "Failed to read EEPROM table header.\n"); - goto err; - } - - __decode_table_header_from_buff(hdr, [2]); - - if (hdr->header == EEPROM_TABLE_HDR_BAD) { + if (control->tbl_hdr.header == EEPROM_TABLE_HDR_BAD) { dev_warn(adev->dev, "This GPU is in BAD status."); dev_warn(adev->dev, "Please retire it or setting one bigger " "threshold value when reloading driver.\n"); *exceed_err_limit = true; } -err: - mutex_unlock(>tbl_mutex); return 0; } -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2] drm/amdgpu: reserve backup pages for bad page retirment
it's not user friendly that users' visiable unused memories are decreased when bad pages are retired. Therefore reserve limit backup pages when init, and return ones when bad pages retired, to keep no change of unused memory size. v2: refine codes to calculate badpags threshold Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index b7ee587484b2..ff4387bbfb1e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -170,7 +170,7 @@ struct amdgpu_mgpu_info mgpu_info = { }; int amdgpu_ras_enable = -1; uint amdgpu_ras_mask = 0x; -int amdgpu_bad_page_threshold = -1; +int amdgpu_bad_page_threshold = 100; /** * DOC: vramlimit (int) @@ -804,7 +804,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 0444); * faulty pages by ECC exceed threshold value and leave it for user's further * check. */ -MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)"); +MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto, 0 = disable bad page retirement, 100 = default value"); module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444); MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 93699ea4860c..09546dec40ff 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1747,13 +1747,14 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, return ret; } -static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, - uint32_t max_length) +static uint32_t +amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev) { - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); int tmp_threshold = amdgpu_bad_page_threshold; u64 val; + uint32_t max_length = 0; + max_length = amdgpu_ras_eeprom_get_record_max_length(); /* * Justification of value bad_page_cnt_threshold in ras structure * @@ -1779,20 +1780,18 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, tmp_threshold = max_length; if (tmp_threshold == -1) { - val = adev->gmc.mc_vram_size; + val = adev->gmc.real_vram_size; do_div(val, RAS_BAD_PAGE_RATE); - con->bad_page_cnt_threshold = min(lower_32_bits(val), - max_length); - } else { - con->bad_page_cnt_threshold = tmp_threshold; + tmp_threshold = min(lower_32_bits(val), max_length); } + + return tmp_threshold; } int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; - uint32_t max_eeprom_records_len = 0; bool exc_err_limit = false; int ret; @@ -1812,8 +1811,16 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) atomic_set(>in_recovery, 0); con->adev = adev; - max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); - amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); + if (!con->bad_page_cnt_threshold) { + con->bad_page_cnt_threshold = + amdgpu_ras_calculate_badpags_threshold(adev); + + ret = amdgpu_vram_mgr_reserve_backup_pages( + ttm_manager_type(>mman.bdev, TTM_PL_VRAM), + con->bad_page_cnt_threshold); + if (ret) + goto out; + } ret = amdgpu_ras_eeprom_init(>eeprom_control, _err_limit); /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index 69ba8dd4f3ee..927d33d75c22 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -52,6 +52,8 @@ struct amdgpu_vram_mgr { spinlock_t lock; struct list_head reservations_pending; struct list_head reserved_pages; + struct list_head backup_pages; + uint32_t num_backup_pages; atomic64_t usage; atomic64_t vis_usage; }; @@ -127,6 +129,8 @@ uint64_t amdgpu_vram_mgr_usage(struct ttm_resource_manager *man); uint64_t amdgpu_vram_mgr_vis_usage(struct ttm_resource_manager *man); int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man, uint64_t start, uint64_t size); +int amdgpu_vram_mgr_reserve_backup_pages(struct ttm_resource_manager *man, +
[PATCH] drm/amdgpu: reserve backup pages for bad page retirment
it's not user friendly that users' visiable unused memories are decreased when bad pages are retired. Therefore reserve limit backup pages when init, and return ones when bad pages retired, to keep no change of unused memory size. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index b7ee587484b2..ff4387bbfb1e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -170,7 +170,7 @@ struct amdgpu_mgpu_info mgpu_info = { }; int amdgpu_ras_enable = -1; uint amdgpu_ras_mask = 0x; -int amdgpu_bad_page_threshold = -1; +int amdgpu_bad_page_threshold = 100; /** * DOC: vramlimit (int) @@ -804,7 +804,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 0444); * faulty pages by ECC exceed threshold value and leave it for user's further * check. */ -MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)"); +MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto, 0 = disable bad page retirement, 100 = default value"); module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444); MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 93699ea4860c..fb1c3f6cef29 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1779,7 +1779,7 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, tmp_threshold = max_length; if (tmp_threshold == -1) { - val = adev->gmc.mc_vram_size; + val = adev->gmc.real_vram_size; do_div(val, RAS_BAD_PAGE_RATE); con->bad_page_cnt_threshold = min(lower_32_bits(val), max_length); @@ -1812,8 +1812,16 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) atomic_set(>in_recovery, 0); con->adev = adev; - max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); - amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); + if (!con->bad_page_cnt_threshold) { + max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); + amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); + + ret = amdgpu_vram_mgr_reserve_backup_pages( + ttm_manager_type(>mman.bdev, TTM_PL_VRAM), + con->bad_page_cnt_threshold); + if (ret) + goto out; + } ret = amdgpu_ras_eeprom_init(>eeprom_control, _err_limit); /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index 69ba8dd4f3ee..927d33d75c22 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -52,6 +52,8 @@ struct amdgpu_vram_mgr { spinlock_t lock; struct list_head reservations_pending; struct list_head reserved_pages; + struct list_head backup_pages; + uint32_t num_backup_pages; atomic64_t usage; atomic64_t vis_usage; }; @@ -127,6 +129,8 @@ uint64_t amdgpu_vram_mgr_usage(struct ttm_resource_manager *man); uint64_t amdgpu_vram_mgr_vis_usage(struct ttm_resource_manager *man); int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man, uint64_t start, uint64_t size); +int amdgpu_vram_mgr_reserve_backup_pages(struct ttm_resource_manager *man, +uint32_t num_pages); int amdgpu_vram_mgr_query_page_status(struct ttm_resource_manager *man, uint64_t start); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 21d18efca277..b325b067926b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -28,6 +28,9 @@ #include "amdgpu_atomfirmware.h" #include "atom.h" +static int amdgpu_vram_mgr_free_backup_pages(struct amdgpu_vram_mgr *mgr, +uint32_t num_pages); + static inline struct amdgpu_vram_mgr *to_vram_mgr(struct ttm_resource_manager *man) { return container_of(man, struct amdgpu_vram_mgr, manager); @@ -189,6 +192,7 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev) spin_lock_init(>lock); INIT_LIST_HEAD(>reservations_pending); INIT_LIST_HEAD(>reserved_pages); + INIT_LIST_HEAD(>backup_pages); /* Add the two VRAM-related sysfs files */ ret = sysfs_create_files(>d
[PATCH] drm/amdgpu: Fix issue no bad_pages after umc ue injection
old code wrongly used the bad page status as the function return value, which cause amdgpu_ras_badpages_read always return failed. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c136bd449744..82e952696d24 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1518,7 +1518,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; int i = 0; - int ret = 0; + int ret = 0, status; if (!con || !con->eh_data || !bps || !count) return -EINVAL; @@ -1543,12 +1543,12 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, .size = AMDGPU_GPU_PAGE_SIZE, .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED, }; - ret = amdgpu_vram_mgr_query_page_status( + status = amdgpu_vram_mgr_query_page_status( ttm_manager_type(>mman.bdev, TTM_PL_VRAM), data->bps[i].retired_page); - if (ret == -EBUSY) + if (status == -EBUSY) (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING; - else if (ret == -ENOENT) + else if (status == -ENOENT) (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT; } -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2] drm/amdgpu: fix a GPU hang issue when remove device
When GFXOFF is enabled and GPU is idle, driver will fail to access some registers. Therefore change to disable power gating before all access registers with MMIO. Dmesg log is as following: amdgpu :03:00.0: amdgpu: amdgpu: finishing device. amdgpu: cp queue pipe 4 queue 0 preemption failed amdgpu :03:00.0: amdgpu: failed to write reg 2890 wait reg 28a2 amdgpu :03:00.0: amdgpu: failed to write reg 1a6f4 wait reg 1a706 amdgpu :03:00.0: amdgpu: failed to write reg 2890 wait reg 28a2 amdgpu :03:00.0: amdgpu: failed to write reg 1a6f4 wait reg 1a706 Signed-off-by: Dennis Li Change-Id: I42431f5d0bf54909e1df888a0d72fc009d8e196c diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1cb7d73f7317..b69c34074d8d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2548,11 +2548,11 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_remove_device(adev); - amdgpu_amdkfd_device_fini(adev); - amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); + amdgpu_amdkfd_device_fini(adev); + /* need to disable SMC first */ for (i = 0; i < adev->num_ip_blocks; i++) { if (!adev->ip_blocks[i].status.hw) -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: fix a GPU hang issue when remove device
When GFXOFF is enabled and GPU is idle, driver will fail to access some registers. Therefore disable GFXOFF before unload device. amdgpu :03:00.0: amdgpu: amdgpu: finishing device. amdgpu: cp queue pipe 4 queue 0 preemption failed amdgpu :03:00.0: amdgpu: failed to write reg 2890 wait reg 28a2 amdgpu :03:00.0: amdgpu: failed to write reg 1a6f4 wait reg 1a706 amdgpu :03:00.0: amdgpu: failed to write reg 2890 wait reg 28a2 amdgpu :03:00.0: amdgpu: failed to write reg 1a6f4 wait reg 1a706 Signed-off-by: Dennis Li Change-Id: I42431f5d0bf54909e1df888a0d72fc009d8e196c diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index e365c4fdcfe3..47d1291d5053 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -83,6 +83,8 @@ void amdgpu_driver_unload_kms(struct drm_device *dev) if (adev == NULL) return; + amdgpu_gfx_off_ctrl(adev, false); + amdgpu_unregister_gpu_instance(adev); if (adev->rmmio == NULL) -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: fix a memory protection fault when remove amdgpu device
ASD and TA share the same firmware in SIENNA_CICHLID and only TA firmware is requested during boot, so only need release TA firmware when remove device. [ 83.877150] general protection fault, probably for non-canonical address 0x1269f97e6ed04095: [#1] SMP PTI [ 83.888076] CPU: 0 PID: 1312 Comm: modprobe Tainted: GW OE 5.9.0-rc5-deli-amd-vangogh-0.0.6.6-114-gdd99d5669a96-dirty #2 [ 83.901160] Hardware name: System manufacturer System Product Name/TUF Z370-PLUS GAMING II, BIOS 0411 09/21/2018 [ 83.912353] RIP: 0010:free_fw_priv+0xc/0x120 [ 83.917531] Code: e8 99 cd b0 ff b8 a1 ff ff ff eb 9f 4c 89 f7 e8 8a cd b0 ff b8 f4 ff ff ff eb 90 0f 1f 00 0f 1f 44 00 00 55 48 89 e5 41 54 53 <4c> 8b 67 18 48 89 fb 4c 89 e7 e8 45 94 41 00 b8 ff ff ff ff f0 0f [ 83.937576] RSP: 0018:bc34c13a3ce0 EFLAGS: 00010206 [ 83.943699] RAX: bb681850 RBX: a047f117eb60 RCX: 80800055 [ 83.951879] RDX: bc34c1d5f000 RSI: 80800055 RDI: 1269f97e6ed04095 [ 83.959955] RBP: bc34c13a3cf0 R08: R09: 0001 [ 83.968107] R10: bc34c13a3cc8 R11: ff00 R12: a047d6b23378 [ 83.976166] R13: a047d6b23338 R14: a047d6b240c8 R15: [ 83.984295] FS: 7f74f6712540() GS:a047fbe0() knlGS: [ 83.993323] CS: 0010 DS: ES: CR0: 80050033 [ 84.56] CR2: 556a1cca4e18 CR3: 00021faa8004 CR4: 003706f0 [ 84.008128] DR0: DR1: DR2: [ 84.016155] DR3: DR6: fffe0ff0 DR7: 0400 [ 84.024174] Call Trace: [ 84.027514] release_firmware.part.11+0x4b/0x70 [ 84.033017] release_firmware+0x13/0x20 [ 84.037803] psp_sw_fini+0x77/0xb0 [amdgpu] [ 84.042857] amdgpu_device_fini+0x38c/0x5d0 [amdgpu] [ 84.048815] amdgpu_driver_unload_kms+0x43/0x70 [amdgpu] [ 84.055055] drm_dev_unregister+0x73/0xb0 [drm] [ 84.060499] drm_dev_unplug+0x28/0x30 [drm] [ 84.065598] amdgpu_dev_uninit+0x1b/0x40 [amdgpu] [ 84.071223] amdgpu_pci_remove+0x4e/0x70 [amdgpu] [ 84.076835] pci_device_remove+0x3e/0xc0 [ 84.081609] device_release_driver_internal+0xfb/0x1c0 [ 84.087558] driver_detach+0x4d/0xa0 [ 84.092041] bus_remove_driver+0x5f/0xe0 [ 84.096854] driver_unregister+0x2f/0x50 [ 84.101594] pci_unregister_driver+0x22/0xa0 [ 84.106806] amdgpu_exit+0x15/0x2b [amdgpu] Signed-off-by: Dennis Li Change-Id: Icc981a421499dff844855d5a662e91d1730c2754 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index eb19ae734396..b44b46dd60f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -564,7 +564,7 @@ static int psp_asd_load(struct psp_context *psp) * add workaround to bypass it for sriov now. * TODO: add version check to make it common */ - if (amdgpu_sriov_vf(psp->adev) || !psp->asd_fw) + if (amdgpu_sriov_vf(psp->adev) || !psp->asd_ucode_size) return 0; cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL); @@ -2779,11 +2779,10 @@ static int parse_ta_bin_descriptor(struct psp_context *psp, switch (desc->fw_type) { case TA_FW_TYPE_PSP_ASD: - psp->asd_fw_version= le32_to_cpu(desc->fw_version); + psp->asd_fw_version= le32_to_cpu(desc->fw_version); psp->asd_feature_version = le32_to_cpu(desc->fw_version); - psp->asd_ucode_size= le32_to_cpu(desc->size_bytes); + psp->asd_ucode_size= le32_to_cpu(desc->size_bytes); psp->asd_start_addr= ucode_start_addr; - psp->asd_fw= psp->ta_fw; break; case TA_FW_TYPE_PSP_XGMI: psp->ta_xgmi_ucode_version = le32_to_cpu(desc->fw_version); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 2/3] drm/amdgpu: remove redundant GPU reset
Because bad pages saving has been moved to UMC error interrupt callback, which will trigger a new GPU reset after saving. Signed-off-by: Dennis Li --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h| 10 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 16 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 0926c0770d7a..7c39d706e6d1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -33,7 +33,6 @@ #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0) #define AMDGPU_RAS_FLAG_INIT_NEED_RESET(0x1 << 1) -#define AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV (0x1 << 2) enum amdgpu_ras_block { AMDGPU_RAS_BLOCK__UMC = 0, @@ -513,14 +512,7 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - /* -* Save bad page to eeprom before gpu reset, i2c may be unstable -* in gpu reset. -* -* Also, exclude the case when ras recovery issuer is -* eeprom page write itself. -*/ - if (!(ras->flags & AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV) && in_task()) + if (in_task()) amdgpu_ras_reserve_bad_pages(adev); if (atomic_cmpxchg(>in_recovery, 0, 1) == 0) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 695bcfc5c983..c3710c591b55 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -479,7 +479,6 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, int i, ret = 0; struct i2c_msg *msgs, *msg; unsigned char *buffs, *buff; - bool sched_ras_recovery = false; struct eeprom_table_record *record; struct amdgpu_device *adev = to_amdgpu_device(control); struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); @@ -517,7 +516,6 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, "Saved bad pages(%d) reaches threshold value(%d).\n", control->num_recs + num, ras->bad_page_cnt_threshold); control->tbl_hdr.header = EEPROM_TABLE_HDR_BAD; - sched_ras_recovery = true; } /* In case of overflow just start from beginning to not lose newest records */ @@ -603,20 +601,6 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, __update_tbl_checksum(control, records, num, old_hdr_byte_sum); __update_table_header(control, buffs); - - if (sched_ras_recovery) { - /* -* Before scheduling ras recovery, assert the related -* flag first, which shall bypass common bad page -* reservation execution in amdgpu_ras_reset_gpu. -*/ - amdgpu_ras_get_context(adev)->flags |= - AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV; - - dev_warn(adev->dev, "Conduct ras recovery due to bad " - "page threshold reached.\n"); - amdgpu_ras_reset_gpu(adev); - } } else if (!__validate_tbl_checksum(control, records, num)) { DRM_WARN("EEPROM Table checksum mismatch!"); /* TODO Uncomment when EEPROM read/write is relliable */ -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 3/3] drm/amdgpu: fix the issue of reserving bad pages failed
In amdgpu_ras_reset_gpu, because bad pages may not be freed, it has high probability to reserve bad pages failed. Change to reserve bad pages when freeing VRAM. v2: 1. avoid allocating the drm_mm node outside of amdgpu_vram_mgr.c 2. move bad page reserving into amdgpu_ras_add_bad_pages, if vram mgr reserve bad page failed, it will put it into pending list, otherwise put it into processed list; 3. remove amdgpu_ras_release_bad_pages, because retired page's info has been moved into amdgpu_vram_mgr v3: 1. formate code style; 2. rename amdgpu_vram_reserve_scope as amdgpu_vram_reservation; 3. rename scope_pending as reservations_pending; 4. rename scope_processed as reserved_pages; 5. change to iterate over all the pending ones and try to insert them with drm_mm_reserve_node(); v4: 1. rename amdgpu_vram_mgr_reserve_scope as amdgpu_vram_mgr_reserve_range; 2. remove unused include "amdgpu_ras.h"; 3. rename amdgpu_vram_mgr_check_and_reserve as amdgpu_vram_mgr_do_reserve; 4. refine amdgpu_vram_mgr_reserve_range to call amdgpu_vram_mgr_do_reserve. Signed-off-by: Dennis Li Signed-off-by: Wenhui Sheng --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 150 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 8 - drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 4 + drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 115 ++ 4 files changed, 158 insertions(+), 119 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0f57a0003df6..84bb55ab6ac5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -80,6 +80,8 @@ enum amdgpu_ras_retire_page_reservation { atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); +static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, + uint64_t addr); static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, uint64_t addr); @@ -1573,10 +1575,12 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, .size = AMDGPU_GPU_PAGE_SIZE, .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED, }; - - if (data->last_reserved <= i) + ret = amdgpu_vram_mgr_query_page_status( + >mman.bdev.man[TTM_PL_VRAM], + data->bps[i].retired_page); + if (ret == -EBUSY) (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING; - else if (data->bps_bo[i] == NULL) + else if (ret == -ENOENT) (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT; } @@ -1628,12 +1632,9 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, unsigned int new_space = old_space + pages; unsigned int align_space = ALIGN(new_space, 512); void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); - struct amdgpu_bo **bps_bo = - kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL); - if (!bps || !bps_bo) { + if (!bps) { kfree(bps); - kfree(bps_bo); return -ENOMEM; } @@ -1642,14 +1643,8 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, data->count * sizeof(*data->bps)); kfree(data->bps); } - if (data->bps_bo) { - memcpy(bps_bo, data->bps_bo, - data->count * sizeof(*data->bps_bo)); - kfree(data->bps_bo); - } data->bps = bps; - data->bps_bo = bps_bo; data->space_left += align_space - old_space; return 0; } @@ -1661,6 +1656,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; int ret = 0; + uint32_t i; if (!con || !con->eh_data || !bps || pages <= 0) return 0; @@ -1670,16 +1666,26 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, if (!data) goto out; - if (data->space_left <= pages) - if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) { + for (i = 0; i < pages; i++) { + if (amdgpu_ras_check_bad_page_unlock(con, + bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) + continue; + + if (!data->space_left && + amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { ret = -ENOMEM; goto out; } - memcpy(>bps[data->count], bps, pages * sizeof(*data->bps)); -
[PATCH 1/3] drm/amdgpu: change to save bad pages in UMC error interrupt callback
Instead of saving bad pages in amdgpu_ras_reset_gpu, it will reduce the unnecessary calling of amdgpu_ras_save_bad_pages. Signed-off-by: Dennis Li --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 7 --- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 7f79d25fbccc..0f57a0003df6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1690,7 +1690,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, * write error record array to eeprom, the function should be * protected by recovery_lock */ -static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) +int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; @@ -1863,9 +1863,6 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) data->last_reserved = i + 1; bo = NULL; } - - /* continue to save bad pages to eeprom even reesrve_vram fails */ - ret = amdgpu_ras_save_bad_pages(adev); out: mutex_unlock(>recovery_lock); return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 6b8d7bb83bb3..0926c0770d7a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -506,6 +506,7 @@ bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev); int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, struct eeprom_table_record *bps, int pages); +int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev); int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev); static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 262baf0f61ea..a2975c8092a9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -126,10 +126,11 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, err_data->ue_count); if ((amdgpu_bad_page_threshold != 0) && - err_data->err_addr_cnt && + err_data->err_addr_cnt) { amdgpu_ras_add_bad_pages(adev, err_data->err_addr, - err_data->err_addr_cnt)) - dev_warn(adev->dev, "Failed to add ras bad page!\n"); + err_data->err_addr_cnt); + amdgpu_ras_save_bad_pages(adev); + } amdgpu_ras_reset_gpu(adev); } -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 0/3] Refine the codes about reseving bad pages.
Beside umc, others' UE interrupt callback could enter into amdgpu_ras_reset_gpu, so the first patch change to save bad pages in UMC error interrupt callback. When bad page error happens, the bad page mostly still be hold by some process, therefore driver will fail to reserve the bad page. The third patch will reserve the bad page when freeing it, make system has no chance to allocate it to other proccess. Dennis Li (3): drm/amdgpu: change to save bad pages in UMC error interrupt callback drm/amdgpu: remove redundant GPU reset drm/amdgpu: fix the issue of reserving bad pages failed drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 155 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 17 +- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 16 -- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 4 + drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 7 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 115 + 6 files changed, 164 insertions(+), 150 deletions(-) -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: protect eeprom update from GPU reset
because i2c is unstable in GPU reset, driver need protect eeprom update from GPU reset, to not miss any bad page record. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 0e64c39a2372..695bcfc5c983 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -149,7 +149,11 @@ static int __update_table_header(struct amdgpu_ras_eeprom_control *control, msg.addr = control->i2c_address; + /* i2c may be unstable in gpu reset */ + down_read(>reset_sem); ret = i2c_transfer(>pm.smu_i2c, , 1); + up_read(>reset_sem); + if (ret < 1) DRM_ERROR("Failed to write EEPROM table header, ret:%d", ret); @@ -557,7 +561,11 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, control->next_addr += EEPROM_TABLE_RECORD_SIZE; } + /* i2c may be unstable in gpu reset */ + down_read(>reset_sem); ret = i2c_transfer(>pm.smu_i2c, msgs, num); + up_read(>reset_sem); + if (ret < 1) { DRM_ERROR("Failed to process EEPROM table records, ret:%d", ret); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdkfd: fix a memory leak issue
In the resume stage of GPU recovery, start_cpsch will call pm_init which set pm->allocated as false, cause the next pm_release_ib has no chance to release ib memory. Add pm_release_ib in stop_cpsch which will be called in the suspend stage of GPU recovery. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 069ba4be1e8f..20ef048d6a03 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1192,6 +1192,8 @@ static int stop_cpsch(struct device_queue_manager *dqm) dqm->sched_running = false; dqm_unlock(dqm); + pm_release_ib(>packets); + kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); pm_uninit(>packets, hanging); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2] drm/kfd: fix a system crash issue during GPU recovery
The crash log as the below: [Thu Aug 20 23:18:14 2020] general protection fault: [#1] SMP NOPTI [Thu Aug 20 23:18:14 2020] CPU: 152 PID: 1837 Comm: kworker/152:1 Tainted: G OE 5.4.0-42-generic #46~18.04.1-Ubuntu [Thu Aug 20 23:18:14 2020] Hardware name: GIGABYTE G482-Z53-YF/MZ52-G40-00, BIOS R12 05/13/2020 [Thu Aug 20 23:18:14 2020] Workqueue: events amdgpu_ras_do_recovery [amdgpu] [Thu Aug 20 23:18:14 2020] RIP: 0010:evict_process_queues_cpsch+0xc9/0x130 [amdgpu] [Thu Aug 20 23:18:14 2020] Code: 49 8d 4d 10 48 39 c8 75 21 eb 44 83 fa 03 74 36 80 78 72 00 74 0c 83 ab 68 01 00 00 01 41 c6 45 41 00 48 8b 00 48 39 c8 74 25 <80> 78 70 00 c6 40 6d 01 74 ee 8b 50 28 c6 40 70 00 83 ab 60 01 00 [Thu Aug 20 23:18:14 2020] RSP: 0018:b29b52f6fc90 EFLAGS: 00010213 [Thu Aug 20 23:18:14 2020] RAX: 1c884edb0a118914 RBX: 8a0d45ff3c00 RCX: 8a2d83e41038 [Thu Aug 20 23:18:14 2020] RDX: RSI: 0082 RDI: 8a0e2e4178c0 [Thu Aug 20 23:18:14 2020] RBP: b29b52f6fcb0 R08: 1b64 R09: 0004 [Thu Aug 20 23:18:14 2020] R10: b29b52f6fb78 R11: 0001 R12: 8a0d45ff3d28 [Thu Aug 20 23:18:14 2020] R13: 8a2d83e41028 R14: R15: [Thu Aug 20 23:18:14 2020] FS: () GS:8a0e2e40() knlGS: [Thu Aug 20 23:18:14 2020] CS: 0010 DS: ES: CR0: 80050033 [Thu Aug 20 23:18:14 2020] CR2: 55c783c0e6a8 CR3: 0034a1284000 CR4: 00340ee0 [Thu Aug 20 23:18:14 2020] Call Trace: [Thu Aug 20 23:18:14 2020] kfd_process_evict_queues+0x43/0xd0 [amdgpu] [Thu Aug 20 23:18:14 2020] kfd_suspend_all_processes+0x60/0xf0 [amdgpu] [Thu Aug 20 23:18:14 2020] kgd2kfd_suspend.part.7+0x43/0x50 [amdgpu] [Thu Aug 20 23:18:14 2020] kgd2kfd_pre_reset+0x46/0x60 [amdgpu] [Thu Aug 20 23:18:14 2020] amdgpu_amdkfd_pre_reset+0x1a/0x20 [amdgpu] [Thu Aug 20 23:18:14 2020] amdgpu_device_gpu_recover+0x377/0xf90 [amdgpu] [Thu Aug 20 23:18:14 2020] ? amdgpu_ras_error_query+0x1b8/0x2a0 [amdgpu] [Thu Aug 20 23:18:14 2020] amdgpu_ras_do_recovery+0x159/0x190 [amdgpu] [Thu Aug 20 23:18:14 2020] process_one_work+0x20f/0x400 [Thu Aug 20 23:18:14 2020] worker_thread+0x34/0x410 When GPU hang, user process will fail to create a compute queue whose struct object will be freed later, but driver wrongly add this queue to queue list of the proccess. And then kfd_process_evict_queues will access a freed memory, which cause a system crash. v2: The failure to execute_queues should probably not be reported to the caller of create_queue, because the queue was already created. Therefore change to ignore the return value from execute_queues. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 560adc57a050..069ba4be1e8f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1302,7 +1302,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, if (q->properties.is_active) { increment_queue_count(dqm, q->properties.type); - retval = execute_queues_cpsch(dqm, + execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); } -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2] drm/amdgpu: block ring buffer access during GPU recovery
When GPU is in reset, its status isn't stable and ring buffer also need be reset when resuming. Therefore driver should protect GPU recovery thread from ring buffer accessed by other threads. Otherwise GPU will randomly hang during recovery. v2: correct indent Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 172dc47b7f39..9b586bc80c38 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -319,8 +319,12 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, { uint32_t ret; - if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) - return amdgpu_kiq_rreg(adev, reg); + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && + down_read_trylock(>reset_sem)) { + ret = amdgpu_kiq_rreg(adev, reg); + up_read(>reset_sem); + return ret; + } if ((reg * 4) < adev->rmmio_size) ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); @@ -332,6 +336,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); spin_unlock_irqrestore(>mmio_idx_lock, flags); } + trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret); return ret; } @@ -407,8 +412,12 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { - if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) - return amdgpu_kiq_wreg(adev, reg, v); + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && + down_read_trylock(>reset_sem)) { + amdgpu_kiq_wreg(adev, reg, v); + up_read(>reset_sem); + return; + } amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags); } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index ad9ad622ccce..31359e519d69 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -287,8 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, */ if (adev->gfx.kiq.ring.sched.ready && (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) && - !amdgpu_in_reset(adev)) { - + down_read_trylock(>reset_sem)) { struct amdgpu_vmhub *hub = >vmhub[vmhub]; const unsigned eng = 17; u32 inv_req = hub->vmhub_funcs->get_invalidate_req(vmid, flush_type); @@ -297,6 +296,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req, 1 << vmid); + + up_read(>reset_sem); return; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index e1a0ae327cf5..c602ddc68384 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -500,13 +500,14 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, * as GFXOFF under bare metal */ if (adev->gfx.kiq.ring.sched.ready && - (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) && - !amdgpu_in_reset(adev)) { + (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) && + down_read_trylock(>reset_sem)) { uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng; uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng; amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req, 1 << vmid); + up_read(>reset_sem); return; } @@ -599,7 +600,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev, if (amdgpu_in_reset(adev)) return -EIO; - if (ring->sched.ready) { + if (ring->sched.ready && down_read_trylock(>reset_sem)) { /* Vega20+XGMI caches PTEs in TC and TLB. Add a * heavy-weight TLB flush (type 2), which flushes * both. Due to a race condition with concurrent @@ -626,6 +627,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev, if (r) { amdgpu_ring
[PATCH] drm/kfd: fix a system crash issue during GPU recovery
The crash log as the below: [Thu Aug 20 23:18:14 2020] general protection fault: [#1] SMP NOPTI [Thu Aug 20 23:18:14 2020] CPU: 152 PID: 1837 Comm: kworker/152:1 Tainted: G OE 5.4.0-42-generic #46~18.04.1-Ubuntu [Thu Aug 20 23:18:14 2020] Hardware name: GIGABYTE G482-Z53-YF/MZ52-G40-00, BIOS R12 05/13/2020 [Thu Aug 20 23:18:14 2020] Workqueue: events amdgpu_ras_do_recovery [amdgpu] [Thu Aug 20 23:18:14 2020] RIP: 0010:evict_process_queues_cpsch+0xc9/0x130 [amdgpu] [Thu Aug 20 23:18:14 2020] Code: 49 8d 4d 10 48 39 c8 75 21 eb 44 83 fa 03 74 36 80 78 72 00 74 0c 83 ab 68 01 00 00 01 41 c6 45 41 00 48 8b 00 48 39 c8 74 25 <80> 78 70 00 c6 40 6d 01 74 ee 8b 50 28 c6 40 70 00 83 ab 60 01 00 [Thu Aug 20 23:18:14 2020] RSP: 0018:b29b52f6fc90 EFLAGS: 00010213 [Thu Aug 20 23:18:14 2020] RAX: 1c884edb0a118914 RBX: 8a0d45ff3c00 RCX: 8a2d83e41038 [Thu Aug 20 23:18:14 2020] RDX: RSI: 0082 RDI: 8a0e2e4178c0 [Thu Aug 20 23:18:14 2020] RBP: b29b52f6fcb0 R08: 1b64 R09: 0004 [Thu Aug 20 23:18:14 2020] R10: b29b52f6fb78 R11: 0001 R12: 8a0d45ff3d28 [Thu Aug 20 23:18:14 2020] R13: 8a2d83e41028 R14: R15: [Thu Aug 20 23:18:14 2020] FS: () GS:8a0e2e40() knlGS: [Thu Aug 20 23:18:14 2020] CS: 0010 DS: ES: CR0: 80050033 [Thu Aug 20 23:18:14 2020] CR2: 55c783c0e6a8 CR3: 0034a1284000 CR4: 00340ee0 [Thu Aug 20 23:18:14 2020] Call Trace: [Thu Aug 20 23:18:14 2020] kfd_process_evict_queues+0x43/0xd0 [amdgpu] [Thu Aug 20 23:18:14 2020] kfd_suspend_all_processes+0x60/0xf0 [amdgpu] [Thu Aug 20 23:18:14 2020] kgd2kfd_suspend.part.7+0x43/0x50 [amdgpu] [Thu Aug 20 23:18:14 2020] kgd2kfd_pre_reset+0x46/0x60 [amdgpu] [Thu Aug 20 23:18:14 2020] amdgpu_amdkfd_pre_reset+0x1a/0x20 [amdgpu] [Thu Aug 20 23:18:14 2020] amdgpu_device_gpu_recover+0x377/0xf90 [amdgpu] [Thu Aug 20 23:18:14 2020] ? amdgpu_ras_error_query+0x1b8/0x2a0 [amdgpu] [Thu Aug 20 23:18:14 2020] amdgpu_ras_do_recovery+0x159/0x190 [amdgpu] [Thu Aug 20 23:18:14 2020] process_one_work+0x20f/0x400 [Thu Aug 20 23:18:14 2020] worker_thread+0x34/0x410 When GPU hang, user process will fail to create a compute queue whose struct object will be freed later, but driver wrongly add this queue to queue list of the proccess. And then kfd_process_evict_queues will access a freed memory, which cause a system crash. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 560adc57a050..d5e6b07ffb27 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1296,16 +1296,18 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, mqd_mgr->init_mqd(mqd_mgr, >mqd, q->mqd_mem_obj, >gart_mqd_addr, >properties); - list_add(>list, >queues_list); - qpd->queue_count++; - if (q->properties.is_active) { increment_queue_count(dqm, q->properties.type); retval = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + if (retval) + goto out_execute_cpsch; } + list_add(>list, >queues_list); + qpd->queue_count++; + /* * Unconditionally increment this counter, regardless of the queue's * type or whether the queue is active. @@ -1318,6 +1320,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, dqm_unlock(dqm); return retval; +out_execute_cpsch: + decrement_queue_count(dqm, q->properties.type); + dqm_unlock(dqm); out_deallocate_doorbell: deallocate_doorbell(qpd, q); out_deallocate_sdma_queue: -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: block ring buffer access during GPU recovery
When GPU is in reset, its status isn't stable and ring buffer also need be reset when resuming. Therefore driver should protect GPU recovery thread from ring buffer accessed by other threads. Otherwise GPU will randomly hang during recovery. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 172dc47b7f39..8db56a22cd1b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -319,8 +319,13 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, { uint32_t ret; - if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) - return amdgpu_kiq_rreg(adev, reg); + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && + amdgpu_sriov_runtime(adev) && + down_read_trylock(>reset_sem)) { + ret = amdgpu_kiq_rreg(adev, reg); + up_read(>reset_sem); + return ret; + } if ((reg * 4) < adev->rmmio_size) ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); @@ -332,6 +337,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); spin_unlock_irqrestore(>mmio_idx_lock, flags); } + trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret); return ret; } @@ -407,8 +413,13 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { - if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) - return amdgpu_kiq_wreg(adev, reg, v); + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && + amdgpu_sriov_runtime(adev) && + down_read_trylock(>reset_sem)) { + amdgpu_kiq_wreg(adev, reg, v); + up_read(>reset_sem); + return; + } amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags); } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index ad9ad622ccce..4ea2a065daa9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -287,7 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, */ if (adev->gfx.kiq.ring.sched.ready && (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) && - !amdgpu_in_reset(adev)) { + down_read_trylock(>reset_sem)) { struct amdgpu_vmhub *hub = >vmhub[vmhub]; const unsigned eng = 17; @@ -297,6 +297,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req, 1 << vmid); + + up_read(>reset_sem); return; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index e1a0ae327cf5..33b7cf1c79ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -501,12 +501,13 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, */ if (adev->gfx.kiq.ring.sched.ready && (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) && - !amdgpu_in_reset(adev)) { + down_read_trylock(>reset_sem)) { uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng; uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng; amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req, 1 << vmid); + up_read(>reset_sem); return; } @@ -599,7 +600,8 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev, if (amdgpu_in_reset(adev)) return -EIO; - if (ring->sched.ready) { + if (ring->sched.ready && +down_read_trylock(>reset_sem)) { /* Vega20+XGMI caches PTEs in TC and TLB. Add a * heavy-weight TLB flush (type 2), which flushes * both. Due to a race condition with concurrent @@ -626,6 +628,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev, if (r) { amdgpu_ring_undo(ring); spin_unlock(>gfx.kiq.ring_lock); + up_read(>reset_sem); return -ETIME;
[PATCH] drm/amdgpu: skip scheduling IBs when GPU recovery
If GPU begin to do recovery, skip scheduling IBs. Otherwise GPU recovery randomly fail. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index dcfe8a3b03ff..054d7b0357fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -212,6 +212,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) struct dma_fence *fence = NULL, *finished; struct amdgpu_job *job; int r = 0; + int locked; job = to_amdgpu_job(sched_job); finished = >base.s_fence->finished; @@ -220,6 +221,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) trace_amdgpu_sched_run_job(job); + locked = down_read_trylock(>adev->reset_sem); + if (!locked) + dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if GPU recovery */ + if (job->vram_lost_counter != atomic_read(>adev->vram_lost_counter)) dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */ @@ -231,6 +236,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) if (r) DRM_ERROR("Error scheduling IBs (%d)\n", r); } + + if (locked) + up_read(>adev->reset_sem); + /* if gpu reset, hw fence will be replaced here */ dma_fence_put(job->fence); job->fence = dma_fence_get(fence); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v3] drm/amdgpu: change reset lock from mutex to rw_semaphore
clients don't need reset-lock for synchronization when no GPU recovery. v2: change to return the return value of down_read_killable. v3: if GPU recovery begin, VF ignore FLR notification. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index c8aec832b244..ec11ed2a9ca4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -954,7 +954,7 @@ struct amdgpu_device { atomic_tin_gpu_reset; enum pp_mp1_state mp1_state; - struct mutex lock_reset; + struct rw_semaphore reset_sem; struct amdgpu_doorbell_index doorbell_index; struct mutexnotifier_lock; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 79b397800cbc..cc5c7f81c540 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -101,14 +101,18 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file) file->private_data = adev; - mutex_lock(>lock_reset); + ret = down_read_killable(>reset_sem); + if (ret) + return ret; + if (adev->autodump.dumping.done) { reinit_completion(>autodump.dumping); ret = 0; } else { ret = -EBUSY; } - mutex_unlock(>lock_reset); + + up_read(>reset_sem); return ret; } @@ -1242,7 +1246,9 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data) } /* Avoid accidently unparking the sched thread during GPU reset */ - mutex_lock(>lock_reset); + r = down_read_killable(>reset_sem); + if (r) + return r; /* hold on the scheduler */ for (i = 0; i < AMDGPU_MAX_RINGS; i++) { @@ -1269,7 +1275,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data) kthread_unpark(ring->sched.thread); } - mutex_unlock(>lock_reset); + up_read(>reset_sem); pm_runtime_mark_last_busy(dev->dev); pm_runtime_put_autosuspend(dev->dev); @@ -1459,7 +1465,9 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) return -ENOMEM; /* Avoid accidently unparking the sched thread during GPU reset */ - mutex_lock(>lock_reset); + r = down_read_killable(>reset_sem); + if (r) + goto pro_end; /* stop the scheduler */ kthread_park(ring->sched.thread); @@ -1500,13 +1508,14 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) /* restart the scheduler */ kthread_unpark(ring->sched.thread); - mutex_unlock(>lock_reset); + up_read(>reset_sem); ttm_bo_unlock_delayed_workqueue(>mman.bdev, resched); +pro_end: kfree(fences); - return 0; + return r; } static int amdgpu_debugfs_sclk_set(void *data, u64 val) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 78fd2c9a7b7d..82242e2f5658 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3054,7 +3054,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(>virt.vf_errors.lock); hash_init(adev->mn_hash); atomic_set(>in_gpu_reset, 0); - mutex_init(>lock_reset); + init_rwsem(>reset_sem); mutex_init(>psp.mutex); mutex_init(>notifier_lock); @@ -4206,7 +4206,7 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev) if (atomic_cmpxchg(>in_gpu_reset, 0, 1) != 0) return false; - mutex_lock(>lock_reset); + down_write(>reset_sem); atomic_inc(>gpu_reset_counter); switch (amdgpu_asic_reset_method(adev)) { @@ -4229,7 +4229,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) amdgpu_vf_error_trans_all(adev); adev->mp1_state = PP_MP1_STATE_NONE; atomic_set(>in_gpu_reset, 0); - mutex_unlock(>lock_reset); + up_write(>reset_sem); } static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index f27d83f2de78..9c07014d9bd6 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -238,19 +238,15 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; - int locked; /* block
[PATCH v2] drm/amdgpu: change reset lock from mutex to rw_semaphore
clients don't need reset-lock for synchronization when no GPU recovery. v2: change to return the return value of down_read_killable. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index c8aec832b244..ec11ed2a9ca4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -954,7 +954,7 @@ struct amdgpu_device { atomic_tin_gpu_reset; enum pp_mp1_state mp1_state; - struct mutex lock_reset; + struct rw_semaphore reset_sem; struct amdgpu_doorbell_index doorbell_index; struct mutexnotifier_lock; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 79b397800cbc..cc5c7f81c540 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -101,14 +101,18 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file) file->private_data = adev; - mutex_lock(>lock_reset); + ret = down_read_killable(>reset_sem); + if (ret) + return ret; + if (adev->autodump.dumping.done) { reinit_completion(>autodump.dumping); ret = 0; } else { ret = -EBUSY; } - mutex_unlock(>lock_reset); + + up_read(>reset_sem); return ret; } @@ -1242,7 +1246,9 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data) } /* Avoid accidently unparking the sched thread during GPU reset */ - mutex_lock(>lock_reset); + r = down_read_killable(>reset_sem); + if (r) + return r; /* hold on the scheduler */ for (i = 0; i < AMDGPU_MAX_RINGS; i++) { @@ -1269,7 +1275,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data) kthread_unpark(ring->sched.thread); } - mutex_unlock(>lock_reset); + up_read(>reset_sem); pm_runtime_mark_last_busy(dev->dev); pm_runtime_put_autosuspend(dev->dev); @@ -1459,7 +1465,9 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) return -ENOMEM; /* Avoid accidently unparking the sched thread during GPU reset */ - mutex_lock(>lock_reset); + r = down_read_killable(>reset_sem); + if (r) + goto pro_end; /* stop the scheduler */ kthread_park(ring->sched.thread); @@ -1500,13 +1508,14 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) /* restart the scheduler */ kthread_unpark(ring->sched.thread); - mutex_unlock(>lock_reset); + up_read(>reset_sem); ttm_bo_unlock_delayed_workqueue(>mman.bdev, resched); +pro_end: kfree(fences); - return 0; + return r; } static int amdgpu_debugfs_sclk_set(void *data, u64 val) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 78fd2c9a7b7d..82242e2f5658 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3054,7 +3054,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(>virt.vf_errors.lock); hash_init(adev->mn_hash); atomic_set(>in_gpu_reset, 0); - mutex_init(>lock_reset); + init_rwsem(>reset_sem); mutex_init(>psp.mutex); mutex_init(>notifier_lock); @@ -4206,7 +4206,7 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev) if (atomic_cmpxchg(>in_gpu_reset, 0, 1) != 0) return false; - mutex_lock(>lock_reset); + down_write(>reset_sem); atomic_inc(>gpu_reset_counter); switch (amdgpu_asic_reset_method(adev)) { @@ -4229,7 +4229,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) amdgpu_vf_error_trans_all(adev); adev->mp1_state = PP_MP1_STATE_NONE; atomic_set(>in_gpu_reset, 0); - mutex_unlock(>lock_reset); + up_write(>reset_sem); } static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index f27d83f2de78..8ac63f13fc6f 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -238,19 +238,12 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; - int locked; /* block amdgpu_gpu_recover till msg FLR COMPLETE received, *
[PATCH] drm/amdgpu: refine message print for devices of hive
Using dev_xxx instead of DRM_xxx/pr_xxx to indicate which device of a hive is the message for. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 81b1d9a1dca0..08548e051cc0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3370,7 +3370,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev) { int r; - DRM_INFO("amdgpu: finishing device.\n"); + dev_info(adev->dev, "amdgpu: finishing device.\n"); flush_delayed_work(>delayed_init_work); adev->shutdown = true; @@ -3555,12 +3555,12 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) if (amdgpu_device_need_post(adev)) { r = amdgpu_atom_asic_init(adev->mode_info.atom_context); if (r) - DRM_ERROR("amdgpu asic init failed\n"); + dev_err(adev->dev, "amdgpu asic init failed\n"); } r = amdgpu_device_ip_resume(adev); if (r) { - DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r); + dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); return r; } amdgpu_fence_driver_resume(adev); @@ -3584,7 +3584,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) if (r == 0) { r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); if (r != 0) - DRM_ERROR("Failed to pin cursor BO (%d)\n", r); + dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); amdgpu_bo_unreserve(aobj); } @@ -3674,7 +3674,7 @@ static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) adev->ip_blocks[i].status.hang = adev->ip_blocks[i].version->funcs->check_soft_reset(adev); if (adev->ip_blocks[i].status.hang) { - DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); + dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); asic_hang = true; } } @@ -3735,7 +3735,7 @@ static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { if (adev->ip_blocks[i].status.hang) { - DRM_INFO("Some block need full reset!\n"); + dev_info(adev->dev, "Some block need full reset!\n"); return true; } } @@ -3823,7 +3823,7 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev) else tmo = msecs_to_jiffies(100); - DRM_INFO("recover vram bo from shadow start\n"); + dev_info(adev->dev, "recover vram bo from shadow start\n"); mutex_lock(>shadow_list_lock); list_for_each_entry(shadow, >shadow_list, shadow_list) { @@ -3859,11 +3859,11 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev) dma_fence_put(fence); if (r < 0 || tmo <= 0) { - DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); + dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); return -EIO; } - DRM_INFO("recover vram bo from shadow done\n"); + dev_info(adev->dev, "recover vram bo from shadow done\n"); return 0; } @@ -3962,7 +3962,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev) bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) { if (!amdgpu_device_ip_check_soft_reset(adev)) { - DRM_INFO("Timeout, but no hardware hang detected.\n"); + dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); return false; } @@ -4002,7 +4002,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) return true; disabled: - DRM_INFO("GPU recovery disabled.\n"); +
[PATCH] drm/amdgpu: fix the nullptr issue when reenter GPU recovery
in single gpu system, if driver reenter gpu recovery, amdgpu_device_lock_adev will return false, but hive is nullptr now. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 82242e2f5658..81b1d9a1dca0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4371,8 +4371,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (!amdgpu_device_lock_adev(tmp_adev)) { DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress", job ? job->base.id : -1); - mutex_unlock(>hive_lock); - return 0; + r = 0; + goto skip_recovery; } /* @@ -4505,6 +4505,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, amdgpu_device_unlock_adev(tmp_adev); } +skip_recovery: if (hive) { atomic_set(>in_reset, 0); mutex_unlock(>hive_lock); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: change reset lock from mutex to rw_semaphore
clients don't need reset-lock for synchronization when no GPU recovery. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index c8aec832b244..ec11ed2a9ca4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -954,7 +954,7 @@ struct amdgpu_device { atomic_tin_gpu_reset; enum pp_mp1_state mp1_state; - struct mutex lock_reset; + struct rw_semaphore reset_sem; struct amdgpu_doorbell_index doorbell_index; struct mutexnotifier_lock; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 79b397800cbc..0090e850eab9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -101,14 +101,17 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file) file->private_data = adev; - mutex_lock(>lock_reset); + if (down_read_killable(>reset_sem)) + return -EINTR; + if (adev->autodump.dumping.done) { reinit_completion(>autodump.dumping); ret = 0; } else { ret = -EBUSY; } - mutex_unlock(>lock_reset); + + up_read(>reset_sem); return ret; } @@ -1242,7 +1245,8 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data) } /* Avoid accidently unparking the sched thread during GPU reset */ - mutex_lock(>lock_reset); + if (down_read_killable(>reset_sem)) + return -EINTR; /* hold on the scheduler */ for (i = 0; i < AMDGPU_MAX_RINGS; i++) { @@ -1269,7 +1273,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data) kthread_unpark(ring->sched.thread); } - mutex_unlock(>lock_reset); + up_read(>reset_sem); pm_runtime_mark_last_busy(dev->dev); pm_runtime_put_autosuspend(dev->dev); @@ -1459,7 +1463,10 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) return -ENOMEM; /* Avoid accidently unparking the sched thread during GPU reset */ - mutex_lock(>lock_reset); + if (down_read_killable(>reset_sem)) { + kfree(fences); + return -EINTR; + } /* stop the scheduler */ kthread_park(ring->sched.thread); @@ -1500,7 +1507,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) /* restart the scheduler */ kthread_unpark(ring->sched.thread); - mutex_unlock(>lock_reset); + up_read(>reset_sem); ttm_bo_unlock_delayed_workqueue(>mman.bdev, resched); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 78fd2c9a7b7d..82242e2f5658 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3054,7 +3054,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(>virt.vf_errors.lock); hash_init(adev->mn_hash); atomic_set(>in_gpu_reset, 0); - mutex_init(>lock_reset); + init_rwsem(>reset_sem); mutex_init(>psp.mutex); mutex_init(>notifier_lock); @@ -4206,7 +4206,7 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev) if (atomic_cmpxchg(>in_gpu_reset, 0, 1) != 0) return false; - mutex_lock(>lock_reset); + down_write(>reset_sem); atomic_inc(>gpu_reset_counter); switch (amdgpu_asic_reset_method(adev)) { @@ -4229,7 +4229,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) amdgpu_vf_error_trans_all(adev); adev->mp1_state = PP_MP1_STATE_NONE; atomic_set(>in_gpu_reset, 0); - mutex_unlock(>lock_reset); + up_write(>reset_sem); } static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index f27d83f2de78..8ac63f13fc6f 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -238,19 +238,12 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; - int locked; /* block amdgpu_gpu_recover till msg FLR COMPLETE received, * otherwise the mailbox msg will be ruined/reseted by * the VF FLR. -* -* we can unlock the lock_reset to allow "amdgpu_job_timedout" -* to run gpu_recover() aft
[PATCH v2] drm/amdgpu: refine codes to avoid reentering GPU recovery
if other threads have holden the reset lock, recovery will fail to try_lock. Therefore we introduce atomic hive->in_reset and adev->in_gpu_reset, to avoid reentering GPU recovery. v2: drop "? true : false" in the definition of amdgpu_in_reset Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 8ba389780001..c8aec832b244 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -952,7 +952,7 @@ struct amdgpu_device { boolin_suspend; boolin_hibernate; - boolin_gpu_reset; + atomic_tin_gpu_reset; enum pp_mp1_state mp1_state; struct mutex lock_reset; struct amdgpu_doorbell_index doorbell_index; @@ -1270,4 +1270,8 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev) return adev->gmc.tmz_enabled; } +static inline int amdgpu_in_reset(struct amdgpu_device *adev) +{ + return atomic_read(>in_gpu_reset); +} #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index 691c89705bcd..b872cdb0b705 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -543,7 +543,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v10_compute_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; #if 0 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index 0b7e78748540..832a200bb62f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -425,7 +425,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, unsigned long flags, end_jiffies; int retry; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index ccd635b812b5..d0940121a6a9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -421,7 +421,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, int retry; struct vi_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index 4cd851fc5c82..64fdb6a81c47 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -554,7 +554,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v9_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 35fed75a4397..79b397800cbc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -127,7 +127,7 @@ static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_ poll_wait(file, >autodump.gpu_hang, poll_table); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return POLLIN | POLLRDNORM | POLLWRNORM; return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6573e1112462..78fd2c9a7b7d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1940,7 +1940,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev) if (adev->ip_blocks[i].status.hw == true) break; - if (adev->in_gpu_reset || adev->in_suspend) { + if (amdgpu_in_reset(adev) || adev->in_suspend) { r = adev->ip_blocks[i].version->funcs->resume(adev); if (r) { DRM_ERROR("resume of IP block <%s> failed %d\n", @@ -2117,7 +2117,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) AMDGPU_RESET_MAGIC_NUM)) return true; - if (!adev->in_gpu_reset) + if (!amdgpu_in_r
[PATCH] drm/amdgpu: refine codes to avoid reentering GPU recovery
if other threads have holden the reset lock, recovery will fail to try_lock. Therefore we introduce atomic hive->in_reset and adev->in_gpu_reset, to avoid reentering GPU recovery. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 8ba389780001..0fba65efdb48 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -952,7 +952,7 @@ struct amdgpu_device { boolin_suspend; boolin_hibernate; - boolin_gpu_reset; + atomic_tin_gpu_reset; enum pp_mp1_state mp1_state; struct mutex lock_reset; struct amdgpu_doorbell_index doorbell_index; @@ -1270,4 +1270,8 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev) return adev->gmc.tmz_enabled; } +static inline bool amdgpu_in_reset(struct amdgpu_device *adev) +{ + return atomic_read(>in_gpu_reset) ? true : false; +} #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index 691c89705bcd..b872cdb0b705 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -543,7 +543,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v10_compute_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; #if 0 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index 0b7e78748540..832a200bb62f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -425,7 +425,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, unsigned long flags, end_jiffies; int retry; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index ccd635b812b5..d0940121a6a9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -421,7 +421,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, int retry; struct vi_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index 4cd851fc5c82..64fdb6a81c47 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -554,7 +554,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v9_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 35fed75a4397..79b397800cbc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -127,7 +127,7 @@ static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_ poll_wait(file, >autodump.gpu_hang, poll_table); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return POLLIN | POLLRDNORM | POLLWRNORM; return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6573e1112462..78fd2c9a7b7d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1940,7 +1940,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev) if (adev->ip_blocks[i].status.hw == true) break; - if (adev->in_gpu_reset || adev->in_suspend) { + if (amdgpu_in_reset(adev) || adev->in_suspend) { r = adev->ip_blocks[i].version->funcs->resume(adev); if (r) { DRM_ERROR("resume of IP block <%s> failed %d\n", @@ -2117,7 +2117,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) AMDGPU_RESET_MAGIC_NUM)) return true; - if (!adev->in_gpu_reset) + if (!amdgpu_in_reset(adev)) return false; /* @
[PATCH v3] drm/amdgpu: refine create and release logic of hive info
Change to dynamically create and release hive info object, which help driver support more hives in the future. v2: Change to save hive object pointer in adev, to avoid locking xgmi_mutex every time when calling amdgpu_get_xgmi_hive. v3: 1. Change type of hive object pointer in adev from void* to amdgpu_hive_info*. 2. remove unnecessary variable initialization. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 98d0c6e5ab3c..e25f952d8836 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -251,6 +251,7 @@ struct amdgpu_fpriv; struct amdgpu_bo_va_mapping; struct amdgpu_atif; struct kfd_vm_fault_info; +struct amdgpu_hive_info; enum amdgpu_cp_irq { AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP = 0, @@ -730,7 +731,7 @@ struct amdgpu_device { #ifdef CONFIG_DRM_AMD_ACP struct amdgpu_acp acp; #endif - + struct amdgpu_hive_info *hive; /* ASIC */ enum amd_asic_type asic_type; uint32_tfamily; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f323281c82b0..bc6ef0caf157 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2857,7 +2857,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) { struct amdgpu_device *adev = container_of(__work, struct amdgpu_device, xgmi_reset_work); - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); /* It's a bug to not have a hive within this function */ if (WARN_ON(!hive)) @@ -2895,6 +2895,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (adev->asic_reset_res) DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", adev->asic_reset_res, adev->ddev->unique); + amdgpu_put_xgmi_hive(hive); } static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) @@ -4315,7 +4316,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * We always reset all schedulers for device and all devices for XGMI * hive so that should take care of them too. */ - hive = amdgpu_get_xgmi_hive(adev, false); + hive = amdgpu_get_xgmi_hive(adev); if (hive) { if (atomic_cmpxchg(>in_reset, 0, 1) != 0) { DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index bf71f0a58786..18cdd259d568 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1555,9 +1555,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_device *remote_adev = NULL; struct amdgpu_device *adev = ras->adev; struct list_head device_list, *device_list_handle = NULL; - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false); if (!ras->disable_ras_err_cnt_harvest) { + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + /* Build list of devices to query RAS related errors */ if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { device_list_handle = >device_list; @@ -1570,6 +1571,8 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) amdgpu_ras_log_on_err_counter(remote_adev); + + amdgpu_put_xgmi_hive(hive); } if (amdgpu_device_should_recover_gpu(ras->adev)) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 7a61dc6738eb..08ed4dddfaf1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -35,11 +35,9 @@ static DEFINE_MUTEX(xgmi_mutex); -#define AMDGPU_MAX_XGMI_HIVE 8 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE4 -static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE]; -static unsigned hive_count = 0; +static LIST_HEAD(xgmi_hive_list); static const int xgmi_pcs_err_status_reg_vg20[] = { smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, @@ -171,59 +169,47 @@ static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = { * */ +static struct attribute amdgpu_xgmi_hive_id = { + .name = "xgmi_hive_id", + .mode = S_IRUGO +}; -static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev, - struct device_attribute *attr, char *buf) -{ -
[PATCH v2] drm/amdgpu: refine create and release logic of hive info
Change to dynamically create and release hive info object, which help driver support more hives in the future. v2: Change to save hive object pointer in adev, to avoid locking xgmi_mutex every time when calling amdgpu_get_xgmi_hive. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 98d0c6e5ab3c..894886d6381b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -730,7 +730,7 @@ struct amdgpu_device { #ifdef CONFIG_DRM_AMD_ACP struct amdgpu_acp acp; #endif - + void*hive; /* ASIC */ enum amd_asic_type asic_type; uint32_tfamily; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f323281c82b0..bc6ef0caf157 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2857,7 +2857,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) { struct amdgpu_device *adev = container_of(__work, struct amdgpu_device, xgmi_reset_work); - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); /* It's a bug to not have a hive within this function */ if (WARN_ON(!hive)) @@ -2895,6 +2895,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (adev->asic_reset_res) DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", adev->asic_reset_res, adev->ddev->unique); + amdgpu_put_xgmi_hive(hive); } static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) @@ -4315,7 +4316,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * We always reset all schedulers for device and all devices for XGMI * hive so that should take care of them too. */ - hive = amdgpu_get_xgmi_hive(adev, false); + hive = amdgpu_get_xgmi_hive(adev); if (hive) { if (atomic_cmpxchg(>in_reset, 0, 1) != 0) { DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index bf71f0a58786..18cdd259d568 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1555,9 +1555,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_device *remote_adev = NULL; struct amdgpu_device *adev = ras->adev; struct list_head device_list, *device_list_handle = NULL; - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false); if (!ras->disable_ras_err_cnt_harvest) { + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + /* Build list of devices to query RAS related errors */ if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { device_list_handle = >device_list; @@ -1570,6 +1571,8 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) amdgpu_ras_log_on_err_counter(remote_adev); + + amdgpu_put_xgmi_hive(hive); } if (amdgpu_device_should_recover_gpu(ras->adev)) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 7a61dc6738eb..c6bd5f0c1339 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -35,11 +35,9 @@ static DEFINE_MUTEX(xgmi_mutex); -#define AMDGPU_MAX_XGMI_HIVE 8 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE4 -static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE]; -static unsigned hive_count = 0; +static LIST_HEAD(xgmi_hive_list); static const int xgmi_pcs_err_status_reg_vg20[] = { smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, @@ -171,59 +169,47 @@ static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = { * */ +static struct attribute amdgpu_xgmi_hive_id = { + .name = "xgmi_hive_id", + .mode = S_IRUGO +}; -static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct amdgpu_hive_info *hive = - container_of(attr, struct amdgpu_hive_info, dev_attr); - - return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id); -} +static struct attribute *amdgpu_xgmi_hive_attrs[] = { + _xgmi_hive_id, + NULL +}; -static int amdgpu_xgmi_sysfs_create(struct am
[PATCH] drm/amdgpu: refine create and release logic of hive info
Change to dynamically create and release hive info object, which help driver support more hives in the future. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8a55b0bc044a..fdfdc2f678c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2840,7 +2840,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) { struct amdgpu_device *adev = container_of(__work, struct amdgpu_device, xgmi_reset_work); - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); /* It's a bug to not have a hive within this function */ if (WARN_ON(!hive)) @@ -2878,6 +2878,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (adev->asic_reset_res) DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", adev->asic_reset_res, adev->ddev->unique); + amdgpu_put_xgmi_hive(hive); } static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) @@ -4286,11 +4287,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * We always reset all schedulers for device and all devices for XGMI * hive so that should take care of them too. */ - hive = amdgpu_get_xgmi_hive(adev, false); + hive = amdgpu_get_xgmi_hive(adev); if (hive) { if (atomic_cmpxchg(>in_reset, 0, 1) != 0) { DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", job ? job->base.id : -1, hive->hive_id); + amdgpu_put_xgmi_hive(hive); return 0; } mutex_lock(>hive_lock); @@ -4456,6 +4458,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (hive) { atomic_set(>in_reset, 0); mutex_unlock(>hive_lock); + amdgpu_put_xgmi_hive(hive); } if (r) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 5680f7eafcb1..e18606e322e6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1514,7 +1514,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_device *remote_adev = NULL; struct amdgpu_device *adev = ras->adev; struct list_head device_list, *device_list_handle = NULL; - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false); + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); /* Build list of devices to query RAS related errors */ if (hive && adev->gmc.xgmi.num_physical_nodes > 1) @@ -1525,6 +1525,8 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) device_list_handle = _list; } + amdgpu_put_xgmi_hive(hive); + list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) { amdgpu_ras_log_on_err_counter(remote_adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 67a756f4337b..5315d16539f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -35,11 +35,9 @@ static DEFINE_MUTEX(xgmi_mutex); -#define AMDGPU_MAX_XGMI_HIVE 8 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE4 -static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE]; -static unsigned hive_count = 0; +static LIST_HEAD(xgmi_hive_list); static const int xgmi_pcs_err_status_reg_vg20[] = { smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, @@ -171,59 +169,47 @@ static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = { * */ +static struct attribute amdgpu_xgmi_hive_id = { + .name = "xgmi_hive_id", + .mode = S_IRUGO +}; -static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct amdgpu_hive_info *hive = - container_of(attr, struct amdgpu_hive_info, dev_attr); - - return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id); -} +static struct attribute *amdgpu_xgmi_hive_attrs[] = { + _xgmi_hive_id, + NULL +}; -static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev, - struct amdgpu_hive_info *hive) +static ssize_t amdgpu_xgmi_show_hive_id(struct kobject *kobj, + struct attribute *attr, char *buf) { - int ret = 0; - - if (WARN_ON(hive->kobj)) - return -EINVAL; - - hive->kobj = kobject_create_and_add(&q
[PATCH] drm/amdgpu: fix a potential circular locking dependency
[ 653.902305] == [ 653.902928] WARNING: possible circular locking dependency detected [ 653.903517] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G OE [ 653.904098] -- [ 653.904675] amdgpu_test/3975 is trying to acquire lock: [ 653.905241] 97848f8647a0 (>reset_sem){.+.+}, at: amdgpu_gem_va_ioctl+0x286/0x4f0 [amdgpu] [ 653.905953] but task is already holding lock: [ 653.907087] 9744adbee1f8 (reservation_ww_class_mutex){+.+.}, at: ttm_eu_reserve_buffers+0x1ae/0x520 [ttm] [ 653.907694] which lock already depends on the new lock. [ 653.909423] the existing dependency chain (in reverse order) is: [ 653.910594] -> #1 (reservation_ww_class_mutex){+.+.}: [ 653.911759]__ww_mutex_lock.constprop.15+0xca/0x1120 [ 653.912350]ww_mutex_lock+0x73/0x80 [ 653.913044]amdgpu_amdkfd_alloc_gtt_mem+0xde/0x380 [amdgpu] [ 653.913724]kgd2kfd_device_init+0x13f/0x5e0 [amdgpu] [ 653.914388]amdgpu_amdkfd_device_init+0x155/0x190 [amdgpu] [ 653.915033]amdgpu_device_init+0x1303/0x1e10 [amdgpu] [ 653.915685]amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu] [ 653.916349]amdgpu_pci_probe+0x11d/0x200 [amdgpu] [ 653.916959]local_pci_probe+0x47/0xa0 [ 653.917570]work_for_cpu_fn+0x1a/0x30 [ 653.918184]process_one_work+0x29e/0x630 [ 653.918803]worker_thread+0x22b/0x3f0 [ 653.919427]kthread+0x12f/0x150 [ 653.920047]ret_from_fork+0x3a/0x50 [ 653.920661] -> #0 (>reset_sem){.+.+}: [ 653.921893]__lock_acquire+0x13ec/0x16e0 [ 653.922531]lock_acquire+0xb8/0x1c0 [ 653.923174]down_read+0x48/0x230 [ 653.923886]amdgpu_gem_va_ioctl+0x286/0x4f0 [amdgpu] [ 653.924588]drm_ioctl_kernel+0xb6/0x100 [drm] [ 653.925283]drm_ioctl+0x389/0x450 [drm] [ 653.926013]amdgpu_drm_ioctl+0x4f/0x80 [amdgpu] [ 653.926686]ksys_ioctl+0x98/0xb0 [ 653.927357]__x64_sys_ioctl+0x1a/0x20 [ 653.928030]do_syscall_64+0x5f/0x250 [ 653.928697]entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 653.929373] other info that might help us debug this: [ 653.931356] Possible unsafe locking scenario: [ 653.932647]CPU0CPU1 [ 653.933287] [ 653.933911] lock(reservation_ww_class_mutex); [ 653.934530]lock(>reset_sem); [ 653.935154]lock(reservation_ww_class_mutex); [ 653.935766] lock(>reset_sem); [ 653.936360] *** DEADLOCK *** [ 653.938028] 2 locks held by amdgpu_test/3975: [ 653.938574] #0: b2a862d6bcd0 (reservation_ww_class_acquire){+.+.}, at: amdgpu_gem_va_ioctl+0x39b/0x4f0 [amdgpu] [ 653.939233] #1: 9744adbee1f8 (reservation_ww_class_mutex){+.+.}, at: ttm_eu_reserve_buffers+0x1ae/0x520 [ttm] change the order of reservation_ww_class_mutex and adev->reset_sem in amdgpu_gem_va_ioctl the same as ones in amdgpu_amdkfd_alloc_gtt_mem, to avoid potential dead lock. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index ee1e8fff83b2..fc889c477696 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -652,6 +652,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, abo = NULL; } + down_read(>reset_sem); + amdgpu_vm_get_pd_bo(>vm, , _pd); r = ttm_eu_reserve_buffers(, , true, ); @@ -670,8 +672,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, bo_va = NULL; } - down_read(>reset_sem); - switch (args->operation) { case AMDGPU_VA_OP_MAP: va_flags = amdgpu_gem_va_map_flags(adev, args->flags); @@ -701,12 +701,11 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, amdgpu_gem_va_update_vm(adev, >vm, bo_va, args->operation); - up_read(>reset_sem); - error_backoff: ttm_eu_backoff_reservation(, ); error_unref: + up_read(>reset_sem); drm_gem_object_put_unlocked(gobj); return r; } -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: refine create and release logic of hive info
Change to dynamically create and release hive info object, which help driver support more hives in the future. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8a55b0bc044a..fdfdc2f678c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2840,7 +2840,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) { struct amdgpu_device *adev = container_of(__work, struct amdgpu_device, xgmi_reset_work); - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); /* It's a bug to not have a hive within this function */ if (WARN_ON(!hive)) @@ -2878,6 +2878,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (adev->asic_reset_res) DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", adev->asic_reset_res, adev->ddev->unique); + amdgpu_put_xgmi_hive(hive); } static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) @@ -4286,11 +4287,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * We always reset all schedulers for device and all devices for XGMI * hive so that should take care of them too. */ - hive = amdgpu_get_xgmi_hive(adev, false); + hive = amdgpu_get_xgmi_hive(adev); if (hive) { if (atomic_cmpxchg(>in_reset, 0, 1) != 0) { DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", job ? job->base.id : -1, hive->hive_id); + amdgpu_put_xgmi_hive(hive); return 0; } mutex_lock(>hive_lock); @@ -4456,6 +4458,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (hive) { atomic_set(>in_reset, 0); mutex_unlock(>hive_lock); + amdgpu_put_xgmi_hive(hive); } if (r) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 5680f7eafcb1..e18606e322e6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1514,7 +1514,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_device *remote_adev = NULL; struct amdgpu_device *adev = ras->adev; struct list_head device_list, *device_list_handle = NULL; - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false); + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); /* Build list of devices to query RAS related errors */ if (hive && adev->gmc.xgmi.num_physical_nodes > 1) @@ -1525,6 +1525,8 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) device_list_handle = _list; } + amdgpu_put_xgmi_hive(hive); + list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) { amdgpu_ras_log_on_err_counter(remote_adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 67a756f4337b..5315d16539f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -35,11 +35,9 @@ static DEFINE_MUTEX(xgmi_mutex); -#define AMDGPU_MAX_XGMI_HIVE 8 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE4 -static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE]; -static unsigned hive_count = 0; +static LIST_HEAD(xgmi_hive_list); static const int xgmi_pcs_err_status_reg_vg20[] = { smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, @@ -171,59 +169,47 @@ static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = { * */ +static struct attribute amdgpu_xgmi_hive_id = { + .name = "xgmi_hive_id", + .mode = S_IRUGO +}; -static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct amdgpu_hive_info *hive = - container_of(attr, struct amdgpu_hive_info, dev_attr); - - return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id); -} +static struct attribute *amdgpu_xgmi_hive_attrs[] = { + _xgmi_hive_id, + NULL +}; -static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev, - struct amdgpu_hive_info *hive) +static ssize_t amdgpu_xgmi_show_hive_id(struct kobject *kobj, + struct attribute *attr, char *buf) { - int ret = 0; - - if (WARN_ON(hive->kobj)) - return -EINVAL; - - hive->kobj = kobject_create_and_add(&q
[PATCH v3] drm/amdgpu: annotate a false positive recursive locking
[ 584.110304] [ 584.110590] WARNING: possible recursive locking detected [ 584.110876] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G OE [ 584.64] [ 584.111456] kworker/38:1/553 is trying to acquire lock: [ 584.111721] 9b15ff0a47a0 (>reset_sem){}, at: amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.112112] but task is already holding lock: [ 584.112673] 9b1603d247a0 (>reset_sem){}, at: amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.113068] other info that might help us debug this: [ 584.113689] Possible unsafe locking scenario: [ 584.114350]CPU0 [ 584.114685] [ 584.115014] lock(>reset_sem); [ 584.115349] lock(>reset_sem); [ 584.115678] *** DEADLOCK *** [ 584.116624] May be due to missing lock nesting notation [ 584.117284] 4 locks held by kworker/38:1/553: [ 584.117616] #0: 9ad635c1d348 ((wq_completion)events){+.+.}, at: process_one_work+0x21f/0x630 [ 584.117967] #1: ac708e1c3e58 ((work_completion)(>recovery_work)){+.+.}, at: process_one_work+0x21f/0x630 [ 584.118358] #2: c1c2a5d0 (>hive_lock){+.+.}, at: amdgpu_device_gpu_recover+0xae/0x1030 [amdgpu] [ 584.118786] #3: 9b1603d247a0 (>reset_sem){}, at: amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.119222] stack backtrace: [ 584.119990] CPU: 38 PID: 553 Comm: kworker/38:1 Kdump: loaded Tainted: G OE 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 [ 584.120782] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 05/23/2019 [ 584.121223] Workqueue: events amdgpu_ras_do_recovery [amdgpu] [ 584.121638] Call Trace: [ 584.122050] dump_stack+0x98/0xd5 [ 584.122499] __lock_acquire+0x1139/0x16e0 [ 584.122931] ? trace_hardirqs_on+0x3b/0xf0 [ 584.123358] ? cancel_delayed_work+0xa6/0xc0 [ 584.123771] lock_acquire+0xb8/0x1c0 [ 584.124197] ? amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.124599] down_write+0x49/0x120 [ 584.125032] ? amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.125472] amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.125910] ? amdgpu_ras_error_query+0x1b8/0x2a0 [amdgpu] [ 584.126367] amdgpu_ras_do_recovery+0x159/0x190 [amdgpu] [ 584.126789] process_one_work+0x29e/0x630 [ 584.127208] worker_thread+0x3c/0x3f0 [ 584.127621] ? __kthread_parkme+0x61/0x90 [ 584.128014] kthread+0x12f/0x150 [ 584.128402] ? process_one_work+0x630/0x630 [ 584.128790] ? kthread_park+0x90/0x90 [ 584.129174] ret_from_fork+0x3a/0x50 Each adev has owned lock_class_key to avoid false positive recursive locking. v2: 1. register adev->lock_key into lockdep, otherwise lockdep will report the below warning [ 1216.705820] BUG: key 890183b647d0 has not been registered! [ 1216.705924] [ cut here ] [ 1216.705972] DEBUG_LOCKS_WARN_ON(1) [ 1216.705997] WARNING: CPU: 20 PID: 541 at kernel/locking/lockdep.c:3743 lockdep_init_map+0x150/0x210 v3: change to use down_write_nest_lock to annotate the false dead-lock warning. Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 62ecac97fbd2..8a55b0bc044a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4145,12 +4145,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, return r; } -static bool amdgpu_device_lock_adev(struct amdgpu_device *adev) +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) { if (atomic_cmpxchg(>in_gpu_reset, 0, 1) != 0) return false; - down_write(>reset_sem); + if (hive) { + down_write_nest_lock(>reset_sem, >hive_lock); + } else + down_write(>reset_sem); atomic_inc(>gpu_reset_counter); switch (amdgpu_asic_reset_method(adev)) { @@ -4312,7 +4315,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - if (!amdgpu_device_lock_adev(tmp_adev)) { + if (!amdgpu_device_lock_adev(tmp_adev, hive)) { DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress", job ? job->base.id : -1); r = 0; -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2] drm/amdgpu: annotate a false positive recursive locking
[ 584.110304] [ 584.110590] WARNING: possible recursive locking detected [ 584.110876] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G OE [ 584.64] [ 584.111456] kworker/38:1/553 is trying to acquire lock: [ 584.111721] 9b15ff0a47a0 (>reset_sem){}, at: amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.112112] but task is already holding lock: [ 584.112673] 9b1603d247a0 (>reset_sem){}, at: amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.113068] other info that might help us debug this: [ 584.113689] Possible unsafe locking scenario: [ 584.114350]CPU0 [ 584.114685] [ 584.115014] lock(>reset_sem); [ 584.115349] lock(>reset_sem); [ 584.115678] *** DEADLOCK *** [ 584.116624] May be due to missing lock nesting notation [ 584.117284] 4 locks held by kworker/38:1/553: [ 584.117616] #0: 9ad635c1d348 ((wq_completion)events){+.+.}, at: process_one_work+0x21f/0x630 [ 584.117967] #1: ac708e1c3e58 ((work_completion)(>recovery_work)){+.+.}, at: process_one_work+0x21f/0x630 [ 584.118358] #2: c1c2a5d0 (>hive_lock){+.+.}, at: amdgpu_device_gpu_recover+0xae/0x1030 [amdgpu] [ 584.118786] #3: 9b1603d247a0 (>reset_sem){}, at: amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.119222] stack backtrace: [ 584.119990] CPU: 38 PID: 553 Comm: kworker/38:1 Kdump: loaded Tainted: G OE 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 [ 584.120782] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 05/23/2019 [ 584.121223] Workqueue: events amdgpu_ras_do_recovery [amdgpu] [ 584.121638] Call Trace: [ 584.122050] dump_stack+0x98/0xd5 [ 584.122499] __lock_acquire+0x1139/0x16e0 [ 584.122931] ? trace_hardirqs_on+0x3b/0xf0 [ 584.123358] ? cancel_delayed_work+0xa6/0xc0 [ 584.123771] lock_acquire+0xb8/0x1c0 [ 584.124197] ? amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.124599] down_write+0x49/0x120 [ 584.125032] ? amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.125472] amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.125910] ? amdgpu_ras_error_query+0x1b8/0x2a0 [amdgpu] [ 584.126367] amdgpu_ras_do_recovery+0x159/0x190 [amdgpu] [ 584.126789] process_one_work+0x29e/0x630 [ 584.127208] worker_thread+0x3c/0x3f0 [ 584.127621] ? __kthread_parkme+0x61/0x90 [ 584.128014] kthread+0x12f/0x150 [ 584.128402] ? process_one_work+0x630/0x630 [ 584.128790] ? kthread_park+0x90/0x90 [ 584.129174] ret_from_fork+0x3a/0x50 Each adev has owned lock_class_key to avoid false positive recursive locking. v2: 1. register adev->lock_key into lockdep, otherwise lockdep will report the below warning [ 1216.705820] BUG: key 890183b647d0 has not been registered! [ 1216.705924] [ cut here ] [ 1216.705972] DEBUG_LOCKS_WARN_ON(1) [ 1216.705997] WARNING: CPU: 20 PID: 541 at kernel/locking/lockdep.c:3743 lockdep_init_map+0x150/0x210 Signed-off-by: Dennis Li Change-Id: I7571efeccbf15483982031d00504a353031a854a diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index e97c088d03b3..766dc8f8c8a0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -967,6 +967,7 @@ struct amdgpu_device { atomic_tin_gpu_reset; enum pp_mp1_state mp1_state; struct rw_semaphore reset_sem; + struct lock_class_key lock_key; struct amdgpu_doorbell_index doorbell_index; struct mutexnotifier_lock; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 62ecac97fbd2..ae0a576f9895 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3037,6 +3037,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(>virt.vf_errors.lock); hash_init(adev->mn_hash); init_rwsem(>reset_sem); + lockdep_register_key(>lock_key); + lockdep_set_class(>reset_sem, >lock_key); atomic_set(>in_gpu_reset, 0); mutex_init(>psp.mutex); mutex_init(>notifier_lock); @@ -3411,6 +3413,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev) amdgpu_pmu_fini(adev); if (adev->discovery_bin) amdgpu_discovery_fini(adev); + + lockdep_unregister_key(>lock_key); } -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: annotate a false positive recursive locking
[ 584.110304] [ 584.110590] WARNING: possible recursive locking detected [ 584.110876] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G OE [ 584.64] [ 584.111456] kworker/38:1/553 is trying to acquire lock: [ 584.111721] 9b15ff0a47a0 (>reset_sem){}, at: amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.112112] but task is already holding lock: [ 584.112673] 9b1603d247a0 (>reset_sem){}, at: amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.113068] other info that might help us debug this: [ 584.113689] Possible unsafe locking scenario: [ 584.114350]CPU0 [ 584.114685] [ 584.115014] lock(>reset_sem); [ 584.115349] lock(>reset_sem); [ 584.115678] *** DEADLOCK *** [ 584.116624] May be due to missing lock nesting notation [ 584.117284] 4 locks held by kworker/38:1/553: [ 584.117616] #0: 9ad635c1d348 ((wq_completion)events){+.+.}, at: process_one_work+0x21f/0x630 [ 584.117967] #1: ac708e1c3e58 ((work_completion)(>recovery_work)){+.+.}, at: process_one_work+0x21f/0x630 [ 584.118358] #2: c1c2a5d0 (>hive_lock){+.+.}, at: amdgpu_device_gpu_recover+0xae/0x1030 [amdgpu] [ 584.118786] #3: 9b1603d247a0 (>reset_sem){}, at: amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.119222] stack backtrace: [ 584.119990] CPU: 38 PID: 553 Comm: kworker/38:1 Kdump: loaded Tainted: G OE 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 [ 584.120782] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 05/23/2019 [ 584.121223] Workqueue: events amdgpu_ras_do_recovery [amdgpu] [ 584.121638] Call Trace: [ 584.122050] dump_stack+0x98/0xd5 [ 584.122499] __lock_acquire+0x1139/0x16e0 [ 584.122931] ? trace_hardirqs_on+0x3b/0xf0 [ 584.123358] ? cancel_delayed_work+0xa6/0xc0 [ 584.123771] lock_acquire+0xb8/0x1c0 [ 584.124197] ? amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.124599] down_write+0x49/0x120 [ 584.125032] ? amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.125472] amdgpu_device_gpu_recover+0x262/0x1030 [amdgpu] [ 584.125910] ? amdgpu_ras_error_query+0x1b8/0x2a0 [amdgpu] [ 584.126367] amdgpu_ras_do_recovery+0x159/0x190 [amdgpu] [ 584.126789] process_one_work+0x29e/0x630 [ 584.127208] worker_thread+0x3c/0x3f0 [ 584.127621] ? __kthread_parkme+0x61/0x90 [ 584.128014] kthread+0x12f/0x150 [ 584.128402] ? process_one_work+0x630/0x630 [ 584.128790] ? kthread_park+0x90/0x90 [ 584.129174] ret_from_fork+0x3a/0x50 Each adev has owned lock_class_key to avoid false positive recursive locking. Signed-off-by: Dennis Li Change-Id: I7571efeccbf15483982031d00504a353031a854a diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index e97c088d03b3..766dc8f8c8a0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -967,6 +967,7 @@ struct amdgpu_device { atomic_tin_gpu_reset; enum pp_mp1_state mp1_state; struct rw_semaphore reset_sem; + struct lock_class_key lock_key; struct amdgpu_doorbell_index doorbell_index; struct mutexnotifier_lock; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6c572db42d92..d78df9312d34 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3037,6 +3037,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(>virt.vf_errors.lock); hash_init(adev->mn_hash); init_rwsem(>reset_sem); + lockdep_set_class(>reset_sem, >lock_key); atomic_set(>in_gpu_reset, 0); mutex_init(>psp.mutex); mutex_init(>notifier_lock); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2] drm/amdgpu: unlock mutex on error
Make sure to unlock the mutex when error happen v2: 1. correct syntax error in the commit comment 2. remove change-Id Acked-by: Nirmoy Das Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index a0ea663ecdbc..5e5369abc6fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -632,13 +632,14 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, } ret = amdgpu_ib_schedule(ring, 1, ib, job, ); + + up_read(>reset_sem); + if (ret) { DRM_ERROR("amdgpu: failed to schedule IB.\n"); goto err_ib_sched; } - up_read(>reset_sem); - ret = dma_fence_wait(f, false); err_ib_sched: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 4e017f379eb6..67a756f4337b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -545,7 +545,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) } ret = amdgpu_xgmi_update_topology(hive, tmp_adev); if (ret) - goto exit; + goto exit_unlock; } /* get latest topology info for each device from psp */ @@ -558,7 +558,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) tmp_adev->gmc.xgmi.node_id, tmp_adev->gmc.xgmi.hive_id, ret); /* To do : continue with some node failed or disable the whole hive */ - goto exit; + goto exit_unlock; } } } @@ -566,7 +566,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) if (!ret) ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive); - +exit_unlock: mutex_unlock(>hive_lock); exit: if (!ret) -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: annotate a false positive locking dependency
[ 264.483189] == [ 264.483487] WARNING: possible circular locking dependency detected [ 264.483781] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G OE [ 264.484076] -- [ 264.484370] kworker/39:1/567 is trying to acquire lock: [ 264.484663] c15df4b0 (mgpu_info.mutex){+.+.}, at: amdgpu_unregister_gpu_instance+0x1d/0xc0 [amdgpu] [ 264.485081] but task is already holding lock: [ 264.485670] 965fd31647a0 (>reset_sem){}, at: amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu] [ 264.486074] which lock already depends on the new lock. [ 264.487043] the existing dependency chain (in reverse order) is: [ 264.487710] -> #3 (>reset_sem){}: [ 264.488400]down_write+0x49/0x120 [ 264.488783]amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu] [ 264.489179]amdgpu_ras_do_recovery+0x159/0x190 [amdgpu] [ 264.489544]process_one_work+0x29e/0x630 [ 264.489910]worker_thread+0x3c/0x3f0 [ 264.490279]kthread+0x12f/0x150 [ 264.490649]ret_from_fork+0x3a/0x50 [ 264.491020] -> #2 (>hive_lock){+.+.}: [ 264.491764]__mutex_lock+0x95/0xa20 [ 264.492137]mutex_lock_nested+0x1b/0x20 [ 264.492553]amdgpu_get_xgmi_hive+0x352/0x400 [amdgpu] [ 264.492972]amdgpu_xgmi_add_device+0xb8/0x460 [amdgpu] [ 264.493387]amdgpu_device_init+0x12fb/0x1e10 [amdgpu] [ 264.493807]amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu] [ 264.494226]amdgpu_pci_probe+0x11d/0x200 [amdgpu] [ 264.494617]local_pci_probe+0x47/0xa0 [ 264.494998]work_for_cpu_fn+0x1a/0x30 [ 264.495369]process_one_work+0x29e/0x630 [ 264.495746]worker_thread+0x22b/0x3f0 [ 264.496124]kthread+0x12f/0x150 [ 264.496504]ret_from_fork+0x3a/0x50 [ 264.496876] -> #1 (xgmi_mutex){+.+.}: [ 264.497596]__mutex_lock+0x95/0xa20 [ 264.497954]mutex_lock_nested+0x1b/0x20 [ 264.498346]amdgpu_get_xgmi_hive+0x38/0x400 [amdgpu] [ 264.498741]amdgpu_xgmi_set_pstate+0x10/0x20 [amdgpu] [ 264.499126]amdgpu_device_ip_late_init+0x219/0x230 [amdgpu] [ 264.499506]amdgpu_device_init+0x1401/0x1e10 [amdgpu] [ 264.499886]amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu] [ 264.500264]amdgpu_pci_probe+0x11d/0x200 [amdgpu] [ 264.500608]local_pci_probe+0x47/0xa0 [ 264.500945]work_for_cpu_fn+0x1a/0x30 [ 264.501276]process_one_work+0x29e/0x630 [ 264.501603]worker_thread+0x22b/0x3f0 [ 264.501927]kthread+0x12f/0x150 [ 264.502239]ret_from_fork+0x3a/0x50 [ 264.502541] -> #0 (mgpu_info.mutex){+.+.}: [ 264.503126]__lock_acquire+0x13ec/0x16e0 [ 264.503411]lock_acquire+0xb8/0x1c0 [ 264.503693]__mutex_lock+0x95/0xa20 [ 264.504019]mutex_lock_nested+0x1b/0x20 [ 264.504354]amdgpu_unregister_gpu_instance+0x1d/0xc0 [amdgpu] [ 264.504691]amdgpu_device_gpu_recover+0x360/0x1030 [amdgpu] [ 264.505029]amdgpu_ras_do_recovery+0x159/0x190 [amdgpu] [ 264.505334]process_one_work+0x29e/0x630 [ 264.505617]worker_thread+0x3c/0x3f0 [ 264.505897]kthread+0x12f/0x150 [ 264.506176]ret_from_fork+0x3a/0x50 [ 264.506453] other info that might help us debug this: [ 264.507267] Chain exists of: mgpu_info.mutex --> >hive_lock --> >reset_sem [ 264.508102] Possible unsafe locking scenario: [ 264.508664]CPU0CPU1 [ 264.508945] [ 264.509221] lock(>reset_sem); [ 264.509524]lock(>hive_lock); [ 264.509818]lock(>reset_sem); [ 264.510111] lock(mgpu_info.mutex); [ 264.510401] *** DEADLOCK *** [ 264.511224] 4 locks held by kworker/39:1/567: [ 264.511499] #0: 961ff5c1d348 ((wq_completion)events){+.+.}, at: process_one_work+0x21f/0x630 [ 264.511793] #1: afa90e233e58 ((work_completion)(>recovery_work)){+.+.}, at: process_one_work+0x21f/0x630 [ 264.512100] #2: c16245d0 (>hive_lock){+.+.}, at: amdgpu_device_gpu_recover+0xb0/0x1030 [amdgpu] [ 264.512450] #3: 965fd31647a0 (>reset_sem){}, at: amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu] Remove the lock(>hive_lock) out of amdgpu_get_xgmi_hive, to disable its locking dependency on xgmi_mutex. Signed-off-by: Dennis Li Change-Id: I2d9d80ee23f9f9ac6ce9e1b9e5e1b2b3530f5bdd diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 62ecac97fbd2..6c572db42d92 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2840,7 +2840,7 @@ static
[PATCH] drm/amdgpu: unlock mutex on error
Make sure unlock the mutex when error happen Signed-off-by: Dennis Li Change-Id: I6c36a193df5fe70516282d8136b4eadf32d20915 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index a0ea663ecdbc..5e5369abc6fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -632,13 +632,14 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, } ret = amdgpu_ib_schedule(ring, 1, ib, job, ); + + up_read(>reset_sem); + if (ret) { DRM_ERROR("amdgpu: failed to schedule IB.\n"); goto err_ib_sched; } - up_read(>reset_sem); - ret = dma_fence_wait(f, false); err_ib_sched: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 4e017f379eb6..67a756f4337b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -545,7 +545,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) } ret = amdgpu_xgmi_update_topology(hive, tmp_adev); if (ret) - goto exit; + goto exit_unlock; } /* get latest topology info for each device from psp */ @@ -558,7 +558,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) tmp_adev->gmc.xgmi.node_id, tmp_adev->gmc.xgmi.hive_id, ret); /* To do : continue with some node failed or disable the whole hive */ - goto exit; + goto exit_unlock; } } } @@ -566,7 +566,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) if (!ret) ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive); - +exit_unlock: mutex_unlock(>hive_lock); exit: if (!ret) -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v5] drm/amdgpu: fix system hang issue during GPU reset
when GPU hang, driver has multi-paths to enter amdgpu_device_gpu_recover, the atomic adev->in_gpu_reset and hive->in_reset are used to avoid re-entering GPU recovery. During GPU reset and resume, it is unsafe that other threads access GPU, which maybe cause GPU reset failed. Therefore the new rw_semaphore adev->reset_sem is introduced, which protect GPU from being accessed by external threads during recovery. v2: 1. add rwlock for some ioctls, debugfs and file-close function. 2. change to use dqm->is_resetting and dqm_lock for protection in kfd driver. 3. remove try_lock and change adev->in_gpu_reset as atomic, to avoid re-enter GPU recovery for the same GPU hang. v3: 1. change back to use adev->reset_sem to protect kfd callback functions, because dqm_lock couldn't protect all codes, for example: free_mqd must be called outside of dqm_lock; [ 1230.176199] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 05/23/2019 [ 1230.177221] Call Trace: [ 1230.178249] dump_stack+0x98/0xd5 [ 1230.179443] amdgpu_virt_kiq_reg_write_reg_wait+0x181/0x190 [amdgpu] [ 1230.180673] gmc_v9_0_flush_gpu_tlb+0xcc/0x310 [amdgpu] [ 1230.181882] amdgpu_gart_unbind+0xa9/0xe0 [amdgpu] [ 1230.183098] amdgpu_ttm_backend_unbind+0x46/0x180 [amdgpu] [ 1230.184239] ? ttm_bo_put+0x171/0x5f0 [ttm] [ 1230.185394] ttm_tt_unbind+0x21/0x40 [ttm] [ 1230.186558] ttm_tt_destroy.part.12+0x12/0x60 [ttm] [ 1230.187707] ttm_tt_destroy+0x13/0x20 [ttm] [ 1230.188832] ttm_bo_cleanup_memtype_use+0x36/0x80 [ttm] [ 1230.189979] ttm_bo_put+0x1be/0x5f0 [ttm] [ 1230.191230] amdgpu_bo_unref+0x1e/0x30 [amdgpu] [ 1230.192522] amdgpu_amdkfd_free_gtt_mem+0xaf/0x140 [amdgpu] [ 1230.193833] free_mqd+0x25/0x40 [amdgpu] [ 1230.195143] destroy_queue_cpsch+0x1a7/0x270 [amdgpu] [ 1230.196475] pqm_destroy_queue+0x105/0x260 [amdgpu] [ 1230.197819] kfd_ioctl_destroy_queue+0x37/0x70 [amdgpu] [ 1230.199154] kfd_ioctl+0x277/0x500 [amdgpu] [ 1230.200458] ? kfd_ioctl_get_clock_counters+0x60/0x60 [amdgpu] [ 1230.201656] ? tomoyo_file_ioctl+0x19/0x20 [ 1230.202831] ksys_ioctl+0x98/0xb0 [ 1230.204004] __x64_sys_ioctl+0x1a/0x20 [ 1230.205174] do_syscall_64+0x5f/0x250 [ 1230.206339] entry_SYSCALL_64_after_hwframe+0x49/0xbe 2. remove try_lock and introduce atomic hive->in_reset, to avoid re-enter GPU recovery. v4: 1. remove an unnecessary whitespace change in kfd_chardev.c 2. remove comment codes in amdgpu_device.c 3. add more detailed comment in commit message 4. define a wrap function amdgpu_in_reset v5: 1. Fix some style issues. Signed-off-by: Dennis Li Reviewed-by: Andrey Grodzovsky Reviewed-by: Christian König Reviewed-by: Felix Kuehling Reviewed-by: Lijo Lazar Suggested-by: Luben Tukov Change-Id: I7f77a72795462587ed7d5f51fe53a594a0f1f708 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 80f32b3beb88..be8bd3ae783a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -963,9 +963,9 @@ struct amdgpu_device { boolin_suspend; boolin_hibernate; - boolin_gpu_reset; + atomic_tin_gpu_reset; enum pp_mp1_state mp1_state; - struct mutex lock_reset; + struct rw_semaphore reset_sem; struct amdgpu_doorbell_index doorbell_index; struct mutexnotifier_lock; @@ -1280,4 +1280,9 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev) return adev->gmc.tmz_enabled; } +static inline bool amdgpu_in_reset(struct amdgpu_device *adev) +{ + return atomic_read(>in_gpu_reset) ? true : false; +} + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 1b865fed74ca..a0ea663ecdbc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -244,11 +244,14 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size, if (cp_mqd_gfx9) bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9; + if (!down_read_trylock(>reset_sem)) + return -EIO; + r = amdgpu_bo_create(adev, , ); if (r) { dev_err(adev->dev, "failed to allocate BO for amdkfd (%d)\n", r); - return r; + goto err; } /* map the buffer */ @@ -283,6 +286,7 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size, amdgpu_bo_unreserve(bo); + up_read(>reset_sem); return 0; allocate_mem_kmap_bo_failed: @@ -291,19 +295,25 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size, amdgpu_bo_unreserve(bo); allocate_mem_reserve_bo_failed: amdgpu_bo_unref(); - +err: + up_read(>reset_sem); return r; } void amdgpu_amdkf
[PATCH v4] drm/amdgpu: fix system hang issue during GPU reset
when GPU hang, driver has multi-paths to enter amdgpu_device_gpu_recover, the atomic adev->in_gpu_reset and hive->in_reset are used to avoid re-entering GPU recovery. During GPU reset and resume, it is unsafe that other threads access GPU, which maybe cause GPU reset failed. Therefore the new rw_semaphore adev->reset_sem is introduced, which protect GPU from being accessed by external threads during recovery. v2: 1. add rwlock for some ioctls, debugfs and file-close function. 2. change to use dqm->is_resetting and dqm_lock for protection in kfd driver. 3. remove try_lock and change adev->in_gpu_reset as atomic, to avoid re-enter GPU recovery for the same GPU hang. v3: 1. change back to use adev->reset_sem to protect kfd callback functions, because dqm_lock couldn't protect all codes, for example: free_mqd must be called outside of dqm_lock; [ 1230.176199] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 05/23/2019 [ 1230.177221] Call Trace: [ 1230.178249] dump_stack+0x98/0xd5 [ 1230.179443] amdgpu_virt_kiq_reg_write_reg_wait+0x181/0x190 [amdgpu] [ 1230.180673] gmc_v9_0_flush_gpu_tlb+0xcc/0x310 [amdgpu] [ 1230.181882] amdgpu_gart_unbind+0xa9/0xe0 [amdgpu] [ 1230.183098] amdgpu_ttm_backend_unbind+0x46/0x180 [amdgpu] [ 1230.184239] ? ttm_bo_put+0x171/0x5f0 [ttm] [ 1230.185394] ttm_tt_unbind+0x21/0x40 [ttm] [ 1230.186558] ttm_tt_destroy.part.12+0x12/0x60 [ttm] [ 1230.187707] ttm_tt_destroy+0x13/0x20 [ttm] [ 1230.188832] ttm_bo_cleanup_memtype_use+0x36/0x80 [ttm] [ 1230.189979] ttm_bo_put+0x1be/0x5f0 [ttm] [ 1230.191230] amdgpu_bo_unref+0x1e/0x30 [amdgpu] [ 1230.192522] amdgpu_amdkfd_free_gtt_mem+0xaf/0x140 [amdgpu] [ 1230.193833] free_mqd+0x25/0x40 [amdgpu] [ 1230.195143] destroy_queue_cpsch+0x1a7/0x270 [amdgpu] [ 1230.196475] pqm_destroy_queue+0x105/0x260 [amdgpu] [ 1230.197819] kfd_ioctl_destroy_queue+0x37/0x70 [amdgpu] [ 1230.199154] kfd_ioctl+0x277/0x500 [amdgpu] [ 1230.200458] ? kfd_ioctl_get_clock_counters+0x60/0x60 [amdgpu] [ 1230.201656] ? tomoyo_file_ioctl+0x19/0x20 [ 1230.202831] ksys_ioctl+0x98/0xb0 [ 1230.204004] __x64_sys_ioctl+0x1a/0x20 [ 1230.205174] do_syscall_64+0x5f/0x250 [ 1230.206339] entry_SYSCALL_64_after_hwframe+0x49/0xbe 2. remove try_lock and introduce atomic hive->in_reset, to avoid re-enter GPU recovery. v4: 1. remove an unnecessary whitespace change in kfd_chardev.c 2. remove comment codes in amdgpu_device.c 3. add more detailed comment in commit message 4. define a wrap function amdgpu_in_reset Signed-off-by: Dennis Li Reviewed-by: Andrey Grodzovsky Reviewed-by: Christian König Reviewed-by: Felix Kuehling Reviewed-by: Lijo Lazar Change-Id: I7f77a72795462587ed7d5f51fe53a594a0f1f708 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 80f32b3beb88..be8bd3ae783a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -963,9 +963,9 @@ struct amdgpu_device { boolin_suspend; boolin_hibernate; - boolin_gpu_reset; + atomic_tin_gpu_reset; enum pp_mp1_state mp1_state; - struct mutex lock_reset; + struct rw_semaphore reset_sem; struct amdgpu_doorbell_index doorbell_index; struct mutexnotifier_lock; @@ -1280,4 +1280,9 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev) return adev->gmc.tmz_enabled; } +static inline bool amdgpu_in_reset(struct amdgpu_device *adev) +{ + return atomic_read(>in_gpu_reset) ? true : false; +} + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 1b865fed74ca..3fc2229fc533 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -244,11 +244,14 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size, if (cp_mqd_gfx9) bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9; + if(!down_read_trylock(>reset_sem)) + return -EIO; + r = amdgpu_bo_create(adev, , ); if (r) { dev_err(adev->dev, "failed to allocate BO for amdkfd (%d)\n", r); - return r; + goto err; } /* map the buffer */ @@ -283,6 +286,7 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size, amdgpu_bo_unreserve(bo); + up_read(>reset_sem); return 0; allocate_mem_kmap_bo_failed: @@ -291,19 +295,25 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size, amdgpu_bo_unreserve(bo); allocate_mem_reserve_bo_failed: amdgpu_bo_unref(); - +err: + up_read(>reset_sem); return r; } void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj) { + s
[PATCH v2] drm/amdgpu: fix system hang issue during GPU reset
During GPU reset, driver should hold on all external access to GPU, otherwise psp will randomly fail to do post, and then cause system hang. v2: 1. add rwlock for some ioctls, debugfs and file-close function. 2. change to use dqm->is_resetting and dqm_lock for protection in kfd driver. 3. remove try_lock and change adev->in_gpu_reset as atomic, to avoid re-enter GPU recovery for the same GPU hang. Signed-off-by: Dennis Li Change-Id: I7f77a72795462587ed7d5f51fe53a594a0f1f708 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 80f32b3beb88..f235492799d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -963,9 +963,9 @@ struct amdgpu_device { boolin_suspend; boolin_hibernate; - boolin_gpu_reset; + atomic_tin_gpu_reset; enum pp_mp1_state mp1_state; - struct mutex lock_reset; + struct rw_semaphore reset_sem; struct amdgpu_doorbell_index doorbell_index; struct mutexnotifier_lock; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index 691c89705bcd..af71d8e93081 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -543,7 +543,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v10_compute_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (atomic_read(>in_gpu_reset)) return -EIO; #if 0 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index 0b7e78748540..750a8308c868 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -425,7 +425,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, unsigned long flags, end_jiffies; int retry; - if (adev->in_gpu_reset) + if (atomic_read(>in_gpu_reset)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index ccd635b812b5..027793e0c1ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -421,7 +421,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, int retry; struct vi_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (atomic_read(>in_gpu_reset)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index df841c2ac5e7..e4a77f7a4c2d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -541,7 +541,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v9_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (atomic_read(>in_gpu_reset)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index ffbcaf4bfb8b..a94b3f862fc2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1292,6 +1292,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) parser.adev = adev; parser.filp = filp; + down_read(>reset_sem); + r = amdgpu_cs_parser_init(, data); if (r) { DRM_ERROR("Failed to initialize parser %d!\n", r); @@ -1331,6 +1333,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) out: amdgpu_cs_parser_fini(, r, reserved_buffers); + up_read(>reset_sem); + return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 8842c55d4490..d85d13f7a043 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -358,6 +358,8 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, if (atomic_read(>guilty)) out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY; + down_read(>reset_sem); + /*query ue count*/ ras_counter = amdgpu_ras_query_error_count(adev, false); /*ras counter is monotonic increasing*/ @@ -373,6 +375,8 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, ctx->ras_counter_ce = ras_counter; } + up_read(>reset_sem); + m
[PATCH] drm/amdgpu: fix system hang issue during GPU reset
During GPU reset, driver should hold on all external access to GPU, otherwise psp will randomly fail to do post, and then cause system hang. Signed-off-by: Dennis Li Change-Id: I7d5d41f9c4198b917d7b49606ba3850988e5b936 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 6c7dd0a707c9..34bfc2a147ff 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -965,7 +965,7 @@ struct amdgpu_device { boolin_gpu_reset; enum pp_mp1_state mp1_state; - struct mutex lock_reset; + struct rw_semaphore reset_sem; struct amdgpu_doorbell_index doorbell_index; struct mutexnotifier_lock; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index ad59ac4423b8..4139c81389a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -611,7 +611,9 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, /* This works for NO_HWS. TODO: need to handle without knowing VMID */ job->vmid = vmid; + down_read(>reset_sem); ret = amdgpu_ib_schedule(ring, 1, ib, job, ); + up_read(>reset_sem); if (ret) { DRM_ERROR("amdgpu: failed to schedule IB.\n"); goto err_ib_sched; @@ -649,6 +651,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid) { struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + down_read(>reset_sem); + if (adev->family == AMDGPU_FAMILY_AI) { int i; @@ -658,6 +662,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid) amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0); } + up_read(>reset_sem); + return 0; } @@ -666,11 +672,18 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid) struct amdgpu_device *adev = (struct amdgpu_device *)kgd; const uint32_t flush_type = 0; bool all_hub = false; + int ret = 0; if (adev->family == AMDGPU_FAMILY_AI) all_hub = true; - return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub); + down_read(>reset_sem); + + ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub); + + up_read(>reset_sem); + + return ret; } bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index 691c89705bcd..db5d533dd406 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -542,6 +542,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, unsigned long end_jiffies; uint32_t temp; struct v10_compute_mqd *m = get_mqd(mqd); + int ret = 0; if (adev->in_gpu_reset) return -EIO; @@ -551,6 +552,8 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, int retry; #endif + down_read(>reset_sem); + acquire_queue(kgd, pipe_id, queue_id); if (m->cp_hqd_vmid == 0) @@ -633,14 +636,16 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, break; if (time_after(jiffies, end_jiffies)) { pr_err("cp queue preemption time out.\n"); - release_queue(kgd); - return -ETIME; + ret = -ETIME; + goto pro_end; } usleep_range(500, 1000); } +pro_end: release_queue(kgd); - return 0; + up_read(>reset_sem); + return ret; } static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index 0b7e78748540..cf27fe5091aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -424,10 +424,13 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, enum hqd_dequeue_request_type type; unsigned long flags, end_jiffies; int retry; + int ret = 0; if (adev->in_gpu_reset) return -EIO; + down_read(>reset_sem); + acquire_queue(kgd, pipe_id, queue_id); WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, 0); @@ -506,14 +509,16 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, break; if (time_after(jiffies, end_jiffies)) { pr_err("cp queue preemption time out\n"); -
[PATCH] drm/amdkfd: change to return status when flush tlb
If GPU hang, driver will fail to flush tlb, return the hang error to callers, make callers have a chance to handle the error. Signed-off-by: Dennis Li Change-Id: Ie305ad0a77675f6eab7d5b8f68e279b7f4e7a8b9 diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index e9b96ad3d9a5..18e243183b5e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1488,7 +1488,7 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, peer_pdd = kfd_get_process_device_data(peer, p); if (WARN_ON_ONCE(!peer_pdd)) continue; - kfd_flush_tlb(peer_pdd); + err = kfd_flush_tlb(peer_pdd); } kfree(devices_arr); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 21eb0998c4ae..d636cbf7d32f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -263,6 +263,7 @@ static int allocate_vmid(struct device_queue_manager *dqm, struct queue *q) { int allocated_vmid = -1, i; + int ret = 0; for (i = dqm->dev->vm_info.first_vmid_kfd; i <= dqm->dev->vm_info.last_vmid_kfd; i++) { @@ -295,13 +296,26 @@ static int allocate_vmid(struct device_queue_manager *dqm, qpd->vmid, qpd->page_table_base); /* invalidate the VM context after pasid and vmid mapping is set up */ - kfd_flush_tlb(qpd_to_pdd(qpd)); + ret = kfd_flush_tlb(qpd_to_pdd(qpd)); + if (ret) { + pr_err("Failed to flush tlb\n"); + goto pro_failed; + } if (dqm->dev->kfd2kgd->set_scratch_backing_va) dqm->dev->kfd2kgd->set_scratch_backing_va(dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid); return 0; + +pro_failed: + /* Release the vmid mapping */ + set_pasid_vmid_mapping(dqm, 0, qpd->vmid); + dqm->vmid_pasid[qpd->vmid] = 0; + + qpd->vmid = 0; + q->properties.vmid = 0; + return ret; } static int flush_texture_cache_nocpsch(struct kfd_dev *kdev, @@ -326,12 +340,17 @@ static void deallocate_vmid(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) { + int ret = 0; + /* On GFX v7, CP doesn't flush TC at dequeue */ if (q->device->device_info->asic_family == CHIP_HAWAII) if (flush_texture_cache_nocpsch(q->device, qpd)) pr_err("Failed to flush TC\n"); - kfd_flush_tlb(qpd_to_pdd(qpd)); + ret = kfd_flush_tlb(qpd_to_pdd(qpd)); + if (ret) { + pr_err("Failed to flush tlb\n"); + } /* Release the vmid mapping */ set_pasid_vmid_mapping(dqm, 0, qpd->vmid); @@ -795,7 +814,9 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, dqm->dev->kgd, qpd->vmid, qpd->page_table_base); - kfd_flush_tlb(pdd); + ret = kfd_flush_tlb(pdd); + if (ret) + goto out; } /* Take a safe reference to the mm_struct, which may otherwise diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 51ba2020732e..31ea72946d06 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -1081,7 +1081,7 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, void kfd_signal_reset_event(struct kfd_dev *dev); -void kfd_flush_tlb(struct kfd_process_device *pdd); +int kfd_flush_tlb(struct kfd_process_device *pdd); int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 8616a204e4c3..3919cc88813c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1444,21 +1444,24 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot); } -void kfd_flush_tlb(struct kfd_process_device *pdd) +int kfd_flush_tlb(struct kfd_process_device *pdd) { struct kfd_dev *dev = pdd->dev; + int ret = 0; if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) { /* Nothing to flush until a VMID is assigned, which * only happens when the first que
[PATCH v2] drm/amdgpu: set error query ready after all IPs late init
If set error query ready in amdgpu_ras_late_init, which will cause some IP blocks aren't initialized, but their error query is ready. v2: change the prefix of title to "drm/amdgpu" and remove the unnecessary "{}". Change-Id: I5087527261cb1b462afd82ad7592cf1ef73b15bd Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 423eed223aa5..e37e0982cd46 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2218,6 +2218,8 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) adev->ip_blocks[i].status.late_initialized = true; } + amdgpu_ras_set_error_query_ready(adev, true); + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 68b82f7b0b80..8b14aee370c8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1921,10 +1921,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, } /* in resume phase, no need to create ras fs node */ - if (adev->in_suspend || adev->in_gpu_reset) { - amdgpu_ras_set_error_query_ready(adev, true); + if (adev->in_suspend || adev->in_gpu_reset) return 0; - } if (ih_info->cb) { r = amdgpu_ras_interrupt_add_handler(adev, ih_info); @@ -1936,8 +1934,6 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, if (r) goto sysfs; - amdgpu_ras_set_error_query_ready(adev, true); - return 0; cleanup: amdgpu_ras_sysfs_remove(adev, ras_block); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amd/amdgpu: set error query ready after all IPs late init
If set error query ready in amdgpu_ras_late_init, which will cause some IP blocks aren't initialized, but their error query is ready. Change-Id: I5087527261cb1b462afd82ad7592cf1ef73b15bd Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c old mode 100644 new mode 100755 index 423eed223aa5..e37e0982cd46 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2218,6 +2218,8 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) adev->ip_blocks[i].status.late_initialized = true; } + amdgpu_ras_set_error_query_ready(adev, true); + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 68b82f7b0b80..060866d372a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1922,7 +1922,6 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, /* in resume phase, no need to create ras fs node */ if (adev->in_suspend || adev->in_gpu_reset) { - amdgpu_ras_set_error_query_ready(adev, true); return 0; } @@ -1936,8 +1935,6 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, if (r) goto sysfs; - amdgpu_ras_set_error_query_ready(adev, true); - return 0; cleanup: amdgpu_ras_sysfs_remove(adev, ras_block); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: replace DRM prefix with PCI device info for gfx/mmhub
Prefix RAS message printing in gfx/mmhub with PCI device info, which assists the debug in multiple GPU case. Change-Id: Iceba7cafd5aac7d0251d9f871503745cc617fba2 Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c old mode 100644 new mode 100755 index dce945ef21a5..46351db36922 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c @@ -732,7 +732,8 @@ static int gfx_v9_4_query_utc_edc_status(struct amdgpu_device *adev, sec_count = REG_GET_FIELD(data, VML2_WALKER_MEM_ECC_CNTL, SEC_COUNT); if (sec_count) { - DRM_INFO("Instance[%d]: SubBlock %s, SEC %d\n", i, + dev_info(adev->dev, +"Instance[%d]: SubBlock %s, SEC %d\n", i, vml2_walker_mems[i], sec_count); err_data->ce_count += sec_count; } @@ -740,7 +741,8 @@ static int gfx_v9_4_query_utc_edc_status(struct amdgpu_device *adev, ded_count = REG_GET_FIELD(data, VML2_WALKER_MEM_ECC_CNTL, DED_COUNT); if (ded_count) { - DRM_INFO("Instance[%d]: SubBlock %s, DED %d\n", i, + dev_info(adev->dev, +"Instance[%d]: SubBlock %s, DED %d\n", i, vml2_walker_mems[i], ded_count); err_data->ue_count += ded_count; } @@ -752,14 +754,16 @@ static int gfx_v9_4_query_utc_edc_status(struct amdgpu_device *adev, sec_count = REG_GET_FIELD(data, UTCL2_MEM_ECC_CNTL, SEC_COUNT); if (sec_count) { - DRM_INFO("Instance[%d]: SubBlock %s, SEC %d\n", i, + dev_info(adev->dev, +"Instance[%d]: SubBlock %s, SEC %d\n", i, utcl2_router_mems[i], sec_count); err_data->ce_count += sec_count; } ded_count = REG_GET_FIELD(data, UTCL2_MEM_ECC_CNTL, DED_COUNT); if (ded_count) { - DRM_INFO("Instance[%d]: SubBlock %s, DED %d\n", i, + dev_info(adev->dev, +"Instance[%d]: SubBlock %s, DED %d\n", i, utcl2_router_mems[i], ded_count); err_data->ue_count += ded_count; } @@ -772,7 +776,8 @@ static int gfx_v9_4_query_utc_edc_status(struct amdgpu_device *adev, sec_count = REG_GET_FIELD(data, ATC_L2_CACHE_2M_DSM_CNTL, SEC_COUNT); if (sec_count) { - DRM_INFO("Instance[%d]: SubBlock %s, SEC %d\n", i, + dev_info(adev->dev, +"Instance[%d]: SubBlock %s, SEC %d\n", i, atc_l2_cache_2m_mems[i], sec_count); err_data->ce_count += sec_count; } @@ -780,7 +785,8 @@ static int gfx_v9_4_query_utc_edc_status(struct amdgpu_device *adev, ded_count = REG_GET_FIELD(data, ATC_L2_CACHE_2M_DSM_CNTL, DED_COUNT); if (ded_count) { - DRM_INFO("Instance[%d]: SubBlock %s, DED %d\n", i, + dev_info(adev->dev, +"Instance[%d]: SubBlock %s, DED %d\n", i, atc_l2_cache_2m_mems[i], ded_count); err_data->ue_count += ded_count; } @@ -793,7 +799,8 @@ static int gfx_v9_4_query_utc_edc_status(struct amdgpu_device *adev, sec_count = REG_GET_FIELD(data, ATC_L2_CACHE_4K_DSM_CNTL, SEC_COUNT); if (sec_count) { - DRM_INFO("Instance[%d]: SubBlock %s, SEC %d\n", i, + dev_info(adev->dev, +"Instance[%d]: SubBlock %s, SEC %d\n", i, atc_l2_cache_4k_mems[i], sec_count); err_data->ce_count += sec_count; } @@ -801,7 +808,8 @@ static int gfx_v9_4_query_utc_edc_status(struct amdgpu_device *adev, ded_count = REG_GET_FIELD(data, ATC_L2_CACHE_4K_DSM_CNTL, DED_COUNT); if (ded_count) { - DRM_INFO("Instance[%d]: SubBlock %s, DED %d\n", i, + dev_info(adev->dev, +"Instance[%d]: Su
[PATCH] drm/amdgpu: fix the coverage issue to clear ArcVPGRs
Set ComputePGMRSRC1.VGPRS as 0x3f to clear all ArcVGPRs. Change-Id: I296c3a162c0d5c7b84d4b48dc2002340a5c22e2a Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c old mode 100644 new mode 100755 index 324838baa71c..44fb64460c1f --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4264,7 +4264,7 @@ static const struct soc15_reg_entry vgpr_init_regs_arcturus[] = { { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_X), 0x40 }, { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_Y), 4 }, { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_Z), 1 }, - { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC1), 0x81 }, + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC1), 0xbf }, { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC2), 0x40 }, /* 64KB LDS */ { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE0), 0x }, { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE1), 0x }, -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2] drm/amdgpu: add codes to clear AccVGPR for arcturus
AccVGPRs are newly added in arcturus. Before reading these registers, they should be initialized. Otherwise edc error happens, when RAS is enabled. v2: reuse the existing logical to calculate register size Change-Id: I4ed384f0cc4b781a10cfd6ad1e3a132445bdc261 Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c old mode 100644 new mode 100755 index c78ffdc51373..324838baa71c --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4144,6 +4144,101 @@ static const u32 sgpr_init_compute_shader[] = 0xbe800080, 0xbf81, }; +static const u32 vgpr_init_compute_shader_arcturus[] = { + 0xd3d94000, 0x1880, 0xd3d94001, 0x1880, 0xd3d94002, 0x1880, + 0xd3d94003, 0x1880, 0xd3d94004, 0x1880, 0xd3d94005, 0x1880, + 0xd3d94006, 0x1880, 0xd3d94007, 0x1880, 0xd3d94008, 0x1880, + 0xd3d94009, 0x1880, 0xd3d9400a, 0x1880, 0xd3d9400b, 0x1880, + 0xd3d9400c, 0x1880, 0xd3d9400d, 0x1880, 0xd3d9400e, 0x1880, + 0xd3d9400f, 0x1880, 0xd3d94010, 0x1880, 0xd3d94011, 0x1880, + 0xd3d94012, 0x1880, 0xd3d94013, 0x1880, 0xd3d94014, 0x1880, + 0xd3d94015, 0x1880, 0xd3d94016, 0x1880, 0xd3d94017, 0x1880, + 0xd3d94018, 0x1880, 0xd3d94019, 0x1880, 0xd3d9401a, 0x1880, + 0xd3d9401b, 0x1880, 0xd3d9401c, 0x1880, 0xd3d9401d, 0x1880, + 0xd3d9401e, 0x1880, 0xd3d9401f, 0x1880, 0xd3d94020, 0x1880, + 0xd3d94021, 0x1880, 0xd3d94022, 0x1880, 0xd3d94023, 0x1880, + 0xd3d94024, 0x1880, 0xd3d94025, 0x1880, 0xd3d94026, 0x1880, + 0xd3d94027, 0x1880, 0xd3d94028, 0x1880, 0xd3d94029, 0x1880, + 0xd3d9402a, 0x1880, 0xd3d9402b, 0x1880, 0xd3d9402c, 0x1880, + 0xd3d9402d, 0x1880, 0xd3d9402e, 0x1880, 0xd3d9402f, 0x1880, + 0xd3d94030, 0x1880, 0xd3d94031, 0x1880, 0xd3d94032, 0x1880, + 0xd3d94033, 0x1880, 0xd3d94034, 0x1880, 0xd3d94035, 0x1880, + 0xd3d94036, 0x1880, 0xd3d94037, 0x1880, 0xd3d94038, 0x1880, + 0xd3d94039, 0x1880, 0xd3d9403a, 0x1880, 0xd3d9403b, 0x1880, + 0xd3d9403c, 0x1880, 0xd3d9403d, 0x1880, 0xd3d9403e, 0x1880, + 0xd3d9403f, 0x1880, 0xd3d94040, 0x1880, 0xd3d94041, 0x1880, + 0xd3d94042, 0x1880, 0xd3d94043, 0x1880, 0xd3d94044, 0x1880, + 0xd3d94045, 0x1880, 0xd3d94046, 0x1880, 0xd3d94047, 0x1880, + 0xd3d94048, 0x1880, 0xd3d94049, 0x1880, 0xd3d9404a, 0x1880, + 0xd3d9404b, 0x1880, 0xd3d9404c, 0x1880, 0xd3d9404d, 0x1880, + 0xd3d9404e, 0x1880, 0xd3d9404f, 0x1880, 0xd3d94050, 0x1880, + 0xd3d94051, 0x1880, 0xd3d94052, 0x1880, 0xd3d94053, 0x1880, + 0xd3d94054, 0x1880, 0xd3d94055, 0x1880, 0xd3d94056, 0x1880, + 0xd3d94057, 0x1880, 0xd3d94058, 0x1880, 0xd3d94059, 0x1880, + 0xd3d9405a, 0x1880, 0xd3d9405b, 0x1880, 0xd3d9405c, 0x1880, + 0xd3d9405d, 0x1880, 0xd3d9405e, 0x1880, 0xd3d9405f, 0x1880, + 0xd3d94060, 0x1880, 0xd3d94061, 0x1880, 0xd3d94062, 0x1880, + 0xd3d94063, 0x1880, 0xd3d94064, 0x1880, 0xd3d94065, 0x1880, + 0xd3d94066, 0x1880, 0xd3d94067, 0x1880, 0xd3d94068, 0x1880, + 0xd3d94069, 0x1880, 0xd3d9406a, 0x1880, 0xd3d9406b, 0x1880, + 0xd3d9406c, 0x1880, 0xd3d9406d, 0x1880, 0xd3d9406e, 0x1880, + 0xd3d9406f, 0x1880, 0xd3d94070, 0x1880, 0xd3d94071, 0x1880, + 0xd3d94072, 0x1880, 0xd3d94073, 0x1880, 0xd3d94074, 0x1880, + 0xd3d94075, 0x1880, 0xd3d94076, 0x1880, 0xd3d94077, 0x1880, + 0xd3d94078, 0x1880, 0xd3d94079, 0x1880, 0xd3d9407a, 0x1880, + 0xd3d9407b, 0x1880, 0xd3d9407c, 0x1880, 0xd3d9407d, 0x1880, + 0xd3d9407e, 0x1880, 0xd3d9407f, 0x1880, 0xd3d94080, 0x1880, + 0xd3d94081, 0x1880, 0xd3d94082, 0x1880, 0xd3d94083, 0x1880, + 0xd3d94084, 0x1880, 0xd3d94085, 0x1880, 0xd3d94086, 0x1880, + 0xd3d94087, 0x1880, 0xd3d94088, 0x1880, 0xd3d94089, 0x1880, + 0xd3d9408a, 0x1880, 0xd3d9408b, 0x1880, 0xd3d9408c, 0x1880, + 0xd3d9408d, 0x1880, 0xd3d9408e, 0x1880, 0xd3d9408f, 0x1880, + 0xd3d94090, 0x1880, 0xd3d94091, 0x1880, 0xd3d94092, 0x1880, + 0xd3d94093, 0x1880, 0xd3d94094, 0x1880, 0xd3d94095, 0x1880, + 0xd3d94096, 0x1880, 0xd3d94097, 0x1880, 0xd3d94098, 0x1880, + 0xd3d94099, 0x1880, 0xd3d9409a, 0x1880, 0xd3d9409b, 0x1880, + 0xd3d9409c, 0x1880, 0xd3d9409d, 0x1880, 0xd3d9409e, 0x1880, + 0xd3d9409f, 0x1880, 0xd3d940a0, 0x1880
[PATCH] drm/amdgpu: add codes to clear AccVGPR for arcturus
AccVGPRs are newly added in arcturus. Before reading these registers, they should be initialized. Otherwise edc error happens, when RAS is enabled. Change-Id: I4ed384f0cc4b781a10cfd6ad1e3a132445bdc261 Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c old mode 100644 new mode 100755 index c78ffdc51373..d5dd754bfb85 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4144,6 +4144,101 @@ static const u32 sgpr_init_compute_shader[] = 0xbe800080, 0xbf81, }; +static const u32 vgpr_init_compute_shader_arcturus[] = { + 0xd3d94000, 0x1880, 0xd3d94001, 0x1880, 0xd3d94002, 0x1880, + 0xd3d94003, 0x1880, 0xd3d94004, 0x1880, 0xd3d94005, 0x1880, + 0xd3d94006, 0x1880, 0xd3d94007, 0x1880, 0xd3d94008, 0x1880, + 0xd3d94009, 0x1880, 0xd3d9400a, 0x1880, 0xd3d9400b, 0x1880, + 0xd3d9400c, 0x1880, 0xd3d9400d, 0x1880, 0xd3d9400e, 0x1880, + 0xd3d9400f, 0x1880, 0xd3d94010, 0x1880, 0xd3d94011, 0x1880, + 0xd3d94012, 0x1880, 0xd3d94013, 0x1880, 0xd3d94014, 0x1880, + 0xd3d94015, 0x1880, 0xd3d94016, 0x1880, 0xd3d94017, 0x1880, + 0xd3d94018, 0x1880, 0xd3d94019, 0x1880, 0xd3d9401a, 0x1880, + 0xd3d9401b, 0x1880, 0xd3d9401c, 0x1880, 0xd3d9401d, 0x1880, + 0xd3d9401e, 0x1880, 0xd3d9401f, 0x1880, 0xd3d94020, 0x1880, + 0xd3d94021, 0x1880, 0xd3d94022, 0x1880, 0xd3d94023, 0x1880, + 0xd3d94024, 0x1880, 0xd3d94025, 0x1880, 0xd3d94026, 0x1880, + 0xd3d94027, 0x1880, 0xd3d94028, 0x1880, 0xd3d94029, 0x1880, + 0xd3d9402a, 0x1880, 0xd3d9402b, 0x1880, 0xd3d9402c, 0x1880, + 0xd3d9402d, 0x1880, 0xd3d9402e, 0x1880, 0xd3d9402f, 0x1880, + 0xd3d94030, 0x1880, 0xd3d94031, 0x1880, 0xd3d94032, 0x1880, + 0xd3d94033, 0x1880, 0xd3d94034, 0x1880, 0xd3d94035, 0x1880, + 0xd3d94036, 0x1880, 0xd3d94037, 0x1880, 0xd3d94038, 0x1880, + 0xd3d94039, 0x1880, 0xd3d9403a, 0x1880, 0xd3d9403b, 0x1880, + 0xd3d9403c, 0x1880, 0xd3d9403d, 0x1880, 0xd3d9403e, 0x1880, + 0xd3d9403f, 0x1880, 0xd3d94040, 0x1880, 0xd3d94041, 0x1880, + 0xd3d94042, 0x1880, 0xd3d94043, 0x1880, 0xd3d94044, 0x1880, + 0xd3d94045, 0x1880, 0xd3d94046, 0x1880, 0xd3d94047, 0x1880, + 0xd3d94048, 0x1880, 0xd3d94049, 0x1880, 0xd3d9404a, 0x1880, + 0xd3d9404b, 0x1880, 0xd3d9404c, 0x1880, 0xd3d9404d, 0x1880, + 0xd3d9404e, 0x1880, 0xd3d9404f, 0x1880, 0xd3d94050, 0x1880, + 0xd3d94051, 0x1880, 0xd3d94052, 0x1880, 0xd3d94053, 0x1880, + 0xd3d94054, 0x1880, 0xd3d94055, 0x1880, 0xd3d94056, 0x1880, + 0xd3d94057, 0x1880, 0xd3d94058, 0x1880, 0xd3d94059, 0x1880, + 0xd3d9405a, 0x1880, 0xd3d9405b, 0x1880, 0xd3d9405c, 0x1880, + 0xd3d9405d, 0x1880, 0xd3d9405e, 0x1880, 0xd3d9405f, 0x1880, + 0xd3d94060, 0x1880, 0xd3d94061, 0x1880, 0xd3d94062, 0x1880, + 0xd3d94063, 0x1880, 0xd3d94064, 0x1880, 0xd3d94065, 0x1880, + 0xd3d94066, 0x1880, 0xd3d94067, 0x1880, 0xd3d94068, 0x1880, + 0xd3d94069, 0x1880, 0xd3d9406a, 0x1880, 0xd3d9406b, 0x1880, + 0xd3d9406c, 0x1880, 0xd3d9406d, 0x1880, 0xd3d9406e, 0x1880, + 0xd3d9406f, 0x1880, 0xd3d94070, 0x1880, 0xd3d94071, 0x1880, + 0xd3d94072, 0x1880, 0xd3d94073, 0x1880, 0xd3d94074, 0x1880, + 0xd3d94075, 0x1880, 0xd3d94076, 0x1880, 0xd3d94077, 0x1880, + 0xd3d94078, 0x1880, 0xd3d94079, 0x1880, 0xd3d9407a, 0x1880, + 0xd3d9407b, 0x1880, 0xd3d9407c, 0x1880, 0xd3d9407d, 0x1880, + 0xd3d9407e, 0x1880, 0xd3d9407f, 0x1880, 0xd3d94080, 0x1880, + 0xd3d94081, 0x1880, 0xd3d94082, 0x1880, 0xd3d94083, 0x1880, + 0xd3d94084, 0x1880, 0xd3d94085, 0x1880, 0xd3d94086, 0x1880, + 0xd3d94087, 0x1880, 0xd3d94088, 0x1880, 0xd3d94089, 0x1880, + 0xd3d9408a, 0x1880, 0xd3d9408b, 0x1880, 0xd3d9408c, 0x1880, + 0xd3d9408d, 0x1880, 0xd3d9408e, 0x1880, 0xd3d9408f, 0x1880, + 0xd3d94090, 0x1880, 0xd3d94091, 0x1880, 0xd3d94092, 0x1880, + 0xd3d94093, 0x1880, 0xd3d94094, 0x1880, 0xd3d94095, 0x1880, + 0xd3d94096, 0x1880, 0xd3d94097, 0x1880, 0xd3d94098, 0x1880, + 0xd3d94099, 0x1880, 0xd3d9409a, 0x1880, 0xd3d9409b, 0x1880, + 0xd3d9409c, 0x1880, 0xd3d9409d, 0x1880, 0xd3d9409e, 0x1880, + 0xd3d9409f, 0x1880, 0xd3d940a0, 0x1880, 0xd3d940a1, 0x1880, + 0xd3d940a2, 0x1880
[PATCH] drm/amdgpu: fix a bug NULL pointer dereference
check whether the queue of entity is null to avoid null pointer dereference. Change-Id: I08d56774012cf229ba2fe7a011c1359e8d1e2781 Signed-off-by: Dennis Li diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c index 4cc7881f438c..67cca463ddcc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c @@ -95,6 +95,9 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p, int r; entity = p->direct ? >vm->direct : >vm->delayed; + if (!entity->rq) + return 0; + ring = container_of(entity->rq->sched, struct amdgpu_ring, sched); WARN_ON(ib->length_dw == 0); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 0/2] query edc counter for more mmhub sub-blocks of Acrturus
1. Add RAS support for MAM D(0~3)_MEM in mmhub. 2. Add RAS support for other mmhub ranges from 2 to 7. Dennis Li (2): drm/amdgpu: update mmhub 9.4.1 header files for Acrturus drm/amdgpu: enable RAS feature for more mmhub sub-blocks of Acrturus drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c | 701 +- .../asic_reg/mmhub/mmhub_9_4_1_sh_mask.h | 128 2 files changed, 823 insertions(+), 6 deletions(-) -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/2] drm/amdgpu: update mmhub 9.4.1 header files for Acrturus
Add mask & shift definition of MAM_D(0~3)MEM for all mmhub ranges. Change-Id: I65c8a3040611198273a4b6da77c1a1ad2ffe7fd3 Signed-off-by: Dennis Li --- .../asic_reg/mmhub/mmhub_9_4_1_sh_mask.h | 128 ++ 1 file changed, 128 insertions(+) diff --git a/drivers/gpu/drm/amd/include/asic_reg/mmhub/mmhub_9_4_1_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/mmhub/mmhub_9_4_1_sh_mask.h index 40dfbf16bd34..111a71b434e2 100644 --- a/drivers/gpu/drm/amd/include/asic_reg/mmhub/mmhub_9_4_1_sh_mask.h +++ b/drivers/gpu/drm/amd/include/asic_reg/mmhub/mmhub_9_4_1_sh_mask.h @@ -11185,6 +11185,14 @@ #define MMEA0_EDC_CNT2__GMIWR_DATAMEM_DED_COUNT__SHIFT 0xa #define MMEA0_EDC_CNT2__GMIRD_PAGEMEM_SED_COUNT__SHIFT 0xc #define MMEA0_EDC_CNT2__GMIWR_PAGEMEM_SED_COUNT__SHIFT 0xe +#define MMEA0_EDC_CNT2__MAM_D0MEM_SED_COUNT__SHIFT 0x10 +#define MMEA0_EDC_CNT2__MAM_D1MEM_SED_COUNT__SHIFT 0x12 +#define MMEA0_EDC_CNT2__MAM_D2MEM_SED_COUNT__SHIFT 0x14 +#define MMEA0_EDC_CNT2__MAM_D3MEM_SED_COUNT__SHIFT 0x16 +#define MMEA0_EDC_CNT2__MAM_D0MEM_DED_COUNT__SHIFT 0x18 +#define MMEA0_EDC_CNT2__MAM_D1MEM_DED_COUNT__SHIFT 0x1a +#define MMEA0_EDC_CNT2__MAM_D2MEM_DED_COUNT__SHIFT 0x1c +#define MMEA0_EDC_CNT2__MAM_D3MEM_DED_COUNT__SHIFT 0x1e #define MMEA0_EDC_CNT2__GMIRD_CMDMEM_SEC_COUNT_MASK 0x0003L #define MMEA0_EDC_CNT2__GMIRD_CMDMEM_DED_COUNT_MASK 0x000CL #define MMEA0_EDC_CNT2__GMIWR_CMDMEM_SEC_COUNT_MASK 0x0030L @@ -11193,6 +11201,14 @@ #define MMEA0_EDC_CNT2__GMIWR_DATAMEM_DED_COUNT_MASK 0x0C00L #define MMEA0_EDC_CNT2__GMIRD_PAGEMEM_SED_COUNT_MASK 0x3000L #define MMEA0_EDC_CNT2__GMIWR_PAGEMEM_SED_COUNT_MASK 0xC000L +#define MMEA0_EDC_CNT2__MAM_D0MEM_SED_COUNT_MASK 0x0003L +#define MMEA0_EDC_CNT2__MAM_D1MEM_SED_COUNT_MASK 0x000CL +#define MMEA0_EDC_CNT2__MAM_D2MEM_SED_COUNT_MASK 0x0030L +#define MMEA0_EDC_CNT2__MAM_D3MEM_SED_COUNT_MASK 0x00C0L +#define MMEA0_EDC_CNT2__MAM_D0MEM_DED_COUNT_MASK 0x0300L +#define MMEA0_EDC_CNT2__MAM_D1MEM_DED_COUNT_MASK 0x0C00L +#define MMEA0_EDC_CNT2__MAM_D2MEM_DED_COUNT_MASK 0x3000L +#define MMEA0_EDC_CNT2__MAM_D3MEM_DED_COUNT_MASK 0xC000L //MMEA0_DSM_CNTL #define MMEA0_DSM_CNTL__DRAMRD_CMDMEM_DSM_IRRITATOR_DATA__SHIFT 0x0 #define MMEA0_DSM_CNTL__DRAMRD_CMDMEM_ENABLE_SINGLE_WRITE__SHIFT 0x2 @@ -14197,6 +14213,14 @@ #define MMEA1_EDC_CNT2__GMIWR_DATAMEM_DED_COUNT__SHIFT 0xa #define MMEA1_EDC_CNT2__GMIRD_PAGEMEM_SED_COUNT__SHIFT 0xc #define MMEA1_EDC_CNT2__GMIWR_PAGEMEM_SED_COUNT__SHIFT 0xe +#define MMEA1_EDC_CNT2__MAM_D0MEM_SED_COUNT__SHIFT 0x10 +#define MMEA1_EDC_CNT2__MAM_D1MEM_SED_COUNT__SHIFT 0x12 +#define MMEA1_EDC_CNT2__MAM_D2MEM_SED_COUNT__SHIFT 0x14 +#define MMEA1_EDC_CNT2__MAM_D3MEM_SED_COUNT__SHIFT 0x16 +#define MMEA1_EDC_CNT2__MAM_D0MEM_DED_COUNT__SHIFT 0x18 +#define MMEA1_EDC_CNT2__MAM_D1MEM_DED_COUNT__SHIFT 0x1a +#de
[PATCH 2/2] drm/amdgpu: enable RAS feature for more mmhub sub-blocks of Acrturus
Compared with Vg20, the size of mmhub range is changed from 2 to 8. Change-Id: I529c0ff0aaed200e5b102d482563ed9dc2278260 Signed-off-by: Dennis Li --- drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c | 701 +++- 1 file changed, 695 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c index 5c42387c9274..e8bb0a91ed65 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c @@ -663,6 +663,7 @@ void mmhub_v9_4_get_clockgating(struct amdgpu_device *adev, u32 *flags) } static const struct soc15_ras_field_entry mmhub_v9_4_ras_fields[] = { + /* MMHUB Range 0 */ { "MMEA0_DRAMRD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT), SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMRD_CMDMEM_SEC_COUNT), SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMRD_CMDMEM_DED_COUNT), @@ -751,6 +752,24 @@ static const struct soc15_ras_field_entry mmhub_v9_4_ras_fields[] = { 0, 0, SOC15_REG_FIELD(MMEA0_EDC_CNT3, GMIWR_PAGEMEM_DED_COUNT), }, + { "MMEA0_MAM_D0MEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2), + SOC15_REG_FIELD(MMEA0_EDC_CNT2, MAM_D0MEM_SED_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT2, MAM_D0MEM_DED_COUNT), + }, + { "MMEA0_MAM_D1MEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2), + SOC15_REG_FIELD(MMEA0_EDC_CNT2, MAM_D1MEM_SED_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT2, MAM_D1MEM_DED_COUNT), + }, + { "MMEA0_MAM_D2MEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2), + SOC15_REG_FIELD(MMEA0_EDC_CNT2, MAM_D2MEM_SED_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT2, MAM_D2MEM_DED_COUNT), + }, + { "MMEA0_MAM_D3MEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2), + SOC15_REG_FIELD(MMEA0_EDC_CNT2, MAM_D3MEM_SED_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT2, MAM_D3MEM_DED_COUNT), + }, + + /* MMHUB Range 1 */ { "MMEA1_DRAMRD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA1_EDC_CNT), SOC15_REG_FIELD(MMEA1_EDC_CNT, DRAMRD_CMDMEM_SEC_COUNT), SOC15_REG_FIELD(MMEA1_EDC_CNT, DRAMRD_CMDMEM_DED_COUNT), @@ -838,16 +857,686 @@ static const struct soc15_ras_field_entry mmhub_v9_4_ras_fields[] = { { "MMEA1_GMIWR_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA1_EDC_CNT3), 0, 0, SOC15_REG_FIELD(MMEA1_EDC_CNT3, GMIWR_PAGEMEM_DED_COUNT), + }, + { "MMEA1_MAM_D0MEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA1_EDC_CNT2), + SOC15_REG_FIELD(MMEA1_EDC_CNT2, MAM_D0MEM_SED_COUNT), + SOC15_REG_FIELD(MMEA1_EDC_CNT2, MAM_D0MEM_DED_COUNT), + }, + { "MMEA1_MAM_D1MEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA1_EDC_CNT2), + SOC15_REG_FIELD(MMEA1_EDC_CNT2, MAM_D1MEM_SED_COUNT), + SOC15_REG_FIELD(MMEA1_EDC_CNT2, MAM_D1MEM_DED_COUNT), + }, + { "MMEA1_MAM_D2MEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA1_EDC_CNT2), + SOC15_REG_FIELD(MMEA1_EDC_CNT2, MAM_D2MEM_SED_COUNT), + SOC15_REG_FIELD(MMEA1_EDC_CNT2, MAM_D2MEM_DED_COUNT), + }, + { "MMEA1_MAM_D3MEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA1_EDC_CNT2), + SOC15_REG_FIELD(MMEA1_EDC_CNT2, MAM_D3MEM_SED_COUNT), + SOC15_REG_FIELD(MMEA1_EDC_CNT2, MAM_D3MEM_DED_COUNT), + }, + + /* MMHUB Range 2*/ + { "MMEA2_DRAMRD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_EDC_CNT), + SOC15_REG_FIELD(MMEA2_EDC_CNT, DRAMRD_CMDMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA2_EDC_CNT, DRAMRD_CMDMEM_DED_COUNT), + }, + { "MMEA2_DRAMWR_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_EDC_CNT), + SOC15_REG_FIELD(MMEA2_EDC_CNT, DRAMWR_CMDMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA2_EDC_CNT, DRAMWR_CMDMEM_DED_COUNT), + }, + { "MMEA2_DRAMWR_DATAMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_EDC_CNT), + SOC15_REG_FIELD(MMEA2_EDC_CNT, DRAMWR_DATAMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA2_EDC_CNT, DRAMWR_DATAMEM_DED_COUNT), + }, + { "MMEA2_RRET_TAGMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_EDC_CNT), + SOC15_REG_FIELD(MMEA2_EDC_CNT, RRET_TAGMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA2_EDC_CNT, RRET_TAGMEM_DED_COUNT), + }, + { "MMEA2_WRET_TAGMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_EDC_CNT), + SOC15_REG_FIELD(MMEA2_EDC_CNT, WRET_TAGMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA2_EDC_CNT, WRET_TAGMEM_DED_COUNT), + }, + { "MMEA2_DRAMRD_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_EDC_CNT), + SOC15_REG_FIELD(MMEA2_EDC_CNT, DRAMRD_PAGEMEM_SED_COUNT), + 0, 0, + }, + { "MMEA2_DRAMWR_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_EDC_CNT), + SOC15_REG_FIELD(MMEA2_EDC_CNT, DRAMWR_PAGEMEM_SED_COUNT), + 0, 0, + }, + { "MMEA2_IORD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_EDC_CNT), + SO
[PATCH 4/4] drm/amdgpu: add RAS support for the gfx block of Arcturus
Implement functions to do the RAS error injection and query EDC counter. Change-Id: I4d947511331a19c1967551b9d42997698073f795 Signed-off-by: Dennis Li --- drivers/gpu/drm/amd/amdgpu/Makefile | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 26 +- drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c | 978 ++ drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h | 35 + 4 files changed, 1039 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c create mode 100644 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index 83ee1c676e3a..ccfdcfc6a526 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -120,6 +120,7 @@ amdgpu-y += \ amdgpu_rlc.o \ gfx_v8_0.o \ gfx_v9_0.o \ + gfx_v9_4.o \ gfx_v10_0.o # add async DMA block diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 35b5ca7a9272..7c5b3ad25d51 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -48,6 +48,8 @@ #include "amdgpu_ras.h" +#include "gfx_v9_4.h" + #define GFX9_NUM_GFX_RINGS 1 #define GFX9_MEC_HPD_SIZE 4096 #define RLCG_UCODE_LOADING_START_ADDRESS 0x2000L @@ -1822,6 +1824,17 @@ static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = { .query_ras_error_count = _v9_0_query_ras_error_count }; +static const struct amdgpu_gfx_funcs gfx_v9_4_gfx_funcs = { + .get_gpu_clock_counter = _v9_0_get_gpu_clock_counter, + .select_se_sh = _v9_0_select_se_sh, + .read_wave_data = _v9_0_read_wave_data, + .read_wave_sgprs = _v9_0_read_wave_sgprs, + .read_wave_vgprs = _v9_0_read_wave_vgprs, + .select_me_pipe_q = _v9_0_select_me_pipe_q, + .ras_error_inject = _v9_4_ras_error_inject, + .query_ras_error_count = _v9_4_query_ras_error_count +}; + static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev) { u32 gb_addr_config; @@ -1873,6 +1886,7 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev) gb_addr_config = RAVEN_GB_ADDR_CONFIG_GOLDEN; break; case CHIP_ARCTURUS: + adev->gfx.funcs = _v9_4_gfx_funcs; adev->gfx.config.max_hw_contexts = 8; adev->gfx.config.sc_prim_fifo_size_frontend = 0x20; adev->gfx.config.sc_prim_fifo_size_backend = 0x100; @@ -4232,7 +4246,17 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev) goto fail; } - gfx_v9_0_clear_ras_edc_counter(adev); + switch (adev->asic_type) + { + case CHIP_VEGA20: + gfx_v9_0_clear_ras_edc_counter(adev); + break; + case CHIP_ARCTURUS: + gfx_v9_4_clear_ras_edc_counter(adev); + break; + default: + break; + } fail: amdgpu_ib_free(adev, , NULL); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c new file mode 100644 index ..e19d275f3f7d --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c @@ -0,0 +1,978 @@ +/* + * Copyright 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include + +#include "amdgpu.h" +#include "amdgpu_gfx.h" +#include "soc15.h" +#include "soc15d.h" +#include "amdgpu_atomfirmware.h" +#include "amdgpu_pm.h" + +#include "gc/gc_9_4_1_offset.h" +#include "gc/gc_9_4_1_sh_mask.h" +#include "soc15_common.h" + +#include "gfx_v9_4.h" +#include "amdgpu_ras.h" + +static const struct soc15_reg_entry gfx_v9_4_edc_counter_regs[] = { + /* CPC */ + { SOC15_R
[PATCH 0/4] Enable RAS feature for the gc of Arcturus
Refactor the ras related codes of vega20: 1. refine the security check for RAS functions. 2. abstract clearing edc counters to a separated function. 3. add ip prefix to ip related codes. Implementation of RAS feature for Arcturus gfx: 1. add new register head files for gfx v9.4.1. 2. add codes to support querying of EDC counter and error injection. Dennis Li (4): drm/amdgpu: refine the security check for RAS functions drm/amdgpu: abstract EDC counter clear to a separated function drm/amdgpu: add EDC counter registers of gc for Arcturus drm/amdgpu: add RAS support for the gfx block of Arcturus drivers/gpu/drm/amd/amdgpu/Makefile | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 138 ++- drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c | 978 ++ drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h | 35 + .../amd/include/asic_reg/gc/gc_9_4_1_offset.h | 264 + .../include/asic_reg/gc/gc_9_4_1_sh_mask.h| 748 ++ 6 files changed, 2128 insertions(+), 36 deletions(-) create mode 100644 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c create mode 100644 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h create mode 100644 drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h create mode 100644 drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_sh_mask.h -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/4] drm/amdgpu: refine the security check for RAS functions
To avoid calling RAS related functions when RAS feature isn't supported in hardware. Change to check supported features, instead of checking asic type. v2: reuse amdgpu_ras_is_supported function, instead of introducing a new flag for hardware ras feature. Change-Id: Ia3f73bd9ee41ee3d0dd18d6f46e67124cf88d653 Signed-off-by: Dennis Li --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index e3d466bd5c4e..759d8144f9c0 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -5967,7 +5967,7 @@ static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev, int ret; struct ta_ras_trigger_error_input block_info = { 0 }; - if (adev->asic_type != CHIP_VEGA20) + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) return -EINVAL; if (info->head.sub_block_index >= ARRAY_SIZE(ras_gfx_subblocks)) @@ -6218,7 +6218,7 @@ static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev, uint32_t i, j, k; uint32_t reg_value; - if (adev->asic_type != CHIP_VEGA20) + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) return -EINVAL; err_data->ue_count = 0; -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 3/4] drm/amdgpu: add EDC counter registers of gc for Arcturus
add reg headers to gc includes v2: remove unused registers and fields in this patch set Change-Id: If3476c0b0ed88e5d11bdb8bec1278ae10fc5af25 Signed-off-by: Dennis Li --- .../amd/include/asic_reg/gc/gc_9_4_1_offset.h | 264 +++ .../include/asic_reg/gc/gc_9_4_1_sh_mask.h| 748 ++ 2 files changed, 1012 insertions(+) create mode 100644 drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h create mode 100644 drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_sh_mask.h diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h new file mode 100644 index ..f41556abfbbc --- /dev/null +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h @@ -0,0 +1,264 @@ +/* + * Copyright (C) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef _gc_9_4_1_OFFSET_HEADER +#define _gc_9_4_1_OFFSET_HEADER + +// addressBlock: gc_grbmdec +// base address: 0x8000 +#define mmGRBM_CNTL 0x +#define mmGRBM_CNTL_BASE_IDX 0 +#define mmGRBM_SKEW_CNTL 0x0001 +#define mmGRBM_SKEW_CNTL_BASE_IDX 0 +#define mmGRBM_STATUS2 0x0002 +#define mmGRBM_STATUS2_BASE_IDX 0 +#define mmGRBM_PWR_CNTL 0x0003 +#define mmGRBM_PWR_CNTL_BASE_IDX 0 +#define mmGRBM_STATUS 0x0004 +#define mmGRBM_STATUS_BASE_IDX 0 +#define mmGRBM_STATUS_SE0 0x0005 +#define mmGRBM_STATUS_SE0_BASE_IDX 0 +#define mmGRBM_STATUS_SE1 0x0006 +#define mmGRBM_STATUS_SE1_BASE_IDX 0 +#define mmGRBM_SOFT_RESET 0x0008 +#define mmGRBM_SOFT_RESET_BASE_IDX 0 +#define mmGRBM_GFX_CLKEN_CNTL 0x000c +#define mmGRBM_GFX_CLKEN_CNTL_BASE_IDX 0 +#define mmGRBM_WAIT_IDLE_CLOCKS 0x000d +#define mmGRBM_WAIT_IDLE_CLOCKS_BASE_IDX 0 +#define mmGRBM_STATUS_SE2 0x000e +#define mmGRBM_STATUS_SE2_BASE_IDX 0 +#define mmGRBM_STATUS_SE3 0x000f +#define mmGRBM_STATUS_SE3_BASE_IDX 0 +#define mmGRBM_READ_ERROR 0x0016 +#define mmGRBM_READ_ERROR_BASE_IDX 0 +#defin
[PATCH 2/4] drm/amdgpu: abstract EDC counter clear to a separated function
1. Add IP prefix for the IP related codes. 2. Refactor the code to clear EDC counter. Change-Id: I1cd9ec304a7ace9a74480264d24368fd11a87833 Signed-off-by: Dennis Li --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 112 ++ 1 file changed, 77 insertions(+), 35 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 759d8144f9c0..35b5ca7a9272 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -736,6 +736,7 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring); static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring); static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status); +static void gfx_v9_0_clear_ras_edc_counter(struct amdgpu_device *adev); static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev, void *inject_if); @@ -3996,7 +3997,7 @@ static const struct soc15_reg_entry sgpr2_init_regs[] = { { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE7), 0xff00 }, }; -static const struct soc15_reg_entry sec_ded_counter_registers[] = { +static const struct soc15_reg_entry gfx_v9_0_edc_counter_regs[] = { { SOC15_REG_ENTRY(GC, 0, mmCPC_EDC_SCRATCH_CNT), 0, 1, 1}, { SOC15_REG_ENTRY(GC, 0, mmCPC_EDC_UCODE_CNT), 0, 1, 1}, { SOC15_REG_ENTRY(GC, 0, mmCPF_EDC_ROQ_CNT), 0, 1, 1}, @@ -4085,7 +4086,7 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev) struct amdgpu_ring *ring = >gfx.compute_ring[0]; struct amdgpu_ib ib; struct dma_fence *f = NULL; - int r, i, j, k; + int r, i; unsigned total_size, vgpr_offset, sgpr_offset; u64 gpu_addr; @@ -4231,18 +4232,7 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev) goto fail; } - /* read back registers to clear the counters */ - mutex_lock(>grbm_idx_mutex); - for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++) { - for (j = 0; j < sec_ded_counter_registers[i].se_num; j++) { - for (k = 0; k < sec_ded_counter_registers[i].instance; k++) { - gfx_v9_0_select_se_sh(adev, j, 0x0, k); - RREG32(SOC15_REG_ENTRY_OFFSET(sec_ded_counter_registers[i])); - } - } - } - WREG32_SOC15(GC, 0, mmGRBM_GFX_INDEX, 0xe000); - mutex_unlock(>grbm_idx_mutex); + gfx_v9_0_clear_ras_edc_counter(adev); fail: amdgpu_ib_free(adev, , NULL); @@ -5519,7 +5509,7 @@ static int gfx_v9_0_priv_inst_irq(struct amdgpu_device *adev, } -static const struct soc15_ras_field_entry gc_ras_fields_vg20[] = { +static const struct soc15_ras_field_entry gfx_v9_0_ras_fields[] = { { "CPC_SCRATCH", SOC15_REG_ENTRY(GC, 0, mmCPC_EDC_SCRATCH_CNT), SOC15_REG_FIELD(CPC_EDC_SCRATCH_CNT, SEC_COUNT), SOC15_REG_FIELD(CPC_EDC_SCRATCH_CNT, DED_COUNT) @@ -6092,7 +6082,7 @@ static int gfx_v9_0_query_utc_edc_status(struct amdgpu_device *adev, WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_INDEX, 255); WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_CNT, 0); - for (i = 0; i < 16; i++) { + for (i = 0; i < ARRAY_SIZE(vml2_mems); i++) { WREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_INDEX, i); data = RREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_CNT); @@ -6111,7 +6101,7 @@ static int gfx_v9_0_query_utc_edc_status(struct amdgpu_device *adev, } } - for (i = 0; i < 7; i++) { + for (i = 0; i < ARRAY_SIZE(vml2_walker_mems); i++) { WREG32_SOC15(GC, 0, mmVM_L2_WALKER_MEM_ECC_INDEX, i); data = RREG32_SOC15(GC, 0, mmVM_L2_WALKER_MEM_ECC_CNT); @@ -6132,7 +6122,7 @@ static int gfx_v9_0_query_utc_edc_status(struct amdgpu_device *adev, } } - for (i = 0; i < 4; i++) { + for (i = 0; i < ARRAY_SIZE(atc_l2_cache_2m_mems); i++) { WREG32_SOC15(GC, 0, mmATC_L2_CACHE_2M_EDC_INDEX, i); data = RREG32_SOC15(GC, 0, mmATC_L2_CACHE_2M_EDC_CNT); @@ -6144,7 +6134,7 @@ static int gfx_v9_0_query_utc_edc_status(struct amdgpu_device *adev, } } - for (i = 0; i < 32; i++) { + for (i = 0; i < ARRAY_SIZE(atc_l2_cache_4k_mems); i++) { WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_INDEX, i); data = RREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_CNT); @@ -6171,36 +6161,36 @@ static int gfx_v9_0_query_utc_edc_status(struct amdgpu_device *adev, return 0; } -static int __get_ras_error_count(const struct soc15_reg_entry *reg, +static int gfx_v9_0_ras_error_count(const struct soc15_reg_entry *reg, uint32_t se_id, uint32_t inst_id, u
[PATCH v2 2/3] drm/amdgpu: refine query function of mmhub EDC counter in vg20
Add codes to print the detail EDC info for the subblock of mmhub v2: Move the EDC_CNT registers' defintion from mmhub_9_4 header files to mmhub_1_0 ones. Add mmhub_v1_0_ prefix for the local static variable and function. Change-Id: I1d5b3df38caa8f0b437c96b78091662aaeaf264b Signed-off-by: Dennis Li --- drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c | 232 .../include/asic_reg/mmhub/mmhub_1_0_offset.h | 16 ++ .../asic_reg/mmhub/mmhub_1_0_sh_mask.h| 122 + .../asic_reg/mmhub/mmhub_9_4_0_offset.h | 53 .../asic_reg/mmhub/mmhub_9_4_0_sh_mask.h | 257 -- 5 files changed, 318 insertions(+), 362 deletions(-) delete mode 100644 drivers/gpu/drm/amd/include/asic_reg/mmhub/mmhub_9_4_0_offset.h delete mode 100644 drivers/gpu/drm/amd/include/asic_reg/mmhub/mmhub_9_4_0_sh_mask.h diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c index 6965e1e6fa9e..d7575ac27038 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c @@ -27,17 +27,13 @@ #include "mmhub/mmhub_1_0_offset.h" #include "mmhub/mmhub_1_0_sh_mask.h" #include "mmhub/mmhub_1_0_default.h" -#include "mmhub/mmhub_9_4_0_offset.h" #include "vega10_enum.h" - +#include "soc15.h" #include "soc15_common.h" #define mmDAGB0_CNTL_MISC2_RV 0x008f #define mmDAGB0_CNTL_MISC2_RV_BASE_IDX 0 -#define EA_EDC_CNT_MASK 0x3 -#define EA_EDC_CNT_SHIFT 0x2 - u64 mmhub_v1_0_get_fb_location(struct amdgpu_device *adev) { u64 base = RREG32_SOC15(MMHUB, 0, mmMC_VM_FB_LOCATION_BASE); @@ -562,59 +558,191 @@ void mmhub_v1_0_get_clockgating(struct amdgpu_device *adev, u32 *flags) *flags |= AMD_CG_SUPPORT_MC_LS; } +static const struct soc15_ras_field_entry mmhub_v1_0_ras_fields[] = { + { "MMEA0_DRAMRD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT_VG20), + SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, DRAMRD_CMDMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, DRAMRD_CMDMEM_DED_COUNT), + }, + { "MMEA0_DRAMWR_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT_VG20), + SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, DRAMWR_CMDMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, DRAMWR_CMDMEM_DED_COUNT), + }, + { "MMEA0_DRAMWR_DATAMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT_VG20), + SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, DRAMWR_DATAMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, DRAMWR_DATAMEM_DED_COUNT), + }, + { "MMEA0_RRET_TAGMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT_VG20), + SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, RRET_TAGMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, RRET_TAGMEM_DED_COUNT), + }, + { "MMEA0_WRET_TAGMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT_VG20), + SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, WRET_TAGMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, WRET_TAGMEM_DED_COUNT), + }, + { "MMEA0_DRAMRD_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT_VG20), + SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, DRAMRD_PAGEMEM_SED_COUNT), + 0, 0, + }, + { "MMEA0_DRAMWR_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT_VG20), + SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, DRAMWR_PAGEMEM_SED_COUNT), + 0, 0, + }, + { "MMEA0_IORD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT_VG20), + SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, IORD_CMDMEM_SED_COUNT), + 0, 0, + }, + { "MMEA0_IOWR_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT_VG20), + SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, IOWR_CMDMEM_SED_COUNT), + 0, 0, + }, + { "MMEA0_IOWR_DATAMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT_VG20), + SOC15_REG_FIELD(MMEA0_EDC_CNT_VG20, IOWR_DATAMEM_SED_COUNT), + 0, 0, + }, + { "MMEA0_GMIRD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2_VG20), + SOC15_REG_FIELD(MMEA0_EDC_CNT2_VG20, GMIRD_CMDMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT2_VG20, GMIRD_CMDMEM_DED_COUNT), + }, + { "MMEA0_GMIWR_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2_VG20), + SOC15_REG_FIELD(MMEA0_EDC_CNT2_VG20, GMIWR_CMDMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT2_VG20, GMIWR_CMDMEM_DED_COUNT), + }, + { "MMEA0_GMIWR_DATAMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2_VG20), + SOC15_REG_FIELD(MMEA0_EDC_CNT2_VG20, GMIWR_DATAMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT2_VG20, GMIWR_DATAMEM_DED_COUNT), + }, + { "MMEA0_GMIRD_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2_VG20), + SOC15_REG_FIELD(MMEA0_EDC_CNT2_VG20, GMIRD_PAGEMEM_SED_COUNT), + 0, 0, + }, + { "MMEA0_GMIWR
[PATCH v2 0/3] RAS support for mmhub
This set of patches is a continuation of RAS enablement patches for AMDGPU. 1. The new struct soc15_ras_field_entry will be reused by gfx, mmhub and other IP blocks. 2. Refine the query function of RAS error counter for VG20, add codes to help user to locate which sub-block of mmhub cause error. 3. Implement the query function of RAS error counter for Mi100 v2: 1. Fix some comment issues. 2. Add IP name prefix for the local static variable and function. 3. Move the EDC_CNT registers' defintion from mmhub_9_4 header files to mmhub_1_0 ones for vg20. Dennis Li (3): drm/amdgpu: define soc15_ras_field_entry for reuse drm/amdgpu: refine query function of mmhub EDC counter in vg20 drm/amdgpu: implement querying ras error count for mmhub9.4 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 34 +-- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 3 + drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c | 232 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c | 253 - drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.h | 2 + drivers/gpu/drm/amd/amdgpu/soc15.h| 12 + .../include/asic_reg/mmhub/mmhub_1_0_offset.h | 16 ++ .../asic_reg/mmhub/mmhub_1_0_sh_mask.h| 122 + .../asic_reg/mmhub/mmhub_9_4_0_offset.h | 53 .../asic_reg/mmhub/mmhub_9_4_0_sh_mask.h | 257 -- 10 files changed, 598 insertions(+), 386 deletions(-) delete mode 100644 drivers/gpu/drm/amd/include/asic_reg/mmhub/mmhub_9_4_0_offset.h delete mode 100644 drivers/gpu/drm/amd/include/asic_reg/mmhub/mmhub_9_4_0_sh_mask.h -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2 1/3] drm/amdgpu: define soc15_ras_field_entry for reuse
The struct soc15_ras_field_entry will be reused by other IPs, such as mmhub and gc v2: rename ras_subblock_regs to gc_ras_fields_vg20, because the future asic maybe have a different table. Change-Id: I6c3388a09b5fbf927ad90fcd626baa448d1681a6 Signed-off-by: Dennis Li --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 34 +-- drivers/gpu/drm/amd/amdgpu/soc15.h| 12 ++ 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index c7ae685d6f74..8073fcd4720e 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -131,18 +131,6 @@ MODULE_FIRMWARE("amdgpu/renoir_rlc.bin"); #define mmTCP_CHAN_STEER_5_ARCT 0x0b0c #define mmTCP_CHAN_STEER_5_ARCT_BASE_IDX 0 -struct ras_gfx_subblock_reg { - const char *name; - uint32_t hwip; - uint32_t inst; - uint32_t seg; - uint32_t reg_offset; - uint32_t sec_count_mask; - uint32_t sec_count_shift; - uint32_t ded_count_mask; - uint32_t ded_count_shift; -}; - enum ta_ras_gfx_subblock { /*CPC*/ TA_RAS_BLOCK__GFX_CPC_INDEX_START = 0, @@ -5487,7 +5475,7 @@ static int gfx_v9_0_priv_inst_irq(struct amdgpu_device *adev, } -static const struct ras_gfx_subblock_reg ras_subblock_regs[] = { +static const struct soc15_ras_field_entry gc_ras_fields_vg20[] = { { "CPC_SCRATCH", SOC15_REG_ENTRY(GC, 0, mmCPC_EDC_SCRATCH_CNT), SOC15_REG_FIELD(CPC_EDC_SCRATCH_CNT, SEC_COUNT), SOC15_REG_FIELD(CPC_EDC_SCRATCH_CNT, DED_COUNT) @@ -6146,29 +6134,29 @@ static int __get_ras_error_count(const struct soc15_reg_entry *reg, uint32_t i; uint32_t sec_cnt, ded_cnt; - for (i = 0; i < ARRAY_SIZE(ras_subblock_regs); i++) { - if(ras_subblock_regs[i].reg_offset != reg->reg_offset || - ras_subblock_regs[i].seg != reg->seg || - ras_subblock_regs[i].inst != reg->inst) + for (i = 0; i < ARRAY_SIZE(gc_ras_fields_vg20); i++) { + if(gc_ras_fields_vg20[i].reg_offset != reg->reg_offset || + gc_ras_fields_vg20[i].seg != reg->seg || + gc_ras_fields_vg20[i].inst != reg->inst) continue; sec_cnt = (value & - ras_subblock_regs[i].sec_count_mask) >> - ras_subblock_regs[i].sec_count_shift; + gc_ras_fields_vg20[i].sec_count_mask) >> + gc_ras_fields_vg20[i].sec_count_shift; if (sec_cnt) { DRM_INFO("GFX SubBlock %s, Instance[%d][%d], SEC %d\n", - ras_subblock_regs[i].name, + gc_ras_fields_vg20[i].name, se_id, inst_id, sec_cnt); *sec_count += sec_cnt; } ded_cnt = (value & - ras_subblock_regs[i].ded_count_mask) >> - ras_subblock_regs[i].ded_count_shift; + gc_ras_fields_vg20[i].ded_count_mask) >> + gc_ras_fields_vg20[i].ded_count_shift; if (ded_cnt) { DRM_INFO("GFX SubBlock %s, Instance[%d][%d], DED %d\n", - ras_subblock_regs[i].name, + gc_ras_fields_vg20[i].name, se_id, inst_id, ded_cnt); *ded_count += ded_cnt; diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h b/drivers/gpu/drm/amd/amdgpu/soc15.h index 9af6c6ffbfa2..344280b869c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.h +++ b/drivers/gpu/drm/amd/amdgpu/soc15.h @@ -60,6 +60,18 @@ struct soc15_allowed_register_entry { bool grbm_indexed; }; +struct soc15_ras_field_entry { + const char *name; + uint32_t hwip; + uint32_t inst; + uint32_t seg; + uint32_t reg_offset; + uint32_t sec_count_mask; + uint32_t sec_count_shift; + uint32_t ded_count_mask; + uint32_t ded_count_shift; +}; + #define SOC15_REG_ENTRY(ip, inst, reg) ip##_HWIP, inst, reg##_BASE_IDX, reg #define SOC15_REG_ENTRY_OFFSET(entry) (adev->reg_offset[entry.hwip][entry.inst][entry.seg] + entry.reg_offset) -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2 3/3] drm/amdgpu: implement querying ras error count for mmhub9.4
Get mmhub error counter by accessing EDC_CNT registers. v2: Add mmhub_v9_4_ prefix for local static variable and function Change-Id: I728d4183a08707aaf0fc71d184e86322a681e725 Signed-off-by: Dennis Li --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 3 + drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c | 253 +++- drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.h | 2 + 3 files changed, 257 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index ee615d050837..5f4a6cdf83a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -658,6 +658,9 @@ static void gmc_v9_0_set_mmhub_funcs(struct amdgpu_device *adev) case CHIP_VEGA20: adev->mmhub.funcs = _v1_0_funcs; break; + case CHIP_ARCTURUS: + adev->mmhub.funcs = _v9_4_funcs; + break; default: break; } diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c index 2c5adfe803a2..6fe5c39e5581 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c @@ -21,6 +21,7 @@ * */ #include "amdgpu.h" +#include "amdgpu_ras.h" #include "mmhub_v9_4.h" #include "mmhub/mmhub_9_4_1_offset.h" @@ -29,7 +30,7 @@ #include "athub/athub_1_0_offset.h" #include "athub/athub_1_0_sh_mask.h" #include "vega10_enum.h" - +#include "soc15.h" #include "soc15_common.h" #define MMHUB_NUM_INSTANCES2 @@ -651,3 +652,253 @@ void mmhub_v9_4_get_clockgating(struct amdgpu_device *adev, u32 *flags) if (data & ATCL2_0_ATC_L2_MISC_CG__MEM_LS_ENABLE_MASK) *flags |= AMD_CG_SUPPORT_MC_LS; } + +static const struct soc15_ras_field_entry mmhub_v9_4_ras_fields[] = { + { "MMEA0_DRAMRD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMRD_CMDMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMRD_CMDMEM_DED_COUNT), + }, + { "MMEA0_DRAMWR_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMWR_CMDMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMWR_CMDMEM_DED_COUNT), + }, + { "MMEA0_DRAMWR_DATAMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMWR_DATAMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMWR_DATAMEM_DED_COUNT), + }, + { "MMEA0_RRET_TAGMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT, RRET_TAGMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT, RRET_TAGMEM_DED_COUNT), + }, + { "MMEA0_WRET_TAGMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT, WRET_TAGMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT, WRET_TAGMEM_DED_COUNT), + }, + { "MMEA0_DRAMRD_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMRD_PAGEMEM_SED_COUNT), + 0, 0, + }, + { "MMEA0_DRAMWR_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT, DRAMWR_PAGEMEM_SED_COUNT), + 0, 0, + }, + { "MMEA0_IORD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT, IORD_CMDMEM_SED_COUNT), + 0, 0, + }, + { "MMEA0_IOWR_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT, IOWR_CMDMEM_SED_COUNT), + 0, 0, + }, + { "MMEA0_IOWR_DATAMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT, IOWR_DATAMEM_SED_COUNT), + 0, 0, + }, + { "MMEA0_GMIRD_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2), + SOC15_REG_FIELD(MMEA0_EDC_CNT2, GMIRD_CMDMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT2, GMIRD_CMDMEM_DED_COUNT), + }, + { "MMEA0_GMIWR_CMDMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2), + SOC15_REG_FIELD(MMEA0_EDC_CNT2, GMIWR_CMDMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT2, GMIWR_CMDMEM_DED_COUNT), + }, + { "MMEA0_GMIWR_DATAMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2), + SOC15_REG_FIELD(MMEA0_EDC_CNT2, GMIWR_DATAMEM_SEC_COUNT), + SOC15_REG_FIELD(MMEA0_EDC_CNT2, GMIWR_DATAMEM_DED_COUNT), + }, + { "MMEA0_GMIRD_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2), + SOC15_REG_FIELD(MMEA0_EDC_CNT2, GMIRD_PAGEMEM_SED_COUNT), + 0, 0, + }, + { "MMEA0_GMIWR_PAGEMEM", SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_EDC_CNT2), + SOC15_REG_FIELD(MMEA0_EDC_CNT2, GMIWR_PAGEMEM_SED_COUNT), +
[PATCH 3/3] drm/amdgpu: add RAS support for VML2 and ATCL2
Add codes to query the EDC count of VML2 & ATCL2 Change-Id: If2c251481ba0a1a34ce3405a85f86d65eecee461 Signed-off-by: Dennis Li --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 167 ++ 1 file changed, 167 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 2a95093b85a5..22be6177938e 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -6152,6 +6152,171 @@ static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev, return ret; } +static const char *vml2_mems[] = { + "UTC_VML2_BANK_CACHE_0_BIGK_MEM0", + "UTC_VML2_BANK_CACHE_0_BIGK_MEM1", + "UTC_VML2_BANK_CACHE_0_4K_MEM0", + "UTC_VML2_BANK_CACHE_0_4K_MEM1", + "UTC_VML2_BANK_CACHE_1_BIGK_MEM0", + "UTC_VML2_BANK_CACHE_1_BIGK_MEM1", + "UTC_VML2_BANK_CACHE_1_4K_MEM0", + "UTC_VML2_BANK_CACHE_1_4K_MEM1", + "UTC_VML2_BANK_CACHE_2_BIGK_MEM0", + "UTC_VML2_BANK_CACHE_2_BIGK_MEM1", + "UTC_VML2_BANK_CACHE_2_4K_MEM0", + "UTC_VML2_BANK_CACHE_2_4K_MEM1", + "UTC_VML2_BANK_CACHE_3_BIGK_MEM0", + "UTC_VML2_BANK_CACHE_3_BIGK_MEM1", + "UTC_VML2_BANK_CACHE_3_4K_MEM0", + "UTC_VML2_BANK_CACHE_3_4K_MEM1", +}; + +static const char *vml2_walker_mems[] = { + "UTC_VML2_CACHE_PDE0_MEM0", + "UTC_VML2_CACHE_PDE0_MEM1", + "UTC_VML2_CACHE_PDE1_MEM0", + "UTC_VML2_CACHE_PDE1_MEM1", + "UTC_VML2_CACHE_PDE2_MEM0", + "UTC_VML2_CACHE_PDE2_MEM1", + "UTC_VML2_RDIF_LOG_FIFO", +}; + +static const char *atc_l2_cache_2m_mems[] = { + "UTC_ATCL2_CACHE_2M_BANK0_WAY0_MEM", + "UTC_ATCL2_CACHE_2M_BANK0_WAY1_MEM", + "UTC_ATCL2_CACHE_2M_BANK1_WAY0_MEM", + "UTC_ATCL2_CACHE_2M_BANK1_WAY1_MEM", +}; + +static const char *atc_l2_cache_4k_mems[] = { + "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM0", + "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM1", + "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM2", + "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM3", + "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM4", + "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM5", + "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM6", + "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM7", + "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM0", + "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM1", + "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM2", + "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM3", + "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM4", + "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM5", + "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM6", + "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM7", + "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM0", + "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM1", + "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM2", + "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM3", + "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM4", + "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM5", + "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM6", + "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM7", + "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM0", + "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM1", + "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM2", + "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM3", + "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM4", + "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM5", + "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM6", + "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM7", +}; + +static int gfx_v9_0_query_utc_edc_status(struct amdgpu_device *adev, +struct ras_err_data *err_data) +{ + uint32_t i, data; + uint32_t sec_count, ded_count; + + WREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_INDEX, 255); + WREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_CNT, 0); + WREG32_SOC15(GC, 0, mmVM_L2_WALKER_MEM_ECC_INDEX, 255); + WREG32_SOC15(GC, 0, mmVM_L2_WALKER_MEM_ECC_CNT, 0); + WREG32_SOC15(GC, 0, mmATC_L2_CACHE_2M_EDC_INDEX, 255); + WREG32_SOC15(GC, 0, mmATC_L2_CACHE_2M_EDC_CNT, 0); + WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_INDEX, 255); + WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_CNT, 0); + + for (i = 0; i < 16; i++) { + WREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_INDEX, i); + data = RREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_CNT); + + sec_count = REG_GET_FIELD(data, VM_L2_MEM_ECC_CNT, SEC_COUNT); + if (sec_count) { + DRM_INFO("Instance[%d]: SubBlock %s, SEC %
[PATCH 1/3] drm/amdgpu: change to query the actual EDC counter
For the potential request in the future, change to query the actual EDC counter. Change-Id: I783ccd76f4c65f9829f7a8967a539a23ae5484b5 Signed-off-by: Dennis Li --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 819 -- drivers/gpu/drm/amd/amdgpu/soc15.h| 2 + 2 files changed, 496 insertions(+), 325 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 9a1f91cf0ee8..2a95093b85a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -127,6 +127,18 @@ MODULE_FIRMWARE("amdgpu/renoir_rlc.bin"); #define mmTCP_CHAN_STEER_5_ARCT 0x0b0c #define mmTCP_CHAN_STEER_5_ARCT_BASE_IDX 0 +struct ras_gfx_subblock_reg { + const char *name; + uint32_t hwip; + uint32_t inst; + uint32_t seg; + uint32_t reg_offset; + uint32_t sec_count_mask; + uint32_t sec_count_shift; + uint32_t ded_count_mask; + uint32_t ded_count_shift; +}; + enum ta_ras_gfx_subblock { /*CPC*/ TA_RAS_BLOCK__GFX_CPC_INDEX_START = 0, @@ -4172,6 +4184,7 @@ static const struct soc15_reg_entry sec_ded_counter_registers[] = { { SOC15_REG_ENTRY(GC, 0, mmTCC_EDC_CNT), 0, 1, 16}, { SOC15_REG_ENTRY(GC, 0, mmTCP_ATC_EDC_GATCL1_CNT), 0, 4, 16}, { SOC15_REG_ENTRY(GC, 0, mmTCP_EDC_CNT), 0, 4, 16}, + { SOC15_REG_ENTRY(GC, 0, mmTCP_EDC_CNT_NEW), 0, 4, 16}, { SOC15_REG_ENTRY(GC, 0, mmTD_EDC_CNT), 0, 4, 16}, { SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT2), 0, 4, 6}, { SOC15_REG_ENTRY(GC, 0, mmSQ_EDC_CNT), 0, 4, 16}, @@ -5652,301 +5665,445 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev, return AMDGPU_RAS_SUCCESS; } -static const struct { - const char *name; - uint32_t ip; - uint32_t inst; - uint32_t seg; - uint32_t reg_offset; - uint32_t per_se_instance; - int32_t num_instance; - uint32_t sec_count_mask; - uint32_t ded_count_mask; -} gfx_ras_edc_regs[] = { - { "CPC_SCRATCH", SOC15_REG_ENTRY(GC, 0, mmCPC_EDC_SCRATCH_CNT), 0, 1, - REG_FIELD_MASK(CPC_EDC_SCRATCH_CNT, SEC_COUNT), - REG_FIELD_MASK(CPC_EDC_SCRATCH_CNT, DED_COUNT) }, - { "CPC_UCODE", SOC15_REG_ENTRY(GC, 0, mmCPC_EDC_UCODE_CNT), 0, 1, - REG_FIELD_MASK(CPC_EDC_UCODE_CNT, SEC_COUNT), - REG_FIELD_MASK(CPC_EDC_UCODE_CNT, DED_COUNT) }, - { "CPF_ROQ_ME1", SOC15_REG_ENTRY(GC, 0, mmCPF_EDC_ROQ_CNT), 0, 1, - REG_FIELD_MASK(CPF_EDC_ROQ_CNT, COUNT_ME1), 0 }, - { "CPF_ROQ_ME2", SOC15_REG_ENTRY(GC, 0, mmCPF_EDC_ROQ_CNT), 0, 1, - REG_FIELD_MASK(CPF_EDC_ROQ_CNT, COUNT_ME2), 0 }, - { "CPF_TAG", SOC15_REG_ENTRY(GC, 0, mmCPF_EDC_TAG_CNT), 0, 1, - REG_FIELD_MASK(CPF_EDC_TAG_CNT, SEC_COUNT), - REG_FIELD_MASK(CPF_EDC_TAG_CNT, DED_COUNT) }, - { "CPG_DMA_ROQ", SOC15_REG_ENTRY(GC, 0, mmCPG_EDC_DMA_CNT), 0, 1, - REG_FIELD_MASK(CPG_EDC_DMA_CNT, ROQ_COUNT), 0 }, - { "CPG_DMA_TAG", SOC15_REG_ENTRY(GC, 0, mmCPG_EDC_DMA_CNT), 0, 1, - REG_FIELD_MASK(CPG_EDC_DMA_CNT, TAG_SEC_COUNT), - REG_FIELD_MASK(CPG_EDC_DMA_CNT, TAG_DED_COUNT) }, - { "CPG_TAG", SOC15_REG_ENTRY(GC, 0, mmCPG_EDC_TAG_CNT), 0, 1, - REG_FIELD_MASK(CPG_EDC_TAG_CNT, SEC_COUNT), - REG_FIELD_MASK(CPG_EDC_TAG_CNT, DED_COUNT) }, - { "DC_CSINVOC", SOC15_REG_ENTRY(GC, 0, mmDC_EDC_CSINVOC_CNT), 0, 1, - REG_FIELD_MASK(DC_EDC_CSINVOC_CNT, COUNT_ME1), 0 }, - { "DC_RESTORE", SOC15_REG_ENTRY(GC, 0, mmDC_EDC_RESTORE_CNT), 0, 1, - REG_FIELD_MASK(DC_EDC_RESTORE_CNT, COUNT_ME1), 0 }, - { "DC_STATE", SOC15_REG_ENTRY(GC, 0, mmDC_EDC_STATE_CNT), 0, 1, - REG_FIELD_MASK(DC_EDC_STATE_CNT, COUNT_ME1), 0 }, - { "GDS_MEM", SOC15_REG_ENTRY(GC, 0, mmGDS_EDC_CNT), 0, 1, - REG_FIELD_MASK(GDS_EDC_CNT, GDS_MEM_SEC), - REG_FIELD_MASK(GDS_EDC_CNT, GDS_MEM_DED) }, - { "GDS_INPUT_QUEUE", SOC15_REG_ENTRY(GC, 0, mmGDS_EDC_CNT), 0, 1, - REG_FIELD_MASK(GDS_EDC_CNT, GDS_INPUT_QUEUE_SED), 0 }, +static const struct ras_gfx_subblock_reg ras_subblock_regs[] = { + { "CPC_SCRATCH", SOC15_REG_ENTRY(GC, 0, mmCPC_EDC_SCRATCH_CNT), + SOC15_REG_FIELD(CPC_EDC_SCRATCH_CNT, SEC_COUNT), + SOC15_REG_FIELD(CPC_EDC_SCRATCH_CNT, DED_COUNT) + }, + { "CPC_UCODE", SOC15_REG_ENTRY(GC, 0, mmCPC_EDC_UCODE_CNT), + SOC15_REG_FIELD(CPC_EDC_UCODE_CNT, SEC_COUNT), + SOC15_REG_FIELD(CPC_EDC_UCODE_CNT, DED_COUNT) + }, + { "CPF_ROQ_ME1", SOC15_REG_ENTRY(GC, 0, mmCPF_EDC_ROQ_CNT), + SOC15_REG_FIELD(CPF_EDC_ROQ_CNT, COUNT_ME1), + 0, 0 + }, + { "CPF_R
[PATCH 2/3] drm/amd/include: add register define for VML2 and ATCL2
Add VML2 and ATCL2 ECC registers to support VEGA20 RAS Change-Id: I8860f2e37fa7afd8d6123290fb7b9dcee56edd6e Signed-off-by: Dennis Li --- .../amd/include/asic_reg/gc/gc_9_0_offset.h| 18 -- .../amd/include/asic_reg/gc/gc_9_0_sh_mask.h | 18 -- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h index ca16d9125fbc..2bfaaa8157d0 100644 --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h @@ -1146,7 +1146,14 @@ #define mmATC_L2_MEM_POWER_LS_BASE_IDX 0 #define mmATC_L2_CGTT_CLK_CTRL 0x080c #define mmATC_L2_CGTT_CLK_CTRL_BASE_IDX 0 - +#define mmATC_L2_CACHE_4K_EDC_INDEX 0x080e +#define mmATC_L2_CACHE_4K_EDC_INDEX_BASE_IDX 0 +#define mmATC_L2_CACHE_2M_EDC_INDEX 0x080f +#define mmATC_L2_CACHE_2M_EDC_INDEX_BASE_IDX 0 +#define mmATC_L2_CACHE_4K_EDC_CNT 0x0810 +#define mmATC_L2_CACHE_4K_EDC_CNT_BASE_IDX 0 +#define mmATC_L2_CACHE_2M_EDC_CNT 0x0811 +#define mmATC_L2_CACHE_2M_EDC_CNT_BASE_IDX 0 // addressBlock: gc_utcl2_vml2pfdec // base address: 0xa100 @@ -1206,7 +1213,14 @@ #define mmVM_L2_CACHE_PARITY_CNTL_BASE_IDX 0 #define mmVM_L2_CGTT_CLK_CTRL 0x085e #define mmVM_L2_CGTT_CLK_CTRL_BASE_IDX 0 - +#define mmVM_L2_MEM_ECC_INDEX 0x0860 +#define mmVM_L2_MEM_ECC_INDEX_BASE_IDX 0 +#define mmVM_L2_WALKER_MEM_ECC_INDEX 0x0861 +#define mmVM_L2_WALKER_MEM_ECC_INDEX_BASE_IDX 0 +#define mmVM_L2_MEM_ECC_CNT 0x0862 +#define mmVM_L2_MEM_ECC_CNT_BASE_IDX 0 +#define mmVM_L2_WALKER_MEM_ECC_CNT 0x0863 +#define mmVM_L2_WALKER_MEM_ECC_CNT_BASE_IDX 0 // addressBlock: gc_utcl2_vml2vcdec // base address: 0xa200 diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h index 064c4bb1dc62..d4c613a85352 100644 --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h @@ -6661,7 +6661,6 @@ #define ATC_L2_CGTT_CLK_CTRL__SOFT_STALL_OVERRIDE_MASK 0x00FFL #define ATC_L2_CGTT_CLK_CTRL__SOFT_OVERRIDE_MASK 0xFF00L - // addressBlock: gc_utcl2_vml2pfdec //VM_L2_CNTL #define VM_L2_CNTL__ENABLE_L2_CACHE__SHIFT 0x0 @@ -6991,7 +6990,22 @@ #define VM_L2_CGTT_CLK_CTRL__MGLS_OVERRIDE_MASK 0x8000L #define VM_L2_CGTT_CLK_CTRL__SOFT_STALL_OVERRIDE_MASK 0x00FFL #define VM_L2_CGTT_CLK_CTRL__SOFT_OVERRIDE_MASK 0xFF00L - +//VM_L2_MEM_ECC_INDEX +#define VM_L2_MEM_ECC_INDEX__INDEX__SHIFT 0x0 +#define VM_L2_MEM_ECC_INDEX__INDEX_MASK 0x00FFL +//VM_L2_WALKER_MEM_ECC_INDEX +#define VM_L2_WALKER_MEM_ECC_INDEX__INDEX__SHIFT 0x0 +#define VM_L2_WALKER_MEM_ECC_INDEX__INDEX_MASK 0x00FFL +//VM_L2_MEM_ECC_CNT +#define VM_L2_MEM_ECC_CNT__SEC_COUNT__SHIFT
[PATCH 0/3] RAS Support for GFX blocks
1. Add the EDC count from hardware. 2. Add RAS support for VML2 amd ATCL2 sub blocks. Dennis Li (3): drm/amdgpu: change to query the actual EDC counter drm/amd/include: add register define for VML2 and ATCL2 drm/amdgpu: add RAS support for VML2 and ATCL2 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 986 -- drivers/gpu/drm/amd/amdgpu/soc15.h| 2 + .../amd/include/asic_reg/gc/gc_9_0_offset.h | 18 +- .../amd/include/asic_reg/gc/gc_9_0_sh_mask.h | 18 +- 4 files changed, 695 insertions(+), 329 deletions(-) -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx