[PATCH] drm/amdgpu: disable GPU RAS bad page feature for specific ASIC
The feature is not applicable to specific app platform. v2: update the disablement condition and commit description v3: move the setting to amdgpu_ras_check_supported Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index dbfc41ddc3c7..ebe3e8f01fe2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3483,6 +3483,11 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) /* aca is disabled by default */ adev->aca.is_enabled = false; + + /* bad page feature is not applicable to specific app platform */ + if (adev->gmc.is_app_apu && + amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0)) + amdgpu_bad_page_threshold = 0; } static void amdgpu_ras_counte_dw(struct work_struct *work) -- 2.34.1
[PATCH] drm/amdgpu: disable GPU RAS bad page feature for specific ASIC
The feature is not applicable to specific app platform. v2: update the disablement condition and commit description Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index dbfc41ddc3c7..08efc9121adc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2055,6 +2055,11 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev) con->event_state_attr = dev_attr_event_state; sysfs_attr_init(attrs[3]); + /* bad page feature is not applicable to specific app platform */ + if (adev->gmc.is_app_apu && + amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0)) + amdgpu_bad_page_threshold = 0; + if (amdgpu_bad_page_threshold != 0) { /* add bad_page_features entry */ bin_attr_gpu_vram_bad_pages.private = NULL; -- 2.34.1
[PATCH] drm/amdgpu: disable RAS bad page feature for specific APU
The feature is unsupported on specific APU. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index dbfc41ddc3c7..d46f216a33b1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3679,6 +3679,12 @@ int amdgpu_ras_init(struct amdgpu_device *adev) amdgpu_ras_init_reserved_vram_size(adev); + /* bad page feature is unspported on specific APU */ + if ((adev->gmc.xgmi.connected_to_cpu || + adev->gmc.is_app_apu) && + amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0)) + amdgpu_bad_page_threshold = 0; + if (amdgpu_ras_fs_init(adev)) { r = -EINVAL; goto release_con; -- 2.34.1
[PATCH 2/3] drm/amdgpu: update bad state check in GPU recovery
Return RMA status without message print. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 96e525ab9a84..5d49f70704c6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5538,7 +5538,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, * bad_page_threshold value to fix this once * probing driver again. */ - if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { + if (!amdgpu_ras_is_rma(tmp_adev)) { /* must succeed. */ amdgpu_ras_resume(tmp_adev); } else { -- 2.34.1
[PATCH 3/3] drm/amdgpu: report bad status in GPU recovery
Instead of printing GPU reset failed. v2: add check for reset_context->src. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5d49f70704c6..7b21243c7c55 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5921,8 +5921,14 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, tmp_adev->asic_reset_res = 0; if (r) { - /* bad news, how to tell it to userspace ? */ - dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); + /* bad news, how to tell it to userspace ? +* for ras error, we should report GPU bad status instead of +* reset failure +*/ + if (reset_context->src != AMDGPU_RESET_SRC_RAS || + !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) + dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", + atomic_read(&tmp_adev->gpu_reset_counter)); amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); } else { dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); -- 2.34.1
[PATCH 1/3] drm/amdgpu: create function to check RAS RMA status
In the convenience of calling it globally. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 -- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 2 +- drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 2 +- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 12ab48f26bd5..0941518f04c2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2153,7 +2153,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * /* gpu reset is fallback for failed and default cases. * For RMA case, amdgpu_umc_poison_handler will handle gpu reset. */ - if (poison_stat && !con->is_rma) { + if (poison_stat && !amdgpu_ras_is_rma(adev)) { event_id = amdgpu_ras_acquire_event_id(adev, type); RAS_EVENT_LOG(adev, event_id, "GPU reset for %s RAS poison consumption is issued!\n", @@ -2951,7 +2951,7 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) amdgpu_ras_error_data_fini(&err_data); - if (err_cnt && con->is_rma) + if (err_cnt && amdgpu_ras_is_rma(adev)) amdgpu_ras_reset_gpu(adev); amdgpu_ras_schedule_retirement_dwork(con, @@ -3053,7 +3053,7 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, } /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */ - if (reset_flags && !con->is_rma) { + if (reset_flags && !amdgpu_ras_is_rma(adev)) { if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) @@ -3202,7 +3202,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) * This calling fails when is_rma is true or * ret != 0. */ - if (con->is_rma || ret) + if (amdgpu_ras_is_rma(adev) || ret) goto free; if (con->eeprom_control.ras_num_recs) { @@ -3254,7 +3254,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) * Except error threshold exceeding case, other failure cases in this * function would not fail amdgpu driver init. */ - if (!con->is_rma) + if (!amdgpu_ras_is_rma(adev)) ret = 0; else ret = -EINVAL; @@ -4301,7 +4301,7 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); /* mode1 is the only selection for RMA status */ - if (ras->is_rma) { + if (amdgpu_ras_is_rma(adev)) { ras->gpu_reset_flags = 0; ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; } @@ -4835,3 +4835,13 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, va_end(args); } + +bool amdgpu_ras_is_rma(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + if (!con) + return false; + + return con->is_rma; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 7ddd13d5c06b..25a19760f098 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -972,4 +972,5 @@ __printf(3, 4) void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, const char *fmt, ...); +bool amdgpu_ras_is_rma(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 32be258d81e1..9e70a7b3aa64 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -196,7 +196,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, amdgpu_umc_handle_bad_pages(adev, ras_error_status); if ((err_data->ue_count || err_data->de_count) && - (reset || (con && con->is_rma))) { + (reset || amdgpu_ras_is_rma(adev))) { con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c index 9cd221ed240c..999bb3cc88b7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c @@ -97,7 +97,7 @@ static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev, ras->gpu_reset_flags |= AMDGPU_RAS_GPU
[PATCH] drm/amdgpu: report bad status in GPU recovery
Instead of printing GPU reset failed. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 355c2478c4b6..b7c967779b4b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5933,8 +5933,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, tmp_adev->asic_reset_res = 0; if (r) { - /* bad news, how to tell it to userspace ? */ - dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); + /* bad news, how to tell it to userspace ? +* for ras error, we should report GPU bad status instead of +* reset failure +*/ + if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) + dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", + atomic_read(&tmp_adev->gpu_reset_counter)); amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); } else { dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); -- 2.34.1
[PATCH] drm/amdkfd: add ASIC version check for the reset selection of RAS poison
GFX v9.4.3 uses mode1 reset, other ASICs choose mode2. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 78dde62fb04a..816800555f7f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -164,7 +164,10 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, case SOC15_IH_CLIENTID_SE3SH: case SOC15_IH_CLIENTID_UTCL2: block = AMDGPU_RAS_BLOCK__GFX; - reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + else + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; case SOC15_IH_CLIENTID_VMC: case SOC15_IH_CLIENTID_VMC1: @@ -177,7 +180,10 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, case SOC15_IH_CLIENTID_SDMA3: case SOC15_IH_CLIENTID_SDMA4: block = AMDGPU_RAS_BLOCK__SDMA; - reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + else + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; default: dev_warn(dev->adev->dev, -- 2.34.1
[PATCH] drm/amdkfd: use mode1 reset for RAS poison consumption
Per FW requirement, replace mode2 with mode1. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index e1c21d250611..78dde62fb04a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -164,7 +164,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, case SOC15_IH_CLIENTID_SE3SH: case SOC15_IH_CLIENTID_UTCL2: block = AMDGPU_RAS_BLOCK__GFX; - reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; break; case SOC15_IH_CLIENTID_VMC: case SOC15_IH_CLIENTID_VMC1: @@ -177,7 +177,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, case SOC15_IH_CLIENTID_SDMA3: case SOC15_IH_CLIENTID_SDMA4: block = AMDGPU_RAS_BLOCK__SDMA; - reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; break; default: dev_warn(dev->adev->dev, -- 2.34.1
[PATCH 2/2] drm/amd/pm: update check condition for SMU mode1 reset
The fed status does indicate RAS fatal error. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 2 +- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 2 +- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c index 04533f99f1e3..2c35eb31475a 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c @@ -1876,7 +1876,7 @@ static int aldebaran_mode1_reset(struct smu_context *smu) /* fatal error triggered by ras, PMFW supports the flag from 68.44.0 */ if ((smu->smc_fw_version >= 0x00442c00) && - amdgpu_ras_in_recovery(adev)) + amdgpu_ras_get_fed_status(adev)) fatal_err = 1; param |= (fatal_err << 16); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index d1766a603bb9..c9639141792f 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -2788,7 +2788,7 @@ static void smu_v13_0_0_set_mode1_reset_param(struct smu_context *smu, struct amdgpu_device *adev = smu->adev; if ((smu->smc_fw_version >= supported_version) && - amdgpu_ras_in_recovery(adev)) + amdgpu_ras_get_fed_status(adev)) /* Set RAS fatal error reset flag */ *param = 1 << 16; else diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index c1d7528a6dc8..7fda7196fa7c 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2580,7 +2580,7 @@ static int smu_v13_0_6_mode1_reset(struct smu_context *smu) param = SMU_RESET_MODE_1; /* fatal error triggered by ras, PMFW supports the flag */ - if (amdgpu_ras_in_recovery(adev)) + if (amdgpu_ras_get_fed_status(adev)) fatal_err = 1; param |= (fatal_err << 16); -- 2.34.1
[PATCH 1/2] drm/amdgpu: set RAS fed status for more cases
Indicate fatal error for each RAS block and NBIO. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 + drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 77566dcc0852..183eae22b687 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2114,6 +2114,7 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, /* Let IP handle its data, maybe we need get the output * from the callback to update the error type/count, etc */ + amdgpu_ras_set_fed(obj->adev, true); ret = data->cb(obj->adev, &err_data, entry); /* ue will trigger an interrupt, and in that case * we need do a reset to recovery the whole system. diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index b8fc9e126e0d..9446bf6f82c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -414,6 +414,7 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device /* ras_controller_int is dedicated for nbif ras error, * not the global interrupt for sync flood */ + amdgpu_ras_set_fed(adev, true); amdgpu_ras_reset_gpu(adev); } -- 2.34.1
[PATCH 4/5] drma/amdgpu: set fatal flag for RAS recovery
PMFW needs the flag to know the reason of mode1. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 6 +++--- drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 2 +- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index fb5fc1fe6ad0..f55bff59052f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -940,7 +940,7 @@ int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev, if (adev->gfx.ras && adev->gfx.ras->ras_block.hw_ops && adev->gfx.ras->ras_block.hw_ops->query_ras_error_count) adev->gfx.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data); - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, true); } return AMDGPU_RAS_SUCCESS; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ff2d34dc9718..2071e30d7e56 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2070,7 +2070,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * if (poison_stat && !con->is_rma) { dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n", block_obj->ras_comm.name); - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, false); } if (!poison_stat) @@ -2825,7 +2825,7 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) amdgpu_ras_error_data_fini(&err_data); if (err_cnt && con->is_rma) - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, false); mutex_lock(&con->umc_ecc_log.lock); if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree, @@ -2888,7 +2888,7 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, flush_delayed_work(&con->page_retirement_dwork); con->gpu_reset_flags |= reset; - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, false); } return 0; @@ -3815,7 +3815,7 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) amdgpu_ras_set_fed(adev, true); ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, true); } } @@ -3996,7 +3996,7 @@ int amdgpu_ras_is_supported(struct amdgpu_device *adev, return ret; } -int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, bool fatal) { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 37e1c93c243d..ed5793458a70 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -878,7 +878,7 @@ bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev); int amdgpu_ras_is_supported(struct amdgpu_device *adev, unsigned int block); -int amdgpu_ras_reset_gpu(struct amdgpu_device *adev); +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, bool fatal); struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index 151f83ea803b..f976b6deb42d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -129,7 +129,7 @@ int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev, if (amdgpu_sriov_vf(adev)) return AMDGPU_RAS_SUCCESS; - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, true); return AMDGPU_RAS_SUCCESS; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 4a72ff8d8d80..2596a1c2a64e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -198,7 +198,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, if ((err_data->ue_count || err_data->de_count) && (reset || (con && con->is_rma))) { con->gpu_reset_flags |= reset; - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, fa
[PATCH 1/5] drm/amdgpu: add RAS is_rma flag
Set the flag to true if bad page number reaches threshold. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 7 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h| 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 3 +-- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 2dc47475b8e9..616dc2387f34 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2940,7 +2940,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; u32 max_eeprom_records_count = 0; - bool exc_err_limit = false; int ret; if (!con || amdgpu_sriov_vf(adev)) @@ -2977,12 +2976,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) */ if (adev->gmc.xgmi.pending_reset) return 0; - ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit); + ret = amdgpu_ras_eeprom_init(&con->eeprom_control); /* * This calling fails when exc_err_limit is true or * ret != 0. */ - if (exc_err_limit || ret) + if (con->is_rma || ret) goto free; if (con->eeprom_control.ras_num_recs) { @@ -3033,7 +3032,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) * Except error threshold exceeding case, other failure cases in this * function would not fail amdgpu driver init. */ - if (!exc_err_limit) + if (!con->is_rma) ret = 0; else ret = -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index d06c01b978cd..437c58c85639 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -521,6 +521,7 @@ struct amdgpu_ras { bool update_channel_flag; /* Record status of smu mca debug mode */ bool is_aca_debug_mode; + bool is_rma; /* Record special requirements of gpu reset caller */ uint32_t gpu_reset_flags; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 9b789dcc2bd1..eae0a555df3c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) control->tbl_rai.health_percent = 0; } + if (amdgpu_bad_page_threshold != -1) + ras->is_rma = true; + /* ignore the -ENOTSUPP return value */ amdgpu_dpm_send_rma_reason(adev); } @@ -1321,8 +1324,7 @@ static int __read_table_ras_info(struct amdgpu_ras_eeprom_control *control) return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res; } -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, - bool *exceed_err_limit) +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) { struct amdgpu_device *adev = to_amdgpu_device(control); unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; @@ -1330,7 +1332,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); int res; - *exceed_err_limit = false; + ras->is_rma = false; if (!__is_ras_eeprom_supported(adev)) return 0; @@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1."); res = 0; } else { - *exceed_err_limit = true; + ras->is_rma = true; dev_err(adev->dev, "RAS records:%d exceed threshold:%d, " "GPU will not be initialized. Replace this GPU or increase the threshold", diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 6dfd667f3013..b9ebda577797 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -129,8 +129,7 @@ struct eeprom_table_record { unsigned char mcumc_id; } __packed; -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, - bool *exceed_err_limit); +int amdgpu_ras_eeprom_init(struct am
[PATCH 5/5] drm/amdgpu: add ras fatal flag to distingush fatal error reset
Check it in mode1 reset. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 32 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 1 + .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c| 2 +- .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 2 +- .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 2 +- 6 files changed, 37 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 2071e30d7e56..97b770ba6424 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2451,6 +2451,26 @@ bool amdgpu_ras_in_recovery(struct amdgpu_device *adev) return false; } +bool amdgpu_ras_in_fatal(struct amdgpu_device *adev) +{ + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + int hive_ras_fatal = 0; + + if (!amdgpu_ras_in_recovery(adev)) + return false; + + if (hive) { + hive_ras_fatal = atomic_read(&hive->ras_fatal); + amdgpu_put_xgmi_hive(hive); + } + + if (ras && (atomic_read(&ras->in_fatal) || hive_ras_fatal)) + return true; + + return false; +} + static void amdgpu_ras_do_recovery(struct work_struct *work) { struct amdgpu_ras *ras = @@ -2462,6 +2482,8 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) if (hive) { atomic_set(&hive->ras_recovery, 1); + if (atomic_read(&ras->in_fatal)) + atomic_set(&hive->ras_fatal, 1); /* If any device which is part of the hive received RAS fatal * error interrupt, set fatal error status on all. This @@ -2526,8 +2548,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); } atomic_set(&ras->in_recovery, 0); + atomic_set(&ras->in_fatal, 0); if (hive) { atomic_set(&hive->ras_recovery, 0); + atomic_set(&hive->ras_fatal, 0); amdgpu_put_xgmi_hive(hive); } } @@ -2982,6 +3006,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) mutex_init(&con->recovery_lock); INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); atomic_set(&con->in_recovery, 0); + atomic_set(&con->in_fatal, 0); con->eeprom_control.bad_channel_bitmap = 0; max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); @@ -4006,8 +4031,13 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, bool fatal) ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; } - if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) + if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) { + if (fatal) + atomic_set(&ras->in_fatal, 1); + amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); + } + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index ed5793458a70..444a7fb7fbe3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -489,6 +489,7 @@ struct amdgpu_ras { /* gpu recovery */ struct work_struct recovery_work; atomic_t in_recovery; + atomic_t in_fatal; struct amdgpu_device *adev; /* error handler data */ struct ras_err_handler_data *eh_data; @@ -953,6 +954,7 @@ int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, pasid_notify pasid_fn, void *data, uint32_t reset); bool amdgpu_ras_in_recovery(struct amdgpu_device *adev); +bool amdgpu_ras_in_fatal(struct amdgpu_device *adev); __printf(3, 4) void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index a3bfc16de6d4..a6d6272a4ec6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -44,6 +44,7 @@ struct amdgpu_hive_info { struct amdgpu_reset_domain *reset_domain; atomic_t ras_recovery; + atomic_t ras_fatal; struct ras_event_manager event_mgr; }; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c index 04533f99f1e3..a850e7b29d9d 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c @@ -1876,7 +1876,7 @@ static int aldebaran_mode1_reset(str
[PATCH 2/5] drm/amdgpu: trigger mode1 reset for RAS RMA status
Check RMA status in bad page retirement flow. v2: fix coding bugs in v1. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 28 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 8 +++ drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 4 +++- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 616dc2387f34..10cbcc0d1a1a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2049,8 +2049,9 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * struct amdgpu_device *adev = obj->adev; struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, obj->head.block, 0); + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - if (!block_obj) + if (!block_obj || !con) return; /* both query_poison_status and handle_poison_consumption are optional, @@ -2073,14 +2074,17 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption) poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); - /* gpu reset is fallback for failed and default cases */ - if (poison_stat) { + /* gpu reset is fallback for failed and default cases. +* For RMA case, amdgpu_umc_poison_handler will handle gpu reset. +*/ + if (poison_stat && !con->is_rma) { dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n", block_obj->ras_comm.name); amdgpu_ras_reset_gpu(adev); - } else { - amdgpu_gfx_poison_consumption_handler(adev, entry); } + + if (!poison_stat) + amdgpu_gfx_poison_consumption_handler(adev, entry); } static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj, @@ -2801,6 +2805,7 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) page_retirement_dwork.work); struct amdgpu_device *adev = con->adev; struct ras_err_data err_data; + unsigned long err_cnt; if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery)) return; @@ -2808,9 +2813,13 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) amdgpu_ras_error_data_init(&err_data); amdgpu_umc_handle_bad_pages(adev, &err_data); + err_cnt = err_data.err_addr_cnt; amdgpu_ras_error_data_fini(&err_data); + if (err_cnt && con->is_rma) + amdgpu_ras_reset_gpu(adev); + mutex_lock(&con->umc_ecc_log.lock); if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree, UMC_ECC_NEW_DETECTED_TAG)) @@ -2867,7 +2876,8 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, if (poison_msg->pasid_fn) poison_msg->pasid_fn(adev, pasid, poison_msg->data); - if (reset) { + /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */ + if (reset && !con->is_rma) { flush_delayed_work(&con->page_retirement_dwork); con->gpu_reset_flags |= reset; @@ -3983,6 +3993,12 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + /* mode1 is the only selection for RMA status */ + if (ras->is_rma) { + ras->gpu_reset_flags = 0; + ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; + } + if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 1dbe69eabb9a..4a72ff8d8d80 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -195,7 +195,8 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); amdgpu_umc_handle_bad_pages(adev, ras_error_status); - if (err_data->ue_count && reset) { + if ((err_data->ue_count || err_data->de_count) && + (reset || (con && con->is_rma))) { con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); } @@ -211,6 +212,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, .block
[PATCH 3/5] drm/amdgpu: create amdgpu_ras_in_recovery to simplify code
Reduce redundant code and user doesn't need to pay attention to RAS details. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 13 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 14 ++--- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 31 --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++ .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c| 5 ++- .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 3 +- .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 12 +-- 7 files changed, 29 insertions(+), 51 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2ba8c4d5dc76..1811c7ba9bdc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -6325,20 +6325,11 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) struct amdgpu_reset_context reset_context; u32 memsize; struct list_head device_list; - struct amdgpu_hive_info *hive; - int hive_ras_recovery = 0; - struct amdgpu_ras *ras; /* PCI error slot reset should be skipped During RAS recovery */ - hive = amdgpu_get_xgmi_hive(adev); - if (hive) { - hive_ras_recovery = atomic_read(&hive->ras_recovery); - amdgpu_put_xgmi_hive(hive); - } - ras = amdgpu_ras_get_context(adev); if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || -amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && - ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && + amdgpu_ras_in_recovery(adev)) return PCI_ERS_RESULT_RECOVERED; DRM_INFO("PCI error: slot reset callback!!\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 68505eaa92f9..fb5fc1fe6ad0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -505,9 +505,6 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id) { struct amdgpu_kiq *kiq = &adev->gfx.kiq[xcc_id]; struct amdgpu_ring *kiq_ring = &kiq->ring; - struct amdgpu_hive_info *hive; - struct amdgpu_ras *ras; - int hive_ras_recovery = 0; int i, r = 0; int j; @@ -532,16 +529,9 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id) * This is workaround: only skip kiq_ring test * during ras recovery in suspend stage for gfx9.4.3 */ - hive = amdgpu_get_xgmi_hive(adev); - if (hive) { - hive_ras_recovery = atomic_read(&hive->ras_recovery); - amdgpu_put_xgmi_hive(hive); - } - - ras = amdgpu_ras_get_context(adev); if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || -amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && - ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) { + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && + amdgpu_ras_in_recovery(adev)) { spin_unlock(&kiq->ring_lock); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 10cbcc0d1a1a..ff2d34dc9718 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1409,11 +1409,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block block) { struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; - struct amdgpu_hive_info *hive; - int hive_ras_recovery = 0; if (!block_obj || !block_obj->hw_ops) { dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", @@ -1425,15 +1422,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, !amdgpu_ras_get_aca_debug_mode(adev)) return -EOPNOTSUPP; - hive = amdgpu_get_xgmi_hive(adev); - if (hive) { - hive_ras_recovery = atomic_read(&hive->ras_recovery); - amdgpu_put_xgmi_hive(hive); - } - /* skip ras error reset in gpu reset */ - if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) || - hive_ras_recovery) && + if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) && ((smu_funcs && smu_fu
[PATCH 2/2] drm/amdgpu: trigger mode1 reset for RAS RMA status
Check RMA status in bad page retirement flow. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 +--- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 7 +++ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 934dfb2bf9e5..a6da44ac3fbd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2049,8 +2049,9 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * struct amdgpu_device *adev = obj->adev; struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, obj->head.block, 0); + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - if (!block_obj) + if (!block_obj || !con) return; /* both query_poison_status and handle_poison_consumption are optional, @@ -2074,7 +2075,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); /* gpu reset is fallback for failed and default cases */ - if (poison_stat) { + if (poison_stat || con->is_rma) { dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n", block_obj->ras_comm.name); amdgpu_ras_reset_gpu(adev); @@ -2817,6 +2818,9 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) schedule_delayed_work(&con->page_retirement_dwork, msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL)); mutex_unlock(&con->umc_ecc_log.lock); + + if (err_data->err_addr_cnt && con->is_rma) + amdgpu_ras_reset_gpu(adev); } static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, @@ -2867,7 +2871,7 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, if (poison_msg->pasid_fn) poison_msg->pasid_fn(adev, pasid, poison_msg->data); - if (reset) { + if (reset && !con->is_rma) { flush_delayed_work(&con->page_retirement_dwork); con->gpu_reset_flags |= reset; @@ -3983,6 +3987,12 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + /* mode1 is the only selection for RMA status */ + if (ras->is_rma) { + ras->gpu_reset_flags = 0; + ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; + } + if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 1dbe69eabb9a..5f3866548cb8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -195,7 +195,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); amdgpu_umc_handle_bad_pages(adev, ras_error_status); - if (err_data->ue_count && reset) { + if ((err_data->ue_count && (reset || con->is_rma)) { con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); } @@ -211,6 +211,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, .block = AMDGPU_RAS_BLOCK__UMC, }; struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); uint32_t timeout = timeout_ms; memset(&err_data, 0, sizeof(err_data)); @@ -243,9 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - if (reset) { - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - + if (reset || (err_data.err_addr_cnt && con->is_rma) { con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); } -- 2.34.1
[PATCH 1/2] drm/amdgpu: add RAS is_rma flag
Set the flag to true if bad page number reaches threshold. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 7 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h| 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 3 +-- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ecce022c657b..934dfb2bf9e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2940,7 +2940,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; u32 max_eeprom_records_count = 0; - bool exc_err_limit = false; int ret; if (!con || amdgpu_sriov_vf(adev)) @@ -2977,12 +2976,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) */ if (adev->gmc.xgmi.pending_reset) return 0; - ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit); + ret = amdgpu_ras_eeprom_init(&con->eeprom_control); /* * This calling fails when exc_err_limit is true or * ret != 0. */ - if (exc_err_limit || ret) + if (con->is_rma || ret) goto free; if (con->eeprom_control.ras_num_recs) { @@ -3033,7 +3032,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) * Except error threshold exceeding case, other failure cases in this * function would not fail amdgpu driver init. */ - if (!exc_err_limit) + if (!con->is_rma) ret = 0; else ret = -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index d06c01b978cd..437c58c85639 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -521,6 +521,7 @@ struct amdgpu_ras { bool update_channel_flag; /* Record status of smu mca debug mode */ bool is_aca_debug_mode; + bool is_rma; /* Record special requirements of gpu reset caller */ uint32_t gpu_reset_flags; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 9b789dcc2bd1..eae0a555df3c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) control->tbl_rai.health_percent = 0; } + if (amdgpu_bad_page_threshold != -1) + ras->is_rma = true; + /* ignore the -ENOTSUPP return value */ amdgpu_dpm_send_rma_reason(adev); } @@ -1321,8 +1324,7 @@ static int __read_table_ras_info(struct amdgpu_ras_eeprom_control *control) return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res; } -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, - bool *exceed_err_limit) +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) { struct amdgpu_device *adev = to_amdgpu_device(control); unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; @@ -1330,7 +1332,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); int res; - *exceed_err_limit = false; + ras->is_rma = false; if (!__is_ras_eeprom_supported(adev)) return 0; @@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1."); res = 0; } else { - *exceed_err_limit = true; + ras->is_rma = true; dev_err(adev->dev, "RAS records:%d exceed threshold:%d, " "GPU will not be initialized. Replace this GPU or increase the threshold", diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 6dfd667f3013..b9ebda577797 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -129,8 +129,7 @@ struct eeprom_table_record { unsigned char mcumc_id; } __packed; -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, - bool *exceed_err_limit); +int amdgpu_ras_eeprom_init(struct am
[PATCH] drm/amdgpu: use u32 for buf size in __amdgpu_eeprom_xfer
And also make sure the the value of msg[1].len should be in the range of u16. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c index 09a34c7258e2..35fee3e8cde2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c @@ -90,7 +90,7 @@ #define MAKE_I2C_ADDR(_aa) ((0xA << 3) | (((_aa) >> 16) & 0xF)) static int __amdgpu_eeprom_xfer(struct i2c_adapter *i2c_adap, u32 eeprom_addr, - u8 *eeprom_buf, u16 buf_size, bool read) + u8 *eeprom_buf, u32 buf_size, bool read) { u8 eeprom_offset_buf[EEPROM_OFFSET_SIZE]; struct i2c_msg msgs[] = { @@ -133,15 +133,15 @@ static int __amdgpu_eeprom_xfer(struct i2c_adapter *i2c_adap, u32 eeprom_addr, * cycle begins. This is implied for the * "i2c_transfer()" abstraction. */ - len = min(EEPROM_PAGE_SIZE - (eeprom_addr & - EEPROM_PAGE_MASK), - (u32)buf_size); + len = min(EEPROM_PAGE_SIZE - (eeprom_addr & EEPROM_PAGE_MASK), + buf_size); } else { /* Reading from the EEPROM has no limitation * on the number of bytes read from the EEPROM * device--they are simply sequenced out. +* Keep in mind that i2c_msg.len is u16 type. */ - len = buf_size; + len = min(U16_MAX, buf_size); } msgs[1].len = len; msgs[1].buf = eeprom_buf; -- 2.34.1
[PATCH] drm/amdgpu: update type of buf size to u32 for eeprom functions
Avoid overflow issue. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c | 6 +++--- drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c index e71768661ca8..09a34c7258e2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c @@ -179,7 +179,7 @@ static int __amdgpu_eeprom_xfer(struct i2c_adapter *i2c_adap, u32 eeprom_addr, * Returns the number of bytes read/written; -errno on error. */ static int amdgpu_eeprom_xfer(struct i2c_adapter *i2c_adap, u32 eeprom_addr, - u8 *eeprom_buf, u16 buf_size, bool read) + u8 *eeprom_buf, u32 buf_size, bool read) { const struct i2c_adapter_quirks *quirks = i2c_adap->quirks; u16 limit; @@ -225,7 +225,7 @@ static int amdgpu_eeprom_xfer(struct i2c_adapter *i2c_adap, u32 eeprom_addr, int amdgpu_eeprom_read(struct i2c_adapter *i2c_adap, u32 eeprom_addr, u8 *eeprom_buf, - u16 bytes) + u32 bytes) { return amdgpu_eeprom_xfer(i2c_adap, eeprom_addr, eeprom_buf, bytes, true); @@ -233,7 +233,7 @@ int amdgpu_eeprom_read(struct i2c_adapter *i2c_adap, int amdgpu_eeprom_write(struct i2c_adapter *i2c_adap, u32 eeprom_addr, u8 *eeprom_buf, - u16 bytes) + u32 bytes) { return amdgpu_eeprom_xfer(i2c_adap, eeprom_addr, eeprom_buf, bytes, false); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.h index 6935adb2be1f..8083b8253ef4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.h @@ -28,10 +28,10 @@ int amdgpu_eeprom_read(struct i2c_adapter *i2c_adap, u32 eeprom_addr, u8 *eeprom_buf, - u16 bytes); + u32 bytes); int amdgpu_eeprom_write(struct i2c_adapter *i2c_adap, u32 eeprom_addr, u8 *eeprom_buf, - u16 bytes); + u32 bytes); #endif -- 2.34.1
[PATCH] drm/amdgpu: retire UMC v12 mca_addr_to_pa
RAS TA will handle it, the interface is useless. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 1 - drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 105 ++--- drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 62 +-- 3 files changed, 7 insertions(+), 161 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 4ba26d7e52bd..afae497cbf40 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1460,7 +1460,6 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev) adev->umc.channel_offs = UMC_V12_0_PER_CHANNEL_OFFSET; adev->umc.active_mask = adev->aid_mask; adev->umc.retire_unit = UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; - adev->umc.channel_idx_tbl = &umc_v12_0_channel_idx_tbl[0][0][0]; if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) adev->umc.ras = &umc_v12_0_ras; break; diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index f46a176f9b55..a0122b22eda4 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -28,28 +28,6 @@ #include "umc/umc_12_0_0_sh_mask.h" #include "mp/mp_13_0_6_sh_mask.h" -const uint32_t - umc_v12_0_channel_idx_tbl[] - [UMC_V12_0_UMC_INSTANCE_NUM] - [UMC_V12_0_CHANNEL_INSTANCE_NUM] = { - {{3, 7, 11, 15, 2, 6, 10, 14}, {1, 5, 9, 13, 0, 4, 8, 12}, -{19, 23, 27, 31, 18, 22, 26, 30}, {17, 21, 25, 29, 16, 20, 24, 28}}, - {{47, 43, 39, 35, 46, 42, 38, 34}, {45, 41, 37, 33, 44, 40, 36, 32}, -{63, 59, 55, 51, 62, 58, 54, 50}, {61, 57, 53, 49, 60, 56, 52, 48}}, - {{79, 75, 71, 67, 78, 74, 70, 66}, {77, 73, 69, 65, 76, 72, 68, 64}, -{95, 91, 87, 83, 94, 90, 86, 82}, {93, 89, 85, 81, 92, 88, 84, 80}}, - {{99, 103, 107, 111, 98, 102, 106, 110}, {97, 101, 105, 109, 96, 100, 104, 108}, -{115, 119, 123, 127, 114, 118, 122, 126}, {113, 117, 121, 125, 112, 116, 120, 124}} - }; - -/* mapping of MCA error address to normalized address */ -static const uint32_t umc_v12_0_ma2na_mapping[] = { - 0, 5, 6, 8, 9, 14, 12, 13, - 10, 11, 15, 16, 17, 18, 19, 20, - 21, 22, 23, 24, 25, 26, 27, 28, - 24, 7, 29, 30, -}; - static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev, uint32_t node_inst, uint32_t umc_inst, @@ -192,79 +170,6 @@ static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev, umc_v12_0_reset_error_count(adev); } -static bool umc_v12_0_bit_wise_xor(uint32_t val) -{ - bool result = 0; - int i; - - for (i = 0; i < 32; i++) - result = result ^ ((val >> i) & 0x1); - - return result; -} - -static void umc_v12_0_mca_addr_to_pa(struct amdgpu_device *adev, - uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst, - uint32_t node_inst, - struct ta_ras_query_address_output *addr_out) -{ - uint32_t channel_index, i; - uint64_t na, soc_pa; - uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row; - uint32_t bank0, bank1, bank2, bank3, bank; - - bank_hash0 = (err_addr >> UMC_V12_0_MCA_B0_BIT) & 0x1ULL; - bank_hash1 = (err_addr >> UMC_V12_0_MCA_B1_BIT) & 0x1ULL; - bank_hash2 = (err_addr >> UMC_V12_0_MCA_B2_BIT) & 0x1ULL; - bank_hash3 = (err_addr >> UMC_V12_0_MCA_B3_BIT) & 0x1ULL; - col = (err_addr >> 1) & 0x1fULL; - row = (err_addr >> 10) & 0x3fffULL; - - /* apply bank hash algorithm */ - bank0 = - bank_hash0 ^ (UMC_V12_0_XOR_EN0 & - (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR0) ^ - (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR0; - bank1 = - bank_hash1 ^ (UMC_V12_0_XOR_EN1 & - (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR1) ^ - (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR1; - bank2 = - bank_hash2 ^ (UMC_V12_0_XOR_EN2 & - (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR2) ^ - (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR2; - bank3 = - bank_hash3 ^ (UMC_V12_0_XOR_EN3 & - (umc_v12_0_bit_wise_xor(col &
[PATCH] drm/amdgpu: update check condition for XGMI ACA UE
Check more possibile ext error codes. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index f4be524b0dc1..be1f4efa9ef6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -1066,7 +1066,9 @@ static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_ban switch (type) { case ACA_SMU_TYPE_UE: - count = ext_error_code == 0 ? count : 0ULL; + if (ext_error_code != 0 && ext_error_code != 9) + count = 0ULL; + ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, count); break; case ACA_SMU_TYPE_CE: -- 2.34.1
[PATCH] drm/amd/pm: update XGMI RAS UE criteria for sum v13.0.6
Add more possible ext error code. v2: still use ext error code instead of UC bit. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 443233563a52..7a7c7f4b7de3 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2694,7 +2694,8 @@ static int mca_pcs_xgmi_mca_get_err_count(const struct mca_ras_info *mca_ras, st ext_error_code = MCA_REG__STATUS__ERRORCODEEXT(entry->regs[MCA_REG_IDX_STATUS]); err_cnt = MCA_REG__MISC0__ERRCNT(entry->regs[MCA_REG_IDX_MISC0]); - if (type == AMDGPU_MCA_ERROR_TYPE_UE && ext_error_code == 0) + if (type == AMDGPU_MCA_ERROR_TYPE_UE && + (ext_error_code == 0 || ext_error_code == 9)) *count = err_cnt; else if (type == AMDGPU_MCA_ERROR_TYPE_CE && ext_error_code == 6) *count = err_cnt; -- 2.34.1
[PATCH] drm/amd/pm: update XGMI RAS UC criteria for sum v13.0.6
Check UC bit instead of ext error code. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 443233563a52..027bbebbf28e 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2689,12 +2689,13 @@ static int mca_pcs_xgmi_mca_get_err_count(const struct mca_ras_info *mca_ras, st uint32_t *count) { u32 ext_error_code; - u32 err_cnt; + u32 err_cnt, uc; ext_error_code = MCA_REG__STATUS__ERRORCODEEXT(entry->regs[MCA_REG_IDX_STATUS]); err_cnt = MCA_REG__MISC0__ERRCNT(entry->regs[MCA_REG_IDX_MISC0]); + uc = MCA_REG__STATUS__UC(entry->regs[MCA_REG_IDX_STATUS]); - if (type == AMDGPU_MCA_ERROR_TYPE_UE && ext_error_code == 0) + if (type == AMDGPU_MCA_ERROR_TYPE_UE && uc) *count = err_cnt; else if (type == AMDGPU_MCA_ERROR_TYPE_CE && ext_error_code == 6) *count = err_cnt; -- 2.34.1
[PATCH] drm/amdgpu: implement IRQ_STATE_ENABLE for SDMA v4.4.2
SDMA_CNTL is not set in some cases, driver configures it by itself. v2: simplify code Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 16 +++- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 71c2f50530cb..f8e2cd514493 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -1602,19 +1602,9 @@ static int sdma_v4_4_2_set_ecc_irq_state(struct amdgpu_device *adev, u32 sdma_cntl; sdma_cntl = RREG32_SDMA(type, regSDMA_CNTL); - switch (state) { - case AMDGPU_IRQ_STATE_DISABLE: - sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL, - DRAM_ECC_INT_ENABLE, 0); - WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl); - break; - /* sdma ecc interrupt is enabled by default -* driver doesn't need to do anything to -* enable the interrupt */ - case AMDGPU_IRQ_STATE_ENABLE: - default: - break; - } + sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL, DRAM_ECC_INT_ENABLE, + state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0); + WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl); return 0; } -- 2.34.1
[PATCH] drm/amdgpu: implement IRQ_STATE_ENABLE for SDMA v4.4.2
SDMA_CNTL is not set in some cases, driver configures it by itself. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 71c2f50530cb..d10ae4ce5ddd 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -1608,10 +1608,11 @@ static int sdma_v4_4_2_set_ecc_irq_state(struct amdgpu_device *adev, DRAM_ECC_INT_ENABLE, 0); WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl); break; - /* sdma ecc interrupt is enabled by default -* driver doesn't need to do anything to -* enable the interrupt */ case AMDGPU_IRQ_STATE_ENABLE: + sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL, + DRAM_ECC_INT_ENABLE, 1); + WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl); + break; default: break; } -- 2.34.1
[PATCH 1/2] drm/amdgpu: add socket id parameter for psp query address cmd
And set the socket id. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/ta_ras_if.h | 1 + drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 14 +++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h index 056d4df8fa1f..3ac56a9645eb 100644 --- a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h +++ b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h @@ -146,6 +146,7 @@ struct ta_ras_mca_addr { uint32_t ch_inst; uint32_t umc_inst; uint32_t node_inst; + uint32_t socket_id; }; struct ta_ras_phy_addr { diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 77af4e25ff46..0a9cc87e98d0 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -268,7 +268,7 @@ static void umc_v12_0_mca_addr_to_pa(struct amdgpu_device *adev, static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, struct ras_err_data *err_data, uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst, - uint32_t node_inst) + uint32_t node_inst, uint32_t socket_id) { uint32_t col, row, row_xor, bank, channel_index; uint64_t soc_pa, retired_page, column; @@ -280,6 +280,7 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, addr_in.ma.ch_inst = ch_inst; addr_in.ma.umc_inst = umc_inst; addr_in.ma.node_inst = node_inst; + addr_in.ma.socket_id = socket_id; if (psp_ras_query_address(&adev->psp, &addr_in, &addr_out)) /* fallback to old path if fail to get pa from psp */ @@ -331,6 +332,7 @@ static int umc_v12_0_query_error_address(struct amdgpu_device *adev, struct ras_err_data *err_data = (struct ras_err_data *)data; uint64_t umc_reg_offset = get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst); + uint32_t socket_id = 0; mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); @@ -357,8 +359,13 @@ static int umc_v12_0_query_error_address(struct amdgpu_device *adev, err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); + if (!adev->aid_mask && + adev->smuio.funcs && + adev->smuio.funcs->get_socket_id) + socket_id = adev->smuio.funcs->get_socket_id(adev); + umc_v12_0_convert_error_address(adev, err_data, err_addr, - ch_inst, umc_inst, node_inst); + ch_inst, umc_inst, node_inst, socket_id); } /* clear umc status */ @@ -450,7 +457,8 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade err_data, err_addr, MCA_IPID_LO_2_UMC_CH(InstanceIdLo), MCA_IPID_LO_2_UMC_INST(InstanceIdLo), - err_info->mcm_info.die_id); + err_info->mcm_info.die_id, + err_info->mcm_info.socket_id); } /* Delete error address node from list and free memory */ -- 2.34.1
[PATCH 2/2] drm/amdgpu: simplify convert_error_address interface for UMC v12
Replace separate parameters with struct ta_ras_query_address_input. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57 ++ 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 0a9cc87e98d0..d0fcfcb3404f 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -266,26 +266,19 @@ static void umc_v12_0_mca_addr_to_pa(struct amdgpu_device *adev, } static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, - struct ras_err_data *err_data, uint64_t err_addr, - uint32_t ch_inst, uint32_t umc_inst, - uint32_t node_inst, uint32_t socket_id) + struct ras_err_data *err_data, + struct ta_ras_query_address_input *addr_in) { uint32_t col, row, row_xor, bank, channel_index; - uint64_t soc_pa, retired_page, column; - struct ta_ras_query_address_input addr_in; + uint64_t soc_pa, retired_page, column, err_addr; struct ta_ras_query_address_output addr_out; - addr_in.addr_type = TA_RAS_MCA_TO_PA; - addr_in.ma.err_addr = err_addr; - addr_in.ma.ch_inst = ch_inst; - addr_in.ma.umc_inst = umc_inst; - addr_in.ma.node_inst = node_inst; - addr_in.ma.socket_id = socket_id; - - if (psp_ras_query_address(&adev->psp, &addr_in, &addr_out)) + err_addr = addr_in->ma.err_addr; + addr_in->addr_type = TA_RAS_MCA_TO_PA; + if (psp_ras_query_address(&adev->psp, addr_in, &addr_out)) /* fallback to old path if fail to get pa from psp */ - umc_v12_0_mca_addr_to_pa(adev, err_addr, ch_inst, umc_inst, - node_inst, &addr_out); + umc_v12_0_mca_addr_to_pa(adev, err_addr, addr_in->ma.ch_inst, + addr_in->ma.umc_inst, addr_in->ma.node_inst, &addr_out); soc_pa = addr_out.pa.pa; bank = addr_out.pa.bank; @@ -310,7 +303,7 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", retired_page, row, col, bank, channel_index); amdgpu_umc_fill_error_record(err_data, err_addr, - retired_page, channel_index, umc_inst); + retired_page, channel_index, addr_in->ma.umc_inst); /* shift R13 bit */ retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT); @@ -318,7 +311,7 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", retired_page, row_xor, col, bank, channel_index); amdgpu_umc_fill_error_record(err_data, err_addr, - retired_page, channel_index, umc_inst); + retired_page, channel_index, addr_in->ma.umc_inst); } } @@ -326,13 +319,13 @@ static int umc_v12_0_query_error_address(struct amdgpu_device *adev, uint32_t node_inst, uint32_t umc_inst, uint32_t ch_inst, void *data) { + struct ras_err_data *err_data = (struct ras_err_data *)data; + struct ta_ras_query_address_input addr_in; uint64_t mc_umc_status_addr; uint64_t mc_umc_status, err_addr; uint64_t mc_umc_addrt0; - struct ras_err_data *err_data = (struct ras_err_data *)data; uint64_t umc_reg_offset = get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst); - uint32_t socket_id = 0; mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); @@ -362,10 +355,16 @@ static int umc_v12_0_query_error_address(struct amdgpu_device *adev, if (!adev->aid_mask && adev->smuio.funcs && adev->smuio.funcs->get_socket_id) - socket_id = adev->smuio.funcs->get_socket_id(adev); + addr_in.ma.socket_id = adev->smuio.funcs->get_socket_id(adev); + else + addr_in.ma.socket_id = 0; + + addr_in.ma.err_addr = err_addr; + addr_in.ma.ch_inst = ch_inst; + addr_in.ma.umc_inst = umc_inst; + addr_in.ma.node_inst = node_inst; - umc_v12_0_convert_error_address(adev, err_data, err_addr, - ch_inst, umc_inst, node_inst, socke
[PATCH 3/3] drm/amdgpu: make reset method configurable for RAS poison
Each RAS block has different requirement for gpu reset in poison consumption handling. Add support for mmhub RAS poison consumption handling. v2: remove the mmhub poison support for kfd int v10. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c| 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h| 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 14 ++--- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 4 ++-- .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 13 +++- .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c | 9 + .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 20 ++- 8 files changed, 40 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 0b4910108f61..66753940bb4d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -760,7 +760,7 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev) } void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, - enum amdgpu_ras_block block, bool reset) + enum amdgpu_ras_block block, uint32_t reset) { amdgpu_umc_poison_handler(adev, block, reset); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 03bf20e0e3da..ad50c7bbc326 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -400,7 +400,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev); int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev, struct tile_config *config); void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, - enum amdgpu_ras_block block, bool reset); + enum amdgpu_ras_block block, uint32_t reset); bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev); bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); void amdgpu_amdkfd_block_mmu_notifications(void *p); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index e32a186c2de1..58fe7bebdf1b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2045,7 +2045,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * } } - amdgpu_umc_poison_handler(adev, obj->head.block, false); + amdgpu_umc_poison_handler(adev, obj->head.block, 0); if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption) poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); @@ -2698,7 +2698,7 @@ static int amdgpu_ras_page_retirement_thread(void *param) atomic_dec(&con->page_retirement_req_cnt); amdgpu_umc_bad_page_polling_timeout(adev, - false, MAX_UMC_POISON_POLLING_TIME_ASYNC); + 0, MAX_UMC_POISON_POLLING_TIME_ASYNC); } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 20436f81856a..2c02585dcbff 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -186,9 +186,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, amdgpu_umc_handle_bad_pages(adev, ras_error_status); if (err_data->ue_count && reset) { - /* use mode-2 reset for poison consumption */ - if (!entry) - con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; + con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); } @@ -196,7 +194,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, } int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, - bool reset, uint32_t timeout_ms) + uint32_t reset, uint32_t timeout_ms) { struct ras_err_data err_data; struct ras_common_if head = { @@ -238,8 +236,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, if (reset) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - /* use mode-2 reset for poison consumption */ - con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; + con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); } @@ -247,7 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, } int amdgpu_umc_poison_handler(struct amdgpu_device *adev, - enum amdgpu_ras_block block, bool reset) +
[PATCH 2/3] drm/amdgpu: support utcl2 RAS poison query for mmhub
Support the query for both gfxhub and mmhub, also replace xcc_id with hub_inst. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 17 - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 5 ++--- .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c| 17 +++-- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 17 +++-- 5 files changed, 37 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 0c794301d18d..0b4910108f61 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -782,12 +782,19 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, } bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev, - int xcc_id) + int hub_inst, int hub_type) { - if (adev->gfxhub.funcs->query_utcl2_poison_status) - return adev->gfxhub.funcs->query_utcl2_poison_status(adev, xcc_id); - else - return false; + if (!hub_type) { + if (adev->gfxhub.funcs->query_utcl2_poison_status) + return adev->gfxhub.funcs->query_utcl2_poison_status(adev, hub_inst); + else + return false; + } else { + if (adev->mmhub.funcs->query_utcl2_poison_status) + return adev->mmhub.funcs->query_utcl2_poison_status(adev, hub_inst); + else + return false; + } } int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 73b7fa7c5116..03bf20e0e3da 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -406,7 +406,7 @@ bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem * void amdgpu_amdkfd_block_mmu_notifications(void *p); int amdgpu_amdkfd_criu_resume(void *p); bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev, - int xcc_id); + int hub_inst, int hub_type); int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, uint64_t size, u32 alloc_flag, int8_t xcp_id); void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index b3894fe868b2..4ba26d7e52bd 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -670,10 +670,9 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW); fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); - /* for gfx fed error, kfd will handle it, return directly */ + /* for fed error, kfd will handle it, return directly */ if (fed && amdgpu_ras_is_poison_mode_supported(adev) && - (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2)) && - (vmhub < AMDGPU_MMHUB0_START)) + (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2))) return 0; WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c index a8e76287dde0..650da18b0d87 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c @@ -369,18 +369,23 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry); uint32_t vmid_type = SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry); - int xcc_id = 0; + int hub_inst = 0; struct kfd_hsa_memory_exception_data exception_data; + /* gfxhub */ if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) { - xcc_id = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev, + hub_inst = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev, node_id); - if (xcc_id < 0) - xcc_id = 0; + if (hub_inst < 0) + hub_inst = 0; } - if (client_id == SOC15_IH_CLIENTID_UTCL2 && !vmid_type && - amdgpu_amdkfd_ras_query_utc
[PATCH 1/3] drm/amdgpu: add utcl2 RAS poison query for mmhub
Add it for mmhub v1.8. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h | 2 ++ drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 15 +++ 2 files changed, 17 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h index 1ca9d4ed8063..95d676ee207f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h @@ -63,6 +63,8 @@ struct amdgpu_mmhub_funcs { uint64_t page_table_base); void (*update_power_gating)(struct amdgpu_device *adev, bool enable); + bool (*query_utcl2_poison_status)(struct amdgpu_device *adev, + int hub_inst); }; struct amdgpu_mmhub { diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c index c0fc44cdd658..b7aa05dbef86 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c @@ -559,6 +559,20 @@ static void mmhub_v1_8_get_clockgating(struct amdgpu_device *adev, u64 *flags) } +static bool mmhub_v1_8_query_utcl2_poison_status(struct amdgpu_device *adev, + int hub_inst) +{ + u32 fed, status; + + status = RREG32_SOC15(MMHUB, hub_inst, regVM_L2_PROTECTION_FAULT_STATUS); + fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); + /* reset page fault status */ + WREG32_P(SOC15_REG_OFFSET(MMHUB, hub_inst, + regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1); + + return fed; +} + const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = { .get_fb_location = mmhub_v1_8_get_fb_location, .init = mmhub_v1_8_init, @@ -568,6 +582,7 @@ const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = { .setup_vm_pt_regs = mmhub_v1_8_setup_vm_pt_regs, .set_clockgating = mmhub_v1_8_set_clockgating, .get_clockgating = mmhub_v1_8_get_clockgating, + .query_utcl2_poison_status = mmhub_v1_8_query_utcl2_poison_status, }; static const struct amdgpu_ras_err_status_reg_entry mmhub_v1_8_ce_reg_list[] = { -- 2.34.1
[PATCH 3/3] drm/amdgpu: make reset method configurable for RAS poison
Each RAS block has different requirement for gpu reset in poison consumption handling. Add support for mmhub RAS poison consumption handling. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c| 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h| 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 14 ++--- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 4 ++-- .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 20 ++- .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 20 ++- 7 files changed, 42 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 9687650b0fe3..262d20167039 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -760,7 +760,7 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev) } void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, - enum amdgpu_ras_block block, bool reset) + enum amdgpu_ras_block block, uint32_t reset) { amdgpu_umc_poison_handler(adev, block, reset); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 03bf20e0e3da..ad50c7bbc326 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -400,7 +400,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev); int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev, struct tile_config *config); void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, - enum amdgpu_ras_block block, bool reset); + enum amdgpu_ras_block block, uint32_t reset); bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev); bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); void amdgpu_amdkfd_block_mmu_notifications(void *p); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index e32a186c2de1..58fe7bebdf1b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2045,7 +2045,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * } } - amdgpu_umc_poison_handler(adev, obj->head.block, false); + amdgpu_umc_poison_handler(adev, obj->head.block, 0); if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption) poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); @@ -2698,7 +2698,7 @@ static int amdgpu_ras_page_retirement_thread(void *param) atomic_dec(&con->page_retirement_req_cnt); amdgpu_umc_bad_page_polling_timeout(adev, - false, MAX_UMC_POISON_POLLING_TIME_ASYNC); + 0, MAX_UMC_POISON_POLLING_TIME_ASYNC); } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 20436f81856a..2c02585dcbff 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -186,9 +186,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, amdgpu_umc_handle_bad_pages(adev, ras_error_status); if (err_data->ue_count && reset) { - /* use mode-2 reset for poison consumption */ - if (!entry) - con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; + con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); } @@ -196,7 +194,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, } int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, - bool reset, uint32_t timeout_ms) + uint32_t reset, uint32_t timeout_ms) { struct ras_err_data err_data; struct ras_common_if head = { @@ -238,8 +236,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, if (reset) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - /* use mode-2 reset for poison consumption */ - con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; + con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); } @@ -247,7 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, } int amdgpu_umc_poison_handler(struct amdgpu_device *adev, - enum amdgpu_ras_block block, bool reset) + enum amdgpu_ras_block block, uint32_t reset) { int ret = AMDGPU_RAS_SUCCESS; @@ -311,7 +308,8
[PATCH 2/3] drm/amdgpu: support utcl2 RAS poison query for mmhub
Support the query for both gfxhub and mmhub, also replace xcc_id with hub_inst. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 17 - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 3 +-- .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c| 17 +++-- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 17 +++-- 5 files changed, 36 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index fa958cbc603a..9687650b0fe3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -782,12 +782,19 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, } bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev, - int xcc_id) + int hub_inst, int hub_type) { - if (adev->gfxhub.funcs->query_utcl2_poison_status) - return adev->gfxhub.funcs->query_utcl2_poison_status(adev, xcc_id); - else - return false; + if (!hub_type) { + if (adev->gfxhub.funcs->query_utcl2_poison_status) + return adev->gfxhub.funcs->query_utcl2_poison_status(adev, hub_inst); + else + return false; + } else { + if (adev->mmhub.funcs->query_utcl2_poison_status) + return adev->mmhub.funcs->query_utcl2_poison_status(adev, hub_inst); + else + return false; + } } int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 73b7fa7c5116..03bf20e0e3da 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -406,7 +406,7 @@ bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem * void amdgpu_amdkfd_block_mmu_notifications(void *p); int amdgpu_amdkfd_criu_resume(void *p); bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev, - int xcc_id); + int hub_inst, int hub_type); int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, uint64_t size, u32 alloc_flag, int8_t xcp_id); void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index fb19b88e5522..d615d0fc2c6a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -672,8 +672,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, /* for gfx fed error, kfd will handle it, return directly */ if (fed && amdgpu_ras_is_poison_mode_supported(adev) && - (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2)) && - (vmhub < AMDGPU_MMHUB0_START)) + (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2))) return 0; WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c index a8e76287dde0..650da18b0d87 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c @@ -369,18 +369,23 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry); uint32_t vmid_type = SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry); - int xcc_id = 0; + int hub_inst = 0; struct kfd_hsa_memory_exception_data exception_data; + /* gfxhub */ if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) { - xcc_id = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev, + hub_inst = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev, node_id); - if (xcc_id < 0) - xcc_id = 0; + if (hub_inst < 0) + hub_inst = 0; } - if (client_id == SOC15_IH_CLIENTID_UTCL2 && !vmid_type && - amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, xcc_id)) { + /* mmhub */ + if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC) + hub_inst = nod
[PATCH 1/3] drm/amdgpu: add utcl2 RAS poison query for mmhub
Add it for mmhub v1.8. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h | 2 ++ drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 15 +++ 2 files changed, 17 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h index 1ca9d4ed8063..95d676ee207f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h @@ -63,6 +63,8 @@ struct amdgpu_mmhub_funcs { uint64_t page_table_base); void (*update_power_gating)(struct amdgpu_device *adev, bool enable); + bool (*query_utcl2_poison_status)(struct amdgpu_device *adev, + int hub_inst); }; struct amdgpu_mmhub { diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c index c0fc44cdd658..b7aa05dbef86 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c @@ -559,6 +559,20 @@ static void mmhub_v1_8_get_clockgating(struct amdgpu_device *adev, u64 *flags) } +static bool mmhub_v1_8_query_utcl2_poison_status(struct amdgpu_device *adev, + int hub_inst) +{ + u32 fed, status; + + status = RREG32_SOC15(MMHUB, hub_inst, regVM_L2_PROTECTION_FAULT_STATUS); + fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); + /* reset page fault status */ + WREG32_P(SOC15_REG_OFFSET(MMHUB, hub_inst, + regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1); + + return fed; +} + const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = { .get_fb_location = mmhub_v1_8_get_fb_location, .init = mmhub_v1_8_init, @@ -568,6 +582,7 @@ const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = { .setup_vm_pt_regs = mmhub_v1_8_setup_vm_pt_regs, .set_clockgating = mmhub_v1_8_set_clockgating, .get_clockgating = mmhub_v1_8_get_clockgating, + .query_utcl2_poison_status = mmhub_v1_8_query_utcl2_poison_status, }; static const struct amdgpu_ras_err_status_reg_entry mmhub_v1_8_ce_reg_list[] = { -- 2.34.1
[PATCH] drm/amdgpu: add deferred error check for UMC v12 address query
Both RAS UE and deferred errors need page retirement. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 14ef7a24be7b..77af4e25ff46 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -348,7 +348,8 @@ static int umc_v12_0_query_error_address(struct amdgpu_device *adev, } /* calculate error address if ue error is detected */ - if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)) { + if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) || + umc_v12_0_is_deferred_error(adev, mc_umc_status)) { mc_umc_addrt0 = SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0); -- 2.34.1
[PATCH 5/5] drm/amdgpu: skip GFX FED error in page fault handling
Let kfd interrupt handler process it. v2: return 0 instead of 1 for fed error. drop the usage of strcmp in interrupt handler. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 001e96d89cd7..09364817ae97 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -552,7 +552,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, { bool retry_fault = !!(entry->src_data[1] & 0x80); bool write_fault = !!(entry->src_data[1] & 0x20); - uint32_t status = 0, cid = 0, rw = 0; + uint32_t status = 0, cid = 0, rw = 0, fed = 0; struct amdgpu_task_info task_info; struct amdgpu_vmhub *hub; const char *mmhub_cid; @@ -663,6 +663,14 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, status = RREG32(hub->vm_l2_pro_fault_status); cid = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, CID); rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW); + fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); + + /* for gfx fed error, kfd will handle it, return directly */ + if (fed && amdgpu_ras_is_poison_mode_supported(adev) && + (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2)) && + (vmhub < AMDGPU_MMHUB0_START)) + return 0; + WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); #ifdef HAVE_STRUCT_XARRAY amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub); -- 2.34.1
[PATCH 4/5] amd/amdkfd: get node id for query_utcl2_poison_status
Obtain it from ring entry. v2: replace node id with logical xcc id. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 14 -- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 14 -- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c index 9a06c6fb6605..a8e76287dde0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c @@ -367,10 +367,20 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, client_id == SOC15_IH_CLIENTID_UTCL2) { struct kfd_vm_fault_info info = {0}; uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); + uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry); + uint32_t vmid_type = SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry); + int xcc_id = 0; struct kfd_hsa_memory_exception_data exception_data; - if (client_id == SOC15_IH_CLIENTID_UTCL2 && - amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) { + if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) { + xcc_id = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev, + node_id); + if (xcc_id < 0) + xcc_id = 0; + } + + if (client_id == SOC15_IH_CLIENTID_UTCL2 && !vmid_type && + amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, xcc_id)) { event_interrupt_poison_consumption(dev, pasid, client_id); return; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 91dd5e045b51..ff7392336795 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -413,10 +413,20 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, client_id == SOC15_IH_CLIENTID_UTCL2) { struct kfd_vm_fault_info info = {0}; uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); + uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry); + uint32_t vmid_type = SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry); + int xcc_id = 0; struct kfd_hsa_memory_exception_data exception_data; - if (client_id == SOC15_IH_CLIENTID_UTCL2 && - amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) { + if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) { + xcc_id = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev, + node_id); + if (xcc_id < 0) + xcc_id = 0; + } + + if (client_id == SOC15_IH_CLIENTID_UTCL2 && !vmid_type && + amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, xcc_id)) { event_interrupt_poison_consumption_v9(dev, pasid, client_id); return; } -- 2.34.1
[PATCH 3/5] drm/amdgpu: retire gfx ras query_utcl2_poison_status
Replace it with related interface in gfxhub functions. v2: replace node id with xcc id. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 7 --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h| 1 - drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c| 12 4 files changed, 6 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index f759a42def59..817464ee6a01 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -776,10 +776,11 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, return 0; } -bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev) +bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev, + int xcc_id) { - if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status) - return adev->gfx.ras->query_utcl2_poison_status(adev); + if (adev->gfxhub.funcs->query_utcl2_poison_status) + return adev->gfxhub.funcs->query_utcl2_poison_status(adev, xcc_id); else return false; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index b2990470d1c6..47d30f67f2b5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -404,7 +404,8 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); void amdgpu_amdkfd_block_mmu_notifications(void *p); int amdgpu_amdkfd_criu_resume(void *p); -bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev); +bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev, + int xcc_id); int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, uint64_t size, u32 alloc_flag, int8_t xcp_id); void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index a36d153b2ff7..9fa580b85417 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -268,7 +268,6 @@ struct amdgpu_cu_info { struct amdgpu_gfx_ras { struct amdgpu_ras_block_object ras_block; void (*enable_watchdog_timer)(struct amdgpu_device *adev); - bool (*query_utcl2_poison_status)(struct amdgpu_device *adev); int (*rlc_gc_fed_irq)(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c index 0d5b4133fdf7..e3ed568eaacc 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c @@ -1909,18 +1909,7 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev) mutex_unlock(&adev->grbm_idx_mutex); } -static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device *adev) -{ - u32 status = 0; - struct amdgpu_vmhub *hub; - - hub = &adev->vmhub[AMDGPU_GFXHUB(0)]; - status = RREG32(hub->vm_l2_pro_fault_status); - /* reset page fault status */ - WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); - return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); -} struct amdgpu_ras_block_hw_ops gfx_v9_4_2_ras_ops = { .query_ras_error_count = &gfx_v9_4_2_query_ras_error_count, @@ -1934,5 +1923,4 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = { .hw_ops = &gfx_v9_4_2_ras_ops, }, .enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer, - .query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status, }; -- 2.34.1
[PATCH 1/5] drm/amdgpu: add new bit definitions for GC 9.0 PROTECTION_FAULT_STATUS
Add UCE and FED bit definitions. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h | 4 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h index efc16ddf274a..2dfa0e5b1aa3 100644 --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h @@ -6822,6 +6822,8 @@ #define VM_L2_PROTECTION_FAULT_STATUS__VMID__SHIFT 0x14 #define VM_L2_PROTECTION_FAULT_STATUS__VF__SHIFT 0x18 #define VM_L2_PROTECTION_FAULT_STATUS__VFID__SHIFT 0x19 +#define VM_L2_PROTECTION_FAULT_STATUS__UCE__SHIFT 0x1d +#define VM_L2_PROTECTION_FAULT_STATUS__FED__SHIFT 0x1e #define VM_L2_PROTECTION_FAULT_STATUS__MORE_FAULTS_MASK 0x0001L #define VM_L2_PROTECTION_FAULT_STATUS__WALKER_ERROR_MASK 0x000EL #define VM_L2_PROTECTION_FAULT_STATUS__PERMISSION_FAULTS_MASK 0x00F0L @@ -6832,6 +6834,8 @@ #define VM_L2_PROTECTION_FAULT_STATUS__VMID_MASK 0x00F0L #define VM_L2_PROTECTION_FAULT_STATUS__VF_MASK 0x0100L #define VM_L2_PROTECTION_FAULT_STATUS__VFID_MASK 0x1E00L +#define VM_L2_PROTECTION_FAULT_STATUS__UCE_MASK 0x2000L +#define VM_L2_PROTECTION_FAULT_STATUS__FED_MASK 0x4000L //VM_L2_PROTECTION_FAULT_ADDR_LO32 #define VM_L2_PROTECTION_FAULT_ADDR_LO32__LOGICAL_PAGE_ADDR_LO32__SHIFT 0x0 #define VM_L2_PROTECTION_FAULT_ADDR_LO32__LOGICAL_PAGE_ADDR_LO32_MASK 0xL -- 2.34.1
[PATCH 2/5] drm/amdgpu: add utcl2 poison query for gfxhub
Implement it for gfxhub 1.0 and 1.2. v2: input logical xcc id for poison query interface. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h | 2 ++ drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c | 17 + drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c | 15 +++ 3 files changed, 34 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h index c7b44aeb671b..103a837ccc71 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h @@ -38,6 +38,8 @@ struct amdgpu_gfxhub_funcs { void (*mode2_save_regs)(struct amdgpu_device *adev); void (*mode2_restore_regs)(struct amdgpu_device *adev); void (*halt)(struct amdgpu_device *adev); + bool (*query_utcl2_poison_status)(struct amdgpu_device *adev, + int xcc_id); }; struct amdgpu_gfxhub { diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c index 22175da0e16a..d200310d1731 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c @@ -443,6 +443,22 @@ static void gfxhub_v1_0_init(struct amdgpu_device *adev) mmVM_INVALIDATE_ENG0_ADDR_RANGE_LO32; } +static bool gfxhub_v1_0_query_utcl2_poison_status(struct amdgpu_device *adev, + int xcc_id) +{ + u32 status = 0; + struct amdgpu_vmhub *hub; + + if (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(9, 4, 2)) + return false; + + hub = &adev->vmhub[AMDGPU_GFXHUB(0)]; + status = RREG32(hub->vm_l2_pro_fault_status); + /* reset page fault status */ + WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); + + return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); +} const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = { .get_mc_fb_offset = gfxhub_v1_0_get_mc_fb_offset, @@ -452,4 +468,5 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = { .set_fault_enable_default = gfxhub_v1_0_set_fault_enable_default, .init = gfxhub_v1_0_init, .get_xgmi_info = gfxhub_v1_1_get_xgmi_info, + .query_utcl2_poison_status = gfxhub_v1_0_query_utcl2_poison_status, }; diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c index 49aecdcee006..77df8c9cbad2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c @@ -620,6 +620,20 @@ static int gfxhub_v1_2_get_xgmi_info(struct amdgpu_device *adev) return 0; } +static bool gfxhub_v1_2_query_utcl2_poison_status(struct amdgpu_device *adev, + int xcc_id) +{ + u32 fed, status; + + status = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regVM_L2_PROTECTION_FAULT_STATUS); + fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); + /* reset page fault status */ + WREG32_P(SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), + regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1); + + return fed; +} + const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = { .get_mc_fb_offset = gfxhub_v1_2_get_mc_fb_offset, .setup_vm_pt_regs = gfxhub_v1_2_setup_vm_pt_regs, @@ -628,6 +642,7 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = { .set_fault_enable_default = gfxhub_v1_2_set_fault_enable_default, .init = gfxhub_v1_2_init, .get_xgmi_info = gfxhub_v1_2_get_xgmi_info, + .query_utcl2_poison_status = gfxhub_v1_2_query_utcl2_poison_status, }; static int gfxhub_v1_2_xcp_resume(void *handle, uint32_t inst_mask) -- 2.34.1
[PATCH 4/5] amd/amdkfd: get node id for query_utcl2_poison_status
Obtain it from ring entry. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 3 ++- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c index 9a06c6fb6605..747cb785a7d3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c @@ -367,10 +367,11 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, client_id == SOC15_IH_CLIENTID_UTCL2) { struct kfd_vm_fault_info info = {0}; uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); + uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry); struct kfd_hsa_memory_exception_data exception_data; if (client_id == SOC15_IH_CLIENTID_UTCL2 && - amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) { + amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, node_id)) { event_interrupt_poison_consumption(dev, pasid, client_id); return; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 91dd5e045b51..eb94d967c532 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -413,10 +413,11 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, client_id == SOC15_IH_CLIENTID_UTCL2) { struct kfd_vm_fault_info info = {0}; uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); + uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry); struct kfd_hsa_memory_exception_data exception_data; if (client_id == SOC15_IH_CLIENTID_UTCL2 && - amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) { + amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, node_id)) { event_interrupt_poison_consumption_v9(dev, pasid, client_id); return; } -- 2.34.1
[PATCH 5/5] drm/amdgpu: skip GFX FED error in page fault handling
Let kfd interrupt handler process it. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 773725a92cf1..70defc394b7b 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -552,7 +552,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, { bool retry_fault = !!(entry->src_data[1] & 0x80); bool write_fault = !!(entry->src_data[1] & 0x20); - uint32_t status = 0, cid = 0, rw = 0; + uint32_t status = 0, cid = 0, rw = 0, fed = 0; struct amdgpu_task_info task_info; struct amdgpu_vmhub *hub; const char *mmhub_cid; @@ -663,6 +663,14 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, status = RREG32(hub->vm_l2_pro_fault_status); cid = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, CID); rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW); + fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); + + /* for gfx fed error, kfd will handle it, return directly */ + if (fed && amdgpu_ras_is_poison_mode_supported(adev) && + amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2) && + !strcmp(hub_name, "gfxhub0")) + return 1; + WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); #ifdef HAVE_STRUCT_XARRAY amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub); -- 2.34.1
[PATCH 3/5] drm/amdgpu: retire gfx ras query_utcl2_poison_status
Replace it with related interface in gfxhub functions. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 7 --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h| 1 - drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c| 12 4 files changed, 6 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index f759a42def59..bb509b26112d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -776,10 +776,11 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, return 0; } -bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev) +bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev, + uint32_t node_id) { - if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status) - return adev->gfx.ras->query_utcl2_poison_status(adev); + if (adev->gfxhub.funcs->query_utcl2_poison_status) + return adev->gfxhub.funcs->query_utcl2_poison_status(adev, node_id); else return false; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index b2990470d1c6..ae1e449e5479 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -404,7 +404,8 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); void amdgpu_amdkfd_block_mmu_notifications(void *p); int amdgpu_amdkfd_criu_resume(void *p); -bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev); +bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev, + uint32_t node_id); int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, uint64_t size, u32 alloc_flag, int8_t xcp_id); void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 2af2e28952db..d91f83bd7267 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -268,7 +268,6 @@ struct amdgpu_cu_info { struct amdgpu_gfx_ras { struct amdgpu_ras_block_object ras_block; void (*enable_watchdog_timer)(struct amdgpu_device *adev); - bool (*query_utcl2_poison_status)(struct amdgpu_device *adev); int (*rlc_gc_fed_irq)(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c index 0d5b4133fdf7..e3ed568eaacc 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c @@ -1909,18 +1909,7 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev) mutex_unlock(&adev->grbm_idx_mutex); } -static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device *adev) -{ - u32 status = 0; - struct amdgpu_vmhub *hub; - - hub = &adev->vmhub[AMDGPU_GFXHUB(0)]; - status = RREG32(hub->vm_l2_pro_fault_status); - /* reset page fault status */ - WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); - return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); -} struct amdgpu_ras_block_hw_ops gfx_v9_4_2_ras_ops = { .query_ras_error_count = &gfx_v9_4_2_query_ras_error_count, @@ -1934,5 +1923,4 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = { .hw_ops = &gfx_v9_4_2_ras_ops, }, .enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer, - .query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status, }; -- 2.34.1
[PATCH 2/5] drm/amdgpu: add utcl2 poison query for gfxhub
Implement it for gfxhub 1.0 and 1.2. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h | 2 ++ drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c | 17 + drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c | 15 +++ 3 files changed, 34 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h index c7b44aeb671b..12b131a9cc42 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h @@ -38,6 +38,8 @@ struct amdgpu_gfxhub_funcs { void (*mode2_save_regs)(struct amdgpu_device *adev); void (*mode2_restore_regs)(struct amdgpu_device *adev); void (*halt)(struct amdgpu_device *adev); + bool (*query_utcl2_poison_status)(struct amdgpu_device *adev, + uint32_t node_id); }; struct amdgpu_gfxhub { diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c index 22175da0e16a..1c14b1665c9f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c @@ -443,6 +443,22 @@ static void gfxhub_v1_0_init(struct amdgpu_device *adev) mmVM_INVALIDATE_ENG0_ADDR_RANGE_LO32; } +static bool gfxhub_v1_0_query_utcl2_poison_status(struct amdgpu_device *adev, + uint32_t node_id) +{ + u32 status = 0; + struct amdgpu_vmhub *hub; + + if (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(9, 4, 2)) + return false; + + hub = &adev->vmhub[AMDGPU_GFXHUB(0)]; + status = RREG32(hub->vm_l2_pro_fault_status); + /* reset page fault status */ + WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); + + return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); +} const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = { .get_mc_fb_offset = gfxhub_v1_0_get_mc_fb_offset, @@ -452,4 +468,5 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = { .set_fault_enable_default = gfxhub_v1_0_set_fault_enable_default, .init = gfxhub_v1_0_init, .get_xgmi_info = gfxhub_v1_1_get_xgmi_info, + .query_utcl2_poison_status = gfxhub_v1_0_query_utcl2_poison_status, }; diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c index 49aecdcee006..ebc96739e1c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c @@ -620,6 +620,20 @@ static int gfxhub_v1_2_get_xgmi_info(struct amdgpu_device *adev) return 0; } +static bool gfxhub_v1_2_query_utcl2_poison_status(struct amdgpu_device *adev, + uint32_t node_id) +{ + u32 fed, status; + + status = RREG32_SOC15(GC, node_id, regVM_L2_PROTECTION_FAULT_STATUS); + fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); + /* reset page fault status */ + WREG32_P(SOC15_REG_OFFSET(GC, node_id, + regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1); + + return fed; +} + const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = { .get_mc_fb_offset = gfxhub_v1_2_get_mc_fb_offset, .setup_vm_pt_regs = gfxhub_v1_2_setup_vm_pt_regs, @@ -628,6 +642,7 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = { .set_fault_enable_default = gfxhub_v1_2_set_fault_enable_default, .init = gfxhub_v1_2_init, .get_xgmi_info = gfxhub_v1_2_get_xgmi_info, + .query_utcl2_poison_status = gfxhub_v1_2_query_utcl2_poison_status, }; static int gfxhub_v1_2_xcp_resume(void *handle, uint32_t inst_mask) -- 2.34.1
[PATCH 1/5] drm/amdgpu: add new bit definitions for GC 9.0 PROTECTION_FAULT_STATUS
Add UCE and FED bit definitions. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h | 4 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h index efc16ddf274a..2dfa0e5b1aa3 100644 --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h @@ -6822,6 +6822,8 @@ #define VM_L2_PROTECTION_FAULT_STATUS__VMID__SHIFT 0x14 #define VM_L2_PROTECTION_FAULT_STATUS__VF__SHIFT 0x18 #define VM_L2_PROTECTION_FAULT_STATUS__VFID__SHIFT 0x19 +#define VM_L2_PROTECTION_FAULT_STATUS__UCE__SHIFT 0x1d +#define VM_L2_PROTECTION_FAULT_STATUS__FED__SHIFT 0x1e #define VM_L2_PROTECTION_FAULT_STATUS__MORE_FAULTS_MASK 0x0001L #define VM_L2_PROTECTION_FAULT_STATUS__WALKER_ERROR_MASK 0x000EL #define VM_L2_PROTECTION_FAULT_STATUS__PERMISSION_FAULTS_MASK 0x00F0L @@ -6832,6 +6834,8 @@ #define VM_L2_PROTECTION_FAULT_STATUS__VMID_MASK 0x00F0L #define VM_L2_PROTECTION_FAULT_STATUS__VF_MASK 0x0100L #define VM_L2_PROTECTION_FAULT_STATUS__VFID_MASK 0x1E00L +#define VM_L2_PROTECTION_FAULT_STATUS__UCE_MASK 0x2000L +#define VM_L2_PROTECTION_FAULT_STATUS__FED_MASK 0x4000L //VM_L2_PROTECTION_FAULT_ADDR_LO32 #define VM_L2_PROTECTION_FAULT_ADDR_LO32__LOGICAL_PAGE_ADDR_LO32__SHIFT 0x0 #define VM_L2_PROTECTION_FAULT_ADDR_LO32__LOGICAL_PAGE_ADDR_LO32_MASK 0xL -- 2.34.1
[PATCH] drm/amdgpu: add UTCL2 RAS poison query for gfx 9.4.3
Add help function to query and reset RAS UTCL2 poison status. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index aace4594a603..de04006f8db1 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -4329,10 +4329,24 @@ static int gfx_v9_4_3_ras_late_init(struct amdgpu_device *adev, struct ras_commo return r; } +static bool gfx_v9_4_3_query_uctl2_poison_status(struct amdgpu_device *adev) +{ + u32 status = 0; + struct amdgpu_vmhub *hub; + + hub = &adev->vmhub[AMDGPU_GFXHUB(0)]; + status = RREG32(hub->vm_l2_pro_fault_status); + /* reset page fault status */ + WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); + + return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); +} + struct amdgpu_gfx_ras gfx_v9_4_3_ras = { .ras_block = { .hw_ops = &gfx_v9_4_3_ras_ops, .ras_late_init = &gfx_v9_4_3_ras_late_init, }, .enable_watchdog_timer = &gfx_v9_4_3_enable_watchdog_timer, + .query_utcl2_poison_status = &gfx_v9_4_3_query_uctl2_poison_status, }; -- 2.34.1
[PATCH 2/2] use PSP address query command
Get UMC physical address from PSP in RAS error address coversion. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 46 ++ 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 836a4cc1134e..14ef7a24be7b 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -203,14 +203,14 @@ static bool umc_v12_0_bit_wise_xor(uint32_t val) return result; } -static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, - struct ras_err_data *err_data, uint64_t err_addr, - uint32_t ch_inst, uint32_t umc_inst, - uint32_t node_inst) +static void umc_v12_0_mca_addr_to_pa(struct amdgpu_device *adev, + uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst, + uint32_t node_inst, + struct ta_ras_query_address_output *addr_out) { uint32_t channel_index, i; - uint64_t soc_pa, na, retired_page, column; - uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row, row_xor; + uint64_t na, soc_pa; + uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row; uint32_t bank0, bank1, bank2, bank3, bank; bank_hash0 = (err_addr >> UMC_V12_0_MCA_B0_BIT) & 0x1ULL; @@ -260,12 +260,44 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, /* the umc channel bits are not original values, they are hashed */ UMC_V12_0_SET_CHANNEL_HASH(channel_index, soc_pa); + addr_out->pa.pa = soc_pa; + addr_out->pa.bank = bank; + addr_out->pa.channel_idx = channel_index; +} + +static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, + struct ras_err_data *err_data, uint64_t err_addr, + uint32_t ch_inst, uint32_t umc_inst, + uint32_t node_inst) +{ + uint32_t col, row, row_xor, bank, channel_index; + uint64_t soc_pa, retired_page, column; + struct ta_ras_query_address_input addr_in; + struct ta_ras_query_address_output addr_out; + + addr_in.addr_type = TA_RAS_MCA_TO_PA; + addr_in.ma.err_addr = err_addr; + addr_in.ma.ch_inst = ch_inst; + addr_in.ma.umc_inst = umc_inst; + addr_in.ma.node_inst = node_inst; + + if (psp_ras_query_address(&adev->psp, &addr_in, &addr_out)) + /* fallback to old path if fail to get pa from psp */ + umc_v12_0_mca_addr_to_pa(adev, err_addr, ch_inst, umc_inst, + node_inst, &addr_out); + + soc_pa = addr_out.pa.pa; + bank = addr_out.pa.bank; + channel_index = addr_out.pa.channel_idx; + + col = (err_addr >> 1) & 0x1fULL; + row = (err_addr >> 10) & 0x3fffULL; + row_xor = row ^ (0x1ULL << 13); /* clear [C3 C2] in soc physical address */ soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT); /* clear [C4] in soc physical address */ soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT); - row_xor = row ^ (0x1ULL << 13); /* loop for all possibilities of [C4 C3 C2] */ for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) { retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT); -- 2.34.1
[PATCH 1/2] add PSP RAS address query command
Convert mca address to physical address or vice versa via RAS TA. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 25 + drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 3 +++ drivers/gpu/drm/amd/amdgpu/ta_ras_if.h | 36 + 3 files changed, 64 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 9eff8753f9b9..bb2d419fe914 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1782,6 +1782,31 @@ int psp_ras_trigger_error(struct psp_context *psp, return 0; } + +int psp_ras_query_address(struct psp_context *psp, + struct ta_ras_query_address_input *addr_in, + struct ta_ras_query_address_output *addr_out) +{ + struct ta_ras_shared_memory *ras_cmd; + int ret; + + if (!psp->ras_context.context.initialized) + return -EINVAL; + + ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf; + memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory)); + + ras_cmd->cmd_id = TA_RAS_COMMAND__QUERY_ADDRESS; + ras_cmd->ras_in_message.address = *addr_in; + + ret = psp_ras_invoke(psp, ras_cmd->cmd_id); + if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status) + return -EINVAL; + + *addr_out = ras_cmd->ras_out_message.address; + + return 0; +} // ras end // HDCP start diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 652b0a01854a..9951bdd022de 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -502,6 +502,9 @@ int psp_ras_enable_features(struct psp_context *psp, int psp_ras_trigger_error(struct psp_context *psp, struct ta_ras_trigger_error_input *info, uint32_t instance_mask); int psp_ras_terminate(struct psp_context *psp); +int psp_ras_query_address(struct psp_context *psp, + struct ta_ras_query_address_input *addr_in, + struct ta_ras_query_address_output *addr_out); int psp_hdcp_invoke(struct psp_context *psp, uint32_t ta_cmd_id); int psp_dtm_invoke(struct psp_context *psp, uint32_t ta_cmd_id); diff --git a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h index 879bb7af297c..056d4df8fa1f 100644 --- a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h +++ b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h @@ -36,6 +36,9 @@ enum ras_command { TA_RAS_COMMAND__ENABLE_FEATURES = 0, TA_RAS_COMMAND__DISABLE_FEATURES, TA_RAS_COMMAND__TRIGGER_ERROR, + TA_RAS_COMMAND__QUERY_BLOCK_INFO, + TA_RAS_COMMAND__QUERY_SUB_BLOCK_INFO, + TA_RAS_COMMAND__QUERY_ADDRESS, }; enum ta_ras_status { @@ -105,6 +108,11 @@ enum ta_ras_error_type { TA_RAS_ERROR__POISON= 8, }; +enum ta_ras_address_type { + TA_RAS_MCA_TO_PA, + TA_RAS_PA_TO_MCA, +}; + /* Input/output structures for RAS commands */ /**/ @@ -133,12 +141,38 @@ struct ta_ras_init_flags { uint8_t channel_dis_num; }; +struct ta_ras_mca_addr { + uint64_t err_addr; + uint32_t ch_inst; + uint32_t umc_inst; + uint32_t node_inst; +}; + +struct ta_ras_phy_addr { + uint64_t pa; + uint32_t bank; + uint32_t channel_idx; +}; + +struct ta_ras_query_address_input { + enum ta_ras_address_type addr_type; + struct ta_ras_mca_addr ma; + struct ta_ras_phy_addr pa; +}; + struct ta_ras_output_flags { uint8_t ras_init_success_flag; uint8_t err_inject_switch_disable_flag; uint8_t reg_access_failure_flag; }; +struct ta_ras_query_address_output { + /* don't use the flags here */ + struct ta_ras_output_flags flags; + struct ta_ras_mca_addr ma; + struct ta_ras_phy_addr pa; +}; + /* Common input structure for RAS callbacks */ /**/ union ta_ras_cmd_input { @@ -146,12 +180,14 @@ union ta_ras_cmd_input { struct ta_ras_enable_features_input enable_features; struct ta_ras_disable_features_inputdisable_features; struct ta_ras_trigger_error_input trigger_error; + struct ta_ras_query_address_input address; uint32_t reserve_pad[256]; }; union ta_ras_cmd_output { struct ta_ras_output_flags flags; + struct ta_ras_query_address_output address; uint32_t reserve_pad[256]; }; -- 2.34.1
[PATCH] drm/amdgpu: disable ras feature when fini
Send ras disable feature command in fini. Signed-off-by: Tao Zhou Change-Id: I95f1d1e0a46fb613631e5cd77497e64c0551c4c7 --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index a249f24ed038..a9fa2d134670 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3437,7 +3437,7 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not cleared"); if (AMDGPU_RAS_GET_FEATURES(con->features)) - amdgpu_ras_disable_all_features(adev, 1); + amdgpu_ras_disable_all_features(adev, 0); cancel_delayed_work_sync(&con->ras_counte_delay_work); -- 2.34.1
[PATCH 2/2] update check condition of query for ras page retire
Support page retirement handling in debug mode. v2: revert smu_v13_0_6_get_ecc_info directly. Signed-off-by: Tao Zhou Change-Id: I0aaa807d7fe87b3da0f023c380e57ab6dd446fcf --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 9d1cf41cf483..d8d263956e85 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -93,11 +93,14 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); int ret = 0; + unsigned int error_query_mode; unsigned long err_count; kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + amdgpu_ras_get_error_query_mode(adev, &error_query_mode); ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc)); - if (ret == -EOPNOTSUPP) { + if (ret == -EOPNOTSUPP && + error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) { if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && adev->umc.ras->ras_block.hw_ops->query_ras_error_count) adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status); @@ -121,7 +124,8 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, */ adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status); } - } else if (!ret) { + } else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY || + (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) { if (adev->umc.ras && adev->umc.ras->ecc_info_query_ras_error_count) adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status); -- 2.34.1
[PATCH 1/2] Revert "drm/amd/pm: smu v13_0_6 supports ecc info by default"
This reverts commit affdce050ab4119a3cdf74d7faa8f1eb30f6f6aa. We use debug mode flag instead of this interface. Signed-off-by: Tao Zhou Change-Id: I49eae821ce352d542143d68c05802634b4bf469d --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 8 1 file changed, 8 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 952a983da49a..6d8fdf8c538c 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -3034,13 +3034,6 @@ static int smu_v13_0_6_select_xgmi_plpd_policy(struct smu_context *smu, return ret; } -static ssize_t smu_v13_0_6_get_ecc_info(struct smu_context *smu, - void *table) -{ - /* Support ecc info by default */ - return 0; -} - static const struct pptable_funcs smu_v13_0_6_ppt_funcs = { /* init dpm */ .get_allowed_feature_mask = smu_v13_0_6_get_allowed_feature_mask, @@ -3095,7 +3088,6 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = { .i2c_init = smu_v13_0_6_i2c_control_init, .i2c_fini = smu_v13_0_6_i2c_control_fini, .send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num, - .get_ecc_info = smu_v13_0_6_get_ecc_info, }; void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu) -- 2.34.1
[PATCH 2/2] update check condition of query for ras page retire
Support page retirement handling in debug mode. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 9 +++-- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 41139bac7643..6df32f0afd89 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -90,12 +90,16 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, { struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + unsigned int error_query_mode; int ret = 0; + amdgpu_ras_get_error_query_mode(adev, &error_query_mode); + mutex_lock(&con->page_retirement_lock); ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc)); - if (ret == -EOPNOTSUPP) { + if (ret == -EOPNOTSUPP && + error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) { if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && adev->umc.ras->ras_block.hw_ops->query_ras_error_count) adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status); @@ -119,7 +123,8 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, */ adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status); } - } else if (!ret) { + } else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY || + (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) { if (adev->umc.ras && adev->umc.ras->ecc_info_query_ras_error_count) adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index c560f4af214d..d86c9e7fc64b 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2909,8 +2909,8 @@ static int smu_v13_0_6_select_xgmi_plpd_policy(struct smu_context *smu, static ssize_t smu_v13_0_6_get_ecc_info(struct smu_context *smu, void *table) { - /* Support ecc info by default */ - return 0; + /* we use debug mode flag instead of this interface */ + return -EOPNOTSUPP; } static const struct pptable_funcs smu_v13_0_6_ppt_funcs = { -- 2.35.1
[PATCH 1/2] update error condition check for umc_v12_0_query_error_address
Deferred error is also taken into account. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 10edf818acf5..2e0bd4312f2c 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -337,10 +337,7 @@ static int umc_v12_0_query_error_address(struct amdgpu_device *adev, } /* calculate error address if ue error is detected */ - if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && - REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 1 && - REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1) { - + if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)) { mc_umc_addrt0 = SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0); -- 2.35.1
[PATCH] drm/amdgpu: Don't warn for unsupported set_xgmi_plpd_mode
set_xgmi_plpd_mode may be unsupported and this isn't error, no need to print warning for it. v2: add ret2 to save the status of psp_ras_trigger_error. Suggested-by: lijo.la...@amd.com Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 14 -- 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 0533f873001b..a5a72e5aae94 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -1131,28 +1131,30 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *inject_if, uint32_t instance_mask) { - int ret = 0; + int ret1, ret2; struct ta_ras_trigger_error_input *block_info = (struct ta_ras_trigger_error_input *)inject_if; if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) dev_warn(adev->dev, "Failed to disallow df cstate"); - if (amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DISALLOW)) + ret1 = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DISALLOW); + if (ret1 && ret1 != -EOPNOTSUPP) dev_warn(adev->dev, "Failed to disallow XGMI power down"); - ret = psp_ras_trigger_error(&adev->psp, block_info, instance_mask); + ret2 = psp_ras_trigger_error(&adev->psp, block_info, instance_mask); if (amdgpu_ras_intr_triggered()) - return ret; + return ret2; - if (amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DEFAULT)) + ret1 = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DEFAULT); + if (ret1 && ret1 != -EOPNOTSUPP) dev_warn(adev->dev, "Failed to allow XGMI power down"); if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) dev_warn(adev->dev, "Failed to allow df cstate"); - return ret; + return ret2; } struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = { -- 2.35.1
[PATCH] drm/amdgpu: check recovery status of xgmi hive in ras_reset_error_count
Handle xgmi hive case. Suggested-by: Hawking Zhang Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 753260745554..0093c28f4343 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1226,6 +1226,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + struct amdgpu_hive_info *hive; + int hive_ras_recovery = 0; if (!block_obj || !block_obj->hw_ops) { dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", @@ -1237,8 +1239,15 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, !amdgpu_ras_get_mca_debug_mode(adev)) return -EOPNOTSUPP; + hive = amdgpu_get_xgmi_hive(adev); + if (hive) { + hive_ras_recovery = atomic_read(&hive->ras_recovery); + amdgpu_put_xgmi_hive(hive); + } + /* skip ras error reset in gpu reset */ - if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery)) && + if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) || + hive_ras_recovery) && mca_funcs && mca_funcs->mca_set_debug_mode) return -EOPNOTSUPP; -- 2.35.1
[PATCH] drm/amdgpu: handle extra UE register entries for gfx v9_4_3
The UE registe list is larger than CE list. Reported-by: yipeng.c...@amd.com Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 38 + 1 file changed, 38 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 41bbabd9ad4d..046ae95b366a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -3799,6 +3799,27 @@ static void gfx_v9_4_3_inst_query_ras_err_count(struct amdgpu_device *adev, } } + /* handle extra register entries of UE */ + for (; i < ARRAY_SIZE(gfx_v9_4_3_ue_reg_list); i++) { + for (j = 0; j < gfx_v9_4_3_ue_reg_list[i].se_num; j++) { + for (k = 0; k < gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst; k++) { + /* no need to select if instance number is 1 */ + if (gfx_v9_4_3_ue_reg_list[i].se_num > 1 || + gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst > 1) + gfx_v9_4_3_xcc_select_se_sh(adev, j, 0, k, xcc_id); + + amdgpu_ras_inst_query_ras_error_count(adev, + &(gfx_v9_4_3_ue_reg_list[i].reg_entry), + 1, + gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_type].mem_id_ent, + gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_type].size, + GET_INST(GC, xcc_id), + AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, + &ue_count); + } + } + } + gfx_v9_4_3_xcc_select_se_sh(adev, 0x, 0x, 0x, xcc_id); mutex_unlock(&adev->grbm_idx_mutex); @@ -3838,6 +3859,23 @@ static void gfx_v9_4_3_inst_reset_ras_err_count(struct amdgpu_device *adev, } } + /* handle extra register entries of UE */ + for (; i < ARRAY_SIZE(gfx_v9_4_3_ue_reg_list); i++) { + for (j = 0; j < gfx_v9_4_3_ue_reg_list[i].se_num; j++) { + for (k = 0; k < gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst; k++) { + /* no need to select if instance number is 1 */ + if (gfx_v9_4_3_ue_reg_list[i].se_num > 1 || + gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst > 1) + gfx_v9_4_3_xcc_select_se_sh(adev, j, 0, k, xcc_id); + + amdgpu_ras_inst_reset_ras_error_count(adev, + &(gfx_v9_4_3_ue_reg_list[i].reg_entry), + 1, + GET_INST(GC, xcc_id)); + } + } + } + gfx_v9_4_3_xcc_select_se_sh(adev, 0x, 0x, 0x, xcc_id); mutex_unlock(&adev->grbm_idx_mutex); -- 2.35.1
[PATCH] drm/amdgpu: Don't warn for unsupported set_xgmi_plpd_mode
set_xgmi_plpd_mode may be unsupported and this isn't error, no need to print warning for it. Suggested-by: lijo.la...@amd.com Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 0533f873001b..c9b09bddbcdc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -1138,7 +1138,8 @@ static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) dev_warn(adev->dev, "Failed to disallow df cstate"); - if (amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DISALLOW)) + ret = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DISALLOW); + if (ret && ret != -EOPNOTSUPP) dev_warn(adev->dev, "Failed to disallow XGMI power down"); ret = psp_ras_trigger_error(&adev->psp, block_info, instance_mask); @@ -1146,7 +1147,8 @@ static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, if (amdgpu_ras_intr_triggered()) return ret; - if (amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DEFAULT)) + ret = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DEFAULT); + if (ret && ret != -EOPNOTSUPP) dev_warn(adev->dev, "Failed to allow XGMI power down"); if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) -- 2.35.1
[PATCH 2/2] drm/amdgpu: add RAS reset/query operations for XGMI v6_4
Reset/query RAS error status and count. v2: use XGMI IP version instead of WAFL version. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 46 ++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 2b7dc490ba6b..0533f873001b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -103,6 +103,16 @@ static const int walf_pcs_err_noncorrectable_mask_reg_aldebaran[] = { smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK + 0x10 }; +static const int xgmi3x16_pcs_err_status_reg_v6_4[] = { + smnPCS_XGMI3X16_PCS_ERROR_STATUS, + smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x10 +}; + +static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[] = { + smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK, + smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x10 +}; + static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = { {"XGMI PCS DataLossErr", SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)}, @@ -958,6 +968,16 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev) default: break; } + + switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { + case IP_VERSION(6, 4, 0): + for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++) + pcs_clear_status(adev, + xgmi3x16_pcs_err_status_reg_v6_4[i]); + break; + default: + break; + } } static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, @@ -975,7 +995,9 @@ static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, if (is_xgmi_pcs) { if (amdgpu_ip_version(adev, XGMI_HWIP, 0) == - IP_VERSION(6, 1, 0)) { + IP_VERSION(6, 1, 0) || + amdgpu_ip_version(adev, XGMI_HWIP, 0) == + IP_VERSION(6, 4, 0)) { pcs_ras_fields = &xgmi3x16_pcs_ras_fields[0]; field_array_size = ARRAY_SIZE(xgmi3x16_pcs_ras_fields); } else { @@ -1013,7 +1035,7 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status) { struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - int i; + int i, supported = 1; uint32_t data, mask_data = 0; uint32_t ue_cnt = 0, ce_cnt = 0; @@ -1077,7 +1099,25 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, } break; default: - dev_warn(adev->dev, "XGMI RAS error query not supported"); + supported = 0; + break; + } + + switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { + case IP_VERSION(6, 4, 0): + /* check xgmi3x16 pcs error */ + for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++) { + data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_v6_4[i]); + mask_data = + RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[i]); + if (data) + amdgpu_xgmi_query_pcs_error_status(adev, data, + mask_data, &ue_cnt, &ce_cnt, true, true); + } + break; + default: + if (!supported) + dev_warn(adev->dev, "XGMI RAS error query not supported"); break; } -- 2.35.1
[PATCH 1/2] drm/amdgpu: set XGMI IP version manually for v6_4
The version can't be queried from discovery table. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index 0b711bac2092..d22f22d706e0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -2464,6 +2464,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev) if (amdgpu_ip_version(adev, XGMI_HWIP, 0) == IP_VERSION(4, 8, 0)) adev->gmc.xgmi.supported = true; + if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) + adev->ip_versions[XGMI_HWIP][0] = IP_VERSION(6, 4, 0); + /* set NBIO version */ switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { case IP_VERSION(6, 1, 0): -- 2.35.1
[PATCH] drm/amdgpu: use mode-2 reset for RAS poison consumption
Switch from mode-1 reset to mode-2 for poison consumption. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index f74347cc087a..d65e21914d8c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -166,8 +166,12 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, } } - if (reset) + if (reset) { + /* use mode-2 reset for poison consumption */ + if (!entry) + con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; amdgpu_ras_reset_gpu(adev); + } } kfree(err_data->err_addr); -- 2.35.1
[PATCH] drm/amdgpu: check RAS supported first in ras_reset_error_count
Not all platforms support RAS. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c71321edf50b..a6cff4a31c54 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1233,15 +1233,15 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, return -EOPNOTSUPP; } + if (!amdgpu_ras_is_supported(adev, block) || + !amdgpu_ras_get_mca_debug_mode(adev)) + return -EOPNOTSUPP; + /* skip ras error reset in gpu reset */ if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery)) && mca_funcs && mca_funcs->mca_set_debug_mode) return -EOPNOTSUPP; - if (!amdgpu_ras_is_supported(adev, block) || - !amdgpu_ras_get_mca_debug_mode(adev)) - return -EOPNOTSUPP; - if (block_obj->hw_ops->reset_ras_error_count) block_obj->hw_ops->reset_ras_error_count(adev); -- 2.35.1
[PATCH] drm/amdgpu: get RAS poison status from DF v4_6_2
Add DF block and RAS poison mode query for DF v4_6_2. Signed-off-by: Tao Zhou Reviewed-by: Stanley.Yang --- drivers/gpu/drm/amd/amdgpu/Makefile | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 4 +++ drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c| 34 +++ drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h| 31 + 4 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c create mode 100644 drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index ec1daf7112a9..260e32ef7bae 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -104,7 +104,8 @@ amdgpu-y += \ amdgpu-y += \ df_v1_7.o \ df_v3_6.o \ - df_v4_3.o + df_v4_3.o \ + df_v4_6_2.o # add GMC block amdgpu-y += \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index 17d4311e22d5..8d3681172cea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -35,6 +35,7 @@ #include "df_v1_7.h" #include "df_v3_6.h" #include "df_v4_3.h" +#include "df_v4_6_2.h" #include "nbio_v6_1.h" #include "nbio_v7_0.h" #include "nbio_v7_4.h" @@ -2557,6 +2558,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev) case IP_VERSION(4, 3, 0): adev->df.funcs = &df_v4_3_funcs; break; + case IP_VERSION(4, 6, 2): + adev->df.funcs = &df_v4_6_2_funcs; + break; default: break; } diff --git a/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c b/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c new file mode 100644 index ..a47960a0babd --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c @@ -0,0 +1,34 @@ +/* + * Copyright 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "amdgpu.h" +#include "df_v4_6_2.h" + +static bool df_v4_6_2_query_ras_poison_mode(struct amdgpu_device *adev) +{ + /* return true since related regs are inaccessible */ + return true; +} + +const struct amdgpu_df_funcs df_v4_6_2_funcs = { + .query_ras_poison_mode = df_v4_6_2_query_ras_poison_mode, +}; diff --git a/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h b/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h new file mode 100644 index ..3bc3e6d216e2 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h @@ -0,0 +1,31 @@ +/* + * Copyright 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __DF_V4_6_2_H__ +#define __DF_V4_6_2_H__ + +#include "soc15_common.h" + +extern const struct amdgpu_df_funcs df_v4_6_2_funcs; + +#endif -- 2.35.1
[PATCH] drm/amdgpu: enable RAS poison mode for APU
Enable it by default on APU platform. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 95c181cd1fea..a41cab0a2f9c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2710,7 +2710,8 @@ static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) return; /* Init poison supported flag, the default value is false */ - if (adev->gmc.xgmi.connected_to_cpu) { + if (adev->gmc.xgmi.connected_to_cpu || + adev->gmc.is_app_apu) { /* enabled by default when GPU is connected to CPU */ con->poison_supported = true; } else if (adev->df.funcs && -- 2.35.1
[PATCH 6/6] drm/amdgpu: drop status query/reset for GCEA 9.4.3 and MMEA 1.8
PMFW will be responsible for them. v2: remove query interfaces. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 60 -- drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 143 2 files changed, 203 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index a1c2c952d882..362bf51ab1d2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -3754,10 +3754,6 @@ static const struct amdgpu_gfx_ras_reg_entry gfx_v9_4_3_ue_reg_list[] = { AMDGPU_GFX_LDS_MEM, 4}, }; -static const struct soc15_reg_entry gfx_v9_4_3_ea_err_status_regs = { - SOC15_REG_ENTRY(GC, 0, regGCEA_ERR_STATUS), 0, 1, 16 -}; - static void gfx_v9_4_3_inst_query_ras_err_count(struct amdgpu_device *adev, void *ras_error_status, int xcc_id) { @@ -3846,39 +3842,6 @@ static void gfx_v9_4_3_inst_reset_ras_err_count(struct amdgpu_device *adev, mutex_unlock(&adev->grbm_idx_mutex); } -static void gfx_v9_4_3_inst_query_ea_err_status(struct amdgpu_device *adev, - int xcc_id) -{ - uint32_t i, j; - uint32_t reg_value; - - mutex_lock(&adev->grbm_idx_mutex); - - for (i = 0; i < gfx_v9_4_3_ea_err_status_regs.se_num; i++) { - for (j = 0; j < gfx_v9_4_3_ea_err_status_regs.instance; j++) { - gfx_v9_4_3_xcc_select_se_sh(adev, i, 0, j, xcc_id); - reg_value = RREG32_SOC15(GC, GET_INST(GC, xcc_id), - regGCEA_ERR_STATUS); - if (REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_STATUS) || - REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_WRRSP_STATUS) || - REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) { - dev_warn(adev->dev, - "GCEA err detected at instance: %d, status: 0x%x!\n", - j, reg_value); - } - /* clear after read */ - reg_value = REG_SET_FIELD(reg_value, GCEA_ERR_STATUS, - CLEAR_ERROR_STATUS, 0x1); - WREG32_SOC15(GC, GET_INST(GC, xcc_id), regGCEA_ERR_STATUS, - reg_value); - } - } - - gfx_v9_4_3_xcc_select_se_sh(adev, 0x, 0x, 0x, - xcc_id); - mutex_unlock(&adev->grbm_idx_mutex); -} - static void gfx_v9_4_3_inst_query_utc_err_status(struct amdgpu_device *adev, int xcc_id) { @@ -3983,7 +3946,6 @@ static void gfx_v9_4_3_inst_query_sq_timeout_status(struct amdgpu_device *adev, static void gfx_v9_4_3_inst_query_ras_err_status(struct amdgpu_device *adev, void *ras_error_status, int xcc_id) { - gfx_v9_4_3_inst_query_ea_err_status(adev, xcc_id); gfx_v9_4_3_inst_query_utc_err_status(adev, xcc_id); gfx_v9_4_3_inst_query_sq_timeout_status(adev, xcc_id); } @@ -3996,27 +3958,6 @@ static void gfx_v9_4_3_inst_reset_utc_err_status(struct amdgpu_device *adev, WREG32_SOC15(GC, GET_INST(GC, xcc_id), regVML2_WALKER_MEM_ECC_STATUS, 0x3); } -static void gfx_v9_4_3_inst_reset_ea_err_status(struct amdgpu_device *adev, - int xcc_id) -{ - uint32_t i, j; - uint32_t value; - - mutex_lock(&adev->grbm_idx_mutex); - for (i = 0; i < gfx_v9_4_3_ea_err_status_regs.se_num; i++) { - for (j = 0; j < gfx_v9_4_3_ea_err_status_regs.instance; j++) { - gfx_v9_4_3_xcc_select_se_sh(adev, i, 0, j, xcc_id); - value = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regGCEA_ERR_STATUS); - value = REG_SET_FIELD(value, GCEA_ERR_STATUS, - CLEAR_ERROR_STATUS, 0x1); - WREG32_SOC15(GC, GET_INST(GC, xcc_id), regGCEA_ERR_STATUS, value); - } - } - gfx_v9_4_3_xcc_select_se_sh(adev, 0x, 0x, 0x, - xcc_id); - mutex_unlock(&adev->grbm_idx_mutex); -} - static void gfx_v9_4_3_inst_reset_sq_timeout_status(struct amdgpu_device *adev, int xcc_id) { @@ -4042,7 +3983,6 @@ static void gfx_v9_4_3_inst_reset_ras_err_status(struct amdgpu_device *adev, void *ras_error_status, int xcc_id) { gfx_v9_4_3_inst_reset_utc_err_status(adev, xcc_id); - gfx_v9_4_3_inst_reset_ea_err_status(adev, xcc_id); gfx_v9_4_3_inst_reset_sq_timeout_status(adev, xcc_id);
[PATCH 4/6] drm/amd/pm: record mca debug mode in RAS
Call amdgpu_ras_set_mca_debug_mode when we set mca debug mode in smu v13_0_6. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 74fc945a8f9b..c5c1f479b925 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -1475,6 +1475,7 @@ static int smu_v13_0_6_mca_set_debug_mode(struct smu_context *smu, bool enable) if (smu->smc_fw_version < 0x554800) return 0; + amdgpu_ras_set_mca_debug_mode(smu->adev, enable); return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead, enable ? 0 : ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK, NULL); -- 2.35.1
[PATCH 5/6] drm/amdgpu: bypass RAS error reset in some conditions
PMFW is responsible for RAS error reset in some conditions, driver can skip the operation. v2: add check for ras->in_recovery, it's set earlier than amdgpu_in_reset. v3: fix error in gpu reset check. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0eb3dbd9d548..95c181cd1fea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1178,6 +1178,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block block) { struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; if (!block_obj || !block_obj->hw_ops) { dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", @@ -1185,7 +1187,13 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, return -EOPNOTSUPP; } - if (!amdgpu_ras_is_supported(adev, block)) + /* skip ras error reset in gpu reset */ + if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery)) && + mca_funcs && mca_funcs->mca_set_debug_mode) + return -EOPNOTSUPP; + + if (!amdgpu_ras_is_supported(adev, block) || + !amdgpu_ras_get_mca_debug_mode(adev)) return -EOPNOTSUPP; if (block_obj->hw_ops->reset_ras_error_count) -- 2.35.1
[PATCH 3/6] drm/amdgpu: add set/get mca debug mode operations
Record the debug mode status in RAS. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 21 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 + 2 files changed, 26 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 335f5d8bc20b..0eb3dbd9d548 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3323,6 +3323,27 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) return 0; } +void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + if (con) + con->is_mca_debug_mode = enable; +} + +bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + + if (!con) + return false; + + if (mca_funcs && mca_funcs->mca_set_debug_mode) + return con->is_mca_debug_mode; + else + return true; +} /* Register each ip ras block into amdgpu ras */ int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 3f9ac0ab67e6..2fdfef62ee27 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -434,6 +434,8 @@ struct amdgpu_ras { /* Indicates smu whether need update bad channel info */ bool update_channel_flag; + /* Record status of smu mca debug mode */ + bool is_mca_debug_mode; /* Record special requirements of gpu reset caller */ uint32_t gpu_reset_flags; @@ -768,6 +770,9 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev); int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con); +void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable); +bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev); + int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object *ras_block_obj); void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev); -- 2.35.1
[PATCH 2/6] drm/amdgpu: replace reset_error_count with amdgpu_ras_reset_error_count
Simplify the code. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 9 ++--- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 7 ++- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 7 ++- 5 files changed, 10 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 31f8c3ead161..04cfd67a37a0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3594,9 +3594,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (adev->asic_reset_res) goto fail; - if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && - adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) - adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); } else { task_barrier_full(&hive->tb); @@ -5242,9 +5240,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, if (!r && amdgpu_ras_intr_triggered()) { list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && - tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) - tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); + amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); } amdgpu_ras_intr_cleared(); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 70e38b013309..2b7dc490ba6b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -914,7 +914,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_comm adev->gmc.xgmi.num_physical_nodes == 0) return 0; - adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL); return amdgpu_ras_block_late_init(adev, ras_block); } @@ -1081,7 +1081,7 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, break; } - adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL); err_data->ue_count += ue_cnt; err_data->ce_count += ce_cnt; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index d95fafe7f7ed..70e7e93d382f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1604,13 +1604,8 @@ static int gmc_v9_0_late_init(void *handle) } if (!amdgpu_persistent_edc_harvesting_supported(adev)) { - if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && - adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) - adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); - - if (adev->hdp.ras && adev->hdp.ras->ras_block.hw_ops && - adev->hdp.ras->ras_block.hw_ops->reset_ras_error_count) - adev->hdp.ras->ras_block.hw_ops->reset_ras_error_count(adev); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__HDP); } r = amdgpu_gmc_ras_late_init(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index ef04aad788a8..7ae5f134f09b 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -1750,11 +1750,8 @@ static int sdma_v4_0_late_init(void *handle) sdma_v4_0_setup_ulv(adev); - if (!amdgpu_persistent_edc_harvesting_supported(adev)) { - if (adev->sdma.ras && adev->sdma.ras->ras_block.hw_ops && - adev->sdma.ras->ras_block.hw_ops->reset_ras_error_count) - adev->sdma.ras->ras_block.hw_ops->reset_ras_error_count(adev); - } + if (!amdgpu_persistent_edc_harvesting_supported(adev)) + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__SDMA); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/
[PATCH 1/6] drm/amdgpu: define ras_reset_error_count function
Make the code architecture more simple. v2: reuse ras_reset_error_count in ras_reset_error_status. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 19 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 1b23651cacf4..335f5d8bc20b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1174,23 +1174,34 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, return ret; } -int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, +int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block block) { struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); if (!block_obj || !block_obj->hw_ops) { dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", -ras_block_str(block)); - return 0; + ras_block_str(block)); + return -EOPNOTSUPP; } if (!amdgpu_ras_is_supported(adev, block)) - return 0; + return -EOPNOTSUPP; if (block_obj->hw_ops->reset_ras_error_count) block_obj->hw_ops->reset_ras_error_count(adev); + return 0; +} + +int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, + enum amdgpu_ras_block block) +{ + struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); + + if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP) + return 0; + if ((block == AMDGPU_RAS_BLOCK__GFX) || (block == AMDGPU_RAS_BLOCK__MMHUB)) { if (block_obj->hw_ops->reset_ras_error_status) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 0a5c8a107fb2..3f9ac0ab67e6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -714,6 +714,8 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev); int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info); +int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, + enum amdgpu_ras_block block); int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, enum amdgpu_ras_block block); -- 2.35.1
[PATCH 6/6] drm/amdgpu: drop status reset for GCEA 9.4.3 and MMEA 1.8
PMFW will be responsible for it. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 22 --- drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 86 - 2 files changed, 108 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index a1c2c952d882..65da72735e52 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -3996,27 +3996,6 @@ static void gfx_v9_4_3_inst_reset_utc_err_status(struct amdgpu_device *adev, WREG32_SOC15(GC, GET_INST(GC, xcc_id), regVML2_WALKER_MEM_ECC_STATUS, 0x3); } -static void gfx_v9_4_3_inst_reset_ea_err_status(struct amdgpu_device *adev, - int xcc_id) -{ - uint32_t i, j; - uint32_t value; - - mutex_lock(&adev->grbm_idx_mutex); - for (i = 0; i < gfx_v9_4_3_ea_err_status_regs.se_num; i++) { - for (j = 0; j < gfx_v9_4_3_ea_err_status_regs.instance; j++) { - gfx_v9_4_3_xcc_select_se_sh(adev, i, 0, j, xcc_id); - value = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regGCEA_ERR_STATUS); - value = REG_SET_FIELD(value, GCEA_ERR_STATUS, - CLEAR_ERROR_STATUS, 0x1); - WREG32_SOC15(GC, GET_INST(GC, xcc_id), regGCEA_ERR_STATUS, value); - } - } - gfx_v9_4_3_xcc_select_se_sh(adev, 0x, 0x, 0x, - xcc_id); - mutex_unlock(&adev->grbm_idx_mutex); -} - static void gfx_v9_4_3_inst_reset_sq_timeout_status(struct amdgpu_device *adev, int xcc_id) { @@ -4042,7 +4021,6 @@ static void gfx_v9_4_3_inst_reset_ras_err_status(struct amdgpu_device *adev, void *ras_error_status, int xcc_id) { gfx_v9_4_3_inst_reset_utc_err_status(adev, xcc_id); - gfx_v9_4_3_inst_reset_ea_err_status(adev, xcc_id); gfx_v9_4_3_inst_reset_sq_timeout_status(adev, xcc_id); } diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c index aa00483e7b37..616d75add087 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c @@ -756,96 +756,10 @@ static void mmhub_v1_8_query_ras_error_status(struct amdgpu_device *adev) mmhub_v1_8_inst_query_ras_err_status(adev, i); } -static void mmhub_v1_8_inst_reset_ras_err_status(struct amdgpu_device *adev, -uint32_t mmhub_inst) -{ - uint32_t mmea_cgtt_clk_cntl_addr_dist; - uint32_t mmea_err_status_addr_dist; - uint32_t reg_value; - uint32_t i; - - /* reset mmea ras err status */ - mmea_cgtt_clk_cntl_addr_dist = regMMEA1_CGTT_CLK_CTRL - regMMEA0_CGTT_CLK_CTRL; - mmea_err_status_addr_dist = regMMEA1_ERR_STATUS - regMMEA0_ERR_STATUS; - for (i = 0; i < ARRAY_SIZE(mmhub_v1_8_mmea_err_status_reg); i++) { - /* force clk branch on for response path -* set MMEA0_CGTT_CLK_CTRL.SOFT_OVERRIDE_RETURN = 1 -*/ - reg_value = RREG32_SOC15_OFFSET(MMHUB, mmhub_inst, - regMMEA0_CGTT_CLK_CTRL, - i * mmea_cgtt_clk_cntl_addr_dist); - reg_value = REG_SET_FIELD(reg_value, MMEA0_CGTT_CLK_CTRL, - SOFT_OVERRIDE_RETURN, 1); - WREG32_SOC15_OFFSET(MMHUB, mmhub_inst, - regMMEA0_CGTT_CLK_CTRL, - i * mmea_cgtt_clk_cntl_addr_dist, - reg_value); - - /* set MMEA0_ERR_STATUS.CLEAR_ERROR_STATUS = 1 */ - reg_value = RREG32_SOC15_OFFSET(MMHUB, mmhub_inst, - regMMEA0_ERR_STATUS, - i * mmea_err_status_addr_dist); - reg_value = REG_SET_FIELD(reg_value, MMEA0_ERR_STATUS, - CLEAR_ERROR_STATUS, 1); - WREG32_SOC15_OFFSET(MMHUB, mmhub_inst, - regMMEA0_ERR_STATUS, - i * mmea_err_status_addr_dist, - reg_value); - - /* set MMEA0_CGTT_CLK_CTRL.SOFT_OVERRIDE_RETURN = 0 */ - reg_value = RREG32_SOC15_OFFSET(MMHUB, mmhub_inst, - regMMEA0_CGTT_CLK_CTRL, - i * mmea_cgtt_clk_cntl_addr_dist); - reg_value = REG_SET_FIELD(reg_value, MMEA0_CGTT_CLK_CTRL, - SOFT_OVERRIDE_RETURN, 0); - WREG32_SOC15_O
[PATCH 4/6] drm/amdgpu: bypass RAS error reset in some conditions
PMFW is responsible for RAS error reset in some conditions, driver can skip the operation. v2: add check for ras->in_recovery, it's set earlier than amdgpu_in_reset. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 ++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 95c7ba889e2d..806c6d4deb63 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1178,11 +1178,19 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block block) { struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; if (!block_obj || !block_obj->hw_ops) return 0; - if (!amdgpu_ras_is_supported(adev, block)) + /* skip ras error reset in gpu reset */ + if (amdgpu_in_reset(adev) && atomic_read(&ras->in_recovery) && + mca_funcs && mca_funcs->mca_set_debug_mode) + return 0; + + if (!amdgpu_ras_is_supported(adev, block) || + !amdgpu_ras_get_mca_debug_mode(adev)) return 0; if (block_obj->hw_ops->reset_ras_error_count) @@ -1195,6 +1203,8 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, enum amdgpu_ras_block block) { struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; if (!block_obj || !block_obj->hw_ops) { dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", @@ -1202,7 +1212,13 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, return 0; } - if (!amdgpu_ras_is_supported(adev, block)) + /* skip ras error reset in gpu reset */ + if (amdgpu_in_reset(adev) && atomic_read(&ras->in_recovery) && + mca_funcs && mca_funcs->mca_set_debug_mode) + return 0; + + if (!amdgpu_ras_is_supported(adev, block) || + !amdgpu_ras_get_mca_debug_mode(adev)) return 0; if (block_obj->hw_ops->reset_ras_error_count) -- 2.35.1
[PATCH 5/6] drm/amdgpu: reuse amdgpu_ras_reset_error_count code
To simplify the code of amdgpu_ras_reset_error_status without logical change. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 30 +++-- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 806c6d4deb63..b4e8d0c629cc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1181,17 +1181,20 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; - if (!block_obj || !block_obj->hw_ops) - return 0; + if (!block_obj || !block_obj->hw_ops) { + dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", + ras_block_str(block)); + return -EOPNOTSUPP; + } /* skip ras error reset in gpu reset */ if (amdgpu_in_reset(adev) && atomic_read(&ras->in_recovery) && mca_funcs && mca_funcs->mca_set_debug_mode) - return 0; + return -EOPNOTSUPP; if (!amdgpu_ras_is_supported(adev, block) || !amdgpu_ras_get_mca_debug_mode(adev)) - return 0; + return -EOPNOTSUPP; if (block_obj->hw_ops->reset_ras_error_count) block_obj->hw_ops->reset_ras_error_count(adev); @@ -1203,27 +1206,10 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, enum amdgpu_ras_block block) { struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; - - if (!block_obj || !block_obj->hw_ops) { - dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", -ras_block_str(block)); - return 0; - } - /* skip ras error reset in gpu reset */ - if (amdgpu_in_reset(adev) && atomic_read(&ras->in_recovery) && - mca_funcs && mca_funcs->mca_set_debug_mode) + if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP) return 0; - if (!amdgpu_ras_is_supported(adev, block) || - !amdgpu_ras_get_mca_debug_mode(adev)) - return 0; - - if (block_obj->hw_ops->reset_ras_error_count) - block_obj->hw_ops->reset_ras_error_count(adev); - if ((block == AMDGPU_RAS_BLOCK__GFX) || (block == AMDGPU_RAS_BLOCK__MMHUB)) { if (block_obj->hw_ops->reset_ras_error_status) -- 2.35.1
[PATCH 2/6] drm/amdgpu: add set/get mca debug mode operations
Record the debug mode status in RAS. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 21 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 + 2 files changed, 26 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 344ebcf1a6e5..95c7ba889e2d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3329,6 +3329,27 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) return 0; } +void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + if (con) + con->is_mca_debug_mode = enable; +} + +bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + + if (!con) + return false; + + if (mca_funcs && mca_funcs->mca_set_debug_mode) + return con->is_mca_debug_mode; + else + return true; +} /* Register each ip ras block into amdgpu ras */ int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 3f9ac0ab67e6..2fdfef62ee27 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -434,6 +434,8 @@ struct amdgpu_ras { /* Indicates smu whether need update bad channel info */ bool update_channel_flag; + /* Record status of smu mca debug mode */ + bool is_mca_debug_mode; /* Record special requirements of gpu reset caller */ uint32_t gpu_reset_flags; @@ -768,6 +770,9 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev); int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con); +void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable); +bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev); + int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object *ras_block_obj); void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev); -- 2.35.1
[PATCH 3/6] drm/amd/pm: record mca debug mode in RAS
Call amdgpu_ras_set_mca_debug_mode when we set mca debug mode in smu v13_0_6. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 74fc945a8f9b..c5c1f479b925 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -1475,6 +1475,7 @@ static int smu_v13_0_6_mca_set_debug_mode(struct smu_context *smu, bool enable) if (smu->smc_fw_version < 0x554800) return 0; + amdgpu_ras_set_mca_debug_mode(smu->adev, enable); return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead, enable ? 0 : ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK, NULL); -- 2.35.1
[PATCH 1/6] drm/amdgpu: define ras_reset_error_count function
Make the code architecture more simple. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 17 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h| 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++-- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 31f8c3ead161..04cfd67a37a0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3594,9 +3594,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (adev->asic_reset_res) goto fail; - if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && - adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) - adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); } else { task_barrier_full(&hive->tb); @@ -5242,9 +5240,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, if (!r && amdgpu_ras_intr_triggered()) { list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && - tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) - tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); + amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); } amdgpu_ras_intr_cleared(); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 1b23651cacf4..344ebcf1a6e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1174,6 +1174,23 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, return ret; } +int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, + enum amdgpu_ras_block block) +{ + struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); + + if (!block_obj || !block_obj->hw_ops) + return 0; + + if (!amdgpu_ras_is_supported(adev, block)) + return 0; + + if (block_obj->hw_ops->reset_ras_error_count) + block_obj->hw_ops->reset_ras_error_count(adev); + + return 0; +} + int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, enum amdgpu_ras_block block) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 0a5c8a107fb2..3f9ac0ab67e6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -714,6 +714,8 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev); int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info); +int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, + enum amdgpu_ras_block block); int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, enum amdgpu_ras_block block); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 70e38b013309..2b7dc490ba6b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -914,7 +914,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_comm adev->gmc.xgmi.num_physical_nodes == 0) return 0; - adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL); return amdgpu_ras_block_late_init(adev, ras_block); } @@ -1081,7 +1081,7 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, break; } - adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL); err_data->ue_count += ue_cnt; err_data->ce_count += ce_cnt; -- 2.35.1
[PATCH 5/5] drm/amdgpu: reuse amdgpu_ras_reset_error_count code
To simplify the code of amdgpu_ras_reset_error_status without logical change. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 29 +++-- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 6dddb0423411..3698be22 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1107,17 +1107,20 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; - if (!block_obj || !block_obj->hw_ops) - return 0; + if (!block_obj || !block_obj->hw_ops) { + dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", +ras_block_str(block)); + return -EOPNOTSUPP; + } /* skip ras error reset in gpu reset */ if (amdgpu_in_reset(adev) && mca_funcs && mca_funcs->mca_set_debug_mode) - return 0; + return -EOPNOTSUPP; if (!amdgpu_ras_is_supported(adev, block) || !amdgpu_ras_get_mca_debug_mode(adev)) - return 0; + return -EOPNOTSUPP; if (block_obj->hw_ops->reset_ras_error_count) block_obj->hw_ops->reset_ras_error_count(adev); @@ -1129,25 +1132,9 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, enum amdgpu_ras_block block) { struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); - const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; - if (!block_obj || !block_obj->hw_ops) { - dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", -ras_block_str(block)); + if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP) return 0; - } - - /* skip ras error reset in gpu reset */ - if (amdgpu_in_reset(adev) && - mca_funcs && mca_funcs->mca_set_debug_mode) - return 0; - - if (!amdgpu_ras_is_supported(adev, block) || - !amdgpu_ras_get_mca_debug_mode(adev)) - return 0; - - if (block_obj->hw_ops->reset_ras_error_count) - block_obj->hw_ops->reset_ras_error_count(adev); if ((block == AMDGPU_RAS_BLOCK__GFX) || (block == AMDGPU_RAS_BLOCK__MMHUB)) { -- 2.35.1
[PATCH 4/5] drm/amdgpu: bypass RAS error reset in some conditions
PMFW is responsible for RAS error reset in some conditions, driver can skip the operation. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 18 -- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 91ed4fd96ee1..6dddb0423411 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1105,11 +1105,18 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block block) { struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); + const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; if (!block_obj || !block_obj->hw_ops) return 0; - if (!amdgpu_ras_is_supported(adev, block)) + /* skip ras error reset in gpu reset */ + if (amdgpu_in_reset(adev) && + mca_funcs && mca_funcs->mca_set_debug_mode) + return 0; + + if (!amdgpu_ras_is_supported(adev, block) || + !amdgpu_ras_get_mca_debug_mode(adev)) return 0; if (block_obj->hw_ops->reset_ras_error_count) @@ -1122,6 +1129,7 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, enum amdgpu_ras_block block) { struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); + const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; if (!block_obj || !block_obj->hw_ops) { dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", @@ -1129,7 +1137,13 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, return 0; } - if (!amdgpu_ras_is_supported(adev, block)) + /* skip ras error reset in gpu reset */ + if (amdgpu_in_reset(adev) && + mca_funcs && mca_funcs->mca_set_debug_mode) + return 0; + + if (!amdgpu_ras_is_supported(adev, block) || + !amdgpu_ras_get_mca_debug_mode(adev)) return 0; if (block_obj->hw_ops->reset_ras_error_count) -- 2.35.1
[PATCH 3/5] drm/amd/pm: record mca debug mode in RAS
Call amdgpu_ras_set_mca_debug_mode when we set mca debug mode in smu v13_0_6. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 8220bdcbd927..55b0846337a7 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -1414,6 +1414,7 @@ static int smu_v13_0_6_mca_set_debug_mode(struct smu_context *smu, bool enable) if (smu_version < 0x554800) return 0; + amdgpu_ras_set_mca_debug_mode(smu->adev, enable); return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead, enable ? 0 : ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK, NULL); -- 2.35.1
[PATCH 2/5] drm/amdgpu: add set/get mca debug mode operations
Record the debug mode status in RAS. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 21 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 + 2 files changed, 26 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 82ee6f3d12da..91ed4fd96ee1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3250,6 +3250,27 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) return 0; } +void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + if (con) + con->is_mca_debug_mode = enable; +} + +bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + + if (!con) + return false; + + if (mca_funcs && mca_funcs->mca_set_debug_mode) + return con->is_mca_debug_mode; + else + return true; +} /* Register each ip ras block into amdgpu ras */ int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index deea10b6c184..c60688dc73ea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -433,6 +433,8 @@ struct amdgpu_ras { /* Indicates smu whether need update bad channel info */ bool update_channel_flag; + /* Record status of smu mca debug mode */ + bool is_mca_debug_mode; /* Record special requirements of gpu reset caller */ uint32_t gpu_reset_flags; @@ -748,6 +750,9 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev); int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con); +void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable); +bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev); + int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object *ras_block_obj); void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev); -- 2.35.1
[PATCH 1/5] drm/amdgpu: define ras_reset_error_count function
Make the code architecture more simple. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 17 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h| 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++-- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 31f8c3ead161..04cfd67a37a0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3594,9 +3594,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (adev->asic_reset_res) goto fail; - if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && - adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) - adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); } else { task_barrier_full(&hive->tb); @@ -5242,9 +5240,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, if (!r && amdgpu_ras_intr_triggered()) { list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && - tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) - tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); + amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); } amdgpu_ras_intr_cleared(); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index dacce5f2bfa3..82ee6f3d12da 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1101,6 +1101,23 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, return 0; } +int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, + enum amdgpu_ras_block block) +{ + struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); + + if (!block_obj || !block_obj->hw_ops) + return 0; + + if (!amdgpu_ras_is_supported(adev, block)) + return 0; + + if (block_obj->hw_ops->reset_ras_error_count) + block_obj->hw_ops->reset_ras_error_count(adev); + + return 0; +} + int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, enum amdgpu_ras_block block) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 728f98c6fc1c..deea10b6c184 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -694,6 +694,8 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev); int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info); +int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, + enum amdgpu_ras_block block); int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, enum amdgpu_ras_block block); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 70e38b013309..2b7dc490ba6b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -914,7 +914,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_comm adev->gmc.xgmi.num_physical_nodes == 0) return 0; - adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL); return amdgpu_ras_block_late_init(adev, ras_block); } @@ -1081,7 +1081,7 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, break; } - adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL); err_data->ue_count += ue_cnt; err_data->ce_count += ce_cnt; -- 2.35.1
[PATCH 1/2] drm/amdgpu: exit directly if gpu reset fails
No need to perform the full reset operation in case of gpu reset failure. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5436d7a34014..e4627d92e1d0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5075,7 +5075,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, if (r) { dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", r, adev_to_drm(tmp_adev)->unique); - break; + goto out; } } -- 2.35.1
[PATCH 2/2] drm/amdgpu: update retry times for psp vmbx wait
Increase the retry loops and replace the constant number with macro. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c index 54008a8991fc..b7bc00d4c696 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c @@ -59,6 +59,9 @@ MODULE_FIRMWARE("amdgpu/psp_14_0_0_ta.bin"); /* Read USB-PD from LFB */ #define GFX_CMD_USB_PD_USE_LFB 0x480 +/* Retry times for vmbx ready wait */ +#define PSP_VMBX_POLLING_LIMIT 2 + /* VBIOS gfl defines */ #define MBOX_READY_MASK 0x8000 #define MBOX_STATUS_MASK 0x @@ -138,7 +141,7 @@ static int psp_v13_0_wait_for_vmbx_ready(struct psp_context *psp) struct amdgpu_device *adev = psp->adev; int retry_loop, ret; - for (retry_loop = 0; retry_loop < 70; retry_loop++) { + for (retry_loop = 0; retry_loop < PSP_VMBX_POLLING_LIMIT; retry_loop++) { /* Wait for bootloader to signify that is ready having bit 31 of C2PMSG_33 set to 1 */ ret = psp_wait_for( -- 2.35.1
[PATCH 3/3] drm/amdgpu: change if condition for bad channel bitmap update
The amdgpu_ras_eeprom_control.bad_channel_bitmap is u32 type, but the channel index could be larger than 32. For the ASICs whose channel number is more than 32, the amdgpu_dpm_send_hbm_bad_channel_flag interface is not supported, so we simply bypass channel bitmap update under this condition. v2: replace sizeof with BITS_PER_TYPE, we should check bit number instead of byte number. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 8ced4be784e0..c60d2f79eeef 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -616,7 +616,8 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, __encode_table_record_to_buf(control, &record[i], pp); /* update bad channel bitmap */ - if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { + if ((record[i].mem_channel < BITS_PER_TYPE(control->bad_channel_bitmap)) && + !(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { control->bad_channel_bitmap |= 1 << record[i].mem_channel; con->update_channel_flag = true; } @@ -969,7 +970,8 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, __decode_table_record_from_buf(control, &record[i], pp); /* update bad channel bitmap */ - if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { + if ((record[i].mem_channel < BITS_PER_TYPE(control->bad_channel_bitmap)) && + !(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { control->bad_channel_bitmap |= 1 << record[i].mem_channel; con->update_channel_flag = true; } -- 2.35.1
[PATCH 2/3] drm/amdgpu: fix value of some UMC parameters for UMC v12
Prepare for bad page retirement. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 4 +++- drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index a5510412acd0..bae4a0d18190 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1497,12 +1497,14 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev) adev->umc.channel_idx_tbl = &umc_v6_7_channel_idx_tbl_second[0][0]; break; case IP_VERSION(12, 0, 0): - adev->umc.max_ras_err_cnt_per_query = UMC_V12_0_TOTAL_CHANNEL_NUM(adev); + adev->umc.max_ras_err_cnt_per_query = + UMC_V12_0_TOTAL_CHANNEL_NUM(adev) * UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; adev->umc.channel_inst_num = UMC_V12_0_CHANNEL_INSTANCE_NUM; adev->umc.umc_inst_num = UMC_V12_0_UMC_INSTANCE_NUM; adev->umc.node_inst_num /= UMC_V12_0_UMC_INSTANCE_NUM; adev->umc.channel_offs = UMC_V12_0_PER_CHANNEL_OFFSET; adev->umc.active_mask = adev->aid_mask; + adev->umc.retire_unit = UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; adev->umc.channel_idx_tbl = &umc_v12_0_channel_idx_tbl[0][0][0]; if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) adev->umc.ras = &umc_v12_0_ras; diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h index e3619d67ae3b..4885b9fff272 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h @@ -53,6 +53,8 @@ /* one piece of normalized address is mapped to 8 pieces of physical address */ #define UMC_V12_0_NA_MAP_PA_NUM8 +/* R13 bit shift should be considered, double the number */ +#define UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL (UMC_V12_0_NA_MAP_PA_NUM * 2) /* bank bits in MCA error address */ #define UMC_V12_0_MCA_B0_BIT 6 #define UMC_V12_0_MCA_B1_BIT 7 -- 2.35.1
[PATCH 1/3] drm/amdgpu: print channel index for UMC bad page
Print channel index for UMC v12. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index c6742dd863d4..7714c2ef2cdc 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -240,15 +240,17 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, /* include column bit 0 and 1 */ col &= 0x3; col |= (column << 2); - dev_info(adev->dev, "Error Address(PA):0x%llx Row:0x%x Col:0x%x Bank:0x%x\n", - retired_page, row, col, bank); + dev_info(adev->dev, + "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", + retired_page, row, col, bank, channel_index); amdgpu_umc_fill_error_record(err_data, err_addr, retired_page, channel_index, umc_inst); /* shift R13 bit */ retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT); - dev_info(adev->dev, "Error Address(PA):0x%llx Row:0x%x Col:0x%x Bank:0x%x\n", - retired_page, row_xor, col, bank); + dev_info(adev->dev, + "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", + retired_page, row_xor, col, bank, channel_index); amdgpu_umc_fill_error_record(err_data, err_addr, retired_page, channel_index, umc_inst); } -- 2.35.1
[PATCH 2/3] drm/amdgpu: fix value of some UMC parameters for UMC v12
Prepare for bad page retirement. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 4 +++- drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index a5510412acd0..bae4a0d18190 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1497,12 +1497,14 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev) adev->umc.channel_idx_tbl = &umc_v6_7_channel_idx_tbl_second[0][0]; break; case IP_VERSION(12, 0, 0): - adev->umc.max_ras_err_cnt_per_query = UMC_V12_0_TOTAL_CHANNEL_NUM(adev); + adev->umc.max_ras_err_cnt_per_query = + UMC_V12_0_TOTAL_CHANNEL_NUM(adev) * UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; adev->umc.channel_inst_num = UMC_V12_0_CHANNEL_INSTANCE_NUM; adev->umc.umc_inst_num = UMC_V12_0_UMC_INSTANCE_NUM; adev->umc.node_inst_num /= UMC_V12_0_UMC_INSTANCE_NUM; adev->umc.channel_offs = UMC_V12_0_PER_CHANNEL_OFFSET; adev->umc.active_mask = adev->aid_mask; + adev->umc.retire_unit = UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; adev->umc.channel_idx_tbl = &umc_v12_0_channel_idx_tbl[0][0][0]; if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) adev->umc.ras = &umc_v12_0_ras; diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h index e3619d67ae3b..4885b9fff272 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h @@ -53,6 +53,8 @@ /* one piece of normalized address is mapped to 8 pieces of physical address */ #define UMC_V12_0_NA_MAP_PA_NUM8 +/* R13 bit shift should be considered, double the number */ +#define UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL (UMC_V12_0_NA_MAP_PA_NUM * 2) /* bank bits in MCA error address */ #define UMC_V12_0_MCA_B0_BIT 6 #define UMC_V12_0_MCA_B1_BIT 7 -- 2.35.1
[PATCH 3/3] drm/amdgpu: change if condition for bad channel bitmap update
The amdgpu_ras_eeprom_control.bad_channel_bitmap is u32 type, but the channel index could be larger than 32. For the ASICs whose channel number is more than 32, the amdgpu_dpm_send_hbm_bad_channel_flag interface is not supported, so we simply bypass channel bitmap update under this condition. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 8ced4be784e0..1c4433f22f4b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -616,7 +616,8 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, __encode_table_record_to_buf(control, &record[i], pp); /* update bad channel bitmap */ - if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { + if ((record[i].mem_channel < sizeof(control->bad_channel_bitmap)) && + !(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { control->bad_channel_bitmap |= 1 << record[i].mem_channel; con->update_channel_flag = true; } @@ -969,7 +970,8 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, __decode_table_record_from_buf(control, &record[i], pp); /* update bad channel bitmap */ - if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { + if ((record[i].mem_channel < sizeof(control->bad_channel_bitmap)) && + !(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { control->bad_channel_bitmap |= 1 << record[i].mem_channel; con->update_channel_flag = true; } -- 2.35.1
[PATCH 1/3] drm/amdgpu: print channel index for UMC bad page
Print channel index for UMC v12. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index c6742dd863d4..7714c2ef2cdc 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -240,15 +240,17 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, /* include column bit 0 and 1 */ col &= 0x3; col |= (column << 2); - dev_info(adev->dev, "Error Address(PA):0x%llx Row:0x%x Col:0x%x Bank:0x%x\n", - retired_page, row, col, bank); + dev_info(adev->dev, + "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", + retired_page, row, col, bank, channel_index); amdgpu_umc_fill_error_record(err_data, err_addr, retired_page, channel_index, umc_inst); /* shift R13 bit */ retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT); - dev_info(adev->dev, "Error Address(PA):0x%llx Row:0x%x Col:0x%x Bank:0x%x\n", - retired_page, row_xor, col, bank); + dev_info(adev->dev, + "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", + retired_page, row_xor, col, bank, channel_index); amdgpu_umc_fill_error_record(err_data, err_addr, retired_page, channel_index, umc_inst); } -- 2.35.1
[PATCH 3/3] drm/amdgpu: print more address info of UMC bad page
Print out row, column and bank value of UMC error address for UMC v12. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 5f056dd7691e..6fde85367272 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -173,7 +173,7 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, { uint32_t channel_index, i; uint64_t soc_pa, na, retired_page, column; - uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row; + uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row, row_xor; uint32_t bank0, bank1, bank2, bank3, bank; bank_hash0 = (err_addr >> UMC_V12_0_MCA_B0_BIT) & 0x1ULL; @@ -228,17 +228,23 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, /* clear [C4] in soc physical address */ soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT); + row_xor = row ^ (0x1ULL << 13); /* loop for all possibilities of [C4 C3 C2] */ for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) { retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT); retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT); - dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page); + /* include column bit 0 and 1 */ + col &= 0x3; + col |= (column << 2); + dev_info(adev->dev, "Error Address(PA):0x%llx Row:0x%x Col:0x%x Bank:0x%x\n", + retired_page, row, col, bank); amdgpu_umc_fill_error_record(err_data, err_addr, retired_page, channel_index, umc_inst); /* shift R13 bit */ retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT); - dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page); + dev_info(adev->dev, "Error Address(PA):0x%llx Row:0x%x Col:0x%x Bank:0x%x\n", + retired_page, row_xor, col, bank); amdgpu_umc_fill_error_record(err_data, err_addr, retired_page, channel_index, umc_inst); } -- 2.35.1
[PATCH 2/3] drm/amdgpu: add channel index table for UMC v12
Get UMC phyical channel index according to node id, umc instance and channel instance. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 1 + drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 14 ++ drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 5 + 3 files changed, 20 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index f12c6c7e6204..7af6659ca936 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1498,6 +1498,7 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev) adev->umc.active_mask = adev->aid_mask; if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) adev->umc.ras = &umc_v12_0_ras; + adev->umc.channel_idx_tbl = &umc_v12_0_channel_idx_tbl[0][0][0]; default: break; } diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 2a135fd8ec15..5f056dd7691e 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -27,6 +27,20 @@ #include "umc/umc_12_0_0_offset.h" #include "umc/umc_12_0_0_sh_mask.h" +const uint32_t + umc_v12_0_channel_idx_tbl[] + [UMC_V12_0_UMC_INSTANCE_NUM] + [UMC_V12_0_CHANNEL_INSTANCE_NUM] = { + {{3, 7, 11, 15, 2, 6, 10, 14}, {1, 5, 9, 13, 0, 4, 8, 12}, +{19, 23, 27, 31, 18, 22, 26, 30}, {17, 21, 25, 29, 16, 20, 24, 28}}, + {{47, 43, 39, 35, 46, 42, 38, 34}, {45, 41, 37, 33, 44, 40, 36, 32}, +{63, 59, 55, 51, 62, 58, 54, 50}, {61, 57, 53, 49, 60, 56, 52, 48}}, + {{79, 75, 71, 67, 78, 74, 70, 66}, {77, 73, 69, 65, 76, 72, 68, 64}, +{95, 91, 87, 83, 94, 90, 86, 82}, {93, 89, 85, 81, 92, 88, 84, 80}}, + {{99, 103, 107, 111, 98, 102, 106, 110}, {97, 101, 105, 109, 96, 100, 104, 108}, +{115, 119, 123, 127, 114, 118, 122, 126}, {113, 117, 121, 125, 112, 116, 120, 124}} + }; + /* mapping of MCA error address to normalized address */ static const uint32_t umc_v12_0_ma2na_mapping[] = { 0, 5, 6, 8, 9, 14, 12, 13, diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h index c20b4b4cbfda..e8d358ed8e61 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h @@ -115,6 +115,11 @@ #define GET_CROSS_NODE_ADDR(reg) \ reg) >> 32) & 0x3) ? ((reg) | (1ULL << 34)) : (reg)) +extern const uint32_t + umc_v12_0_channel_idx_tbl[] + [UMC_V12_0_UMC_INSTANCE_NUM] + [UMC_V12_0_CHANNEL_INSTANCE_NUM]; + extern struct amdgpu_umc_ras umc_v12_0_ras; #endif -- 2.35.1
[PATCH 1/3] drm/amdgpu: add address conversion for UMC v12
Convert MCA error address to physical address and find out all pages in one physical row. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 5 ++ drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 97 - drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 64 3 files changed, 162 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index 43321f57f557..417a6726c71b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -32,6 +32,11 @@ * is the index of 8KB block */ #define ADDR_OF_8KB_BLOCK(addr)(((addr) & ~0xffULL) << 5) +/* + * (addr / 256) * 32768, the higher 26 bits in ErrorAddr + * is the index of 8KB block + */ +#define ADDR_OF_32KB_BLOCK(addr) (((addr) & ~0xffULL) << 7) /* channel index is the index of 256B block */ #define ADDR_OF_256B_BLOCK(channel_index) ((channel_index) << 8) /* offset in 256B block */ diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 292159814340..2a135fd8ec15 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -27,6 +27,14 @@ #include "umc/umc_12_0_0_offset.h" #include "umc/umc_12_0_0_sh_mask.h" +/* mapping of MCA error address to normalized address */ +static const uint32_t umc_v12_0_ma2na_mapping[] = { + 0, 5, 6, 8, 9, 14, 12, 13, + 10, 11, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, + 24, 7, 29, 30, +}; + static inline uint32_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev, uint32_t node_inst, uint32_t umc_inst, @@ -133,12 +141,93 @@ static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev, umc_v12_0_reset_error_count(adev); } -static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, - struct ras_err_data *err_data, uint64_t err_addr, - uint32_t ch_inst, uint32_t umc_inst, - uint32_t node_inst, uint64_t mc_umc_status) +static bool umc_v12_0_bit_wise_xor(uint32_t val) { + bool result = 0; + int i; + for (i = 0; i < 32; i++) + result = result ^ ((val >> i) & 0x1); + + return result; +} + +static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, + struct ras_err_data *err_data, uint64_t err_addr, + uint32_t ch_inst, uint32_t umc_inst, + uint32_t node_inst, uint64_t mc_umc_status) +{ + uint32_t channel_index, i; + uint64_t soc_pa, na, retired_page, column; + uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row; + uint32_t bank0, bank1, bank2, bank3, bank; + + bank_hash0 = (err_addr >> UMC_V12_0_MCA_B0_BIT) & 0x1ULL; + bank_hash1 = (err_addr >> UMC_V12_0_MCA_B1_BIT) & 0x1ULL; + bank_hash2 = (err_addr >> UMC_V12_0_MCA_B2_BIT) & 0x1ULL; + bank_hash3 = (err_addr >> UMC_V12_0_MCA_B3_BIT) & 0x1ULL; + col = (err_addr >> 1) & 0x1fULL; + row = (err_addr >> 10) & 0x3fffULL; + + /* apply bank hash algorithm */ + bank0 = + bank_hash0 ^ (UMC_V12_0_XOR_EN0 & + (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR0) ^ + (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR0; + bank1 = + bank_hash1 ^ (UMC_V12_0_XOR_EN1 & + (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR1) ^ + (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR1; + bank2 = + bank_hash2 ^ (UMC_V12_0_XOR_EN2 & + (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR2) ^ + (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR2; + bank3 = + bank_hash3 ^ (UMC_V12_0_XOR_EN3 & + (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR3) ^ + (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR3; + + bank = bank0 | (bank1 << 1) | (bank2 << 2) | (bank3 << 3); + err_addr &= ~0x3c0ULL; + err_addr |= (bank << UMC_V12_0_MCA_B0_BIT); + + na = 0x0; + /* convert mca error address to normalized address */ + for (i = 1; i < ARRAY_SIZE(umc_v12_0_ma2na_mapping); i++) + na |= ((err_addr >> i) & 0x1ULL) << umc_v12_0_ma2na_mapping[i]; + + channel_index = + adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num * +
[PATCH] drm/amdgpu: use read-modify-write mode for gfx v9_4_3 SQ setting
Instead of using direct update, avoid touching unrelated fields. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index b4fdb269f856..f24a5474db35 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -4042,7 +4042,8 @@ static void gfx_v9_4_3_inst_enable_watchdog_timer(struct amdgpu_device *adev, uint32_t i; uint32_t data; - data = REG_SET_FIELD(0, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE, + data = RREG32_SOC15(GC, GET_INST(GC, 0), regSQ_TIMEOUT_CONFIG); + data = REG_SET_FIELD(data, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE, amdgpu_watchdog_timer.timeout_fatal_disable ? 1 : 0); if (amdgpu_watchdog_timer.timeout_fatal_disable && -- 2.35.1
[PATCH] drm/amdgpu: add RAS fatal error handler for NBIO v7.9
Register RAS fatal error interrupt and add handler. v2: only register NBIO RAS for dGPU platform. change nbio_v7_9_set_ras_controller_irq_state and nbio_v7_9_set_ras_err_event_athub_irq_state to dummy functions. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 + drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 187 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h | 1 + 3 files changed, 193 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index bb29cb57add5..00658c2816dc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -35,6 +35,7 @@ #include "amdgpu_xgmi.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" #include "nbio_v4_3.h" +#include "nbio_v7_9.h" #include "atom.h" #include "amdgpu_reset.h" @@ -2644,6 +2645,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev) * check DF RAS */ adev->nbio.ras = &nbio_v4_3_ras; break; + case IP_VERSION(7, 9, 0): + if (!adev->gmc.is_app_apu) + adev->nbio.ras = &nbio_v7_9_ras; + break; default: /* nbio ras is not available */ break; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c index 1d1ab188ef15..781f98655567 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c @@ -471,3 +471,190 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = { .init_registers = nbio_v7_9_init_registers, .get_pcie_replay_count = nbio_v7_9_get_pcie_replay_count, }; + +static void nbio_v7_9_query_ras_error_count(struct amdgpu_device *adev, + void *ras_error_status) +{ + return; +} + +static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device *adev) +{ + uint32_t bif_doorbell_intr_cntl; + struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if); + struct ras_err_data err_data = {0, 0, 0, NULL}; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); + + if (REG_GET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_CNTLR_INTERRUPT_STATUS)) { + /* driver has to clear the interrupt status when bif ring is disabled */ + bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, + RAS_CNTLR_INTERRUPT_CLEAR, 1); + WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); + + if (!ras->disable_ras_err_cnt_harvest) { + /* +* clear error status after ras_controller_intr +* according to hw team and count ue number +* for query +*/ + nbio_v7_9_query_ras_error_count(adev, &err_data); + + /* logging on error cnt and printing for awareness */ + obj->err_data.ue_count += err_data.ue_count; + obj->err_data.ce_count += err_data.ce_count; + + if (err_data.ce_count) + dev_info(adev->dev, "%ld correctable hardware " + "errors detected in %s block, " + "no user action is needed.\n", + obj->err_data.ce_count, + get_ras_block_str(adev->nbio.ras_if)); + + if (err_data.ue_count) + dev_info(adev->dev, "%ld uncorrectable hardware " + "errors detected in %s block\n", + obj->err_data.ue_count, + get_ras_block_str(adev->nbio.ras_if)); + } + + dev_info(adev->dev, "RAS controller interrupt triggered " + "by NBIF error\n"); + + /* ras_controller_int is dedicated for nbif ras error, +* not the global interrupt for sync flood +*/ + amdgpu_ras_reset_gpu(adev); + } +} + +static void nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev) +{
[PATCH] drm/amdgpu: add RAS fatal error handler for NBIO v7.9
Register RAS fatal error interrupt and add handler. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 + drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 219 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h | 1 + 3 files changed, 224 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 832fa646b38f..bef0f9264b4f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -35,6 +35,7 @@ #include "amdgpu_xgmi.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" #include "nbio_v4_3.h" +#include "nbio_v7_9.h" #include "atom.h" #include "amdgpu_reset.h" @@ -2663,6 +2664,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev) * check DF RAS */ adev->nbio.ras = &nbio_v4_3_ras; break; + case IP_VERSION(7, 9, 0): + adev->nbio.ras = &nbio_v7_9_ras; + break; default: /* nbio ras is not available */ break; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c index cd1a02d30420..cc2268b871e0 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c @@ -451,3 +451,222 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = { .get_memory_partition_mode = nbio_v7_9_get_memory_partition_mode, .init_registers = nbio_v7_9_init_registers, }; + +static void nbio_v7_9_query_ras_error_count(struct amdgpu_device *adev, + void *ras_error_status) +{ + return; +} + +static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device *adev) +{ + uint32_t bif_doorbell_intr_cntl; + struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if); + struct ras_err_data err_data = {0, 0, 0, NULL}; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); + + if (REG_GET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_CNTLR_INTERRUPT_STATUS)) { + /* driver has to clear the interrupt status when bif ring is disabled */ + bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, + RAS_CNTLR_INTERRUPT_CLEAR, 1); + WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); + + if (!ras->disable_ras_err_cnt_harvest) { + /* +* clear error status after ras_controller_intr +* according to hw team and count ue number +* for query +*/ + nbio_v7_9_query_ras_error_count(adev, &err_data); + + /* logging on error cnt and printing for awareness */ + obj->err_data.ue_count += err_data.ue_count; + obj->err_data.ce_count += err_data.ce_count; + + if (err_data.ce_count) + dev_info(adev->dev, "%ld correctable hardware " + "errors detected in %s block, " + "no user action is needed.\n", + obj->err_data.ce_count, + get_ras_block_str(adev->nbio.ras_if)); + + if (err_data.ue_count) + dev_info(adev->dev, "%ld uncorrectable hardware " + "errors detected in %s block\n", + obj->err_data.ue_count, + get_ras_block_str(adev->nbio.ras_if)); + } + + dev_info(adev->dev, "RAS controller interrupt triggered " + "by NBIF error\n"); + + /* ras_controller_int is dedicated for nbif ras error, +* not the global interrupt for sync flood +*/ + amdgpu_ras_reset_gpu(adev); + } +} + +static void nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev) +{ + uint32_t bif_doorbell_intr_cntl; + + bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); + + if (REG_GET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_ATHUB_ERR_EVENT_INTERRU
[PATCH] drm/amdgpu: add watchdog timer enablement for gfx_v9_4_3
Configure SQ watchdog timer setting. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 38 + 1 file changed, 38 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 9e3b835bdbb2..590b0fa62ccc 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -2197,6 +2197,10 @@ static int gfx_v9_4_3_late_init(void *handle) if (r) return r; + if (adev->gfx.ras && + adev->gfx.ras->enable_watchdog_timer) + adev->gfx.ras->enable_watchdog_timer(adev); + return 0; } @@ -4043,6 +4047,34 @@ static void gfx_v9_4_3_inst_reset_ras_err_status(struct amdgpu_device *adev, gfx_v9_4_3_inst_reset_sq_timeout_status(adev, xcc_id); } +static void gfx_v9_4_3_inst_enable_watchdog_timer(struct amdgpu_device *adev, + void *ras_error_status, int xcc_id) +{ + uint32_t i; + uint32_t data; + + data = REG_SET_FIELD(0, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE, +amdgpu_watchdog_timer.timeout_fatal_disable ? 1 : 0); + + if (amdgpu_watchdog_timer.timeout_fatal_disable && + (amdgpu_watchdog_timer.period < 1 || +amdgpu_watchdog_timer.period > 0x23)) { + dev_warn(adev->dev, "Watchdog period range is 1 to 0x23\n"); + amdgpu_watchdog_timer.period = 0x23; + } + data = REG_SET_FIELD(data, SQ_TIMEOUT_CONFIG, PERIOD_SEL, +amdgpu_watchdog_timer.period); + + mutex_lock(&adev->grbm_idx_mutex); + for (i = 0; i < adev->gfx.config.max_shader_engines; i++) { + gfx_v9_4_3_xcc_select_se_sh(adev, i, 0x, 0x, xcc_id); + WREG32_SOC15(GC, GET_INST(GC, xcc_id), regSQ_TIMEOUT_CONFIG, data); + } + gfx_v9_4_3_xcc_select_se_sh(adev, 0x, 0x, 0x, + xcc_id); + mutex_unlock(&adev->grbm_idx_mutex); +} + static void gfx_v9_4_3_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status) { @@ -4065,6 +4097,11 @@ static void gfx_v9_4_3_reset_ras_error_status(struct amdgpu_device *adev) amdgpu_gfx_ras_error_func(adev, NULL, gfx_v9_4_3_inst_reset_ras_err_status); } +static void gfx_v9_4_3_enable_watchdog_timer(struct amdgpu_device *adev) +{ + amdgpu_gfx_ras_error_func(adev, NULL, gfx_v9_4_3_inst_enable_watchdog_timer); +} + static const struct amd_ip_funcs gfx_v9_4_3_ip_funcs = { .name = "gfx_v9_4_3", .early_init = gfx_v9_4_3_early_init, @@ -4393,4 +4430,5 @@ struct amdgpu_gfx_ras gfx_v9_4_3_ras = { .ras_block = { .hw_ops = &gfx_v9_4_3_ras_ops, }, + .enable_watchdog_timer = &gfx_v9_4_3_enable_watchdog_timer, }; -- 2.35.1
[PATCH] drm/amdgpu: skip address adjustment for GFX RAS injection
The address parameter of GFX RAS injection isn't related to XGMI node number, keep it unchanged. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 046659bd4f9e..5371fbd3fe17 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1163,7 +1163,8 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, } /* Calculate XGMI relative offset */ - if (adev->gmc.xgmi.num_physical_nodes > 1) { + if (adev->gmc.xgmi.num_physical_nodes > 1 && + info->head.block != AMDGPU_RAS_BLOCK__GFX) { block_info.address = amdgpu_xgmi_get_relative_phy_addr(adev, block_info.address); -- 2.35.1
[PATCH] drm/amdgpu: check RAS irq existence for VCN/JPEG
No RAS irq is allowed. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c index 3add4b4f0667..2ff2897fd1db 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c @@ -255,7 +255,8 @@ int amdgpu_jpeg_ras_late_init(struct amdgpu_device *adev, struct ras_common_if * if (amdgpu_ras_is_supported(adev, ras_block->block)) { for (i = 0; i < adev->jpeg.num_jpeg_inst; ++i) { - if (adev->jpeg.harvest_config & (1 << i)) + if (adev->jpeg.harvest_config & (1 << i) || + !adev->jpeg.inst[i].ras_poison_irq.funcs) continue; r = amdgpu_irq_get(adev, &adev->jpeg.inst[i].ras_poison_irq, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index 19a3bb5dd29a..d37ebd4402ef 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -1216,7 +1216,8 @@ int amdgpu_vcn_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r if (amdgpu_ras_is_supported(adev, ras_block->block)) { for (i = 0; i < adev->vcn.num_vcn_inst; i++) { - if (adev->vcn.harvest_config & (1 << i)) + if (adev->vcn.harvest_config & (1 << i) || + !adev->vcn.inst[i].ras_poison_irq.funcs) continue; r = amdgpu_irq_get(adev, &adev->vcn.inst[i].ras_poison_irq, 0); -- 2.35.1
[PATCH] drm/amdgpu: remove unused definition
mmhub_v1_8_mmea_cgtt_clk_cntl_reg is defined but not used. Reported-by: kernel test robot Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 8 1 file changed, 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c index 3648994724c2..00e7e5db7c28 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c @@ -757,14 +757,6 @@ static void mmhub_v1_8_query_ras_error_status(struct amdgpu_device *adev) mmhub_v1_8_inst_query_ras_err_status(adev, i); } -static const uint32_t mmhub_v1_8_mmea_cgtt_clk_cntl_reg[] = { - regMMEA0_CGTT_CLK_CTRL, - regMMEA1_CGTT_CLK_CTRL, - regMMEA2_CGTT_CLK_CTRL, - regMMEA3_CGTT_CLK_CTRL, - regMMEA4_CGTT_CLK_CTRL, -}; - static void mmhub_v1_8_inst_reset_ras_err_status(struct amdgpu_device *adev, uint32_t mmhub_inst) { -- 2.35.1
[PATCH 2/2] drm/amdgpu: add bad_page_threshold check in ras_eeprom_check_err
bad_page_threshold controls page retirement behavior and it should be also checked. v2: simplify the condition of bad page handling path. Signed-off-by: Tao Zhou --- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 19 ++- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 9d370465b08d..2e08fce87521 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -417,7 +417,8 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - if (!__is_ras_eeprom_supported(adev)) + if (!__is_ras_eeprom_supported(adev) || + !amdgpu_bad_page_threshold) return false; /* skip check eeprom table for VEGA20 Gaming */ @@ -428,10 +429,18 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev) return false; if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) { - dev_warn(adev->dev, "This GPU is in BAD status."); - dev_warn(adev->dev, "Please retire it or set a larger " -"threshold value when reloading driver.\n"); - return true; + if (amdgpu_bad_page_threshold == -1) { + dev_warn(adev->dev, "RAS records:%d exceed threshold:%d", + con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold); + dev_warn(adev->dev, + "But GPU can be operated due to bad_page_threshold = -1.\n"); + return false; + } else { + dev_warn(adev->dev, "This GPU is in BAD status."); + dev_warn(adev->dev, "Please retire it or set a larger " +"threshold value when reloading driver.\n"); + return true; + } } return false; -- 2.35.1
[PATCH 1/2] drm/amdgpu: change default behavior of bad_page_threshold parameter
Ignore ras umc bad page threshold by default, GPU initialization won't be stopped in this mode. v2: refine the description of bad_page_threshold. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 7 --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 6c2fe50b528e..8a375394db0c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -921,7 +921,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 0444); * result in the GPU entering bad status when the number of total * faulty pages by ECC exceeds the threshold value. */ -MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement, -2 = ignore bad page threshold)"); +MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = ignore threshold (default value), 0 = disable bad page retirement, -2 = driver sets threshold)"); module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444); MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 5c02c6c9f773..63dfcc98152d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2196,11 +2196,12 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, /* * Justification of value bad_page_cnt_threshold in ras structure * -* Generally, -1 <= amdgpu_bad_page_threshold <= max record length -* in eeprom, and introduce two scenarios accordingly. +* Generally, 0 <= amdgpu_bad_page_threshold <= max record length +* in eeprom or amdgpu_bad_page_threshold == -2, introduce two +* scenarios accordingly. * * Bad page retirement enablement: -*- If amdgpu_bad_page_threshold = -1, +*- If amdgpu_bad_page_threshold = -2, * bad_page_cnt_threshold = typical value by formula. * *- When the value from user is 0 < amdgpu_bad_page_threshold < diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 2d9f3f4cd79e..9d370465b08d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -1191,8 +1191,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, } else { dev_err(adev->dev, "RAS records:%d exceed threshold:%d", control->ras_num_recs, ras->bad_page_cnt_threshold); - if (amdgpu_bad_page_threshold == -2) { - dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -2."); + if (amdgpu_bad_page_threshold == -1) { + dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1."); res = 0; } else { *exceed_err_limit = true; -- 2.35.1