from:"Tao Zhou"

[PATCH] drm/amdgpu: disable GPU RAS bad page feature for specific ASIC

2024-09-10 Thread Tao Zhou

The feature is not applicable to specific app platform.

v2: update the disablement condition and commit description
v3: move the setting to amdgpu_ras_check_supported

Signed-off-by: Tao Zhou 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index dbfc41ddc3c7..ebe3e8f01fe2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3483,6 +3483,11 @@ static void amdgpu_ras_check_supported(struct 
amdgpu_device *adev)
 
/* aca is disabled by default */
adev->aca.is_enabled = false;
+
+   /* bad page feature is not applicable to specific app platform */
+   if (adev->gmc.is_app_apu &&
+   amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0))
+   amdgpu_bad_page_threshold = 0;
 }
 
 static void amdgpu_ras_counte_dw(struct work_struct *work)
-- 
2.34.1

[PATCH] drm/amdgpu: disable GPU RAS bad page feature for specific ASIC

2024-09-09 Thread Tao Zhou

The feature is not applicable to specific app platform.

v2: update the disablement condition and commit description

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index dbfc41ddc3c7..08efc9121adc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2055,6 +2055,11 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
con->event_state_attr = dev_attr_event_state;
sysfs_attr_init(attrs[3]);
 
+   /* bad page feature is not applicable to specific app platform */
+   if (adev->gmc.is_app_apu &&
+   amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0))
+   amdgpu_bad_page_threshold = 0;
+
if (amdgpu_bad_page_threshold != 0) {
/* add bad_page_features entry */
bin_attr_gpu_vram_bad_pages.private = NULL;
-- 
2.34.1

[PATCH] drm/amdgpu: disable RAS bad page feature for specific APU

2024-09-09 Thread Tao Zhou

The feature is unsupported on specific APU.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index dbfc41ddc3c7..d46f216a33b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3679,6 +3679,12 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 
amdgpu_ras_init_reserved_vram_size(adev);
 
+   /* bad page feature is unsupported on specific APU */
+   if ((adev->gmc.xgmi.connected_to_cpu ||
+   adev->gmc.is_app_apu) &&
+   amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0))
+   amdgpu_bad_page_threshold = 0;
+
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
goto release_con;
-- 
2.34.1

[PATCH 2/3] drm/amdgpu: update bad state check in GPU recovery

2024-08-01 Thread Tao Zhou

Return RMA status without message print.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 96e525ab9a84..5d49f70704c6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5538,7 +5538,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
 * bad_page_threshold value to fix this once
 * probing driver again.
 */
-   if 
(!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
+   if (!amdgpu_ras_is_rma(tmp_adev)) {
/* must succeed. */
amdgpu_ras_resume(tmp_adev);
} else {
-- 
2.34.1

[PATCH 3/3] drm/amdgpu: report bad status in GPU recovery

2024-08-01 Thread Tao Zhou

Instead of printing GPU reset failed.

v2: add check for reset_context->src.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5d49f70704c6..7b21243c7c55 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5921,8 +5921,14 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
tmp_adev->asic_reset_res = 0;
 
if (r) {
-   /* bad news, how to tell it to userspace ? */
-   dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 
atomic_read(&tmp_adev->gpu_reset_counter));
+   /* bad news, how to tell it to userspace ?
+* for ras error, we should report GPU bad status 
instead of
+* reset failure
+*/
+   if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
+   !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
+   dev_info(tmp_adev->dev, "GPU reset(%d) 
failed\n",
+   
atomic_read(&tmp_adev->gpu_reset_counter));
amdgpu_vf_error_put(tmp_adev, 
AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
} else {
dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", 
atomic_read(&tmp_adev->gpu_reset_counter));
-- 
2.34.1

[PATCH 1/3] drm/amdgpu: create function to check RAS RMA status

2024-08-01 Thread Tao Zhou

For the convenience of calling it globally.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 22 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c |  2 +-
 4 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 12ab48f26bd5..0941518f04c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2153,7 +2153,7 @@ static void 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
/* gpu reset is fallback for failed and default cases.
 * For RMA case, amdgpu_umc_poison_handler will handle gpu reset.
 */
-   if (poison_stat && !con->is_rma) {
+   if (poison_stat && !amdgpu_ras_is_rma(adev)) {
event_id = amdgpu_ras_acquire_event_id(adev, type);
RAS_EVENT_LOG(adev, event_id,
  "GPU reset for %s RAS poison consumption is 
issued!\n",
@@ -2951,7 +2951,7 @@ static void amdgpu_ras_do_page_retirement(struct 
work_struct *work)
 
amdgpu_ras_error_data_fini(&err_data);
 
-   if (err_cnt && con->is_rma)
+   if (err_cnt && amdgpu_ras_is_rma(adev))
amdgpu_ras_reset_gpu(adev);
 
amdgpu_ras_schedule_retirement_dwork(con,
@@ -3053,7 +3053,7 @@ static int amdgpu_ras_poison_consumption_handler(struct 
amdgpu_device *adev,
}
 
/* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
-   if (reset_flags && !con->is_rma) {
+   if (reset_flags && !amdgpu_ras_is_rma(adev)) {
if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET)
reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET)
@@ -3202,7 +3202,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 * This calling fails when is_rma is true or
 * ret != 0.
 */
-   if (con->is_rma || ret)
+   if (amdgpu_ras_is_rma(adev) || ret)
goto free;
 
if (con->eeprom_control.ras_num_recs) {
@@ -3254,7 +3254,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 * Except error threshold exceeding case, other failure cases in this
 * function would not fail amdgpu driver init.
 */
-   if (!con->is_rma)
+   if (!amdgpu_ras_is_rma(adev))
ret = 0;
else
ret = -EINVAL;
@@ -4301,7 +4301,7 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
/* mode1 is the only selection for RMA status */
-   if (ras->is_rma) {
+   if (amdgpu_ras_is_rma(adev)) {
ras->gpu_reset_flags = 0;
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
}
@@ -4835,3 +4835,13 @@ void amdgpu_ras_event_log_print(struct amdgpu_device 
*adev, u64 event_id,
 
va_end(args);
 }
+
+bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   if (!con)
+   return false;
+
+   return con->is_rma;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 7ddd13d5c06b..25a19760f098 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -972,4 +972,5 @@ __printf(3, 4)
 void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
const char *fmt, ...);
 
+bool amdgpu_ras_is_rma(struct amdgpu_device *adev);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 32be258d81e1..9e70a7b3aa64 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -196,7 +196,7 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
amdgpu_umc_handle_bad_pages(adev, ras_error_status);
 
if ((err_data->ue_count || err_data->de_count) &&
-   (reset || (con && con->is_rma))) {
+   (reset || amdgpu_ras_is_rma(adev))) {
con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 9cd221ed240c..999bb3cc88b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -97,7 +97,7 @@ static int gfx_v11_0_3_poison_consumption_handler(struct 
amdgpu_device *adev,
ras->gpu_reset_flags |= 
AMDGPU_RAS_GPU

[PATCH] drm/amdgpu: report bad status in GPU recovery

2024-07-31 Thread Tao Zhou

Instead of printing GPU reset failed.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 355c2478c4b6..b7c967779b4b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5933,8 +5933,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
tmp_adev->asic_reset_res = 0;
 
if (r) {
-   /* bad news, how to tell it to userspace ? */
-   dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 
atomic_read(&tmp_adev->gpu_reset_counter));
+   /* bad news, how to tell it to userspace ?
+* for ras error, we should report GPU bad status 
instead of
+* reset failure
+*/
+   if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
+   dev_info(tmp_adev->dev, "GPU reset(%d) 
failed\n",
+   
atomic_read(&tmp_adev->gpu_reset_counter));
amdgpu_vf_error_put(tmp_adev, 
AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
} else {
dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", 
atomic_read(&tmp_adev->gpu_reset_counter));
-- 
2.34.1

[PATCH] drm/amdkfd: add ASIC version check for the reset selection of RAS poison

2024-06-13 Thread Tao Zhou

GFX v9.4.3 uses mode1 reset, other ASICs choose mode2.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 78dde62fb04a..816800555f7f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -164,7 +164,10 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2:
block = AMDGPU_RAS_BLOCK__GFX;
-   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 
4, 3))
+   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   else
+   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
case SOC15_IH_CLIENTID_VMC:
case SOC15_IH_CLIENTID_VMC1:
@@ -177,7 +180,10 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4:
block = AMDGPU_RAS_BLOCK__SDMA;
-   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 
4, 3))
+   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   else
+   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
default:
dev_warn(dev->adev->dev,
-- 
2.34.1

[PATCH] drm/amdkfd: use mode1 reset for RAS poison consumption

2024-06-12 Thread Tao Zhou

Per FW requirement, replace mode2 with mode1.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index e1c21d250611..78dde62fb04a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -164,7 +164,7 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2:
block = AMDGPU_RAS_BLOCK__GFX;
-   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
break;
case SOC15_IH_CLIENTID_VMC:
case SOC15_IH_CLIENTID_VMC1:
@@ -177,7 +177,7 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4:
block = AMDGPU_RAS_BLOCK__SDMA;
-   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
break;
default:
dev_warn(dev->adev->dev,
-- 
2.34.1

[PATCH 2/2] drm/amd/pm: update check condition for SMU mode1 reset

2024-06-06 Thread Tao Zhou

The fed status does indicate RAS fatal error.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c   | 2 +-
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 2 +-
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 04533f99f1e3..2c35eb31475a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1876,7 +1876,7 @@ static int aldebaran_mode1_reset(struct smu_context *smu)
/* fatal error triggered by ras, PMFW supports the flag
   from 68.44.0 */
if ((smu->smc_fw_version >= 0x00442c00) &&
-   amdgpu_ras_in_recovery(adev))
+   amdgpu_ras_get_fed_status(adev))
fatal_err = 1;
 
param |= (fatal_err << 16);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index d1766a603bb9..c9639141792f 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -2788,7 +2788,7 @@ static void smu_v13_0_0_set_mode1_reset_param(struct 
smu_context *smu,
struct amdgpu_device *adev = smu->adev;
 
if ((smu->smc_fw_version >= supported_version) &&
-   amdgpu_ras_in_recovery(adev))
+   amdgpu_ras_get_fed_status(adev))
/* Set RAS fatal error reset flag */
*param = 1 << 16;
else
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index c1d7528a6dc8..7fda7196fa7c 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2580,7 +2580,7 @@ static int smu_v13_0_6_mode1_reset(struct smu_context 
*smu)
param = SMU_RESET_MODE_1;
 
/* fatal error triggered by ras, PMFW supports the flag */
-   if (amdgpu_ras_in_recovery(adev))
+   if (amdgpu_ras_get_fed_status(adev))
fatal_err = 1;
 
param |= (fatal_err << 16);
-- 
2.34.1

[PATCH 1/2] drm/amdgpu: set RAS fed status for more cases

2024-06-06 Thread Tao Zhou

Indicate fatal error for each RAS block and NBIO.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 +
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 77566dcc0852..183eae22b687 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2114,6 +2114,7 @@ static void amdgpu_ras_interrupt_umc_handler(struct 
ras_manager *obj,
/* Let IP handle its data, maybe we need get the output
 * from the callback to update the error type/count, etc
 */
+   amdgpu_ras_set_fed(obj->adev, true);
ret = data->cb(obj->adev, &err_data, entry);
/* ue will trigger an interrupt, and in that case
 * we need do a reset to recovery the whole system.
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index b8fc9e126e0d..9446bf6f82c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -414,6 +414,7 @@ static void 
nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
/* ras_controller_int is dedicated for nbif ras error,
 * not the global interrupt for sync flood
 */
+   amdgpu_ras_set_fed(adev, true);
amdgpu_ras_reset_gpu(adev);
}
 
-- 
2.34.1

[PATCH 4/5] drm/amdgpu: set fatal flag for RAS recovery

2024-05-31 Thread Tao Zhou

PMFW needs the flag to know the reason of mode1.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 10 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  |  6 +++---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   |  2 +-
 7 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index fb5fc1fe6ad0..f55bff59052f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -940,7 +940,7 @@ int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device 
*adev,
if (adev->gfx.ras && adev->gfx.ras->ras_block.hw_ops &&
adev->gfx.ras->ras_block.hw_ops->query_ras_error_count)

adev->gfx.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);
-   amdgpu_ras_reset_gpu(adev);
+   amdgpu_ras_reset_gpu(adev, true);
}
return AMDGPU_RAS_SUCCESS;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ff2d34dc9718..2071e30d7e56 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2070,7 +2070,7 @@ static void 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
if (poison_stat && !con->is_rma) {
dev_info(adev->dev, "GPU reset for %s RAS poison consumption is 
issued!\n",
block_obj->ras_comm.name);
-   amdgpu_ras_reset_gpu(adev);
+   amdgpu_ras_reset_gpu(adev, false);
}
 
if (!poison_stat)
@@ -2825,7 +2825,7 @@ static void amdgpu_ras_do_page_retirement(struct 
work_struct *work)
amdgpu_ras_error_data_fini(&err_data);
 
if (err_cnt && con->is_rma)
-   amdgpu_ras_reset_gpu(adev);
+   amdgpu_ras_reset_gpu(adev, false);
 
mutex_lock(&con->umc_ecc_log.lock);
if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
@@ -2888,7 +2888,7 @@ static int amdgpu_ras_poison_consumption_handler(struct 
amdgpu_device *adev,
flush_delayed_work(&con->page_retirement_dwork);
 
con->gpu_reset_flags |= reset;
-   amdgpu_ras_reset_gpu(adev);
+   amdgpu_ras_reset_gpu(adev, false);
}
 
return 0;
@@ -3815,7 +3815,7 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
 
amdgpu_ras_set_fed(adev, true);
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
-   amdgpu_ras_reset_gpu(adev);
+   amdgpu_ras_reset_gpu(adev, true);
}
 }
 
@@ -3996,7 +3996,7 @@ int amdgpu_ras_is_supported(struct amdgpu_device *adev,
return ret;
 }
 
-int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, bool fatal)
 {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 37e1c93c243d..ed5793458a70 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -878,7 +878,7 @@ bool amdgpu_ras_is_poison_mode_supported(struct 
amdgpu_device *adev);
 
 int amdgpu_ras_is_supported(struct amdgpu_device *adev, unsigned int block);
 
-int amdgpu_ras_reset_gpu(struct amdgpu_device *adev);
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, bool fatal);
 
 struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 151f83ea803b..f976b6deb42d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -129,7 +129,7 @@ int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device 
*adev,
if (amdgpu_sriov_vf(adev))
return AMDGPU_RAS_SUCCESS;
 
-   amdgpu_ras_reset_gpu(adev);
+   amdgpu_ras_reset_gpu(adev, true);
 
return AMDGPU_RAS_SUCCESS;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 4a72ff8d8d80..2596a1c2a64e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -198,7 +198,7 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
if ((err_data->ue_count || err_data->de_count) &&
(reset || (con && con->is_rma))) {
con->gpu_reset_flags |= reset;
-   amdgpu_ras_reset_gpu(adev);
+   amdgpu_ras_reset_gpu(adev, fa

[PATCH 1/5] drm/amdgpu: add RAS is_rma flag

2024-05-31 Thread Tao Zhou

Set the flag to true if bad page number reaches threshold.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c|  7 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h |  3 +--
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2dc47475b8e9..616dc2387f34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2940,7 +2940,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
u32  max_eeprom_records_count = 0;
-   bool exc_err_limit = false;
int ret;
 
if (!con || amdgpu_sriov_vf(adev))
@@ -2977,12 +2976,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 */
if (adev->gmc.xgmi.pending_reset)
return 0;
-   ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
+   ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
/*
 * This calling fails when exc_err_limit is true or
 * ret != 0.
 */
-   if (exc_err_limit || ret)
+   if (con->is_rma || ret)
goto free;
 
if (con->eeprom_control.ras_num_recs) {
@@ -3033,7 +3032,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 * Except error threshold exceeding case, other failure cases in this
 * function would not fail amdgpu driver init.
 */
-   if (!exc_err_limit)
+   if (!con->is_rma)
ret = 0;
else
ret = -EINVAL;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index d06c01b978cd..437c58c85639 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -521,6 +521,7 @@ struct amdgpu_ras {
bool update_channel_flag;
/* Record status of smu mca debug mode */
bool is_aca_debug_mode;
+   bool is_rma;
 
/* Record special requirements of gpu reset caller */
uint32_t  gpu_reset_flags;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 9b789dcc2bd1..eae0a555df3c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct 
amdgpu_ras_eeprom_control *control)
control->tbl_rai.health_percent = 0;
}
 
+   if (amdgpu_bad_page_threshold != -1)
+   ras->is_rma = true;
+
/* ignore the -ENOTSUPP return value */
amdgpu_dpm_send_rma_reason(adev);
}
@@ -1321,8 +1324,7 @@ static int __read_table_ras_info(struct 
amdgpu_ras_eeprom_control *control)
return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
 }
 
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
-  bool *exceed_err_limit)
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
 {
struct amdgpu_device *adev = to_amdgpu_device(control);
unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
@@ -1330,7 +1332,7 @@ int amdgpu_ras_eeprom_init(struct 
amdgpu_ras_eeprom_control *control,
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
int res;
 
-   *exceed_err_limit = false;
+   ras->is_rma = false;
 
if (!__is_ras_eeprom_supported(adev))
return 0;
@@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct 
amdgpu_ras_eeprom_control *control,
dev_warn(adev->dev, "GPU will be initialized 
due to bad_page_threshold = -1.");
res = 0;
} else {
-   *exceed_err_limit = true;
+   ras->is_rma = true;
dev_err(adev->dev,
"RAS records:%d exceed threshold:%d, "
"GPU will not be initialized. Replace 
this GPU or increase the threshold",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index 6dfd667f3013..b9ebda577797 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -129,8 +129,7 @@ struct eeprom_table_record {
unsigned char mcumc_id;
 } __packed;
 
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
-  bool *exceed_err_limit);
+int amdgpu_ras_eeprom_init(struct am

[PATCH 5/5] drm/amdgpu: add ras fatal flag to distinguish fatal error reset

2024-05-31 Thread Tao Zhou

Check it in mode1 reset.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 32 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h  |  1 +
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c|  2 +-
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c  |  2 +-
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  |  2 +-
 6 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2071e30d7e56..97b770ba6424 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2451,6 +2451,26 @@ bool amdgpu_ras_in_recovery(struct amdgpu_device *adev)
return false;
 }
 
+bool amdgpu_ras_in_fatal(struct amdgpu_device *adev)
+{
+   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+   int hive_ras_fatal = 0;
+
+   if (!amdgpu_ras_in_recovery(adev))
+   return false;
+
+   if (hive) {
+   hive_ras_fatal = atomic_read(&hive->ras_fatal);
+   amdgpu_put_xgmi_hive(hive);
+   }
+
+   if (ras && (atomic_read(&ras->in_fatal) || hive_ras_fatal))
+   return true;
+
+   return false;
+}
+
 static void amdgpu_ras_do_recovery(struct work_struct *work)
 {
struct amdgpu_ras *ras =
@@ -2462,6 +2482,8 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
 
if (hive) {
atomic_set(&hive->ras_recovery, 1);
+   if (atomic_read(&ras->in_fatal))
+   atomic_set(&hive->ras_fatal, 1);
 
/* If any device which is part of the hive received RAS fatal
 * error interrupt, set fatal error status on all. This
@@ -2526,8 +2548,10 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
}
atomic_set(&ras->in_recovery, 0);
+   atomic_set(&ras->in_fatal, 0);
if (hive) {
atomic_set(&hive->ras_recovery, 0);
+   atomic_set(&hive->ras_fatal, 0);
amdgpu_put_xgmi_hive(hive);
}
 }
@@ -2982,6 +3006,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
mutex_init(&con->recovery_lock);
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
atomic_set(&con->in_recovery, 0);
+   atomic_set(&con->in_fatal, 0);
con->eeprom_control.bad_channel_bitmap = 0;
 
max_eeprom_records_count = 
amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
@@ -4006,8 +4031,13 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, 
bool fatal)
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
}
 
-   if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
+   if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) {
+   if (fatal)
+   atomic_set(&ras->in_fatal, 1);
+
amdgpu_reset_domain_schedule(ras->adev->reset_domain, 
&ras->recovery_work);
+   }
+
return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index ed5793458a70..444a7fb7fbe3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -489,6 +489,7 @@ struct amdgpu_ras {
/* gpu recovery */
struct work_struct recovery_work;
atomic_t in_recovery;
+   atomic_t in_fatal;
struct amdgpu_device *adev;
/* error handler data */
struct ras_err_handler_data *eh_data;
@@ -953,6 +954,7 @@ int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
pasid_notify pasid_fn, void *data, uint32_t reset);
 
 bool amdgpu_ras_in_recovery(struct amdgpu_device *adev);
+bool amdgpu_ras_in_fatal(struct amdgpu_device *adev);
 
 __printf(3, 4)
 void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index a3bfc16de6d4..a6d6272a4ec6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -44,6 +44,7 @@ struct amdgpu_hive_info {
 
struct amdgpu_reset_domain *reset_domain;
atomic_t ras_recovery;
+   atomic_t ras_fatal;
struct ras_event_manager event_mgr;
 };
 
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 04533f99f1e3..a850e7b29d9d 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1876,7 +1876,7 @@ static int aldebaran_mode1_reset(str

[PATCH 2/5] drm/amdgpu: trigger mode1 reset for RAS RMA status

2024-05-31 Thread Tao Zhou

Check RMA status in bad page retirement flow.

v2: fix coding bugs in v1.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 28 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  |  8 +++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c |  4 +++-
 3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 616dc2387f34..10cbcc0d1a1a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2049,8 +2049,9 @@ static void 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
struct amdgpu_device *adev = obj->adev;
struct amdgpu_ras_block_object *block_obj =
amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-   if (!block_obj)
+   if (!block_obj || !con)
return;
 
/* both query_poison_status and handle_poison_consumption are optional,
@@ -2073,14 +2074,17 @@ static void 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
poison_stat = 
block_obj->hw_ops->handle_poison_consumption(adev);
 
-   /* gpu reset is fallback for failed and default cases */
-   if (poison_stat) {
+   /* gpu reset is fallback for failed and default cases.
+* For RMA case, amdgpu_umc_poison_handler will handle gpu reset.
+*/
+   if (poison_stat && !con->is_rma) {
dev_info(adev->dev, "GPU reset for %s RAS poison consumption is 
issued!\n",
block_obj->ras_comm.name);
amdgpu_ras_reset_gpu(adev);
-   } else {
-   amdgpu_gfx_poison_consumption_handler(adev, entry);
}
+
+   if (!poison_stat)
+   amdgpu_gfx_poison_consumption_handler(adev, entry);
 }
 
 static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager 
*obj,
@@ -2801,6 +2805,7 @@ static void amdgpu_ras_do_page_retirement(struct 
work_struct *work)
  page_retirement_dwork.work);
struct amdgpu_device *adev = con->adev;
struct ras_err_data err_data;
+   unsigned long err_cnt;
 
if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery))
return;
@@ -2808,9 +2813,13 @@ static void amdgpu_ras_do_page_retirement(struct 
work_struct *work)
amdgpu_ras_error_data_init(&err_data);
 
amdgpu_umc_handle_bad_pages(adev, &err_data);
+   err_cnt = err_data.err_addr_cnt;
 
amdgpu_ras_error_data_fini(&err_data);
 
+   if (err_cnt && con->is_rma)
+   amdgpu_ras_reset_gpu(adev);
+
mutex_lock(&con->umc_ecc_log.lock);
if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
UMC_ECC_NEW_DETECTED_TAG))
@@ -2867,7 +2876,8 @@ static int amdgpu_ras_poison_consumption_handler(struct 
amdgpu_device *adev,
if (poison_msg->pasid_fn)
poison_msg->pasid_fn(adev, pasid, poison_msg->data);
 
-   if (reset) {
+   /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
+   if (reset && !con->is_rma) {
flush_delayed_work(&con->page_retirement_dwork);
 
con->gpu_reset_flags |= reset;
@@ -3983,6 +3993,12 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
 {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
+   /* mode1 is the only selection for RMA status */
+   if (ras->is_rma) {
+   ras->gpu_reset_flags = 0;
+   ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   }
+
if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
amdgpu_reset_domain_schedule(ras->adev->reset_domain, 
&ras->recovery_work);
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 1dbe69eabb9a..4a72ff8d8d80 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -195,7 +195,8 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
amdgpu_umc_handle_bad_pages(adev, ras_error_status);
 
-   if (err_data->ue_count && reset) {
+   if ((err_data->ue_count || err_data->de_count) &&
+   (reset || (con && con->is_rma))) {
con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
}
@@ -211,6 +212,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct 
amdgpu_device *adev,
.block

[PATCH 3/5] drm/amdgpu: create amdgpu_ras_in_recovery to simplify code

2024-05-31 Thread Tao Zhou

Reduce redundant code so that callers don't need to pay attention to RAS
details.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 13 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   | 14 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 31 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  2 ++
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c|  5 ++-
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c  |  3 +-
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 12 +--
 7 files changed, 29 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2ba8c4d5dc76..1811c7ba9bdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6325,20 +6325,11 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev 
*pdev)
struct amdgpu_reset_context reset_context;
u32 memsize;
struct list_head device_list;
-   struct amdgpu_hive_info *hive;
-   int hive_ras_recovery = 0;
-   struct amdgpu_ras *ras;
 
/* PCI error slot reset should be skipped During RAS recovery */
-   hive = amdgpu_get_xgmi_hive(adev);
-   if (hive) {
-   hive_ras_recovery = atomic_read(&hive->ras_recovery);
-   amdgpu_put_xgmi_hive(hive);
-   }
-   ras = amdgpu_ras_get_context(adev);
if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
-amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
-   ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
+   amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
+   amdgpu_ras_in_recovery(adev))
return PCI_ERS_RESULT_RECOVERED;
 
DRM_INFO("PCI error: slot reset callback!!\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 68505eaa92f9..fb5fc1fe6ad0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -505,9 +505,6 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int 
xcc_id)
 {
struct amdgpu_kiq *kiq = &adev->gfx.kiq[xcc_id];
struct amdgpu_ring *kiq_ring = &kiq->ring;
-   struct amdgpu_hive_info *hive;
-   struct amdgpu_ras *ras;
-   int hive_ras_recovery = 0;
int i, r = 0;
int j;
 
@@ -532,16 +529,9 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int 
xcc_id)
 * This is workaround: only skip kiq_ring test
 * during ras recovery in suspend stage for gfx9.4.3
 */
-   hive = amdgpu_get_xgmi_hive(adev);
-   if (hive) {
-   hive_ras_recovery = atomic_read(&hive->ras_recovery);
-   amdgpu_put_xgmi_hive(hive);
-   }
-
-   ras = amdgpu_ras_get_context(adev);
if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
-amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
-   ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) {
+   amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
+   amdgpu_ras_in_recovery(adev)) {
spin_unlock(&kiq->ring_lock);
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 10cbcc0d1a1a..ff2d34dc9718 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1409,11 +1409,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
enum amdgpu_ras_block block)
 {
struct amdgpu_ras_block_object *block_obj = 
amdgpu_ras_get_ras_block(adev, block, 0);
-   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
-   struct amdgpu_hive_info *hive;
-   int hive_ras_recovery = 0;
 
if (!block_obj || !block_obj->hw_ops) {
dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
@@ -1425,15 +1422,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
!amdgpu_ras_get_aca_debug_mode(adev))
return -EOPNOTSUPP;
 
-   hive = amdgpu_get_xgmi_hive(adev);
-   if (hive) {
-   hive_ras_recovery = atomic_read(&hive->ras_recovery);
-   amdgpu_put_xgmi_hive(hive);
-   }
-
/* skip ras error reset in gpu reset */
-   if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) ||
-   hive_ras_recovery) &&
+   if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) &&
((smu_funcs && smu_fu

[PATCH 2/2] drm/amdgpu: trigger mode1 reset for RAS RMA status

2024-05-23 Thread Tao Zhou

Check RMA status in bad page retirement flow.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c |  7 +++
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 934dfb2bf9e5..a6da44ac3fbd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2049,8 +2049,9 @@ static void 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
struct amdgpu_device *adev = obj->adev;
struct amdgpu_ras_block_object *block_obj =
amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-   if (!block_obj)
+   if (!block_obj || !con)
return;
 
/* both query_poison_status and handle_poison_consumption are optional,
@@ -2074,7 +2075,7 @@ static void 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
poison_stat = 
block_obj->hw_ops->handle_poison_consumption(adev);
 
/* gpu reset is fallback for failed and default cases */
-   if (poison_stat) {
+   if (poison_stat || con->is_rma) {
dev_info(adev->dev, "GPU reset for %s RAS poison consumption is 
issued!\n",
block_obj->ras_comm.name);
amdgpu_ras_reset_gpu(adev);
@@ -2817,6 +2818,9 @@ static void amdgpu_ras_do_page_retirement(struct 
work_struct *work)
schedule_delayed_work(&con->page_retirement_dwork,
msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL));
mutex_unlock(&con->umc_ecc_log.lock);
+
+   if (err_data->err_addr_cnt && con->is_rma)
+   amdgpu_ras_reset_gpu(adev);
 }
 
 static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
@@ -2867,7 +2871,7 @@ static int amdgpu_ras_poison_consumption_handler(struct 
amdgpu_device *adev,
if (poison_msg->pasid_fn)
poison_msg->pasid_fn(adev, pasid, poison_msg->data);
 
-   if (reset) {
+   if (reset && !con->is_rma) {
flush_delayed_work(&con->page_retirement_dwork);
 
con->gpu_reset_flags |= reset;
@@ -3983,6 +3987,12 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
 {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
+   /* mode1 is the only selection for RMA status */
+   if (ras->is_rma) {
+   ras->gpu_reset_flags = 0;
+   ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   }
+
if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
amdgpu_reset_domain_schedule(ras->adev->reset_domain, 
&ras->recovery_work);
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 1dbe69eabb9a..5f3866548cb8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -195,7 +195,7 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
amdgpu_umc_handle_bad_pages(adev, ras_error_status);
 
-   if (err_data->ue_count && reset) {
+   if ((err_data->ue_count && (reset || con->is_rma)) {
con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
}
@@ -211,6 +211,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct 
amdgpu_device *adev,
.block = AMDGPU_RAS_BLOCK__UMC,
};
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
uint32_t timeout = timeout_ms;
 
memset(&err_data, 0, sizeof(err_data));
@@ -243,9 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct 
amdgpu_device *adev,
 
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 
-   if (reset) {
-   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-
+   if (reset || (err_data.err_addr_cnt && con->is_rma) {
con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
}
-- 
2.34.1

[PATCH 1/2] drm/amdgpu: add RAS is_rma flag

2024-05-23 Thread Tao Zhou

Set the flag to true if bad page number reaches threshold.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c|  7 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h |  3 +--
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ecce022c657b..934dfb2bf9e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2940,7 +2940,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
u32  max_eeprom_records_count = 0;
-   bool exc_err_limit = false;
int ret;
 
if (!con || amdgpu_sriov_vf(adev))
@@ -2977,12 +2976,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 */
if (adev->gmc.xgmi.pending_reset)
return 0;
-   ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
+   ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
/*
 * This calling fails when exc_err_limit is true or
 * ret != 0.
 */
-   if (exc_err_limit || ret)
+   if (con->is_rma || ret)
goto free;
 
if (con->eeprom_control.ras_num_recs) {
@@ -3033,7 +3032,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 * Except error threshold exceeding case, other failure cases in this
 * function would not fail amdgpu driver init.
 */
-   if (!exc_err_limit)
+   if (!con->is_rma)
ret = 0;
else
ret = -EINVAL;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index d06c01b978cd..437c58c85639 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -521,6 +521,7 @@ struct amdgpu_ras {
bool update_channel_flag;
/* Record status of smu mca debug mode */
bool is_aca_debug_mode;
+   bool is_rma;
 
/* Record special requirements of gpu reset caller */
uint32_t  gpu_reset_flags;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 9b789dcc2bd1..eae0a555df3c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct 
amdgpu_ras_eeprom_control *control)
control->tbl_rai.health_percent = 0;
}
 
+   if (amdgpu_bad_page_threshold != -1)
+   ras->is_rma = true;
+
/* ignore the -ENOTSUPP return value */
amdgpu_dpm_send_rma_reason(adev);
}
@@ -1321,8 +1324,7 @@ static int __read_table_ras_info(struct 
amdgpu_ras_eeprom_control *control)
return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
 }
 
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
-  bool *exceed_err_limit)
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
 {
struct amdgpu_device *adev = to_amdgpu_device(control);
unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
@@ -1330,7 +1332,7 @@ int amdgpu_ras_eeprom_init(struct 
amdgpu_ras_eeprom_control *control,
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
int res;
 
-   *exceed_err_limit = false;
+   ras->is_rma = false;
 
if (!__is_ras_eeprom_supported(adev))
return 0;
@@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct 
amdgpu_ras_eeprom_control *control,
dev_warn(adev->dev, "GPU will be initialized 
due to bad_page_threshold = -1.");
res = 0;
} else {
-   *exceed_err_limit = true;
+   ras->is_rma = true;
dev_err(adev->dev,
"RAS records:%d exceed threshold:%d, "
"GPU will not be initialized. Replace 
this GPU or increase the threshold",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index 6dfd667f3013..b9ebda577797 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -129,8 +129,7 @@ struct eeprom_table_record {
unsigned char mcumc_id;
 } __packed;
 
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
-  bool *exceed_err_limit);
+int amdgpu_ras_eeprom_init(struct am

[PATCH] drm/amdgpu: use u32 for buf size in __amdgpu_eeprom_xfer

2024-05-20 Thread Tao Zhou

And also make sure the value of msg[1].len is in the range of u16.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c
index 09a34c7258e2..35fee3e8cde2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c
@@ -90,7 +90,7 @@
 #define MAKE_I2C_ADDR(_aa) ((0xA << 3) | (((_aa) >> 16) & 0xF))
 
 static int __amdgpu_eeprom_xfer(struct i2c_adapter *i2c_adap, u32 eeprom_addr,
-   u8 *eeprom_buf, u16 buf_size, bool read)
+   u8 *eeprom_buf, u32 buf_size, bool read)
 {
u8 eeprom_offset_buf[EEPROM_OFFSET_SIZE];
struct i2c_msg msgs[] = {
@@ -133,15 +133,15 @@ static int __amdgpu_eeprom_xfer(struct i2c_adapter 
*i2c_adap, u32 eeprom_addr,
 * cycle begins. This is implied for the
 * "i2c_transfer()" abstraction.
 */
-   len = min(EEPROM_PAGE_SIZE - (eeprom_addr &
- EEPROM_PAGE_MASK),
- (u32)buf_size);
+   len = min(EEPROM_PAGE_SIZE - (eeprom_addr & 
EEPROM_PAGE_MASK),
+   buf_size);
} else {
/* Reading from the EEPROM has no limitation
 * on the number of bytes read from the EEPROM
 * device--they are simply sequenced out.
+* Keep in mind that i2c_msg.len is u16 type.
 */
-   len = buf_size;
+   len = min(U16_MAX, buf_size);
}
msgs[1].len = len;
msgs[1].buf = eeprom_buf;
-- 
2.34.1

[PATCH] drm/amdgpu: update type of buf size to u32 for eeprom functions

2024-05-19 Thread Tao Zhou

Avoid overflow issue.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c | 6 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.h | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c
index e71768661ca8..09a34c7258e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c
@@ -179,7 +179,7 @@ static int __amdgpu_eeprom_xfer(struct i2c_adapter 
*i2c_adap, u32 eeprom_addr,
  * Returns the number of bytes read/written; -errno on error.
  */
 static int amdgpu_eeprom_xfer(struct i2c_adapter *i2c_adap, u32 eeprom_addr,
- u8 *eeprom_buf, u16 buf_size, bool read)
+ u8 *eeprom_buf, u32 buf_size, bool read)
 {
const struct i2c_adapter_quirks *quirks = i2c_adap->quirks;
u16 limit;
@@ -225,7 +225,7 @@ static int amdgpu_eeprom_xfer(struct i2c_adapter *i2c_adap, 
u32 eeprom_addr,
 
 int amdgpu_eeprom_read(struct i2c_adapter *i2c_adap,
   u32 eeprom_addr, u8 *eeprom_buf,
-  u16 bytes)
+  u32 bytes)
 {
return amdgpu_eeprom_xfer(i2c_adap, eeprom_addr, eeprom_buf, bytes,
  true);
@@ -233,7 +233,7 @@ int amdgpu_eeprom_read(struct i2c_adapter *i2c_adap,
 
 int amdgpu_eeprom_write(struct i2c_adapter *i2c_adap,
u32 eeprom_addr, u8 *eeprom_buf,
-   u16 bytes)
+   u32 bytes)
 {
return amdgpu_eeprom_xfer(i2c_adap, eeprom_addr, eeprom_buf, bytes,
  false);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.h
index 6935adb2be1f..8083b8253ef4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.h
@@ -28,10 +28,10 @@
 
 int amdgpu_eeprom_read(struct i2c_adapter *i2c_adap,
   u32 eeprom_addr, u8 *eeprom_buf,
-  u16 bytes);
+  u32 bytes);
 
 int amdgpu_eeprom_write(struct i2c_adapter *i2c_adap,
u32 eeprom_addr, u8 *eeprom_buf,
-   u16 bytes);
+   u32 bytes);
 
 #endif
-- 
2.34.1

[PATCH] drm/amdgpu: retire UMC v12 mca_addr_to_pa

2024-04-02 Thread Tao Zhou

RAS TA will handle it, so the interface is useless.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |   1 -
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 105 ++---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h |  62 +--
 3 files changed, 7 insertions(+), 161 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 4ba26d7e52bd..afae497cbf40 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1460,7 +1460,6 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device 
*adev)
adev->umc.channel_offs = UMC_V12_0_PER_CHANNEL_OFFSET;
adev->umc.active_mask = adev->aid_mask;
adev->umc.retire_unit = UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL;
-   adev->umc.channel_idx_tbl = &umc_v12_0_channel_idx_tbl[0][0][0];
if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
adev->umc.ras = &umc_v12_0_ras;
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index f46a176f9b55..a0122b22eda4 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -28,28 +28,6 @@
 #include "umc/umc_12_0_0_sh_mask.h"
 #include "mp/mp_13_0_6_sh_mask.h"
 
-const uint32_t
-   umc_v12_0_channel_idx_tbl[]
-   [UMC_V12_0_UMC_INSTANCE_NUM]
-   [UMC_V12_0_CHANNEL_INSTANCE_NUM] = {
-   {{3,   7,   11,  15,  2,   6,   10,  14},  {1,   5,   9,   13,  
0,   4,   8,   12},
-{19,  23,  27,  31,  18,  22,  26,  30},  {17,  21,  25,  29,  
16,  20,  24,  28}},
-   {{47,  43,  39,  35,  46,  42,  38,  34},  {45,  41,  37,  33,  
44,  40,  36,  32},
-{63,  59,  55,  51,  62,  58,  54,  50},  {61,  57,  53,  49,  
60,  56,  52,  48}},
-   {{79,  75,  71,  67,  78,  74,  70,  66},  {77,  73,  69,  65,  
76,  72,  68,  64},
-{95,  91,  87,  83,  94,  90,  86,  82},  {93,  89,  85,  81,  
92,  88,  84,  80}},
-   {{99,  103, 107, 111, 98,  102, 106, 110}, {97,  101, 105, 109, 
96,  100, 104, 108},
-{115, 119, 123, 127, 114, 118, 122, 126}, {113, 117, 121, 125, 
112, 116, 120, 124}}
-   };
-
-/* mapping of MCA error address to normalized address */
-static const uint32_t umc_v12_0_ma2na_mapping[] = {
-   0,  5,  6,  8,  9,  14, 12, 13,
-   10, 11, 15, 16, 17, 18, 19, 20,
-   21, 22, 23, 24, 25, 26, 27, 28,
-   24, 7,  29, 30,
-};
-
 static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
uint32_t node_inst,
uint32_t umc_inst,
@@ -192,79 +170,6 @@ static void umc_v12_0_query_ras_error_count(struct 
amdgpu_device *adev,
umc_v12_0_reset_error_count(adev);
 }
 
-static bool umc_v12_0_bit_wise_xor(uint32_t val)
-{
-   bool result = 0;
-   int i;
-
-   for (i = 0; i < 32; i++)
-   result = result ^ ((val >> i) & 0x1);
-
-   return result;
-}
-
-static void umc_v12_0_mca_addr_to_pa(struct amdgpu_device *adev,
-   uint64_t err_addr, uint32_t ch_inst, 
uint32_t umc_inst,
-   uint32_t node_inst,
-   struct ta_ras_query_address_output 
*addr_out)
-{
-   uint32_t channel_index, i;
-   uint64_t na, soc_pa;
-   uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row;
-   uint32_t bank0, bank1, bank2, bank3, bank;
-
-   bank_hash0 = (err_addr >> UMC_V12_0_MCA_B0_BIT) & 0x1ULL;
-   bank_hash1 = (err_addr >> UMC_V12_0_MCA_B1_BIT) & 0x1ULL;
-   bank_hash2 = (err_addr >> UMC_V12_0_MCA_B2_BIT) & 0x1ULL;
-   bank_hash3 = (err_addr >> UMC_V12_0_MCA_B3_BIT) & 0x1ULL;
-   col = (err_addr >> 1) & 0x1fULL;
-   row = (err_addr >> 10) & 0x3fffULL;
-
-   /* apply bank hash algorithm */
-   bank0 =
-   bank_hash0 ^ (UMC_V12_0_XOR_EN0 &
-   (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR0) ^
-   (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR0;
-   bank1 =
-   bank_hash1 ^ (UMC_V12_0_XOR_EN1 &
-   (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR1) ^
-   (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR1;
-   bank2 =
-   bank_hash2 ^ (UMC_V12_0_XOR_EN2 &
-   (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR2) ^
-   (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR2;
-   bank3 =
-   bank_hash3 ^ (UMC_V12_0_XOR_EN3 &
-   (umc_v12_0_bit_wise_xor(col &

[PATCH] drm/amdgpu: update check condition for XGMI ACA UE

2024-04-01 Thread Tao Zhou

Check more possible ext error codes.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index f4be524b0dc1..be1f4efa9ef6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1066,7 +1066,9 @@ static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle 
*handle, struct aca_ban
 
switch (type) {
case ACA_SMU_TYPE_UE:
-   count = ext_error_code == 0 ? count : 0ULL;
+   if (ext_error_code != 0 && ext_error_code != 9)
+   count = 0ULL;
+
ret = aca_error_cache_log_bank_error(handle, &info, 
ACA_ERROR_TYPE_UE, count);
break;
case ACA_SMU_TYPE_CE:
-- 
2.34.1

[PATCH] drm/amd/pm: update XGMI RAS UE criteria for smu v13.0.6

2024-03-31 Thread Tao Zhou

Add more possible ext error codes.

v2: still use ext error code instead of UC bit.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 443233563a52..7a7c7f4b7de3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2694,7 +2694,8 @@ static int mca_pcs_xgmi_mca_get_err_count(const struct 
mca_ras_info *mca_ras, st
ext_error_code = 
MCA_REG__STATUS__ERRORCODEEXT(entry->regs[MCA_REG_IDX_STATUS]);
err_cnt = MCA_REG__MISC0__ERRCNT(entry->regs[MCA_REG_IDX_MISC0]);
 
-   if (type == AMDGPU_MCA_ERROR_TYPE_UE && ext_error_code == 0)
+   if (type == AMDGPU_MCA_ERROR_TYPE_UE &&
+   (ext_error_code == 0 || ext_error_code == 9))
*count = err_cnt;
else if (type == AMDGPU_MCA_ERROR_TYPE_CE && ext_error_code == 6)
*count = err_cnt;
-- 
2.34.1

[PATCH] drm/amd/pm: update XGMI RAS UC criteria for smu v13.0.6

2024-03-31 Thread Tao Zhou

Check UC bit instead of ext error code.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 443233563a52..027bbebbf28e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2689,12 +2689,13 @@ static int mca_pcs_xgmi_mca_get_err_count(const struct 
mca_ras_info *mca_ras, st
  uint32_t *count)
 {
u32 ext_error_code;
-   u32 err_cnt;
+   u32 err_cnt, uc;
 
ext_error_code = 
MCA_REG__STATUS__ERRORCODEEXT(entry->regs[MCA_REG_IDX_STATUS]);
err_cnt = MCA_REG__MISC0__ERRCNT(entry->regs[MCA_REG_IDX_MISC0]);
+   uc = MCA_REG__STATUS__UC(entry->regs[MCA_REG_IDX_STATUS]);
 
-   if (type == AMDGPU_MCA_ERROR_TYPE_UE && ext_error_code == 0)
+   if (type == AMDGPU_MCA_ERROR_TYPE_UE && uc)
*count = err_cnt;
else if (type == AMDGPU_MCA_ERROR_TYPE_CE && ext_error_code == 6)
*count = err_cnt;
-- 
2.34.1

[PATCH] drm/amdgpu: implement IRQ_STATE_ENABLE for SDMA v4.4.2

2024-03-28 Thread Tao Zhou

SDMA_CNTL is not set in some cases, so the driver configures it by itself.

v2: simplify code

Signed-off-by: Tao Zhou 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 16 +++-
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 71c2f50530cb..f8e2cd514493 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1602,19 +1602,9 @@ static int sdma_v4_4_2_set_ecc_irq_state(struct 
amdgpu_device *adev,
u32 sdma_cntl;
 
sdma_cntl = RREG32_SDMA(type, regSDMA_CNTL);
-   switch (state) {
-   case AMDGPU_IRQ_STATE_DISABLE:
-   sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL,
- DRAM_ECC_INT_ENABLE, 0);
-   WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl);
-   break;
-   /* sdma ecc interrupt is enabled by default
-* driver doesn't need to do anything to
-* enable the interrupt */
-   case AMDGPU_IRQ_STATE_ENABLE:
-   default:
-   break;
-   }
+   sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL, DRAM_ECC_INT_ENABLE,
+   state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 
0);
+   WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl);
 
return 0;
 }
-- 
2.34.1

[PATCH] drm/amdgpu: implement IRQ_STATE_ENABLE for SDMA v4.4.2

2024-03-28 Thread Tao Zhou

SDMA_CNTL is not set in some cases, so the driver configures it by itself.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 71c2f50530cb..d10ae4ce5ddd 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1608,10 +1608,11 @@ static int sdma_v4_4_2_set_ecc_irq_state(struct 
amdgpu_device *adev,
  DRAM_ECC_INT_ENABLE, 0);
WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl);
break;
-   /* sdma ecc interrupt is enabled by default
-* driver doesn't need to do anything to
-* enable the interrupt */
case AMDGPU_IRQ_STATE_ENABLE:
+   sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL,
+ DRAM_ECC_INT_ENABLE, 1);
+   WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl);
+   break;
default:
break;
}
-- 
2.34.1

[PATCH 1/2] drm/amdgpu: add socket id parameter for psp query address cmd

2024-03-20 Thread Tao Zhou

And set the socket id.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/ta_ras_if.h |  1 +
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 14 +++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h 
b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
index 056d4df8fa1f..3ac56a9645eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
+++ b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
@@ -146,6 +146,7 @@ struct ta_ras_mca_addr {
uint32_t ch_inst;
uint32_t umc_inst;
uint32_t node_inst;
+   uint32_t socket_id;
 };
 
 struct ta_ras_phy_addr {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 77af4e25ff46..0a9cc87e98d0 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -268,7 +268,7 @@ static void umc_v12_0_mca_addr_to_pa(struct amdgpu_device 
*adev,
 static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, 
uint64_t err_addr,
uint32_t ch_inst, uint32_t umc_inst,
-   uint32_t node_inst)
+   uint32_t node_inst, uint32_t 
socket_id)
 {
uint32_t col, row, row_xor, bank, channel_index;
uint64_t soc_pa, retired_page, column;
@@ -280,6 +280,7 @@ static void umc_v12_0_convert_error_address(struct 
amdgpu_device *adev,
addr_in.ma.ch_inst = ch_inst;
addr_in.ma.umc_inst = umc_inst;
addr_in.ma.node_inst = node_inst;
+   addr_in.ma.socket_id = socket_id;
 
if (psp_ras_query_address(&adev->psp, &addr_in, &addr_out))
/* fallback to old path if fail to get pa from psp */
@@ -331,6 +332,7 @@ static int umc_v12_0_query_error_address(struct 
amdgpu_device *adev,
struct ras_err_data *err_data = (struct ras_err_data *)data;
uint64_t umc_reg_offset =
get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
+   uint32_t socket_id = 0;
 
mc_umc_status_addr =
SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
@@ -357,8 +359,13 @@ static int umc_v12_0_query_error_address(struct 
amdgpu_device *adev,
 
err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, 
ErrorAddr);
 
+   if (!adev->aid_mask &&
+   adev->smuio.funcs &&
+   adev->smuio.funcs->get_socket_id)
+   socket_id = adev->smuio.funcs->get_socket_id(adev);
+
umc_v12_0_convert_error_address(adev, err_data, err_addr,
-   ch_inst, umc_inst, node_inst);
+   ch_inst, umc_inst, node_inst, 
socket_id);
}
 
/* clear umc status */
@@ -450,7 +457,8 @@ static void 
umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
err_data, err_addr,
MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
-   err_info->mcm_info.die_id);
+   err_info->mcm_info.die_id,
+   err_info->mcm_info.socket_id);
}
 
/* Delete error address node from list and free memory 
*/
-- 
2.34.1

[PATCH 2/2] drm/amdgpu: simplify convert_error_address interface for UMC v12

2024-03-20 Thread Tao Zhou

Replace separate parameters with struct ta_ras_query_address_input.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57 ++
 1 file changed, 30 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 0a9cc87e98d0..d0fcfcb3404f 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -266,26 +266,19 @@ static void umc_v12_0_mca_addr_to_pa(struct amdgpu_device 
*adev,
 }
 
 static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
-   struct ras_err_data *err_data, 
uint64_t err_addr,
-   uint32_t ch_inst, uint32_t umc_inst,
-   uint32_t node_inst, uint32_t 
socket_id)
+   struct ras_err_data *err_data,
+   struct ta_ras_query_address_input 
*addr_in)
 {
uint32_t col, row, row_xor, bank, channel_index;
-   uint64_t soc_pa, retired_page, column;
-   struct ta_ras_query_address_input addr_in;
+   uint64_t soc_pa, retired_page, column, err_addr;
struct ta_ras_query_address_output addr_out;
 
-   addr_in.addr_type = TA_RAS_MCA_TO_PA;
-   addr_in.ma.err_addr = err_addr;
-   addr_in.ma.ch_inst = ch_inst;
-   addr_in.ma.umc_inst = umc_inst;
-   addr_in.ma.node_inst = node_inst;
-   addr_in.ma.socket_id = socket_id;
-
-   if (psp_ras_query_address(&adev->psp, &addr_in, &addr_out))
+   err_addr = addr_in->ma.err_addr;
+   addr_in->addr_type = TA_RAS_MCA_TO_PA;
+   if (psp_ras_query_address(&adev->psp, addr_in, &addr_out))
/* fallback to old path if fail to get pa from psp */
-   umc_v12_0_mca_addr_to_pa(adev, err_addr, ch_inst, umc_inst,
-   node_inst, &addr_out);
+   umc_v12_0_mca_addr_to_pa(adev, err_addr, addr_in->ma.ch_inst,
+   addr_in->ma.umc_inst, addr_in->ma.node_inst, 
&addr_out);
 
soc_pa = addr_out.pa.pa;
bank = addr_out.pa.bank;
@@ -310,7 +303,7 @@ static void umc_v12_0_convert_error_address(struct 
amdgpu_device *adev,
"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x 
Bank:0x%x Channel:0x%x\n",
retired_page, row, col, bank, channel_index);
amdgpu_umc_fill_error_record(err_data, err_addr,
-   retired_page, channel_index, umc_inst);
+   retired_page, channel_index, addr_in->ma.umc_inst);
 
/* shift R13 bit */
retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
@@ -318,7 +311,7 @@ static void umc_v12_0_convert_error_address(struct 
amdgpu_device *adev,
"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x 
Bank:0x%x Channel:0x%x\n",
retired_page, row_xor, col, bank, channel_index);
amdgpu_umc_fill_error_record(err_data, err_addr,
-   retired_page, channel_index, umc_inst);
+   retired_page, channel_index, addr_in->ma.umc_inst);
}
 }
 
@@ -326,13 +319,13 @@ static int umc_v12_0_query_error_address(struct 
amdgpu_device *adev,
uint32_t node_inst, uint32_t umc_inst,
uint32_t ch_inst, void *data)
 {
+   struct ras_err_data *err_data = (struct ras_err_data *)data;
+   struct ta_ras_query_address_input addr_in;
uint64_t mc_umc_status_addr;
uint64_t mc_umc_status, err_addr;
uint64_t mc_umc_addrt0;
-   struct ras_err_data *err_data = (struct ras_err_data *)data;
uint64_t umc_reg_offset =
get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
-   uint32_t socket_id = 0;
 
mc_umc_status_addr =
SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
@@ -362,10 +355,16 @@ static int umc_v12_0_query_error_address(struct 
amdgpu_device *adev,
if (!adev->aid_mask &&
adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id)
-   socket_id = adev->smuio.funcs->get_socket_id(adev);
+   addr_in.ma.socket_id = 
adev->smuio.funcs->get_socket_id(adev);
+   else
+   addr_in.ma.socket_id = 0;
+
+   addr_in.ma.err_addr = err_addr;
+   addr_in.ma.ch_inst = ch_inst;
+   addr_in.ma.umc_inst = umc_inst;
+   addr_in.ma.node_inst = node_inst;
 
-   umc_v12_0_convert_error_address(adev, err_data, err_addr,
-   ch_inst, umc_inst, node_inst, 
socke

[PATCH 3/3] drm/amdgpu: make reset method configurable for RAS poison

2024-03-18 Thread Tao Zhou

Each RAS block has a different requirement for GPU reset in poison
consumption handling.
Add support for mmhub RAS poison consumption handling.

v2: remove the mmhub poison support for kfd int v10.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c   | 14 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h   |  4 ++--
 .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c  | 13 +++-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c  |  9 +
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 20 ++-
 8 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 0b4910108f61..66753940bb4d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -760,7 +760,7 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
 }
 
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
-   enum amdgpu_ras_block block, bool reset)
+   enum amdgpu_ras_block block, uint32_t reset)
 {
amdgpu_umc_poison_handler(adev, block, reset);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 03bf20e0e3da..ad50c7bbc326 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -400,7 +400,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device 
*adev);
 int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
-   enum amdgpu_ras_block block, bool reset);
+   enum amdgpu_ras_block block, uint32_t reset);
 bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem 
*mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e32a186c2de1..58fe7bebdf1b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2045,7 +2045,7 @@ static void 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
}
}
 
-   amdgpu_umc_poison_handler(adev, obj->head.block, false);
+   amdgpu_umc_poison_handler(adev, obj->head.block, 0);
 
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
poison_stat = 
block_obj->hw_ops->handle_poison_consumption(adev);
@@ -2698,7 +2698,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_dec(&con->page_retirement_req_cnt);
 
amdgpu_umc_bad_page_polling_timeout(adev,
-   false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
+   0, MAX_UMC_POISON_POLLING_TIME_ASYNC);
}
 
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 20436f81856a..2c02585dcbff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -186,9 +186,7 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
amdgpu_umc_handle_bad_pages(adev, ras_error_status);
 
if (err_data->ue_count && reset) {
-   /* use mode-2 reset for poison consumption */
-   if (!entry)
-   con->gpu_reset_flags |= 
AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
}
 
@@ -196,7 +194,7 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
 }
 
 int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
-   bool reset, uint32_t timeout_ms)
+   uint32_t reset, uint32_t timeout_ms)
 {
struct ras_err_data err_data;
struct ras_common_if head = {
@@ -238,8 +236,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct 
amdgpu_device *adev,
if (reset) {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-   /* use mode-2 reset for poison consumption */
-   con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
}
 
@@ -247,7 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct 
amdgpu_device *adev,
 }
 
 int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-   enum amdgpu_ras_block block, bool reset)
+

[PATCH 2/3] drm/amdgpu: support utcl2 RAS poison query for mmhub

2024-03-18 Thread Tao Zhou

Support the query for both gfxhub and mmhub, and also replace
xcc_id with hub_inst.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  | 17 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h  |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  5 ++---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c| 17 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 17 +++--
 5 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 0c794301d18d..0b4910108f61 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -782,12 +782,19 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct 
amdgpu_device *adev,
 }
 
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
-   int xcc_id)
+   int hub_inst, int hub_type)
 {
-   if (adev->gfxhub.funcs->query_utcl2_poison_status)
-   return adev->gfxhub.funcs->query_utcl2_poison_status(adev, 
xcc_id);
-   else
-   return false;
+   if (!hub_type) {
+   if (adev->gfxhub.funcs->query_utcl2_poison_status)
+   return 
adev->gfxhub.funcs->query_utcl2_poison_status(adev, hub_inst);
+   else
+   return false;
+   } else {
+   if (adev->mmhub.funcs->query_utcl2_poison_status)
+   return 
adev->mmhub.funcs->query_utcl2_poison_status(adev, hub_inst);
+   else
+   return false;
+   }
 }
 
 int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 73b7fa7c5116..03bf20e0e3da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -406,7 +406,7 @@ bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device 
*adev, struct kgd_mem *
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
-   int xcc_id);
+   int hub_inst, int hub_type);
 int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag, int8_t xcp_id);
 void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index b3894fe868b2..4ba26d7e52bd 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -670,10 +670,9 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW);
fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
 
-   /* for gfx fed error, kfd will handle it, return directly */
+   /* for fed error, kfd will handle it, return directly */
if (fed && amdgpu_ras_is_poison_mode_supported(adev) &&
-   (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2)) &&
-   (vmhub < AMDGPU_MMHUB0_START))
+   (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2)))
return 0;
 
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index a8e76287dde0..650da18b0d87 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -369,18 +369,23 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
uint32_t vmid_type = 
SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry);
-   int xcc_id = 0;
+   int hub_inst = 0;
struct kfd_hsa_memory_exception_data exception_data;
 
+   /* gfxhub */
if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) 
{
-   xcc_id = 
dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
+   hub_inst = 
dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
node_id);
-   if (xcc_id < 0)
-   xcc_id = 0;
+   if (hub_inst < 0)
+   hub_inst = 0;
}
 
-   if (client_id == SOC15_IH_CLIENTID_UTCL2 && !vmid_type &&
-   amdgpu_amdkfd_ras_query_utc

[PATCH 1/3] drm/amdgpu: add utcl2 RAS poison query for mmhub

2024-03-18 Thread Tao Zhou

Add it for mmhub v1.8.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c   | 15 +++
 2 files changed, 17 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
index 1ca9d4ed8063..95d676ee207f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
@@ -63,6 +63,8 @@ struct amdgpu_mmhub_funcs {
uint64_t page_table_base);
void (*update_power_gating)(struct amdgpu_device *adev,
 bool enable);
+   bool (*query_utcl2_poison_status)(struct amdgpu_device *adev,
+   int hub_inst);
 };
 
 struct amdgpu_mmhub {
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
index c0fc44cdd658..b7aa05dbef86 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
@@ -559,6 +559,20 @@ static void mmhub_v1_8_get_clockgating(struct 
amdgpu_device *adev, u64 *flags)
 
 }
 
+static bool mmhub_v1_8_query_utcl2_poison_status(struct amdgpu_device *adev,
+   int hub_inst)
+{
+   u32 fed, status;
+
+   status = RREG32_SOC15(MMHUB, hub_inst, 
regVM_L2_PROTECTION_FAULT_STATUS);
+   fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+   /* reset page fault status */
+   WREG32_P(SOC15_REG_OFFSET(MMHUB, hub_inst,
+   regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1);
+
+   return fed;
+}
+
 const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = {
.get_fb_location = mmhub_v1_8_get_fb_location,
.init = mmhub_v1_8_init,
@@ -568,6 +582,7 @@ const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = {
.setup_vm_pt_regs = mmhub_v1_8_setup_vm_pt_regs,
.set_clockgating = mmhub_v1_8_set_clockgating,
.get_clockgating = mmhub_v1_8_get_clockgating,
+   .query_utcl2_poison_status = mmhub_v1_8_query_utcl2_poison_status,
 };
 
 static const struct amdgpu_ras_err_status_reg_entry mmhub_v1_8_ce_reg_list[] = 
{
-- 
2.34.1

[PATCH 3/3] drm/amdgpu: make reset method configurable for RAS poison

2024-03-13 Thread Tao Zhou

Each RAS block has a different requirement for GPU reset in poison
consumption handling.
Add support for mmhub RAS poison consumption handling.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c   | 14 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h   |  4 ++--
 .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c  | 20 ++-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 20 ++-
 7 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 9687650b0fe3..262d20167039 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -760,7 +760,7 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
 }
 
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
-   enum amdgpu_ras_block block, bool reset)
+   enum amdgpu_ras_block block, uint32_t reset)
 {
amdgpu_umc_poison_handler(adev, block, reset);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 03bf20e0e3da..ad50c7bbc326 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -400,7 +400,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device 
*adev);
 int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
-   enum amdgpu_ras_block block, bool reset);
+   enum amdgpu_ras_block block, uint32_t reset);
 bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem 
*mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e32a186c2de1..58fe7bebdf1b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2045,7 +2045,7 @@ static void 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
}
}
 
-   amdgpu_umc_poison_handler(adev, obj->head.block, false);
+   amdgpu_umc_poison_handler(adev, obj->head.block, 0);
 
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
poison_stat = 
block_obj->hw_ops->handle_poison_consumption(adev);
@@ -2698,7 +2698,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_dec(&con->page_retirement_req_cnt);
 
amdgpu_umc_bad_page_polling_timeout(adev,
-   false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
+   0, MAX_UMC_POISON_POLLING_TIME_ASYNC);
}
 
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 20436f81856a..2c02585dcbff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -186,9 +186,7 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
amdgpu_umc_handle_bad_pages(adev, ras_error_status);
 
if (err_data->ue_count && reset) {
-   /* use mode-2 reset for poison consumption */
-   if (!entry)
-   con->gpu_reset_flags |= 
AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
}
 
@@ -196,7 +194,7 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
 }
 
 int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
-   bool reset, uint32_t timeout_ms)
+   uint32_t reset, uint32_t timeout_ms)
 {
struct ras_err_data err_data;
struct ras_common_if head = {
@@ -238,8 +236,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct 
amdgpu_device *adev,
if (reset) {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-   /* use mode-2 reset for poison consumption */
-   con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
}
 
@@ -247,7 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct 
amdgpu_device *adev,
 }
 
 int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-   enum amdgpu_ras_block block, bool reset)
+   enum amdgpu_ras_block block, uint32_t reset)
 {
int ret = AMDGPU_RAS_SUCCESS;
 
@@ -311,7 +308,8

[PATCH 2/3] drm/amdgpu: support utcl2 RAS poison query for mmhub

2024-03-13 Thread Tao Zhou

Support the query for both gfxhub and mmhub, and also replace
xcc_id with hub_inst.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  | 17 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h  |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  3 +--
 .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c| 17 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 17 +++--
 5 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index fa958cbc603a..9687650b0fe3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -782,12 +782,19 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct 
amdgpu_device *adev,
 }
 
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
-   int xcc_id)
+   int hub_inst, int hub_type)
 {
-   if (adev->gfxhub.funcs->query_utcl2_poison_status)
-   return adev->gfxhub.funcs->query_utcl2_poison_status(adev, 
xcc_id);
-   else
-   return false;
+   if (!hub_type) {
+   if (adev->gfxhub.funcs->query_utcl2_poison_status)
+   return 
adev->gfxhub.funcs->query_utcl2_poison_status(adev, hub_inst);
+   else
+   return false;
+   } else {
+   if (adev->mmhub.funcs->query_utcl2_poison_status)
+   return 
adev->mmhub.funcs->query_utcl2_poison_status(adev, hub_inst);
+   else
+   return false;
+   }
 }
 
 int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 73b7fa7c5116..03bf20e0e3da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -406,7 +406,7 @@ bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device 
*adev, struct kgd_mem *
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
-   int xcc_id);
+   int hub_inst, int hub_type);
 int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag, int8_t xcp_id);
 void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index fb19b88e5522..d615d0fc2c6a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -672,8 +672,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
 
/* for gfx fed error, kfd will handle it, return directly */
if (fed && amdgpu_ras_is_poison_mode_supported(adev) &&
-   (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2)) &&
-   (vmhub < AMDGPU_MMHUB0_START))
+   (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2)))
return 0;
 
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index a8e76287dde0..650da18b0d87 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -369,18 +369,23 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
uint32_t vmid_type = 
SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry);
-   int xcc_id = 0;
+   int hub_inst = 0;
struct kfd_hsa_memory_exception_data exception_data;
 
+   /* gfxhub */
if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) 
{
-   xcc_id = 
dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
+   hub_inst = 
dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
node_id);
-   if (xcc_id < 0)
-   xcc_id = 0;
+   if (hub_inst < 0)
+   hub_inst = 0;
}
 
-   if (client_id == SOC15_IH_CLIENTID_UTCL2 && !vmid_type &&
-   amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, 
xcc_id)) {
+   /* mmhub */
+   if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
+   hub_inst = nod

[PATCH 1/3] drm/amdgpu: add utcl2 RAS poison query for mmhub

2024-03-13 Thread Tao Zhou

Add it for mmhub v1.8.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c   | 15 +++
 2 files changed, 17 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
index 1ca9d4ed8063..95d676ee207f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
@@ -63,6 +63,8 @@ struct amdgpu_mmhub_funcs {
uint64_t page_table_base);
void (*update_power_gating)(struct amdgpu_device *adev,
 bool enable);
+   bool (*query_utcl2_poison_status)(struct amdgpu_device *adev,
+   int hub_inst);
 };
 
 struct amdgpu_mmhub {
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
index c0fc44cdd658..b7aa05dbef86 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
@@ -559,6 +559,20 @@ static void mmhub_v1_8_get_clockgating(struct 
amdgpu_device *adev, u64 *flags)
 
 }
 
+static bool mmhub_v1_8_query_utcl2_poison_status(struct amdgpu_device *adev,
+   int hub_inst)
+{
+   u32 fed, status;
+
+   status = RREG32_SOC15(MMHUB, hub_inst, 
regVM_L2_PROTECTION_FAULT_STATUS);
+   fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+   /* reset page fault status */
+   WREG32_P(SOC15_REG_OFFSET(MMHUB, hub_inst,
+   regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1);
+
+   return fed;
+}
+
 const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = {
.get_fb_location = mmhub_v1_8_get_fb_location,
.init = mmhub_v1_8_init,
@@ -568,6 +582,7 @@ const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = {
.setup_vm_pt_regs = mmhub_v1_8_setup_vm_pt_regs,
.set_clockgating = mmhub_v1_8_set_clockgating,
.get_clockgating = mmhub_v1_8_get_clockgating,
+   .query_utcl2_poison_status = mmhub_v1_8_query_utcl2_poison_status,
 };
 
 static const struct amdgpu_ras_err_status_reg_entry mmhub_v1_8_ce_reg_list[] = 
{
-- 
2.34.1

[PATCH] drm/amdgpu: add deferred error check for UMC v12 address query

2024-02-28 Thread Tao Zhou

Both RAS UE and deferred errors need page retirement.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 14ef7a24be7b..77af4e25ff46 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -348,7 +348,8 @@ static int umc_v12_0_query_error_address(struct 
amdgpu_device *adev,
}
 
/* calculate error address if ue error is detected */
-   if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)) {
+   if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
+   umc_v12_0_is_deferred_error(adev, mc_umc_status)) {
mc_umc_addrt0 =
SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
 
-- 
2.34.1

[PATCH 5/5] drm/amdgpu: skip GFX FED error in page fault handling

2024-02-23 Thread Tao Zhou

Let kfd interrupt handler process it.

v2: return 0 instead of 1 for fed error.
drop the usage of strcmp in interrupt handler.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 001e96d89cd7..09364817ae97 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -552,7 +552,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
 {
bool retry_fault = !!(entry->src_data[1] & 0x80);
bool write_fault = !!(entry->src_data[1] & 0x20);
-   uint32_t status = 0, cid = 0, rw = 0;
+   uint32_t status = 0, cid = 0, rw = 0, fed = 0;
struct amdgpu_task_info task_info;
struct amdgpu_vmhub *hub;
const char *mmhub_cid;
@@ -663,6 +663,14 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
status = RREG32(hub->vm_l2_pro_fault_status);
cid = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, CID);
rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW);
+   fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+
+   /* for gfx fed error, kfd will handle it, return directly */
+   if (fed && amdgpu_ras_is_poison_mode_supported(adev) &&
+   (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2)) &&
+   (vmhub < AMDGPU_MMHUB0_START))
+   return 0;
+
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
 #ifdef HAVE_STRUCT_XARRAY
amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub);
-- 
2.34.1

[PATCH 4/5] amd/amdkfd: get node id for query_utcl2_poison_status

2024-02-23 Thread Tao Zhou

Obtain it from ring entry.

v2: replace node id with logical xcc id.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 14 --
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  | 14 --
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index 9a06c6fb6605..a8e76287dde0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -367,10 +367,20 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
   client_id == SOC15_IH_CLIENTID_UTCL2) {
struct kfd_vm_fault_info info = {0};
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+   uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
+   uint32_t vmid_type = 
SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry);
+   int xcc_id = 0;
struct kfd_hsa_memory_exception_data exception_data;
 
-   if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
-   
amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
+   if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) 
{
+   xcc_id = 
dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
+   node_id);
+   if (xcc_id < 0)
+   xcc_id = 0;
+   }
+
+   if (client_id == SOC15_IH_CLIENTID_UTCL2 && !vmid_type &&
+   amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, 
xcc_id)) {
event_interrupt_poison_consumption(dev, pasid, 
client_id);
return;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 91dd5e045b51..ff7392336795 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -413,10 +413,20 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
   client_id == SOC15_IH_CLIENTID_UTCL2) {
struct kfd_vm_fault_info info = {0};
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+   uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
+   uint32_t vmid_type = 
SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry);
+   int xcc_id = 0;
struct kfd_hsa_memory_exception_data exception_data;
 
-   if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
-   amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
+   if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) 
{
+   xcc_id = 
dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
+   node_id);
+   if (xcc_id < 0)
+   xcc_id = 0;
+   }
+
+   if (client_id == SOC15_IH_CLIENTID_UTCL2 && !vmid_type &&
+   amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, 
xcc_id)) {
event_interrupt_poison_consumption_v9(dev, pasid, 
client_id);
return;
}
-- 
2.34.1

[PATCH 3/5] drm/amdgpu: retire gfx ras query_utcl2_poison_status

2024-02-23 Thread Tao Zhou

Replace it with related interface in gfxhub functions.

v2: replace node id with xcc id.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  7 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h|  1 -
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c| 12 
 4 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index f759a42def59..817464ee6a01 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -776,10 +776,11 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct 
amdgpu_device *adev,
return 0;
 }
 
-bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
+   int xcc_id)
 {
-   if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status)
-   return adev->gfx.ras->query_utcl2_poison_status(adev);
+   if (adev->gfxhub.funcs->query_utcl2_poison_status)
+   return adev->gfxhub.funcs->query_utcl2_poison_status(adev, 
xcc_id);
else
return false;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index b2990470d1c6..47d30f67f2b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -404,7 +404,8 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct 
amdgpu_device *adev,
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem 
*mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
-bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
+   int xcc_id);
 int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag, int8_t xcp_id);
 void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index a36d153b2ff7..9fa580b85417 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -268,7 +268,6 @@ struct amdgpu_cu_info {
 struct amdgpu_gfx_ras {
struct amdgpu_ras_block_object  ras_block;
void (*enable_watchdog_timer)(struct amdgpu_device *adev);
-   bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);
int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index 0d5b4133fdf7..e3ed568eaacc 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -1909,18 +1909,7 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct 
amdgpu_device *adev)
mutex_unlock(&adev->grbm_idx_mutex);
 }
 
-static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device *adev)
-{
-   u32 status = 0;
-   struct amdgpu_vmhub *hub;
-
-   hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
-   status = RREG32(hub->vm_l2_pro_fault_status);
-   /* reset page fault status */
-   WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
 
-   return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
-}
 
 struct amdgpu_ras_block_hw_ops  gfx_v9_4_2_ras_ops = {
.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
@@ -1934,5 +1923,4 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
.hw_ops = &gfx_v9_4_2_ras_ops,
},
.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
-   .query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status,
 };
-- 
2.34.1

[PATCH 1/5] drm/amdgpu: add new bit definitions for GC 9.0 PROTECTION_FAULT_STATUS

2024-02-23 Thread Tao Zhou

Add UCE and FED bit definitions.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h 
b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
index efc16ddf274a..2dfa0e5b1aa3 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
@@ -6822,6 +6822,8 @@
 #define VM_L2_PROTECTION_FAULT_STATUS__VMID__SHIFT 
   0x14
 #define VM_L2_PROTECTION_FAULT_STATUS__VF__SHIFT   
   0x18
 #define VM_L2_PROTECTION_FAULT_STATUS__VFID__SHIFT 
   0x19
+#define VM_L2_PROTECTION_FAULT_STATUS__UCE__SHIFT  
   0x1d
+#define VM_L2_PROTECTION_FAULT_STATUS__FED__SHIFT  
   0x1e
 #define VM_L2_PROTECTION_FAULT_STATUS__MORE_FAULTS_MASK
   0x00000001L
 #define VM_L2_PROTECTION_FAULT_STATUS__WALKER_ERROR_MASK   
   0x0000000EL
 #define VM_L2_PROTECTION_FAULT_STATUS__PERMISSION_FAULTS_MASK  
   0x000000F0L
@@ -6832,6 +6834,8 @@
 #define VM_L2_PROTECTION_FAULT_STATUS__VMID_MASK   
   0x00F00000L
 #define VM_L2_PROTECTION_FAULT_STATUS__VF_MASK 
   0x01000000L
 #define VM_L2_PROTECTION_FAULT_STATUS__VFID_MASK   
   0x1E000000L
+#define VM_L2_PROTECTION_FAULT_STATUS__UCE_MASK
   0x20000000L
+#define VM_L2_PROTECTION_FAULT_STATUS__FED_MASK
   0x40000000L
 //VM_L2_PROTECTION_FAULT_ADDR_LO32
 #define VM_L2_PROTECTION_FAULT_ADDR_LO32__LOGICAL_PAGE_ADDR_LO32__SHIFT
   0x0
 #define VM_L2_PROTECTION_FAULT_ADDR_LO32__LOGICAL_PAGE_ADDR_LO32_MASK  
   0xFFFFFFFFL
-- 
2.34.1

[PATCH 2/5] drm/amdgpu: add utcl2 poison query for gfxhub

2024-02-23 Thread Tao Zhou

Implement it for gfxhub 1.0 and 1.2.

v2: input logical xcc id for poison query interface.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c   | 17 +
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c   | 15 +++
 3 files changed, 34 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
index c7b44aeb671b..103a837ccc71 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
@@ -38,6 +38,8 @@ struct amdgpu_gfxhub_funcs {
void (*mode2_save_regs)(struct amdgpu_device *adev);
void (*mode2_restore_regs)(struct amdgpu_device *adev);
void (*halt)(struct amdgpu_device *adev);
+   bool (*query_utcl2_poison_status)(struct amdgpu_device *adev,
+   int xcc_id);
 };
 
 struct amdgpu_gfxhub {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
index 22175da0e16a..d200310d1731 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
@@ -443,6 +443,22 @@ static void gfxhub_v1_0_init(struct amdgpu_device *adev)
mmVM_INVALIDATE_ENG0_ADDR_RANGE_LO32;
 }
 
+static bool gfxhub_v1_0_query_utcl2_poison_status(struct amdgpu_device *adev,
+   int xcc_id)
+{
+   u32 status = 0;
+   struct amdgpu_vmhub *hub;
+
+   if (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(9, 4, 2))
+   return false;
+
+   hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
+   status = RREG32(hub->vm_l2_pro_fault_status);
+   /* reset page fault status */
+   WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
+
+   return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+}
 
 const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = {
.get_mc_fb_offset = gfxhub_v1_0_get_mc_fb_offset,
@@ -452,4 +468,5 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = {
.set_fault_enable_default = gfxhub_v1_0_set_fault_enable_default,
.init = gfxhub_v1_0_init,
.get_xgmi_info = gfxhub_v1_1_get_xgmi_info,
+   .query_utcl2_poison_status = gfxhub_v1_0_query_utcl2_poison_status,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
index 49aecdcee006..77df8c9cbad2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
@@ -620,6 +620,20 @@ static int gfxhub_v1_2_get_xgmi_info(struct amdgpu_device 
*adev)
return 0;
 }
 
+static bool gfxhub_v1_2_query_utcl2_poison_status(struct amdgpu_device *adev,
+   int xcc_id)
+{
+   u32 fed, status;
+
+   status = RREG32_SOC15(GC, GET_INST(GC, xcc_id), 
regVM_L2_PROTECTION_FAULT_STATUS);
+   fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+   /* reset page fault status */
+   WREG32_P(SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id),
+   regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1);
+
+   return fed;
+}
+
 const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = {
.get_mc_fb_offset = gfxhub_v1_2_get_mc_fb_offset,
.setup_vm_pt_regs = gfxhub_v1_2_setup_vm_pt_regs,
@@ -628,6 +642,7 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = {
.set_fault_enable_default = gfxhub_v1_2_set_fault_enable_default,
.init = gfxhub_v1_2_init,
.get_xgmi_info = gfxhub_v1_2_get_xgmi_info,
+   .query_utcl2_poison_status = gfxhub_v1_2_query_utcl2_poison_status,
 };
 
 static int gfxhub_v1_2_xcp_resume(void *handle, uint32_t inst_mask)
-- 
2.34.1

[PATCH 4/5] amd/amdkfd: get node id for query_utcl2_poison_status

2024-02-19 Thread Tao Zhou

Obtain it from ring entry.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 3 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index 9a06c6fb6605..747cb785a7d3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -367,10 +367,11 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
   client_id == SOC15_IH_CLIENTID_UTCL2) {
struct kfd_vm_fault_info info = {0};
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+   uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
struct kfd_hsa_memory_exception_data exception_data;
 
if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
-   
amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
+   amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, 
node_id)) {
event_interrupt_poison_consumption(dev, pasid, 
client_id);
return;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 91dd5e045b51..eb94d967c532 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -413,10 +413,11 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
   client_id == SOC15_IH_CLIENTID_UTCL2) {
struct kfd_vm_fault_info info = {0};
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+   uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
struct kfd_hsa_memory_exception_data exception_data;
 
if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
-   amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
+   amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, 
node_id)) {
event_interrupt_poison_consumption_v9(dev, pasid, 
client_id);
return;
}
-- 
2.34.1

[PATCH 5/5] drm/amdgpu: skip GFX FED error in page fault handling

2024-02-19 Thread Tao Zhou

Let kfd interrupt handler process it.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 773725a92cf1..70defc394b7b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -552,7 +552,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
 {
bool retry_fault = !!(entry->src_data[1] & 0x80);
bool write_fault = !!(entry->src_data[1] & 0x20);
-   uint32_t status = 0, cid = 0, rw = 0;
+   uint32_t status = 0, cid = 0, rw = 0, fed = 0;
struct amdgpu_task_info task_info;
struct amdgpu_vmhub *hub;
const char *mmhub_cid;
@@ -663,6 +663,14 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
status = RREG32(hub->vm_l2_pro_fault_status);
cid = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, CID);
rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW);
+   fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+
+   /* for gfx fed error, kfd will handle it, return directly */
+   if (fed && amdgpu_ras_is_poison_mode_supported(adev) &&
+   amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2) &&
+   !strcmp(hub_name, "gfxhub0"))
+   return 1;
+
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
 #ifdef HAVE_STRUCT_XARRAY
amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub);
-- 
2.34.1

[PATCH 3/5] drm/amdgpu: retire gfx ras query_utcl2_poison_status

2024-02-19 Thread Tao Zhou

Replace it with related interface in gfxhub functions.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  7 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h|  1 -
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c| 12 
 4 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index f759a42def59..bb509b26112d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -776,10 +776,11 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct 
amdgpu_device *adev,
return 0;
 }
 
-bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
+   uint32_t node_id)
 {
-   if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status)
-   return adev->gfx.ras->query_utcl2_poison_status(adev);
+   if (adev->gfxhub.funcs->query_utcl2_poison_status)
+   return adev->gfxhub.funcs->query_utcl2_poison_status(adev, 
node_id);
else
return false;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index b2990470d1c6..ae1e449e5479 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -404,7 +404,8 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct 
amdgpu_device *adev,
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem 
*mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
-bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
+   uint32_t node_id);
 int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag, int8_t xcp_id);
 void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 2af2e28952db..d91f83bd7267 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -268,7 +268,6 @@ struct amdgpu_cu_info {
 struct amdgpu_gfx_ras {
struct amdgpu_ras_block_object  ras_block;
void (*enable_watchdog_timer)(struct amdgpu_device *adev);
-   bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);
int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index 0d5b4133fdf7..e3ed568eaacc 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -1909,18 +1909,7 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct 
amdgpu_device *adev)
mutex_unlock(&adev->grbm_idx_mutex);
 }
 
-static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device *adev)
-{
-   u32 status = 0;
-   struct amdgpu_vmhub *hub;
-
-   hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
-   status = RREG32(hub->vm_l2_pro_fault_status);
-   /* reset page fault status */
-   WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
 
-   return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
-}
 
 struct amdgpu_ras_block_hw_ops  gfx_v9_4_2_ras_ops = {
.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
@@ -1934,5 +1923,4 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
.hw_ops = &gfx_v9_4_2_ras_ops,
},
.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
-   .query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status,
 };
-- 
2.34.1

[PATCH 2/5] drm/amdgpu: add utcl2 poison query for gfxhub

2024-02-19 Thread Tao Zhou

Implement it for gfxhub 1.0 and 1.2.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c   | 17 +
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c   | 15 +++
 3 files changed, 34 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
index c7b44aeb671b..12b131a9cc42 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
@@ -38,6 +38,8 @@ struct amdgpu_gfxhub_funcs {
void (*mode2_save_regs)(struct amdgpu_device *adev);
void (*mode2_restore_regs)(struct amdgpu_device *adev);
void (*halt)(struct amdgpu_device *adev);
+   bool (*query_utcl2_poison_status)(struct amdgpu_device *adev,
+   uint32_t node_id);
 };
 
 struct amdgpu_gfxhub {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
index 22175da0e16a..1c14b1665c9f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
@@ -443,6 +443,22 @@ static void gfxhub_v1_0_init(struct amdgpu_device *adev)
mmVM_INVALIDATE_ENG0_ADDR_RANGE_LO32;
 }
 
+static bool gfxhub_v1_0_query_utcl2_poison_status(struct amdgpu_device *adev,
+   uint32_t node_id)
+{
+   u32 status = 0;
+   struct amdgpu_vmhub *hub;
+
+   if (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(9, 4, 2))
+   return false;
+
+   hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
+   status = RREG32(hub->vm_l2_pro_fault_status);
+   /* reset page fault status */
+   WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
+
+   return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+}
 
 const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = {
.get_mc_fb_offset = gfxhub_v1_0_get_mc_fb_offset,
@@ -452,4 +468,5 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = {
.set_fault_enable_default = gfxhub_v1_0_set_fault_enable_default,
.init = gfxhub_v1_0_init,
.get_xgmi_info = gfxhub_v1_1_get_xgmi_info,
+   .query_utcl2_poison_status = gfxhub_v1_0_query_utcl2_poison_status,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
index 49aecdcee006..ebc96739e1c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
@@ -620,6 +620,20 @@ static int gfxhub_v1_2_get_xgmi_info(struct amdgpu_device 
*adev)
return 0;
 }
 
+static bool gfxhub_v1_2_query_utcl2_poison_status(struct amdgpu_device *adev,
+   uint32_t node_id)
+{
+   u32 fed, status;
+
+   status = RREG32_SOC15(GC, node_id, regVM_L2_PROTECTION_FAULT_STATUS);
+   fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+   /* reset page fault status */
+   WREG32_P(SOC15_REG_OFFSET(GC, node_id,
+   regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1);
+
+   return fed;
+}
+
 const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = {
.get_mc_fb_offset = gfxhub_v1_2_get_mc_fb_offset,
.setup_vm_pt_regs = gfxhub_v1_2_setup_vm_pt_regs,
@@ -628,6 +642,7 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = {
.set_fault_enable_default = gfxhub_v1_2_set_fault_enable_default,
.init = gfxhub_v1_2_init,
.get_xgmi_info = gfxhub_v1_2_get_xgmi_info,
+   .query_utcl2_poison_status = gfxhub_v1_2_query_utcl2_poison_status,
 };
 
 static int gfxhub_v1_2_xcp_resume(void *handle, uint32_t inst_mask)
-- 
2.34.1

[PATCH 1/5] drm/amdgpu: add new bit definitions for GC 9.0 PROTECTION_FAULT_STATUS

2024-02-19 Thread Tao Zhou

Add UCE and FED bit definitions.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h 
b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
index efc16ddf274a..2dfa0e5b1aa3 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
@@ -6822,6 +6822,8 @@
 #define VM_L2_PROTECTION_FAULT_STATUS__VMID__SHIFT 
   0x14
 #define VM_L2_PROTECTION_FAULT_STATUS__VF__SHIFT   
   0x18
 #define VM_L2_PROTECTION_FAULT_STATUS__VFID__SHIFT 
   0x19
+#define VM_L2_PROTECTION_FAULT_STATUS__UCE__SHIFT  
   0x1d
+#define VM_L2_PROTECTION_FAULT_STATUS__FED__SHIFT  
   0x1e
 #define VM_L2_PROTECTION_FAULT_STATUS__MORE_FAULTS_MASK
   0x00000001L
 #define VM_L2_PROTECTION_FAULT_STATUS__WALKER_ERROR_MASK   
   0x0000000EL
 #define VM_L2_PROTECTION_FAULT_STATUS__PERMISSION_FAULTS_MASK  
   0x000000F0L
@@ -6832,6 +6834,8 @@
 #define VM_L2_PROTECTION_FAULT_STATUS__VMID_MASK   
   0x00F00000L
 #define VM_L2_PROTECTION_FAULT_STATUS__VF_MASK 
   0x01000000L
 #define VM_L2_PROTECTION_FAULT_STATUS__VFID_MASK   
   0x1E000000L
+#define VM_L2_PROTECTION_FAULT_STATUS__UCE_MASK
   0x20000000L
+#define VM_L2_PROTECTION_FAULT_STATUS__FED_MASK
   0x40000000L
 //VM_L2_PROTECTION_FAULT_ADDR_LO32
 #define VM_L2_PROTECTION_FAULT_ADDR_LO32__LOGICAL_PAGE_ADDR_LO32__SHIFT
   0x0
 #define VM_L2_PROTECTION_FAULT_ADDR_LO32__LOGICAL_PAGE_ADDR_LO32_MASK  
   0xFFFFFFFFL
-- 
2.34.1

[PATCH] drm/amdgpu: add UTCL2 RAS poison query for gfx 9.4.3

2024-02-17 Thread Tao Zhou

Add a helper function to query and reset the RAS UTCL2 poison status.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index aace4594a603..de04006f8db1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -4329,10 +4329,24 @@ static int gfx_v9_4_3_ras_late_init(struct 
amdgpu_device *adev, struct ras_commo
return r;
 }
 
+static bool gfx_v9_4_3_query_uctl2_poison_status(struct amdgpu_device *adev)
+{
+   u32 status = 0;
+   struct amdgpu_vmhub *hub;
+
+   hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
+   status = RREG32(hub->vm_l2_pro_fault_status);
+   /* reset page fault status */
+   WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
+
+   return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+}
+
 struct amdgpu_gfx_ras gfx_v9_4_3_ras = {
.ras_block = {
.hw_ops = &gfx_v9_4_3_ras_ops,
.ras_late_init = &gfx_v9_4_3_ras_late_init,
},
.enable_watchdog_timer = &gfx_v9_4_3_enable_watchdog_timer,
+   .query_utcl2_poison_status = &gfx_v9_4_3_query_uctl2_poison_status,
 };
-- 
2.34.1

[PATCH 2/2] use PSP address query command

2024-01-30 Thread Tao Zhou

Get UMC physical address from PSP in RAS error address conversion.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 46 ++
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 836a4cc1134e..14ef7a24be7b 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -203,14 +203,14 @@ static bool umc_v12_0_bit_wise_xor(uint32_t val)
return result;
 }
 
-static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
-   struct ras_err_data *err_data, 
uint64_t err_addr,
-   uint32_t ch_inst, uint32_t umc_inst,
-   uint32_t node_inst)
+static void umc_v12_0_mca_addr_to_pa(struct amdgpu_device *adev,
+   uint64_t err_addr, uint32_t ch_inst, 
uint32_t umc_inst,
+   uint32_t node_inst,
+   struct ta_ras_query_address_output 
*addr_out)
 {
uint32_t channel_index, i;
-   uint64_t soc_pa, na, retired_page, column;
-   uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row, 
row_xor;
+   uint64_t na, soc_pa;
+   uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row;
uint32_t bank0, bank1, bank2, bank3, bank;
 
bank_hash0 = (err_addr >> UMC_V12_0_MCA_B0_BIT) & 0x1ULL;
@@ -260,12 +260,44 @@ static void umc_v12_0_convert_error_address(struct 
amdgpu_device *adev,
/* the umc channel bits are not original values, they are hashed */
UMC_V12_0_SET_CHANNEL_HASH(channel_index, soc_pa);
 
+   addr_out->pa.pa = soc_pa;
+   addr_out->pa.bank = bank;
+   addr_out->pa.channel_idx = channel_index;
+}
+
+static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
+   struct ras_err_data *err_data, 
uint64_t err_addr,
+   uint32_t ch_inst, uint32_t umc_inst,
+   uint32_t node_inst)
+{
+   uint32_t col, row, row_xor, bank, channel_index;
+   uint64_t soc_pa, retired_page, column;
+   struct ta_ras_query_address_input addr_in;
+   struct ta_ras_query_address_output addr_out;
+
+   addr_in.addr_type = TA_RAS_MCA_TO_PA;
+   addr_in.ma.err_addr = err_addr;
+   addr_in.ma.ch_inst = ch_inst;
+   addr_in.ma.umc_inst = umc_inst;
+   addr_in.ma.node_inst = node_inst;
+
+   if (psp_ras_query_address(&adev->psp, &addr_in, &addr_out))
+   /* fallback to old path if fail to get pa from psp */
+   umc_v12_0_mca_addr_to_pa(adev, err_addr, ch_inst, umc_inst,
+   node_inst, &addr_out);
+
+   soc_pa = addr_out.pa.pa;
+   bank = addr_out.pa.bank;
+   channel_index = addr_out.pa.channel_idx;
+
+   col = (err_addr >> 1) & 0x1fULL;
+   row = (err_addr >> 10) & 0x3fffULL;
+   row_xor = row ^ (0x1ULL << 13);
/* clear [C3 C2] in soc physical address */
soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
/* clear [C4] in soc physical address */
soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
 
-   row_xor = row ^ (0x1ULL << 13);
/* loop for all possibilities of [C4 C3 C2] */
for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
-- 
2.34.1

[PATCH 1/2] add PSP RAS address query command

2024-01-30 Thread Tao Zhou

Convert mca address to physical address or vice versa via RAS TA.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 25 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  3 +++
 drivers/gpu/drm/amd/amdgpu/ta_ras_if.h  | 36 +
 3 files changed, 64 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 9eff8753f9b9..bb2d419fe914 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1782,6 +1782,31 @@ int psp_ras_trigger_error(struct psp_context *psp,
 
return 0;
 }
+
+int psp_ras_query_address(struct psp_context *psp,
+ struct ta_ras_query_address_input *addr_in,
+ struct ta_ras_query_address_output *addr_out)
+{
+   struct ta_ras_shared_memory *ras_cmd;
+   int ret;
+
+   if (!psp->ras_context.context.initialized)
+   return -EINVAL;
+
+   ras_cmd = (struct ta_ras_shared_memory 
*)psp->ras_context.context.mem_context.shared_buf;
+   memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
+
+   ras_cmd->cmd_id = TA_RAS_COMMAND__QUERY_ADDRESS;
+   ras_cmd->ras_in_message.address = *addr_in;
+
+   ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
+   if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
+   return -EINVAL;
+
+   *addr_out = ras_cmd->ras_out_message.address;
+
+   return 0;
+}
 // ras end
 
 // HDCP start
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 652b0a01854a..9951bdd022de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -502,6 +502,9 @@ int psp_ras_enable_features(struct psp_context *psp,
 int psp_ras_trigger_error(struct psp_context *psp,
  struct ta_ras_trigger_error_input *info, uint32_t 
instance_mask);
 int psp_ras_terminate(struct psp_context *psp);
+int psp_ras_query_address(struct psp_context *psp,
+ struct ta_ras_query_address_input *addr_in,
+ struct ta_ras_query_address_output *addr_out);
 
 int psp_hdcp_invoke(struct psp_context *psp, uint32_t ta_cmd_id);
 int psp_dtm_invoke(struct psp_context *psp, uint32_t ta_cmd_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h 
b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
index 879bb7af297c..056d4df8fa1f 100644
--- a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
+++ b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
@@ -36,6 +36,9 @@ enum ras_command {
TA_RAS_COMMAND__ENABLE_FEATURES = 0,
TA_RAS_COMMAND__DISABLE_FEATURES,
TA_RAS_COMMAND__TRIGGER_ERROR,
+   TA_RAS_COMMAND__QUERY_BLOCK_INFO,
+   TA_RAS_COMMAND__QUERY_SUB_BLOCK_INFO,
+   TA_RAS_COMMAND__QUERY_ADDRESS,
 };
 
 enum ta_ras_status {
@@ -105,6 +108,11 @@ enum ta_ras_error_type {
TA_RAS_ERROR__POISON= 8,
 };
 
+enum ta_ras_address_type {
+   TA_RAS_MCA_TO_PA,
+   TA_RAS_PA_TO_MCA,
+};
+
 /* Input/output structures for RAS commands */
 /**/
 
@@ -133,12 +141,38 @@ struct ta_ras_init_flags {
uint8_t channel_dis_num;
 };
 
+struct ta_ras_mca_addr {
+   uint64_t err_addr;
+   uint32_t ch_inst;
+   uint32_t umc_inst;
+   uint32_t node_inst;
+};
+
+struct ta_ras_phy_addr {
+   uint64_t pa;
+   uint32_t bank;
+   uint32_t channel_idx;
+};
+
+struct ta_ras_query_address_input {
+   enum ta_ras_address_type addr_type;
+   struct ta_ras_mca_addr ma;
+   struct ta_ras_phy_addr pa;
+};
+
 struct ta_ras_output_flags {
uint8_t ras_init_success_flag;
uint8_t err_inject_switch_disable_flag;
uint8_t reg_access_failure_flag;
 };
 
+struct ta_ras_query_address_output {
+   /* don't use the flags here */
+   struct ta_ras_output_flags flags;
+   struct ta_ras_mca_addr ma;
+   struct ta_ras_phy_addr pa;
+};
+
 /* Common input structure for RAS callbacks */
 /**/
 union ta_ras_cmd_input {
@@ -146,12 +180,14 @@ union ta_ras_cmd_input {
struct ta_ras_enable_features_input enable_features;
struct ta_ras_disable_features_inputdisable_features;
struct ta_ras_trigger_error_input   trigger_error;
+   struct ta_ras_query_address_input   address;
 
uint32_t reserve_pad[256];
 };
 
 union ta_ras_cmd_output {
struct ta_ras_output_flags flags;
+   struct ta_ras_query_address_output address;
 
uint32_t reserve_pad[256];
 };
-- 
2.34.1

[PATCH] drm/amdgpu: disable ras feature when fini

2024-01-28 Thread Tao Zhou

Send ras disable feature command in fini.

Signed-off-by: Tao Zhou 
Change-Id: I95f1d1e0a46fb613631e5cd77497e64c0551c4c7
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a249f24ed038..a9fa2d134670 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3437,7 +3437,7 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not 
cleared");
 
if (AMDGPU_RAS_GET_FEATURES(con->features))
-   amdgpu_ras_disable_all_features(adev, 1);
+   amdgpu_ras_disable_all_features(adev, 0);
 
cancel_delayed_work_sync(&con->ras_counte_delay_work);
 
-- 
2.34.1

[PATCH 2/2] update check condition of query for ras page retire

2024-01-17 Thread Tao Zhou

Support page retirement handling in debug mode.

v2: revert smu_v13_0_6_get_ecc_info directly.

Signed-off-by: Tao Zhou 
Change-Id: I0aaa807d7fe87b3da0f023c380e57ab6dd446fcf
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 9d1cf41cf483..d8d263956e85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -93,11 +93,14 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret = 0;
+   unsigned int error_query_mode;
unsigned long err_count;
 
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+   amdgpu_ras_get_error_query_mode(adev, &error_query_mode);
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
-   if (ret == -EOPNOTSUPP) {
+   if (ret == -EOPNOTSUPP &&
+   error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
adev->umc.ras->ras_block.hw_ops->query_ras_error_count)

adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status);
@@ -121,7 +124,8 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
 */

adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, 
ras_error_status);
}
-   } else if (!ret) {
+   } else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY ||
+   (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) {
if (adev->umc.ras &&
adev->umc.ras->ecc_info_query_ras_error_count)
adev->umc.ras->ecc_info_query_ras_error_count(adev, 
ras_error_status);
-- 
2.34.1

[PATCH 1/2] Revert "drm/amd/pm: smu v13_0_6 supports ecc info by default"

2024-01-17 Thread Tao Zhou

This reverts commit affdce050ab4119a3cdf74d7faa8f1eb30f6f6aa.
We use debug mode flag instead of this interface.

Signed-off-by: Tao Zhou 
Change-Id: I49eae821ce352d542143d68c05802634b4bf469d
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 952a983da49a..6d8fdf8c538c 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -3034,13 +3034,6 @@ static int smu_v13_0_6_select_xgmi_plpd_policy(struct 
smu_context *smu,
return ret;
 }
 
-static ssize_t smu_v13_0_6_get_ecc_info(struct smu_context *smu,
-   void *table)
-{
-   /* Support ecc info by default */
-   return 0;
-}
-
 static const struct pptable_funcs smu_v13_0_6_ppt_funcs = {
/* init dpm */
.get_allowed_feature_mask = smu_v13_0_6_get_allowed_feature_mask,
@@ -3095,7 +3088,6 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = 
{
.i2c_init = smu_v13_0_6_i2c_control_init,
.i2c_fini = smu_v13_0_6_i2c_control_fini,
.send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num,
-   .get_ecc_info = smu_v13_0_6_get_ecc_info,
 };
 
 void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu)
-- 
2.34.1

[PATCH 2/2] update check condition of query for ras page retire

2024-01-17 Thread Tao Zhou

Support page retirement handling in debug mode.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  | 9 +++--
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 4 ++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 41139bac7643..6df32f0afd89 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -90,12 +90,16 @@ static void amdgpu_umc_handle_bad_pages(struct 
amdgpu_device *adev,
 {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   unsigned int error_query_mode;
int ret = 0;
 
+   amdgpu_ras_get_error_query_mode(adev, &error_query_mode);
+
mutex_lock(&con->page_retirement_lock);
 
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
-   if (ret == -EOPNOTSUPP) {
+   if (ret == -EOPNOTSUPP &&
+   error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
adev->umc.ras->ras_block.hw_ops->query_ras_error_count)

adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status);
@@ -119,7 +123,8 @@ static void amdgpu_umc_handle_bad_pages(struct 
amdgpu_device *adev,
 */

adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, 
ras_error_status);
}
-   } else if (!ret) {
+   } else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY ||
+   (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) {
if (adev->umc.ras &&
adev->umc.ras->ecc_info_query_ras_error_count)
adev->umc.ras->ecc_info_query_ras_error_count(adev, 
ras_error_status);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index c560f4af214d..d86c9e7fc64b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2909,8 +2909,8 @@ static int smu_v13_0_6_select_xgmi_plpd_policy(struct 
smu_context *smu,
 static ssize_t smu_v13_0_6_get_ecc_info(struct smu_context *smu,
void *table)
 {
-   /* Support ecc info by default */
-   return 0;
+   /* we use debug mode flag instead of this interface */
+   return -EOPNOTSUPP;
 }
 
 static const struct pptable_funcs smu_v13_0_6_ppt_funcs = {
-- 
2.35.1

[PATCH 1/2] update error condition check for umc_v12_0_query_error_address

2024-01-17 Thread Tao Zhou

Deferred error is also taken into account.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 10edf818acf5..2e0bd4312f2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -337,10 +337,7 @@ static int umc_v12_0_query_error_address(struct 
amdgpu_device *adev,
}
 
/* calculate error address if ue error is detected */
-   if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 
&&
-   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 
1 &&
-   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1) 
{
-
+   if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)) {
mc_umc_addrt0 =
SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
 
-- 
2.35.1

[PATCH] drm/amdgpu: Don't warn for unsupported set_xgmi_plpd_mode

2023-11-02 Thread Tao Zhou

set_xgmi_plpd_mode may be unsupported, which is not an error, so there is
no need to print a warning for it.

v2: add ret2 to save the status of psp_ras_trigger_error.

Suggested-by: lijo.la...@amd.com
Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 0533f873001b..a5a72e5aae94 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1131,28 +1131,30 @@ static void amdgpu_xgmi_query_ras_error_count(struct 
amdgpu_device *adev,
 static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
void *inject_if, uint32_t instance_mask)
 {
-   int ret = 0;
+   int ret1, ret2;
struct ta_ras_trigger_error_input *block_info =
(struct ta_ras_trigger_error_input *)inject_if;
 
if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
dev_warn(adev->dev, "Failed to disallow df cstate");
 
-   if (amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DISALLOW))
+   ret1 = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DISALLOW);
+   if (ret1 && ret1 != -EOPNOTSUPP)
dev_warn(adev->dev, "Failed to disallow XGMI power down");
 
-   ret = psp_ras_trigger_error(&adev->psp, block_info, instance_mask);
+   ret2 = psp_ras_trigger_error(&adev->psp, block_info, instance_mask);
 
if (amdgpu_ras_intr_triggered())
-   return ret;
+   return ret2;
 
-   if (amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DEFAULT))
+   ret1 = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DEFAULT);
+   if (ret1 && ret1 != -EOPNOTSUPP)
dev_warn(adev->dev, "Failed to allow XGMI power down");
 
if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
dev_warn(adev->dev, "Failed to allow df cstate");
 
-   return ret;
+   return ret2;
 }
 
 struct amdgpu_ras_block_hw_ops  xgmi_ras_hw_ops = {
-- 
2.35.1

[PATCH] drm/amdgpu: check recovery status of xgmi hive in ras_reset_error_count

2023-10-31 Thread Tao Zhou

Handle xgmi hive case.

Suggested-by: Hawking Zhang 
Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 753260745554..0093c28f4343 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1226,6 +1226,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
struct amdgpu_ras_block_object *block_obj = 
amdgpu_ras_get_ras_block(adev, block, 0);
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+   struct amdgpu_hive_info *hive;
+   int hive_ras_recovery = 0;
 
if (!block_obj || !block_obj->hw_ops) {
dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
@@ -1237,8 +1239,15 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
!amdgpu_ras_get_mca_debug_mode(adev))
return -EOPNOTSUPP;
 
+   hive = amdgpu_get_xgmi_hive(adev);
+   if (hive) {
+   hive_ras_recovery = atomic_read(&hive->ras_recovery);
+   amdgpu_put_xgmi_hive(hive);
+   }
+
/* skip ras error reset in gpu reset */
-   if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery)) &&
+   if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) ||
+   hive_ras_recovery) &&
mca_funcs && mca_funcs->mca_set_debug_mode)
return -EOPNOTSUPP;
 
-- 
2.35.1

[PATCH] drm/amdgpu: handle extra UE register entries for gfx v9_4_3

2023-10-31 Thread Tao Zhou

The UE register list is larger than the CE list.

Reported-by: yipeng.c...@amd.com
Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 38 +
 1 file changed, 38 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 41bbabd9ad4d..046ae95b366a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -3799,6 +3799,27 @@ static void gfx_v9_4_3_inst_query_ras_err_count(struct 
amdgpu_device *adev,
}
}
 
+   /* handle extra register entries of UE */
+   for (; i < ARRAY_SIZE(gfx_v9_4_3_ue_reg_list); i++) {
+   for (j = 0; j < gfx_v9_4_3_ue_reg_list[i].se_num; j++) {
+   for (k = 0; k < 
gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst; k++) {
+   /* no need to select if instance number is 1 */
+   if (gfx_v9_4_3_ue_reg_list[i].se_num > 1 ||
+   
gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst > 1)
+   gfx_v9_4_3_xcc_select_se_sh(adev, j, 0, 
k, xcc_id);
+
+   amdgpu_ras_inst_query_ras_error_count(adev,
+   &(gfx_v9_4_3_ue_reg_list[i].reg_entry),
+   1,
+   
gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_type].mem_id_ent,
+   
gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_type].size,
+   GET_INST(GC, xcc_id),
+   AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
+   &ue_count);
+   }
+   }
+   }
+
gfx_v9_4_3_xcc_select_se_sh(adev, 0x, 0x, 0x,
xcc_id);
mutex_unlock(&adev->grbm_idx_mutex);
@@ -3838,6 +3859,23 @@ static void gfx_v9_4_3_inst_reset_ras_err_count(struct 
amdgpu_device *adev,
}
}
 
+   /* handle extra register entries of UE */
+   for (; i < ARRAY_SIZE(gfx_v9_4_3_ue_reg_list); i++) {
+   for (j = 0; j < gfx_v9_4_3_ue_reg_list[i].se_num; j++) {
+   for (k = 0; k < 
gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst; k++) {
+   /* no need to select if instance number is 1 */
+   if (gfx_v9_4_3_ue_reg_list[i].se_num > 1 ||
+   
gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst > 1)
+   gfx_v9_4_3_xcc_select_se_sh(adev, j, 0, 
k, xcc_id);
+
+   amdgpu_ras_inst_reset_ras_error_count(adev,
+   &(gfx_v9_4_3_ue_reg_list[i].reg_entry),
+   1,
+   GET_INST(GC, xcc_id));
+   }
+   }
+   }
+
gfx_v9_4_3_xcc_select_se_sh(adev, 0x, 0x, 0x,
xcc_id);
mutex_unlock(&adev->grbm_idx_mutex);
-- 
2.35.1

[PATCH] drm/amdgpu: Don't warn for unsupported set_xgmi_plpd_mode

2023-10-31 Thread Tao Zhou

set_xgmi_plpd_mode may be unsupported, which is not an error, so there is
no need to print a warning for it.

Suggested-by: lijo.la...@amd.com
Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 0533f873001b..c9b09bddbcdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1138,7 +1138,8 @@ static int amdgpu_ras_error_inject_xgmi(struct 
amdgpu_device *adev,
if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
dev_warn(adev->dev, "Failed to disallow df cstate");
 
-   if (amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DISALLOW))
+   ret = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DISALLOW);
+   if (ret && ret != -EOPNOTSUPP)
dev_warn(adev->dev, "Failed to disallow XGMI power down");
 
ret = psp_ras_trigger_error(&adev->psp, block_info, instance_mask);
@@ -1146,7 +1147,8 @@ static int amdgpu_ras_error_inject_xgmi(struct 
amdgpu_device *adev,
if (amdgpu_ras_intr_triggered())
return ret;
 
-   if (amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DEFAULT))
+   ret = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DEFAULT);
+   if (ret && ret != -EOPNOTSUPP)
dev_warn(adev->dev, "Failed to allow XGMI power down");
 
if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
-- 
2.35.1

[PATCH 2/2] drm/amdgpu: add RAS reset/query operations for XGMI v6_4

2023-10-27 Thread Tao Zhou

Reset/query RAS error status and count.

v2: use XGMI IP version instead of WAFL version.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 46 ++--
 1 file changed, 43 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 2b7dc490ba6b..0533f873001b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -103,6 +103,16 @@ static const int 
walf_pcs_err_noncorrectable_mask_reg_aldebaran[] = {
smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK + 0x10
 };
 
+static const int xgmi3x16_pcs_err_status_reg_v6_4[] = {
+   smnPCS_XGMI3X16_PCS_ERROR_STATUS,
+   smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x10
+};
+
+static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[] = {
+   smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK,
+   smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x10
+};
+
 static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
{"XGMI PCS DataLossErr",
 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
@@ -958,6 +968,16 @@ static void amdgpu_xgmi_reset_ras_error_count(struct 
amdgpu_device *adev)
default:
break;
}
+
+   switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
+   case IP_VERSION(6, 4, 0):
+   for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); 
i++)
+   pcs_clear_status(adev,
+   xgmi3x16_pcs_err_status_reg_v6_4[i]);
+   break;
+   default:
+   break;
+   }
 }
 
 static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
@@ -975,7 +995,9 @@ static int amdgpu_xgmi_query_pcs_error_status(struct 
amdgpu_device *adev,
 
if (is_xgmi_pcs) {
if (amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
-   IP_VERSION(6, 1, 0)) {
+   IP_VERSION(6, 1, 0) ||
+   amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
+   IP_VERSION(6, 4, 0)) {
pcs_ras_fields = &xgmi3x16_pcs_ras_fields[0];
field_array_size = ARRAY_SIZE(xgmi3x16_pcs_ras_fields);
} else {
@@ -1013,7 +1035,7 @@ static void amdgpu_xgmi_query_ras_error_count(struct 
amdgpu_device *adev,
 void *ras_error_status)
 {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
-   int i;
+   int i, supported = 1;
uint32_t data, mask_data = 0;
uint32_t ue_cnt = 0, ce_cnt = 0;
 
@@ -1077,7 +1099,25 @@ static void amdgpu_xgmi_query_ras_error_count(struct 
amdgpu_device *adev,
}
break;
default:
-   dev_warn(adev->dev, "XGMI RAS error query not supported");
+   supported = 0;
+   break;
+   }
+
+   switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
+   case IP_VERSION(6, 4, 0):
+   /* check xgmi3x16 pcs error */
+   for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); 
i++) {
+   data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_v6_4[i]);
+   mask_data =
+   
RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[i]);
+   if (data)
+   amdgpu_xgmi_query_pcs_error_status(adev, data,
+   mask_data, &ue_cnt, &ce_cnt, 
true, true);
+   }
+   break;
+   default:
+   if (!supported)
+   dev_warn(adev->dev, "XGMI RAS error query not 
supported");
break;
}
 
-- 
2.35.1

[PATCH 1/2] drm/amdgpu: set XGMI IP version manually for v6_4

2023-10-27 Thread Tao Zhou

The version can't be queried from the discovery table.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 0b711bac2092..d22f22d706e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -2464,6 +2464,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device 
*adev)
if (amdgpu_ip_version(adev, XGMI_HWIP, 0) == IP_VERSION(4, 8, 0))
adev->gmc.xgmi.supported = true;
 
+   if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3))
+   adev->ip_versions[XGMI_HWIP][0] = IP_VERSION(6, 4, 0);
+
/* set NBIO version */
switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) {
case IP_VERSION(6, 1, 0):
-- 
2.35.1

[PATCH] drm/amdgpu: use mode-2 reset for RAS poison consumption

2023-10-26 Thread Tao Zhou

Switch from mode-1 reset to mode-2 for poison consumption.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index f74347cc087a..d65e21914d8c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -166,8 +166,12 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
}
}
 
-   if (reset)
+   if (reset) {
+   /* use mode-2 reset for poison consumption */
+   if (!entry)
+   con->gpu_reset_flags |= 
AMDGPU_RAS_GPU_RESET_MODE2_RESET;
amdgpu_ras_reset_gpu(adev);
+   }
}
 
kfree(err_data->err_addr);
-- 
2.35.1

[PATCH] drm/amdgpu: check RAS supported first in ras_reset_error_count

2023-10-24 Thread Tao Zhou

Not all platforms support RAS.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c71321edf50b..a6cff4a31c54 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1233,15 +1233,15 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
return -EOPNOTSUPP;
}
 
+   if (!amdgpu_ras_is_supported(adev, block) ||
+   !amdgpu_ras_get_mca_debug_mode(adev))
+   return -EOPNOTSUPP;
+
/* skip ras error reset in gpu reset */
if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery)) &&
mca_funcs && mca_funcs->mca_set_debug_mode)
return -EOPNOTSUPP;
 
-   if (!amdgpu_ras_is_supported(adev, block) ||
-   !amdgpu_ras_get_mca_debug_mode(adev))
-   return -EOPNOTSUPP;
-
if (block_obj->hw_ops->reset_ras_error_count)
block_obj->hw_ops->reset_ras_error_count(adev);
 
-- 
2.35.1

[PATCH] drm/amdgpu: get RAS poison status from DF v4_6_2

2023-10-23 Thread Tao Zhou

Add DF block and RAS poison mode query for DF v4_6_2.

Signed-off-by: Tao Zhou 
Reviewed-by: Stanley.Yang 
---
 drivers/gpu/drm/amd/amdgpu/Makefile   |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c |  4 +++
 drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c| 34 +++
 drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h| 31 +
 4 files changed, 71 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index ec1daf7112a9..260e32ef7bae 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -104,7 +104,8 @@ amdgpu-y += \
 amdgpu-y += \
df_v1_7.o \
df_v3_6.o \
-   df_v4_3.o
+   df_v4_3.o \
+   df_v4_6_2.o
 
 # add GMC block
 amdgpu-y += \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 17d4311e22d5..8d3681172cea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -35,6 +35,7 @@
 #include "df_v1_7.h"
 #include "df_v3_6.h"
 #include "df_v4_3.h"
+#include "df_v4_6_2.h"
 #include "nbio_v6_1.h"
 #include "nbio_v7_0.h"
 #include "nbio_v7_4.h"
@@ -2557,6 +2558,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device 
*adev)
case IP_VERSION(4, 3, 0):
adev->df.funcs = &df_v4_3_funcs;
break;
+   case IP_VERSION(4, 6, 2):
+   adev->df.funcs = &df_v4_6_2_funcs;
+   break;
default:
break;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c 
b/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c
new file mode 100644
index 000000000000..a47960a0babd
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "amdgpu.h"
+#include "df_v4_6_2.h"
+
+static bool df_v4_6_2_query_ras_poison_mode(struct amdgpu_device *adev)
+{
+   /* return true since related regs are inaccessible */
+   return true;
+}
+
+const struct amdgpu_df_funcs df_v4_6_2_funcs = {
+   .query_ras_poison_mode = df_v4_6_2_query_ras_poison_mode,
+};
diff --git a/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h 
b/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h
new file mode 100644
index 000000000000..3bc3e6d216e2
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __DF_V4_6_2_H__
+#define __DF_V4_6_2_H__
+
+#include "soc15_common.h"
+
+extern const struct amdgpu_df_funcs df_v4_6_2_funcs;
+
+#endif
-- 
2.35.1

[PATCH] drm/amdgpu: enable RAS poison mode for APU

2023-10-20 Thread Tao Zhou

Enable it by default on APU platform.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 95c181cd1fea..a41cab0a2f9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2710,7 +2710,8 @@ static void amdgpu_ras_query_poison_mode(struct 
amdgpu_device *adev)
return;
 
/* Init poison supported flag, the default value is false */
-   if (adev->gmc.xgmi.connected_to_cpu) {
+   if (adev->gmc.xgmi.connected_to_cpu ||
+   adev->gmc.is_app_apu) {
/* enabled by default when GPU is connected to CPU */
con->poison_supported = true;
} else if (adev->df.funcs &&
-- 
2.35.1

[PATCH 6/6] drm/amdgpu: drop status query/reset for GCEA 9.4.3 and MMEA 1.8

2023-10-18 Thread Tao Zhou

PMFW will be responsible for them.

v2: remove query interfaces.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c |  60 --
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 143 
 2 files changed, 203 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index a1c2c952d882..362bf51ab1d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -3754,10 +3754,6 @@ static const struct amdgpu_gfx_ras_reg_entry 
gfx_v9_4_3_ue_reg_list[] = {
AMDGPU_GFX_LDS_MEM, 4},
 };
 
-static const struct soc15_reg_entry gfx_v9_4_3_ea_err_status_regs = {
-   SOC15_REG_ENTRY(GC, 0, regGCEA_ERR_STATUS), 0, 1, 16
-};
-
 static void gfx_v9_4_3_inst_query_ras_err_count(struct amdgpu_device *adev,
void *ras_error_status, int xcc_id)
 {
@@ -3846,39 +3842,6 @@ static void gfx_v9_4_3_inst_reset_ras_err_count(struct 
amdgpu_device *adev,
mutex_unlock(&adev->grbm_idx_mutex);
 }
 
-static void gfx_v9_4_3_inst_query_ea_err_status(struct amdgpu_device *adev,
-   int xcc_id)
-{
-   uint32_t i, j;
-   uint32_t reg_value;
-
-   mutex_lock(&adev->grbm_idx_mutex);
-
-   for (i = 0; i < gfx_v9_4_3_ea_err_status_regs.se_num; i++) {
-   for (j = 0; j < gfx_v9_4_3_ea_err_status_regs.instance; j++) {
-   gfx_v9_4_3_xcc_select_se_sh(adev, i, 0, j, xcc_id);
-   reg_value = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
-   regGCEA_ERR_STATUS);
-   if (REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, 
SDP_RDRSP_STATUS) ||
-   REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, 
SDP_WRRSP_STATUS) ||
-   REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, 
SDP_RDRSP_DATAPARITY_ERROR)) {
-   dev_warn(adev->dev,
-   "GCEA err detected at instance: %d, 
status: 0x%x!\n",
-   j, reg_value);
-   }
-   /* clear after read */
-   reg_value = REG_SET_FIELD(reg_value, GCEA_ERR_STATUS,
- CLEAR_ERROR_STATUS, 0x1);
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id), 
regGCEA_ERR_STATUS,
-   reg_value);
-   }
-   }
-
-   gfx_v9_4_3_xcc_select_se_sh(adev, 0x, 0x, 0x,
-   xcc_id);
-   mutex_unlock(&adev->grbm_idx_mutex);
-}
-
 static void gfx_v9_4_3_inst_query_utc_err_status(struct amdgpu_device *adev,
int xcc_id)
 {
@@ -3983,7 +3946,6 @@ static void 
gfx_v9_4_3_inst_query_sq_timeout_status(struct amdgpu_device *adev,
 static void gfx_v9_4_3_inst_query_ras_err_status(struct amdgpu_device *adev,
void *ras_error_status, int xcc_id)
 {
-   gfx_v9_4_3_inst_query_ea_err_status(adev, xcc_id);
gfx_v9_4_3_inst_query_utc_err_status(adev, xcc_id);
gfx_v9_4_3_inst_query_sq_timeout_status(adev, xcc_id);
 }
@@ -3996,27 +3958,6 @@ static void gfx_v9_4_3_inst_reset_utc_err_status(struct 
amdgpu_device *adev,
WREG32_SOC15(GC, GET_INST(GC, xcc_id), regVML2_WALKER_MEM_ECC_STATUS, 
0x3);
 }
 
-static void gfx_v9_4_3_inst_reset_ea_err_status(struct amdgpu_device *adev,
-   int xcc_id)
-{
-   uint32_t i, j;
-   uint32_t value;
-
-   mutex_lock(&adev->grbm_idx_mutex);
-   for (i = 0; i < gfx_v9_4_3_ea_err_status_regs.se_num; i++) {
-   for (j = 0; j < gfx_v9_4_3_ea_err_status_regs.instance; j++) {
-   gfx_v9_4_3_xcc_select_se_sh(adev, i, 0, j, xcc_id);
-   value = RREG32_SOC15(GC, GET_INST(GC, xcc_id), 
regGCEA_ERR_STATUS);
-   value = REG_SET_FIELD(value, GCEA_ERR_STATUS,
-   CLEAR_ERROR_STATUS, 0x1);
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id), 
regGCEA_ERR_STATUS, value);
-   }
-   }
-   gfx_v9_4_3_xcc_select_se_sh(adev, 0x, 0x, 0x,
-   xcc_id);
-   mutex_unlock(&adev->grbm_idx_mutex);
-}
-
 static void gfx_v9_4_3_inst_reset_sq_timeout_status(struct amdgpu_device *adev,
int xcc_id)
 {
@@ -4042,7 +3983,6 @@ static void gfx_v9_4_3_inst_reset_ras_err_status(struct 
amdgpu_device *adev,
void *ras_error_status, int xcc_id)
 {
gfx_v9_4_3_inst_reset_utc_err_status(adev, xcc_id);
-   gfx_v9_4_3_inst_reset_ea_err_status(adev, xcc_id);
gfx_v9_4_3_inst_reset_sq_timeout_status(adev, xcc_id);

[PATCH 4/6] drm/amd/pm: record mca debug mode in RAS

2023-10-18 Thread Tao Zhou

Call amdgpu_ras_set_mca_debug_mode when we set mca debug mode in smu
v13_0_6.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 74fc945a8f9b..c5c1f479b925 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -1475,6 +1475,7 @@ static int smu_v13_0_6_mca_set_debug_mode(struct 
smu_context *smu, bool enable)
if (smu->smc_fw_version < 0x554800)
return 0;
 
+   amdgpu_ras_set_mca_debug_mode(smu->adev, enable);
return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead,
   enable ? 0 : 
ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK,
   NULL);
-- 
2.35.1

[PATCH 5/6] drm/amdgpu: bypass RAS error reset in some conditions

2023-10-18 Thread Tao Zhou

PMFW is responsible for RAS error reset in some conditions, so the driver
can skip the operation.

v2: add check for ras->in_recovery, it's set earlier than
amdgpu_in_reset.

v3: fix error in gpu reset check.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 0eb3dbd9d548..95c181cd1fea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1178,6 +1178,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
enum amdgpu_ras_block block)
 {
struct amdgpu_ras_block_object *block_obj = 
amdgpu_ras_get_ras_block(adev, block, 0);
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+   const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
 
if (!block_obj || !block_obj->hw_ops) {
dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
@@ -1185,7 +1187,13 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
return -EOPNOTSUPP;
}
 
-   if (!amdgpu_ras_is_supported(adev, block))
+   /* skip ras error reset in gpu reset */
+   if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery)) &&
+   mca_funcs && mca_funcs->mca_set_debug_mode)
+   return -EOPNOTSUPP;
+
+   if (!amdgpu_ras_is_supported(adev, block) ||
+   !amdgpu_ras_get_mca_debug_mode(adev))
return -EOPNOTSUPP;
 
if (block_obj->hw_ops->reset_ras_error_count)
-- 
2.35.1

[PATCH 3/6] drm/amdgpu: add set/get mca debug mode operations

2023-10-18 Thread Tao Zhou

Record the debug mode status in RAS.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 21 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 +
 2 files changed, 26 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 335f5d8bc20b..0eb3dbd9d548 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3323,6 +3323,27 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
return 0;
 }
 
+void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   if (con)
+   con->is_mca_debug_mode = enable;
+}
+
+bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+
+   if (!con)
+   return false;
+
+   if (mca_funcs && mca_funcs->mca_set_debug_mode)
+   return con->is_mca_debug_mode;
+   else
+   return true;
+}
 
 /* Register each ip ras block into amdgpu ras */
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 3f9ac0ab67e6..2fdfef62ee27 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -434,6 +434,8 @@ struct amdgpu_ras {
 
/* Indicates smu whether need update bad channel info */
bool update_channel_flag;
+   /* Record status of smu mca debug mode */
+   bool is_mca_debug_mode;
 
/* Record special requirements of gpu reset caller */
uint32_t  gpu_reset_flags;
@@ -768,6 +770,9 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct 
amdgpu_device *adev);
 
 int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras 
*ras_con);
 
+void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
+bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
+
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
struct amdgpu_ras_block_object *ras_block_obj);
 void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev);
-- 
2.35.1

[PATCH 2/6] drm/amdgpu: replace reset_error_count with amdgpu_ras_reset_error_count

2023-10-18 Thread Tao Zhou

Simplify the code.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 4 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 9 ++---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 7 ++-
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c   | 7 ++-
 5 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 31f8c3ead161..04cfd67a37a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3594,9 +3594,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
work_struct *__work)
if (adev->asic_reset_res)
goto fail;
 
-   if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
-   adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
-   
adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
+   amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
} else {
 
task_barrier_full(&hive->tb);
@@ -5242,9 +5240,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
 
if (!r && amdgpu_ras_intr_triggered()) {
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-   if (tmp_adev->mmhub.ras && 
tmp_adev->mmhub.ras->ras_block.hw_ops &&
-   
tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
-   
tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
+   amdgpu_ras_reset_error_count(tmp_adev, 
AMDGPU_RAS_BLOCK__MMHUB);
}
 
amdgpu_ras_intr_cleared();
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 70e38b013309..2b7dc490ba6b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -914,7 +914,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device 
*adev, struct ras_comm
adev->gmc.xgmi.num_physical_nodes == 0)
return 0;
 
-   adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);
+   amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);
 
return amdgpu_ras_block_late_init(adev, ras_block);
 }
@@ -1081,7 +1081,7 @@ static void amdgpu_xgmi_query_ras_error_count(struct 
amdgpu_device *adev,
break;
}
 
-   adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);
+   amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);
 
err_data->ue_count += ue_cnt;
err_data->ce_count += ce_cnt;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index d95fafe7f7ed..70e7e93d382f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1604,13 +1604,8 @@ static int gmc_v9_0_late_init(void *handle)
}
 
if (!amdgpu_persistent_edc_harvesting_supported(adev)) {
-   if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
-   adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
-   
adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
-
-   if (adev->hdp.ras && adev->hdp.ras->ras_block.hw_ops &&
-   adev->hdp.ras->ras_block.hw_ops->reset_ras_error_count)
-   
adev->hdp.ras->ras_block.hw_ops->reset_ras_error_count(adev);
+   amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
+   amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__HDP);
}
 
r = amdgpu_gmc_ras_late_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index ef04aad788a8..7ae5f134f09b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1750,11 +1750,8 @@ static int sdma_v4_0_late_init(void *handle)
 
sdma_v4_0_setup_ulv(adev);
 
-   if (!amdgpu_persistent_edc_harvesting_supported(adev)) {
-   if (adev->sdma.ras && adev->sdma.ras->ras_block.hw_ops &&
-   adev->sdma.ras->ras_block.hw_ops->reset_ras_error_count)
-   
adev->sdma.ras->ras_block.hw_ops->reset_ras_error_count(adev);
-   }
+   if (!amdgpu_persistent_edc_harvesting_supported(adev))
+   amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__SDMA);
 
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/

[PATCH 1/6] drm/amdgpu: define ras_reset_error_count function

2023-10-18 Thread Tao Zhou

Simplify the code architecture.

v2: reuse ras_reset_error_count in ras_reset_error_status.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 19 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  2 ++
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1b23651cacf4..335f5d8bc20b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1174,23 +1174,34 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
return ret;
 }
 
-int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
+int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
enum amdgpu_ras_block block)
 {
struct amdgpu_ras_block_object *block_obj = 
amdgpu_ras_get_ras_block(adev, block, 0);
 
if (!block_obj || !block_obj->hw_ops) {
dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
-ras_block_str(block));
-   return 0;
+   ras_block_str(block));
+   return -EOPNOTSUPP;
}
 
if (!amdgpu_ras_is_supported(adev, block))
-   return 0;
+   return -EOPNOTSUPP;
 
if (block_obj->hw_ops->reset_ras_error_count)
block_obj->hw_ops->reset_ras_error_count(adev);
 
+   return 0;
+}
+
+int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
+   enum amdgpu_ras_block block)
+{
+   struct amdgpu_ras_block_object *block_obj = 
amdgpu_ras_get_ras_block(adev, block, 0);
+
+   if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP)
+   return 0;
+
if ((block == AMDGPU_RAS_BLOCK__GFX) ||
(block == AMDGPU_RAS_BLOCK__MMHUB)) {
if (block_obj->hw_ops->reset_ras_error_status)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 0a5c8a107fb2..3f9ac0ab67e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -714,6 +714,8 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device 
*adev);
 int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info);
 
+int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
+   enum amdgpu_ras_block block);
 int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
enum amdgpu_ras_block block);
 
-- 
2.35.1

[PATCH 6/6] drm/amdgpu: drop status reset for GCEA 9.4.3 and MMEA 1.8

2023-10-17 Thread Tao Zhou

PMFW will be responsible for it.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 22 ---
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 86 -
 2 files changed, 108 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index a1c2c952d882..65da72735e52 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -3996,27 +3996,6 @@ static void gfx_v9_4_3_inst_reset_utc_err_status(struct 
amdgpu_device *adev,
WREG32_SOC15(GC, GET_INST(GC, xcc_id), regVML2_WALKER_MEM_ECC_STATUS, 
0x3);
 }
 
-static void gfx_v9_4_3_inst_reset_ea_err_status(struct amdgpu_device *adev,
-   int xcc_id)
-{
-   uint32_t i, j;
-   uint32_t value;
-
-   mutex_lock(&adev->grbm_idx_mutex);
-   for (i = 0; i < gfx_v9_4_3_ea_err_status_regs.se_num; i++) {
-   for (j = 0; j < gfx_v9_4_3_ea_err_status_regs.instance; j++) {
-   gfx_v9_4_3_xcc_select_se_sh(adev, i, 0, j, xcc_id);
-   value = RREG32_SOC15(GC, GET_INST(GC, xcc_id), 
regGCEA_ERR_STATUS);
-   value = REG_SET_FIELD(value, GCEA_ERR_STATUS,
-   CLEAR_ERROR_STATUS, 0x1);
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id), 
regGCEA_ERR_STATUS, value);
-   }
-   }
-   gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff,
-   xcc_id);
-   mutex_unlock(&adev->grbm_idx_mutex);
-}
-
 static void gfx_v9_4_3_inst_reset_sq_timeout_status(struct amdgpu_device *adev,
int xcc_id)
 {
@@ -4042,7 +4021,6 @@ static void gfx_v9_4_3_inst_reset_ras_err_status(struct 
amdgpu_device *adev,
void *ras_error_status, int xcc_id)
 {
gfx_v9_4_3_inst_reset_utc_err_status(adev, xcc_id);
-   gfx_v9_4_3_inst_reset_ea_err_status(adev, xcc_id);
gfx_v9_4_3_inst_reset_sq_timeout_status(adev, xcc_id);
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
index aa00483e7b37..616d75add087 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
@@ -756,96 +756,10 @@ static void mmhub_v1_8_query_ras_error_status(struct 
amdgpu_device *adev)
mmhub_v1_8_inst_query_ras_err_status(adev, i);
 }
 
-static void mmhub_v1_8_inst_reset_ras_err_status(struct amdgpu_device *adev,
-uint32_t mmhub_inst)
-{
-   uint32_t mmea_cgtt_clk_cntl_addr_dist;
-   uint32_t mmea_err_status_addr_dist;
-   uint32_t reg_value;
-   uint32_t i;
-
-   /* reset mmea ras err status */
-   mmea_cgtt_clk_cntl_addr_dist = regMMEA1_CGTT_CLK_CTRL - 
regMMEA0_CGTT_CLK_CTRL;
-   mmea_err_status_addr_dist = regMMEA1_ERR_STATUS - regMMEA0_ERR_STATUS;
-   for (i = 0; i < ARRAY_SIZE(mmhub_v1_8_mmea_err_status_reg); i++) {
-   /* force clk branch on for response path
-* set MMEA0_CGTT_CLK_CTRL.SOFT_OVERRIDE_RETURN = 1
-*/
-   reg_value = RREG32_SOC15_OFFSET(MMHUB, mmhub_inst,
-   regMMEA0_CGTT_CLK_CTRL,
-   i * 
mmea_cgtt_clk_cntl_addr_dist);
-   reg_value = REG_SET_FIELD(reg_value, MMEA0_CGTT_CLK_CTRL,
- SOFT_OVERRIDE_RETURN, 1);
-   WREG32_SOC15_OFFSET(MMHUB, mmhub_inst,
-   regMMEA0_CGTT_CLK_CTRL,
-   i * mmea_cgtt_clk_cntl_addr_dist,
-   reg_value);
-
-   /* set MMEA0_ERR_STATUS.CLEAR_ERROR_STATUS = 1 */
-   reg_value = RREG32_SOC15_OFFSET(MMHUB, mmhub_inst,
-   regMMEA0_ERR_STATUS,
-   i * mmea_err_status_addr_dist);
-   reg_value = REG_SET_FIELD(reg_value, MMEA0_ERR_STATUS,
- CLEAR_ERROR_STATUS, 1);
-   WREG32_SOC15_OFFSET(MMHUB, mmhub_inst,
-   regMMEA0_ERR_STATUS,
-   i * mmea_err_status_addr_dist,
-   reg_value);
-
-   /* set MMEA0_CGTT_CLK_CTRL.SOFT_OVERRIDE_RETURN = 0 */
-   reg_value = RREG32_SOC15_OFFSET(MMHUB, mmhub_inst,
-   regMMEA0_CGTT_CLK_CTRL,
-   i * 
mmea_cgtt_clk_cntl_addr_dist);
-   reg_value = REG_SET_FIELD(reg_value, MMEA0_CGTT_CLK_CTRL,
- SOFT_OVERRIDE_RETURN, 0);
-   WREG32_SOC15_O

[PATCH 4/6] drm/amdgpu: bypass RAS error reset in some conditions

2023-10-17 Thread Tao Zhou

PMFW is responsible for RAS error reset in some conditions, so the driver
can skip the operation.

v2: add a check for ras->in_recovery, which is set earlier than
amdgpu_in_reset.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 ++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 95c7ba889e2d..806c6d4deb63 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1178,11 +1178,19 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
enum amdgpu_ras_block block)
 {
struct amdgpu_ras_block_object *block_obj = 
amdgpu_ras_get_ras_block(adev, block, 0);
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+   const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
 
if (!block_obj || !block_obj->hw_ops)
return 0;
 
-   if (!amdgpu_ras_is_supported(adev, block))
+   /* skip ras error reset in gpu reset */
+   if (amdgpu_in_reset(adev) && atomic_read(&ras->in_recovery) &&
+   mca_funcs && mca_funcs->mca_set_debug_mode)
+   return 0;
+
+   if (!amdgpu_ras_is_supported(adev, block) ||
+   !amdgpu_ras_get_mca_debug_mode(adev))
return 0;
 
if (block_obj->hw_ops->reset_ras_error_count)
@@ -1195,6 +1203,8 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device 
*adev,
enum amdgpu_ras_block block)
 {
struct amdgpu_ras_block_object *block_obj = 
amdgpu_ras_get_ras_block(adev, block, 0);
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+   const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
 
if (!block_obj || !block_obj->hw_ops) {
dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
@@ -1202,7 +1212,13 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device 
*adev,
return 0;
}
 
-   if (!amdgpu_ras_is_supported(adev, block))
+   /* skip ras error reset in gpu reset */
+   if (amdgpu_in_reset(adev) && atomic_read(&ras->in_recovery) &&
+   mca_funcs && mca_funcs->mca_set_debug_mode)
+   return 0;
+
+   if (!amdgpu_ras_is_supported(adev, block) ||
+   !amdgpu_ras_get_mca_debug_mode(adev))
return 0;
 
if (block_obj->hw_ops->reset_ras_error_count)
-- 
2.35.1

[PATCH 5/6] drm/amdgpu: reuse amdgpu_ras_reset_error_count code

2023-10-17 Thread Tao Zhou

Simplify the code of amdgpu_ras_reset_error_status without any logical
change.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 30 +++--
 1 file changed, 8 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 806c6d4deb63..b4e8d0c629cc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1181,17 +1181,20 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
 
-   if (!block_obj || !block_obj->hw_ops)
-   return 0;
+   if (!block_obj || !block_obj->hw_ops) {
+   dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
+   ras_block_str(block));
+   return -EOPNOTSUPP;
+   }
 
/* skip ras error reset in gpu reset */
if (amdgpu_in_reset(adev) && atomic_read(&ras->in_recovery) &&
mca_funcs && mca_funcs->mca_set_debug_mode)
-   return 0;
+   return -EOPNOTSUPP;
 
if (!amdgpu_ras_is_supported(adev, block) ||
!amdgpu_ras_get_mca_debug_mode(adev))
-   return 0;
+   return -EOPNOTSUPP;
 
if (block_obj->hw_ops->reset_ras_error_count)
block_obj->hw_ops->reset_ras_error_count(adev);
@@ -1203,27 +1206,10 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device 
*adev,
enum amdgpu_ras_block block)
 {
struct amdgpu_ras_block_object *block_obj = 
amdgpu_ras_get_ras_block(adev, block, 0);
-   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-   const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
-
-   if (!block_obj || !block_obj->hw_ops) {
-   dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
-ras_block_str(block));
-   return 0;
-   }
 
-   /* skip ras error reset in gpu reset */
-   if (amdgpu_in_reset(adev) && atomic_read(&ras->in_recovery) &&
-   mca_funcs && mca_funcs->mca_set_debug_mode)
+   if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP)
return 0;
 
-   if (!amdgpu_ras_is_supported(adev, block) ||
-   !amdgpu_ras_get_mca_debug_mode(adev))
-   return 0;
-
-   if (block_obj->hw_ops->reset_ras_error_count)
-   block_obj->hw_ops->reset_ras_error_count(adev);
-
if ((block == AMDGPU_RAS_BLOCK__GFX) ||
(block == AMDGPU_RAS_BLOCK__MMHUB)) {
if (block_obj->hw_ops->reset_ras_error_status)
-- 
2.35.1

[PATCH 2/6] drm/amdgpu: add set/get mca debug mode operations

2023-10-17 Thread Tao Zhou

Record the debug mode status in RAS.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 21 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 +
 2 files changed, 26 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 344ebcf1a6e5..95c7ba889e2d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3329,6 +3329,27 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
return 0;
 }
 
+void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   if (con)
+   con->is_mca_debug_mode = enable;
+}
+
+bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+
+   if (!con)
+   return false;
+
+   if (mca_funcs && mca_funcs->mca_set_debug_mode)
+   return con->is_mca_debug_mode;
+   else
+   return true;
+}
 
 /* Register each ip ras block into amdgpu ras */
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 3f9ac0ab67e6..2fdfef62ee27 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -434,6 +434,8 @@ struct amdgpu_ras {
 
/* Indicates smu whether need update bad channel info */
bool update_channel_flag;
+   /* Record status of smu mca debug mode */
+   bool is_mca_debug_mode;
 
/* Record special requirements of gpu reset caller */
uint32_t  gpu_reset_flags;
@@ -768,6 +770,9 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct 
amdgpu_device *adev);
 
 int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras 
*ras_con);
 
+void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
+bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
+
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
struct amdgpu_ras_block_object *ras_block_obj);
 void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev);
-- 
2.35.1

[PATCH 3/6] drm/amd/pm: record mca debug mode in RAS

2023-10-17 Thread Tao Zhou

Call amdgpu_ras_set_mca_debug_mode when we set mca debug mode in smu
v13_0_6.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 74fc945a8f9b..c5c1f479b925 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -1475,6 +1475,7 @@ static int smu_v13_0_6_mca_set_debug_mode(struct 
smu_context *smu, bool enable)
if (smu->smc_fw_version < 0x554800)
return 0;
 
+   amdgpu_ras_set_mca_debug_mode(smu->adev, enable);
return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead,
   enable ? 0 : 
ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK,
   NULL);
-- 
2.35.1

[PATCH 1/6] drm/amdgpu: define ras_reset_error_count function

2023-10-17 Thread Tao Zhou

Simplify the code architecture.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  8 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 17 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |  4 ++--
 4 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 31f8c3ead161..04cfd67a37a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3594,9 +3594,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
work_struct *__work)
if (adev->asic_reset_res)
goto fail;
 
-   if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
-   adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
-   
adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
+   amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
} else {
 
task_barrier_full(&hive->tb);
@@ -5242,9 +5240,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
 
if (!r && amdgpu_ras_intr_triggered()) {
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-   if (tmp_adev->mmhub.ras && 
tmp_adev->mmhub.ras->ras_block.hw_ops &&
-   
tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
-   
tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
+   amdgpu_ras_reset_error_count(tmp_adev, 
AMDGPU_RAS_BLOCK__MMHUB);
}
 
amdgpu_ras_intr_cleared();
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1b23651cacf4..344ebcf1a6e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1174,6 +1174,23 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
return ret;
 }
 
+int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
+   enum amdgpu_ras_block block)
+{
+   struct amdgpu_ras_block_object *block_obj = 
amdgpu_ras_get_ras_block(adev, block, 0);
+
+   if (!block_obj || !block_obj->hw_ops)
+   return 0;
+
+   if (!amdgpu_ras_is_supported(adev, block))
+   return 0;
+
+   if (block_obj->hw_ops->reset_ras_error_count)
+   block_obj->hw_ops->reset_ras_error_count(adev);
+
+   return 0;
+}
+
 int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
enum amdgpu_ras_block block)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 0a5c8a107fb2..3f9ac0ab67e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -714,6 +714,8 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device 
*adev);
 int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info);
 
+int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
+   enum amdgpu_ras_block block);
 int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
enum amdgpu_ras_block block);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 70e38b013309..2b7dc490ba6b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -914,7 +914,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device 
*adev, struct ras_comm
adev->gmc.xgmi.num_physical_nodes == 0)
return 0;
 
-   adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);
+   amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);
 
return amdgpu_ras_block_late_init(adev, ras_block);
 }
@@ -1081,7 +1081,7 @@ static void amdgpu_xgmi_query_ras_error_count(struct 
amdgpu_device *adev,
break;
}
 
-   adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);
+   amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);
 
err_data->ue_count += ue_cnt;
err_data->ce_count += ce_cnt;
-- 
2.35.1

[PATCH 5/5] drm/amdgpu: reuse amdgpu_ras_reset_error_count code

2023-10-12 Thread Tao Zhou

Simplify the code of amdgpu_ras_reset_error_status without any logical
change.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 29 +++--
 1 file changed, 8 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6dddb0423411..3698be22 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1107,17 +1107,20 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
struct amdgpu_ras_block_object *block_obj = 
amdgpu_ras_get_ras_block(adev, block, 0);
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
 
-   if (!block_obj || !block_obj->hw_ops)
-   return 0;
+   if (!block_obj || !block_obj->hw_ops) {
+   dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
+ras_block_str(block));
+   return -EOPNOTSUPP;
+   }
 
/* skip ras error reset in gpu reset */
if (amdgpu_in_reset(adev) &&
mca_funcs && mca_funcs->mca_set_debug_mode)
-   return 0;
+   return -EOPNOTSUPP;
 
if (!amdgpu_ras_is_supported(adev, block) ||
!amdgpu_ras_get_mca_debug_mode(adev))
-   return 0;
+   return -EOPNOTSUPP;
 
if (block_obj->hw_ops->reset_ras_error_count)
block_obj->hw_ops->reset_ras_error_count(adev);
@@ -1129,25 +1132,9 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device 
*adev,
enum amdgpu_ras_block block)
 {
struct amdgpu_ras_block_object *block_obj = 
amdgpu_ras_get_ras_block(adev, block, 0);
-   const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
 
-   if (!block_obj || !block_obj->hw_ops) {
-   dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
-ras_block_str(block));
+   if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP)
return 0;
-   }
-
-   /* skip ras error reset in gpu reset */
-   if (amdgpu_in_reset(adev) &&
-   mca_funcs && mca_funcs->mca_set_debug_mode)
-   return 0;
-
-   if (!amdgpu_ras_is_supported(adev, block) ||
-   !amdgpu_ras_get_mca_debug_mode(adev))
-   return 0;
-
-   if (block_obj->hw_ops->reset_ras_error_count)
-   block_obj->hw_ops->reset_ras_error_count(adev);
 
if ((block == AMDGPU_RAS_BLOCK__GFX) ||
(block == AMDGPU_RAS_BLOCK__MMHUB)) {
-- 
2.35.1

[PATCH 4/5] drm/amdgpu: bypass RAS error reset in some conditions

2023-10-12 Thread Tao Zhou

PMFW is responsible for RAS error reset in some conditions, so the driver
can skip the operation.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 91ed4fd96ee1..6dddb0423411 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1105,11 +1105,18 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
enum amdgpu_ras_block block)
 {
struct amdgpu_ras_block_object *block_obj = 
amdgpu_ras_get_ras_block(adev, block, 0);
+   const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
 
if (!block_obj || !block_obj->hw_ops)
return 0;
 
-   if (!amdgpu_ras_is_supported(adev, block))
+   /* skip ras error reset in gpu reset */
+   if (amdgpu_in_reset(adev) &&
+   mca_funcs && mca_funcs->mca_set_debug_mode)
+   return 0;
+
+   if (!amdgpu_ras_is_supported(adev, block) ||
+   !amdgpu_ras_get_mca_debug_mode(adev))
return 0;
 
if (block_obj->hw_ops->reset_ras_error_count)
@@ -1122,6 +1129,7 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device 
*adev,
enum amdgpu_ras_block block)
 {
struct amdgpu_ras_block_object *block_obj = 
amdgpu_ras_get_ras_block(adev, block, 0);
+   const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
 
if (!block_obj || !block_obj->hw_ops) {
dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
@@ -1129,7 +1137,13 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device 
*adev,
return 0;
}
 
-   if (!amdgpu_ras_is_supported(adev, block))
+   /* skip ras error reset in gpu reset */
+   if (amdgpu_in_reset(adev) &&
+   mca_funcs && mca_funcs->mca_set_debug_mode)
+   return 0;
+
+   if (!amdgpu_ras_is_supported(adev, block) ||
+   !amdgpu_ras_get_mca_debug_mode(adev))
return 0;
 
if (block_obj->hw_ops->reset_ras_error_count)
-- 
2.35.1

[PATCH 3/5] drm/amd/pm: record mca debug mode in RAS

2023-10-12 Thread Tao Zhou

Call amdgpu_ras_set_mca_debug_mode when we set mca debug mode in smu
v13_0_6.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 8220bdcbd927..55b0846337a7 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -1414,6 +1414,7 @@ static int smu_v13_0_6_mca_set_debug_mode(struct 
smu_context *smu, bool enable)
if (smu_version < 0x554800)
return 0;
 
+   amdgpu_ras_set_mca_debug_mode(smu->adev, enable);
return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead,
   enable ? 0 : 
ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK,
   NULL);
-- 
2.35.1

[PATCH 2/5] drm/amdgpu: add set/get mca debug mode operations

2023-10-12 Thread Tao Zhou

Record the debug mode status in RAS.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 21 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 +
 2 files changed, 26 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 82ee6f3d12da..91ed4fd96ee1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3250,6 +3250,27 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
return 0;
 }
 
+void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   if (con)
+   con->is_mca_debug_mode = enable;
+}
+
+bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+
+   if (!con)
+   return false;
+
+   if (mca_funcs && mca_funcs->mca_set_debug_mode)
+   return con->is_mca_debug_mode;
+   else
+   return true;
+}
 
 /* Register each ip ras block into amdgpu ras */
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index deea10b6c184..c60688dc73ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -433,6 +433,8 @@ struct amdgpu_ras {
 
/* Indicates smu whether need update bad channel info */
bool update_channel_flag;
+   /* Record status of smu mca debug mode */
+   bool is_mca_debug_mode;
 
/* Record special requirements of gpu reset caller */
uint32_t  gpu_reset_flags;
@@ -748,6 +750,9 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct 
amdgpu_device *adev);
 
 int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras 
*ras_con);
 
+void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
+bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
+
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
struct amdgpu_ras_block_object *ras_block_obj);
 void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev);
-- 
2.35.1

[PATCH 1/5] drm/amdgpu: define ras_reset_error_count function

2023-10-12 Thread Tao Zhou

Simplify the code architecture.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  8 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 17 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |  4 ++--
 4 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 31f8c3ead161..04cfd67a37a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3594,9 +3594,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
work_struct *__work)
if (adev->asic_reset_res)
goto fail;
 
-   if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
-   adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
-   
adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
+   amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
} else {
 
task_barrier_full(&hive->tb);
@@ -5242,9 +5240,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
 
if (!r && amdgpu_ras_intr_triggered()) {
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-   if (tmp_adev->mmhub.ras && 
tmp_adev->mmhub.ras->ras_block.hw_ops &&
-   
tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
-   
tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
+   amdgpu_ras_reset_error_count(tmp_adev, 
AMDGPU_RAS_BLOCK__MMHUB);
}
 
amdgpu_ras_intr_cleared();
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index dacce5f2bfa3..82ee6f3d12da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1101,6 +1101,23 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
return 0;
 }
 
+int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
+   enum amdgpu_ras_block block)
+{
+   struct amdgpu_ras_block_object *block_obj = 
amdgpu_ras_get_ras_block(adev, block, 0);
+
+   if (!block_obj || !block_obj->hw_ops)
+   return 0;
+
+   if (!amdgpu_ras_is_supported(adev, block))
+   return 0;
+
+   if (block_obj->hw_ops->reset_ras_error_count)
+   block_obj->hw_ops->reset_ras_error_count(adev);
+
+   return 0;
+}
+
 int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
enum amdgpu_ras_block block)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 728f98c6fc1c..deea10b6c184 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -694,6 +694,8 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device 
*adev);
 int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info);
 
+int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
+   enum amdgpu_ras_block block);
 int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
enum amdgpu_ras_block block);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 70e38b013309..2b7dc490ba6b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -914,7 +914,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device 
*adev, struct ras_comm
adev->gmc.xgmi.num_physical_nodes == 0)
return 0;
 
-   adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);
+   amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);
 
return amdgpu_ras_block_late_init(adev, ras_block);
 }
@@ -1081,7 +1081,7 @@ static void amdgpu_xgmi_query_ras_error_count(struct 
amdgpu_device *adev,
break;
}
 
-   adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);
+   amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);
 
err_data->ue_count += ue_cnt;
err_data->ce_count += ce_cnt;
-- 
2.35.1

[PATCH 1/2] drm/amdgpu: exit directly if gpu reset fails

2023-09-27 Thread Tao Zhou

No need to perform the full reset operation in case of gpu reset
failure.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5436d7a34014..e4627d92e1d0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5075,7 +5075,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
if (r) {
dev_err(tmp_adev->dev, "ASIC reset failed with 
error, %d for drm dev, %s",
 r, adev_to_drm(tmp_adev)->unique);
-   break;
+   goto out;
}
}
 
-- 
2.35.1

[PATCH 2/2] drm/amdgpu: update retry times for psp vmbx wait

2023-09-27 Thread Tao Zhou

Increase the retry loops and replace the constant number with macro.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 54008a8991fc..b7bc00d4c696 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -59,6 +59,9 @@ MODULE_FIRMWARE("amdgpu/psp_14_0_0_ta.bin");
 /* Read USB-PD from LFB */
 #define GFX_CMD_USB_PD_USE_LFB 0x480
 
+/* Retry times for vmbx ready wait */
+#define PSP_VMBX_POLLING_LIMIT 20000
+
 /* VBIOS gfl defines */
 #define MBOX_READY_MASK 0x80000000
 #define MBOX_STATUS_MASK 0x0000FFFF
@@ -138,7 +141,7 @@ static int psp_v13_0_wait_for_vmbx_ready(struct psp_context 
*psp)
struct amdgpu_device *adev = psp->adev;
int retry_loop, ret;
 
-   for (retry_loop = 0; retry_loop < 70; retry_loop++) {
+   for (retry_loop = 0; retry_loop < PSP_VMBX_POLLING_LIMIT; retry_loop++) 
{
/* Wait for bootloader to signify that is
   ready having bit 31 of C2PMSG_33 set to 1 */
ret = psp_wait_for(
-- 
2.35.1

[PATCH 3/3] drm/amdgpu: change if condition for bad channel bitmap update

2023-09-20 Thread Tao Zhou

The amdgpu_ras_eeprom_control.bad_channel_bitmap is u32 type, but the
channel index could be larger than 32. For the ASICs whose channel
number is more than 32, the amdgpu_dpm_send_hbm_bad_channel_flag
interface is not supported, so we simply bypass channel bitmap update under
this condition.

v2: replace sizeof with BITS_PER_TYPE, we should check bit number
instead of byte number.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 8ced4be784e0..c60d2f79eeef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -616,7 +616,8 @@ amdgpu_ras_eeprom_append_table(struct 
amdgpu_ras_eeprom_control *control,
__encode_table_record_to_buf(control, &record[i], pp);
 
/* update bad channel bitmap */
-   if (!(control->bad_channel_bitmap & (1 << 
record[i].mem_channel))) {
+   if ((record[i].mem_channel < 
BITS_PER_TYPE(control->bad_channel_bitmap)) &&
+   !(control->bad_channel_bitmap & (1 << 
record[i].mem_channel))) {
control->bad_channel_bitmap |= 1 << 
record[i].mem_channel;
con->update_channel_flag = true;
}
@@ -969,7 +970,8 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control 
*control,
__decode_table_record_from_buf(control, &record[i], pp);
 
/* update bad channel bitmap */
-   if (!(control->bad_channel_bitmap & (1 << 
record[i].mem_channel))) {
+   if ((record[i].mem_channel < 
BITS_PER_TYPE(control->bad_channel_bitmap)) &&
+   !(control->bad_channel_bitmap & (1 << 
record[i].mem_channel))) {
control->bad_channel_bitmap |= 1 << 
record[i].mem_channel;
con->update_channel_flag = true;
}
-- 
2.35.1

[PATCH 2/3] drm/amdgpu: fix value of some UMC parameters for UMC v12

2023-09-20 Thread Tao Zhou

Prepare for bad page retirement.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 4 +++-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index a5510412acd0..bae4a0d18190 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1497,12 +1497,14 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device 
*adev)
adev->umc.channel_idx_tbl = 
&umc_v6_7_channel_idx_tbl_second[0][0];
break;
case IP_VERSION(12, 0, 0):
-   adev->umc.max_ras_err_cnt_per_query = 
UMC_V12_0_TOTAL_CHANNEL_NUM(adev);
+   adev->umc.max_ras_err_cnt_per_query =
+   UMC_V12_0_TOTAL_CHANNEL_NUM(adev) * 
UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL;
adev->umc.channel_inst_num = UMC_V12_0_CHANNEL_INSTANCE_NUM;
adev->umc.umc_inst_num = UMC_V12_0_UMC_INSTANCE_NUM;
adev->umc.node_inst_num /= UMC_V12_0_UMC_INSTANCE_NUM;
adev->umc.channel_offs = UMC_V12_0_PER_CHANNEL_OFFSET;
adev->umc.active_mask = adev->aid_mask;
+   adev->umc.retire_unit = UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL;
adev->umc.channel_idx_tbl = &umc_v12_0_channel_idx_tbl[0][0][0];
if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
adev->umc.ras = &umc_v12_0_ras;
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
index e3619d67ae3b..4885b9fff272 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
@@ -53,6 +53,8 @@
 
 /* one piece of normalized address is mapped to 8 pieces of physical address */
 #define UMC_V12_0_NA_MAP_PA_NUM 8
+/* R13 bit shift should be considered, double the number */
+#define UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL (UMC_V12_0_NA_MAP_PA_NUM * 2)
 /* bank bits in MCA error address */
 #define UMC_V12_0_MCA_B0_BIT 6
 #define UMC_V12_0_MCA_B1_BIT 7
-- 
2.35.1

[PATCH 1/3] drm/amdgpu: print channel index for UMC bad page

2023-09-20 Thread Tao Zhou

Print channel index for UMC v12.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index c6742dd863d4..7714c2ef2cdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -240,15 +240,17 @@ static void umc_v12_0_convert_error_address(struct 
amdgpu_device *adev,
/* include column bit 0 and 1 */
col &= 0x3;
col |= (column << 2);
-   dev_info(adev->dev, "Error Address(PA):0x%llx Row:0x%x Col:0x%x 
Bank:0x%x\n",
-   retired_page, row, col, bank);
+   dev_info(adev->dev,
+   "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x 
Bank:0x%x Channel:0x%x\n",
+   retired_page, row, col, bank, channel_index);
amdgpu_umc_fill_error_record(err_data, err_addr,
retired_page, channel_index, umc_inst);
 
/* shift R13 bit */
retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
-   dev_info(adev->dev, "Error Address(PA):0x%llx Row:0x%x Col:0x%x 
Bank:0x%x\n",
-   retired_page, row_xor, col, bank);
+   dev_info(adev->dev,
+   "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x 
Bank:0x%x Channel:0x%x\n",
+   retired_page, row_xor, col, bank, channel_index);
amdgpu_umc_fill_error_record(err_data, err_addr,
retired_page, channel_index, umc_inst);
}
-- 
2.35.1

[PATCH 2/3] drm/amdgpu: fix value of some UMC parameters for UMC v12

2023-09-19 Thread Tao Zhou

Prepare for bad page retirement.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 4 +++-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index a5510412acd0..bae4a0d18190 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1497,12 +1497,14 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device 
*adev)
adev->umc.channel_idx_tbl = 
&umc_v6_7_channel_idx_tbl_second[0][0];
break;
case IP_VERSION(12, 0, 0):
-   adev->umc.max_ras_err_cnt_per_query = 
UMC_V12_0_TOTAL_CHANNEL_NUM(adev);
+   adev->umc.max_ras_err_cnt_per_query =
+   UMC_V12_0_TOTAL_CHANNEL_NUM(adev) * 
UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL;
adev->umc.channel_inst_num = UMC_V12_0_CHANNEL_INSTANCE_NUM;
adev->umc.umc_inst_num = UMC_V12_0_UMC_INSTANCE_NUM;
adev->umc.node_inst_num /= UMC_V12_0_UMC_INSTANCE_NUM;
adev->umc.channel_offs = UMC_V12_0_PER_CHANNEL_OFFSET;
adev->umc.active_mask = adev->aid_mask;
+   adev->umc.retire_unit = UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL;
adev->umc.channel_idx_tbl = &umc_v12_0_channel_idx_tbl[0][0][0];
if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
adev->umc.ras = &umc_v12_0_ras;
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
index e3619d67ae3b..4885b9fff272 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
@@ -53,6 +53,8 @@
 
 /* one piece of normalized address is mapped to 8 pieces of physical address */
 #define UMC_V12_0_NA_MAP_PA_NUM 8
+/* R13 bit shift should be considered, double the number */
+#define UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL (UMC_V12_0_NA_MAP_PA_NUM * 2)
 /* bank bits in MCA error address */
 #define UMC_V12_0_MCA_B0_BIT 6
 #define UMC_V12_0_MCA_B1_BIT 7
-- 
2.35.1

[PATCH 3/3] drm/amdgpu: change if condition for bad channel bitmap update

2023-09-19 Thread Tao Zhou

The amdgpu_ras_eeprom_control.bad_channel_bitmap is u32 type, but the
channel index could be larger than 32. For the ASICs whose channel
number is more than 32, the amdgpu_dpm_send_hbm_bad_channel_flag
interface is not supported, so we simply bypass channel bitmap update under
this condition.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 8ced4be784e0..1c4433f22f4b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -616,7 +616,8 @@ amdgpu_ras_eeprom_append_table(struct 
amdgpu_ras_eeprom_control *control,
__encode_table_record_to_buf(control, &record[i], pp);
 
/* update bad channel bitmap */
-   if (!(control->bad_channel_bitmap & (1 << 
record[i].mem_channel))) {
+   if ((record[i].mem_channel < 
sizeof(control->bad_channel_bitmap)) &&
+   !(control->bad_channel_bitmap & (1 << 
record[i].mem_channel))) {
control->bad_channel_bitmap |= 1 << 
record[i].mem_channel;
con->update_channel_flag = true;
}
@@ -969,7 +970,8 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control 
*control,
__decode_table_record_from_buf(control, &record[i], pp);
 
/* update bad channel bitmap */
-   if (!(control->bad_channel_bitmap & (1 << 
record[i].mem_channel))) {
+   if ((record[i].mem_channel < 
sizeof(control->bad_channel_bitmap)) &&
+   !(control->bad_channel_bitmap & (1 << 
record[i].mem_channel))) {
control->bad_channel_bitmap |= 1 << 
record[i].mem_channel;
con->update_channel_flag = true;
}
-- 
2.35.1

[PATCH 1/3] drm/amdgpu: print channel index for UMC bad page

2023-09-19 Thread Tao Zhou

Print channel index for UMC v12.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index c6742dd863d4..7714c2ef2cdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -240,15 +240,17 @@ static void umc_v12_0_convert_error_address(struct 
amdgpu_device *adev,
/* include column bit 0 and 1 */
col &= 0x3;
col |= (column << 2);
-   dev_info(adev->dev, "Error Address(PA):0x%llx Row:0x%x Col:0x%x 
Bank:0x%x\n",
-   retired_page, row, col, bank);
+   dev_info(adev->dev,
+   "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x 
Bank:0x%x Channel:0x%x\n",
+   retired_page, row, col, bank, channel_index);
amdgpu_umc_fill_error_record(err_data, err_addr,
retired_page, channel_index, umc_inst);
 
/* shift R13 bit */
retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
-   dev_info(adev->dev, "Error Address(PA):0x%llx Row:0x%x Col:0x%x 
Bank:0x%x\n",
-   retired_page, row_xor, col, bank);
+   dev_info(adev->dev,
+   "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x 
Bank:0x%x Channel:0x%x\n",
+   retired_page, row_xor, col, bank, channel_index);
amdgpu_umc_fill_error_record(err_data, err_addr,
retired_page, channel_index, umc_inst);
}
-- 
2.35.1

[PATCH 3/3] drm/amdgpu: print more address info of UMC bad page

2023-09-06 Thread Tao Zhou

Print out row, column and bank value of UMC error address for UMC v12.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 5f056dd7691e..6fde85367272 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -173,7 +173,7 @@ static void umc_v12_0_convert_error_address(struct 
amdgpu_device *adev,
 {
uint32_t channel_index, i;
uint64_t soc_pa, na, retired_page, column;
-   uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row;
+   uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row, 
row_xor;
uint32_t bank0, bank1, bank2, bank3, bank;
 
bank_hash0 = (err_addr >> UMC_V12_0_MCA_B0_BIT) & 0x1ULL;
@@ -228,17 +228,23 @@ static void umc_v12_0_convert_error_address(struct 
amdgpu_device *adev,
/* clear [C4] in soc physical address */
soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
 
+   row_xor = row ^ (0x1ULL << 13);
/* loop for all possibilities of [C4 C3 C2] */
for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT);
-   dev_info(adev->dev, "Error Address(PA): 0x%llx\n", 
retired_page);
+   /* include column bit 0 and 1 */
+   col &= 0x3;
+   col |= (column << 2);
+   dev_info(adev->dev, "Error Address(PA):0x%llx Row:0x%x Col:0x%x 
Bank:0x%x\n",
+   retired_page, row, col, bank);
amdgpu_umc_fill_error_record(err_data, err_addr,
retired_page, channel_index, umc_inst);
 
/* shift R13 bit */
retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
-   dev_info(adev->dev, "Error Address(PA): 0x%llx\n", 
retired_page);
+   dev_info(adev->dev, "Error Address(PA):0x%llx Row:0x%x Col:0x%x 
Bank:0x%x\n",
+   retired_page, row_xor, col, bank);
amdgpu_umc_fill_error_record(err_data, err_addr,
retired_page, channel_index, umc_inst);
}
-- 
2.35.1

[PATCH 2/3] drm/amdgpu: add channel index table for UMC v12

2023-09-06 Thread Tao Zhou

Get UMC physical channel index according to node id, umc instance and
channel instance.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 14 ++
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h |  5 +
 3 files changed, 20 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index f12c6c7e6204..7af6659ca936 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1498,6 +1498,7 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device 
*adev)
adev->umc.active_mask = adev->aid_mask;
if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
adev->umc.ras = &umc_v12_0_ras;
+   adev->umc.channel_idx_tbl = &umc_v12_0_channel_idx_tbl[0][0][0];
default:
break;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 2a135fd8ec15..5f056dd7691e 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -27,6 +27,20 @@
 #include "umc/umc_12_0_0_offset.h"
 #include "umc/umc_12_0_0_sh_mask.h"
 
+const uint32_t
+   umc_v12_0_channel_idx_tbl[]
+   [UMC_V12_0_UMC_INSTANCE_NUM]
+   [UMC_V12_0_CHANNEL_INSTANCE_NUM] = {
+   {{3,   7,   11,  15,  2,   6,   10,  14},  {1,   5,   9,   13,  
0,   4,   8,   12},
+{19,  23,  27,  31,  18,  22,  26,  30},  {17,  21,  25,  29,  
16,  20,  24,  28}},
+   {{47,  43,  39,  35,  46,  42,  38,  34},  {45,  41,  37,  33,  
44,  40,  36,  32},
+{63,  59,  55,  51,  62,  58,  54,  50},  {61,  57,  53,  49,  
60,  56,  52,  48}},
+   {{79,  75,  71,  67,  78,  74,  70,  66},  {77,  73,  69,  65,  
76,  72,  68,  64},
+{95,  91,  87,  83,  94,  90,  86,  82},  {93,  89,  85,  81,  
92,  88,  84,  80}},
+   {{99,  103, 107, 111, 98,  102, 106, 110}, {97,  101, 105, 109, 
96,  100, 104, 108},
+{115, 119, 123, 127, 114, 118, 122, 126}, {113, 117, 121, 125, 
112, 116, 120, 124}}
+   };
+
 /* mapping of MCA error address to normalized address */
 static const uint32_t umc_v12_0_ma2na_mapping[] = {
0,  5,  6,  8,  9,  14, 12, 13,
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
index c20b4b4cbfda..e8d358ed8e61 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
@@ -115,6 +115,11 @@
 #define GET_CROSS_NODE_ADDR(reg) \
((((reg) >> 32) & 0x3) ? ((reg) | (1ULL << 34)) : (reg))
 
+extern const uint32_t
+   umc_v12_0_channel_idx_tbl[]
+   [UMC_V12_0_UMC_INSTANCE_NUM]
+   [UMC_V12_0_CHANNEL_INSTANCE_NUM];
+
 extern struct amdgpu_umc_ras umc_v12_0_ras;
 
 #endif
-- 
2.35.1

[PATCH 1/3] drm/amdgpu: add address conversion for UMC v12

2023-09-06 Thread Tao Zhou

Convert MCA error address to physical address and find out all pages in
one physical row.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  5 ++
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 97 -
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h  | 64 
 3 files changed, 162 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 43321f57f557..417a6726c71b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -32,6 +32,11 @@
  * is the index of 8KB block
  */
 #define ADDR_OF_8KB_BLOCK(addr)(((addr) & ~0xffULL) << 
5)
+/*
+ * (addr / 256) * 32768, the higher 26 bits in ErrorAddr
+ * is the index of 32KB block
+ */
+#define ADDR_OF_32KB_BLOCK(addr)   (((addr) & ~0xffULL) << 
7)
 /* channel index is the index of 256B block */
 #define ADDR_OF_256B_BLOCK(channel_index)  ((channel_index) << 8)
 /* offset in 256B block */
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 292159814340..2a135fd8ec15 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -27,6 +27,14 @@
 #include "umc/umc_12_0_0_offset.h"
 #include "umc/umc_12_0_0_sh_mask.h"
 
+/* mapping of MCA error address to normalized address */
+static const uint32_t umc_v12_0_ma2na_mapping[] = {
+   0,  5,  6,  8,  9,  14, 12, 13,
+   10, 11, 15, 16, 17, 18, 19, 20,
+   21, 22, 23, 24, 25, 26, 27, 28,
+   24, 7,  29, 30,
+};
+
 static inline uint32_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
uint32_t node_inst,
uint32_t umc_inst,
@@ -133,12 +141,93 @@ static void umc_v12_0_query_ras_error_count(struct 
amdgpu_device *adev,
umc_v12_0_reset_error_count(adev);
 }
 
-static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
-   struct ras_err_data *err_data, 
uint64_t err_addr,
-   uint32_t ch_inst, uint32_t umc_inst,
-   uint32_t node_inst, uint64_t 
mc_umc_status)
+static bool umc_v12_0_bit_wise_xor(uint32_t val)
 {
+   bool result = 0;
+   int i;
 
+   for (i = 0; i < 32; i++)
+   result = result ^ ((val >> i) & 0x1);
+
+   return result;
+}
+
+static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
+   struct ras_err_data *err_data, uint64_t 
err_addr,
+   uint32_t ch_inst, uint32_t umc_inst,
+   uint32_t node_inst, uint64_t 
mc_umc_status)
+{
+   uint32_t channel_index, i;
+   uint64_t soc_pa, na, retired_page, column;
+   uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row;
+   uint32_t bank0, bank1, bank2, bank3, bank;
+
+   bank_hash0 = (err_addr >> UMC_V12_0_MCA_B0_BIT) & 0x1ULL;
+   bank_hash1 = (err_addr >> UMC_V12_0_MCA_B1_BIT) & 0x1ULL;
+   bank_hash2 = (err_addr >> UMC_V12_0_MCA_B2_BIT) & 0x1ULL;
+   bank_hash3 = (err_addr >> UMC_V12_0_MCA_B3_BIT) & 0x1ULL;
+   col = (err_addr >> 1) & 0x1fULL;
+   row = (err_addr >> 10) & 0x3fffULL;
+
+   /* apply bank hash algorithm */
+   bank0 =
+   bank_hash0 ^ (UMC_V12_0_XOR_EN0 &
+   (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR0) ^
+   (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR0))));
+   bank1 =
+   bank_hash1 ^ (UMC_V12_0_XOR_EN1 &
+   (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR1) ^
+   (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR1))));
+   bank2 =
+   bank_hash2 ^ (UMC_V12_0_XOR_EN2 &
+   (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR2) ^
+   (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR2))));
+   bank3 =
+   bank_hash3 ^ (UMC_V12_0_XOR_EN3 &
+   (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR3) ^
+   (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR3))));
+
+   bank = bank0 | (bank1 << 1) | (bank2 << 2) | (bank3 << 3);
+   err_addr &= ~0x3c0ULL;
+   err_addr |= (bank << UMC_V12_0_MCA_B0_BIT);
+
+   na = 0x0;
+   /* convert mca error address to normalized address */
+   for (i = 1; i < ARRAY_SIZE(umc_v12_0_ma2na_mapping); i++)
+   na |= ((err_addr >> i) & 0x1ULL) << umc_v12_0_ma2na_mapping[i];
+
+   channel_index =
+   adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num *
+

[PATCH] drm/amdgpu: use read-modify-write mode for gfx v9_4_3 SQ setting

2023-08-25 Thread Tao Zhou

Instead of using direct update, avoid touching unrelated fields.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index b4fdb269f856..f24a5474db35 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -4042,7 +4042,8 @@ static void gfx_v9_4_3_inst_enable_watchdog_timer(struct 
amdgpu_device *adev,
uint32_t i;
uint32_t data;
 
-   data = REG_SET_FIELD(0, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE,
+   data = RREG32_SOC15(GC, GET_INST(GC, 0), regSQ_TIMEOUT_CONFIG);
+   data = REG_SET_FIELD(data, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE,
 amdgpu_watchdog_timer.timeout_fatal_disable ? 1 : 
0);
 
if (amdgpu_watchdog_timer.timeout_fatal_disable &&
-- 
2.35.1

[PATCH] drm/amdgpu: add RAS fatal error handler for NBIO v7.9

2023-08-07 Thread Tao Zhou

Register RAS fatal error interrupt and add handler.

v2: only register NBIO RAS for dGPU platform.
change nbio_v7_9_set_ras_controller_irq_state and 
nbio_v7_9_set_ras_err_event_athub_irq_state
to dummy functions.

Signed-off-by: Tao Zhou 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |   5 +
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c  | 187 
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h  |   1 +
 3 files changed, 193 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index bb29cb57add5..00658c2816dc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -35,6 +35,7 @@
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "nbio_v4_3.h"
+#include "nbio_v7_9.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
 
@@ -2644,6 +2645,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 * check DF RAS */
adev->nbio.ras = &nbio_v4_3_ras;
break;
+   case IP_VERSION(7, 9, 0):
+   if (!adev->gmc.is_app_apu)
+   adev->nbio.ras = &nbio_v7_9_ras;
+   break;
default:
/* nbio ras is not available */
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
index 1d1ab188ef15..781f98655567 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
@@ -471,3 +471,190 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = {
.init_registers = nbio_v7_9_init_registers,
.get_pcie_replay_count = nbio_v7_9_get_pcie_replay_count,
 };
+
+static void nbio_v7_9_query_ras_error_count(struct amdgpu_device *adev,
+   void *ras_error_status)
+{
+   return;
+}
+
+static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct 
amdgpu_device *adev)
+{
+   uint32_t bif_doorbell_intr_cntl;
+   struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
+   struct ras_err_data err_data = {0, 0, 0, NULL};
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, 
regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+
+   if (REG_GET_FIELD(bif_doorbell_intr_cntl,
+   BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_CNTLR_INTERRUPT_STATUS)) {
+   /* driver has to clear the interrupt status when bif ring is 
disabled */
+   bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
+   BIF_BX0_BIF_DOORBELL_INT_CNTL,
+   RAS_CNTLR_INTERRUPT_CLEAR, 1);
+   WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, 
bif_doorbell_intr_cntl);
+
+   if (!ras->disable_ras_err_cnt_harvest) {
+   /*
+* clear error status after ras_controller_intr
+* according to hw team and count ue number
+* for query
+*/
+   nbio_v7_9_query_ras_error_count(adev, &err_data);
+
+   /* logging on error cnt and printing for awareness */
+   obj->err_data.ue_count += err_data.ue_count;
+   obj->err_data.ce_count += err_data.ce_count;
+
+   if (err_data.ce_count)
+   dev_info(adev->dev, "%ld correctable hardware "
+   "errors detected in %s block, "
+   "no user action is needed.\n",
+   obj->err_data.ce_count,
+   
get_ras_block_str(adev->nbio.ras_if));
+
+   if (err_data.ue_count)
+   dev_info(adev->dev, "%ld uncorrectable hardware 
"
+   "errors detected in %s block\n",
+   obj->err_data.ue_count,
+   
get_ras_block_str(adev->nbio.ras_if));
+   }
+
+   dev_info(adev->dev, "RAS controller interrupt triggered "
+   "by NBIF error\n");
+
+   /* ras_controller_int is dedicated for nbif ras error,
+* not the global interrupt for sync flood
+*/
+   amdgpu_ras_reset_gpu(adev);
+   }
+}
+
+static void nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring(struct 
amdgpu_device *adev)
+{

[PATCH] drm/amdgpu: add RAS fatal error handler for NBIO v7.9

2023-08-06 Thread Tao Zhou

Register RAS fatal error interrupt and add handler.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |   4 +
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c  | 219 
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h  |   1 +
 3 files changed, 224 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 832fa646b38f..bef0f9264b4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -35,6 +35,7 @@
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "nbio_v4_3.h"
+#include "nbio_v7_9.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
 
@@ -2663,6 +2664,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 * check DF RAS */
adev->nbio.ras = &nbio_v4_3_ras;
break;
+   case IP_VERSION(7, 9, 0):
+   adev->nbio.ras = &nbio_v7_9_ras;
+   break;
default:
/* nbio ras is not available */
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
index cd1a02d30420..cc2268b871e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
@@ -451,3 +451,222 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = {
.get_memory_partition_mode = nbio_v7_9_get_memory_partition_mode,
.init_registers = nbio_v7_9_init_registers,
 };
+
+static void nbio_v7_9_query_ras_error_count(struct amdgpu_device *adev,
+   void *ras_error_status)
+{
+   return;
+}
+
+static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct 
amdgpu_device *adev)
+{
+   uint32_t bif_doorbell_intr_cntl;
+   struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
+   struct ras_err_data err_data = {0, 0, 0, NULL};
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, 
regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+
+   if (REG_GET_FIELD(bif_doorbell_intr_cntl,
+   BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_CNTLR_INTERRUPT_STATUS)) {
+   /* driver has to clear the interrupt status when bif ring is 
disabled */
+   bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
+   BIF_BX0_BIF_DOORBELL_INT_CNTL,
+   RAS_CNTLR_INTERRUPT_CLEAR, 1);
+   WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, 
bif_doorbell_intr_cntl);
+
+   if (!ras->disable_ras_err_cnt_harvest) {
+   /*
+* clear error status after ras_controller_intr
+* according to hw team and count ue number
+* for query
+*/
+   nbio_v7_9_query_ras_error_count(adev, &err_data);
+
+   /* logging on error cnt and printing for awareness */
+   obj->err_data.ue_count += err_data.ue_count;
+   obj->err_data.ce_count += err_data.ce_count;
+
+   if (err_data.ce_count)
+   dev_info(adev->dev, "%ld correctable hardware "
+   "errors detected in %s block, "
+   "no user action is needed.\n",
+   obj->err_data.ce_count,
+   
get_ras_block_str(adev->nbio.ras_if));
+
+   if (err_data.ue_count)
+   dev_info(adev->dev, "%ld uncorrectable hardware 
"
+   "errors detected in %s block\n",
+   obj->err_data.ue_count,
+   
get_ras_block_str(adev->nbio.ras_if));
+   }
+
+   dev_info(adev->dev, "RAS controller interrupt triggered "
+   "by NBIF error\n");
+
+   /* ras_controller_int is dedicated for nbif ras error,
+* not the global interrupt for sync flood
+*/
+   amdgpu_ras_reset_gpu(adev);
+   }
+}
+
+static void nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring(struct 
amdgpu_device *adev)
+{
+   uint32_t bif_doorbell_intr_cntl;
+
+   bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, 
regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+
+   if (REG_GET_FIELD(bif_doorbell_intr_cntl,
+   BIF_BX0_BIF_DOORBELL_INT_CNTL, 
RAS_ATHUB_ERR_EVENT_INTERRU

[PATCH] drm/amdgpu: add watchdog timer enablement for gfx_v9_4_3

2023-07-06 Thread Tao Zhou

Configure SQ watchdog timer setting.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 38 +
 1 file changed, 38 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 9e3b835bdbb2..590b0fa62ccc 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -2197,6 +2197,10 @@ static int gfx_v9_4_3_late_init(void *handle)
if (r)
return r;
 
+   if (adev->gfx.ras &&
+   adev->gfx.ras->enable_watchdog_timer)
+   adev->gfx.ras->enable_watchdog_timer(adev);
+
return 0;
 }
 
@@ -4043,6 +4047,34 @@ static void gfx_v9_4_3_inst_reset_ras_err_status(struct 
amdgpu_device *adev,
gfx_v9_4_3_inst_reset_sq_timeout_status(adev, xcc_id);
 }
 
+static void gfx_v9_4_3_inst_enable_watchdog_timer(struct amdgpu_device *adev,
+   void *ras_error_status, int xcc_id)
+{
+   uint32_t i;
+   uint32_t data;
+
+   data = REG_SET_FIELD(0, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE,
+amdgpu_watchdog_timer.timeout_fatal_disable ? 1 : 
0);
+
+   if (amdgpu_watchdog_timer.timeout_fatal_disable &&
+   (amdgpu_watchdog_timer.period < 1 ||
+amdgpu_watchdog_timer.period > 0x23)) {
+   dev_warn(adev->dev, "Watchdog period range is 1 to 0x23\n");
+   amdgpu_watchdog_timer.period = 0x23;
+   }
+   data = REG_SET_FIELD(data, SQ_TIMEOUT_CONFIG, PERIOD_SEL,
+amdgpu_watchdog_timer.period);
+
+   mutex_lock(&adev->grbm_idx_mutex);
+   for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
+   gfx_v9_4_3_xcc_select_se_sh(adev, i, 0x, 0x, 
xcc_id);
+   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regSQ_TIMEOUT_CONFIG, 
data);
+   }
+   gfx_v9_4_3_xcc_select_se_sh(adev, 0x, 0x, 0x,
+   xcc_id);
+   mutex_unlock(&adev->grbm_idx_mutex);
+}
+
 static void gfx_v9_4_3_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status)
 {
@@ -4065,6 +4097,11 @@ static void gfx_v9_4_3_reset_ras_error_status(struct 
amdgpu_device *adev)
amdgpu_gfx_ras_error_func(adev, NULL, 
gfx_v9_4_3_inst_reset_ras_err_status);
 }
 
+static void gfx_v9_4_3_enable_watchdog_timer(struct amdgpu_device *adev)
+{
+   amdgpu_gfx_ras_error_func(adev, NULL, 
gfx_v9_4_3_inst_enable_watchdog_timer);
+}
+
 static const struct amd_ip_funcs gfx_v9_4_3_ip_funcs = {
.name = "gfx_v9_4_3",
.early_init = gfx_v9_4_3_early_init,
@@ -4393,4 +4430,5 @@ struct amdgpu_gfx_ras gfx_v9_4_3_ras = {
.ras_block = {
.hw_ops = &gfx_v9_4_3_ras_ops,
},
+   .enable_watchdog_timer = &gfx_v9_4_3_enable_watchdog_timer,
 };
-- 
2.35.1

[PATCH] drm/amdgpu: skip address adjustment for GFX RAS injection

2023-06-29 Thread Tao Zhou

The address parameter of GFX RAS injection isn't related to the XGMI node
number, so keep it unchanged.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 046659bd4f9e..5371fbd3fe17 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1163,7 +1163,8 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
}
 
/* Calculate XGMI relative offset */
-   if (adev->gmc.xgmi.num_physical_nodes > 1) {
+   if (adev->gmc.xgmi.num_physical_nodes > 1 &&
+   info->head.block != AMDGPU_RAS_BLOCK__GFX) {
block_info.address =
amdgpu_xgmi_get_relative_phy_addr(adev,
  block_info.address);
-- 
2.35.1

[PATCH] drm/amdgpu: check RAS irq existence for VCN/JPEG

2023-06-20 Thread Tao Zhou

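It is allowed for a VCN/JPEG instance to have no RAS poison irq, so skip
instances whose irq is not set up before calling amdgpu_irq_get.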

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c  | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index 3add4b4f0667..2ff2897fd1db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -255,7 +255,8 @@ int amdgpu_jpeg_ras_late_init(struct amdgpu_device *adev, 
struct ras_common_if *
 
if (amdgpu_ras_is_supported(adev, ras_block->block)) {
for (i = 0; i < adev->jpeg.num_jpeg_inst; ++i) {
-   if (adev->jpeg.harvest_config & (1 << i))
+   if (adev->jpeg.harvest_config & (1 << i) ||
+   !adev->jpeg.inst[i].ras_poison_irq.funcs)
continue;
 
r = amdgpu_irq_get(adev, 
&adev->jpeg.inst[i].ras_poison_irq, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 19a3bb5dd29a..d37ebd4402ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1216,7 +1216,8 @@ int amdgpu_vcn_ras_late_init(struct amdgpu_device *adev, 
struct ras_common_if *r
 
if (amdgpu_ras_is_supported(adev, ras_block->block)) {
for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
-   if (adev->vcn.harvest_config & (1 << i))
+   if (adev->vcn.harvest_config & (1 << i) ||
+   !adev->vcn.inst[i].ras_poison_irq.funcs)
continue;
 
r = amdgpu_irq_get(adev, 
&adev->vcn.inst[i].ras_poison_irq, 0);
-- 
2.35.1

[PATCH] drm/amdgpu: remove unused definition

2023-05-18 Thread Tao Zhou

mmhub_v1_8_mmea_cgtt_clk_cntl_reg is defined but not used.

Reported-by: kernel test robot 
Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
index 3648994724c2..00e7e5db7c28 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
@@ -757,14 +757,6 @@ static void mmhub_v1_8_query_ras_error_status(struct 
amdgpu_device *adev)
mmhub_v1_8_inst_query_ras_err_status(adev, i);
 }
 
-static const uint32_t mmhub_v1_8_mmea_cgtt_clk_cntl_reg[] = {
-   regMMEA0_CGTT_CLK_CTRL,
-   regMMEA1_CGTT_CLK_CTRL,
-   regMMEA2_CGTT_CLK_CTRL,
-   regMMEA3_CGTT_CLK_CTRL,
-   regMMEA4_CGTT_CLK_CTRL,
-};
-
 static void mmhub_v1_8_inst_reset_ras_err_status(struct amdgpu_device *adev,
 uint32_t mmhub_inst)
 {
-- 
2.35.1

[PATCH 2/2] drm/amdgpu: add bad_page_threshold check in ras_eeprom_check_err

2023-02-21 Thread Tao Zhou

bad_page_threshold controls page retirement behavior, so it should also
be checked.

v2: simplify the condition of bad page handling path.

Signed-off-by: Tao Zhou 
---
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 19 ++-
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 9d370465b08d..2e08fce87521 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -417,7 +417,8 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct 
amdgpu_device *adev)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-   if (!__is_ras_eeprom_supported(adev))
+   if (!__is_ras_eeprom_supported(adev) ||
+   !amdgpu_bad_page_threshold)
return false;
 
/* skip check eeprom table for VEGA20 Gaming */
@@ -428,10 +429,18 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct 
amdgpu_device *adev)
return false;
 
if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
-   dev_warn(adev->dev, "This GPU is in BAD status.");
-   dev_warn(adev->dev, "Please retire it or set a larger "
-"threshold value when reloading driver.\n");
-   return true;
+   if (amdgpu_bad_page_threshold == -1) {
+   dev_warn(adev->dev, "RAS records:%d exceed 
threshold:%d",
+   con->eeprom_control.ras_num_recs, 
con->bad_page_cnt_threshold);
+   dev_warn(adev->dev,
+   "But GPU can be operated due to 
bad_page_threshold = -1.\n");
+   return false;
+   } else {
+   dev_warn(adev->dev, "This GPU is in BAD status.");
+   dev_warn(adev->dev, "Please retire it or set a larger "
+"threshold value when reloading driver.\n");
+   return true;
+   }
}
 
return false;
-- 
2.35.1

[PATCH 1/2] drm/amdgpu: change default behavior of bad_page_threshold parameter

2023-02-21 Thread Tao Zhou

Ignore the RAS UMC bad page threshold by default; GPU initialization won't
be stopped in this mode.

v2: refine the description of bad_page_threshold.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 7 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 4 ++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 6c2fe50b528e..8a375394db0c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -921,7 +921,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 
0444);
  * result in the GPU entering bad status when the number of total
  * faulty pages by ECC exceeds the threshold value.
  */
-MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default 
value), 0 = disable bad page retirement, -2 = ignore bad page threshold)");
+MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = ignore threshold 
(default value), 0 = disable bad page retirement, -2 = driver sets threshold)");
 module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
 
 MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup 
(8 if set to greater than 8 or less than 0, only affect gfx 8+)");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5c02c6c9f773..63dfcc98152d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2196,11 +2196,12 @@ static void amdgpu_ras_validate_threshold(struct 
amdgpu_device *adev,
/*
 * Justification of value bad_page_cnt_threshold in ras structure
 *
-* Generally, -1 <= amdgpu_bad_page_threshold <= max record length
-* in eeprom, and introduce two scenarios accordingly.
+* Generally, 0 <= amdgpu_bad_page_threshold <= max record length
+* in eeprom or amdgpu_bad_page_threshold == -2, introduce two
+* scenarios accordingly.
 *
 * Bad page retirement enablement:
-*- If amdgpu_bad_page_threshold = -1,
+*- If amdgpu_bad_page_threshold = -2,
 *  bad_page_cnt_threshold = typical value by formula.
 *
 *- When the value from user is 0 < amdgpu_bad_page_threshold <
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 2d9f3f4cd79e..9d370465b08d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -1191,8 +1191,8 @@ int amdgpu_ras_eeprom_init(struct 
amdgpu_ras_eeprom_control *control,
} else {
dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
control->ras_num_recs, 
ras->bad_page_cnt_threshold);
-   if (amdgpu_bad_page_threshold == -2) {
-   dev_warn(adev->dev, "GPU will be initialized 
due to bad_page_threshold = -2.");
+   if (amdgpu_bad_page_threshold == -1) {
+   dev_warn(adev->dev, "GPU will be initialized 
due to bad_page_threshold = -1.");
res = 0;
} else {
*exceed_err_limit = true;
-- 
2.35.1

1 2 3 >

1 - 100 of 284 matches

Mail list logo