RE: [PATCH 2/3] drm/amdgpu: set poison supported flag for RAS (v2)

2021-09-22 Thread Zhang, Hawking
[AMD Official Use Only]

It might be better to name the function is_poison_mode_supported. Other than that, the
series is

Reviewed-by: Hawking Zhang 

-Original Message-
From: Zhou1, Tao  
Sent: Wednesday, September 22, 2021 18:33
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; 
Clements, John ; Yang, Stanley 
Cc: Zhou1, Tao 
Subject: [PATCH 2/3] drm/amdgpu: set poison supported flag for RAS (v2)

Add RAS poison supported flag and tell PSP RAS TA about the info.

v2: rename poison_mode to poison_supported; we can also disable poison mode
even if we support it.
print poison_supported value if ras feature enablement fails.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c |  4 ++--  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 32 +++--  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 
 3 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 7d09b28889af..c5cf84829ea8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1442,9 +1442,9 @@ static int psp_ras_initialize(struct psp_context *psp)
ras_cmd = (struct ta_ras_shared_memory 
*)psp->ras_context.context.mem_context.shared_buf;
memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
 
-   if (psp->adev->gmc.xgmi.connected_to_cpu)
+   if (amdgpu_ras_is_poison_supported(adev))
ras_cmd->ras_in_message.init_flags.poison_mode_en = 1;
-   else
+   if (!adev->gmc.xgmi.connected_to_cpu)
ras_cmd->ras_in_message.init_flags.dgpu_mode = 1;
 
ret = psp_ras_load(psp);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 912ea1f9fd04..5b362e944541 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -710,10 +710,10 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
if (!amdgpu_ras_intr_triggered()) {
ret = psp_ras_enable_features(&adev->psp, info, enable);
if (ret) {
-   dev_err(adev->dev, "ras %s %s failed %d\n",
+   dev_err(adev->dev, "ras %s %s failed poison:%d 
ret:%d\n",
enable ? "enable":"disable",
get_ras_block_str(head),
-   ret);
+   amdgpu_ras_is_poison_supported(adev), ret);
goto out;
}
}
@@ -2251,6 +2251,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)  {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int r;
+   bool df_poison, umc_poison;
 
if (con)
return 0;
@@ -2321,6 +2322,23 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
 
+   /* Init poison supported flag, the default value is false */
+   if (adev->df.funcs &&
+   adev->df.funcs->query_ras_poison_mode &&
+   adev->umc.ras_funcs &&
+   adev->umc.ras_funcs->query_ras_poison_mode) {
+   df_poison =
+   adev->df.funcs->query_ras_poison_mode(adev);
+   umc_poison =
+   adev->umc.ras_funcs->query_ras_poison_mode(adev);
+   /* Only poison is set in both DF and UMC, we can support it */
+   if (df_poison && umc_poison)
+   con->poison_supported = true;
+   else if (df_poison != umc_poison)
+   dev_warn(adev->dev, "Poison setting is inconsistent in 
DF/UMC(%d:%d)!\n",
+   df_poison, umc_poison);
+   }
+
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
goto release_con;
@@ -2364,6 +2382,16 @@ static int amdgpu_persistent_edc_harvesting(struct 
amdgpu_device *adev,
return 0;
 }
 
+bool amdgpu_ras_is_poison_supported(struct amdgpu_device *adev) {
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   if (!con)
+   return false;
+
+   return con->poison_supported;
+}
+
 /* helper function to handle common stuff in ip late init phase */  int 
amdgpu_ras_late_init(struct amdgpu_device *adev,
 struct ras_common_if *ras_block,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index ec42e9873aaa..d6377e1ad20a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -352,6 +352,9 @@ struct amdgpu_ras {
/* disable ras error count harvest in recovery */
bool disable_ras_err_cnt_harvest;
 
+   /* is poison mode supported */
+   b

[PATCH 2/3] drm/amdgpu: set poison supported flag for RAS (v2)

2021-09-22 Thread Tao Zhou
Add RAS poison supported flag and tell PSP RAS TA about the info.

v2: rename poison_mode to poison_supported; we can also disable poison
mode even if we support it.
print poison_supported value if ras feature enablement fails.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 32 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 
 3 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 7d09b28889af..c5cf84829ea8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1442,9 +1442,9 @@ static int psp_ras_initialize(struct psp_context *psp)
ras_cmd = (struct ta_ras_shared_memory 
*)psp->ras_context.context.mem_context.shared_buf;
memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
 
-   if (psp->adev->gmc.xgmi.connected_to_cpu)
+   if (amdgpu_ras_is_poison_supported(adev))
ras_cmd->ras_in_message.init_flags.poison_mode_en = 1;
-   else
+   if (!adev->gmc.xgmi.connected_to_cpu)
ras_cmd->ras_in_message.init_flags.dgpu_mode = 1;
 
ret = psp_ras_load(psp);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 912ea1f9fd04..5b362e944541 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -710,10 +710,10 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
if (!amdgpu_ras_intr_triggered()) {
ret = psp_ras_enable_features(&adev->psp, info, enable);
if (ret) {
-   dev_err(adev->dev, "ras %s %s failed %d\n",
+   dev_err(adev->dev, "ras %s %s failed poison:%d 
ret:%d\n",
enable ? "enable":"disable",
get_ras_block_str(head),
-   ret);
+   amdgpu_ras_is_poison_supported(adev), ret);
goto out;
}
}
@@ -2251,6 +2251,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int r;
+   bool df_poison, umc_poison;
 
if (con)
return 0;
@@ -2321,6 +2322,23 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
 
+   /* Init poison supported flag, the default value is false */
+   if (adev->df.funcs &&
+   adev->df.funcs->query_ras_poison_mode &&
+   adev->umc.ras_funcs &&
+   adev->umc.ras_funcs->query_ras_poison_mode) {
+   df_poison =
+   adev->df.funcs->query_ras_poison_mode(adev);
+   umc_poison =
+   adev->umc.ras_funcs->query_ras_poison_mode(adev);
+   /* Only poison is set in both DF and UMC, we can support it */
+   if (df_poison && umc_poison)
+   con->poison_supported = true;
+   else if (df_poison != umc_poison)
+   dev_warn(adev->dev, "Poison setting is inconsistent in 
DF/UMC(%d:%d)!\n",
+   df_poison, umc_poison);
+   }
+
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
goto release_con;
@@ -2364,6 +2382,16 @@ static int amdgpu_persistent_edc_harvesting(struct 
amdgpu_device *adev,
return 0;
 }
 
+bool amdgpu_ras_is_poison_supported(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   if (!con)
+   return false;
+
+   return con->poison_supported;
+}
+
 /* helper function to handle common stuff in ip late init phase */
 int amdgpu_ras_late_init(struct amdgpu_device *adev,
 struct ras_common_if *ras_block,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index ec42e9873aaa..d6377e1ad20a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -352,6 +352,9 @@ struct amdgpu_ras {
/* disable ras error count harvest in recovery */
bool disable_ras_err_cnt_harvest;
 
+   /* is poison mode supported */
+   bool poison_supported;
+
/* RAS count errors delayed work */
struct delayed_work ras_counte_delay_work;
atomic_t ras_ue_count;
@@ -649,4 +652,6 @@ int amdgpu_persistent_edc_harvesting_supported(struct 
amdgpu_device *adev);
 
 const char *get_ras_block_str(struct ras_common_if *ras_block);
 
+bool amdgpu_ras_is_poison_supported(struct amdgpu_device *adev);
+
 #endif
-- 
2.17.1