RE: [PATCH v3] drm/amdgpu: Move mca debug mode decision to ras

2023-11-16 Thread Zhang, Hawking
[AMD Official Use Only - General]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-----Original Message-----
From: Lazar, Lijo 
Sent: Friday, November 10, 2023 15:56
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Wang, Yang(Kevin) 
Subject: [PATCH v3] drm/amdgpu: Move mca debug mode decision to ras

Refactor code such that ras block decides the default mca debug mode, and not 
swsmu block.

By default mca debug mode is set to false.

Signed-off-by: Lijo Lazar 
---
v3: Default mca debug mode is set to false

v2: Set mca debug mode early before ras block late init as ras query is 
initiated during late init of ras blocks (KevinYang)

 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c            |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c            | 14 +++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h            |  2 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c   | 12 ------------
 4 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index cf33eb219e25..54f2f346579e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -377,7 +377,7 @@ static int amdgpu_mca_smu_debug_mode_set(void *data, u64 
val)
struct amdgpu_device *adev = (struct amdgpu_device *)data;
int ret;

-   ret = amdgpu_mca_smu_set_debug_mode(adev, val ? true : false);
+   ret = amdgpu_ras_set_mca_debug_mode(adev, val ? true : false);
if (ret)
return ret;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 84e5987b14e0..6747fbe4feab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3132,6 +3132,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
return 0;

+   amdgpu_ras_set_mca_debug_mode(adev, false);
+
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
if (!node->ras_obj) {
dev_warn(adev->dev, "Warning: abnormal ras list 
node.\n");
@@ -3405,12 +3407,18 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
return 0;
 }

-void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
+int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   int ret;

-   if (con)
-   con->is_mca_debug_mode = enable;
+   if (con) {
+   ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
+   if (!ret)
+   con->is_mca_debug_mode = enable;
+   }
+
+   return ret;
 }

 bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 19161916ac46..6a941eb8fb8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -773,7 +773,7 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct 
amdgpu_device *adev);

 int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras 
*ras_con);

-void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
+int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
 bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
 bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
 unsigned int *mode);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 6cbfb25a05de..f09f56efbdc3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -1516,7 +1516,6 @@ static int smu_v13_0_6_mca_set_debug_mode(struct 
smu_context *smu, bool enable)
if (smu->smc_fw_version < 0x554800)
return 0;

-   amdgpu_ras_set_mca_debug_mode(smu->adev, enable);
return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead,
   enable ? 0 : 
ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK,
   NULL);
@@ -2338,16 +2337,6 @@ static int smu_v13_0_6_smu_send_hbm_bad_page_num(struct 
smu_context *smu,
return ret;
 }

-static int smu_v13_0_6_post_init(struct smu_context *smu)
-{
-   struct amdgpu_device *adev = smu->adev;
-
-   if (!amdgpu_sriov_vf(adev) && adev->ras_enabled)
-   return smu_v13_0_6_mca_set_debug_mode(smu, false);
-
-   return 0;
-}
-
 static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
 {
struct smu_context *smu = adev->powerplay.pp_handle;
@@ -2904,7 +2893,6 @@ static const struct pp

Re: [PATCH v3] drm/amdgpu: Move mca debug mode decision to ras

2023-11-16 Thread Lazar, Lijo



On 11/10/2023 1:25 PM, Lijo Lazar wrote:

Refactor code such that ras block decides the default mca debug mode,
and not swsmu block.

By default mca debug mode is set to false.

Signed-off-by: Lijo Lazar 
---
v3: Default mca debug mode is set to false

v2: Set mca debug mode early before ras block late init as ras query is
initiated during late init of ras blocks (KevinYang)

  drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c            |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c            | 14 +++++++++++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h            |  2 +-
  .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c   | 12 ------------
  4 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index cf33eb219e25..54f2f346579e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -377,7 +377,7 @@ static int amdgpu_mca_smu_debug_mode_set(void *data, u64 
val)
struct amdgpu_device *adev = (struct amdgpu_device *)data;
int ret;
  
-	ret = amdgpu_mca_smu_set_debug_mode(adev, val ? true : false);
+   ret = amdgpu_ras_set_mca_debug_mode(adev, val ? true : false);
if (ret)
return ret;
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 84e5987b14e0..6747fbe4feab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3132,6 +3132,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
return 0;
  
+	amdgpu_ras_set_mca_debug_mode(adev, false);
+
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
if (!node->ras_obj) {
dev_warn(adev->dev, "Warning: abnormal ras list 
node.\n");
@@ -3405,12 +3407,18 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
return 0;
  }
  
-void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
+int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
  {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   int ret;
  
-	if (con)
-   con->is_mca_debug_mode = enable;
+   if (con) {
+   ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
+   if (!ret)
+   con->is_mca_debug_mode = enable;
+   }
+
+   return ret;
  }
  
  bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 19161916ac46..6a941eb8fb8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -773,7 +773,7 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct 
amdgpu_device *adev);
  
  int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con);
  
-void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
+int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
  bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
  bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
 unsigned int *mode);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 6cbfb25a05de..f09f56efbdc3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -1516,7 +1516,6 @@ static int smu_v13_0_6_mca_set_debug_mode(struct 
smu_context *smu, bool enable)
if (smu->smc_fw_version < 0x554800)
return 0;
  
-	amdgpu_ras_set_mca_debug_mode(smu->adev, enable);
return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead,
   enable ? 0 : 
ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK,
   NULL);
@@ -2338,16 +2337,6 @@ static int smu_v13_0_6_smu_send_hbm_bad_page_num(struct 
smu_context *smu,
return ret;
  }
  
-static int smu_v13_0_6_post_init(struct smu_context *smu)
-{
-   struct amdgpu_device *adev = smu->adev;
-
-   if (!amdgpu_sriov_vf(adev) && adev->ras_enabled)
-   return smu_v13_0_6_mca_set_debug_mode(smu, false);
-
-   return 0;
-}
-
  static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
  {
struct smu_context *smu = adev->powerplay.pp_handle;
@@ -2904,7 +2893,6 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = 
{
.i2c_init = smu_v13_0_6_i2c_control_init,
.i2c_fini = smu_v13_0_6_i2c_control_fini,
.send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num,
-   .post_init = smu_v13_0_6_post_init,
  };
  
  void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu)


[PATCH v3] drm/amdgpu: Move mca debug mode decision to ras

2023-11-09 Thread Lijo Lazar
Refactor code such that ras block decides the default mca debug mode,
and not swsmu block.

By default mca debug mode is set to false.

Signed-off-by: Lijo Lazar 
---
v3: Default mca debug mode is set to false

v2: Set mca debug mode early before ras block late init as ras query is
initiated during late init of ras blocks (KevinYang)

 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c            |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c            | 14 +++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h            |  2 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c   | 12 ------------
 4 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index cf33eb219e25..54f2f346579e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -377,7 +377,7 @@ static int amdgpu_mca_smu_debug_mode_set(void *data, u64 
val)
struct amdgpu_device *adev = (struct amdgpu_device *)data;
int ret;
 
-   ret = amdgpu_mca_smu_set_debug_mode(adev, val ? true : false);
+   ret = amdgpu_ras_set_mca_debug_mode(adev, val ? true : false);
if (ret)
return ret;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 84e5987b14e0..6747fbe4feab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3132,6 +3132,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
return 0;
 
+   amdgpu_ras_set_mca_debug_mode(adev, false);
+
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
if (!node->ras_obj) {
dev_warn(adev->dev, "Warning: abnormal ras list 
node.\n");
@@ -3405,12 +3407,18 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
return 0;
 }
 
-void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
+int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   int ret;
 
-   if (con)
-   con->is_mca_debug_mode = enable;
+   if (con) {
+   ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
+   if (!ret)
+   con->is_mca_debug_mode = enable;
+   }
+
+   return ret;
 }
 
 bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 19161916ac46..6a941eb8fb8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -773,7 +773,7 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct 
amdgpu_device *adev);
 
 int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras 
*ras_con);
 
-void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
+int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
 bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
 bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
 unsigned int *mode);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 6cbfb25a05de..f09f56efbdc3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -1516,7 +1516,6 @@ static int smu_v13_0_6_mca_set_debug_mode(struct 
smu_context *smu, bool enable)
if (smu->smc_fw_version < 0x554800)
return 0;
 
-   amdgpu_ras_set_mca_debug_mode(smu->adev, enable);
return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead,
   enable ? 0 : 
ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK,
   NULL);
@@ -2338,16 +2337,6 @@ static int smu_v13_0_6_smu_send_hbm_bad_page_num(struct 
smu_context *smu,
return ret;
 }
 
-static int smu_v13_0_6_post_init(struct smu_context *smu)
-{
-   struct amdgpu_device *adev = smu->adev;
-
-   if (!amdgpu_sriov_vf(adev) && adev->ras_enabled)
-   return smu_v13_0_6_mca_set_debug_mode(smu, false);
-
-   return 0;
-}
-
 static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
 {
struct smu_context *smu = adev->powerplay.pp_handle;
@@ -2904,7 +2893,6 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = 
{
.i2c_init = smu_v13_0_6_i2c_control_init,
.i2c_fini = smu_v13_0_6_i2c_control_fini,
.send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num,
-   .post_init = smu_v13_0_6_post_init,
 };
 
 void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu)
-- 
2.25.1