RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

2024-01-17 Thread Chai, Thomas
[AMD Official Use Only - General]

OK


-
Best Regards,
Thomas

-Original Message-
From: Wang, Yang(Kevin) 
Sent: Thursday, January 18, 2024 11:00 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Yang, Stanley 
Subject: RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

[AMD Official Use Only - General]

-Original Message-
From: Chai, Thomas 
Sent: Tuesday, January 16, 2024 4:21 PM
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

Add log info for umc_v12_0 and smu_v13_0_6.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c |  6 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c| 13 +
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6423dca5b777..fa2168f1d3bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -91,6 +91,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device 
*adev)

 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t 
mc_umc_status)  {
+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Poison),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
+   );
+
return (amdgpu_ras_is_poison_mode_supported(adev) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) 
== 1) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 11923964ce9a..51bb98db5d7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1297,8 +1297,10 @@ void kfd_signal_poison_consumed_event(struct kfd_node 
*dev, u32 pasid)
uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
int user_gpu_id;

-   if (!p)
+   if (!p) {
+   dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", 
pasid);
return; /* Presumably process exited. */
+   }

user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) { @@ -1334,6 +1336,8 @@ void 
kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
}
}

+   dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n",
+   p->lead_thread->comm, pasid);
rcu_read_unlock();

/* user application will handle SIGBUS signal */ diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 952a983da49a..cee8ee5afcb6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2406,10 +2406,23 @@ static int smu_v13_0_6_get_valid_mca_count(struct 
smu_context *smu, enum amdgpu_

ret = smu_cmn_send_smc_msg(smu, msg, count);
if (ret) {
+   dev_err(smu->adev->dev, "%s(%d) failed to query %s MCA count, 
ret:%d\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   ret);
*count = 0;
return ret;
}

+   dev_info(smu->adev->dev, "MSG %s(%d) query %s MCA count result:%u\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   *count);


[Kevin]:
Please make the following function public, then use this helper function to get
the msg name string:
- smu_get_message_name()

Best Regards,
Kevin
+
return 0;
 }

--
2.34.1




RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

2024-01-17 Thread Wang, Yang(Kevin)
[AMD Official Use Only - General]

-Original Message-
From: Chai, Thomas 
Sent: Tuesday, January 16, 2024 4:21 PM
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

Add log info for umc_v12_0 and smu_v13_0_6.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c |  6 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c| 13 +
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6423dca5b777..fa2168f1d3bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -91,6 +91,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device 
*adev)

 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t 
mc_umc_status)  {
+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Poison),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
+   );
+
return (amdgpu_ras_is_poison_mode_supported(adev) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) 
== 1) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 11923964ce9a..51bb98db5d7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1297,8 +1297,10 @@ void kfd_signal_poison_consumed_event(struct kfd_node 
*dev, u32 pasid)
uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
int user_gpu_id;

-   if (!p)
+   if (!p) {
+   dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", 
pasid);
return; /* Presumably process exited. */
+   }

user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) { @@ -1334,6 +1336,8 @@ void 
kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
}
}

+   dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n",
+   p->lead_thread->comm, pasid);
rcu_read_unlock();

/* user application will handle SIGBUS signal */ diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 952a983da49a..cee8ee5afcb6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2406,10 +2406,23 @@ static int smu_v13_0_6_get_valid_mca_count(struct 
smu_context *smu, enum amdgpu_

ret = smu_cmn_send_smc_msg(smu, msg, count);
if (ret) {
+   dev_err(smu->adev->dev, "%s(%d) failed to query %s MCA count, 
ret:%d\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   ret);
*count = 0;
return ret;
}

+   dev_info(smu->adev->dev, "MSG %s(%d) query %s MCA count result:%u\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   *count);


[Kevin]:
Please make the following function public, then use this helper function to get
the msg name string:
- smu_get_message_name()

Best Regards,
Kevin
+
return 0;
 }

--
2.34.1



RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

2024-01-17 Thread Chai, Thomas
[AMD Official Use Only - General]

OK


-
Best Regards,
Thomas

-Original Message-
From: Zhang, Hawking 
Sent: Wednesday, January 17, 2024 7:40 PM
To: Zhang, Hawking ; Chai, Thomas ; 
amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Yang, Stanley ; Wang, 
Yang(Kevin) ; Li, Candice 
Subject: RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

[AMD Official Use Only - General]

Please ignore my first comment. The UMC MCA status log doesn't necessarily
need to be associated with a socket id at this stage.

Regards,
Hawking

-Original Message-
From: amd-gfx  On Behalf Of Zhang, 
Hawking
Sent: Wednesday, January 17, 2024 19:12
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Yang, Stanley ; Wang, 
Yang(Kevin) ; Li, Candice 
Subject: RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

[AMD Official Use Only - General]

[AMD Official Use Only - General]

+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,

Please also print out the socket id for the UMC MCA status.

+   dev_info(smu->adev->dev, "MSG %s(%d) query %s MCA count result:%u\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   *count);
+

This seems redundant, or was added for debugging purposes. We can drop this print
since there is already a log to cover failures.

Regards,
Hawking


-Original Message-
From: Chai, Thomas 
Sent: Tuesday, January 16, 2024 16:21
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

Add log info for umc_v12_0 and smu_v13_0_6.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c |  6 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c| 13 +
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6423dca5b777..fa2168f1d3bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -91,6 +91,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device 
*adev)

 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t 
mc_umc_status)  {
+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Poison),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
+   );
+
return (amdgpu_ras_is_poison_mode_supported(adev) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) 
== 1) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 11923964ce9a..51bb98db5d7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1297,8 +1297,10 @@ void kfd_signal_poison_consumed_event(struct kfd_node 
*dev, u32 pasid)
uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
int user_gpu_id;

-   if (!p)
+   if (!p) {
+   dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", 
pasid);
return; /* Presumably process exited. */
+   }

user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) { @@ -1334,6 +1336,8 @@ void 
kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
}
}

+   dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n",
+   p->lead_thread->comm, pasid);
rcu_read_unlock();

/* user application will handle SIGBUS signal */ diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 952a983da49a..cee8ee5afcb6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6

RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

2024-01-17 Thread Zhang, Hawking
[AMD Official Use Only - General]

Please ignore my first comment. The UMC MCA status log doesn't necessarily
need to be associated with a socket id at this stage.

Regards,
Hawking

-Original Message-
From: amd-gfx  On Behalf Of Zhang, 
Hawking
Sent: Wednesday, January 17, 2024 19:12
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Yang, Stanley ; Wang, 
Yang(Kevin) ; Li, Candice 
Subject: RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

[AMD Official Use Only - General]

[AMD Official Use Only - General]

+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,

Please also print out the socket id for the UMC MCA status.

+   dev_info(smu->adev->dev, "MSG %s(%d) query %s MCA count result:%u\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   *count);
+

This seems redundant, or was added for debugging purposes. We can drop this print
since there is already a log to cover failures.

Regards,
Hawking


-Original Message-
From: Chai, Thomas 
Sent: Tuesday, January 16, 2024 16:21
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

Add log info for umc_v12_0 and smu_v13_0_6.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c |  6 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c| 13 +
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6423dca5b777..fa2168f1d3bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -91,6 +91,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device 
*adev)

 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t 
mc_umc_status)  {
+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Poison),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
+   );
+
return (amdgpu_ras_is_poison_mode_supported(adev) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) 
== 1) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 11923964ce9a..51bb98db5d7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1297,8 +1297,10 @@ void kfd_signal_poison_consumed_event(struct kfd_node 
*dev, u32 pasid)
uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
int user_gpu_id;

-   if (!p)
+   if (!p) {
+   dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", 
pasid);
return; /* Presumably process exited. */
+   }

user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) { @@ -1334,6 +1336,8 @@ void 
kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
}
}

+   dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n",
+   p->lead_thread->comm, pasid);
rcu_read_unlock();

/* user application will handle SIGBUS signal */ diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 952a983da49a..cee8ee5afcb6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2406,10 +2406,23 @@ static int smu_v13_0_6_get_valid_mca_count(struct 
smu_context *smu, enum amdgpu_

ret = smu_cmn_send_smc_msg(smu, msg, count);
if (ret) {
+   dev_err(smu->adev->dev, "%s(%d) failed to query %s MCA count, 
ret:%d\n",
+   (msg == 

RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

2024-01-17 Thread Zhang, Hawking
[AMD Official Use Only - General]

+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,

Please also print out the socket id for the UMC MCA status.

+   dev_info(smu->adev->dev, "MSG %s(%d) query %s MCA count result:%u\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   *count);
+

This seems redundant, or was added for debugging purposes. We can drop this print
since there is already a log to cover failures.

Regards,
Hawking


-Original Message-
From: Chai, Thomas 
Sent: Tuesday, January 16, 2024 16:21
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

Add log info for umc_v12_0 and smu_v13_0_6.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c |  6 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c| 13 +
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6423dca5b777..fa2168f1d3bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -91,6 +91,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device 
*adev)

 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t 
mc_umc_status)  {
+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Poison),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
+   );
+
return (amdgpu_ras_is_poison_mode_supported(adev) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) 
== 1) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 11923964ce9a..51bb98db5d7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1297,8 +1297,10 @@ void kfd_signal_poison_consumed_event(struct kfd_node 
*dev, u32 pasid)
uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
int user_gpu_id;

-   if (!p)
+   if (!p) {
+   dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", 
pasid);
return; /* Presumably process exited. */
+   }

user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) { @@ -1334,6 +1336,8 @@ void 
kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
}
}

+   dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n",
+   p->lead_thread->comm, pasid);
rcu_read_unlock();

/* user application will handle SIGBUS signal */ diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 952a983da49a..cee8ee5afcb6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2406,10 +2406,23 @@ static int smu_v13_0_6_get_valid_mca_count(struct 
smu_context *smu, enum amdgpu_

ret = smu_cmn_send_smc_msg(smu, msg, count);
if (ret) {
+   dev_err(smu->adev->dev, "%s(%d) failed to query %s MCA count, 
ret:%d\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   ret);
*count = 0;
return ret;
}

+   dev_info(smu->adev->dev, "MSG %s(%d) query %s MCA count result:%u\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   *count);
+