RE: [PATCH] drm/amdgpu: add safeguards for accessing mmhub CG registers

2022-01-27 Thread Yu, Lang
[Public]

>  Should we set *flags = 0 before we return?

That will clear other bit masks. Actually, the caller initialize flags to 0, we 
can just return.
Or  just *flags &= ~( AMD_CG_SUPPORT_XXX) before we return.

Regards,
Lang

From: Deucher, Alexander 
Sent: Thursday, January 27, 2022 4:27 AM
To: Yu, Lang ; amd-gfx@lists.freedesktop.org
Cc: Lazar, Lijo ; Huang, Ray 
Subject: Re: [PATCH] drm/amdgpu: add safeguards for accessing mmhub CG registers


[Public]

Should we set *flags = 0 before we return?

Alex


From: Yu, Lang mailto:lang...@amd.com>>
Sent: Wednesday, January 26, 2022 2:53 AM
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> 
mailto:amd-gfx@lists.freedesktop.org>>
Cc: Deucher, Alexander 
mailto:alexander.deuc...@amd.com>>; Lazar, Lijo 
mailto:lijo.la...@amd.com>>; Huang, Ray 
mailto:ray.hu...@amd.com>>; Yu, Lang 
mailto:lang...@amd.com>>
Subject: [PATCH] drm/amdgpu: add safeguards for accessing mmhub CG registers

We observed a gpu hang when querying mmhub CG status(i.e.,
cat amdgpu_pm_info) on cyan skillfish. Acctually, cyan
skillfish doesn't support any CG features.

Only allow asics which support CG features accessing related
registers. Will add similar safeguards for other IPs in the
furture.

Signed-off-by: Lang Yu mailto:lang...@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c | 3 +++
 5 files changed, 15 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
index 4c9f0c0f3116..1869e2019461 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
@@ -550,6 +550,9 @@ static void mmhub_v1_0_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
 {
 int data, data1;

+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
 if (amdgpu_sriov_vf(adev))
 *flags = 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
index 3b901f941627..f7b9843b36e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
@@ -546,6 +546,9 @@ static void mmhub_v1_7_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
 {
 int data, data1;

+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
 if (amdgpu_sriov_vf(adev))
 *flags = 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
index 3718ff610ab2..3f5f326379b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
@@ -686,6 +686,9 @@ static void mmhub_v2_0_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
 {
 int data, data1;

+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
 if (amdgpu_sriov_vf(adev))
 *flags = 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c
index 9e16da28505a..b23dd9ddfb5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c
@@ -580,6 +580,9 @@ static void mmhub_v2_3_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
 {
 int data, data1, data2, data3;

+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
 if (amdgpu_sriov_vf(adev))
 *flags = 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
index 619106f7d23d..a2d5c8424e2b 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
@@ -651,6 +651,9 @@ static void mmhub_v9_4_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
 {
 int data, data1;

+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
 if (amdgpu_sriov_vf(adev))
 *flags = 0;

--
2.25.1


Re: [PATCH] drm/amdgpu: add safeguards for accessing mmhub CG registers

2022-01-26 Thread Deucher, Alexander
[Public]

Should we set *flags = 0 before we return?

Alex


From: Yu, Lang 
Sent: Wednesday, January 26, 2022 2:53 AM
To: amd-gfx@lists.freedesktop.org 
Cc: Deucher, Alexander ; Lazar, Lijo 
; Huang, Ray ; Yu, Lang 
Subject: [PATCH] drm/amdgpu: add safeguards for accessing mmhub CG registers

We observed a gpu hang when querying mmhub CG status(i.e.,
cat amdgpu_pm_info) on cyan skillfish. Acctually, cyan
skillfish doesn't support any CG features.

Only allow asics which support CG features accessing related
registers. Will add similar safeguards for other IPs in the
furture.

Signed-off-by: Lang Yu 
---
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c | 3 +++
 5 files changed, 15 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
index 4c9f0c0f3116..1869e2019461 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
@@ -550,6 +550,9 @@ static void mmhub_v1_0_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
 {
 int data, data1;

+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
 if (amdgpu_sriov_vf(adev))
 *flags = 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
index 3b901f941627..f7b9843b36e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
@@ -546,6 +546,9 @@ static void mmhub_v1_7_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
 {
 int data, data1;

+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
 if (amdgpu_sriov_vf(adev))
 *flags = 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
index 3718ff610ab2..3f5f326379b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
@@ -686,6 +686,9 @@ static void mmhub_v2_0_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
 {
 int data, data1;

+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
 if (amdgpu_sriov_vf(adev))
 *flags = 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c
index 9e16da28505a..b23dd9ddfb5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c
@@ -580,6 +580,9 @@ static void mmhub_v2_3_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
 {
 int data, data1, data2, data3;

+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
 if (amdgpu_sriov_vf(adev))
 *flags = 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
index 619106f7d23d..a2d5c8424e2b 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
@@ -651,6 +651,9 @@ static void mmhub_v9_4_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
 {
 int data, data1;

+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
 if (amdgpu_sriov_vf(adev))
 *flags = 0;

--
2.25.1



RE: [PATCH] drm/amdgpu: add safeguards for accessing mmhub CG registers

2022-01-26 Thread Yu, Lang
[Public]

Hi Lijo,

For cyan skillfish, both adev->cg_flags and adev->pg_flags are zero. I just 
found "RREG32_SOC15(MMHUB, 0, mmMM_ATC_L2_MISC_CG);" in 
mmhub_v2_0_get_clockgating() caused a gpu hang(cat amdgpu_pm_info). I didn't 
check if it's some sort of PG which causes the issue.

Regards,
Lang

From: Lazar, Lijo 
Sent: Wednesday, January 26, 2022 8:06 PM
To: Yu, Lang ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Huang, Ray 
; Yu, Lang 
Subject: Re: [PATCH] drm/amdgpu: add safeguards for accessing mmhub CG registers


[Public]

Hi Lang,

There are ASICs in which driver doesn't enable CG, and then these flags will be 
false. However, the CG will be enabled by another component like VBIOS. Driver 
still reports the CG status eventhough driver doesn't enable it. For those, 
this logic doesn't work.

BTW, could you check if it's some sort of PG which causes the issue?

Thanks,
Lijo


Re: [PATCH] drm/amdgpu: add safeguards for accessing mmhub CG registers

2022-01-26 Thread Lazar, Lijo
[Public]

Hi Lang,

There are ASICs in which driver doesn't enable CG, and then these flags will be 
false. However, the CG will be enabled by another component like VBIOS. Driver 
still reports the CG status eventhough driver doesn't enable it. For those, 
this logic doesn't work.

BTW, could you check if it's some sort of PG which causes the issue?

Thanks,
Lijo


Re: [PATCH] drm/amdgpu: add safeguards for accessing mmhub CG registers

2022-01-26 Thread Christian König

Am 26.01.22 um 12:02 schrieb Lang Yu:

On 01/26/ , Christian König wrote:

Am 26.01.22 um 08:53 schrieb Lang Yu:

We observed a gpu hang when querying mmhub CG status(i.e.,
cat amdgpu_pm_info) on cyan skillfish. Acctually, cyan
skillfish doesn't support any CG features.

Only allow asics which support CG features accessing related
registers. Will add similar safeguards for other IPs in the
furture.

I think you should probably add a macro or function for this check, apart
from that looks good to me.

Thanks for you advice. Is it fine to use such a macro?

#define amdgpu_device_cg_flag_isset(flag) ((adev->cg_flags) & (flag))


No, first of all that can also be a function and doesn't need to be a macro.

Then we should probably encapsulate more functionality or otherwise it 
is rather usless.


Think more about a function like amdgpu_gmc_cg_enabled() instead.

Regards,
Christian.



Regards,
Lang


Christian.


Signed-off-by: Lang Yu 
---
   drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c | 3 +++
   drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c | 3 +++
   drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c | 3 +++
   drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c | 3 +++
   drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c | 3 +++
   5 files changed, 15 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
index 4c9f0c0f3116..1869e2019461 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
@@ -550,6 +550,9 @@ static void mmhub_v1_0_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
   {
int data, data1;
+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
if (amdgpu_sriov_vf(adev))
*flags = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
index 3b901f941627..f7b9843b36e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
@@ -546,6 +546,9 @@ static void mmhub_v1_7_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
   {
int data, data1;
+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
if (amdgpu_sriov_vf(adev))
*flags = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
index 3718ff610ab2..3f5f326379b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
@@ -686,6 +686,9 @@ static void mmhub_v2_0_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
   {
int data, data1;
+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
if (amdgpu_sriov_vf(adev))
*flags = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c
index 9e16da28505a..b23dd9ddfb5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c
@@ -580,6 +580,9 @@ static void mmhub_v2_3_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
   {
int data, data1, data2, data3;
+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
if (amdgpu_sriov_vf(adev))
*flags = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
index 619106f7d23d..a2d5c8424e2b 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
@@ -651,6 +651,9 @@ static void mmhub_v9_4_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
   {
int data, data1;
+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
if (amdgpu_sriov_vf(adev))
*flags = 0;




Re: [PATCH] drm/amdgpu: add safeguards for accessing mmhub CG registers

2022-01-26 Thread Christian König

Am 26.01.22 um 08:53 schrieb Lang Yu:

We observed a gpu hang when querying mmhub CG status(i.e.,
cat amdgpu_pm_info) on cyan skillfish. Acctually, cyan
skillfish doesn't support any CG features.

Only allow asics which support CG features accessing related
registers. Will add similar safeguards for other IPs in the
furture.


I think you should probably add a macro or function for this check, 
apart from that looks good to me.


Christian.



Signed-off-by: Lang Yu 
---
  drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c | 3 +++
  5 files changed, 15 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
index 4c9f0c0f3116..1869e2019461 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
@@ -550,6 +550,9 @@ static void mmhub_v1_0_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
  {
int data, data1;
  
+	if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))

+   return;
+
if (amdgpu_sriov_vf(adev))
*flags = 0;
  
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c

index 3b901f941627..f7b9843b36e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
@@ -546,6 +546,9 @@ static void mmhub_v1_7_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
  {
int data, data1;
  
+	if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))

+   return;
+
if (amdgpu_sriov_vf(adev))
*flags = 0;
  
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

index 3718ff610ab2..3f5f326379b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
@@ -686,6 +686,9 @@ static void mmhub_v2_0_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
  {
int data, data1;
  
+	if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))

+   return;
+
if (amdgpu_sriov_vf(adev))
*flags = 0;
  
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c

index 9e16da28505a..b23dd9ddfb5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c
@@ -580,6 +580,9 @@ static void mmhub_v2_3_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
  {
int data, data1, data2, data3;
  
+	if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))

+   return;
+
if (amdgpu_sriov_vf(adev))
*flags = 0;
  
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c

index 619106f7d23d..a2d5c8424e2b 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
@@ -651,6 +651,9 @@ static void mmhub_v9_4_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
  {
int data, data1;
  
+	if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))

+   return;
+
if (amdgpu_sriov_vf(adev))
*flags = 0;