RE: [PATCH] drm/amdgpu: Use MAX_HWIP instead of HW_ID_MAX

2021-11-25 Thread Chen, Guchun
[Public]

Reviewed-by: Guchun Chen 

Regards,
Guchun

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Friday, November 26, 2021 2:43 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Zhang, Hawking 

Subject: [PATCH] drm/amdgpu: Use MAX_HWIP instead of HW_ID_MAX

HW_ID_MAX considers HWID of all IPs, far more than what amdgpu uses.
amdgpu tracks only the IPs defined by amd_hw_ip_block_type whose max is 
MAX_HWIP.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index b85b67a88a3d..c5cfe2926ca1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1096,7 +1096,7 @@ struct amdgpu_device {
pci_channel_state_t pci_channel_state;
 
struct amdgpu_reset_control *reset_cntl;
-   uint32_t
ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
+   uint32_t
ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
--
2.25.1


Re: [PATCH 0/2] Create shared array of power profile name strings

2021-11-25 Thread Lazar, Lijo




On 11/25/2021 7:49 AM, Darren Powell wrote:

== Description ==
  All the power profile modes use the same strings (or a subset of)
  Creating a public array of the strings will allow sharing rather than
  duplicating for each chip
  First patch only implements change for navi10
  Second patch extends the changes to all other implementations of
  pp_hwmgr_func->get_power_profile_mode  (smu10, smu7, vega10, vega20)
  and pptable_funcs->get_power_profile_mode  (arcturus, sienna_cichlid, 
vangogh, renoir)

=== Test System ===
  * DESKTOP(AMD FX-8350 + NAVI10(731F/ca), BIOS: F2)
   + ISO(Ubuntu 20.04.3 LTS)
   + Kernel(5.13.0-geabeb4f20a07-fdoagd5f)

=== Patch Summary ===
linux: (g...@gitlab.freedesktop.org:agd5f) origin/amd-staging-drm-next @ 
1e7a606dca04
 + 482319edaabb amdgpu/pm: Create shared array of power profile name strings
 + 1e7a606dca04 amdgpu/pm: Modify implementations of get_power_profile_mode 
to use amdgpu_pp_profile_name

=== Test ===
  LOGFILE=pp_profile_strings.test.log
  AMDGPU_PCI_ADDR=`lspci -nn | grep "VGA\|Display" | cut -d " " -f 1`
  AMDGPU_HWMON=`ls -la /sys/class/hwmon | grep $AMDGPU_PCI_ADDR | awk '{print 
$9}'`
  HWMON_DIR=/sys/class/hwmon/${AMDGPU_HWMON}

  lspci -nn | grep "VGA\|Display"  > $LOGFILE
  FILES="pp_power_profile_mode "

  for f in $FILES
  do
echo === $f === >> $LOGFILE
cat $HWMON_DIR/device/$f >> $LOGFILE
  done
  cat $LOGFILE

Darren Powell (2):
   amdgpu/pm: Create shared array of power profile name strings
    amdgpu/pm: Modify implementations of get_power_profile_mode to use
  amdgpu_pp_profile_name



Series is -
Reviewed-by: Lijo Lazar 

Thanks,
Lijo


  drivers/gpu/drm/amd/include/kgd_pp_interface.h |  4 
  drivers/gpu/drm/amd/pm/amdgpu_pm.c | 10 ++
  .../gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c   |  9 +
  .../gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c| 14 +++---
  .../gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c  | 12 +++-
  .../gpu/drm/amd/pm/powerplay/hwmgr/vega20_hwmgr.c  | 10 +-
  drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c  | 10 +-
  drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c| 10 +-
  .../drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c| 10 +-
  drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c   | 10 +-
  drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c| 10 +-
  11 files changed, 27 insertions(+), 82 deletions(-)


base-commit: eabeb4f20a0786188fba07a2dd1b0a614c4e15f6



[PATCH] drm/amdgpu: Use MAX_HWIP instead of HW_ID_MAX

2021-11-25 Thread Lijo Lazar
HW_ID_MAX considers HWID of all IPs, far more than what amdgpu uses.
amdgpu tracks only the IPs defined by amd_hw_ip_block_type whose max
is MAX_HWIP.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index b85b67a88a3d..c5cfe2926ca1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1096,7 +1096,7 @@ struct amdgpu_device {
pci_channel_state_t pci_channel_state;
 
struct amdgpu_reset_control *reset_cntl;
-   uint32_t
ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
+   uint32_t
ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
-- 
2.25.1



Re: [PATCH v2] drm/amd/pm: Add warning for unexpected PG requests

2021-11-25 Thread Wang, Yang(Kevin)
[AMD Official Use Only]


Reviewed-by: Kevin Yang 

Best Regards,
Kevin

From: amd-gfx  on behalf of Lijo Lazar 

Sent: Friday, November 26, 2021 1:25 PM
To: amd-gfx@lists.freedesktop.org 
Cc: Deucher, Alexander ; Limonciello, Mario 
; Zhang, Hawking 
Subject: [PATCH v2] drm/amd/pm: Add warning for unexpected PG requests

v1: Ideally power gate/ungate requests shouldn't come when smu block is
uninitialized. Add a WARN message to check the origins if such a thing
ever happens.

v2: Use dev_WARN to log device info (Felix/Guchun).

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index e156add7b560..ea99afb38d2b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -277,8 +277,12 @@ static int smu_dpm_set_power_gate(void *handle,
 struct smu_context *smu = handle;
 int ret = 0;

-   if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled)
+   if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled) {
+   dev_WARN(smu->adev->dev,
+"SMU uninitialized but power %s requested for %u!\n",
+gate ? "gate" : "ungate", block_type);
 return -EOPNOTSUPP;
+   }

 switch (block_type) {
 /*
--
2.25.1



Re: [PATCH] drm/amdgpu: declare static function to fix compiler warning

2021-11-25 Thread Wang, Yang(Kevin)
[AMD Official Use Only]

Reviewed-by: Kevin Wang 

It is reasonable to make it a static function, which matches the 
function acquire_psp_cmd_buf().

Best Regards,
Kevin

From: amd-gfx  on behalf of Guchun Chen 

Sent: Friday, November 26, 2021 1:16 PM
To: amd-gfx@lists.freedesktop.org ; Deucher, 
Alexander ; Koenig, Christian 
; Pan, Xinhui ; Clements, John 

Cc: Chen, Guchun 
Subject: [PATCH] drm/amdgpu: declare static function to fix compiler warning

>> drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c:503:6: warning: no previous 
>> prototype for function 'release_psp_cmd_buf' [-Wmissing-prototypes]
   void release_psp_cmd_buf(struct psp_context *psp)
^
   drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c:503:1: note: declare 'static' if the 
function is not intended to be used outside of this translation unit
   void release_psp_cmd_buf(struct psp_context *psp)
   ^
   static
   1 warning generated.

Reported-by: kernel test robot 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index c641f84649d6..b48d68d30d80 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -518,7 +518,7 @@ static struct psp_gfx_cmd_resp *acquire_psp_cmd_buf(struct 
psp_context *psp)
 return cmd;
 }

-void release_psp_cmd_buf(struct psp_context *psp)
+static void release_psp_cmd_buf(struct psp_context *psp)
 {
 mutex_unlock(>mutex);
 }
--
2.17.1



Re: [PATCH] drm/amdgpu: fix the missed handling for SDMA2 and SDMA3

2021-11-25 Thread Wang, Yang(Kevin)
[AMD Official Use Only]

Reviewed-by: Kevin Wang 

Best Regards,
Kevin


From: amd-gfx  on behalf of Guchun Chen 

Sent: Friday, November 26, 2021 1:12 PM
To: amd-gfx@lists.freedesktop.org ; Deucher, 
Alexander ; Koenig, Christian 
; Pan, Xinhui 
Cc: Chen, Guchun 
Subject: [PATCH] drm/amdgpu: fix the missed handling for SDMA2 and SDMA3

There is no base reg offset or ip_version set for SDMA2
and SDMA3 on SIENNA_CICHLID, so add them.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index f6fae79203ee..ea00090b3fb3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -157,6 +157,8 @@ static int hw_id_map[MAX_HWIP] = {
 [HDP_HWIP]  = HDP_HWID,
 [SDMA0_HWIP]= SDMA0_HWID,
 [SDMA1_HWIP]= SDMA1_HWID,
+   [SDMA2_HWIP]= SDMA2_HWID,
+   [SDMA3_HWIP]= SDMA3_HWID,
 [MMHUB_HWIP]= MMHUB_HWID,
 [ATHUB_HWIP]= ATHUB_HWID,
 [NBIO_HWIP] = NBIF_HWID,
--
2.17.1



RE: [PATCH v2] drm/amd/pm: Add warning for unexpected PG requests

2021-11-25 Thread Chen, Guchun
[Public]

Reviewed-by: Guchun Chen 

Regards,
Guchun

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Friday, November 26, 2021 1:25 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Limonciello, Mario 
; Zhang, Hawking 
Subject: [PATCH v2] drm/amd/pm: Add warning for unexpected PG requests

v1: Ideally power gate/ungate requests shouldn't come when smu block is 
uninitialized. Add a WARN message to check the origins if such a thing ever 
happens.

v2: Use dev_WARN to log device info (Felix/Guchun).

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index e156add7b560..ea99afb38d2b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -277,8 +277,12 @@ static int smu_dpm_set_power_gate(void *handle,
struct smu_context *smu = handle;
int ret = 0;
 
-   if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled)
+   if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled) {
+   dev_WARN(smu->adev->dev,
+"SMU uninitialized but power %s requested for %u!\n",
+gate ? "gate" : "ungate", block_type);
return -EOPNOTSUPP;
+   }
 
switch (block_type) {
/*
--
2.25.1


Re: [PATCH] drm/amd/pm: Add warning for unexpected PG requests

2021-11-25 Thread Lazar, Lijo




On 11/25/2021 10:35 PM, Felix Kuehling wrote:

Am 2021-11-25 um 8:32 a.m. schrieb Lazar, Lijo:



On 11/25/2021 6:52 PM, Chen, Guchun wrote:

[Public]

Use dev_warn to be mGPU friendly?


The intention is to get a trace as well along with that. There are
multiple paths to this function.


There is also a dev_WARN and dev_WARN_ONCE.



Thanks Felix for the pointer. Sent a revised version.

Thanks,
Lijo


Regards,
   Felix




Thanks,
Lijo



Regards,
Guchun

-Original Message-
From: amd-gfx  On Behalf Of
Lijo Lazar
Sent: Thursday, November 25, 2021 7:51 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Limonciello,
Mario ; Zhang, Hawking

Subject: [PATCH] drm/amd/pm: Add warning for unexpected PG requests

Ideally power gate/ungate requests shouldn't come when smu block is
uninitialized. Add a WARN message to check the origins if such a
thing ever happens.

Signed-off-by: Lijo Lazar 
---
   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 5 -
   1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index e156add7b560..e0f8ab8be975 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -277,8 +277,11 @@ static int smu_dpm_set_power_gate(void *handle,
   struct smu_context *smu = handle;
   int ret = 0;
   -    if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled)
+    if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled) {
+    WARN(true, "SMU uninitialized but power %s requested for
%u!\n",
+ gate ? "gate" : "ungate", block_type);
   return -EOPNOTSUPP;
+    }
     switch (block_type) {
   /*
--
2.25.1



[PATCH v2] drm/amd/pm: Add warning for unexpected PG requests

2021-11-25 Thread Lijo Lazar
v1: Ideally power gate/ungate requests shouldn't come when smu block is
uninitialized. Add a WARN message to check the origins if such a thing
ever happens.

v2: Use dev_WARN to log device info (Felix/Guchun).

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index e156add7b560..ea99afb38d2b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -277,8 +277,12 @@ static int smu_dpm_set_power_gate(void *handle,
struct smu_context *smu = handle;
int ret = 0;
 
-   if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled)
+   if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled) {
+   dev_WARN(smu->adev->dev,
+"SMU uninitialized but power %s requested for %u!\n",
+gate ? "gate" : "ungate", block_type);
return -EOPNOTSUPP;
+   }
 
switch (block_type) {
/*
-- 
2.25.1



[PATCH] drm/amdgpu: declare static function to fix compiler warning

2021-11-25 Thread Guchun Chen
>> drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c:503:6: warning: no previous 
>> prototype for function 'release_psp_cmd_buf' [-Wmissing-prototypes]
   void release_psp_cmd_buf(struct psp_context *psp)
^
   drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c:503:1: note: declare 'static' if the 
function is not intended to be used outside of this translation unit
   void release_psp_cmd_buf(struct psp_context *psp)
   ^
   static
   1 warning generated.

Reported-by: kernel test robot 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index c641f84649d6..b48d68d30d80 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -518,7 +518,7 @@ static struct psp_gfx_cmd_resp *acquire_psp_cmd_buf(struct 
psp_context *psp)
return cmd;
 }
 
-void release_psp_cmd_buf(struct psp_context *psp)
+static void release_psp_cmd_buf(struct psp_context *psp)
 {
mutex_unlock(>mutex);
 }
-- 
2.17.1



[PATCH] drm/amdgpu: fix the missed handling for SDMA2 and SDMA3

2021-11-25 Thread Guchun Chen
There is no base reg offset or ip_version set for SDMA2
and SDMA3 on SIENNA_CICHLID, so add them.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index f6fae79203ee..ea00090b3fb3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -157,6 +157,8 @@ static int hw_id_map[MAX_HWIP] = {
[HDP_HWIP]  = HDP_HWID,
[SDMA0_HWIP]= SDMA0_HWID,
[SDMA1_HWIP]= SDMA1_HWID,
+   [SDMA2_HWIP]= SDMA2_HWID,
+   [SDMA3_HWIP]= SDMA3_HWID,
[MMHUB_HWIP]= MMHUB_HWID,
[ATHUB_HWIP]= ATHUB_HWID,
[NBIO_HWIP] = NBIF_HWID,
-- 
2.17.1



RE: [PATCH 1/9] drm/amdgpu:Define the unified ras function pointers of each IP block

2021-11-25 Thread Chai, Thomas
Hi Lijo:
   I have added my reply after your comments.

Thanks,
Thomas
-Original Message-
From: Lazar, Lijo  
Sent: Thursday, November 25, 2021 7:41 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas 
Subject: Re: [PATCH 1/9] drm/amdgpu:Define the unified ras function pointers of 
each IP block



On 11/25/2021 4:26 PM, yipechai wrote:
> Define an unified ras function pointers for each ip block to adapt.
> 
> Signed-off-by: yipechai 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 36 -
>   2 files changed, 37 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 90f0db3b4f65..dc6c8130e2d7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2739,3 +2739,23 @@ static void 
> amdgpu_register_bad_pages_mca_notifier(void)
>   }
>   }
>   #endif
> +
> +/* check if ras is supported on block, say, sdma, gfx */ int 
> +amdgpu_ras_is_supported(struct amdgpu_device *adev,
> + unsigned int block)
> +{
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> + if (block >= AMDGPU_RAS_BLOCK_COUNT)
> + return 0;
> + return ras && (adev->ras_enabled & (1 << block)); }
> +
> +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) {
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> + if (atomic_cmpxchg(>in_recovery, 0, 1) == 0)
> + schedule_work(>recovery_work);
> + return 0;
> +}

>These changes look unrelated. Maybe as another patch to move from .h file to 
>.c file.
   When adding amdgpu_ras.h to other IP blocks' .h files (such as amdgpu_gfx.h,
amdgpu_xgmi.h ...) so that other blocks can use 'struct amdgpu_ras_block_ops', the
compilation produces an error:
“drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h:499:46: error: dereferencing 
pointer to incomplete type ‘struct amdgpu_device’
 499 | #define amdgpu_ras_get_context(adev)  
((adev)->psp.ras_context.ras)”
   The struct amdgpu_device is defined in the amdgpu.h file, and amdgpu.h
is included in amdgpu_ras.h; it seems there are some problems with
.h file cross-inclusion. Since amdgpu_ras_get_context(adev) is only
used in the functions 'amdgpu_ras_is_supported' and
'amdgpu_ras_reset_gpu', moving these two functions to the .c file makes the
compilation succeed.

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index cdd0010a5389..4b7da40dd837 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -469,6 +469,19 @@ struct ras_debug_if {
>   };
>   int op;
>   };
> +
> +struct amdgpu_ras_block_ops {
> + int (*ras_late_init)(struct amdgpu_device *adev);
> + void (*ras_fini)(struct amdgpu_device *adev);
> + int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
> + void  (*query_ras_error_count)(struct amdgpu_device *adev,void 
> *ras_error_status);
> + void (*query_ras_error_status)(struct amdgpu_device *adev);
> + bool  (*query_ras_poison_mode)(struct amdgpu_device *adev);
> + void (*query_ras_error_address)(struct amdgpu_device *adev, void 
> *ras_error_status);
> + void (*reset_ras_error_count)(struct amdgpu_device *adev);
> + void (*reset_ras_error_status)(struct amdgpu_device *adev); };
> +

>Generic comment - Since all the operations are consolidated under _ops, it 
>makes sense to rename the _ras_funcs to _ras.

>Ex: amdgpu_gfx_ras_funcs => amdgpu_gfx_ras, amdgpu_xgmi_ras_funcs => 
>amdgpu_xgmi_ras and so forth.

>In future, these ras blocks may have data members to keep IP specific ras data.

OK, I will do it.

Thanks,
Lijo

>   /* work flow
>* vbios
>* 1: ras feature enable (enabled by default) @@ -486,16 +499,6 @@ 
> struct ras_debug_if {
>   #define amdgpu_ras_get_context(adev)
> ((adev)->psp.ras_context.ras)
>   #define amdgpu_ras_set_context(adev, ras_con)   
> ((adev)->psp.ras_context.ras = (ras_con))
>   
> -/* check if ras is supported on block, say, sdma, gfx */ -static 
> inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
> - unsigned int block)
> -{
> - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> -
> - if (block >= AMDGPU_RAS_BLOCK_COUNT)
> - return 0;
> - return ras && (adev->ras_enabled & (1 << block));
> -}
>   
>   int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
>   
> @@ -512,15 +515,6 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device 
> *adev,
>   
>   int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
>   
> -static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) -{
> - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> -
> - if (atomic_cmpxchg(>in_recovery, 0, 

[PATCH AUTOSEL 5.10 19/28] drm/amd/amdgpu: fix potential memleak

2021-11-25 Thread Sasha Levin
From: Bernard Zhao 

[ Upstream commit 27dfaedc0d321b4ea4e10c53e4679d6911ab17aa ]

In function amdgpu_get_xgmi_hive, when kobject_init_and_add fails,
there is a potential memory leak if kobject_put is not called.

Reviewed-by: Felix Kuehling 
Signed-off-by: Bernard Zhao 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 0526dec1d736e..042c85fc528bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -358,6 +358,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct 
amdgpu_device *adev)
"%s", "xgmi_hive_info");
if (ret) {
dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi 
hive\n");
+   kobject_put(>kobj);
kfree(hive);
hive = NULL;
goto pro_end;
-- 
2.33.0



[PATCH AUTOSEL 5.10 18/28] drm/amd/amdkfd: Fix kernel panic when reset failed and been triggered again

2021-11-25 Thread Sasha Levin
From: shaoyunl 

[ Upstream commit 2cf49e00d40d5132e3d067b5aa6d84791929ab15 ]

In an SRIOV configuration, the reset may fail to bring the asic back to normal, but
stop_cpsch has already been called; start_cpsch will not be called since there is no
resume in this case. When a reset is triggered again, the driver should avoid doing
the uninitialization again.

Signed-off-by: shaoyunl 
Reviewed-by: Felix Kuehling 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 352a32dc609b2..2645ebc63a14d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1207,6 +1207,11 @@ static int stop_cpsch(struct device_queue_manager *dqm)
bool hanging;
 
dqm_lock(dqm);
+   if (!dqm->sched_running) {
+   dqm_unlock(dqm);
+   return 0;
+   }
+
if (!dqm->is_hws_hang)
unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
hanging = dqm->is_hws_hang || dqm->is_resetting;
-- 
2.33.0



[PATCH AUTOSEL 5.15 26/39] drm/amd/amdgpu: fix potential memleak

2021-11-25 Thread Sasha Levin
From: Bernard Zhao 

[ Upstream commit 27dfaedc0d321b4ea4e10c53e4679d6911ab17aa ]

In function amdgpu_get_xgmi_hive, when kobject_init_and_add fails,
there is a potential memory leak if kobject_put is not called.

Reviewed-by: Felix Kuehling 
Signed-off-by: Bernard Zhao 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 978ac927ac11d..a799e0b1ff736 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -386,6 +386,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct 
amdgpu_device *adev)
"%s", "xgmi_hive_info");
if (ret) {
dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi 
hive\n");
+   kobject_put(>kobj);
kfree(hive);
hive = NULL;
goto pro_end;
-- 
2.33.0



[PATCH AUTOSEL 5.15 25/39] drm/amd/amdkfd: Fix kernel panic when reset failed and been triggered again

2021-11-25 Thread Sasha Levin
From: shaoyunl 

[ Upstream commit 2cf49e00d40d5132e3d067b5aa6d84791929ab15 ]

In an SRIOV configuration, the reset may fail to bring the asic back to normal, but
stop_cpsch has already been called; start_cpsch will not be called since there is no
resume in this case. When a reset is triggered again, the driver should avoid doing
the uninitialization again.

Signed-off-by: shaoyunl 
Reviewed-by: Felix Kuehling 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f8fce9d05f50c..4f2e0cc8a51a8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1225,6 +1225,11 @@ static int stop_cpsch(struct device_queue_manager *dqm)
bool hanging;
 
dqm_lock(dqm);
+   if (!dqm->sched_running) {
+   dqm_unlock(dqm);
+   return 0;
+   }
+
if (!dqm->is_hws_hang)
unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
hanging = dqm->is_hws_hang || dqm->is_resetting;
-- 
2.33.0



[PATCH AUTOSEL 5.15 24/39] drm/amd/pm: Remove artificial freq level on Navi1x

2021-11-25 Thread Sasha Levin
From: Lijo Lazar 

[ Upstream commit be83a5676767c99c2417083c29d42aa1e109a69d ]

Print Navi1x fine grained clocks in a consistent manner with other SOCs.
Don't show an artificial DPM level when the current clock equals min or max.

Signed-off-by: Lijo Lazar 
Reviewed-by: Evan Quan 
Acked-by: Alex Deucher 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
index b1ad451af06bd..dfba0bc732073 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
@@ -1265,7 +1265,7 @@ static int navi10_print_clk_levels(struct smu_context 
*smu,
enum smu_clk_type clk_type, char *buf)
 {
uint16_t *curve_settings;
-   int i, size = 0, ret = 0;
+   int i, levels, size = 0, ret = 0;
uint32_t cur_value = 0, value = 0, count = 0;
uint32_t freq_values[3] = {0};
uint32_t mark_index = 0;
@@ -1319,14 +1319,17 @@ static int navi10_print_clk_levels(struct smu_context 
*smu,
freq_values[1] = cur_value;
mark_index = cur_value == freq_values[0] ? 0 :
 cur_value == freq_values[2] ? 2 : 1;
-   if (mark_index != 1)
-   freq_values[1] = (freq_values[0] + 
freq_values[2]) / 2;
 
-   for (i = 0; i < 3; i++) {
+   levels = 3;
+   if (mark_index != 1) {
+   levels = 2;
+   freq_values[1] = freq_values[2];
+   }
+
+   for (i = 0; i < levels; i++) {
size += sysfs_emit_at(buf, size, "%d: %uMhz 
%s\n", i, freq_values[i],
i == mark_index ? "*" : "");
}
-
}
break;
case SMU_PCIE:
-- 
2.33.0



Re: [PATCH v2 2/2] drm/amdkfd: Slighly optimize 'init_doorbell_bitmap()'

2021-11-25 Thread Felix Kuehling
Am 2021-11-23 um 3:46 p.m. schrieb Christophe JAILLET:
> The 'doorbell_bitmap' bitmap has just been allocated. So we can use the
> non-atomic '__set_bit()' function to save a few cycles as no concurrent
> access can happen.
>
> Reviewed-by: Felix Kuehling 
> Signed-off-by: Christophe JAILLET 

Thank you! I applied the series to amd-staging-drm-next.

Regards,
  Felix


> ---
> bitmap_set() could certainly also be use, but range checking would be
> tricky.
>
> v1 --> v2: No change
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 67bb1654becc..9158f9754a24 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1446,9 +1446,9 @@ static int init_doorbell_bitmap(struct 
> qcm_process_device *qpd,
>  
>   for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS / 2; i++) {
>   if (i >= range_start && i <= range_end) {
> - set_bit(i, qpd->doorbell_bitmap);
> - set_bit(i + KFD_QUEUE_DOORBELL_MIRROR_OFFSET,
> - qpd->doorbell_bitmap);
> + __set_bit(i, qpd->doorbell_bitmap);
> + __set_bit(i + KFD_QUEUE_DOORBELL_MIRROR_OFFSET,
> +   qpd->doorbell_bitmap);
>   }
>   }
>  


[PATCH v1] drm/amd/display: Add DP-HDMI PCON Support in DC

2021-11-25 Thread Fangzhi Zuo
Signed-off-by: Fangzhi Zuo 
---
 drivers/gpu/drm/amd/display/dc/core/dc_link.c | 15 
 .../gpu/drm/amd/display/dc/core/dc_link_dp.c  | 71 +++
 drivers/gpu/drm/amd/display/dc/dc.h   |  6 ++
 drivers/gpu/drm/amd/display/dc/dc_dp_types.h  | 31 
 drivers/gpu/drm/amd/display/dc/dc_hw_types.h  |  3 +
 drivers/gpu/drm/amd/display/dc/dc_link.h  |  1 +
 drivers/gpu/drm/amd/display/dc/dc_types.h |  1 +
 .../drm/amd/display/dc/dcn20/dcn20_resource.c |  2 +
 .../drm/amd/display/dc/dcn21/dcn21_resource.c |  2 +
 .../drm/amd/display/dc/dcn30/dcn30_resource.c |  2 +
 .../drm/amd/display/dc/dcn31/dcn31_resource.c |  1 +
 11 files changed, 135 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c 
b/drivers/gpu/drm/amd/display/dc/core/dc_link.c
index 3d08f8eba402..dad7a4fdc427 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c
@@ -2750,8 +2750,23 @@ static bool dp_active_dongle_validate_timing(
return false;
}
 
+#if defined(CONFIG_DRM_AMD_DC_DCN)
+   if (dongle_caps->dp_hdmi_frl_max_link_bw_in_kbps > 0) { // DP to HDMI 
FRL converter
+   struct dc_crtc_timing outputTiming = *timing;
+
+   if (timing->flags.DSC && !timing->dsc_cfg.is_frl)
+   /* DP input has DSC, HDMI FRL output doesn't have DSC, 
remove DSC from output timing */
+   outputTiming.flags.DSC = 0;
+   if (dc_bandwidth_in_kbps_from_timing() > 
dongle_caps->dp_hdmi_frl_max_link_bw_in_kbps)
+   return false;
+   } else { // DP to HDMI TMDS converter
+   if (get_timing_pixel_clock_100hz(timing) > 
(dongle_caps->dp_hdmi_max_pixel_clk_in_khz * 10))
+   return false;
+   }
+#else
if (get_timing_pixel_clock_100hz(timing) > 
(dongle_caps->dp_hdmi_max_pixel_clk_in_khz * 10))
return false;
+#endif
 
 #if defined(CONFIG_DRM_AMD_DC_DCN)
}
diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c 
b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
index 84f3545c3032..da1532356c07 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
@@ -4313,6 +4313,56 @@ static int translate_dpcd_max_bpc(enum 
dpcd_downstream_port_max_bpc bpc)
return -1;
 }
 
+#if defined(CONFIG_DRM_AMD_DC_DCN)
+uint32_t dc_link_bw_kbps_from_raw_frl_link_rate_data(uint8_t bw)
+{
+   switch (bw) {
+   case 0b001:
+   return 900;
+   case 0b010:
+   return 1800;
+   case 0b011:
+   return 2400;
+   case 0b100:
+   return 3200;
+   case 0b101:
+   return 4000;
+   case 0b110:
+   return 4800;
+   }
+
+   return 0;
+}
+
+/**
+ * Return PCON's post FRL link training supported BW if its non-zero, 
otherwise return max_supported_frl_bw.
+ */
+static uint32_t intersect_frl_link_bw_support(
+   const uint32_t max_supported_frl_bw_in_kbps,
+   const union hdmi_encoded_link_bw hdmi_encoded_link_bw)
+{
+   uint32_t supported_bw_in_kbps = max_supported_frl_bw_in_kbps;
+
+   // HDMI_ENCODED_LINK_BW bits are only valid if HDMI Link Configuration 
bit is 1 (FRL mode)
+   if (hdmi_encoded_link_bw.bits.FRL_MODE) {
+   if (hdmi_encoded_link_bw.bits.BW_48Gbps)
+   supported_bw_in_kbps = 4800;
+   else if (hdmi_encoded_link_bw.bits.BW_40Gbps)
+   supported_bw_in_kbps = 4000;
+   else if (hdmi_encoded_link_bw.bits.BW_32Gbps)
+   supported_bw_in_kbps = 3200;
+   else if (hdmi_encoded_link_bw.bits.BW_24Gbps)
+   supported_bw_in_kbps = 2400;
+   else if (hdmi_encoded_link_bw.bits.BW_18Gbps)
+   supported_bw_in_kbps = 1800;
+   else if (hdmi_encoded_link_bw.bits.BW_9Gbps)
+   supported_bw_in_kbps = 900;
+   }
+
+   return supported_bw_in_kbps;
+}
+#endif
+
 static void read_dp_device_vendor_id(struct dc_link *link)
 {
struct dp_device_vendor_id dp_id;
@@ -4424,6 +4474,27 @@ static void get_active_converter_info(
translate_dpcd_max_bpc(

hdmi_color_caps.bits.MAX_BITS_PER_COLOR_COMPONENT);
 
+#if defined(CONFIG_DRM_AMD_DC_DCN)
+   if 
(link->dc->caps.hdmi_frl_pcon_support) {
+   
link->dpcd_caps.dongle_caps.dp_hdmi_frl_max_link_bw_in_kbps =
+   
dc_link_bw_kbps_from_raw_frl_link_rate_data(
+   
hdmi_color_caps.bits.MAX_ENCODED_LINK_BW_SUPPORT);
+
+

Re: [PATCH v8] drm/amdgpu: handle IH ring1 overflow

2021-11-25 Thread Felix Kuehling
Am 2021-11-25 um 3:30 p.m. schrieb Philip Yang:
> IH ring1 is used to process GPU retry fault, overflow is enabled to
> drain retry fault because we want receive other interrupts while
> handling retry fault to recover range. There is no overflow flag set
> when wptr pass rptr. Use timestamp of rptr and wptr to handle overflow
> and drain retry fault.
>
> If fault timestamp goes backward, the fault is filtered and should not
> be processed. Drain fault is finished if processed_timestamp is equal to
> or larger than checkpoint timestamp.
>
> Add amdgpu_ih_function interface decode_iv_ts for different chips to get
> timestamp from IV entry with different iv size and timestamp offset.
> amdgpu_ih_decode_iv_ts_helper is used for vega10, vega20, navi10.
>
> Signed-off-by: Philip Yang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c |  8 +++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  3 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c  | 55 +++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h  | 16 ++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c |  3 ++
>  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  2 +-
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  2 +-
>  drivers/gpu/drm/amd/amdgpu/navi10_ih.c  |  1 +
>  drivers/gpu/drm/amd/amdgpu/vega10_ih.c  |  1 +
>  drivers/gpu/drm/amd/amdgpu/vega20_ih.c  |  1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c|  2 +-
>  11 files changed, 57 insertions(+), 37 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 45761d0328c7..403a968f3d2f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -350,6 +350,7 @@ static inline uint64_t amdgpu_gmc_fault_key(uint64_t 
> addr, uint16_t pasid)
>   * amdgpu_gmc_filter_faults - filter VM faults
>   *
>   * @adev: amdgpu device structure
> + * @ih: interrupt ring that the fault received from
>   * @addr: address of the VM fault
>   * @pasid: PASID of the process causing the fault
>   * @timestamp: timestamp of the fault
> @@ -358,7 +359,8 @@ static inline uint64_t amdgpu_gmc_fault_key(uint64_t 
> addr, uint16_t pasid)
>   * True if the fault was filtered and should not be processed further.
>   * False if the fault is a new one and needs to be handled.
>   */
> -bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
> +bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
> +   struct amdgpu_ih_ring *ih, uint64_t addr,
> uint16_t pasid, uint64_t timestamp)
>  {
>   struct amdgpu_gmc *gmc = &adev->gmc;
> @@ -366,6 +368,10 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device 
> *adev, uint64_t addr,
>   struct amdgpu_gmc_fault *fault;
>   uint32_t hash;
>  
> + /* Stale retry fault if timestamp goes backward */
> + if (amdgpu_ih_ts_after(timestamp, ih->processed_timestamp))
> + return true;
> +
>   /* If we don't have space left in the ring buffer return immediately */
>   stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
>   AMDGPU_GMC_FAULT_TIMEOUT;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index e55201134a01..8458cebc6d5b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -316,7 +316,8 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev,
> struct amdgpu_gmc *mc);
>  void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
>struct amdgpu_gmc *mc);
> -bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
> +bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
> +   struct amdgpu_ih_ring *ih, uint64_t addr,
> uint16_t pasid, uint64_t timestamp);
>  void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t 
> addr,
>uint16_t pasid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> index 0c7963dfacad..8050f7ba93ad 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> @@ -164,52 +164,32 @@ void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, 
> const uint32_t *iv,
>   }
>  }
>  
> -/* Waiter helper that checks current rptr matches or passes checkpoint wptr 
> */
> -static bool amdgpu_ih_has_checkpoint_processed(struct amdgpu_device *adev,
> - struct amdgpu_ih_ring *ih,
> - uint32_t checkpoint_wptr,
> - uint32_t *prev_rptr)
> -{
> - uint32_t cur_rptr = ih->rptr | (*prev_rptr & ~ih->ptr_mask);
> -
> - /* rptr has wrapped. */
> - if (cur_rptr < *prev_rptr)
> - cur_rptr += ih->ptr_mask + 1;
> - *prev_rptr = cur_rptr;
> -
> - /* check ring is 

[PATCH v8] drm/amdgpu: handle IH ring1 overflow

2021-11-25 Thread Philip Yang
IH ring1 is used to process GPU retry fault, overflow is enabled to
drain retry fault because we want to receive other interrupts while
handling retry fault to recover range. There is no overflow flag set
when wptr passes rptr. Use timestamp of rptr and wptr to handle overflow
and drain retry fault.

If fault timestamp goes backward, the fault is filtered and should not
be processed. Drain fault is finished if processed_timestamp is equal to
or larger than checkpoint timestamp.

Add amdgpu_ih_function interface decode_iv_ts for different chips to get
timestamp from IV entry with different iv size and timestamp offset.
amdgpu_ih_decode_iv_ts_helper is used for vega10, vega20, navi10.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c |  8 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c  | 55 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h  | 16 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c |  3 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/navi10_ih.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/vega10_ih.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/vega20_ih.c  |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c|  2 +-
 11 files changed, 57 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 45761d0328c7..403a968f3d2f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -350,6 +350,7 @@ static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, 
uint16_t pasid)
  * amdgpu_gmc_filter_faults - filter VM faults
  *
  * @adev: amdgpu device structure
+ * @ih: interrupt ring that the fault received from
  * @addr: address of the VM fault
  * @pasid: PASID of the process causing the fault
  * @timestamp: timestamp of the fault
@@ -358,7 +359,8 @@ static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, 
uint16_t pasid)
  * True if the fault was filtered and should not be processed further.
  * False if the fault is a new one and needs to be handled.
  */
-bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
+bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
+ struct amdgpu_ih_ring *ih, uint64_t addr,
  uint16_t pasid, uint64_t timestamp)
 {
	struct amdgpu_gmc *gmc = &adev->gmc;
@@ -366,6 +368,10 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, 
uint64_t addr,
struct amdgpu_gmc_fault *fault;
uint32_t hash;
 
+   /* Stale retry fault if timestamp goes backward */
+   if (amdgpu_ih_ts_after(timestamp, ih->processed_timestamp))
+   return true;
+
/* If we don't have space left in the ring buffer return immediately */
stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
AMDGPU_GMC_FAULT_TIMEOUT;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index e55201134a01..8458cebc6d5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -316,7 +316,8 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev,
  struct amdgpu_gmc *mc);
 void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
 struct amdgpu_gmc *mc);
-bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
+bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
+ struct amdgpu_ih_ring *ih, uint64_t addr,
  uint16_t pasid, uint64_t timestamp);
 void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
 uint16_t pasid);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
index 0c7963dfacad..8050f7ba93ad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
@@ -164,52 +164,32 @@ void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, 
const uint32_t *iv,
}
 }
 
-/* Waiter helper that checks current rptr matches or passes checkpoint wptr */
-static bool amdgpu_ih_has_checkpoint_processed(struct amdgpu_device *adev,
-   struct amdgpu_ih_ring *ih,
-   uint32_t checkpoint_wptr,
-   uint32_t *prev_rptr)
-{
-   uint32_t cur_rptr = ih->rptr | (*prev_rptr & ~ih->ptr_mask);
-
-   /* rptr has wrapped. */
-   if (cur_rptr < *prev_rptr)
-   cur_rptr += ih->ptr_mask + 1;
-   *prev_rptr = cur_rptr;
-
-   /* check ring is empty to workaround missing wptr overflow flag */
-   return cur_rptr >= checkpoint_wptr ||
-  (cur_rptr & ih->ptr_mask) == amdgpu_ih_get_wptr(adev, ih);
-}
-
 /**
- * 

Re: [PATCH v7] drm/amdgpu: handle IH ring1 overflow

2021-11-25 Thread philip yang

  


On 2021-11-25 1:03 p.m., Felix Kuehling
  wrote:


  
Am 2021-11-25 um 12:52 p.m. schrieb Felix Kuehling:

  
Am 2021-11-25 um 10:16 a.m. schrieb Philip Yang:


  IH ring1 is used to process GPU retry fault, overflow is enabled to
drain retry fault because we want receive other interrupts while
handling retry fault to recover range. There is no overflow flag set
when wptr pass rptr. Use timestamp of rptr and wptr to handle overflow
and drain retry fault.

If fault timestamp goes backward, the fault is filtered and should not
be processed. Drain fault is finished if latest_decoded_timestamp is
equal to or larger than checkpoint timestamp.


If there can be multiple faults with the same time stamp, then this is
not sufficient because it would allow a stale fault with the same
timestamp to sneak through.

For example there are 3 faults with the same timestamp in the ring:

... <- rptr
...
fault1
fault2
fault3  <- wptr

The timestamp is taken from fault3, the current wptr.
amdgpu_ih_wait_on_checkpoint_process_ts returns when the rptr reaches
fault1 because it has the same timestamp.

fault1  <- rptr
fault2
fault3  <- wptr

At that time fault2 and fault3 are still stale faults that could lead to
a VM fault.

You would need to wait for latest_decoded_timestamp to be truly greater
than the checkpoint (or the ring to be empty) to be sure that you've
seen all stale faults. Other than that, this patch looks good to me.

  

fault timestamp keep increasing, never see the same timestamp,
probably because the timestamp clock is faster than the HW fills in
IV speed.

  

Regards,
  Felix




  Add amdgpu_ih_function interface decode_iv_ts for different chips to get
timestamp from IV entry with different iv size and timestamp offset.
amdgpu_ih_decode_iv_ts_helper is used for vega10, vega20, navi10.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c |  8 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c  | 57 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h  | 16 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/navi10_ih.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/vega10_ih.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/vega20_ih.c  |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c|  2 +-
 10 files changed, 56 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 45761d0328c7..45e08677207d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -350,6 +350,7 @@ static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t pasid)
  * amdgpu_gmc_filter_faults - filter VM faults
  *
  * @adev: amdgpu device structure
+ * @ih: interrupt ring that the fault received from
  * @addr: address of the VM fault
  * @pasid: PASID of the process causing the fault
  * @timestamp: timestamp of the fault
@@ -358,7 +359,8 @@ static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t pasid)
  * True if the fault was filtered and should not be processed further.
  * False if the fault is a new one and needs to be handled.
  */
-bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
+bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
+			  struct amdgpu_ih_ring *ih, uint64_t addr,
 			  uint16_t pasid, uint64_t timestamp)
 {
  	struct amdgpu_gmc *gmc = &adev->gmc;
@@ -366,6 +368,10 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
 	struct amdgpu_gmc_fault *fault;
 	uint32_t hash;
 
+	/* Stale retry fault if timestamp goes backward */
+	if (amdgpu_ih_ts_after(timestamp, ih->latest_decoded_timestamp))
+		return true;
+
 	/* If we don't have space left in the ring buffer return immediately */
 	stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
 		AMDGPU_GMC_FAULT_TIMEOUT;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index e55201134a01..8458cebc6d5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -316,7 +316,8 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev,
 			  struct amdgpu_gmc *mc);
 void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
 			 struct amdgpu_gmc *mc);
-bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
+bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
+			  struct amdgpu_ih_ring *ih, uint64_t addr,
 			  uint16_t pasid, uint64_t timestamp);
 void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
  uint16_t pasid);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
index 

Re: [PATCH v7] drm/amdgpu: handle IH ring1 overflow

2021-11-25 Thread Felix Kuehling


Am 2021-11-25 um 12:52 p.m. schrieb Felix Kuehling:
> Am 2021-11-25 um 10:16 a.m. schrieb Philip Yang:
>> IH ring1 is used to process GPU retry fault, overflow is enabled to
>> drain retry fault because we want receive other interrupts while
>> handling retry fault to recover range. There is no overflow flag set
>> when wptr pass rptr. Use timestamp of rptr and wptr to handle overflow
>> and drain retry fault.
>>
>> If fault timestamp goes backward, the fault is filtered and should not
>> be processed. Drain fault is finished if latest_decoded_timestamp is
>> equal to or larger than checkpoint timestamp.
> If there can be multiple faults with the same time stamp, then this is
> not sufficient because it would allow a stale fault with the same
> timestamp to sneak through.
>
> For example there are 3 faults with the same timestamp in the ring:
>
> ... <- rptr
> ...
> fault1
> fault2
> fault3  <- wptr
>
> The timestamp is taken from fault3, the current wptr.
> amdgpu_ih_wait_on_checkpoint_process_ts returns when the rptr reaches
> fault1 because it has the same timestamp.
>
> fault1  <- rptr
> fault2
> fault3  <- wptr
>
> At that time fault2 and fault3 are still stale faults that could lead to
> a VM fault.
>
> You would need to wait for latest_decoded_timestamp to be truly greater
> than the checkpoint (or the ring to be empty) to be sure that you've
> seen all stale faults. Other than that, this patch looks good to me.
>
> Regards,
>   Felix
>
>
>> Add amdgpu_ih_function interface decode_iv_ts for different chips to get
>> timestamp from IV entry with different iv size and timestamp offset.
>> amdgpu_ih_decode_iv_ts_helper is used for vega10, vega20, navi10.
>>
>> Signed-off-by: Philip Yang 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c |  8 +++-
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  3 +-
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c  | 57 -
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h  | 16 ++-
>>  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  2 +-
>>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  2 +-
>>  drivers/gpu/drm/amd/amdgpu/navi10_ih.c  |  1 +
>>  drivers/gpu/drm/amd/amdgpu/vega10_ih.c  |  1 +
>>  drivers/gpu/drm/amd/amdgpu/vega20_ih.c  |  1 +
>>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c|  2 +-
>>  10 files changed, 56 insertions(+), 37 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> index 45761d0328c7..45e08677207d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> @@ -350,6 +350,7 @@ static inline uint64_t amdgpu_gmc_fault_key(uint64_t 
>> addr, uint16_t pasid)
>>   * amdgpu_gmc_filter_faults - filter VM faults
>>   *
>>   * @adev: amdgpu device structure
>> + * @ih: interrupt ring that the fault received from
>>   * @addr: address of the VM fault
>>   * @pasid: PASID of the process causing the fault
>>   * @timestamp: timestamp of the fault
>> @@ -358,7 +359,8 @@ static inline uint64_t amdgpu_gmc_fault_key(uint64_t 
>> addr, uint16_t pasid)
>>   * True if the fault was filtered and should not be processed further.
>>   * False if the fault is a new one and needs to be handled.
>>   */
>> -bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
>> +bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
>> +  struct amdgpu_ih_ring *ih, uint64_t addr,
>>uint16_t pasid, uint64_t timestamp)
>>  {
>>  	struct amdgpu_gmc *gmc = &adev->gmc;
>> @@ -366,6 +368,10 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device 
>> *adev, uint64_t addr,
>>  struct amdgpu_gmc_fault *fault;
>>  uint32_t hash;
>>  
>> +/* Stale retry fault if timestamp goes backward */
>> +if (amdgpu_ih_ts_after(timestamp, ih->latest_decoded_timestamp))
>> +return true;
>> +
>>  /* If we don't have space left in the ring buffer return immediately */
>>  stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
>>  AMDGPU_GMC_FAULT_TIMEOUT;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> index e55201134a01..8458cebc6d5b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> @@ -316,7 +316,8 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev,
>>struct amdgpu_gmc *mc);
>>  void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
>>   struct amdgpu_gmc *mc);
>> -bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
>> +bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
>> +  struct amdgpu_ih_ring *ih, uint64_t addr,
>>uint16_t pasid, uint64_t timestamp);
>>  void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t 
>> addr,
>>   uint16_t 

Re: [PATCH v7] drm/amdgpu: handle IH ring1 overflow

2021-11-25 Thread Felix Kuehling
Am 2021-11-25 um 10:16 a.m. schrieb Philip Yang:
> IH ring1 is used to process GPU retry fault, overflow is enabled to
> drain retry fault because we want receive other interrupts while
> handling retry fault to recover range. There is no overflow flag set
> when wptr pass rptr. Use timestamp of rptr and wptr to handle overflow
> and drain retry fault.
>
> If fault timestamp goes backward, the fault is filtered and should not
> be processed. Drain fault is finished if latest_decoded_timestamp is
> equal to or larger than checkpoint timestamp.

If there can be multiple faults with the same time stamp, then this is
not sufficient because it would allow a stale fault with the same
timestamp to sneak through.

For example there are 3 faults with the same timestamp in the ring:

... <- rptr
...
fault1
fault2
fault3  <- wptr

The timestamp is taken from fault3, the current wptr.
amdgpu_ih_wait_on_checkpoint_process_ts returns when the rptr reaches
fault1 because it has the same timestamp.

fault1  <- rptr
fault2
fault3  <- wptr

At that time fault2 and fault3 are still stale faults that could lead to
a VM fault.

You would need to wait for latest_decoded_timestamp to be truly greater
than the checkpoint (or the ring to be empty) to be sure that you've
seen all stale faults. Other than that, this patch looks good to me.

Regards,
  Felix


>
> Add amdgpu_ih_function interface decode_iv_ts for different chips to get
> timestamp from IV entry with different iv size and timestamp offset.
> amdgpu_ih_decode_iv_ts_helper is used for vega10, vega20, navi10.
>
> Signed-off-by: Philip Yang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c |  8 +++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  3 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c  | 57 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h  | 16 ++-
>  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  2 +-
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  2 +-
>  drivers/gpu/drm/amd/amdgpu/navi10_ih.c  |  1 +
>  drivers/gpu/drm/amd/amdgpu/vega10_ih.c  |  1 +
>  drivers/gpu/drm/amd/amdgpu/vega20_ih.c  |  1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c|  2 +-
>  10 files changed, 56 insertions(+), 37 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 45761d0328c7..45e08677207d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -350,6 +350,7 @@ static inline uint64_t amdgpu_gmc_fault_key(uint64_t 
> addr, uint16_t pasid)
>   * amdgpu_gmc_filter_faults - filter VM faults
>   *
>   * @adev: amdgpu device structure
> + * @ih: interrupt ring that the fault received from
>   * @addr: address of the VM fault
>   * @pasid: PASID of the process causing the fault
>   * @timestamp: timestamp of the fault
> @@ -358,7 +359,8 @@ static inline uint64_t amdgpu_gmc_fault_key(uint64_t 
> addr, uint16_t pasid)
>   * True if the fault was filtered and should not be processed further.
>   * False if the fault is a new one and needs to be handled.
>   */
> -bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
> +bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
> +   struct amdgpu_ih_ring *ih, uint64_t addr,
> uint16_t pasid, uint64_t timestamp)
>  {
>   struct amdgpu_gmc *gmc = &adev->gmc;
> @@ -366,6 +368,10 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device 
> *adev, uint64_t addr,
>   struct amdgpu_gmc_fault *fault;
>   uint32_t hash;
>  
> + /* Stale retry fault if timestamp goes backward */
> + if (amdgpu_ih_ts_after(timestamp, ih->latest_decoded_timestamp))
> + return true;
> +
>   /* If we don't have space left in the ring buffer return immediately */
>   stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
>   AMDGPU_GMC_FAULT_TIMEOUT;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index e55201134a01..8458cebc6d5b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -316,7 +316,8 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev,
> struct amdgpu_gmc *mc);
>  void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
>struct amdgpu_gmc *mc);
> -bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
> +bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
> +   struct amdgpu_ih_ring *ih, uint64_t addr,
> uint16_t pasid, uint64_t timestamp);
>  void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t 
> addr,
>uint16_t pasid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> index 0c7963dfacad..8d02f975f915 100644
> --- 

Re: [PATCH] drm/amd/pm: Add warning for unexpected PG requests

2021-11-25 Thread Felix Kuehling
Am 2021-11-25 um 8:32 a.m. schrieb Lazar, Lijo:
>
>
> On 11/25/2021 6:52 PM, Chen, Guchun wrote:
>> [Public]
>>
>> Use dev_warn to be mGPU friendly?
>
> The intention is to get a trace as well along with that. There are
> multiple paths to this function.

There is also a dev_WARN and dev_WARN_ONCE.

Regards,
  Felix


>
> Thanks,
> Lijo
>
>>
>> Regards,
>> Guchun
>>
>> -Original Message-
>> From: amd-gfx  On Behalf Of
>> Lijo Lazar
>> Sent: Thursday, November 25, 2021 7:51 PM
>> To: amd-gfx@lists.freedesktop.org
>> Cc: Deucher, Alexander ; Limonciello,
>> Mario ; Zhang, Hawking
>> 
>> Subject: [PATCH] drm/amd/pm: Add warning for unexpected PG requests
>>
>> Ideally power gate/ungate requests shouldn't come when smu block is
>> uninitialized. Add a WARN message to check the origins if such a
>> thing ever happens.
>>
>> Signed-off-by: Lijo Lazar 
>> ---
>>   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 5 -
>>   1 file changed, 4 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> index e156add7b560..e0f8ab8be975 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> @@ -277,8 +277,11 @@ static int smu_dpm_set_power_gate(void *handle,
>>   struct smu_context *smu = handle;
>>   int ret = 0;
>>   -    if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled)
>> +    if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled) {
>> +    WARN(true, "SMU uninitialized but power %s requested for
>> %u!\n",
>> + gate ? "gate" : "ungate", block_type);
>>   return -EOPNOTSUPP;
>> +    }
>>     switch (block_type) {
>>   /*
>> -- 
>> 2.25.1
>>


Re: [PATCH] drm/amd/display: fix application of sizeof to pointer

2021-11-25 Thread Rodrigo Siqueira Jordao




On 2021-11-23 10:04 p.m., cgel@gmail.com wrote:

From: Lv Ruyi 

Both of split and merge are pointers, not arrays.

Reported-by: Zeal Robot 
Signed-off-by: Lv Ruyi 
---
  drivers/gpu/drm/amd/display/dc/dml/dml_wrapper.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dml/dml_wrapper.c 
b/drivers/gpu/drm/amd/display/dc/dml/dml_wrapper.c
index ece34b0b8a46..91810aaee5a3 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dml_wrapper.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dml_wrapper.c
@@ -1223,8 +1223,8 @@ static void dml_full_validate_bw_helper(struct dc *dc,
*pipe_cnt = dml_populate_dml_pipes_from_context(dc, context, 
pipes, false);
	*vlevel = dml_get_voltage_level(&context->bw_ctx.dml, pipes, 
*pipe_cnt);
if (*vlevel < context->bw_ctx.dml.soc.num_states) {
-   memset(split, 0, sizeof(split));
-   memset(merge, 0, sizeof(merge));
+   memset(split, 0, MAX_PIPES * sizeof(*split));
+   memset(merge, 0, MAX_PIPES * sizeof(*merge));
*vlevel = dml_validate_apply_pipe_split_flags(dc, 
context, *vlevel, split, merge);
}
  


Nice catch!

Reviewed-by: Rodrigo Siqueira 

and applied to amd-staging-drm-next

Thanks
Siqueira


[PATCH 5/6] Documentation/gpu: Add basic overview of DC pipeline

2021-11-25 Thread Rodrigo Siqueira
This commit describes how DCN works by providing high-level diagrams
with an explanation of each component. In particular, it details the
Global Sync signals.

Signed-off-by: Rodrigo Siqueira 
---
 Documentation/gpu/amdgpu-dc/amdgpu-dc.rst |   22 +-
 .../gpu/amdgpu-dc/amdgpu-dcn-overview.rst |  168 +++
 .../gpu/amdgpu-dc/config_example.svg  |  414 ++
 .../gpu/amdgpu-dc/dc_pipeline_overview.svg| 1125 +
 .../gpu/amdgpu-dc/global_sync_vblank.svg  |  485 +++
 5 files changed, 2203 insertions(+), 11 deletions(-)
 create mode 100644 Documentation/gpu/amdgpu-dc/amdgpu-dcn-overview.rst
 create mode 100644 Documentation/gpu/amdgpu-dc/config_example.svg
 create mode 100644 Documentation/gpu/amdgpu-dc/dc_pipeline_overview.svg
 create mode 100644 Documentation/gpu/amdgpu-dc/global_sync_vblank.svg

diff --git a/Documentation/gpu/amdgpu-dc/amdgpu-dc.rst 
b/Documentation/gpu/amdgpu-dc/amdgpu-dc.rst
index 3685b3b1ad64..2e45e83d9a2a 100644
--- a/Documentation/gpu/amdgpu-dc/amdgpu-dc.rst
+++ b/Documentation/gpu/amdgpu-dc/amdgpu-dc.rst
@@ -2,28 +2,28 @@
 drm/amd/display - Display Core (DC)
 ===
 
-*placeholder - general description of supported platforms, what dc is, etc.*
-
-Because it is partially shared with other operating systems, the Display Core
-Driver is divided in two pieces.
+AMD display engine is partially shared with other operating systems; for this
+reason, our Display Core Driver is divided into two pieces:
 
 1. **Display Core (DC)** contains the OS-agnostic components. Things like
hardware programming and resource management are handled here.
 2. **Display Manager (DM)** contains the OS-dependent components. Hooks to the
amdgpu base driver and DRM are implemented here.
 
-It doesn't help that the entire package is frequently referred to as DC. But
-with the context in mind, it should be clear.
+The display pipe is responsible for "scanning out" a rendered frame from the
+GPU memory (also called VRAM, FrameBuffer, etc.) to a display. In other words,
+it would:
 
-When CONFIG_DRM_AMD_DC is enabled, DC will be initialized by default for
-supported ASICs. To force disable, set `amdgpu.dc=0` on kernel command line.
-Likewise, to force enable on unsupported ASICs, set `amdgpu.dc=1`.
+1. Read frame information from memory;
+2. Perform required transformation;
+3. Send pixel data to sink devices.
 
-To determine if DC is loaded, search dmesg for the following entry:
+If you want to learn more about our driver details, take a look at the below
+table of content:
 
 .. toctree::
 
+   amdgpu-dcn-overview.rst
amdgpu-dm.rst
amdgpu-dc-debug.rst
 
-``Display Core initialized with ``
diff --git a/Documentation/gpu/amdgpu-dc/amdgpu-dcn-overview.rst 
b/Documentation/gpu/amdgpu-dc/amdgpu-dcn-overview.rst
new file mode 100644
index ..47e9a70de8ae
--- /dev/null
+++ b/Documentation/gpu/amdgpu-dc/amdgpu-dcn-overview.rst
@@ -0,0 +1,168 @@
+===
+Display Core Next (DCN)
+===
+
+To equip our readers with the basic knowledge of how AMD Display Core Next
+(DCN) works, we need to start with an overview of the hardware pipeline. Below
+you can see a picture that provides a DCN overview, keep in mind that this is a
+generic diagram, and we have variations per ASIC.
+
+.. kernel-figure:: dc_pipeline_overview.svg
+
+Based on this diagram, we can pass through each block and briefly describe
+them:
+
+* **Display Controller Hub (DCHUB)**: This is the gateway between the Scalable
+  Data Port (SDP) and DCN. This component has multiple features, such as memory
+  arbitration, rotation, and cursor manipulation.
+
+* **Display Pipe and Plane (DPP)**: This block provides pre-blend pixel
+  processing such as color space conversion, linearization of pixel data, tone
+  mapping, and gamut mapping.
+
+* **Multiple Pipe/Plane Combined (MPC)**: This component performs blending of
+  multiple planes, using global or per-pixel alpha.
+
+* **Output Pixel Processing (OPP)**: Process and format pixels to be sent to
+  the display.
+
+* **Output Pipe Timing Combiner (OPTC)**: It generates time output to combine
+  streams or divide capabilities. CRC values are generated in this block.
+
+* **Display Output (DIO)**: Codify the output to the display connected to our
+  GPU.
+
+* **Display Writeback (DWB)**: It provides the ability to write the output of
+  the display pipe back to memory as video frames.
+
+* **DCN Management Unit (DMU)**: It provides registers with access control and
+  interrupts the controller to the SOC host interrupt unit. This block includes
+  the Display Micro-Controller Unit - version B (DMCUB), which is handled via
+  firmware.
+
+* **DCN Clock Generator Block (DCCG)**: It provides the clocks and resets
+  for all of the display controller clock domains.
+
+* **Azalia (AZ)**: Audio engine.
+
+The above diagram is an architecture generalization of DCN, which means that
+every ASIC has 

[PATCH 6/6] Documentation/gpu: Add DC glossary

2021-11-25 Thread Rodrigo Siqueira
In the DC driver, we have multiple acronyms that are not obvious most of
the time. This commit introduces a DC glossary in order to make it
easier to navigate through our driver.

Signed-off-by: Rodrigo Siqueira 
---
 Documentation/gpu/amdgpu-dc/amdgpu-dc.rst   |   2 +-
 Documentation/gpu/amdgpu-dc/dc-glossary.rst | 257 
 2 files changed, 258 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/gpu/amdgpu-dc/dc-glossary.rst

diff --git a/Documentation/gpu/amdgpu-dc/amdgpu-dc.rst 
b/Documentation/gpu/amdgpu-dc/amdgpu-dc.rst
index 2e45e83d9a2a..15405c43786a 100644
--- a/Documentation/gpu/amdgpu-dc/amdgpu-dc.rst
+++ b/Documentation/gpu/amdgpu-dc/amdgpu-dc.rst
@@ -26,4 +26,4 @@ table of content:
amdgpu-dcn-overview.rst
amdgpu-dm.rst
amdgpu-dc-debug.rst
-
+   dc-glossary.rst
diff --git a/Documentation/gpu/amdgpu-dc/dc-glossary.rst 
b/Documentation/gpu/amdgpu-dc/dc-glossary.rst
new file mode 100644
index ..48698fc1799f
--- /dev/null
+++ b/Documentation/gpu/amdgpu-dc/dc-glossary.rst
@@ -0,0 +1,257 @@
+===
+DC Glossary
+===
+
+.. glossary::
+
+ABM
+  Adaptive Backlight Modulation
+
+APU
+  Accelerated Processing Unit
+
+ASIC
+  Application-Specific Integrated Circuit
+
+ASSR
+  Alternate Scrambler Seed Reset
+
+AZ
+  Azalia (HD audio DMA engine)
+
+BPC
+  Bits Per Colour/Component
+
+BPP
+  Bits Per Pixel
+
+Clocks
+  * PCLK: Pixel Clock
+  * SYMCLK: Symbol Clock
+  * SOCCLK: GPU Engine Clock
+  * DISPCLK: Display Clock
+  * DPPCLK: DPP Clock
+  * DCFCLK: Display Controller Fabric Clock
+  * REFCLK: Real Time Reference Clock
+  * PPLL: Pixel PLL
+  * FCLK: Fabric Clock
+  * MCLK: Memory Clock
+  * CPLIB: Content Protection Library
+
+CRC
+  Cyclic Redundancy Check
+
+CRTC
+  Cathode Ray Tube Controller - commonly called "Controller" - Generates
+  raw stream of pixels, clocked at pixel clock
+
+CVT
+  Coordinated Video Timings
+
+DAL
+  Display Abstraction layer
+
+DC (Software)
+  Display Core
+
+DC (Hardware)
+  Display Controller
+
+DCC
+  Delta Colour Compression
+
+DCE
+  Display Controller Engine
+
+DCHUB
+  Display Controller Hub
+
+ARB
+  Arbiter
+
+VTG
+  Vertical Timing Generator
+
+DCN
+  Display Core Next
+
+DCCG
+  Display Clock Generator block
+
+DDC
+  Display Data Channel
+
+DFS
+  Digital Frequency Synthesizer
+
+DIO
+  Display IO
+
+DPP
+  Display Pipes and Planes
+
+DSC
+  Display Stream Compression (reduces the number of bits needed to represent
+  pixels while keeping the same pixel clock)
+
+dGPU
+  discrete GPU
+
+DMIF
+  Display Memory Interface
+
+DML
+  Display Mode Library
+
+DMCU
+  Display Micro Controller Unit
+
+DMCUB
+  Display Micro-Controller Unit, version B
+
+DPCD
+  DisplayPort Configuration Data
+
+DPM(S)
+  Display Power Management (Signaling)
+
+DRR
+  Dynamic Refresh Rate
+
+DWB
+  Display writeback
+
+ECP
+  Enhanced Content Protection
+
+FB
+  Frame Buffer
+
+FBC
+  Frame Buffer Compression
+
+FEC
+  Forward Error Correction
+
+FRL
+  Fixed Rate Link
+
+GCO
+  Graphical Controller Object
+
+GMC
+  Graphic Memory Controller
+
+GSL
+  Global Swap Lock
+
+iGPU
+  integrated GPU
+
+IH
+  Interrupt Handler
+
+ISR
+  Interrupt Service Request
+
+ISV
+  Independent Software Vendor
+
+KMD
+  Kernel Mode Driver
+
+LB
+  Line Buffer
+
+LFC
+  Low Framerate Compensation
+
+LTTPR
+  Link Training Tunable Phy Repeater
+
+LUT
+  Lookup Table
+
+MALL
+  Memory Access at Last Level
+
+MC
+  Memory Controller
+
+MPC
+  Multiple pipes and plane combine
+
+MPO
+  Multi Plane Overlay
+
+MST
+  Multi Stream Transport
+
+NBP State
+  Northbridge Power State
+
+NBIO
+  North Bridge Input/Output
+
+ODM
+  Output Data Mapping
+
+OPM
+  Output Protection Manager
+
+OPP
+  Output Plane Processor
+
+OPTC
+  Output Pipe Timing Combiner
+
+OTG
+  Output Timing Generator
+
+PCON
+  Power Controller
+
+PGFSM
+  Power Gate Finite State Machine
+
+PPLib
+  PowerPlay Library
+
+PSR
+  Panel Self Refresh
+
+SCL
+  Scaler
+
+SDP
+  Scalable Data Port
+
+SMU
+  System Management Unit
+
+SLS
+  Single Large Surface
+
+SST
+  Single Stream Transport
+
+TMDS
+  Transition-Minimized Differential Signaling
+
+TMZ
+  Trusted Memory Zone
+
+TTU
+  Time to Underflow
+
+VRR
+  Variable Refresh Rate
+
+UVD
+  Unified Video Decoder
+
+VCE
+  Video Compression Engine
+
+VCN
+  Video 

[PATCH 4/6] Documentation/gpu: How to collect DTN log

2021-11-25 Thread Rodrigo Siqueira
Introduce how to collect DTN log from debugfs.

Signed-off-by: Rodrigo Siqueira 
---
 Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst | 17 +
 1 file changed, 17 insertions(+)

diff --git a/Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst 
b/Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst
index 6dbd21f7f59e..40c55a618918 100644
--- a/Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst
+++ b/Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst
@@ -58,3 +58,20 @@ In this case, if you have a pipe split, you will see one 
small red bar at the
 bottom of the display covering the entire display width and another bar
 covering the second pipe. In other words, you will see a bit high bar in the
 second pipe.
+
+DTN Debug
+=
+
+DC (DCN) provides an extensive log that dumps multiple details from our
+hardware configuration. Via debugfs, you can capture those status values by
+using Display Test Next (DTN) log, which can be captured via debugfs by using::
+
+  cat /sys/kernel/debug/dri/0/amdgpu_dm_dtn_log
+
+Since this log is updated in accordance with the DCN status, you can also
+follow the changes in real time by using something like::
+
+  sudo watch -d cat /sys/kernel/debug/dri/0/amdgpu_dm_dtn_log
+
+When reporting a bug related to DC, consider attaching this log before and
+after you reproduce the bug.
-- 
2.25.1



[PATCH 1/6] Documentation/gpu: Reorganize DC documentation

2021-11-25 Thread Rodrigo Siqueira
Display core documentation is not well organized, and it is hard to find
information due to the lack of sections. This commit reorganizes the
documentation layout, and it is preparation work for future changes.

Signed-off-by: Rodrigo Siqueira 
---
 Documentation/gpu/amdgpu-dc.rst   | 74 ---
 .../gpu/amdgpu-dc/amdgpu-dc-debug.rst |  4 +
 Documentation/gpu/amdgpu-dc/amdgpu-dc.rst | 29 
 Documentation/gpu/amdgpu-dc/amdgpu-dm.rst | 42 +++
 Documentation/gpu/drivers.rst |  2 +-
 5 files changed, 76 insertions(+), 75 deletions(-)
 delete mode 100644 Documentation/gpu/amdgpu-dc.rst
 create mode 100644 Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst
 create mode 100644 Documentation/gpu/amdgpu-dc/amdgpu-dc.rst
 create mode 100644 Documentation/gpu/amdgpu-dc/amdgpu-dm.rst

diff --git a/Documentation/gpu/amdgpu-dc.rst b/Documentation/gpu/amdgpu-dc.rst
deleted file mode 100644
index f7ff7e1309de..
--- a/Documentation/gpu/amdgpu-dc.rst
+++ /dev/null
@@ -1,74 +0,0 @@
-===
-drm/amd/display - Display Core (DC)
-===
-
-*placeholder - general description of supported platforms, what dc is, etc.*
-
-Because it is partially shared with other operating systems, the Display Core
-Driver is divided in two pieces.
-
-1. **Display Core (DC)** contains the OS-agnostic components. Things like
-   hardware programming and resource management are handled here.
-2. **Display Manager (DM)** contains the OS-dependent components. Hooks to the
-   amdgpu base driver and DRM are implemented here.
-
-It doesn't help that the entire package is frequently referred to as DC. But
-with the context in mind, it should be clear.
-
-When CONFIG_DRM_AMD_DC is enabled, DC will be initialized by default for
-supported ASICs. To force disable, set `amdgpu.dc=0` on kernel command line.
-Likewise, to force enable on unsupported ASICs, set `amdgpu.dc=1`.
-
-To determine if DC is loaded, search dmesg for the following entry:
-
-``Display Core initialized with ``
-
-AMDgpu Display Manager
-==
-
-.. kernel-doc:: drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
-   :doc: overview
-
-.. kernel-doc:: drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
-   :internal:
-
-Lifecycle
--
-
-.. kernel-doc:: drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
-   :doc: DM Lifecycle
-
-.. kernel-doc:: drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
-   :functions: dm_hw_init dm_hw_fini
-
-Interrupts
---
-
-.. kernel-doc:: drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
-   :doc: overview
-
-.. kernel-doc:: drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
-   :internal:
-
-.. kernel-doc:: drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
-   :functions: register_hpd_handlers dm_crtc_high_irq dm_pflip_high_irq
-
-Atomic Implementation
--
-
-.. kernel-doc:: drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
-   :doc: atomic
-
-.. kernel-doc:: drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
-   :functions: amdgpu_dm_atomic_check amdgpu_dm_atomic_commit_tail
-
-Display Core
-
-
-**WIP**
-
-FreeSync Video
---
-
-.. kernel-doc:: drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
-   :doc: FreeSync Video
diff --git a/Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst 
b/Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst
new file mode 100644
index ..bbb8c3fc8eee
--- /dev/null
+++ b/Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst
@@ -0,0 +1,4 @@
+Display Core Debug tools
+
+
+TODO
diff --git a/Documentation/gpu/amdgpu-dc/amdgpu-dc.rst 
b/Documentation/gpu/amdgpu-dc/amdgpu-dc.rst
new file mode 100644
index ..3685b3b1ad64
--- /dev/null
+++ b/Documentation/gpu/amdgpu-dc/amdgpu-dc.rst
@@ -0,0 +1,29 @@
+===
+drm/amd/display - Display Core (DC)
+===
+
+*placeholder - general description of supported platforms, what dc is, etc.*
+
+Because it is partially shared with other operating systems, the Display Core
+Driver is divided in two pieces.
+
+1. **Display Core (DC)** contains the OS-agnostic components. Things like
+   hardware programming and resource management are handled here.
+2. **Display Manager (DM)** contains the OS-dependent components. Hooks to the
+   amdgpu base driver and DRM are implemented here.
+
+It doesn't help that the entire package is frequently referred to as DC. But
+with the context in mind, it should be clear.
+
+When CONFIG_DRM_AMD_DC is enabled, DC will be initialized by default for
+supported ASICs. To force disable, set `amdgpu.dc=0` on kernel command line.
+Likewise, to force enable on unsupported ASICs, set `amdgpu.dc=1`.
+
+To determine if DC is loaded, search dmesg for the following entry:
+
+.. toctree::
+
+   amdgpu-dm.rst
+   amdgpu-dc-debug.rst
+
+``Display Core initialized with ``
diff --git 

[PATCH 3/6] Documentation/gpu: Document pipe split visual confirmation

2021-11-25 Thread Rodrigo Siqueira
Display core provides a feature that makes it easy for users to debug
Pipe Split. This commit introduces how to use such a debug option.

Signed-off-by: Rodrigo Siqueira 
---
 .../gpu/amdgpu-dc/amdgpu-dc-debug.rst | 28 +--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst 
b/Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst
index 532cbbd64863..6dbd21f7f59e 100644
--- a/Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst
+++ b/Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst
@@ -2,8 +2,18 @@
 Display Core Debug tools
 
 
-DC Debugfs
-==
+DC Visual Confirmation
+==
+
+Display core provides a feature named visual confirmation, which is a set of
+bars added at the scanout time by the driver to convey some specific
+information. In general, you can enable this debug option by using::
+
+  echo  > /sys/kernel/debug/dri/0/amdgpu_dm_visual_confirm
+
+Where `N` is an integer number corresponding to the specific scenario that the
+developer wants to enable; you will see some of these debug cases in the
+following subsections.
 
 Multiple Planes Debug
 -
@@ -34,3 +44,17 @@ split configuration.
 * There should **not** be any cursor corruption
 * Multiple plane **may** be briefly disabled during window transitions or
   resizing but should come back after the action has finished
+
+Pipe Split Debug
+
+
+Sometimes we need to debug if DCN is splitting pipes correctly, and visual
+confirmation is also handy for this case. Similar to the MPO case, you can use
+the below command to enable visual confirmation::
+
+  echo 1 > /sys/kernel/debug/dri/0/amdgpu_dm_visual_confirm
+
+In this case, if you have a pipe split, you will see one small red bar at the
+bottom of the display covering the entire display width and another bar
+covering the second pipe. In other words, you will see a slightly higher bar in
+the second pipe.
-- 
2.25.1



[PATCH 2/6] Documentation/gpu: Document amdgpu_dm_visual_confirm debugfs entry

2021-11-25 Thread Rodrigo Siqueira
Display core provides a feature that makes it easy for users to debug
Multiple planes by enabling a visual notification at the bottom of each
plane. This commit introduces how to use such a feature.

Signed-off-by: Rodrigo Siqueira 
---
 .../gpu/amdgpu-dc/amdgpu-dc-debug.rst | 34 ++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst 
b/Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst
index bbb8c3fc8eee..532cbbd64863 100644
--- a/Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst
+++ b/Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst
@@ -1,4 +1,36 @@
+
 Display Core Debug tools
 
 
-TODO
+DC Debugfs
+==
+
+Multiple Planes Debug
+-
+
+If you want to enable or debug multiple planes in a specific user-space
+application, you can leverage a debug feature named visual confirm. For
+enabling it, you will need::
+
+  echo 1 > /sys/kernel/debug/dri/0/amdgpu_dm_visual_confirm
+
+You need to reload your GUI to see the visual confirmation. When the plane
+configuration changes or a full update occurs there will be a colored bar at
+the bottom of each hardware plane being drawn on the screen.
+
+* The color indicates the format - For example, red is AR24 and green is NV12
+* The height of the bar indicates the index of the plane
+* Pipe split can be observed if there are two bars with a difference in height
+  covering the same plane
+
+Consider the video playback case in which a video is played in a specific
+plane, and the desktop is drawn in another plane. The video plane should
+feature one or two green bars at the bottom of the video depending on pipe
+split configuration.
+
+* There should **not** be any visual corruption
+* There should **not** be any underflow or screen flashes
+* There should **not** be any black screens
+* There should **not** be any cursor corruption
+* Multiple planes **may** be briefly disabled during window transitions or
+  resizing but should come back after the action has finished
-- 
2.25.1



[PATCH 0/6] Expand display core documentation

2021-11-25 Thread Rodrigo Siqueira
Display Core (DC) is one of the components under amdgpu, and it has
multiple features directly related to the KMS API. Unfortunately, we
don't have enough documentation about DC in the upstream, which makes
the life of some external contributors a little bit more challenging.
For these reasons, this patchset reworks part of the DC documentation
and introduces a new set of details on how the display core works on DCN
IP. Another improvement that this documentation effort tries to bring is
making explicit some of our hardware-specific details to guide
user-space developers better.

In my view, it is easier to review this series if you apply it in your
local kernel and build the HTML version (make htmldocs). I'm suggesting
this approach because I added a few SVG diagrams that will be easier to
see in the HTML version. If you cannot build the documentation, try to
open the SVG images while reviewing the content. In summary, in this
series, you will find:

1. Patch 1: Re-arrange of display core documentation. This is
   preparation work for the other patches, but it is also a way to expand
   this documentation.
2. Patch 2 to 4: Document some common debug options related to display.
3. Patch 5: This patch provides an overview of how our display core next
   works and a brief explanation of each component.
4. Patch 6: We use a lot of acronyms in our driver; for this reason, we
   exposed a glossary with common terms used by display core.

Please let us know what you think we can improve in this series and what
kind of content you want to see for the next series.

Thanks
Siqueira

Rodrigo Siqueira (6):
  Documentation/gpu: Reorganize DC documentation
  Documentation/gpu: Document amdgpu_dm_visual_confirm debugfs entry
  Documentation/gpu: Document pipe split visual confirmation
  Documentation/gpu: How to collect DTN log
  Documentation/gpu: Add basic overview of DC pipeline
  Documentation/gpu: Add DC glossary

 Documentation/gpu/amdgpu-dc.rst   |   74 --
 .../gpu/amdgpu-dc/amdgpu-dc-debug.rst |   77 ++
 Documentation/gpu/amdgpu-dc/amdgpu-dc.rst |   29 +
 .../gpu/amdgpu-dc/amdgpu-dcn-overview.rst |  168 +++
 Documentation/gpu/amdgpu-dc/amdgpu-dm.rst |   42 +
 .../gpu/amdgpu-dc/config_example.svg  |  414 ++
 Documentation/gpu/amdgpu-dc/dc-glossary.rst   |  257 
 .../gpu/amdgpu-dc/dc_pipeline_overview.svg| 1125 +
 .../gpu/amdgpu-dc/global_sync_vblank.svg  |  485 +++
 Documentation/gpu/drivers.rst |2 +-
 10 files changed, 2598 insertions(+), 75 deletions(-)
 delete mode 100644 Documentation/gpu/amdgpu-dc.rst
 create mode 100644 Documentation/gpu/amdgpu-dc/amdgpu-dc-debug.rst
 create mode 100644 Documentation/gpu/amdgpu-dc/amdgpu-dc.rst
 create mode 100644 Documentation/gpu/amdgpu-dc/amdgpu-dcn-overview.rst
 create mode 100644 Documentation/gpu/amdgpu-dc/amdgpu-dm.rst
 create mode 100644 Documentation/gpu/amdgpu-dc/config_example.svg
 create mode 100644 Documentation/gpu/amdgpu-dc/dc-glossary.rst
 create mode 100644 Documentation/gpu/amdgpu-dc/dc_pipeline_overview.svg
 create mode 100644 Documentation/gpu/amdgpu-dc/global_sync_vblank.svg

-- 
2.25.1



Re: [PATCH v5] drm/amdgpu: handle IH ring1 overflow

2021-11-25 Thread philip yang

  


On 2021-11-25 2:03 a.m., Christian
  König wrote:

Am
  24.11.21 um 21:20 schrieb Felix Kuehling:
  
  Am 2021-11-24 um 10:23 a.m. schrieb philip
yang:


  
     #define
  amdgpu_ih_get_wptr(adev, ih)
  
  (adev)->irq.ih_funcs->get_wptr((adev), (ih))
  
     #define amdgpu_ih_decode_iv(adev, iv) \
  
     (adev)->irq.ih_funcs->decode_iv((adev), (ih),
  (iv))
  
  +#define amdgpu_ih_decode_iv_ts(adev, ih, rptr, offset) \
  
  +   
  (WARN_ON_ONCE(!(adev)->irq.ih_funcs->decode_iv_ts) ?
  0 : \
  

Please drop that WARN_ON_ONCE here.


  
  Agree, will drop it.
  
  

I suggested this. We're assuming that this function will never
be called

on hardware that doesn't support time stamps, and that all
hardware with

time stamps will implement the decode_iv_ts function. But it's
good to

get a log message if that assumption is ever broken, rather than
just

silently getting wrong results.

  
  
  Well exactly that's the point, you won't get wrong results but a
  NULL pointer exception instead.
  
  
  So we already have a backtrace in the logs.
  

The check will avoid NULL pointer access backtrace, so WARNING
  backtrace is obvious, useful for future chips to implement
  decode_iv_ts interface or use the helper function.
Regards,
Philip


  
  Regards,
  
  Christian.
  
  
  

Regards,

   Felix



Regards,
  
  
  Philip
  
  

  
  

  



[PATCH v7] drm/amdgpu: handle IH ring1 overflow

2021-11-25 Thread Philip Yang
IH ring1 is used to process GPU retry fault, overflow is enabled to
drain retry fault because we want receive other interrupts while
handling retry fault to recover range. There is no overflow flag set
when wptr pass rptr. Use timestamp of rptr and wptr to handle overflow
and drain retry fault.

If fault timestamp goes backward, the fault is filtered and should not
be processed. Drain fault is finished if latest_decoded_timestamp is
equal to or larger than checkpoint timestamp.

Add amdgpu_ih_function interface decode_iv_ts for different chips to get
timestamp from IV entry with different iv size and timestamp offset.
amdgpu_ih_decode_iv_ts_helper is used for vega10, vega20, navi10.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c |  8 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c  | 57 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h  | 16 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/navi10_ih.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/vega10_ih.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/vega20_ih.c  |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c|  2 +-
 10 files changed, 56 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 45761d0328c7..45e08677207d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -350,6 +350,7 @@ static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, 
uint16_t pasid)
  * amdgpu_gmc_filter_faults - filter VM faults
  *
  * @adev: amdgpu device structure
+ * @ih: interrupt ring that the fault received from
  * @addr: address of the VM fault
  * @pasid: PASID of the process causing the fault
  * @timestamp: timestamp of the fault
@@ -358,7 +359,8 @@ static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, 
uint16_t pasid)
  * True if the fault was filtered and should not be processed further.
  * False if the fault is a new one and needs to be handled.
  */
-bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
+bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
+ struct amdgpu_ih_ring *ih, uint64_t addr,
  uint16_t pasid, uint64_t timestamp)
 {
struct amdgpu_gmc *gmc = >gmc;
@@ -366,6 +368,10 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, 
uint64_t addr,
struct amdgpu_gmc_fault *fault;
uint32_t hash;
 
+   /* Stale retry fault if timestamp goes backward */
+   if (amdgpu_ih_ts_after(timestamp, ih->latest_decoded_timestamp))
+   return true;
+
/* If we don't have space left in the ring buffer return immediately */
stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
AMDGPU_GMC_FAULT_TIMEOUT;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index e55201134a01..8458cebc6d5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -316,7 +316,8 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev,
  struct amdgpu_gmc *mc);
 void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
 struct amdgpu_gmc *mc);
-bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
+bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
+ struct amdgpu_ih_ring *ih, uint64_t addr,
  uint16_t pasid, uint64_t timestamp);
 void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
 uint16_t pasid);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
index 0c7963dfacad..8d02f975f915 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
@@ -164,52 +164,32 @@ void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, 
const uint32_t *iv,
}
 }
 
-/* Waiter helper that checks current rptr matches or passes checkpoint wptr */
-static bool amdgpu_ih_has_checkpoint_processed(struct amdgpu_device *adev,
-   struct amdgpu_ih_ring *ih,
-   uint32_t checkpoint_wptr,
-   uint32_t *prev_rptr)
-{
-   uint32_t cur_rptr = ih->rptr | (*prev_rptr & ~ih->ptr_mask);
-
-   /* rptr has wrapped. */
-   if (cur_rptr < *prev_rptr)
-   cur_rptr += ih->ptr_mask + 1;
-   *prev_rptr = cur_rptr;
-
-   /* check ring is empty to workaround missing wptr overflow flag */
-   return cur_rptr >= checkpoint_wptr ||
-  (cur_rptr & ih->ptr_mask) == amdgpu_ih_get_wptr(adev, ih);
-}
-
 /**
- * amdgpu_ih_wait_on_checkpoint_process - wait to process IVs up to 

Re: [PATCH v6] drm/amdgpu: handle IH ring1 overflow

2021-11-25 Thread philip yang

  


On 2021-11-25 2:11 a.m., Christian
  König wrote:

Am
  25.11.21 um 02:56 schrieb Felix Kuehling:
  
  Am 2021-11-24 um 5:58 p.m. schrieb Philip
Yang:

[SNIP]

  #define amdgpu_ih_get_wptr(adev, ih)
  (adev)->irq.ih_funcs->get_wptr((adev), (ih))
  
    #define amdgpu_ih_decode_iv(adev, iv) \
  
    (adev)->irq.ih_funcs->decode_iv((adev), (ih),
  (iv))
  
  +#define amdgpu_ih_decode_iv_ts(adev, ih, rptr, offset) \
  
  +    (WARN_ON_ONCE(!(adev)->irq.ih_funcs->decode_iv_ts)
  ? 0 : \
  
  +    (adev)->irq.ih_funcs->decode_iv_ts((ih), (rptr),
  (offset)))
  
    #define amdgpu_ih_set_rptr(adev, ih)
  (adev)->irq.ih_funcs->set_rptr((adev), (ih))
  
      int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct
  amdgpu_ih_ring *ih,
  
  @@ -89,10 +99,12 @@ int amdgpu_ih_ring_init(struct
  amdgpu_device *adev, struct amdgpu_ih_ring *ih,
  
    void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct
  amdgpu_ih_ring *ih);
  
    void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const
  uint32_t *iv,
  
      unsigned int num_dw);
  
  -int amdgpu_ih_wait_on_checkpoint_process(struct amdgpu_device
  *adev,
  
  -    struct amdgpu_ih_ring *ih);
  
  +int amdgpu_ih_wait_on_checkpoint_process_ts(struct
  amdgpu_device *adev,
  
  +    struct amdgpu_ih_ring *ih);
  
    int amdgpu_ih_process(struct amdgpu_device *adev, struct
  amdgpu_ih_ring *ih);
  
    void amdgpu_ih_decode_iv_helper(struct amdgpu_device *adev,
  
    struct amdgpu_ih_ring *ih,
  
    struct amdgpu_iv_entry *entry);
  
  +uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring
  *ih, u32 rptr,
  
  +   signed int offset);
  
    #endif
  
  diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
  b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
  
  index 3ec5ff5a6dbe..b129898db433 100644
  
  --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
  
  +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
  
  @@ -119,6 +119,11 @@ static int
  gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
  
    return 1;
  
    }
  
    +    /* Stale retry fault if timestamp goes backward */
  
  +    if (entry->ih == >irq.ih1 &&
  
  +    amdgpu_ih_ts_after(entry->timestamp,
  entry->ih->processed_timestamp))
  
  +    return 1;
  
  +
  

This check should go before amdgpu_gmc_filter_faults. Otherwise

amdgpu_gmc_filter_faults may later drop a real fault because it
added a

stale fault in its hash table.

  
  
  I was already wondering if we shouldn't move all of this
  completely into amdgpu_gmc_filter_faults().
  
  
  I mean essentially we are filtering faults here once more, just
  based on a different criteria.
  

It is good idea, it also removes duplicate code in different
  interrupt handler. And retry fault timestamp check for both ring0
  and ring1.

Thanks,
Philip


  
  Regards,
  
  Christian.
  
  

  



Re: [PATCH v6] drm/amdgpu: handle IH ring1 overflow

2021-11-25 Thread philip yang

  


On 2021-11-24 8:56 p.m., Felix Kuehling
  wrote:


  Am 2021-11-24 um 5:58 p.m. schrieb Philip Yang:

  
IH ring1 is used to process GPU retry fault, overflow is enabled to
drain retry faults because we want to receive other interrupts while
handling retry fault to recover range. There is no overflow flag set
when wptr pass rptr. Use timestamp of rptr and wptr to handle overflow
and drain retry fault.

Add amdgpu_ih_function interface decode_iv_ts for different chips to get
timestamp from IV entry with different iv size and timestamp offset.
amdgpu_ih_decode_iv_ts_helper is used for vega10, vega20, navi10.

Drain retry fault is done if processed_timestamp is equal to or larger
than checkpoint timestamp. Page fault handler skips retry fault entry if
entry timestamp goes backward.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 58 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h | 16 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  5 +++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  5 +++
 drivers/gpu/drm/amd/amdgpu/navi10_ih.c |  1 +
 drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  1 +
 drivers/gpu/drm/amd/amdgpu/vega20_ih.c |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c   |  2 +-
 8 files changed, 56 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
index 0c7963dfacad..3e043acaab82 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
@@ -164,52 +164,32 @@ void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv,
 	}
 }
 
-/* Waiter helper that checks current rptr matches or passes checkpoint wptr */
-static bool amdgpu_ih_has_checkpoint_processed(struct amdgpu_device *adev,
-	struct amdgpu_ih_ring *ih,
-	uint32_t checkpoint_wptr,
-	uint32_t *prev_rptr)
-{
-	uint32_t cur_rptr = ih->rptr | (*prev_rptr & ~ih->ptr_mask);
-
-	/* rptr has wrapped. */
-	if (cur_rptr < *prev_rptr)
-		cur_rptr += ih->ptr_mask + 1;
-	*prev_rptr = cur_rptr;
-
-	/* check ring is empty to workaround missing wptr overflow flag */
-	return cur_rptr >= checkpoint_wptr ||
-	   (cur_rptr & ih->ptr_mask) == amdgpu_ih_get_wptr(adev, ih);
-}
-
 /**
- * amdgpu_ih_wait_on_checkpoint_process - wait to process IVs up to checkpoint
+ * amdgpu_ih_wait_on_checkpoint_process_ts - wait to process IVs up to checkpoint
  *
  * @adev: amdgpu_device pointer
  * @ih: ih ring to process
  *
  * Used to ensure ring has processed IVs up to the checkpoint write pointer.
  */
-int amdgpu_ih_wait_on_checkpoint_process(struct amdgpu_device *adev,
+int amdgpu_ih_wait_on_checkpoint_process_ts(struct amdgpu_device *adev,
 	struct amdgpu_ih_ring *ih)
 {
-	uint32_t checkpoint_wptr, rptr;
+	uint32_t checkpoint_wptr;
+	uint64_t checkpoint_ts;
+	long timeout = HZ;
 
 	if (!ih->enabled || adev->shutdown)
 		return -ENODEV;
 
 	checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
-	/* Order wptr with rptr. */
+	/* Order wptr with ring data. */
 	rmb();
-	rptr = READ_ONCE(ih->rptr);
-
-	/* wptr has wrapped. */
-	if (rptr > checkpoint_wptr)
-		checkpoint_wptr += ih->ptr_mask + 1;
+	checkpoint_ts = amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1);
 
-	return wait_event_interruptible(ih->wait_process,
-amdgpu_ih_has_checkpoint_processed(adev, ih,
-		checkpoint_wptr, ));
+	return wait_event_interruptible_timeout(ih->wait_process,
+		!amdgpu_ih_ts_after(ih->processed_timestamp, checkpoint_ts),
+		timeout);
 }
 
 /**
@@ -298,4 +278,22 @@ void amdgpu_ih_decode_iv_helper(struct amdgpu_device *adev,
 
 	/* wptr/rptr are in bytes! */
 	ih->rptr += 32;
+	if (ih == >irq.ih1 &&
+	amdgpu_ih_ts_after(ih->processed_timestamp, entry->timestamp))
+		ih->processed_timestamp = entry->timestamp;

  
  
I'd call this "latest_decoded_timestamp". At this point it hasn't been
processed yet.

Also, I think it would be safe and cheap enough to do this on all IH
rings, in case someone finds it useful for something else, e.g. using
amdgpu_ih_wait_on_checkpoint_process_ts on IH ring 0.


Thanks, yes, vega20 uses ring0, do this on all IH rings.

  


  
+}
+
+uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr,
+   signed int offset)
+{
+	uint32_t iv_size = 32;
+	uint32_t dw1, dw2;
+	uint32_t index;
+
+	rptr += iv_size * offset;
+	index = (rptr & ih->ptr_mask) >> 2;
+
+	dw1 = le32_to_cpu(ih->ring[index + 1]);
+	dw2 = le32_to_cpu(ih->ring[index + 2]);
+	return dw1 | ((u64)(dw2 & 0x) << 32);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
index 0649b59830a5..dd1c2eded6b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
@@ -68,20 +68,30 @@ struct amdgpu_ih_ring {
 
 	/* For waiting on IH processing at checkpoint. */
 	wait_queue_head_t wait_process;
+	uint64_t		processed_timestamp;
 };
 
+/* 

RE: [PATCH] drm/amd/pm: Add warning for unexpected PG requests

2021-11-25 Thread Chen, Guchun
[Public]

Thanks for clarification, Lijo.

Reviewed-by: Guchun Chen 

Regards,
Guchun

-Original Message-
From: Lazar, Lijo  
Sent: Thursday, November 25, 2021 9:32 PM
To: Chen, Guchun ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Limonciello, Mario 
; Zhang, Hawking 
Subject: Re: [PATCH] drm/amd/pm: Add warning for unexpected PG requests



On 11/25/2021 6:52 PM, Chen, Guchun wrote:
> [Public]
> 
> Use dev_warn to be mGPU friendly?

The intention is to get a trace as well along with that. There are multiple 
paths to this function.

Thanks,
Lijo

> 
> Regards,
> Guchun
> 
> -Original Message-
> From: amd-gfx  On Behalf Of 
> Lijo Lazar
> Sent: Thursday, November 25, 2021 7:51 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander ; Limonciello, Mario 
> ; Zhang, Hawking 
> Subject: [PATCH] drm/amd/pm: Add warning for unexpected PG requests
> 
> Ideally power gate/ungate requests shouldn't come when smu block is 
> uninitialized. Add a WARN message to check the origins if such a thing ever 
> happens.
> 
> Signed-off-by: Lijo Lazar 
> ---
>   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 5 -
>   1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index e156add7b560..e0f8ab8be975 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -277,8 +277,11 @@ static int smu_dpm_set_power_gate(void *handle,
>   struct smu_context *smu = handle;
>   int ret = 0;
>   
> - if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled)
> + if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled) {
> + WARN(true, "SMU uninitialized but power %s requested for %u!\n",
> +  gate ? "gate" : "ungate", block_type);
>   return -EOPNOTSUPP;
> + }
>   
>   switch (block_type) {
>   /*
> --
> 2.25.1
> 


Re: [PATCH] drm/amd/pm: Add warning for unexpected PG requests

2021-11-25 Thread Lazar, Lijo




On 11/25/2021 6:52 PM, Chen, Guchun wrote:

[Public]

Use dev_warn to be mGPU friendly?


The intention is to get a trace as well along with that. There are 
multiple paths to this function.


Thanks,
Lijo



Regards,
Guchun

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Thursday, November 25, 2021 7:51 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Limonciello, Mario 
; Zhang, Hawking 
Subject: [PATCH] drm/amd/pm: Add warning for unexpected PG requests

Ideally power gate/ungate requests shouldn't come when smu block is 
uninitialized. Add a WARN message to check the origins if such a thing ever 
happens.

Signed-off-by: Lijo Lazar 
---
  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 5 -
  1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index e156add7b560..e0f8ab8be975 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -277,8 +277,11 @@ static int smu_dpm_set_power_gate(void *handle,
struct smu_context *smu = handle;
int ret = 0;
  
-	if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled)

+   if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled) {
+   WARN(true, "SMU uninitialized but power %s requested for %u!\n",
+gate ? "gate" : "ungate", block_type);
return -EOPNOTSUPP;
+   }
  
  	switch (block_type) {

/*
--
2.25.1



RE: [PATCH] drm/amd/pm: Add warning for unexpected PG requests

2021-11-25 Thread Chen, Guchun
[Public]

Use dev_warn to be mGPU friendly?

Regards,
Guchun

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Thursday, November 25, 2021 7:51 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Limonciello, Mario 
; Zhang, Hawking 
Subject: [PATCH] drm/amd/pm: Add warning for unexpected PG requests

Ideally power gate/ungate requests shouldn't come when smu block is 
uninitialized. Add a WARN message to check the origins if such a thing ever 
happens.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index e156add7b560..e0f8ab8be975 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -277,8 +277,11 @@ static int smu_dpm_set_power_gate(void *handle,
struct smu_context *smu = handle;
int ret = 0;
 
-   if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled)
+   if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled) {
+   WARN(true, "SMU uninitialized but power %s requested for %u!\n",
+gate ? "gate" : "ungate", block_type);
return -EOPNOTSUPP;
+   }
 
switch (block_type) {
/*
--
2.25.1


[PATCH] drm/amd/pm: Add warning for unexpected PG requests

2021-11-25 Thread Lijo Lazar
Ideally power gate/ungate requests shouldn't come when smu block is
uninitialized. Add a WARN message to check the origins if such a thing
ever happens.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index e156add7b560..e0f8ab8be975 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -277,8 +277,11 @@ static int smu_dpm_set_power_gate(void *handle,
struct smu_context *smu = handle;
int ret = 0;
 
-   if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled)
+   if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled) {
+   WARN(true, "SMU uninitialized but power %s requested for %u!\n",
+gate ? "gate" : "ungate", block_type);
return -EOPNOTSUPP;
+   }
 
switch (block_type) {
/*
-- 
2.25.1



Re: [PATCH 1/9] drm/amdgpu:Define the unified ras function pointers of each IP block

2021-11-25 Thread Lazar, Lijo




On 11/25/2021 4:26 PM, yipechai wrote:

Define a unified set of ras function pointers for each IP block to adapt.

Signed-off-by: yipechai 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 36 -
  2 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 90f0db3b4f65..dc6c8130e2d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2739,3 +2739,23 @@ static void amdgpu_register_bad_pages_mca_notifier(void)
  }
  }
  #endif
+
+/* check if ras is supported on block, say, sdma, gfx */
+int amdgpu_ras_is_supported(struct amdgpu_device *adev,
+   unsigned int block)
+{
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   if (block >= AMDGPU_RAS_BLOCK_COUNT)
+   return 0;
+   return ras && (adev->ras_enabled & (1 << block));
+}
+
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   if (atomic_cmpxchg(>in_recovery, 0, 1) == 0)
+   schedule_work(>recovery_work);
+   return 0;
+}


These changes look unrelated. Maybe as another patch to move from .h 
file to .c file.



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index cdd0010a5389..4b7da40dd837 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -469,6 +469,19 @@ struct ras_debug_if {
};
int op;
  };
+
+struct amdgpu_ras_block_ops {
+   int (*ras_late_init)(struct amdgpu_device *adev);
+   void (*ras_fini)(struct amdgpu_device *adev);
+   int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
+   void  (*query_ras_error_count)(struct amdgpu_device *adev,void 
*ras_error_status);
+   void (*query_ras_error_status)(struct amdgpu_device *adev);
+   bool  (*query_ras_poison_mode)(struct amdgpu_device *adev);
+   void (*query_ras_error_address)(struct amdgpu_device *adev, void 
*ras_error_status);
+   void (*reset_ras_error_count)(struct amdgpu_device *adev);
+   void (*reset_ras_error_status)(struct amdgpu_device *adev);
+};
+


Generic comment - Since all the operations are consolidated under _ops, 
it makes sense to rename the _ras_funcs to _ras.


Ex: amdgpu_gfx_ras_funcs => amdgpu_gfx_ras, amdgpu_xgmi_ras_funcs => 
amdgpu_xgmi_ras and so forth.


In future, these ras blocks may have data members to keep IP specific 
ras data.


Thanks,
Lijo


  /* work flow
   * vbios
   * 1: ras feature enable (enabled by default)
@@ -486,16 +499,6 @@ struct ras_debug_if {
  #define amdgpu_ras_get_context(adev)  ((adev)->psp.ras_context.ras)
  #define amdgpu_ras_set_context(adev, ras_con) ((adev)->psp.ras_context.ras = 
(ras_con))
  
-/* check if ras is supported on block, say, sdma, gfx */

-static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
-   unsigned int block)
-{
-   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-
-   if (block >= AMDGPU_RAS_BLOCK_COUNT)
-   return 0;
-   return ras && (adev->ras_enabled & (1 << block));
-}
  
  int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
  
@@ -512,15 +515,6 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
  
  int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
  
-static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)

-{
-   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-
-   if (atomic_cmpxchg(>in_recovery, 0, 1) == 0)
-   schedule_work(>recovery_work);
-   return 0;
-}
-
  static inline enum ta_ras_block
  amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
switch (block) {
@@ -652,4 +646,8 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block);
  
  bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev);
  
+int amdgpu_ras_is_supported(struct amdgpu_device *adev,	unsigned int block);

+
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev);
+
  #endif



[PATCH 8/9] drm/amdgpu: Modify umc block to fit for the unified ras function pointers.

2021-11-25 Thread yipechai
Modify umc block ras functions to fit the unified ras function pointers.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c |  8 
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 12 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c |  8 
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  9 ++---
 drivers/gpu/drm/amd/amdgpu/umc_v6_1.c   | 10 ++
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c   | 12 +++-
 drivers/gpu/drm/amd/amdgpu/umc_v8_7.c   | 11 ++-
 7 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 7780effdf3ac..4499cc5186cc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -435,8 +435,8 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
int r;
 
if (adev->umc.ras_funcs &&
-   adev->umc.ras_funcs->ras_late_init) {
-   r = adev->umc.ras_funcs->ras_late_init(adev);
+   adev->umc.ras_funcs->ops.ras_late_init) {
+   r = adev->umc.ras_funcs->ops.ras_late_init(adev);
if (r)
return r;
}
@@ -492,8 +492,8 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
 void amdgpu_gmc_ras_fini(struct amdgpu_device *adev)
 {
if (adev->umc.ras_funcs &&
-   adev->umc.ras_funcs->ras_fini)
-   adev->umc.ras_funcs->ras_fini(adev);
+   adev->umc.ras_funcs->ops.ras_fini)
+   adev->umc.ras_funcs->ops.ras_fini(adev);
 
if (adev->mmhub.ras_funcs &&
adev->mmhub.ras_funcs->ops.ras_fini)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2c79172f6031..65306e0079af 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -902,14 +902,14 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__UMC:
if (adev->umc.ras_funcs &&
-   adev->umc.ras_funcs->query_ras_error_count)
-   adev->umc.ras_funcs->query_ras_error_count(adev, 
_data);
+   adev->umc.ras_funcs->ops.query_ras_error_count)
+   adev->umc.ras_funcs->ops.query_ras_error_count(adev, 
_data);
/* umc query_ras_error_address is also responsible for clearing
 * error status
 */
if (adev->umc.ras_funcs &&
-   adev->umc.ras_funcs->query_ras_error_address)
-   adev->umc.ras_funcs->query_ras_error_address(adev, 
_data);
+   adev->umc.ras_funcs->ops.query_ras_error_address)
+   adev->umc.ras_funcs->ops.query_ras_error_address(adev, 
_data);
break;
case AMDGPU_RAS_BLOCK__SDMA:
if (adev->sdma.funcs->query_ras_error_count) {
@@ -2341,11 +2341,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
if (adev->df.funcs &&
adev->df.funcs->query_ras_poison_mode &&
adev->umc.ras_funcs &&
-   adev->umc.ras_funcs->query_ras_poison_mode) {
+   adev->umc.ras_funcs->ops.query_ras_poison_mode) {
df_poison =
adev->df.funcs->query_ras_poison_mode(adev);
umc_poison =
-   adev->umc.ras_funcs->query_ras_poison_mode(adev);
+   adev->umc.ras_funcs->ops.query_ras_poison_mode(adev);
/* Only poison is set in both DF and UMC, we can support it */
if (df_poison && umc_poison)
con->poison_supported = true;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 0c7c56a91b25..9a44c410be06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -98,11 +98,11 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device 
*adev,
 
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
if (adev->umc.ras_funcs &&
-   adev->umc.ras_funcs->query_ras_error_count)
-   adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
+   adev->umc.ras_funcs->ops.query_ras_error_count)
+   adev->umc.ras_funcs->ops.query_ras_error_count(adev, 
ras_error_status);
 
if (adev->umc.ras_funcs &&
-   adev->umc.ras_funcs->query_ras_error_address &&
+   adev->umc.ras_funcs->ops.query_ras_error_address &&
adev->umc.max_ras_err_cnt_per_query) {
err_data->err_addr =
kcalloc(adev->umc.max_ras_err_cnt_per_query,
@@ -118,7 +118,7 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device 
*adev,
/* umc query_ras_error_address is also responsible for clearing
 * error status
 */
-

[PATCH 9/9] drm/amdgpu: Modify sdma block to fit for the unified ras function pointers.

2021-11-25 Thread yipechai
Modify sdma block ras functions to fit the unified ras function pointers.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 11 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 11 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 42 
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c   | 25 +++---
 4 files changed, 56 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 65306e0079af..e6d82e6e702c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -912,11 +912,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
adev->umc.ras_funcs->ops.query_ras_error_address(adev, 
_data);
break;
case AMDGPU_RAS_BLOCK__SDMA:
-   if (adev->sdma.funcs->query_ras_error_count) {
-   for (i = 0; i < adev->sdma.num_instances; i++)
-   adev->sdma.funcs->query_ras_error_count(adev, i,
-   
_data);
-   }
+   if (adev->sdma.ras_funcs->ops.query_ras_error_count)
+   adev->sdma.ras_funcs->ops.query_ras_error_count(adev, 
_data);
break;
case AMDGPU_RAS_BLOCK__GFX:
if (adev->gfx.ras_funcs &&
@@ -1035,8 +1032,8 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device 
*adev,
adev->mmhub.ras_funcs->ops.reset_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__SDMA:
-   if (adev->sdma.funcs->reset_ras_error_count)
-   adev->sdma.funcs->reset_ras_error_count(adev);
+   if (adev->sdma.ras_funcs->ops.reset_ras_error_count)
+   adev->sdma.ras_funcs->ops.reset_ras_error_count(adev);
break;
case AMDGPU_RAS_BLOCK__HDP:
if (adev->hdp.ras_funcs &&
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index f8fb755e3aa6..a76c63520ca0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -23,6 +23,7 @@
 
 #ifndef __AMDGPU_SDMA_H__
 #define __AMDGPU_SDMA_H__
+#include "amdgpu_ras.h"
 
 /* max number of IP instances */
 #define AMDGPU_MAX_SDMA_INSTANCES  8
@@ -51,12 +52,8 @@ struct amdgpu_sdma_instance {
 };
 
 struct amdgpu_sdma_ras_funcs {
-   int (*ras_late_init)(struct amdgpu_device *adev,
-   void *ras_ih_info);
-   void (*ras_fini)(struct amdgpu_device *adev);
-   int (*query_ras_error_count)(struct amdgpu_device *adev,
-   uint32_t instance, void *ras_error_status);
-   void (*reset_ras_error_count)(struct amdgpu_device *adev);
+   struct amdgpu_ras_block_ops ops;
+   int (*sdma_ras_late_init)(struct amdgpu_device *adev, void 
*ras_ih_info);
 };
 
 struct amdgpu_sdma {
@@ -73,7 +70,7 @@ struct amdgpu_sdma {
uint32_tsrbm_soft_reset;
boolhas_page_queue;
struct ras_common_if*ras_if;
-   const struct amdgpu_sdma_ras_funcs  *funcs;
+   const struct amdgpu_sdma_ras_funcs  *ras_funcs;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 69c9e460c1eb..d5bd23b57f5d 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1898,13 +1898,13 @@ static int sdma_v4_0_late_init(void *handle)
sdma_v4_0_setup_ulv(adev);
 
if (!amdgpu_persistent_edc_harvesting_supported(adev)) {
-   if (adev->sdma.funcs &&
-   adev->sdma.funcs->reset_ras_error_count)
-   adev->sdma.funcs->reset_ras_error_count(adev);
+   if (adev->sdma.ras_funcs &&
+   adev->sdma.ras_funcs->ops.reset_ras_error_count)
+   adev->sdma.ras_funcs->ops.reset_ras_error_count(adev);
}
 
-   if (adev->sdma.funcs && adev->sdma.funcs->ras_late_init)
-   return adev->sdma.funcs->ras_late_init(adev, _info);
+   if (adev->sdma.ras_funcs && adev->sdma.ras_funcs->sdma_ras_late_init)
+   return adev->sdma.ras_funcs->sdma_ras_late_init(adev, _info);
else
return 0;
 }
@@ -2007,8 +2007,8 @@ static int sdma_v4_0_sw_fini(void *handle)
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
int i;
 
-   if (adev->sdma.funcs && adev->sdma.funcs->ras_fini)
-   adev->sdma.funcs->ras_fini(adev);
+   if (adev->sdma.ras_funcs && adev->sdma.ras_funcs->ops.ras_fini)
+   adev->sdma.ras_funcs->ops.ras_fini(adev);
 
for (i = 0; i < adev->sdma.num_instances; i++) {
amdgpu_ring_fini(>sdma.instance[i].ring);
@@ -2745,7 +2745,7 @@ 

[PATCH 7/9] drm/amdgpu: Modify nbio block to fit for the unified ras function pointers.

2021-11-25 Thread yipechai
Modify nbio block ras functions to fit the unified ras function pointers.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h | 7 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 4 ++--
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   | 8 +---
 drivers/gpu/drm/amd/amdgpu/soc15.c   | 8 
 4 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index 843052205bd5..21574493afff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -22,7 +22,7 @@
  */
 #ifndef __AMDGPU_NBIO_H__
 #define __AMDGPU_NBIO_H__
-
+#include "amdgpu_ras.h"
 /*
  * amdgpu nbio functions
  */
@@ -48,14 +48,11 @@ struct nbio_hdp_flush_reg {
 };
 
 struct amdgpu_nbio_ras_funcs {
+   struct amdgpu_ras_block_ops ops;
void (*handle_ras_controller_intr_no_bifring)(struct amdgpu_device 
*adev);
void (*handle_ras_err_event_athub_intr_no_bifring)(struct amdgpu_device 
*adev);
int (*init_ras_controller_interrupt)(struct amdgpu_device *adev);
int (*init_ras_err_event_athub_interrupt)(struct amdgpu_device *adev);
-   void (*query_ras_error_count)(struct amdgpu_device *adev,
- void *ras_error_status);
-   int (*ras_late_init)(struct amdgpu_device *adev);
-   void (*ras_fini)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_nbio_funcs {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2d9ef677a2ef..2c79172f6031 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -938,8 +938,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
break;
case AMDGPU_RAS_BLOCK__PCIE_BIF:
if (adev->nbio.ras_funcs &&
-   adev->nbio.ras_funcs->query_ras_error_count)
-   adev->nbio.ras_funcs->query_ras_error_count(adev, 
_data);
+   adev->nbio.ras_funcs->ops.query_ras_error_count)
+   adev->nbio.ras_funcs->ops.query_ras_error_count(adev, 
_data);
break;
case AMDGPU_RAS_BLOCK__XGMI_WAFL:
if (adev->gmc.xgmi.ras_funcs &&
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index 91b3afa946f5..ebbe78d2ca52 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -643,9 +643,11 @@ const struct amdgpu_nbio_ras_funcs nbio_v7_4_ras_funcs = {
.handle_ras_err_event_athub_intr_no_bifring = 
nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring,
.init_ras_controller_interrupt = 
nbio_v7_4_init_ras_controller_interrupt,
.init_ras_err_event_athub_interrupt = 
nbio_v7_4_init_ras_err_event_athub_interrupt,
-   .query_ras_error_count = nbio_v7_4_query_ras_error_count,
-   .ras_late_init = amdgpu_nbio_ras_late_init,
-   .ras_fini = amdgpu_nbio_ras_fini,
+   .ops = {
+   .query_ras_error_count = nbio_v7_4_query_ras_error_count,
+   .ras_late_init = amdgpu_nbio_ras_late_init,
+   .ras_fini = amdgpu_nbio_ras_fini,
+   },
 };
 
 static void nbio_v7_4_program_ltr(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index f9d92b6deef0..99176af847f0 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -1386,8 +1386,8 @@ static int soc15_common_late_init(void *handle)
xgpu_ai_mailbox_get_irq(adev);
 
if (adev->nbio.ras_funcs &&
-   adev->nbio.ras_funcs->ras_late_init)
-   r = adev->nbio.ras_funcs->ras_late_init(adev);
+   adev->nbio.ras_funcs->ops.ras_late_init)
+   r = adev->nbio.ras_funcs->ops.ras_late_init(adev);
 
return r;
 }
@@ -1409,8 +1409,8 @@ static int soc15_common_sw_fini(void *handle)
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
if (adev->nbio.ras_funcs &&
-   adev->nbio.ras_funcs->ras_fini)
-   adev->nbio.ras_funcs->ras_fini(adev);
+   adev->nbio.ras_funcs->ops.ras_fini)
+   adev->nbio.ras_funcs->ops.ras_fini(adev);
adev->df.funcs->sw_fini(adev);
return 0;
 }
-- 
2.25.1



[PATCH 6/9] drm/amdgpu: Modify mmhub block to fit for the unified ras function pointers.

2021-11-25 Thread yipechai
Modify mmhub block ras functions to fit the unified ras function pointers.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  8 
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c|  8 
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h  |  9 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 20 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c| 10 ++
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c| 14 --
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c| 12 +++-
 8 files changed, 43 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 73ec46140d68..fcdd06bdb5d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3378,8 +3378,8 @@ static void amdgpu_device_xgmi_reset_func(struct 
work_struct *__work)
goto fail;
 
if (adev->mmhub.ras_funcs &&
-   adev->mmhub.ras_funcs->reset_ras_error_count)
-   adev->mmhub.ras_funcs->reset_ras_error_count(adev);
+   adev->mmhub.ras_funcs->ops.reset_ras_error_count)
+   adev->mmhub.ras_funcs->ops.reset_ras_error_count(adev);
} else {
 
task_barrier_full(>tb);
@@ -4704,8 +4704,8 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
if (!r && amdgpu_ras_intr_triggered()) {
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (tmp_adev->mmhub.ras_funcs &&
-   tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
-   
tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
+   
tmp_adev->mmhub.ras_funcs->ops.reset_ras_error_count)
+   
tmp_adev->mmhub.ras_funcs->ops.reset_ras_error_count(tmp_adev);
}
 
amdgpu_ras_intr_cleared();
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 024342969267..7780effdf3ac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -442,8 +442,8 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
}
 
if (adev->mmhub.ras_funcs &&
-   adev->mmhub.ras_funcs->ras_late_init) {
-   r = adev->mmhub.ras_funcs->ras_late_init(adev);
+   adev->mmhub.ras_funcs->ops.ras_late_init) {
+   r = adev->mmhub.ras_funcs->ops.ras_late_init(adev);
if (r)
return r;
}
@@ -496,8 +496,8 @@ void amdgpu_gmc_ras_fini(struct amdgpu_device *adev)
adev->umc.ras_funcs->ras_fini(adev);
 
if (adev->mmhub.ras_funcs &&
-   adev->mmhub.ras_funcs->ras_fini)
-   adev->mmhub.ras_funcs->ras_fini(adev);
+   adev->mmhub.ras_funcs->ops.ras_fini)
+   adev->mmhub.ras_funcs->ops.ras_fini(adev);
 
if (adev->gmc.xgmi.ras_funcs &&
adev->gmc.xgmi.ras_funcs->ops.ras_fini)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
index b27fcbccce2b..ff7f28ef1d6c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
@@ -20,15 +20,10 @@
  */
 #ifndef __AMDGPU_MMHUB_H__
 #define __AMDGPU_MMHUB_H__
+#include "amdgpu_ras.h"
 
 struct amdgpu_mmhub_ras_funcs {
-   int (*ras_late_init)(struct amdgpu_device *adev);
-   void (*ras_fini)(struct amdgpu_device *adev);
-   void (*query_ras_error_count)(struct amdgpu_device *adev,
- void *ras_error_status);
-   void (*query_ras_error_status)(struct amdgpu_device *adev);
-   void (*reset_ras_error_count)(struct amdgpu_device *adev);
-   void (*reset_ras_error_status)(struct amdgpu_device *adev);
+   struct amdgpu_ras_block_ops ops;
 };
 
 struct amdgpu_mmhub_funcs {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e7cd2de07665..2d9ef677a2ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -929,12 +929,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
break;
case AMDGPU_RAS_BLOCK__MMHUB:
if (adev->mmhub.ras_funcs &&
-   adev->mmhub.ras_funcs->query_ras_error_count)
-   adev->mmhub.ras_funcs->query_ras_error_count(adev, 
_data);
+   adev->mmhub.ras_funcs->ops.query_ras_error_count)
+   adev->mmhub.ras_funcs->ops.query_ras_error_count(adev, 
_data);
 
if (adev->mmhub.ras_funcs &&
-   adev->mmhub.ras_funcs->query_ras_error_status)
-   

[PATCH 5/9] drm/amdgpu: Modify mca block to fit for the unified ras function pointers.

2021-11-25 Thread yipechai
Modify mca block ras functions to fit the unified ras function pointers.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 12 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h |  8 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 12 +-
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.c   | 30 +++--
 4 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 0aab31fce997..024342969267 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -466,22 +466,22 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
}
 
if (adev->mca.mp0.ras_funcs &&
-   adev->mca.mp0.ras_funcs->ras_late_init) {
-   r = adev->mca.mp0.ras_funcs->ras_late_init(adev);
+   adev->mca.mp0.ras_funcs->ops.ras_late_init) {
+   r = adev->mca.mp0.ras_funcs->ops.ras_late_init(adev);
if (r)
return r;
}
 
if (adev->mca.mp1.ras_funcs &&
-   adev->mca.mp1.ras_funcs->ras_late_init) {
-   r = adev->mca.mp1.ras_funcs->ras_late_init(adev);
+   adev->mca.mp1.ras_funcs->ops.ras_late_init) {
+   r = adev->mca.mp1.ras_funcs->ops.ras_late_init(adev);
if (r)
return r;
}
 
if (adev->mca.mpio.ras_funcs &&
-   adev->mca.mpio.ras_funcs->ras_late_init) {
-   r = adev->mca.mpio.ras_funcs->ras_late_init(adev);
+   adev->mca.mpio.ras_funcs->ops.ras_late_init) {
+   r = adev->mca.mpio.ras_funcs->ops.ras_late_init(adev);
if (r)
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
index c74bc7177066..fbc3ebc81b99 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
@@ -20,14 +20,10 @@
  */
 #ifndef __AMDGPU_MCA_H__
 #define __AMDGPU_MCA_H__
+#include "amdgpu_ras.h"
 
 struct amdgpu_mca_ras_funcs {
-   int (*ras_late_init)(struct amdgpu_device *adev);
-   void (*ras_fini)(struct amdgpu_device *adev);
-   void (*query_ras_error_count)(struct amdgpu_device *adev,
- void *ras_error_status);
-   void (*query_ras_error_address)(struct amdgpu_device *adev,
-   void *ras_error_status);
+   struct amdgpu_ras_block_ops ops;
uint32_t ras_block;
uint32_t ras_sub_block;
const char* sysfs_name;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a3b606c84d45..e7cd2de07665 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -870,18 +870,18 @@ void amdgpu_ras_mca_query_error_status(struct 
amdgpu_device *adev,
switch (ras_block->sub_block_index) {
case AMDGPU_RAS_MCA_BLOCK__MP0:
if (adev->mca.mp0.ras_funcs &&
-   adev->mca.mp0.ras_funcs->query_ras_error_count)
-   adev->mca.mp0.ras_funcs->query_ras_error_count(adev, 
_data);
+   adev->mca.mp0.ras_funcs->ops.query_ras_error_count)
+   
adev->mca.mp0.ras_funcs->ops.query_ras_error_count(adev, _data);
break;
case AMDGPU_RAS_MCA_BLOCK__MP1:
if (adev->mca.mp1.ras_funcs &&
-   adev->mca.mp1.ras_funcs->query_ras_error_count)
-   adev->mca.mp1.ras_funcs->query_ras_error_count(adev, 
_data);
+   adev->mca.mp1.ras_funcs->ops.query_ras_error_count)
+   
adev->mca.mp1.ras_funcs->ops.query_ras_error_count(adev, _data);
break;
case AMDGPU_RAS_MCA_BLOCK__MPIO:
if (adev->mca.mpio.ras_funcs &&
-   adev->mca.mpio.ras_funcs->query_ras_error_count)
-   adev->mca.mpio.ras_funcs->query_ras_error_count(adev, 
_data);
+   adev->mca.mpio.ras_funcs->ops.query_ras_error_count)
+   
adev->mca.mpio.ras_funcs->ops.query_ras_error_count(adev, _data);
break;
default:
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c 
b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
index 8f7107d392af..dc2424587f12 100644
--- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
@@ -48,10 +48,12 @@ static void mca_v3_0_mp0_ras_fini(struct amdgpu_device 
*adev)
 }
 
 const struct amdgpu_mca_ras_funcs mca_v3_0_mp0_ras_funcs = {
-   .ras_late_init = mca_v3_0_mp0_ras_late_init,
-   .ras_fini = mca_v3_0_mp0_ras_fini,
-   .query_ras_error_count = mca_v3_0_mp0_query_ras_error_count,
-   .query_ras_error_address = NULL,
+   .ops = {
+   .ras_late_init = 

[PATCH 4/9] drm/amdgpu: Modify hdp block to fit for the unified ras function pointers.

2021-11-25 Thread yipechai
Modify hdp block ras functions to fit the unified ras function pointers.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c |  8 
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h |  7 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |  8 
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  4 ++--
 drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c   | 10 ++
 5 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index b7c462749d37..0aab31fce997 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -459,8 +459,8 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
}
 
if (adev->hdp.ras_funcs &&
-   adev->hdp.ras_funcs->ras_late_init) {
-   r = adev->hdp.ras_funcs->ras_late_init(adev);
+   adev->hdp.ras_funcs->ops.ras_late_init) {
+   r = adev->hdp.ras_funcs->ops.ras_late_init(adev);
if (r)
return r;
}
@@ -504,8 +504,8 @@ void amdgpu_gmc_ras_fini(struct amdgpu_device *adev)
adev->gmc.xgmi.ras_funcs->ops.ras_fini(adev);
 
if (adev->hdp.ras_funcs &&
-   adev->hdp.ras_funcs->ras_fini)
-   adev->hdp.ras_funcs->ras_fini(adev);
+   adev->hdp.ras_funcs->ops.ras_fini)
+   adev->hdp.ras_funcs->ops.ras_fini(adev);
 }
 
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
index 7ec99d591584..49121eb7d599 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
@@ -22,13 +22,10 @@
  */
 #ifndef __AMDGPU_HDP_H__
 #define __AMDGPU_HDP_H__
+#include "amdgpu_ras.h"
 
 struct amdgpu_hdp_ras_funcs {
-   int (*ras_late_init)(struct amdgpu_device *adev);
-   void (*ras_fini)(struct amdgpu_device *adev);
-   void (*query_ras_error_count)(struct amdgpu_device *adev,
- void *ras_error_status);
-   void (*reset_ras_error_count)(struct amdgpu_device *adev);
+   struct amdgpu_ras_block_ops ops;
 };
 
 struct amdgpu_hdp_funcs {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7f830bf8f8df..a3b606c84d45 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -948,8 +948,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
break;
case AMDGPU_RAS_BLOCK__HDP:
if (adev->hdp.ras_funcs &&
-   adev->hdp.ras_funcs->query_ras_error_count)
-   adev->hdp.ras_funcs->query_ras_error_count(adev, 
_data);
+   adev->hdp.ras_funcs->ops.query_ras_error_count)
+   adev->hdp.ras_funcs->ops.query_ras_error_count(adev, 
_data);
break;
case AMDGPU_RAS_BLOCK__MCA:
amdgpu_ras_mca_query_error_status(adev, >head, _data);
@@ -1040,8 +1040,8 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device 
*adev,
break;
case AMDGPU_RAS_BLOCK__HDP:
if (adev->hdp.ras_funcs &&
-   adev->hdp.ras_funcs->reset_ras_error_count)
-   adev->hdp.ras_funcs->reset_ras_error_count(adev);
+   adev->hdp.ras_funcs->ops.reset_ras_error_count)
+   adev->hdp.ras_funcs->ops.reset_ras_error_count(adev);
break;
default:
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 3606d2cbff5e..c40c669d49c3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1301,8 +1301,8 @@ static int gmc_v9_0_late_init(void *handle)
adev->mmhub.ras_funcs->reset_ras_error_count(adev);
 
if (adev->hdp.ras_funcs &&
-   adev->hdp.ras_funcs->reset_ras_error_count)
-   adev->hdp.ras_funcs->reset_ras_error_count(adev);
+   adev->hdp.ras_funcs->ops.reset_ras_error_count)
+   adev->hdp.ras_funcs->ops.reset_ras_error_count(adev);
}
 
r = amdgpu_gmc_ras_late_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
index 74b90cc2bf48..9021ea08ee0d 100644
--- a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
@@ -150,10 +150,12 @@ static void hdp_v4_0_init_registers(struct amdgpu_device 
*adev)
 }
 
 const struct amdgpu_hdp_ras_funcs hdp_v4_0_ras_funcs = {
-   .ras_late_init = amdgpu_hdp_ras_late_init,
-   .ras_fini = amdgpu_hdp_ras_fini,
-   .query_ras_error_count = hdp_v4_0_query_ras_error_count,
-   .reset_ras_error_count = hdp_v4_0_reset_ras_error_count,
+   .ops = {
+   .ras_late_init = 

[PATCH 3/9] drm/amdgpu: Modify gmc block to fit for the unified ras function pointers.

2021-11-25 Thread yipechai
Modify gmc block ras functions to fit for the unified ras function pointers.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  |  8 
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h  |  7 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 20 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h |  2 +-
 5 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 83f26bca7dac..b7c462749d37 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -452,8 +452,8 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
adev->gmc.xgmi.ras_funcs = _ras_funcs;
 
if (adev->gmc.xgmi.ras_funcs &&
-   adev->gmc.xgmi.ras_funcs->ras_late_init) {
-   r = adev->gmc.xgmi.ras_funcs->ras_late_init(adev);
+   adev->gmc.xgmi.ras_funcs->ops.ras_late_init) {
+   r = adev->gmc.xgmi.ras_funcs->ops.ras_late_init(adev);
if (r)
return r;
}
@@ -500,8 +500,8 @@ void amdgpu_gmc_ras_fini(struct amdgpu_device *adev)
adev->mmhub.ras_funcs->ras_fini(adev);
 
if (adev->gmc.xgmi.ras_funcs &&
-   adev->gmc.xgmi.ras_funcs->ras_fini)
-   adev->gmc.xgmi.ras_funcs->ras_fini(adev);
+   adev->gmc.xgmi.ras_funcs->ops.ras_fini)
+   adev->gmc.xgmi.ras_funcs->ops.ras_fini(adev);
 
if (adev->hdp.ras_funcs &&
adev->hdp.ras_funcs->ras_fini)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index e55201134a01..f6f7d996ff98 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -29,6 +29,7 @@
 #include 
 
 #include "amdgpu_irq.h"
+#include "amdgpu_ras.h"
 
 /* VA hole for 48bit addresses on Vega10 */
 #define AMDGPU_GMC_HOLE_START  0x8000ULL
@@ -136,11 +137,7 @@ struct amdgpu_gmc_funcs {
 };
 
 struct amdgpu_xgmi_ras_funcs {
-   int (*ras_late_init)(struct amdgpu_device *adev);
-   void (*ras_fini)(struct amdgpu_device *adev);
-   int (*query_ras_error_count)(struct amdgpu_device *adev,
-void *ras_error_status);
-   void (*reset_ras_error_count)(struct amdgpu_device *adev);
+   struct amdgpu_ras_block_ops ops;
 };
 
 struct amdgpu_xgmi {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 790aaba065ab..7f830bf8f8df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -943,8 +943,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
break;
case AMDGPU_RAS_BLOCK__XGMI_WAFL:
if (adev->gmc.xgmi.ras_funcs &&
-   adev->gmc.xgmi.ras_funcs->query_ras_error_count)
-   adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, 
_data);
+   adev->gmc.xgmi.ras_funcs->ops.query_ras_error_count)
+   
adev->gmc.xgmi.ras_funcs->ops.query_ras_error_count(adev, _data);
break;
case AMDGPU_RAS_BLOCK__HDP:
if (adev->hdp.ras_funcs &&
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 0d149f5f000e..306962c95d52 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -739,7 +739,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device 
*adev)
adev->gmc.xgmi.num_physical_nodes == 0)
return 0;
 
-   adev->gmc.xgmi.ras_funcs->reset_ras_error_count(adev);
+   adev->gmc.xgmi.ras_funcs->ops.reset_ras_error_count(adev);
 
if (!adev->gmc.xgmi.ras_if) {
adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
@@ -859,7 +859,7 @@ static int amdgpu_xgmi_query_pcs_error_status(struct 
amdgpu_device *adev,
return 0;
 }
 
-static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
+static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
 void *ras_error_status)
 {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
@@ -868,7 +868,7 @@ static int amdgpu_xgmi_query_ras_error_count(struct 
amdgpu_device *adev,
uint32_t ue_cnt = 0, ce_cnt = 0;
 
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
-   return -EINVAL;
+   return ;
 
err_data->ue_count = 0;
err_data->ce_count = 0;
@@ -934,17 +934,17 @@ static int amdgpu_xgmi_query_ras_error_count(struct 
amdgpu_device *adev,
break;
}
 
-   adev->gmc.xgmi.ras_funcs->reset_ras_error_count(adev);
+   

[PATCH 2/9] drm/amdgpu: Modify gfx block to fit for the unified ras function pointers.

2021-11-25 Thread yipechai
Modify gfx block ras functions to fit for the unified ras function pointers.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 11 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 24 ++---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 28 -
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c   | 19 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 21 ++-
 6 files changed, 51 insertions(+), 56 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 1795d448c700..90ac0e9a32cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -697,8 +697,8 @@ int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device 
*adev,
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
if (adev->gfx.ras_funcs &&
-   adev->gfx.ras_funcs->query_ras_error_count)
-   adev->gfx.ras_funcs->query_ras_error_count(adev, 
err_data);
+   adev->gfx.ras_funcs->ops.query_ras_error_count)
+   adev->gfx.ras_funcs->ops.query_ras_error_count(adev, 
err_data);
amdgpu_ras_reset_gpu(adev);
}
return AMDGPU_RAS_SUCCESS;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 6b78b4a0e182..2a7f78f17c3b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -31,6 +31,7 @@
 #include "amdgpu_ring.h"
 #include "amdgpu_rlc.h"
 #include "soc15.h"
+#include "amdgpu_ras.h"
 
 /* GFX current status */
 #define AMDGPU_GFX_NORMAL_MODE 0xL
@@ -214,15 +215,7 @@ struct amdgpu_cu_info {
 };
 
 struct amdgpu_gfx_ras_funcs {
-   int (*ras_late_init)(struct amdgpu_device *adev);
-   void (*ras_fini)(struct amdgpu_device *adev);
-   int (*ras_error_inject)(struct amdgpu_device *adev,
-   void *inject_if);
-   int (*query_ras_error_count)(struct amdgpu_device *adev,
-void *ras_error_status);
-   void (*reset_ras_error_count)(struct amdgpu_device *adev);
-   void (*query_ras_error_status)(struct amdgpu_device *adev);
-   void (*reset_ras_error_status)(struct amdgpu_device *adev);
+   struct amdgpu_ras_block_ops ops;
void (*enable_watchdog_timer)(struct amdgpu_device *adev);
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index dc6c8130e2d7..790aaba065ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -920,12 +920,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
break;
case AMDGPU_RAS_BLOCK__GFX:
if (adev->gfx.ras_funcs &&
-   adev->gfx.ras_funcs->query_ras_error_count)
-   adev->gfx.ras_funcs->query_ras_error_count(adev, 
_data);
+   adev->gfx.ras_funcs->ops.query_ras_error_count)
+   adev->gfx.ras_funcs->ops.query_ras_error_count(adev, 
_data);
 
if (adev->gfx.ras_funcs &&
-   adev->gfx.ras_funcs->query_ras_error_status)
-   adev->gfx.ras_funcs->query_ras_error_status(adev);
+   adev->gfx.ras_funcs->ops.query_ras_error_status)
+   adev->gfx.ras_funcs->ops.query_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
if (adev->mmhub.ras_funcs &&
@@ -1018,12 +1018,12 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device 
*adev,
switch (block) {
case AMDGPU_RAS_BLOCK__GFX:
if (adev->gfx.ras_funcs &&
-   adev->gfx.ras_funcs->reset_ras_error_count)
-   adev->gfx.ras_funcs->reset_ras_error_count(adev);
+   adev->gfx.ras_funcs->ops.reset_ras_error_count)
+   adev->gfx.ras_funcs->ops.reset_ras_error_count(adev);
 
if (adev->gfx.ras_funcs &&
-   adev->gfx.ras_funcs->reset_ras_error_status)
-   adev->gfx.ras_funcs->reset_ras_error_status(adev);
+   adev->gfx.ras_funcs->ops.reset_ras_error_status)
+   adev->gfx.ras_funcs->ops.reset_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
if (adev->mmhub.ras_funcs &&
@@ -1103,8 +1103,8 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__GFX:
if (adev->gfx.ras_funcs &&
-   adev->gfx.ras_funcs->ras_error_inject)
-   ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
+

[PATCH 1/9] drm/amdgpu:Define the unified ras function pointers of each IP block

2021-11-25 Thread yipechai
Define a unified set of ras function pointers for each IP block to adapt.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 36 -
 2 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 90f0db3b4f65..dc6c8130e2d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2739,3 +2739,23 @@ static void amdgpu_register_bad_pages_mca_notifier(void)
 }
 }
 #endif
+
+/* check if ras is supported on block, say, sdma, gfx */
+int amdgpu_ras_is_supported(struct amdgpu_device *adev,
+   unsigned int block)
+{
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   if (block >= AMDGPU_RAS_BLOCK_COUNT)
+   return 0;
+   return ras && (adev->ras_enabled & (1 << block));
+}
+
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   if (atomic_cmpxchg(>in_recovery, 0, 1) == 0)
+   schedule_work(>recovery_work);
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index cdd0010a5389..4b7da40dd837 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -469,6 +469,19 @@ struct ras_debug_if {
};
int op;
 };
+
+struct amdgpu_ras_block_ops {
+   int (*ras_late_init)(struct amdgpu_device *adev);
+   void (*ras_fini)(struct amdgpu_device *adev);
+   int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
+   void  (*query_ras_error_count)(struct amdgpu_device *adev,void 
*ras_error_status);
+   void (*query_ras_error_status)(struct amdgpu_device *adev);
+   bool  (*query_ras_poison_mode)(struct amdgpu_device *adev);
+   void (*query_ras_error_address)(struct amdgpu_device *adev, void 
*ras_error_status);
+   void (*reset_ras_error_count)(struct amdgpu_device *adev);
+   void (*reset_ras_error_status)(struct amdgpu_device *adev);
+};
+
 /* work flow
  * vbios
  * 1: ras feature enable (enabled by default)
@@ -486,16 +499,6 @@ struct ras_debug_if {
 #define amdgpu_ras_get_context(adev)   ((adev)->psp.ras_context.ras)
 #define amdgpu_ras_set_context(adev, ras_con)  ((adev)->psp.ras_context.ras = 
(ras_con))
 
-/* check if ras is supported on block, say, sdma, gfx */
-static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
-   unsigned int block)
-{
-   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-
-   if (block >= AMDGPU_RAS_BLOCK_COUNT)
-   return 0;
-   return ras && (adev->ras_enabled & (1 << block));
-}
 
 int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
 
@@ -512,15 +515,6 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 
 int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
 
-static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
-{
-   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-
-   if (atomic_cmpxchg(>in_recovery, 0, 1) == 0)
-   schedule_work(>recovery_work);
-   return 0;
-}
-
 static inline enum ta_ras_block
 amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
switch (block) {
@@ -652,4 +646,8 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block);
 
 bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev);
 
+int amdgpu_ras_is_supported(struct amdgpu_device *adev,unsigned int 
block);
+
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev);
+
 #endif
-- 
2.25.1



Re: [PATCH 1/6] dma-buf: move dma_resv_prune_unlocked into dma_resv.c

2021-11-25 Thread Christian König

Am 25.11.21 um 10:31 schrieb Maarten Lankhorst:

[SNIP]

diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h
index eebf04325b34..e0558429a5ee 100644
--- a/include/linux/dma-resv.h
+++ b/include/linux/dma-resv.h
@@ -458,6 +458,7 @@ void dma_resv_fini(struct dma_resv *obj);
  int dma_resv_reserve_shared(struct dma_resv *obj, unsigned int num_fences);
  void dma_resv_add_shared_fence(struct dma_resv *obj, struct dma_fence *fence);
  void dma_resv_add_excl_fence(struct dma_resv *obj, struct dma_fence *fence);
+void dma_resv_prune_unlocked(struct dma_resv *obj);
  int dma_resv_get_fences(struct dma_resv *obj, struct dma_fence **pfence_excl,
unsigned *pshared_count, struct dma_fence ***pshared);
  int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src);

I don't mind adding a dma_resv_prune for locked case, but I don't think 
unlocked would have benefits.

Furthermore, I'm trying to remove the unlocked versions from i915. Could this 
be a prereq patch instead?

https://patchwork.freedesktop.org/patch/460722/?series=96115=1


Yeah, that works for me as well.

I was on the edge of dropping that from TTM as well since this is really 
just abusing the interface to save a few bytes of memory.


Feel free to add an Acked-by: Christian König  
to the i915 patch if it helps to get that committed.


Regards,
Christian.



~Maarten

~Maarten





Re: [PATCH 1/6] dma-buf: move dma_resv_prune_unlocked into dma_resv.c

2021-11-25 Thread Maarten Lankhorst
On 28-10-2021 15:26, Christian König wrote:
> The i915 driver implements a prune function which is called when it is very
> likely that the fences inside the dma_resv object can be removed because they
> are all signaled.
>
> Move that function into the dma-resv.c code since the behavior of pruning
> fences is something internal to the object.
>
> Signed-off-by: Christian König 
> ---
>  drivers/dma-buf/dma-resv.c   | 18 ++
>  drivers/gpu/drm/i915/Makefile|  1 -
>  drivers/gpu/drm/i915/dma_resv_utils.c| 17 -
>  drivers/gpu/drm/i915/dma_resv_utils.h| 13 -
>  drivers/gpu/drm/i915/gem/i915_gem_shrinker.c |  3 +--
>  drivers/gpu/drm/i915/gem/i915_gem_wait.c |  3 +--
>  include/linux/dma-resv.h |  1 +
>  7 files changed, 21 insertions(+), 35 deletions(-)
>  delete mode 100644 drivers/gpu/drm/i915/dma_resv_utils.c
>  delete mode 100644 drivers/gpu/drm/i915/dma_resv_utils.h
>
> diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
> index ff3c0558b3b8..64d4f95778c4 100644
> --- a/drivers/dma-buf/dma-resv.c
> +++ b/drivers/dma-buf/dma-resv.c
> @@ -324,6 +324,24 @@ void dma_resv_add_excl_fence(struct dma_resv *obj, 
> struct dma_fence *fence)
>  }
>  EXPORT_SYMBOL(dma_resv_add_excl_fence);
>  
> +/**
> + * dma_resv_prune_unlocked - try to remove signaled fences
> + * @obj: The dma_resv object to prune
> + *
> + * Try to lock the object, test if it is signaled and if yes then remove all 
> the
> + * signaled fences.
> + */
> +void dma_resv_prune_unlocked(struct dma_resv *obj)
> +{
> + if (!dma_resv_trylock(obj))
> + return;
> +
> + if (dma_resv_test_signaled(obj, true))
> + dma_resv_add_excl_fence(obj, NULL);
> + dma_resv_unlock(obj);
> +}
> +EXPORT_SYMBOL(dma_resv_prune_unlocked);
> +
>  /**
>   * dma_resv_iter_restart_unlocked - restart the unlocked iterator
>   * @cursor: The dma_resv_iter object to restart
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 660bb03de6fc..5c1af130cb6d 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -60,7 +60,6 @@ i915-y += i915_drv.o \
>  
>  # core library code
>  i915-y += \
> - dma_resv_utils.o \
>   i915_memcpy.o \
>   i915_mm.o \
>   i915_sw_fence.o \
> diff --git a/drivers/gpu/drm/i915/dma_resv_utils.c 
> b/drivers/gpu/drm/i915/dma_resv_utils.c
> deleted file mode 100644
> index 7df91b7e4ca8..
> --- a/drivers/gpu/drm/i915/dma_resv_utils.c
> +++ /dev/null
> @@ -1,17 +0,0 @@
> -// SPDX-License-Identifier: MIT
> -/*
> - * Copyright © 2020 Intel Corporation
> - */
> -
> -#include 
> -
> -#include "dma_resv_utils.h"
> -
> -void dma_resv_prune(struct dma_resv *resv)
> -{
> - if (dma_resv_trylock(resv)) {
> - if (dma_resv_test_signaled(resv, true))
> - dma_resv_add_excl_fence(resv, NULL);
> - dma_resv_unlock(resv);
> - }
> -}
> diff --git a/drivers/gpu/drm/i915/dma_resv_utils.h 
> b/drivers/gpu/drm/i915/dma_resv_utils.h
> deleted file mode 100644
> index b9d8fb5f8367..
> --- a/drivers/gpu/drm/i915/dma_resv_utils.h
> +++ /dev/null
> @@ -1,13 +0,0 @@
> -/* SPDX-License-Identifier: MIT */
> -/*
> - * Copyright © 2020 Intel Corporation
> - */
> -
> -#ifndef DMA_RESV_UTILS_H
> -#define DMA_RESV_UTILS_H
> -
> -struct dma_resv;
> -
> -void dma_resv_prune(struct dma_resv *resv);
> -
> -#endif /* DMA_RESV_UTILS_H */
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c 
> b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
> index 5ab136ffdeb2..48029bbda682 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
> @@ -15,7 +15,6 @@
>  
>  #include "gt/intel_gt_requests.h"
>  
> -#include "dma_resv_utils.h"
>  #include "i915_trace.h"
>  
>  static bool swap_available(void)
> @@ -229,7 +228,7 @@ i915_gem_shrink(struct i915_gem_ww_ctx *ww,
>   i915_gem_object_unlock(obj);
>   }
>  
> - dma_resv_prune(obj->base.resv);
> + dma_resv_prune_unlocked(obj->base.resv);
>  
>   scanned += obj->base.size >> PAGE_SHIFT;
>  skip:
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_wait.c 
> b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
> index 569658c7859c..1915d203a72d 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_wait.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
> @@ -10,7 +10,6 @@
>  
>  #include "gt/intel_engine.h"
>  
> -#include "dma_resv_utils.h"
>  #include "i915_gem_ioctls.h"
>  #include "i915_gem_object.h"
>  
> @@ -53,7 +52,7 @@ i915_gem_object_wait_reservation(struct dma_resv *resv,
>* signaled.
>*/
>   if (timeout > 0)
> - dma_resv_prune(resv);
> + dma_resv_prune_unlocked(resv);
>  
>   return timeout;
>  }
> diff --git