[PATCH] drm/amdgpu: Improve error checking in amdgpu_virt_rlcg_reg_rw (v2)

2024-02-13 Thread Victor Lu
The current error detection only looks for a timeout.
This should be changed to also check scratch_reg1 for any errors
returned from RLCG.

v2: remove new error value

Signed-off-by: Victor Lu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 5 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 6ff7d3fb2008..7a4eae36778a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -979,7 +979,7 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 f
 * SCRATCH_REG0 = read/write value
 * SCRATCH_REG1[30:28]  = command
 * SCRATCH_REG1[19:0]   = address in dword
-* SCRATCH_REG1[26:24]  = Error reporting
+* SCRATCH_REG1[27:24]  = Error reporting
 */
writel(v, scratch_reg0);
writel((offset | flag), scratch_reg1);
@@ -993,7 +993,8 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 f
udelay(10);
}
 
-   if (i >= timeout) {
+   tmp = readl(scratch_reg1);
+   if (i >= timeout || (tmp & AMDGPU_RLCG_SCRATCH1_ERROR_MASK) != 0) {
if (amdgpu_sriov_rlcg_error_report_enabled(adev)) {
if (tmp & AMDGPU_RLCG_VFGATE_DISABLED) {
dev_err(adev->dev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index fa7be5f277b9..3f59b7b5523f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -45,6 +45,7 @@
 #define AMDGPU_RLCG_REG_NOT_IN_RANGE   0x1000000
 
 #define AMDGPU_RLCG_SCRATCH1_ADDRESS_MASK  0xFFFFF
+#define AMDGPU_RLCG_SCRATCH1_ERROR_MASK    0xF000000
 
 /* all asic after AI use this offset */
 #define mmRCC_IOV_FUNC_IDENTIFIER 0xDE5
-- 
2.34.1



Re: [PATCH] drm/amdgpu: Improve error checking in amdgpu_virt_rlcg_reg_rw (v2)

2024-02-16 Thread Alex Deucher
On Tue, Feb 13, 2024 at 2:03 PM Victor Lu  wrote:
>
> The current error detection only looks for a timeout.
> This should be changed to also check scratch_reg1 for any errors
> returned from RLCG.
>
> v2: remove new error value
>
> Signed-off-by: Victor Lu 

Acked-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 5 +++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 +
>  2 files changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 6ff7d3fb2008..7a4eae36778a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -979,7 +979,7 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 f
>  * SCRATCH_REG0 = read/write value
>  * SCRATCH_REG1[30:28]  = command
>  * SCRATCH_REG1[19:0]   = address in dword
> -* SCRATCH_REG1[26:24]  = Error reporting
> +* SCRATCH_REG1[27:24]  = Error reporting
>  */
> writel(v, scratch_reg0);
> writel((offset | flag), scratch_reg1);
> @@ -993,7 +993,8 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 f
> udelay(10);
> }
>
> -   if (i >= timeout) {
> +   tmp = readl(scratch_reg1);
> +   if (i >= timeout || (tmp & AMDGPU_RLCG_SCRATCH1_ERROR_MASK) != 0) {
> if (amdgpu_sriov_rlcg_error_report_enabled(adev)) {
> if (tmp & AMDGPU_RLCG_VFGATE_DISABLED) {
> dev_err(adev->dev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index fa7be5f277b9..3f59b7b5523f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -45,6 +45,7 @@
>  #define AMDGPU_RLCG_REG_NOT_IN_RANGE   0x1000000
>
>  #define AMDGPU_RLCG_SCRATCH1_ADDRESS_MASK  0xFFFFF
> +#define AMDGPU_RLCG_SCRATCH1_ERROR_MASK    0xF000000
>
>  /* all asic after AI use this offset */
>  #define mmRCC_IOV_FUNC_IDENTIFIER 0xDE5
> --
> 2.34.1
>