RE: [PATCH V4 1/1] drm/amdgpu: update athub interrupt harvesting handle

2020-09-21 Thread Zhang, Hawking
[AMD Public Use]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-----Original Message-----
From: Stanley.Yang  
Sent: Monday, September 21, 2020 21:48
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Chen, Guchun ; 
Clements, John ; Li, Dennis ; Zhou1, 
Tao ; Yang, Stanley 
Subject: [PATCH V4 1/1] drm/amdgpu: update athub interrupt harvesting handle

GCEA/MMHUB EA errors should not result in a DF freeze. This is fixed in the
next generation, but for some reason a GCEA/MMHUB EA error does result in a
DF freeze in the previous generation, so the driver should avoid reporting a
GCEA/MMHUB EA error as a hw fatal error in the kernel message by reading the
GCEA/MMHUB err status registers.

Changed from V1:
make the query_ras_error_status function more general
make reading the mmhub err status register more friendly

Changed from V2:
move ras error status query function into do_recovery workqueue

Changed from V3:
remove useless code from V2, print GCEA error status
instance number

Signed-off-by: Stanley.Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 43 ++-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c | 29 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h |  2 +
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c   | 29 +
 .../amd/include/asic_reg/gc/gc_9_4_1_offset.h |  4 +-
 8 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index a611e78dd4ba..258498cbf1eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -217,6 +217,7 @@ struct amdgpu_gfx_funcs {
int (*query_ras_error_count) (struct amdgpu_device *adev, void 
*ras_error_status);
void (*reset_ras_error_count) (struct amdgpu_device *adev);
void (*init_spm_golden)(struct amdgpu_device *adev);
+   void (*query_ras_error_status) (struct amdgpu_device *adev);
 };
 
 struct sq_work {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
index 0c43d7fe893c..1ae9bdae7311 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
@@ -40,6 +40,7 @@ struct amdgpu_mmhub_funcs {
uint64_t page_table_base);
void (*update_power_gating)(struct amdgpu_device *adev,
 bool enable);
+   void (*query_ras_error_status)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_mmhub {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e5ea14774c0c..40614ac9a111 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1498,6 +1498,45 @@ static void amdgpu_ras_log_on_err_counter(struct 
amdgpu_device *adev)
}
 }
 
+/* Parse RdRspStatus and WrRspStatus */
+void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
+   struct ras_query_if *info)
+{
+   /*
+* Only two block need to query read/write
+* RspStatus at current state
+*/
+   switch (info->head.block) {
+   case AMDGPU_RAS_BLOCK__GFX:
+   if (adev->gfx.funcs->query_ras_error_status)
+   adev->gfx.funcs->query_ras_error_status(adev);
+   break;
+   case AMDGPU_RAS_BLOCK__MMHUB:
+   if (adev->mmhub.funcs->query_ras_error_status)
+   adev->mmhub.funcs->query_ras_error_status(adev);
+   break;
+   default:
+   break;
+   }
+}
+
+static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_manager *obj;
+
+   if (!con)
+   return;
+
+   list_for_each_entry(obj, &con->head, node) {
+   struct ras_query_if info = {
+   .head = obj->head,
+   };
+
+   amdgpu_ras_error_status_query(adev, &info);
+   }
+}
+
 /* recovery begin */
 
 /* return 0 on success.
@@ -1568,8 +1607,10 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
}
 
list_for_each_entry(remote_adev,
-   device_list_handle, gmc.xgmi.head)
+   device_list_handle, gmc.xgmi.head) {
+   amdgpu_ras_query_err_status(remote_adev);
amdgpu_ras_log_on_err_counter(remote_adev);
+   }
 
amdgpu_put_xgmi_hive(hive);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index d898c9ff3526..adee0177654e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@

[PATCH V4 1/1] drm/amdgpu: update athub interrupt harvesting handle

2020-09-21 Thread Stanley . Yang
GCEA/MMHUB EA errors should not result in a DF freeze. This is
fixed in the next generation, but for some reason a GCEA/MMHUB
EA error does result in a DF freeze in the previous generation,
so the driver should avoid reporting a GCEA/MMHUB EA error as a
hw fatal error in the kernel message by reading the GCEA/MMHUB
err status registers.

Changed from V1:
make the query_ras_error_status function more general
make reading the mmhub err status register more friendly

Changed from V2:
move ras error status query function into do_recovery workqueue

Changed from V3:
remove useless code from V2, print GCEA error status
instance number

Signed-off-by: Stanley.Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 43 ++-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c | 29 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h |  2 +
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c   | 29 +
 .../amd/include/asic_reg/gc/gc_9_4_1_offset.h |  4 +-
 8 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index a611e78dd4ba..258498cbf1eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -217,6 +217,7 @@ struct amdgpu_gfx_funcs {
int (*query_ras_error_count) (struct amdgpu_device *adev, void 
*ras_error_status);
void (*reset_ras_error_count) (struct amdgpu_device *adev);
void (*init_spm_golden)(struct amdgpu_device *adev);
+   void (*query_ras_error_status) (struct amdgpu_device *adev);
 };
 
 struct sq_work {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
index 0c43d7fe893c..1ae9bdae7311 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
@@ -40,6 +40,7 @@ struct amdgpu_mmhub_funcs {
uint64_t page_table_base);
void (*update_power_gating)(struct amdgpu_device *adev,
 bool enable);
+   void (*query_ras_error_status)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_mmhub {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e5ea14774c0c..40614ac9a111 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1498,6 +1498,45 @@ static void amdgpu_ras_log_on_err_counter(struct 
amdgpu_device *adev)
}
 }
 
+/* Parse RdRspStatus and WrRspStatus */
+void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
+   struct ras_query_if *info)
+{
+   /*
+* Only two block need to query read/write
+* RspStatus at current state
+*/
+   switch (info->head.block) {
+   case AMDGPU_RAS_BLOCK__GFX:
+   if (adev->gfx.funcs->query_ras_error_status)
+   adev->gfx.funcs->query_ras_error_status(adev);
+   break;
+   case AMDGPU_RAS_BLOCK__MMHUB:
+   if (adev->mmhub.funcs->query_ras_error_status)
+   adev->mmhub.funcs->query_ras_error_status(adev);
+   break;
+   default:
+   break;
+   }
+}
+
+static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_manager *obj;
+
+   if (!con)
+   return;
+
+   list_for_each_entry(obj, &con->head, node) {
+   struct ras_query_if info = {
+   .head = obj->head,
+   };
+
+   amdgpu_ras_error_status_query(adev, &info);
+   }
+}
+
 /* recovery begin */
 
 /* return 0 on success.
@@ -1568,8 +1607,10 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
}
 
list_for_each_entry(remote_adev,
-   device_list_handle, gmc.xgmi.head)
+   device_list_handle, gmc.xgmi.head) {
+   amdgpu_ras_query_err_status(remote_adev);
amdgpu_ras_log_on_err_counter(remote_adev);
+   }
 
amdgpu_put_xgmi_hive(hive);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index d898c9ff3526..adee0177654e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2075,6 +2075,7 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_gfx_funcs = 
{
.ras_error_inject = &gfx_v9_4_ras_error_inject,
.query_ras_error_count = &gfx_v9_4_query_ras_error_count,
.reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
+   .query_ras_error_status = &gfx_v9_4_query_ras_error_status,
 };
 
 static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
diff