amdgpu: add reset_ras_error_count function for HDP

Deucher, Alexander Mon, 02 Mar 2020 07:46:29 -0800

[AMD Public Use]

Series is:
Reviewed-by: Alex Deucher <alexander.deuc...@amd.com>
________________________________
From: Hawking Zhang <hawking.zh...@amd.com>
Sent: Monday, March 2, 2020 5:33 AM
To: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Clements, 
John <john.cleme...@amd.com>; Li, Dennis <dennis...@amd.com>; Chen, Guchun 
<guchun.c...@amd.com>; Zhou1, Tao <tao.zh...@amd.com>; Deucher, Alexander 
<alexander.deuc...@amd.com>
Cc: Zhang, Hawking <hawking.zh...@amd.com>
Subject: [PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for HDP


HDP RAS error counters are left in a dirty state after a cold reboot.
A read operation is needed to reset them to 0.

Signed-off-by: Hawking Zhang <hawking.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |  1 -
 drivers/gpu/drm/amd/amdgpu/soc15.c    | 14 ++++++++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a58b0cf9da51..b735e20888a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -595,6 +595,7 @@ struct amdgpu_asic_funcs {
         /* invalidate hdp read cache */
         void (*invalidate_hdp)(struct amdgpu_device *adev,
                                struct amdgpu_ring *ring);
+       void (*reset_hdp_ras_error_count)(struct amdgpu_device *adev);
         /* check if the asic needs a full reset of if soft reset will work */
         bool (*need_full_reset)(struct amdgpu_device *adev);
         /* initialize doorbell layout for specific asic*/
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index b746f26f933c..efd52bcf8785 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4149,7 +4149,6 @@ static const struct soc15_reg_entry 
gfx_v9_0_edc_counter_regs[] = {
    { SOC15_REG_ENTRY(GC, 0, mmTCC_EDC_CNT2), 0, 1, 16},
    { SOC15_REG_ENTRY(GC, 0, mmTCA_EDC_CNT), 0, 1, 2},
    { SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT3), 0, 4, 6},
-   { SOC15_REG_ENTRY(HDP, 0, mmHDP_EDC_CNT), 0, 1, 1},
 };

 static int gfx_v9_0_do_edc_gds_workarounds(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 4aa5b9c8e43b..6b717691d554 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -827,6 +827,15 @@ static bool soc15_need_full_reset(struct amdgpu_device 
*adev)
         /* change this when we implement soft reset */
         return true;
 }
+
+static void vega20_reset_hdp_ras_error_count(struct amdgpu_device *adev)
+{
+       if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__HDP))
+               return;
+       /* read back the HDP RAS counter to reset it to 0 */
+       RREG32_SOC15(HDP, 0, mmHDP_EDC_CNT);
+}
+
 static void soc15_get_pcie_usage(struct amdgpu_device *adev, uint64_t *count0,
                                  uint64_t *count1)
 {
@@ -994,6 +1003,7 @@ static const struct amdgpu_asic_funcs vega20_asic_funcs =
         .get_config_memsize = &soc15_get_config_memsize,
         .flush_hdp = &soc15_flush_hdp,
         .invalidate_hdp = &soc15_invalidate_hdp,
+       .reset_hdp_ras_error_count = &vega20_reset_hdp_ras_error_count,
         .need_full_reset = &soc15_need_full_reset,
         .init_doorbell_index = &vega20_doorbell_index_init,
         .get_pcie_usage = &vega20_get_pcie_usage,
@@ -1239,6 +1249,10 @@ static int soc15_common_late_init(void *handle)
         if (amdgpu_sriov_vf(adev))
                 xgpu_ai_mailbox_get_irq(adev);

+       if (adev->asic_funcs &&
+           adev->asic_funcs->reset_hdp_ras_error_count)
+               adev->asic_funcs->reset_hdp_ras_error_count(adev);
+
         if (adev->nbio.funcs->ras_late_init)
                 r = adev->nbio.funcs->ras_late_init(adev);

--
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for HDP

Reply via email to