In ras poison mode, a umc uncorrectable error will be ignored until
the corrupted data is consumed by another ras module (such as gfx or sdma).

Signed-off-by: Tao Zhou <tao.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 34 +++++++++++++++----------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7b7e54fdd785..195637725c7f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1473,22 +1473,28 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
                data->rptr = (data->aligned_element_size +
                                data->rptr) % data->ring_size;
 
-               /* Let IP handle its data, maybe we need get the output
-                * from the callback to udpate the error type/count, etc
-                */
                if (data->cb) {
-                       ret = data->cb(obj->adev, &err_data, &entry);
-                       /* ue will trigger an interrupt, and in that case
-                        * we need do a reset to recovery the whole system.
-                        * But leave IP do that recovery, here we just dispatch
-                        * the error.
-                        */
-                       if (ret == AMDGPU_RAS_SUCCESS) {
-                               /* these counts could be left as 0 if
-                                * some blocks do not count error number
+                       if (amdgpu_ras_is_poison_enabled(obj->adev) &&
+                           obj->head.block == AMDGPU_RAS_BLOCK__UMC)
+                               dev_warn(obj->adev->dev, "Poison mode, no need 
to do page retirement"
+                                                               "in UMC ras 
handler!\n");
+                       else {
+                               /* Let IP handle its data, maybe we need get 
the output
+                                * from the callback to udpate the error 
type/count, etc
+                                */
+                               ret = data->cb(obj->adev, &err_data, &entry);
+                               /* ue will trigger an interrupt, and in that 
case
+                                * we need do a reset to recovery the whole 
system.
+                                * But leave IP do that recovery, here we just 
dispatch
+                                * the error.
                                 */
-                               obj->err_data.ue_count += err_data.ue_count;
-                               obj->err_data.ce_count += err_data.ce_count;
+                               if (ret == AMDGPU_RAS_SUCCESS) {
+                                       /* these counts could be left as 0 if
+                                        * some blocks do not count error number
+                                        */
+                                       obj->err_data.ue_count += 
err_data.ue_count;
+                                       obj->err_data.ce_count += 
err_data.ce_count;
+                               }
                        }
                }
        }
-- 
2.17.1

Reply via email to