Check RMA status in bad page retirement flow.

Signed-off-by: Tao Zhou <tao.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 +++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c |  7 +++----
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 934dfb2bf9e5..a6da44ac3fbd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2049,8 +2049,9 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
        struct amdgpu_device *adev = obj->adev;
        struct amdgpu_ras_block_object *block_obj =
                amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-       if (!block_obj)
+       if (!block_obj || !con)
                return;
 
        /* both query_poison_status and handle_poison_consumption are optional,
@@ -2074,7 +2075,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
                poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
 
        /* gpu reset is fallback for failed and default cases */
-       if (poison_stat) {
+       if (poison_stat || con->is_rma) {
                dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
                                block_obj->ras_comm.name);
                amdgpu_ras_reset_gpu(adev);
@@ -2817,6 +2818,9 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
                schedule_delayed_work(&con->page_retirement_dwork,
                        msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL));
        mutex_unlock(&con->umc_ecc_log.lock);
+
+       if (err_data->err_addr_cnt && con->is_rma)
+               amdgpu_ras_reset_gpu(adev);
 }
 
 static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
@@ -2867,7 +2871,7 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
        if (poison_msg->pasid_fn)
                poison_msg->pasid_fn(adev, pasid, poison_msg->data);
 
-       if (reset) {
+       if (reset && !con->is_rma) {
                flush_delayed_work(&con->page_retirement_dwork);
 
                con->gpu_reset_flags |= reset;
@@ -3983,6 +3987,12 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
+       /* mode1 is the only selection for RMA status */
+       if (ras->is_rma) {
+               ras->gpu_reset_flags = 0;
+               ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+       }
+
        if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
                amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
        return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 1dbe69eabb9a..5f3866548cb8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -195,7 +195,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
        kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
        amdgpu_umc_handle_bad_pages(adev, ras_error_status);
 
-       if (err_data->ue_count && reset) {
+       if (err_data->ue_count && (reset || con->is_rma)) {
                con->gpu_reset_flags |= reset;
                amdgpu_ras_reset_gpu(adev);
        }
@@ -211,6 +211,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
                .block = AMDGPU_RAS_BLOCK__UMC,
        };
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        uint32_t timeout = timeout_ms;
 
        memset(&err_data, 0, sizeof(err_data));
@@ -243,9 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
 
        kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 
-       if (reset) {
-               struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-
+       if (reset || (err_data.err_addr_cnt && con->is_rma)) {
                con->gpu_reset_flags |= reset;
                amdgpu_ras_reset_gpu(adev);
        }
-- 
2.34.1

Reply via email to