Firmware and monitoring tools may not be ready to receive a CPER when we
read the bad pages, so send the CPERs at the end of RAS initialization
to ensure that the FW is ready to receive and process them. This
removes the previous CPER submission that was added during bad page
load, and sends both the in-band and out-of-band CPERs at the same time.

Signed-off-by: Kent Russell <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |  2 ++
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 27 ++++++++++++++++---
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h    |  2 ++
 3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b28fcf932f7e..856b1bf83533 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4650,6 +4650,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
                        amdgpu_ras_block_late_init_default(adev, 
&obj->ras_comm);
        }
 
+       amdgpu_ras_check_bad_page_status(adev);
+
        return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 469d04a39d7d..91de4085a66d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -1712,10 +1712,6 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
                        dev_warn(adev->dev, "RAS records:%u exceeds 90%% of 
threshold:%d",
                                        control->ras_num_bad_pages,
                                        ras->bad_page_cnt_threshold);
-               if (amdgpu_bad_page_threshold != 0 &&
-                       control->ras_num_bad_pages >= 
ras->bad_page_cnt_threshold)
-                       amdgpu_dpm_send_rma_reason(adev);
-
        } else if (hdr->header == RAS_TABLE_HDR_BAD &&
                   amdgpu_bad_page_threshold != 0) {
                if (hdr->version >= RAS_TABLE_VER_V2_1) {
@@ -1932,3 +1928,26 @@ int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev,
                                                                           
result);
        return -EOPNOTSUPP;
 }
+
+void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev)
+{
+       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+       struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control 
: NULL;
+
+       if (!control || amdgpu_bad_page_threshold == 0)
+               return;
+
+       if (control->ras_num_bad_pages >= ras->bad_page_cnt_threshold) {
+               if (amdgpu_dpm_send_rma_reason(adev))
+                       dev_warn(adev->dev, "Unable to send in-band RMA CPER");
+               else
+                       dev_dbg(adev->dev, "Sent in-band RMA CPER");
+
+               if (adev->cper.enabled && !amdgpu_uniras_enabled(adev)) {
+                       if (amdgpu_cper_generate_bp_threshold_record(adev))
+                               dev_warn(adev->dev, "Unable to send out-of-band 
RMA CPER");
+                       else
+                               dev_dbg(adev->dev, "Sent out-of-band RMA CPER");
+               }
+       }
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index 2e5d63957e71..a62114800a92 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -193,6 +193,8 @@ int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control,
 
 int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control 
*control);
 
+void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev);
+
 extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
 extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;
 
-- 
2.43.0

Reply via email to