[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Russell, Kent <[email protected]>
> Sent: Thursday, February 5, 2026 1:52 AM
> To: [email protected]
> Cc: Liu, Xiang(Dean) <[email protected]>; Zhou1, Tao <[email protected]>;
> Russell, Kent <[email protected]>
> Subject: [PATCH] drm/amdgpu: Send applicable RMA CPERs at end of RAS init
>
> Firmware and monitoring tools may not be ready to receive a CPER when we read
> the bad pages, so send the CPERs at the end of RAs initialization to ensure 
> that the

[Tao] RAs -> RAS

> FW is ready to receive and process the CPER. This removes the previous CPER
> submission that was added during bad page load, and sends both in-band and 
> out-
> of-band at the same time.
>
> Signed-off-by: Kent Russell <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |  2 ++
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 27 ++++++++++++++++---
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h    |  2 ++
>  3 files changed, 27 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index b28fcf932f7e..856b1bf83533 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -4650,6 +4650,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
>                       amdgpu_ras_block_late_init_default(adev, 
> &obj->ras_comm);
>       }
>
> +     amdgpu_ras_check_bad_page_status(adev);
> +
>       return 0;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 469d04a39d7d..91de4085a66d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -1712,10 +1712,6 @@ int amdgpu_ras_eeprom_check(struct
> amdgpu_ras_eeprom_control *control)
>                       dev_warn(adev->dev, "RAS records:%u exceeds 90%% of
> threshold:%d",
>                                       control->ras_num_bad_pages,
>                                       ras->bad_page_cnt_threshold);
> -             if (amdgpu_bad_page_threshold != 0 &&
> -                     control->ras_num_bad_pages >= ras-
> >bad_page_cnt_threshold)
> -                     amdgpu_dpm_send_rma_reason(adev);
> -
>       } else if (hdr->header == RAS_TABLE_HDR_BAD &&
>                  amdgpu_bad_page_threshold != 0) {
>               if (hdr->version >= RAS_TABLE_VER_V2_1) { @@ -1932,3
> +1928,26 @@ int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev,
>                                                                          
> result);
>       return -EOPNOTSUPP;
>  }
> +
> +void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev) {
> +     struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +     struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control
> +: NULL;
> +
> +     if (!control || amdgpu_bad_page_threshold == 0)
> +             return;
> +
> +     if (control->ras_num_bad_pages >= ras->bad_page_cnt_threshold) {
> +             if (amdgpu_dpm_send_rma_reason(adev))
> +                     dev_warn(adev->dev, "Unable to send in-band RMA CPER");

[Tao] This one is the OOB (out-of-band) CPER, and the following one is the IB (in-band) CPER, so the two log messages are swapped.

With my concerns fixed, the patch is: Reviewed-by: Tao Zhou <[email protected]>

> +             else
> +                     dev_dbg(adev->dev, "Sent in-band RMA CPER");
> +
> +             if (adev->cper.enabled && !amdgpu_uniras_enabled(adev)) {
> +                     if (amdgpu_cper_generate_bp_threshold_record(adev))
> +                             dev_warn(adev->dev, "Unable to send out-of-band
> RMA CPER");
> +                     else
> +                             dev_dbg(adev->dev, "Sent out-of-band RMA CPER");
> +             }
> +     }
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index 2e5d63957e71..a62114800a92 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> @@ -193,6 +193,8 @@ int amdgpu_ras_eeprom_read_idx(struct
> amdgpu_ras_eeprom_control *control,
>
>  int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control
> *control);
>
> +void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev);
> +
>  extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
>  extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;
>
> --
> 2.43.0

Reply via email to