AMD General
> -----Original Message-----
> From: Chai, Thomas <[email protected]>
> Sent: Monday, May 18, 2026 3:22 PM
> To: [email protected]
> Cc: Chai, Thomas <[email protected]>; Zhang, Hawking
> <[email protected]>; Zhou1, Tao <[email protected]>; Yang,
> Stanley <[email protected]>; Chai, Thomas <[email protected]>
> Subject: [PATCH 7/7] drm/amdgpu: check and drop invalid bad page records
>
> Check and drop invalid bad page records.
>
> Signed-off-by: YiPeng Chai <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 27
> +++++++++++++++++++++++++
> 1 file changed, 27 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 57f13ad5605a..b0ef0800b380 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3094,6 +3094,20 @@ static int amdgpu_ras_mca2pa(struct
> amdgpu_device *adev,
> return -EINVAL;
> }
>
> +static bool __check_record_in_range(struct amdgpu_device *adev,
> + struct eeprom_table_record *bps, int count) {
> + int i;
> +
> + for (i = 0; i < count; i++) {
> + if (bps[i].retired_page >=
> + (adev->gmc.real_vram_size >>
> AMDGPU_GPU_PAGE_SHIFT))
> + return false;
> + }
> +
> + return true;
> +}
> +
> static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
> struct eeprom_table_record *bps, int
> count) { @@ -3101,6 +3115,14 @@ static int
> __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> struct ras_err_handler_data *data = con->eh_data;
>
> + if (!__check_record_in_range(adev, bps, count)) {
> + dev_warn(adev->dev,
> + "Recorded address out of range: 0x%llx, 0x%llx, 0x%x,
> 0x%x\n",
> + bps[0].address, bps[0].retired_page,
> + bps[0].mem_channel, bps[0].mcumc_id);
[Tao] Can we move the log into __check_record_in_range(adev, bps, count)? Then
we could print out the info of the out-of-range bps[i] instead of always bps[0].
> + return 0;
> + }
> +
> for (j = 0; j < count; j++) {
> if (!data->space_left &&
> amdgpu_ras_realloc_eh_data_space(adev, data, 256))
> { @@ -5642,6 +5664,11 @@ int amdgpu_ras_reserve_page(struct
> amdgpu_device *adev, uint64_t pfn)
> uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT;
> int ret = 0;
>
> + if (pfn >= (adev->gmc.real_vram_size >>
> AMDGPU_GPU_PAGE_SHIFT)) {
> + dev_warn(adev->dev, "Ignoring out-of-range bad page
> 0x%llx", start);
> + return 0;
> + }
> +
> if (amdgpu_ras_check_critical_address(adev, start))
> return 0;
>
> --
> 2.43.0