Check bad page records against the VRAM size and drop out-of-range ones instead of restoring or reserving them.
Signed-off-by: YiPeng Chai <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 27 +++++++++++++++++++++++++
1 file changed, 27 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 57f13ad5605a..b0ef0800b380 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3094,6 +3094,20 @@ static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
return -EINVAL;
}
+/*
+ * Report whether every record in @bps has a retired_page strictly below
+ * adev->gmc.real_vram_size >> AMDGPU_GPU_PAGE_SHIFT (presumably the number
+ * of GPU pages in VRAM -- confirm against the gmc definition).
+ *
+ * @adev:  device whose gmc.real_vram_size supplies the upper bound
+ * @bps:   array of EEPROM table records to inspect
+ * @count: number of entries in @bps; nothing is inspected when it is <= 0
+ *
+ * Returns true when all @count records are below the bound, false as soon
+ * as one record is at or above it.
+ */
+static bool __check_record_in_range(struct amdgpu_device *adev,
+		struct eeprom_table_record *bps, int count)
+{
+	struct eeprom_table_record *rec = bps;
+	int remaining;
+
+	/* Walk the array by pointer; stop at the first offending record. */
+	for (remaining = count; remaining > 0; remaining--, rec++) {
+		if (!(rec->retired_page <
+		      (adev->gmc.real_vram_size >> AMDGPU_GPU_PAGE_SHIFT)))
+			return false;
+	}
+
+	return true;
+}
+
static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int
count)
{
@@ -3101,6 +3115,14 @@ static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data = con->eh_data;
+ if (!__check_record_in_range(adev, bps, count)) {
+ dev_warn(adev->dev,
+ "Recorded address out of range: 0x%llx, 0x%llx, 0x%x,
0x%x\n",
+ bps[0].address, bps[0].retired_page,
+ bps[0].mem_channel, bps[0].mcumc_id);
+ return 0;
+ }
+
for (j = 0; j < count; j++) {
if (!data->space_left &&
amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
@@ -5642,6 +5664,11 @@ int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn)
uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT;
int ret = 0;
+ if (pfn >= (adev->gmc.real_vram_size >> AMDGPU_GPU_PAGE_SHIFT)) {
+ dev_warn(adev->dev, "Ignoring out-of-range bad page 0x%llx",
start);
+ return 0;
+ }
+
if (amdgpu_ras_check_critical_address(adev, start))
return 0;
--
2.43.0