[AMD Official Use Only - AMD Internal Distribution Only]

Ping for this series.
> -----Original Message----- > From: Zhou1, Tao <tao.zh...@amd.com> > Sent: Friday, July 11, 2025 5:06 PM > To: amd-gfx@lists.freedesktop.org > Cc: Zhou1, Tao <tao.zh...@amd.com> > Subject: [PATCH 1/2] drm/amdgpu: add range check for RAS bad page address > > Exclude invalid bad pages. > > Signed-off-by: Tao Zhou <tao.zh...@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 58 ++++++++++++------------- > 1 file changed, 28 insertions(+), 30 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index a6f512293b5c..1d6d4625abb3 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -136,9 +136,9 @@ enum amdgpu_ras_retire_page_reservation { > > atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); > > -static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, > +static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, > uint64_t addr); > -static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, > +static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev, > uint64_t addr); > #ifdef CONFIG_X86_MCE_AMD > static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device > *adev); > @@ -169,18 +169,16 @@ static int amdgpu_reserve_page_direct(struct > amdgpu_device *adev, uint64_t addre > struct eeprom_table_record err_rec; > int ret; > > - if ((address >= adev->gmc.mc_vram_size) || > - (address >= RAS_UMC_INJECT_ADDR_LIMIT)) { > + ret = amdgpu_ras_check_bad_page(adev, address); > + if (ret == -EINVAL) { > dev_warn(adev->dev, > - "RAS WARN: input address 0x%llx is invalid.\n", > - address); > + "RAS WARN: input address 0x%llx is invalid.\n", > + address); > return -EINVAL; > - } > - > - if (amdgpu_ras_check_bad_page(adev, address)) { > + } else if (ret == 1) { > dev_warn(adev->dev, > - "RAS WARN: 0x%llx has already been marked as bad > page!\n", > - address); > + "RAS WARN: 0x%llx has already been marked as bad > 
page!\n", > + address); > return 0; > } > > @@ -513,22 +511,16 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file > *f, > ret = amdgpu_ras_feature_enable(adev, &data.head, 1); > break; > case 2: > - if ((data.inject.address >= adev->gmc.mc_vram_size && > - adev->gmc.mc_vram_size) || > - (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) { > - dev_warn(adev->dev, "RAS WARN: input address " > - "0x%llx is invalid.", > + /* umc ce/ue error injection for a bad page is not allowed */ > + if (data.head.block == AMDGPU_RAS_BLOCK__UMC) > + ret = amdgpu_ras_check_bad_page(adev, > data.inject.address); > + if (ret == -EINVAL) { > + dev_warn(adev->dev, "RAS WARN: input address 0x%llx is > invalid.", > data.inject.address); > - ret = -EINVAL; > break; > - } > - > - /* umc ce/ue error injection for a bad page is not allowed */ > - if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) && > - amdgpu_ras_check_bad_page(adev, data.inject.address)) { > - dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has " > - "already been marked as bad!\n", > - data.inject.address); > + } else if (ret == 1) { > + dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has > already > been marked as bad!\n", > + data.inject.address); > break; > } > > @@ -3122,18 +3114,24 @@ static int amdgpu_ras_load_bad_pages(struct > amdgpu_device *adev) > return ret; > } > > -static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, > +static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, > uint64_t addr) > { > struct ras_err_handler_data *data = con->eh_data; > + struct amdgpu_device *adev = con->adev; > int i; > > + if ((addr >= adev->gmc.mc_vram_size && > + adev->gmc.mc_vram_size) || > + (addr >= RAS_UMC_INJECT_ADDR_LIMIT)) > + return -EINVAL; > + > addr >>= AMDGPU_GPU_PAGE_SHIFT; > for (i = 0; i < data->count; i++) > if (addr == data->bps[i].retired_page) > - return true; > + return 1; > > - return false; > + return 0; > } > > /* > @@ -3141,11 +3139,11 @@ static bool > 
amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, > * > * Note: this check is only for umc block > */ > -static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, > +static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev, > uint64_t addr) > { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > - bool ret = false; > + int ret = 0; > > if (!con || !con->eh_data) > return ret; > -- > 2.34.1