On Thu, Nov 13, 2025 at 8:12 AM Lijo Lazar <[email protected]> wrote:
>
> Unregister mce notifier on unload.
>
> Signed-off-by: Lijo Lazar <[email protected]>
Acked-by: Alex Deucher <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 28 ++++++++++++++++++++++++-
> 1 file changed, 27 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 434a7e057dc9..d5f132f5ae63 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -150,6 +150,8 @@ static void amdgpu_ras_critical_region_fini(struct
> amdgpu_device *adev);
>
> #ifdef CONFIG_X86_MCE_AMD
> static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device
> *adev);
> +static void
> +amdgpu_unregister_bad_pages_mca_notifier(struct amdgpu_device *adev);
> struct mce_notifier_adev_list {
> 	struct amdgpu_device *devs[MAX_GPU_INSTANCE];
> 	int num_gpu;
> @@ -3954,7 +3956,9 @@ static int amdgpu_ras_recovery_fini(struct
> amdgpu_device *adev)
> 	mutex_unlock(&con->recovery_lock);
>
> 	amdgpu_ras_critical_region_init(adev);
> -
> +#ifdef CONFIG_X86_MCE_AMD
> +	amdgpu_unregister_bad_pages_mca_notifier(adev);
> +#endif
> 	return 0;
> }
> /* recovery end */
> @@ -4989,6 +4993,28 @@ static void
> amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
> 		notifier_registered = true;
> 	}
> }
> +static void amdgpu_unregister_bad_pages_mca_notifier(struct amdgpu_device
> *adev)
> +{
> +	int i, j;
> +
> +	if (!notifier_registered && !mce_adev_list.num_gpu)
> +		return;
> +	for (i = 0, j = 0; i < mce_adev_list.num_gpu; i++) {
> +		if (mce_adev_list.devs[i] == adev)
> +			mce_adev_list.devs[i] = NULL;
> +		if (!mce_adev_list.devs[i])
> +			++j;
> +	}
> +
> +	if (j == mce_adev_list.num_gpu) {
> +		mce_adev_list.num_gpu = 0;
> +		/* Unregister x86 notifier with MCE subsystem. */
> +		if (notifier_registered) {
> +			mce_unregister_decode_chain(&amdgpu_bad_page_nb);
> +			notifier_registered = false;
> +		}
> +	}
> +}
> #endif
>
> struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev)
> --
> 2.49.0
>
