On 6/16/26 11:14, Geoffrey McRae wrote: > From: geomcrae_amdeng <[email protected]> > > Fix a sysfs duplication error when reinitializing the device: > > sysfs: cannot create duplicate filename '.../ip_discovery' > kobject_add_internal failed for ip_discovery with -EEXIST > ... > Failed to create device file mem_info_preempt_used (-17) > > The failure is caused by stale sysfs entries not being removed during > device teardown, leading to -EEXIST when the driver is reprobed. In > particular: > > - amdgpu_discovery sysfs kobjects were not fully torn down early enough, > and ip_top remained non-NULL after cleanup > - the preempt manager sysfs attribute was removed only conditionally > and not during the common hw fini path > > Fix this by: > - making amdgpu_discovery_sysfs_fini() externally visible and clearing > adev->discovery.ip_top to prevent reuse > - calling amdgpu_discovery_sysfs_fini() and > amdgpu_preempt_mgr_sysfs_fini() from > amdgpu_device_sys_interface_fini() > > This ensures sysfs state is fully cleaned up before reprobe and avoids > duplicate kobject/file creation. > > Cc: Christian König <[email protected]> > Cc: Alex Deucher <[email protected]> > Signed-off-by: geomcrae_amdeng <[email protected]>
Reviewed-by: Christian König <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 5 ++--- > drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c | 14 +++++++++++--- > drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 1 + > 5 files changed, 20 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 91f2506b9529..e72924976994 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -3678,6 +3678,10 @@ static void amdgpu_device_sys_interface_fini(struct > amdgpu_device *adev) > amdgpu_pm_sysfs_fini(adev); > if (adev->ucode_sysfs_en) > amdgpu_ucode_sysfs_fini(adev); > + > + amdgpu_discovery_sysfs_fini(adev); > + amdgpu_preempt_mgr_sysfs_fini(adev); > + > amdgpu_device_attr_sysfs_fini(adev); > amdgpu_fru_sysfs_fini(adev); > > @@ -4211,6 +4215,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) > > if (adev->mman.initialized) > drain_workqueue(adev->mman.bdev.wq); > + > adev->shutdown = true; > > unregister_pm_notifier(&adev->pm_nb); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c > index 0c51e0fead40..a229fe9d043b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c > @@ -759,8 +759,6 @@ static int amdgpu_discovery_init(struct amdgpu_device > *adev) > return r; > } > > -static void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev); > - > void amdgpu_discovery_fini(struct amdgpu_device *adev) > { > if (adev->discovery.ip_top && !adev->discovery.ip_top->standalone_mode) > @@ -1483,7 +1481,7 @@ static void amdgpu_discovery_sysfs_die_free(struct > ip_die_entry *ip_die_entry) > kobject_put(&ip_die_entry->ip_kset.kobj); > } > > -static void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev) > +void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev) > { > struct ip_discovery_top *ip_top = adev->discovery.ip_top; > struct list_head *el, *tmp; > @@ -1492,6 +1490,7 @@ static void amdgpu_discovery_sysfs_fini(struct > amdgpu_device *adev) > if (!ip_top) > return; > > + adev->discovery.ip_top = NULL; > die_kset = &ip_top->die_kset; > spin_lock(&die_kset->list_lock); > list_for_each_prev_safe(el, tmp, &die_kset->list) { > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h > index edc78184e0f3..5b2b16f68576 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h > @@ -41,6 +41,7 @@ struct amdgpu_discovery_info { > bool reserve_tmr; > }; > > +void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev); > void amdgpu_discovery_fini(struct amdgpu_device *adev); > int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c > index 34b5e22b44e5..37ef5b9eb1cf 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c > @@ -46,6 +46,17 @@ static ssize_t mem_info_preempt_used_show(struct device > *dev, > > static DEVICE_ATTR_RO(mem_info_preempt_used); > > +/** > + * amdgpu_preempt_mgr_sysfs_fini - remove PREEMPT manager sysfs attributes > + * > + * @adev: amdgpu_device pointer > + */ > +void amdgpu_preempt_mgr_sysfs_fini(struct amdgpu_device *adev) > +{ > + if (adev->dev->kobj.sd) > + device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used); > +} > + > /** > * amdgpu_preempt_mgr_new - allocate a new node > * > @@ -137,9 +148,6 @@ void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev) > if (ret) > return; > > - if (adev->dev->kobj.sd) > - device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used); > - > ttm_resource_manager_cleanup(man); > ttm_set_driver_manager(&adev->mman.bdev, AMDGPU_PL_PREEMPT, NULL); > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > index 2d72fa217274..00acec7226f5 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > @@ -140,6 +140,7 @@ int amdgpu_gtt_mgr_init(struct amdgpu_device *adev, > uint64_t gtt_size); > void amdgpu_gtt_mgr_fini(struct amdgpu_device *adev); > int amdgpu_preempt_mgr_init(struct amdgpu_device *adev); > void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev); > +void amdgpu_preempt_mgr_sysfs_fini(struct amdgpu_device *adev); > int amdgpu_vram_mgr_init(struct amdgpu_device *adev); > void amdgpu_vram_mgr_fini(struct amdgpu_device *adev); >
