On 12/11/25 16:53, Tvrtko Ursulin wrote:
>
> On 11/12/2025 13:16, Christian König wrote:
>> This allows amdgpu_fences to outlive the amdgpu module.
>>
>> v2: use dma_fence_get_rcu_safe to be NULL safe here.
>>
>> Signed-off-by: Christian König <[email protected]>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 63 +++++++----------------
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 -
>> 2 files changed, 20 insertions(+), 44 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> index c7843e336310..c636347801c1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> @@ -112,8 +112,7 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct
>> amdgpu_fence *af,
>> af->ring = ring;
>> seq = ++ring->fence_drv.sync_seq;
>> - dma_fence_init(fence, &amdgpu_fence_ops,
>> - &ring->fence_drv.lock,
>> + dma_fence_init(fence, &amdgpu_fence_ops, NULL,
>> adev->fence_context + ring->idx, seq);
>> amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
>> @@ -467,7 +466,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring
>> *ring)
>> timer_setup(&ring->fence_drv.fallback_timer, amdgpu_fence_fallback, 0);
>> ring->fence_drv.num_fences_mask = ring->num_hw_submission * 2 - 1;
>> - spin_lock_init(&ring->fence_drv.lock);
>> ring->fence_drv.fences = kcalloc(ring->num_hw_submission * 2,
>> sizeof(void *),
>> GFP_KERNEL);
>> @@ -654,16 +652,20 @@ void amdgpu_fence_driver_set_error(struct
>> amdgpu_ring *ring, int error)
>> struct amdgpu_fence_driver *drv = &ring->fence_drv;
>> unsigned long flags;
>> - spin_lock_irqsave(&drv->lock, flags);
>> + rcu_read_lock();
>> for (unsigned int i = 0; i <= drv->num_fences_mask; ++i) {
>> struct dma_fence *fence;
>> - fence = rcu_dereference_protected(drv->fences[i],
>> - lockdep_is_held(&drv->lock));
>> - if (fence && !dma_fence_is_signaled_locked(fence))
>> + fence = dma_fence_get_rcu(drv->fences[i]);
>
> dma_fence_get_rcu is not safe against passing a NULL fence in, while the
> existing code makes it look like drv->fence[] slot can contain NULL at this
> point?
>
> amdgpu_fence_process() is the place which can NULL the slots? Irq context?
> Why is that safe with no reference held from clearing the slot to operating
> on the fence?
The slots are never cleared. It can only be that they are NULL when the driver
is loaded.
I've switched over to dma_fence_get_rcu_safe() where appropriated.
Regards,
Christian.
>
>> + if (!fence)
>> + continue;
>> +
>> + dma_fence_lock_irqsave(fence, flags);
>> + if (!dma_fence_is_signaled_locked(fence))
>> dma_fence_set_error(fence, error);
>> + dma_fence_unlock_irqrestore(fence, flags);
>> }
>> - spin_unlock_irqrestore(&drv->lock, flags);
>> + rcu_read_unlock();
>> }
>> /**
>> @@ -714,16 +716,19 @@ void
>> amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
>> seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
>> /* mark all fences from the guilty context with an error */
>> - spin_lock_irqsave(&ring->fence_drv.lock, flags);
>> + rcu_read_lock();
>> do {
>> last_seq++;
>> last_seq &= ring->fence_drv.num_fences_mask;
>> ptr = &ring->fence_drv.fences[last_seq];
>> - rcu_read_lock();
>> - unprocessed = rcu_dereference(*ptr);
>> + unprocessed = dma_fence_get_rcu_safe(ptr);
>
> Similar concern like the above.
>
> Regards,
>
> Tvrtko
>> +
>> + if (!unprocessed)
>> + continue;
>> - if (unprocessed && !dma_fence_is_signaled_locked(unprocessed)) {
>> + dma_fence_lock_irqsave(unprocessed, flags);
>> + if (dma_fence_is_signaled_locked(unprocessed)) {
>> fence = container_of(unprocessed, struct amdgpu_fence, base);
>> if (fence == af)
>> @@ -731,9 +736,10 @@ void amdgpu_fence_driver_guilty_force_completion(struct
>> amdgpu_fence *af)
>> else if (fence->context == af->context)
>> dma_fence_set_error(&fence->base, -ECANCELED);
>> }
>> - rcu_read_unlock();
>> + dma_fence_unlock_irqrestore(unprocessed, flags);
>> + dma_fence_put(unprocessed);
>> } while (last_seq != seq);
>> - spin_unlock_irqrestore(&ring->fence_drv.lock, flags);
>> + rcu_read_unlock();
>> /* signal the guilty fence */
>> amdgpu_fence_write(ring, (u32)af->base.seqno);
>> amdgpu_fence_process(ring);
>> @@ -823,39 +829,10 @@ static bool amdgpu_fence_enable_signaling(struct
>> dma_fence *f)
>> return true;
>> }
>> -/**
>> - * amdgpu_fence_free - free up the fence memory
>> - *
>> - * @rcu: RCU callback head
>> - *
>> - * Free up the fence memory after the RCU grace period.
>> - */
>> -static void amdgpu_fence_free(struct rcu_head *rcu)
>> -{
>> - struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
>> -
>> - /* free fence_slab if it's separated fence*/
>> - kfree(to_amdgpu_fence(f));
>> -}
>> -
>> -/**
>> - * amdgpu_fence_release - callback that fence can be freed
>> - *
>> - * @f: fence
>> - *
>> - * This function is called when the reference count becomes zero.
>> - * It just RCU schedules freeing up the fence.
>> - */
>> -static void amdgpu_fence_release(struct dma_fence *f)
>> -{
>> - call_rcu(&f->rcu, amdgpu_fence_free);
>> -}
>> -
>> static const struct dma_fence_ops amdgpu_fence_ops = {
>> .get_driver_name = amdgpu_fence_get_driver_name,
>> .get_timeline_name = amdgpu_fence_get_timeline_name,
>> .enable_signaling = amdgpu_fence_enable_signaling,
>> - .release = amdgpu_fence_release,
>> };
>> /*
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index 7a27c6c4bb44..9cbf63454004 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -125,7 +125,6 @@ struct amdgpu_fence_driver {
>> unsigned irq_type;
>> struct timer_list fallback_timer;
>> unsigned num_fences_mask;
>> - spinlock_t lock;
>> struct dma_fence **fences;
>> };
>>
>