On Mon, Apr 15, 2024 at 09:53:09PM +0200, Janusz Krzysztofik wrote:
> We defer actually closing, unbinding and destroying a VMA until next idle
> point, or until the object is freed in the meantime.  By postponing the
> unbind, we allow for the VMA to be reopened by the client, avoiding the
> work required to rebind the VMA.
> 
> It was assumed that as long as a GT is held idle, no VMA would be reopened
> while we destroy them.  That assumption is no longer true in multi-GT
> configurations, where a VMA we reopen may be handled by a GT different
> from the one that we already keep active via its engine while we set up
> an execbuf request.
> 
> <4> [260.290809] ------------[ cut here ]------------
> <4> [260.290988] list_del corruption. prev->next should be ffff888118c5d990, 
> but was ffff888118c5a510. (prev=ffff888118c5a510)
> <4> [260.291004] WARNING: CPU: 2 PID: 1143 at lib/list_debug.c:62 
> __list_del_entry_valid_or_report+0xb7/0xe0
> ..
> <4> [260.291055] CPU: 2 PID: 1143 Comm: kms_plane Not tainted 
> 6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1
> <4> [260.291058] Hardware name: Intel Corporation Meteor Lake Client 
> Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024
> <4> [260.291060] RIP: 0010:__list_del_entry_valid_or_report+0xb7/0xe0
> ...
> <4> [260.291087] Call Trace:
> <4> [260.291089]  <TASK>
> <4> [260.291124]  i915_vma_reopen+0x43/0x80 [i915]
> <4> [260.291298]  eb_lookup_vmas+0x9cb/0xcc0 [i915]
> <4> [260.291579]  i915_gem_do_execbuffer+0xc9a/0x26d0 [i915]
> <4> [260.291883]  i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915]
> ...
> <4> [260.292301]  </TASK>
> ...
> <4> [260.292506] ---[ end trace 0000000000000000 ]---
> <4> [260.292782] general protection fault, probably for non-canonical address 
> 0x6b6b6b6b6b6b6ca3: 0000 [#1] PREEMPT SMP NOPTI
> <4> [260.303575] CPU: 2 PID: 1143 Comm: kms_plane Tainted: G        W         
>  6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1
> <4> [260.313851] Hardware name: Intel Corporation Meteor Lake Client 
> Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024
> <4> [260.326359] RIP: 0010:eb_validate_vmas+0x114/0xd80 [i915]
> ...
> <4> [260.428756] Call Trace:
> <4> [260.431192]  <TASK>
> <4> [639.283393]  i915_gem_do_execbuffer+0xd05/0x26d0 [i915]
> <4> [639.305245]  i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915]
> ...
> <4> [639.411134]  </TASK>
> ...
> <4> [639.449979] ---[ end trace 0000000000000000 ]---
> 
> As soon as we start unbinding and destroying a VMA, marked it as parked,
> and also keep it marked as closed for the rest of its life.  When a VMA
> to be opened occurs closed, reopen it only if not yet parked.
> 
> v3: Fix misplaced brackets.
> v2: Since we no longer re-init the VMA closed list link on VMA park so it
>     looks like still on a list, don't try to delete it from the list again
>     after the VMA has been marked as parked.
> 
> Fixes: b0647a5e79b1 ("drm/i915: Avoid live-lock with i915_vma_parked()")

what about reverting that?

> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10608
> Signed-off-by: Janusz Krzysztofik <janusz.krzyszto...@linux.intel.com>
> Cc: Chris Wilson <chris.p.wil...@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursu...@intel.com>
> Cc: sta...@vger.kernel.org # v6.0+
> ---
>  .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 10 ++++--
>  drivers/gpu/drm/i915/i915_vma.c               | 32 +++++++++++++++----
>  drivers/gpu/drm/i915/i915_vma.h               |  2 +-
>  drivers/gpu/drm/i915/i915_vma_types.h         |  3 ++
>  4 files changed, 37 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
> b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 42619fc05de48..97e014f94002e 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -847,9 +847,12 @@ static int __eb_add_lut(struct i915_execbuffer *eb,
>       if (unlikely(!lut))
>               return -ENOMEM;
>  
> +     if (!i915_vma_open(vma)) {
> +             err = -EEXIST;  /* let eb_vma_lookup() retry */
> +             goto err_lut_free;
> +     }
> +
>       i915_vma_get(vma);
> -     if (!atomic_fetch_inc(&vma->open_count))
> -             i915_vma_reopen(vma);
>       lut->handle = handle;
>       lut->ctx = ctx;
>  
> @@ -880,8 +883,9 @@ static int __eb_add_lut(struct i915_execbuffer *eb,
>       return 0;
>  
>  err:
> -     i915_vma_close(vma);
>       i915_vma_put(vma);
> +     i915_vma_close(vma);
> +err_lut_free:
>       i915_lut_handle_free(lut);
>       return err;
>  }
> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
> index d2f064d2525cc..4435c76f28c8c 100644
> --- a/drivers/gpu/drm/i915/i915_vma.c
> +++ b/drivers/gpu/drm/i915/i915_vma.c
> @@ -1735,14 +1735,33 @@ static void __i915_vma_remove_closed(struct i915_vma 
> *vma)
>       list_del_init(&vma->closed_link);
>  }
>  
> -void i915_vma_reopen(struct i915_vma *vma)
> +static struct i915_vma *i915_vma_reopen(struct i915_vma *vma)
> +{
> +     if (atomic_read(&vma->flags) & I915_VMA_PARKED)
> +             return NULL;
> +
> +     __i915_vma_remove_closed(vma);
> +     return vma;
> +}
> +
> +struct i915_vma *i915_vma_open(struct i915_vma *vma)
>  {
>       struct intel_gt *gt = vma->vm->gt;
>  
> +     if (atomic_inc_not_zero(&vma->open_count))
> +             return vma;
> +
>       spin_lock_irq(&gt->closed_lock);
> -     if (i915_vma_is_closed(vma))
> -             __i915_vma_remove_closed(vma);
> +     if (!atomic_inc_not_zero(&vma->open_count)) {
> +             if (i915_vma_is_closed(vma))
> +                     vma = i915_vma_reopen(vma);
> +
> +             if (vma)
> +                     atomic_inc(&vma->open_count);
> +     }
>       spin_unlock_irq(&gt->closed_lock);
> +
> +     return vma;
>  }
>  
>  static void force_unbind(struct i915_vma *vma)
> @@ -1770,7 +1789,8 @@ static void release_references(struct i915_vma *vma, 
> struct intel_gt *gt,
>       spin_unlock(&obj->vma.lock);
>  
>       spin_lock_irq(&gt->closed_lock);
> -     __i915_vma_remove_closed(vma);
> +     if (!(atomic_read(&vma->flags) & I915_VMA_PARKED))
> +             __i915_vma_remove_closed(vma);
>       spin_unlock_irq(&gt->closed_lock);
>  
>       if (vm_ddestroy)
> @@ -1854,22 +1874,22 @@ void i915_vma_parked(struct intel_gt *gt)
>               }
>  
>               list_move(&vma->closed_link, &closed);
> +             atomic_or(I915_VMA_PARKED, &vma->flags);
>       }
>       spin_unlock_irq(&gt->closed_lock);
>  
> -     /* As the GT is held idle, no vma can be reopened as we destroy them */
>       list_for_each_entry_safe(vma, next, &closed, closed_link) {
>               struct drm_i915_gem_object *obj = vma->obj;
>               struct i915_address_space *vm = vma->vm;
>  
>               if (i915_gem_object_trylock(obj, NULL)) {
> -                     INIT_LIST_HEAD(&vma->closed_link);
>                       i915_vma_destroy(vma);
>                       i915_gem_object_unlock(obj);
>               } else {
>                       /* back you go.. */
>                       spin_lock_irq(&gt->closed_lock);
>                       list_add(&vma->closed_link, &gt->closed_vma);
> +                     atomic_andnot(I915_VMA_PARKED, &vma->flags);
>                       spin_unlock_irq(&gt->closed_lock);
>               }
>  
> diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
> index e356dfb883d34..331d19672c764 100644
> --- a/drivers/gpu/drm/i915/i915_vma.h
> +++ b/drivers/gpu/drm/i915/i915_vma.h
> @@ -268,7 +268,7 @@ int __must_check i915_vma_unbind_async(struct i915_vma 
> *vma, bool trylock_vm);
>  int __must_check i915_vma_unbind_unlocked(struct i915_vma *vma);
>  void i915_vma_unlink_ctx(struct i915_vma *vma);
>  void i915_vma_close(struct i915_vma *vma);
> -void i915_vma_reopen(struct i915_vma *vma);
> +struct i915_vma *i915_vma_open(struct i915_vma *vma);
>  
>  void i915_vma_destroy_locked(struct i915_vma *vma);
>  void i915_vma_destroy(struct i915_vma *vma);
> diff --git a/drivers/gpu/drm/i915/i915_vma_types.h 
> b/drivers/gpu/drm/i915/i915_vma_types.h
> index 559de74d0b114..41784c3025349 100644
> --- a/drivers/gpu/drm/i915/i915_vma_types.h
> +++ b/drivers/gpu/drm/i915/i915_vma_types.h
> @@ -263,6 +263,9 @@ struct i915_vma {
>  #define I915_VMA_SCANOUT_BIT 17
>  #define I915_VMA_SCANOUT     ((int)BIT(I915_VMA_SCANOUT_BIT))
>  
> +#define I915_VMA_PARKED_BIT  18
> +#define I915_VMA_PARKED              ((int)BIT(I915_VMA_PARKED_BIT))
> +
>       struct i915_active active;
>  
>  #define I915_VMA_PAGES_BIAS 24
> -- 
> 2.44.0
> 

Reply via email to