On Thu, Dec 18, 2025 at 05:21:01PM +0100, Thomas Hellström wrote:
> Introduce an rw-semaphore to serialize migration to device if
> it's likely that migration races with another device migration
> of the same CPU address space range.
> This is a temporary fix to attempt to mitigate a livelock that
> might happen if many devices try to migrate a range at the same
> time, and it affects only devices using the xe driver.
> A longer term fix is probably improvements in the core mm
> migration layer.

+1, yes we can likely do some core MM changes to make this work a bit
better.

Until then:
Reviewed-by: Matthew Brost <[email protected]>

> 
> Suggested-by: Matthew Brost <[email protected]>
> Signed-off-by: Thomas Hellström <[email protected]>
> ---
>  drivers/gpu/drm/xe/xe_svm.c | 22 ++++++++++++++++++++--
>  1 file changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
> index 7be4d129247d..dce7879b3133 100644
> --- a/drivers/gpu/drm/xe/xe_svm.c
> +++ b/drivers/gpu/drm/xe/xe_svm.c
> @@ -1616,10 +1616,12 @@ struct drm_pagemap *xe_vma_resolve_pagemap(struct xe_vma *vma, struct xe_tile *t
>  int xe_svm_alloc_vram(struct xe_svm_range *range, const struct drm_gpusvm_ctx *ctx,
>                     struct drm_pagemap *dpagemap)
>  {
> +     static DECLARE_RWSEM(driver_migrate_lock);
>       struct xe_vm *vm = range_to_vm(&range->base);
>       enum drm_gpusvm_scan_result migration_state;
>       struct xe_device *xe = vm->xe;
>       int err, retries = 1;
> +     bool write_locked = false;
>  
>       xe_assert(range_to_vm(&range->base)->xe, range->base.pages.flags.migrate_devmem);
>       range_debug(range, "ALLOCATE VRAM");
> @@ -1638,16 +1640,32 @@ int xe_svm_alloc_vram(struct xe_svm_range *range, const struct drm_gpusvm_ctx *c
>               drm_dbg(&xe->drm, "Request migration to device memory on \"%s\".\n",
>                       dpagemap->drm->unique);
>  
> +     err = down_read_interruptible(&driver_migrate_lock);
> +     if (err)
> +             return err;
>       do {
>               err = drm_pagemap_populate_mm(dpagemap, xe_svm_range_start(range),
>                                             xe_svm_range_end(range),
>                                             range->base.gpusvm->mm,
>                                             ctx->timeslice_ms);
>  
> -             if (err == -EBUSY && retries)
> -                     drm_gpusvm_range_evict(range->base.gpusvm, &range->base);
> +             if (err == -EBUSY && retries) {
> +                     if (!write_locked) {
> +                             int lock_err;
>  
> +                             up_read(&driver_migrate_lock);
> +                             lock_err = down_write_killable(&driver_migrate_lock);
> +                             if (lock_err)
> +                                     return lock_err;
> +                             write_locked = true;
> +                     }
> +                     drm_gpusvm_range_evict(range->base.gpusvm, &range->base);
> +             }
>       } while (err == -EBUSY && retries--);
> +     if (write_locked)
> +             up_write(&driver_migrate_lock);
> +     else
> +             up_read(&driver_migrate_lock);
>  
>       return err;
>  }
> -- 
> 2.51.1
> 

Reply via email to