On 2025-09-11 08:09, Christian König wrote:
> It turned out that protecting the status of each bo_va with a
> spinlock was just hiding problems instead of solving them.
> 
> Revert the whole approach, add a separate stats_lock and lockdep
> assertions that the correct reservation lock is held all over the place.
> 
> This not only allows for better checks if a state transition is properly
> protected by a lock, but also switching back to using list macros to
> iterate over the state of lists protected by the dma_resv lock of the
> root PD.
> 
> v2: re-add missing check
> v3: split into two patches
> 
> Signed-off-by: Christian König <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c |   8 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    | 168 +++++++++++-----------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |  15 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c |   4 -
>  4 files changed, 93 insertions(+), 102 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 0ccbd3c5d88d..428f5e8f1cfc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -679,12 +679,12 @@ amdgpu_userq_bo_validate(struct amdgpu_device *adev, 
> struct drm_exec *exec,
>       struct amdgpu_bo *bo;
>       int ret;
>  
> -     spin_lock(&vm->status_lock);
> +     spin_lock(&vm->invalidated_lock);
>       while (!list_empty(&vm->invalidated)) {
>               bo_va = list_first_entry(&vm->invalidated,
>                                        struct amdgpu_bo_va,
>                                        base.vm_status);
> -             spin_unlock(&vm->status_lock);
> +             spin_unlock(&vm->invalidated_lock);
>  
>               bo = bo_va->base.bo;
>               ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2);
> @@ -701,9 +701,9 @@ amdgpu_userq_bo_validate(struct amdgpu_device *adev, 
> struct drm_exec *exec,
>               if (ret)
>                       return ret;
>  
> -             spin_lock(&vm->status_lock);
> +             spin_lock(&vm->invalidated_lock);
>       }
> -     spin_unlock(&vm->status_lock);
> +     spin_unlock(&vm->invalidated_lock);
>  
>       return 0;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index d0c95fb0ef81..fc36d61567d0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -127,6 +127,17 @@ struct amdgpu_vm_tlb_seq_struct {
>       struct dma_fence_cb cb;
>  };
>  
> +/**
> + * amdgpu_vm_assert_locked - check if VM is correctly locked
> + * @vm: the VM which schould be tested
> + *
> + * Asserts that the VM root PD is locked.
> + */
> +static void amdgpu_vm_assert_locked(struct amdgpu_vm *vm)
> +{
> +     dma_resv_assert_held(vm->root.bo->tbo.base.resv);
> +}
> +
>  /**
>   * amdgpu_vm_set_pasid - manage pasid and vm ptr mapping
>   *
> @@ -143,6 +154,8 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, 
> struct amdgpu_vm *vm,
>  {
>       int r;
>  
> +     amdgpu_vm_assert_locked(vm);
> +

This is causing a warning with the following stack trace. Should callers 
acquire vm->stats_lock beforehand?

[    5.046056] ------------[ cut here ]------------
[    5.046057] WARNING: CPU: 3 PID: 750 at 
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c:138 amdgpu_vm_set_pasid+0xe3/0x150 
[amdgpu]
[    5.046194] Modules linked in: nf_tables vfat fat amdgpu(+) snd_soc_dmic 
snd_sof_amd_renoir snd_sof_amd_acp mt7925e snd_sof_pci snd_sof_xtensa_dsp 
mt7925_common snd_sof mt792x_lib snd_ctl_led snd_sof_utils mt76_connac_lib 
snd_soc_core snd_hda_codec_realtek uvcvideo amdxcp intel_rapl_msr snd_compress 
mt76 i2c_algo_bit videobuf2_vmalloc drm_ttm_helper videobuf2_memops 
snd_hda_scodec_component snd_hda_codec_generic intel_rapl_common 
snd_rpl_pci_acp6x ttm btusb uvc snd_acp_pci drm_exec btrtl videobuf2_v4l2 
mac80211 snd_hda_intel snd_amd_acpi_mach btintel snd_intel_dspcfg videodev 
snd_acp_legacy_common gpu_sched snd_hda_codec btbcm kvm_amd btmtk libarc4 
snd_pci_acp6x snd_hda_core drm_suballoc_helper videobuf2_common kvm bluetooth 
wmi_bmof mc irqbypass video snd_pcm cfg80211 drm_panel_backlight_quirks rapl 
snd_pci_acp5x snd_timer drm_buddy snd_rn_pci_acp3x drm_display_helper pcspkr 
snd_acp_config snd snd_soc_acpi i2c_piix4 joydev cec rfkill snd_pci_acp3x 
k10temp i2c_smbus soundcore mousedev wmi amd_pmc acpi_tad mac_hid fuse
[    5.046223]  nfnetlink ip_tables x_tables ext4 crc16 mbcache jbd2 ucsi_acpi 
typec_ucsi serio_raw roles atkbd libps2 polyval_clmulni typec vivaldi_fmap 
i8042 ghash_clmulni_intel nvme sha512_ssse3 hid_multitouch ccp aesni_intel 
nvme_core sp5100_tco thunderbolt serio i2c_hid_acpi i2c_hid usbhid btrfs 
blake2b_generic xor raid6_pq dm_mod crypto_user
[    5.046236] CPU: 3 UID: 0 PID: 750 Comm: (udev-worker) Tainted: G        W   
        6.16.0-MANJARO+ #121 PREEMPT(full)  
5581d345f4f5e1e764643d15b2e2deacb80ec100
[    5.046238] Tainted: [W]=WARN
[    5.046238] Hardware name: HP HP Spectre Laptop 14-fd0xxx - 5CD411LN4C/8CDD, 
BIOS W81 Ver. 00.46.00 05/10/2024
[    5.046239] RIP: 0010:amdgpu_vm_set_pasid+0xe3/0x150 [amdgpu]
[    5.046355] Code: 50 06 00 00 eb 80 48 8b 86 60 03 00 00 be ff ff ff ff 48 
8b b8 78 01 00 00 48 83 c7 68 e8 95 7a 7e ce 85 c0 0f 85 46 ff ff ff <0f> 0b e9 
3f ff ff ff 49 8d bf 48 fc 00 00 89 34 24 e8 77 a9 7f ce
[    5.046356] RSP: 0018:ffffd21fc26f36f8 EFLAGS: 00010246
[    5.046357] RAX: 0000000000000000 RBX: 0000000000008001 RCX: 0000000000000000
[    5.046358] RDX: 0000000000000000 RSI: ffff88bb4db401e8 RDI: ffff88bb51da3c98
[    5.046359] RBP: ffff88bb63bd8000 R08: 0000000000000001 R09: 0000000000000630
[    5.046359] R10: 0000000000000000 R11: ffff88bb51da3ce8 R12: ffff88bb63bd8000
[    5.046360] R13: 0000000000008001 R14: ffff88bb46b6e800 R15: ffff88bb46e80000
[    5.046360] FS:  00007fc90e27c880(0000) GS:ffff88c2ec841000(0000) 
knlGS:0000000000000000
[    5.046361] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    5.046362] CR2: 00007f80bfd0d000 CR3: 0000000111de2000 CR4: 0000000000750ef0
[    5.046363] PKRU: 55555554
[    5.046363] Call Trace:
[    5.046364]  <TASK>
[    5.046366]  amdgpu_driver_open_kms+0x110/0x2b0 [amdgpu 
d618420326a9dd0453fb7178a4017b35db3575a4]
[    5.046491]  drm_file_alloc+0x207/0x300
[    5.046493]  ? drm_client_modeset_create+0xd7/0x110
[    5.046495]  drm_client_init+0x7b/0x160
[    5.046498]  drm_fbdev_client_setup+0x96/0x170
[    5.046501]  drm_client_setup+0x6d/0x90
[    5.046503]  amdgpu_pci_probe+0x319/0x4b0 [amdgpu 
d618420326a9dd0453fb7178a4017b35db3575a4]
[    5.046624]  local_pci_probe+0x3f/0x90
[    5.046626]  pci_device_probe+0xd7/0x260
[    5.046628]  ? sysfs_do_create_link_sd+0x6d/0xd0
[    5.046630]  really_probe+0xdb/0x340
[    5.046632]  ? pm_runtime_barrier+0x55/0x90
[    5.046634]  __driver_probe_device+0x78/0x140
[    5.046635]  driver_probe_device+0x1f/0xa0
[    5.046636]  ? __pfx___driver_attach+0x10/0x10
[    5.046637]  __driver_attach+0xcf/0x1e0
[    5.046639]  bus_for_each_dev+0x78/0xd0
[    5.046642]  bus_add_driver+0x10e/0x1f0
[    5.046643]  ? __pfx_amdgpu_init+0x10/0x10 [amdgpu 
d618420326a9dd0453fb7178a4017b35db3575a4]
[    5.046767]  driver_register+0x75/0xe0
[    5.046769]  ? __pci_register_driver+0x5f/0x80
[    5.046770]  do_one_initcall+0x58/0x390
[    5.046773]  do_init_module+0x62/0x240
[    5.046775]  ? init_module_from_file+0x85/0xc0
[    5.046776]  init_module_from_file+0x85/0xc0
[    5.046780]  idempotent_init_module+0x106/0x300
[    5.046785]  __x64_sys_finit_module+0x6d/0xd0
[    5.046787]  do_syscall_64+0x94/0x390
[    5.046789]  ? touch_atime+0x20/0x210
[    5.046791]  ? filemap_read+0x39c/0x3d0
[    5.046794]  ? __lock_acquire+0x4a1/0x22c0
[    5.046796]  ? __lock_acquire+0x4a1/0x22c0
[    5.046798]  ? lock_acquire+0xc9/0x2f0
[    5.046800]  ? __might_fault+0x3e/0x80
[    5.046802]  ? find_held_lock+0x2b/0x80
[    5.046804]  ? __might_fault+0x3e/0x80
[    5.046805]  ? lock_release+0xdd/0x2e0
[    5.046807]  ? __rseq_handle_notify_resume+0x366/0x590
[    5.046810]  ? trace_hardirqs_off+0x44/0xb0
[    5.046812]  ? exit_to_user_mode_loop+0x3b/0x140
[    5.046813]  ? do_syscall_64+0x16b/0x390
[    5.046815]  ? trace_hardirqs_off+0x44/0xb0
[    5.046816]  ? exit_to_user_mode_loop+0x3b/0x140
[    5.046817]  ? do_syscall_64+0x16b/0x390
[    5.046819]  ? lockdep_hardirqs_on_prepare+0xdb/0x190
[    5.046820]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[    5.046822] RIP: 0033:0x7fc90db1876d
[    5.046825] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 
f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 
f0 ff ff 73 01 c3 48 8b 0d 73 05 0f 00 f7 d8 64 89 01 48
[    5.046825] RSP: 002b:00007ffe53a77d28 EFLAGS: 00000246 ORIG_RAX: 
0000000000000139
[    5.046828] RAX: ffffffffffffffda RBX: 0000560845137e30 RCX: 00007fc90db1876d
[    5.046829] RDX: 0000000000000000 RSI: 00005608451392a0 RDI: 000000000000003f
[    5.046829] RBP: 00007ffe53a77dc0 R08: 0000000000000000 R09: 00005608451393a0
[    5.046830] R10: 0000000000000000 R11: 0000000000000246 R12: 00005608451392a0
[    5.046831] R13: 0000000000020000 R14: 0000560845135280 R15: 0000560845137e30
[    5.046834]  </TASK>
[    5.046834] irq event stamp: 179341
[    5.046835] hardirqs last  enabled at (179347): [<ffffffff8f3da95e>] 
__up_console_sem+0x5e/0x70
[    5.046837] hardirqs last disabled at (179352): [<ffffffff8f3da943>] 
__up_console_sem+0x43/0x70
[    5.046838] softirqs last  enabled at (173244): [<ffffffff8f326884>] 
__irq_exit_rcu+0xe4/0x100
[    5.046840] softirqs last disabled at (173239): [<ffffffff8f326884>] 
__irq_exit_rcu+0xe4/0x100
[    5.046841] ---[ end trace 0000000000000000 ]---

-Leo

>       if (vm->pasid == pasid)
>               return 0;
>  
> @@ -181,12 +194,11 @@ static void amdgpu_vm_bo_evicted(struct 
> amdgpu_vm_bo_base *vm_bo)
>       struct amdgpu_bo *bo = vm_bo->bo;
>  
>       vm_bo->moved = true;
> -     spin_lock(&vm_bo->vm->status_lock);
> +     amdgpu_vm_assert_locked(vm);
>       if (bo->tbo.type == ttm_bo_type_kernel)
>               list_move(&vm_bo->vm_status, &vm->evicted);
>       else
>               list_move_tail(&vm_bo->vm_status, &vm->evicted);
> -     spin_unlock(&vm_bo->vm->status_lock);
>  }
>  /**
>   * amdgpu_vm_bo_moved - vm_bo is moved
> @@ -198,9 +210,8 @@ static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base 
> *vm_bo)
>   */
>  static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo)
>  {
> -     spin_lock(&vm_bo->vm->status_lock);
> +     amdgpu_vm_assert_locked(vm_bo->vm);
>       list_move(&vm_bo->vm_status, &vm_bo->vm->moved);
> -     spin_unlock(&vm_bo->vm->status_lock);
>  }
>  
>  /**
> @@ -213,9 +224,8 @@ static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base 
> *vm_bo)
>   */
>  static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo)
>  {
> -     spin_lock(&vm_bo->vm->status_lock);
> +     amdgpu_vm_assert_locked(vm_bo->vm);
>       list_move(&vm_bo->vm_status, &vm_bo->vm->idle);
> -     spin_unlock(&vm_bo->vm->status_lock);
>       vm_bo->moved = false;
>  }
>  
> @@ -229,9 +239,9 @@ static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base 
> *vm_bo)
>   */
>  static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo)
>  {
> -     spin_lock(&vm_bo->vm->status_lock);
> +     spin_lock(&vm_bo->vm->invalidated_lock);
>       list_move(&vm_bo->vm_status, &vm_bo->vm->invalidated);
> -     spin_unlock(&vm_bo->vm->status_lock);
> +     spin_unlock(&vm_bo->vm->invalidated_lock);
>  }
>  
>  /**
> @@ -244,10 +254,9 @@ static void amdgpu_vm_bo_invalidated(struct 
> amdgpu_vm_bo_base *vm_bo)
>   */
>  static void amdgpu_vm_bo_evicted_user(struct amdgpu_vm_bo_base *vm_bo)
>  {
> +     amdgpu_vm_assert_locked(vm_bo->vm);
>       vm_bo->moved = true;
> -     spin_lock(&vm_bo->vm->status_lock);
>       list_move(&vm_bo->vm_status, &vm_bo->vm->evicted_user);
> -     spin_unlock(&vm_bo->vm->status_lock);
>  }
>  
>  /**
> @@ -260,13 +269,11 @@ static void amdgpu_vm_bo_evicted_user(struct 
> amdgpu_vm_bo_base *vm_bo)
>   */
>  static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo)
>  {
> -     if (vm_bo->bo->parent) {
> -             spin_lock(&vm_bo->vm->status_lock);
> +     amdgpu_vm_assert_locked(vm_bo->vm);
> +     if (vm_bo->bo->parent)
>               list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
> -             spin_unlock(&vm_bo->vm->status_lock);
> -     } else {
> +     else
>               amdgpu_vm_bo_idle(vm_bo);
> -     }
>  }
>  
>  /**
> @@ -279,9 +286,8 @@ static void amdgpu_vm_bo_relocated(struct 
> amdgpu_vm_bo_base *vm_bo)
>   */
>  static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo)
>  {
> -     spin_lock(&vm_bo->vm->status_lock);
> +     amdgpu_vm_assert_locked(vm_bo->vm);
>       list_move(&vm_bo->vm_status, &vm_bo->vm->done);
> -     spin_unlock(&vm_bo->vm->status_lock);
>  }
>  
>  /**
> @@ -295,10 +301,13 @@ static void amdgpu_vm_bo_reset_state_machine(struct 
> amdgpu_vm *vm)
>  {
>       struct amdgpu_vm_bo_base *vm_bo, *tmp;
>  
> -     spin_lock(&vm->status_lock);
> +     spin_lock(&vm->invalidated_lock);
>       list_splice_init(&vm->done, &vm->invalidated);
>       list_for_each_entry(vm_bo, &vm->invalidated, vm_status)
>               vm_bo->moved = true;
> +     spin_unlock(&vm->invalidated_lock);
> +
> +     amdgpu_vm_assert_locked(vm_bo->vm);
>       list_for_each_entry_safe(vm_bo, tmp, &vm->idle, vm_status) {
>               struct amdgpu_bo *bo = vm_bo->bo;
>  
> @@ -308,14 +317,13 @@ static void amdgpu_vm_bo_reset_state_machine(struct 
> amdgpu_vm *vm)
>               else if (bo->parent)
>                       list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
>       }
> -     spin_unlock(&vm->status_lock);
>  }
>  
>  /**
>   * amdgpu_vm_update_shared - helper to update shared memory stat
>   * @base: base structure for tracking BO usage in a VM
>   *
> - * Takes the vm status_lock and updates the shared memory stat. If the basic
> + * Takes the vm stats_lock and updates the shared memory stat. If the basic
>   * stat changed (e.g. buffer was moved) amdgpu_vm_update_stats need to be 
> called
>   * as well.
>   */
> @@ -327,7 +335,8 @@ static void amdgpu_vm_update_shared(struct 
> amdgpu_vm_bo_base *base)
>       uint32_t bo_memtype = amdgpu_bo_mem_stats_placement(bo);
>       bool shared;
>  
> -     spin_lock(&vm->status_lock);
> +     dma_resv_assert_held(bo->tbo.base.resv);
> +     spin_lock(&vm->stats_lock);
>       shared = drm_gem_object_is_shared_for_memory_stats(&bo->tbo.base);
>       if (base->shared != shared) {
>               base->shared = shared;
> @@ -339,7 +348,7 @@ static void amdgpu_vm_update_shared(struct 
> amdgpu_vm_bo_base *base)
>                       vm->stats[bo_memtype].drm.private += size;
>               }
>       }
> -     spin_unlock(&vm->status_lock);
> +     spin_unlock(&vm->stats_lock);
>  }
>  
>  /**
> @@ -364,11 +373,11 @@ void amdgpu_vm_bo_update_shared(struct amdgpu_bo *bo)
>   *        be bo->tbo.resource
>   * @sign: if we should add (+1) or subtract (-1) from the stat
>   *
> - * Caller need to have the vm status_lock held. Useful for when multiple 
> update
> + * Caller need to have the vm stats_lock held. Useful for when multiple 
> update
>   * need to happen at the same time.
>   */
>  static void amdgpu_vm_update_stats_locked(struct amdgpu_vm_bo_base *base,
> -                         struct ttm_resource *res, int sign)
> +                                       struct ttm_resource *res, int sign)
>  {
>       struct amdgpu_vm *vm = base->vm;
>       struct amdgpu_bo *bo = base->bo;
> @@ -392,7 +401,8 @@ static void amdgpu_vm_update_stats_locked(struct 
> amdgpu_vm_bo_base *base,
>                */
>               if (bo->flags & AMDGPU_GEM_CREATE_DISCARDABLE)
>                       vm->stats[res_memtype].drm.purgeable += size;
> -             if (!(bo->preferred_domains & 
> amdgpu_mem_type_to_domain(res_memtype)))
> +             if (!(bo->preferred_domains &
> +                   amdgpu_mem_type_to_domain(res_memtype)))
>                       vm->stats[bo_memtype].evicted += size;
>       }
>  }
> @@ -411,9 +421,9 @@ void amdgpu_vm_update_stats(struct amdgpu_vm_bo_base 
> *base,
>  {
>       struct amdgpu_vm *vm = base->vm;
>  
> -     spin_lock(&vm->status_lock);
> +     spin_lock(&vm->stats_lock);
>       amdgpu_vm_update_stats_locked(base, res, sign);
> -     spin_unlock(&vm->status_lock);
> +     spin_unlock(&vm->stats_lock);
>  }
>  
>  /**
> @@ -439,10 +449,10 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base 
> *base,
>       base->next = bo->vm_bo;
>       bo->vm_bo = base;
>  
> -     spin_lock(&vm->status_lock);
> +     spin_lock(&vm->stats_lock);
>       base->shared = drm_gem_object_is_shared_for_memory_stats(&bo->tbo.base);
>       amdgpu_vm_update_stats_locked(base, bo->tbo.resource, +1);
> -     spin_unlock(&vm->status_lock);
> +     spin_unlock(&vm->stats_lock);
>  
>       if (!amdgpu_vm_is_bo_always_valid(vm, bo))
>               return;
> @@ -500,10 +510,10 @@ int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, 
> struct drm_exec *exec,
>       int ret;
>  
>       /* We can only trust prev->next while holding the lock */
> -     spin_lock(&vm->status_lock);
> +     spin_lock(&vm->invalidated_lock);
>       while (!list_is_head(prev->next, &vm->done)) {
>               bo_va = list_entry(prev->next, typeof(*bo_va), base.vm_status);
> -             spin_unlock(&vm->status_lock);
> +             spin_unlock(&vm->invalidated_lock);
>  
>               bo = bo_va->base.bo;
>               if (bo) {
> @@ -511,10 +521,10 @@ int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, 
> struct drm_exec *exec,
>                       if (unlikely(ret))
>                               return ret;
>               }
> -             spin_lock(&vm->status_lock);
> +             spin_lock(&vm->invalidated_lock);
>               prev = prev->next;
>       }
> -     spin_unlock(&vm->status_lock);
> +     spin_unlock(&vm->invalidated_lock);
>  
>       return 0;
>  }
> @@ -610,7 +620,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct 
> amdgpu_vm *vm,
>                      void *param)
>  {
>       uint64_t new_vm_generation = amdgpu_vm_generation(adev, vm);
> -     struct amdgpu_vm_bo_base *bo_base;
> +     struct amdgpu_vm_bo_base *bo_base, *tmp;
>       struct amdgpu_bo *bo;
>       int r;
>  
> @@ -623,13 +633,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, 
> struct amdgpu_vm *vm,
>                       return r;
>       }
>  
> -     spin_lock(&vm->status_lock);
> -     while (!list_empty(&vm->evicted)) {
> -             bo_base = list_first_entry(&vm->evicted,
> -                                        struct amdgpu_vm_bo_base,
> -                                        vm_status);
> -             spin_unlock(&vm->status_lock);
> -
> +     list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) {
>               bo = bo_base->bo;
>  
>               r = validate(param, bo);
> @@ -642,26 +646,21 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, 
> struct amdgpu_vm *vm,
>                       vm->update_funcs->map_table(to_amdgpu_bo_vm(bo));
>                       amdgpu_vm_bo_relocated(bo_base);
>               }
> -             spin_lock(&vm->status_lock);
>       }
> -     while (ticket && !list_empty(&vm->evicted_user)) {
> -             bo_base = list_first_entry(&vm->evicted_user,
> -                                        struct amdgpu_vm_bo_base,
> -                                        vm_status);
> -             spin_unlock(&vm->status_lock);
>  
> -             bo = bo_base->bo;
> -             dma_resv_assert_held(bo->tbo.base.resv);
> -
> -             r = validate(param, bo);
> -             if (r)
> -                     return r;
> +     if (ticket) {
> +             list_for_each_entry_safe(bo_base, tmp, &vm->evicted_user,
> +                                      vm_status) {
> +                     bo = bo_base->bo;
> +                     dma_resv_assert_held(bo->tbo.base.resv);
>  
> -             amdgpu_vm_bo_invalidated(bo_base);
> +                     r = validate(param, bo);
> +                     if (r)
> +                             return r;
>  
> -             spin_lock(&vm->status_lock);
> +                     amdgpu_vm_bo_invalidated(bo_base);
> +             }
>       }
> -     spin_unlock(&vm->status_lock);
>  
>       amdgpu_vm_eviction_lock(vm);
>       vm->evicting = false;
> @@ -684,13 +683,13 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm)
>  {
>       bool ret;
>  
> +     amdgpu_vm_assert_locked(vm);
> +
>       amdgpu_vm_eviction_lock(vm);
>       ret = !vm->evicting;
>       amdgpu_vm_eviction_unlock(vm);
>  
> -     spin_lock(&vm->status_lock);
>       ret &= list_empty(&vm->evicted);
> -     spin_unlock(&vm->status_lock);
>  
>       spin_lock(&vm->immediate.lock);
>       ret &= !vm->immediate.stopped;
> @@ -981,16 +980,13 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
>                         struct amdgpu_vm *vm, bool immediate)
>  {
>       struct amdgpu_vm_update_params params;
> -     struct amdgpu_vm_bo_base *entry;
> +     struct amdgpu_vm_bo_base *entry, *tmp;
>       bool flush_tlb_needed = false;
> -     LIST_HEAD(relocated);
>       int r, idx;
>  
> -     spin_lock(&vm->status_lock);
> -     list_splice_init(&vm->relocated, &relocated);
> -     spin_unlock(&vm->status_lock);
> +     amdgpu_vm_assert_locked(vm);
>  
> -     if (list_empty(&relocated))
> +     if (list_empty(&vm->relocated))
>               return 0;
>  
>       if (!drm_dev_enter(adev_to_drm(adev), &idx))
> @@ -1005,7 +1001,7 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
>       if (r)
>               goto error;
>  
> -     list_for_each_entry(entry, &relocated, vm_status) {
> +     list_for_each_entry(entry, &vm->relocated, vm_status) {
>               /* vm_flush_needed after updating moved PDEs */
>               flush_tlb_needed |= entry->moved;
>  
> @@ -1021,9 +1017,7 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
>       if (flush_tlb_needed)
>               atomic64_inc(&vm->tlb_seq);
>  
> -     while (!list_empty(&relocated)) {
> -             entry = list_first_entry(&relocated, struct amdgpu_vm_bo_base,
> -                                      vm_status);
> +     list_for_each_entry_safe(entry, tmp, &vm->relocated, vm_status) {
>               amdgpu_vm_bo_idle(entry);
>       }
>  
> @@ -1249,9 +1243,9 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, 
> struct amdgpu_vm *vm,
>  void amdgpu_vm_get_memory(struct amdgpu_vm *vm,
>                         struct amdgpu_mem_stats stats[__AMDGPU_PL_NUM])
>  {
> -     spin_lock(&vm->status_lock);
> +     spin_lock(&vm->stats_lock);
>       memcpy(stats, vm->stats, sizeof(*stats) * __AMDGPU_PL_NUM);
> -     spin_unlock(&vm->status_lock);
> +     spin_unlock(&vm->stats_lock);
>  }
>  
>  /**
> @@ -1618,29 +1612,24 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
>                          struct amdgpu_vm *vm,
>                          struct ww_acquire_ctx *ticket)
>  {
> -     struct amdgpu_bo_va *bo_va;
> +     struct amdgpu_bo_va *bo_va, *tmp;
>       struct dma_resv *resv;
>       bool clear, unlock;
>       int r;
>  
> -     spin_lock(&vm->status_lock);
> -     while (!list_empty(&vm->moved)) {
> -             bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va,
> -                                      base.vm_status);
> -             spin_unlock(&vm->status_lock);
> -
> +     list_for_each_entry_safe(bo_va, tmp, &vm->moved, base.vm_status) {
>               /* Per VM BOs never need to bo cleared in the page tables */
>               r = amdgpu_vm_bo_update(adev, bo_va, false);
>               if (r)
>                       return r;
> -             spin_lock(&vm->status_lock);
>       }
>  
> +     spin_lock(&vm->invalidated_lock);
>       while (!list_empty(&vm->invalidated)) {
>               bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
>                                        base.vm_status);
>               resv = bo_va->base.bo->tbo.base.resv;
> -             spin_unlock(&vm->status_lock);
> +             spin_unlock(&vm->invalidated_lock);
>  
>               /* Try to reserve the BO to avoid clearing its ptes */
>               if (!adev->debug_vm && dma_resv_trylock(resv)) {
> @@ -1672,9 +1661,9 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
>                    bo_va->base.bo->tbo.resource->mem_type == TTM_PL_SYSTEM))
>                       amdgpu_vm_bo_evicted_user(&bo_va->base);
>  
> -             spin_lock(&vm->status_lock);
> +             spin_lock(&vm->invalidated_lock);
>       }
> -     spin_unlock(&vm->status_lock);
> +     spin_unlock(&vm->invalidated_lock);
>  
>       return 0;
>  }
> @@ -2203,9 +2192,9 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev,
>               }
>       }
>  
> -     spin_lock(&vm->status_lock);
> +     spin_lock(&vm->invalidated_lock);
>       list_del(&bo_va->base.vm_status);
> -     spin_unlock(&vm->status_lock);
> +     spin_unlock(&vm->invalidated_lock);
>  
>       list_for_each_entry_safe(mapping, next, &bo_va->valids, list) {
>               list_del(&mapping->list);
> @@ -2313,10 +2302,10 @@ void amdgpu_vm_bo_move(struct amdgpu_bo *bo, struct 
> ttm_resource *new_mem,
>       for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) {
>               struct amdgpu_vm *vm = bo_base->vm;
>  
> -             spin_lock(&vm->status_lock);
> +             spin_lock(&vm->stats_lock);
>               amdgpu_vm_update_stats_locked(bo_base, bo->tbo.resource, -1);
>               amdgpu_vm_update_stats_locked(bo_base, new_mem, +1);
> -             spin_unlock(&vm->status_lock);
> +             spin_unlock(&vm->stats_lock);
>       }
>  
>       amdgpu_vm_bo_invalidate(bo, evicted);
> @@ -2583,11 +2572,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
> amdgpu_vm *vm,
>       INIT_LIST_HEAD(&vm->relocated);
>       INIT_LIST_HEAD(&vm->moved);
>       INIT_LIST_HEAD(&vm->idle);
> +     spin_lock_init(&vm->invalidated_lock);
>       INIT_LIST_HEAD(&vm->invalidated);
> -     spin_lock_init(&vm->status_lock);
>       INIT_LIST_HEAD(&vm->freed);
>       INIT_LIST_HEAD(&vm->done);
>       INIT_KFIFO(vm->faults);
> +     spin_lock_init(&vm->stats_lock);
>  
>       r = amdgpu_vm_init_entities(adev, vm);
>       if (r)
> @@ -3052,7 +3042,8 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, 
> struct seq_file *m)
>       unsigned int total_done_objs = 0;
>       unsigned int id = 0;
>  
> -     spin_lock(&vm->status_lock);
> +     amdgpu_vm_assert_locked(vm);
> +
>       seq_puts(m, "\tIdle BOs:\n");
>       list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) {
>               if (!bo_va->base.bo)
> @@ -3090,11 +3081,13 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, 
> struct seq_file *m)
>       id = 0;
>  
>       seq_puts(m, "\tInvalidated BOs:\n");
> +     spin_lock(&vm->invalidated_lock);
>       list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status) {
>               if (!bo_va->base.bo)
>                       continue;
>               total_invalidated += amdgpu_bo_print_info(id++, bo_va->base.bo, 
> m);
>       }
> +     spin_unlock(&vm->invalidated_lock);
>       total_invalidated_objs = id;
>       id = 0;
>  
> @@ -3104,7 +3097,6 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, 
> struct seq_file *m)
>                       continue;
>               total_done += amdgpu_bo_print_info(id++, bo_va->base.bo, m);
>       }
> -     spin_unlock(&vm->status_lock);
>       total_done_objs = id;
>  
>       seq_printf(m, "\tTotal idle size:        %12lld\tobjs:\t%d\n", 
> total_idle,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 74e61e45778e..829b400cb8c0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -203,11 +203,11 @@ struct amdgpu_vm_bo_base {
>       /* protected by bo being reserved */
>       struct amdgpu_vm_bo_base        *next;
>  
> -     /* protected by vm status_lock */
> +     /* protected by vm reservation and invalidated_lock */
>       struct list_head                vm_status;
>  
>       /* if the bo is counted as shared in mem stats
> -      * protected by vm status_lock */
> +      * protected by vm BO being reserved */
>       bool                            shared;
>  
>       /* protected by the BO being reserved */
> @@ -343,10 +343,8 @@ struct amdgpu_vm {
>       bool                    evicting;
>       unsigned int            saved_flags;
>  
> -     /* Lock to protect vm_bo add/del/move on all lists of vm */
> -     spinlock_t              status_lock;
> -
> -     /* Memory statistics for this vm, protected by status_lock */
> +     /* Memory statistics for this vm, protected by stats_lock */
> +     spinlock_t              stats_lock;
>       struct amdgpu_mem_stats stats[__AMDGPU_PL_NUM];
>  
>       /*
> @@ -354,6 +352,8 @@ struct amdgpu_vm {
>        * PDs, PTs or per VM BOs. The state transits are:
>        *
>        * evicted -> relocated (PDs, PTs) or moved (per VM BOs) -> idle
> +      *
> +      * Lists are protected by the root PD dma_resv lock.
>        */
>  
>       /* Per-VM and PT BOs who needs a validation */
> @@ -374,7 +374,10 @@ struct amdgpu_vm {
>        * state transits are:
>        *
>        * evicted_user or invalidated -> done
> +      *
> +      * Lists are protected by the invalidated_lock.
>        */
> +     spinlock_t              invalidated_lock;
>  
>       /* BOs for user mode queues that need a validation */
>       struct list_head        evicted_user;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> index 30022123b0bf..f57c48b74274 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> @@ -541,9 +541,7 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base 
> *entry)
>       entry->bo->vm_bo = NULL;
>       ttm_bo_set_bulk_move(&entry->bo->tbo, NULL);
>  
> -     spin_lock(&entry->vm->status_lock);
>       list_del(&entry->vm_status);
> -     spin_unlock(&entry->vm->status_lock);
>       amdgpu_bo_unref(&entry->bo);
>  }
>  
> @@ -587,7 +585,6 @@ static void amdgpu_vm_pt_add_list(struct 
> amdgpu_vm_update_params *params,
>       struct amdgpu_vm_pt_cursor seek;
>       struct amdgpu_vm_bo_base *entry;
>  
> -     spin_lock(&params->vm->status_lock);
>       for_each_amdgpu_vm_pt_dfs_safe(params->adev, params->vm, cursor, seek, 
> entry) {
>               if (entry && entry->bo)
>                       list_move(&entry->vm_status, 
> &params->tlb_flush_waitlist);
> @@ -595,7 +592,6 @@ static void amdgpu_vm_pt_add_list(struct 
> amdgpu_vm_update_params *params,
>  
>       /* enter start node now */
>       list_move(&cursor->entry->vm_status, &params->tlb_flush_waitlist);
> -     spin_unlock(&params->vm->status_lock);
>  }
>  
>  /**


Reply via email to