On 2025-09-25 05:01, Sunil Khatri wrote:
Userptr BOs can be changed by the user at any time, so validate
all the userptr BOs while locking all the BOs, before the GPU
starts processing.

Signed-off-by: Sunil Khatri <[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 76 +++++++++++++++++++++++
  1 file changed, 76 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 3bbe1001fda1..880bcc6edcbb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -25,6 +25,8 @@
  #include <drm/drm_auth.h>
  #include <drm/drm_exec.h>
  #include <linux/pm_runtime.h>
+#include <linux/hmm.h>
+#include <drm/ttm/ttm_tt.h>
#include "amdgpu.h"
  #include "amdgpu_vm.h"
@@ -761,9 +763,19 @@ amdgpu_userq_vm_validate(struct amdgpu_userq_mgr *uq_mgr)
        struct amdgpu_device *adev = uq_mgr->adev;
        struct amdgpu_vm *vm = &fpriv->vm;
        struct amdgpu_bo_va *bo_va;
+       struct amdgpu_bo *bo;
        struct drm_exec exec;
+       struct xarray xa;
+       struct hmm_range *range;
+       unsigned long key = 0, tmp_key, count = 0;
        int ret;
+       bool present;
+       bool invalidated = false;
+       struct ttm_operation_ctx ctx = { true, false };
+
+       xa_init(&xa);
+retry_lock:
        drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
        drm_exec_until_all_locked(&exec) {
                ret = amdgpu_vm_lock_pd(vm, &exec, 1);
@@ -794,6 +806,63 @@ amdgpu_userq_vm_validate(struct amdgpu_userq_mgr *uq_mgr)
        if (ret)
                goto unlock_all;
+       if (invalidated) {
+               tmp_key = 0;
+               xa_for_each(&xa, tmp_key, range) {
+                       bo = range->dev_private_owner;

I believe you're misusing the dev_private_owner field here. This is not a "driver-private" place to store random data. The owner field is used to determine whether hmm_range_fault will cause a device-private page to migrate to system memory or not. If the hmm_range->dev_private_owner matches the pgmap->owner, then the device private page can be accessed directly without migration to system memory. In kfd_svm we set it like this to indicate that GPUs in the same XGMI hive can access each other's memory directly:

kfd_svm.h:#define SVM_ADEV_PGMAP_OWNER(adev)\
kfd_svm.h-            ((adev)->hive ? (void *)(adev)->hive : (void *)(adev))

You can see how this is used here: https://elixir.bootlin.com/linux/v6.17/source/mm/hmm.c#L263

                /*
                 * Don't fault in device private pages owned by the caller,
                 * just report the PFN.
                 */
                if (is_device_private_entry(entry) &&
                    page_pgmap(pfn_swap_entry_to_page(entry))->owner ==
                    range->dev_private_owner) {
                        cpu_flags = HMM_PFN_VALID;
                        if (is_writable_device_private_entry(entry))
                                cpu_flags |= HMM_PFN_WRITE;
                        new_pfn_flags = swp_offset_pfn(entry) | cpu_flags;
                        goto out;
                }
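
If all you really need is to find which BO a given range belongs to, I'd keep that association explicit instead of overloading dev_private_owner. A completely untested sketch of what I mean, with the struct and variable names made up just for illustration, would be to store a small wrapper in your xarray:

	/* Illustrative only: pair each userptr BO with its hmm_range. */
	struct userq_userptr_entry {
		struct amdgpu_bo *bo;		/* referenced userptr BO */
		struct hmm_range *range;	/* range covering its pages */
	};

	struct userq_userptr_entry *e;

	e = kvzalloc(sizeof(*e), GFP_KERNEL);
	if (!e) {
		ret = -ENOMEM;
		goto unlock_all;
	}
	e->bo = amdgpu_bo_ref(bo);
	e->range = range;	/* hmm_range allocated as in your patch */
	ret = xa_err(xa_store(&xa, key++, e, GFP_KERNEL));
	if (ret) {
		amdgpu_bo_unref(&e->bo);
		kvfree(e);
		goto unlock_all;
	}

	...

	xa_for_each(&xa, tmp_key, e) {
		bo = e->bo;
		range = e->range;
		...
	}

With something like that, the loops matching range->dev_private_owner == bo go away because the pairing is already in the entry, and dev_private_owner stays free for its real purpose.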

Regards,
  Felix


+                       amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
+                       ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+                       if (ret)
+                               goto unlock_all;
+
+                       amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, range);
+               }
+               invalidated = false;
+       }
+
+       /* Validate userptr BOs */
+       list_for_each_entry(bo_va, &vm->done, base.vm_status) {
+               bo = bo_va->base.bo;
+               if (amdgpu_ttm_tt_is_userptr(bo->tbo.ttm)) {
+                       tmp_key = 0;
+                       present = false;
+                       xa_for_each(&xa, tmp_key, range) {
+                               if (range->dev_private_owner == bo) {
+                                       present = true;
+                                       break;
+                               }
+                       }
+
+                       if (!present) {
+                               range = kvmalloc(sizeof(*range), GFP_KERNEL);
+                               if (!range) {
+                                       ret = -ENOMEM;
+                                       goto unlock_all;
+                               }
+
+                               xa_store(&xa, key++, range, GFP_KERNEL);
+                               range->dev_private_owner = bo;
+                               amdgpu_bo_ref(bo);
+                       }
+               }
+       }
+
+       if (key && (key != count)) {
+               drm_file_err(uq_mgr->file, "userptr bos:%lu\n", key);
+               drm_exec_fini(&exec);
+               xa_for_each(&xa, tmp_key, range) {
+                       if (!range)
+                               continue;
+                       bo = range->dev_private_owner;
+                       ret = amdgpu_ttm_tt_get_user_pages(bo, range);
+                       if (ret)
+                               goto unlock_all;
+               }
+               count = key;
+               invalidated = true;
+               goto retry_lock;
+       }
+       /* End validate userptr */
        ret = amdgpu_vm_update_pdes(adev, vm, false);
        if (ret)
                goto unlock_all;
@@ -813,6 +882,13 @@ amdgpu_userq_vm_validate(struct amdgpu_userq_mgr *uq_mgr)
unlock_all:
        drm_exec_fini(&exec);
+       tmp_key = 0;
+       xa_for_each(&xa, tmp_key, range) {
+               bo = range->dev_private_owner;
+               amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
+               amdgpu_bo_unref(&bo);
+       }
+       xa_destroy(&xa);
        return ret;
  }
  
