Re: [PATCH] drm/amdkfd: track unified memory reservation with xnack off

2022-07-18 Thread Felix Kuehling

On 2022-07-15 19:54, Alex Sierra wrote:

[WHY]
Unified memory with xnack off should be tracked, as userptr mappings
and legacy allocations do. To avoid oversuscribe system memory when
xnack off.
[How]
Exposing functions reserve_mem_limit and unreserve_mem_limit to SVM
API and call them on every prange creation and free.

Signed-off-by: Alex Sierra 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  4 ++
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 23 ---
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c  | 60 +--
  3 files changed, 60 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 73bf8b5f2aa9..83d955f0c52f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -305,6 +305,10 @@ bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device 
*adev, struct kgd_mem *
  void amdgpu_amdkfd_block_mmu_notifications(void *p);
  int amdgpu_amdkfd_criu_resume(void *p);
  bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);
+int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
+   uint64_t size, u32 alloc_flag);
+void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
+   uint64_t size, u32 alloc_flag);
  
  #if IS_ENABLED(CONFIG_HSA_AMD)

  void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 2bc36ff0aa0f..39d589394160 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -129,7 +129,7 @@ void amdgpu_amdkfd_reserve_system_mem(uint64_t size)
   *
   * Return: returns -ENOMEM in case of error, ZERO otherwise
   */
-static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
+int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
  {
uint64_t reserved_for_pt =
@@ -169,7 +169,7 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
 kfd_mem_limit.max_system_mem_limit && !no_system_mem_limit) ||
(kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
 kfd_mem_limit.max_ttm_mem_limit) ||
-   (adev->kfd.vram_used + vram_needed >
+   (adev && adev->kfd.vram_used + vram_needed >
 adev->gmc.real_vram_size -
 atomic64_read(>vram_pin_size) -
 reserved_for_pt)) {
@@ -180,7 +180,10 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
/* Update memory accounting by decreasing available system
 * memory, TTM memory and GPU memory as computed above
 */
-   adev->kfd.vram_used += vram_needed;
+   WARN_ONCE(vram_needed && !adev,
+ "adev reference can't be null when vram is used");
+   if (adev)
+   adev->kfd.vram_used += vram_needed;
kfd_mem_limit.system_mem_used += system_mem_needed;
kfd_mem_limit.ttm_mem_used += ttm_mem_needed;
  
@@ -189,7 +192,7 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,

return ret;
  }
  
-static void unreserve_mem_limit(struct amdgpu_device *adev,

+void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
  {
spin_lock(_mem_limit.mem_limit_lock);
@@ -198,7 +201,10 @@ static void unreserve_mem_limit(struct amdgpu_device *adev,
kfd_mem_limit.system_mem_used -= size;
kfd_mem_limit.ttm_mem_used -= size;
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
-   adev->kfd.vram_used -= ALIGN(size, VRAM_ALLOCATION_ALIGN);
+   WARN_ONCE(!adev,
+ "adev reference can't be null when alloc mem flags vram is 
set");
+   if (adev)
+   adev->kfd.vram_used -= ALIGN(size, 
VRAM_ALLOCATION_ALIGN);
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
kfd_mem_limit.system_mem_used -= size;
} else if (!(alloc_flag &
@@ -207,8 +213,7 @@ static void unreserve_mem_limit(struct amdgpu_device *adev,
pr_err("%s: Invalid BO type %#x\n", __func__, alloc_flag);
goto release;
}
-
-   WARN_ONCE(adev->kfd.vram_used < 0,
+   WARN_ONCE(adev && adev->kfd.vram_used < 0,
  "KFD VRAM memory accounting unbalanced");
WARN_ONCE(kfd_mem_limit.ttm_mem_used < 0,
  "KFD TTM memory accounting unbalanced");
@@ -225,7 +230,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
u32 alloc_flags = bo->kfd_bo->alloc_flags;
u64 size = amdgpu_bo_size(bo);
  
-	unreserve_mem_limit(adev, size, alloc_flags);

+   amdgpu_amdkfd_unreserve_mem_limit(adev, size, alloc_flags);
  
  	

[PATCH] drm/amdkfd: track unified memory reservation with xnack off

2022-07-16 Thread Alex Sierra
[WHY]
Unified memory with xnack off should be tracked, as userptr mappings
and legacy allocations do. To avoid oversuscribe system memory when
xnack off.
[How]
Exposing functions reserve_mem_limit and unreserve_mem_limit to SVM
API and call them on every prange creation and free.

Signed-off-by: Alex Sierra 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  4 ++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 23 ---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c  | 60 +--
 3 files changed, 60 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 73bf8b5f2aa9..83d955f0c52f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -305,6 +305,10 @@ bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device 
*adev, struct kgd_mem *
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);
+int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
+   uint64_t size, u32 alloc_flag);
+void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
+   uint64_t size, u32 alloc_flag);
 
 #if IS_ENABLED(CONFIG_HSA_AMD)
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 2bc36ff0aa0f..39d589394160 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -129,7 +129,7 @@ void amdgpu_amdkfd_reserve_system_mem(uint64_t size)
  *
  * Return: returns -ENOMEM in case of error, ZERO otherwise
  */
-static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
+int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
 {
uint64_t reserved_for_pt =
@@ -169,7 +169,7 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
 kfd_mem_limit.max_system_mem_limit && !no_system_mem_limit) ||
(kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
 kfd_mem_limit.max_ttm_mem_limit) ||
-   (adev->kfd.vram_used + vram_needed >
+   (adev && adev->kfd.vram_used + vram_needed >
 adev->gmc.real_vram_size -
 atomic64_read(>vram_pin_size) -
 reserved_for_pt)) {
@@ -180,7 +180,10 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
/* Update memory accounting by decreasing available system
 * memory, TTM memory and GPU memory as computed above
 */
-   adev->kfd.vram_used += vram_needed;
+   WARN_ONCE(vram_needed && !adev,
+ "adev reference can't be null when vram is used");
+   if (adev)
+   adev->kfd.vram_used += vram_needed;
kfd_mem_limit.system_mem_used += system_mem_needed;
kfd_mem_limit.ttm_mem_used += ttm_mem_needed;
 
@@ -189,7 +192,7 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
return ret;
 }
 
-static void unreserve_mem_limit(struct amdgpu_device *adev,
+void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
 {
spin_lock(_mem_limit.mem_limit_lock);
@@ -198,7 +201,10 @@ static void unreserve_mem_limit(struct amdgpu_device *adev,
kfd_mem_limit.system_mem_used -= size;
kfd_mem_limit.ttm_mem_used -= size;
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
-   adev->kfd.vram_used -= ALIGN(size, VRAM_ALLOCATION_ALIGN);
+   WARN_ONCE(!adev,
+ "adev reference can't be null when alloc mem flags 
vram is set");
+   if (adev)
+   adev->kfd.vram_used -= ALIGN(size, 
VRAM_ALLOCATION_ALIGN);
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
kfd_mem_limit.system_mem_used -= size;
} else if (!(alloc_flag &
@@ -207,8 +213,7 @@ static void unreserve_mem_limit(struct amdgpu_device *adev,
pr_err("%s: Invalid BO type %#x\n", __func__, alloc_flag);
goto release;
}
-
-   WARN_ONCE(adev->kfd.vram_used < 0,
+   WARN_ONCE(adev && adev->kfd.vram_used < 0,
  "KFD VRAM memory accounting unbalanced");
WARN_ONCE(kfd_mem_limit.ttm_mem_used < 0,
  "KFD TTM memory accounting unbalanced");
@@ -225,7 +230,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
u32 alloc_flags = bo->kfd_bo->alloc_flags;
u64 size = amdgpu_bo_size(bo);
 
-   unreserve_mem_limit(adev, size, alloc_flags);
+   amdgpu_amdkfd_unreserve_mem_limit(adev, size, alloc_flags);
 
kfree(bo->kfd_bo);
 }
@@ -1788,7 +1793,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(