Re: [PATCH v7 1/2] drm/buddy: Add start address support to trim function

2024-07-26 Thread Paneer Selvam, Arunpravin




On 7/24/2024 8:42 PM, Jani Nikula wrote:

On Tue, 23 Jul 2024, Arunpravin Paneer Selvam wrote:

- Add a new start parameter to the trim function to specify the exact
   address from which to start trimming. This helps in situations
   where drivers need to enforce a specific address alignment.

- Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this
   flag to disable the allocator's trimming step. This patch gives
   drivers control over trimming so they can do it themselves based
   on application requirements.

v1:(Matthew)
   - check new_start alignment with min chunk_size
   - use range_overflows()

Signed-off-by: Arunpravin Paneer Selvam 
Acked-by: Alex Deucher 
Acked-by: Christian König 
---
  drivers/gpu/drm/drm_buddy.c  | 25 +++--
  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c |  2 +-
  include/drm/drm_buddy.h  |  2 ++
  3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 6a8e45e9d0ec..103c185bb1c8 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
   * drm_buddy_block_trim - free unused pages
   *
   * @mm: DRM buddy manager
+ * @start: start address to begin the trimming.
   * @new_size: original size requested
   * @blocks: Input and output list of allocated blocks.
   * MUST contain single block as input to be trimmed.
@@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
   * 0 on success, error code on failure.
   */
  int drm_buddy_block_trim(struct drm_buddy *mm,
+u64 *start,

Is this a pointer only to allow passing NULL as "don't specify"? It's
not used for returning anything in *start. Maybe it should be a const
pointer?

Not the prettiest of interfaces, and the kernel-doc does not reflect any
of this.
We have a plan to improve the interface and add multiple block trim 
support as well.

I will modify in the follow-up patch.

Thanks,
Arun.


BR,
Jani.



 u64 new_size,
 struct list_head *blocks)
  {
struct drm_buddy_block *parent;
struct drm_buddy_block *block;
+   u64 block_start, block_end;
LIST_HEAD(dfs);
u64 new_start;
int err;
@@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 struct drm_buddy_block,
 link);
  
+	block_start = drm_buddy_block_offset(block);

+   block_end = block_start + drm_buddy_block_size(mm, block);
+
if (WARN_ON(!drm_buddy_block_is_allocated(block)))
return -EINVAL;
  
@@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm,

if (new_size == drm_buddy_block_size(mm, block))
return 0;
  
+	new_start = block_start;

+   if (start) {
+   new_start = *start;
+
+   if (new_start < block_start)
+   return -EINVAL;
+
+   if (!IS_ALIGNED(new_start, mm->chunk_size))
+   return -EINVAL;
+
+   if (range_overflows(new_start, new_size, block_end))
+   return -EINVAL;
+   }
+
list_del(&block->link);
mark_free(mm, block);
mm->avail += drm_buddy_block_size(mm, block);
@@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
parent = block->parent;
block->parent = NULL;
  
-	new_start = drm_buddy_block_offset(block);

list_add(&block->tmp_link, &dfs);
err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL);
if (err) {
@@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
} while (1);
  
  	/* Trim the allocated block to the required size */

-   if (original_size != size) {
+   if (!(flags & DRM_BUDDY_TRIM_DISABLE) &&
+   original_size != size) {
struct list_head *trim_list;
LIST_HEAD(temp);
u64 trim_size;
@@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
}
  
  		drm_buddy_block_trim(mm,

+NULL,
 trim_size,
 trim_list);
  
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c

index fe3779fdba2c..423b261ea743 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
@@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager 
*man,
} while (remaining_size);
  
  	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {

-   if (!drm_buddy_block_trim(mm, vres->base.size, &vres->blocks))
+   if (!drm_buddy_block_trim(mm, NULL, vres->base.size, &vres->blocks))
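
For reference, a minimal sketch of how a driver could then use the new
start parameter (illustrative only, not part of the patch; the alignment
value, req_size, and the error handling are assumptions):

	struct drm_buddy_block *block;
	u64 new_start, alignment = SZ_256K;	/* assumed hw requirement */
	int err;

	/* "blocks" holds the single over-allocated contiguous block */
	block = list_first_entry(blocks, struct drm_buddy_block, link);

	/* round the block start up to the required alignment, then trim
	 * the allocation back down to the originally requested size
	 */
	new_start = round_up(drm_buddy_block_offset(block), alignment);
	err = drm_buddy_block_trim(mm, &new_start, req_size, blocks);
	if (err)
		return err;	/* caller keeps or frees the untrimmed block */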
   

Re: [PATCH v7 1/2] drm/buddy: Add start address support to trim function

2024-07-23 Thread Paneer Selvam, Arunpravin

Hi Matthew,

Can we push this version for now, as we need to mainline the DCC changes
ASAP, while we continue our discussion and proceed to implement the
permanent solution for address alignment?

Thanks,
Arun.

On 7/23/2024 6:55 PM, Arunpravin Paneer Selvam wrote:

- Add a new start parameter to the trim function to specify the exact
   address from which to start trimming. This helps in situations
   where drivers need to enforce a specific address alignment.

- Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this
   flag to disable the allocator's trimming step. This patch gives
   drivers control over trimming so they can do it themselves based
   on application requirements.

v1:(Matthew)
   - check new_start alignment with min chunk_size
   - use range_overflows()

Signed-off-by: Arunpravin Paneer Selvam 
Acked-by: Alex Deucher 
Acked-by: Christian König 
---
  drivers/gpu/drm/drm_buddy.c  | 25 +++--
  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c |  2 +-
  include/drm/drm_buddy.h  |  2 ++
  3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 6a8e45e9d0ec..103c185bb1c8 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
   * drm_buddy_block_trim - free unused pages
   *
   * @mm: DRM buddy manager
+ * @start: start address to begin the trimming.
   * @new_size: original size requested
   * @blocks: Input and output list of allocated blocks.
   * MUST contain single block as input to be trimmed.
@@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
   * 0 on success, error code on failure.
   */
  int drm_buddy_block_trim(struct drm_buddy *mm,
+u64 *start,
 u64 new_size,
 struct list_head *blocks)
  {
struct drm_buddy_block *parent;
struct drm_buddy_block *block;
+   u64 block_start, block_end;
LIST_HEAD(dfs);
u64 new_start;
int err;
@@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 struct drm_buddy_block,
 link);
  
+	block_start = drm_buddy_block_offset(block);

+   block_end = block_start + drm_buddy_block_size(mm, block);
+
if (WARN_ON(!drm_buddy_block_is_allocated(block)))
return -EINVAL;
  
@@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm,

if (new_size == drm_buddy_block_size(mm, block))
return 0;
  
+	new_start = block_start;

+   if (start) {
+   new_start = *start;
+
+   if (new_start < block_start)
+   return -EINVAL;
+
+   if (!IS_ALIGNED(new_start, mm->chunk_size))
+   return -EINVAL;
+
+   if (range_overflows(new_start, new_size, block_end))
+   return -EINVAL;
+   }
+
list_del(&block->link);
mark_free(mm, block);
mm->avail += drm_buddy_block_size(mm, block);
@@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
parent = block->parent;
block->parent = NULL;
  
-	new_start = drm_buddy_block_offset(block);

list_add(&block->tmp_link, &dfs);
err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL);
if (err) {
@@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
} while (1);
  
  	/* Trim the allocated block to the required size */

-   if (original_size != size) {
+   if (!(flags & DRM_BUDDY_TRIM_DISABLE) &&
+   original_size != size) {
struct list_head *trim_list;
LIST_HEAD(temp);
u64 trim_size;
@@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
}
  
  		drm_buddy_block_trim(mm,

+NULL,
 trim_size,
 trim_list);
  
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c

index fe3779fdba2c..423b261ea743 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
@@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager 
*man,
} while (remaining_size);
  
  	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {

-   if (!drm_buddy_block_trim(mm, vres->base.size, &vres->blocks))
+   if (!drm_buddy_block_trim(mm, NULL, vres->base.size, &vres->blocks))
size = vres->base.size;
}
  
diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h

index 2a74fa9d0ce5..9689a7c5dd36 100644
--- a/include/drm/drm_buddy.h
+++ b/include/drm/drm_buddy.h
@@ -27,6 +27,7 @@
  #def

[PATCH v7 2/2] drm/amdgpu: Add address alignment support to DCC buffers

2024-07-23 Thread Arunpravin Paneer Selvam
Add address alignment support to the DCC VRAM buffers.

v2:
  - adjust size based on the max_texture_channel_caches values
only for GFX12 DCC buffers.
  - used AMDGPU_GEM_CREATE_GFX12_DCC flag to apply change only
for DCC buffers.
  - roundup non power of two DCC buffer adjusted size to nearest
power of two number as the buddy allocator does not support non
power of two alignments. This applies only to the contiguous
DCC buffers.

v3:(Alex)
  - rewrite the max texture channel caches comparison code in an
algorithmic way to determine the alignment size.

v4:(Alex)
  - Move the logic from amdgpu_vram_mgr_dcc_alignment() to gmc_v12_0.c
and add a new gmc func callback for dcc alignment. If the callback
is non-NULL, call it to get the alignment, otherwise, use the default.

v5:(Alex)
  - Set the Alignment to a default value if the callback doesn't exist.
  - Add the callback to amdgpu_gmc_funcs.

v6:
  - Fix checkpatch warning reported by Intel CI.

v7:(Christian)
  - remove the AMDGPU_GEM_CREATE_GFX12_DCC flag and keep a flag that
checks the BO pinning and for a specific hw generation.

v8:(Christian)
  - move this check into gmc_v12_0_get_dcc_alignment.

Signed-off-by: Arunpravin Paneer Selvam 
Acked-by: Alex Deucher 
Acked-by: Christian König 
Reviewed-by: Frank Min 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h  |  6 
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 29 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c   | 19 +
 3 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index febca3130497..654d0548a3f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -156,6 +156,8 @@ struct amdgpu_gmc_funcs {
  uint64_t addr, uint64_t *flags);
/* get the amount of memory used by the vbios for pre-OS console */
unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev);
+   /* get the DCC buffer alignment */
+   u64 (*get_dcc_alignment)(struct amdgpu_device *adev);
 
enum amdgpu_memory_partition (*query_mem_partition_mode)(
struct amdgpu_device *adev);
@@ -363,6 +365,10 @@ struct amdgpu_gmc {
(adev)->gmc.gmc_funcs->override_vm_pte_flags\
((adev), (vm), (addr), (pte_flags))
 #define amdgpu_gmc_get_vbios_fb_size(adev) 
(adev)->gmc.gmc_funcs->get_vbios_fb_size((adev))
+#define amdgpu_gmc_get_dcc_alignment(_adev) ({ \
+   typeof(_adev) (adev) = (_adev); \
+   ((adev)->gmc.gmc_funcs->get_dcc_alignment((adev))); \
+})
 
 /**
  * amdgpu_gmc_vram_full_visible - Check if full VRAM is visible through the BAR
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index f91cc149d06c..c6609f4ac3d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -450,12 +450,12 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,
   const struct ttm_place *place,
   struct ttm_resource **res)
 {
+   u64 size, remaining_size, lpfn, fpfn, adjust_dcc_size = 0;
struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
struct amdgpu_device *adev = to_amdgpu_device(mgr);
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
u64 vis_usage = 0, max_bytes, min_block_size;
struct amdgpu_vram_mgr_resource *vres;
-   u64 size, remaining_size, lpfn, fpfn;
struct drm_buddy *mm = >mm;
struct drm_buddy_block *block;
unsigned long pages_per_block;
@@ -511,7 +511,14 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
/* Allocate blocks in desired range */
vres->flags |= DRM_BUDDY_RANGE_ALLOCATION;
 
+   if (adev->gmc.gmc_funcs->get_dcc_alignment)
+   adjust_dcc_size = amdgpu_gmc_get_dcc_alignment(adev);
+
remaining_size = (u64)vres->base.size;
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && adjust_dcc_size) {
+   remaining_size = roundup_pow_of_two(remaining_size + 
adjust_dcc_size);
+   vres->flags |= DRM_BUDDY_TRIM_DISABLE;
+   }
 
mutex_lock(&mgr->lock);
while (remaining_size) {
@@ -521,8 +528,11 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
min_block_size = mgr->default_page_size;
 
size = remaining_size;
-   if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
-   !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
+
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && 
adjust_dcc_size)
+   
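
The hunk above is truncated in the archive. For illustration, a
hypothetical shape for the get_dcc_alignment callback described in the
changelog (the gmc_v12_0 body is not shown in this archive, so treat the
formula and field names as assumptions):

	static u64 gmc_v12_0_get_dcc_alignment(struct amdgpu_device *adev)
	{
		unsigned int max_tex_channel_caches, alignment;

		/* v8: the hw-generation check lives in the callback itself */
		if (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(12, 0, 0) &&
		    amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(12, 0, 1))
			return 0;

		/* derive the alignment from the texture channel cache count */
		max_tex_channel_caches = adev->gfx.config.max_texture_channel_caches;
		if (is_power_of_2(max_tex_channel_caches))
			alignment = max_tex_channel_caches / 4;
		else
			alignment = roundup_pow_of_two(max_tex_channel_caches);

		return (u64)alignment * SZ_1M;
	}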

[PATCH v7 1/2] drm/buddy: Add start address support to trim function

2024-07-23 Thread Arunpravin Paneer Selvam
- Add a new start parameter to the trim function to specify the exact
  address from which to start trimming. This helps in situations
  where drivers need to enforce a specific address alignment.

- Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this
  flag to disable the allocator's trimming step. This patch gives
  drivers control over trimming so they can do it themselves based
  on application requirements.

v1:(Matthew)
  - check new_start alignment with min chunk_size
  - use range_overflows()

Signed-off-by: Arunpravin Paneer Selvam 
Acked-by: Alex Deucher 
Acked-by: Christian König 
---
 drivers/gpu/drm/drm_buddy.c  | 25 +++--
 drivers/gpu/drm/xe/xe_ttm_vram_mgr.c |  2 +-
 include/drm/drm_buddy.h  |  2 ++
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 6a8e45e9d0ec..103c185bb1c8 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
  * drm_buddy_block_trim - free unused pages
  *
  * @mm: DRM buddy manager
+ * @start: start address to begin the trimming.
  * @new_size: original size requested
  * @blocks: Input and output list of allocated blocks.
  * MUST contain single block as input to be trimmed.
@@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
  * 0 on success, error code on failure.
  */
 int drm_buddy_block_trim(struct drm_buddy *mm,
+u64 *start,
 u64 new_size,
 struct list_head *blocks)
 {
struct drm_buddy_block *parent;
struct drm_buddy_block *block;
+   u64 block_start, block_end;
LIST_HEAD(dfs);
u64 new_start;
int err;
@@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 struct drm_buddy_block,
 link);
 
+   block_start = drm_buddy_block_offset(block);
+   block_end = block_start + drm_buddy_block_size(mm, block);
+
if (WARN_ON(!drm_buddy_block_is_allocated(block)))
return -EINVAL;
 
@@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
if (new_size == drm_buddy_block_size(mm, block))
return 0;
 
+   new_start = block_start;
+   if (start) {
+   new_start = *start;
+
+   if (new_start < block_start)
+   return -EINVAL;
+
+   if (!IS_ALIGNED(new_start, mm->chunk_size))
+   return -EINVAL;
+
+   if (range_overflows(new_start, new_size, block_end))
+   return -EINVAL;
+   }
+
list_del(&block->link);
mark_free(mm, block);
mm->avail += drm_buddy_block_size(mm, block);
@@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
parent = block->parent;
block->parent = NULL;
 
-   new_start = drm_buddy_block_offset(block);
list_add(&block->tmp_link, &dfs);
err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL);
if (err) {
@@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
} while (1);
 
/* Trim the allocated block to the required size */
-   if (original_size != size) {
+   if (!(flags & DRM_BUDDY_TRIM_DISABLE) &&
+   original_size != size) {
struct list_head *trim_list;
LIST_HEAD(temp);
u64 trim_size;
@@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
}
 
drm_buddy_block_trim(mm,
+NULL,
 trim_size,
 trim_list);
 
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c 
b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
index fe3779fdba2c..423b261ea743 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
@@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager 
*man,
} while (remaining_size);
 
if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
-   if (!drm_buddy_block_trim(mm, vres->base.size, &vres->blocks))
+   if (!drm_buddy_block_trim(mm, NULL, vres->base.size, &vres->blocks))
size = vres->base.size;
}
 
diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h
index 2a74fa9d0ce5..9689a7c5dd36 100644
--- a/include/drm/drm_buddy.h
+++ b/include/drm/drm_buddy.h
@@ -27,6 +27,7 @@
 #define DRM_BUDDY_CONTIGUOUS_ALLOCATIONBIT(2)
 #define DRM_BUDDY_CLEAR_ALLOCATION BIT(3)
 #define DRM_BUDDY_CLEARED  BIT(4)
+#define DRM_BUDDY_TRIM_DISABLE BIT(5)
 
 struct drm_buddy_block {
 #define DRM_BUDDY_HE

Re: [PATCH] drm/buddy: Add start address support to trim function

2024-07-22 Thread Paneer Selvam, Arunpravin

Hi Matthew,

On 7/19/2024 4:01 PM, Matthew Auld wrote:

On 17/07/2024 16:02, Paneer Selvam, Arunpravin wrote:



On 7/16/2024 3:34 PM, Matthew Auld wrote:

On 16/07/2024 10:50, Paneer Selvam, Arunpravin wrote:

Hi Matthew,

On 7/10/2024 6:20 PM, Matthew Auld wrote:

On 10/07/2024 07:03, Paneer Selvam, Arunpravin wrote:

Thanks Alex.

Hi Matthew,
Any comments?


Do we not pass the required address alignment when allocating the 
pages in the first place?
If address alignment is really useful, we can add that in the 
drm_buddy_alloc_blocks() function.


I mean don't we already pass the min page size, which should give us 
matching physical address alignment?
I think we don't need to align the address to the passed min_block_size
value for all the contiguous buffers, so I thought we could leave that
decision to the drivers, and they can achieve it through the trim
function for this kind of specific request.


I would have assumed it would be simpler to use min_block_size and 
then trim the size, if it's too big? That would then also take care of 
the try_harder case?
For example, if the required contiguous size is 1MiB and min_block_size
is 256KiB, then to perform the address alignment of 256KiB we might need
to over-allocate by at least the min_block_size (say 256KiB). Now the
size becomes 1280KiB, and since the contiguous flag is enabled we round
the size up to the next power of two, so the size value becomes 2MiB.
Next, in trimming, we should round the block start address up to the
min_block_size. Maybe we can keep the above operations under the flag
combination DRM_BUDDY_CONTIGUOUS_ALLOCATION &&
DRM_BUDDY_ADDRESS_ALIGNMENT?
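
A rough sketch of that arithmetic (illustrative values only):

	u64 size, new_start, block_start = 0x100000;	/* example start */

	/* requested contiguous size 1MiB, min_block_size 256KiB */
	size = SZ_1M + SZ_256K;			/* over-allocate: 1280KiB */
	size = roundup_pow_of_two(size);	/* contiguous flag: 2MiB */
	/* after allocation, trim from an aligned start */
	new_start = round_up(block_start, SZ_256K);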


At the moment, we cannot support address alignment for try_harder
allocations, since for try_harder allocations we first traverse the RHS
to allocate the maximum possible and then traverse the LHS (where we
align the LHS size to min_block_size) to allocate the remaining size.
Maybe in the DRM_BUDDY_ADDRESS_ALIGNMENT case we should first allocate
the LHS, satisfying the address alignment requirement, and then traverse
the RHS to allocate the remaining size if required?


Also how are we dealing with the multi-block try_harder case? AFAICT 
we only allow trimming single block atm, or is it not possible to 
trigger that path here? Or are we handling that somehow?
It is not possible to trigger that path here; that would only happen if
we either over-allocate the LHS size and pass multiple blocks to the
trim function, or implement the above-mentioned method.




https://patchwork.freedesktop.org/series/136150/
We are getting this sparse error from the Intel CI. Do you think 
these errors are introduced by these patches?


I think it's safe to ignore, there seem to be other series with the 
same thing.

Thanks.




Thanks,
Arun.




Thanks,
Arun.




Thanks,
Arun.

On 7/9/2024 1:42 AM, Alex Deucher wrote:

On Thu, Jul 4, 2024 at 4:40 AM Arunpravin Paneer Selvam wrote:

- Add a new start parameter to the trim function to specify the exact
   address from which to start trimming. This helps in situations
   where drivers need to enforce a specific address alignment.

- Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this
   flag to disable the allocator's trimming step. This patch gives
   drivers control over trimming so they can do it themselves based
   on application requirements.

v1:(Matthew)
   - check new_start alignment with min chunk_size
   - use range_overflows()

Signed-off-by: Arunpravin Paneer Selvam 


Series is:
Acked-by: Alex Deucher 

I'd like to take this series through the amdgpu tree if there are no
objections as it's required for display buffers on some chips and I'd
like to make sure it lands in 6.11.

Thanks,

Alex


---
  drivers/gpu/drm/drm_buddy.c  | 25 
+++--

  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c |  2 +-
  include/drm/drm_buddy.h  |  2 ++
  3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c 
b/drivers/gpu/drm/drm_buddy.c

index 94f8c34fc293..8cebe1fa4e9d 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct 
drm_buddy *mm,

   * drm_buddy_block_trim - free unused pages
   *
   * @mm: DRM buddy manager
+ * @start: start address to begin the trimming.
   * @new_size: original size requested
   * @blocks: Input and output list of allocated blocks.
   * MUST contain single block as input to be trimmed.
@@ -866,11 +867,13 @@ static int 
__alloc_contig_try_harder(struct drm_buddy *mm,

   * 0 on success, error code on failure.
   */
  int drm_buddy_block_trim(struct drm_buddy *mm,
+    u64 *start,
  u64 new_size,
  struct list_head *blocks)
  {
 struct drm_buddy_block *parent;
 struct drm_buddy_block *block;
+   u64 block_start, block_end;
 LIST_HEAD(dfs);

[PATCH v6 2/2] drm/amdgpu: Add address alignment support to DCC buffers

2024-07-18 Thread Arunpravin Paneer Selvam
Add address alignment support to the DCC VRAM buffers.

v2:
  - adjust size based on the max_texture_channel_caches values
only for GFX12 DCC buffers.
  - used AMDGPU_GEM_CREATE_GFX12_DCC flag to apply change only
for DCC buffers.
  - roundup non power of two DCC buffer adjusted size to nearest
power of two number as the buddy allocator does not support non
power of two alignments. This applies only to the contiguous
DCC buffers.

v3:(Alex)
  - rewrite the max texture channel caches comparison code in an
algorithmic way to determine the alignment size.

v4:(Alex)
  - Move the logic from amdgpu_vram_mgr_dcc_alignment() to gmc_v12_0.c
and add a new gmc func callback for dcc alignment. If the callback
is non-NULL, call it to get the alignment, otherwise, use the default.

v5:(Alex)
  - Set the Alignment to a default value if the callback doesn't exist.
  - Add the callback to amdgpu_gmc_funcs.

v6:
  - Fix checkpatch warning reported by Intel CI.

v7:(Christian)
  - remove the AMDGPU_GEM_CREATE_GFX12_DCC flag and keep a flag that
checks the BO pinning and for a specific hw generation.

Signed-off-by: Arunpravin Paneer Selvam 
Acked-by: Alex Deucher 
Acked-by: Christian König 
Reviewed-by: Frank Min 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h  |  6 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 39 +++-
 drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c   | 15 
 3 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index febca3130497..654d0548a3f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -156,6 +156,8 @@ struct amdgpu_gmc_funcs {
  uint64_t addr, uint64_t *flags);
/* get the amount of memory used by the vbios for pre-OS console */
unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev);
+   /* get the DCC buffer alignment */
+   u64 (*get_dcc_alignment)(struct amdgpu_device *adev);
 
enum amdgpu_memory_partition (*query_mem_partition_mode)(
struct amdgpu_device *adev);
@@ -363,6 +365,10 @@ struct amdgpu_gmc {
(adev)->gmc.gmc_funcs->override_vm_pte_flags\
((adev), (vm), (addr), (pte_flags))
 #define amdgpu_gmc_get_vbios_fb_size(adev) 
(adev)->gmc.gmc_funcs->get_vbios_fb_size((adev))
+#define amdgpu_gmc_get_dcc_alignment(_adev) ({ \
+   typeof(_adev) (adev) = (_adev); \
+   ((adev)->gmc.gmc_funcs->get_dcc_alignment((adev))); \
+})
 
 /**
  * amdgpu_gmc_vram_full_visible - Check if full VRAM is visible through the BAR
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index f91cc149d06c..ace9d61fc512 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -512,6 +512,17 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
vres->flags |= DRM_BUDDY_RANGE_ALLOCATION;
 
remaining_size = (u64)vres->base.size;
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+   (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
+amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) {
+   u64 adjust_size;
+
+   if (adev->gmc.gmc_funcs->get_dcc_alignment) {
+   adjust_size = amdgpu_gmc_get_dcc_alignment(adev);
+   remaining_size = roundup_pow_of_two(remaining_size + 
adjust_size);
+   vres->flags |= DRM_BUDDY_TRIM_DISABLE;
+   }
+   }
 
mutex_lock(&mgr->lock);
while (remaining_size) {
@@ -521,8 +532,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
min_block_size = mgr->default_page_size;
 
size = remaining_size;
-   if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
-   !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
+
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+   (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 
0) ||
+amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 
1)))
+   min_block_size = size;
+   else if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
+!(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
min_block_size = (u64)pages_per_block << PAGE_SHIFT;
 
BUG_ON(min_block_size < mm->chunk_size);
@@ -553,6 +569,25 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
}
mutex_unlock(&mgr->lock);

[PATCH v6 1/2] drm/buddy: Add start address support to trim function

2024-07-18 Thread Arunpravin Paneer Selvam
- Add a new start parameter to the trim function to specify the exact
  address from which to start trimming. This helps in situations
  where drivers need to enforce a specific address alignment.

- Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this
  flag to disable the allocator's trimming step. This patch gives
  drivers control over trimming so they can do it themselves based
  on application requirements.

v1:(Matthew)
  - check new_start alignment with min chunk_size
  - use range_overflows()

Signed-off-by: Arunpravin Paneer Selvam 
Acked-by: Alex Deucher 
Acked-by: Christian König 
---
 drivers/gpu/drm/drm_buddy.c  | 25 +++--
 drivers/gpu/drm/xe/xe_ttm_vram_mgr.c |  2 +-
 include/drm/drm_buddy.h  |  2 ++
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 6a8e45e9d0ec..103c185bb1c8 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
  * drm_buddy_block_trim - free unused pages
  *
  * @mm: DRM buddy manager
+ * @start: start address to begin the trimming.
  * @new_size: original size requested
  * @blocks: Input and output list of allocated blocks.
  * MUST contain single block as input to be trimmed.
@@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
  * 0 on success, error code on failure.
  */
 int drm_buddy_block_trim(struct drm_buddy *mm,
+u64 *start,
 u64 new_size,
 struct list_head *blocks)
 {
struct drm_buddy_block *parent;
struct drm_buddy_block *block;
+   u64 block_start, block_end;
LIST_HEAD(dfs);
u64 new_start;
int err;
@@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 struct drm_buddy_block,
 link);
 
+   block_start = drm_buddy_block_offset(block);
+   block_end = block_start + drm_buddy_block_size(mm, block);
+
if (WARN_ON(!drm_buddy_block_is_allocated(block)))
return -EINVAL;
 
@@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
if (new_size == drm_buddy_block_size(mm, block))
return 0;
 
+   new_start = block_start;
+   if (start) {
+   new_start = *start;
+
+   if (new_start < block_start)
+   return -EINVAL;
+
+   if (!IS_ALIGNED(new_start, mm->chunk_size))
+   return -EINVAL;
+
+   if (range_overflows(new_start, new_size, block_end))
+   return -EINVAL;
+   }
+
list_del(&block->link);
mark_free(mm, block);
mm->avail += drm_buddy_block_size(mm, block);
@@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
parent = block->parent;
block->parent = NULL;
 
-   new_start = drm_buddy_block_offset(block);
list_add(&block->tmp_link, &dfs);
err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL);
if (err) {
@@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
} while (1);
 
/* Trim the allocated block to the required size */
-   if (original_size != size) {
+   if (!(flags & DRM_BUDDY_TRIM_DISABLE) &&
+   original_size != size) {
struct list_head *trim_list;
LIST_HEAD(temp);
u64 trim_size;
@@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
}
 
drm_buddy_block_trim(mm,
+NULL,
 trim_size,
 trim_list);
 
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c 
b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
index fe3779fdba2c..423b261ea743 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
@@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager 
*man,
} while (remaining_size);
 
if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
-   if (!drm_buddy_block_trim(mm, vres->base.size, &vres->blocks))
+   if (!drm_buddy_block_trim(mm, NULL, vres->base.size, &vres->blocks))
size = vres->base.size;
}
 
diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h
index 2a74fa9d0ce5..9689a7c5dd36 100644
--- a/include/drm/drm_buddy.h
+++ b/include/drm/drm_buddy.h
@@ -27,6 +27,7 @@
 #define DRM_BUDDY_CONTIGUOUS_ALLOCATIONBIT(2)
 #define DRM_BUDDY_CLEAR_ALLOCATION BIT(3)
 #define DRM_BUDDY_CLEARED  BIT(4)
+#define DRM_BUDDY_TRIM_DISABLE BIT(5)
 
 struct drm_buddy_block {
 #define DRM_BUDDY_HE

Re: [PATCH] drm/buddy: Add start address support to trim function

2024-07-17 Thread Paneer Selvam, Arunpravin




On 7/16/2024 3:34 PM, Matthew Auld wrote:

On 16/07/2024 10:50, Paneer Selvam, Arunpravin wrote:

Hi Matthew,

On 7/10/2024 6:20 PM, Matthew Auld wrote:

On 10/07/2024 07:03, Paneer Selvam, Arunpravin wrote:

Thanks Alex.

Hi Matthew,
Any comments?


Do we not pass the required address alignment when allocating the 
pages in the first place?
If address alignment is really useful, we can add that in the 
drm_buddy_alloc_blocks() function.


I mean don't we already pass the min page size, which should give us 
matching physical address alignment?
I think we don't need to align the address to the passed min_block_size
value for all the contiguous buffers, so I thought we could leave that
decision to the drivers, and they can achieve it through the trim
function for this kind of specific request.

https://patchwork.freedesktop.org/series/136150/
We are getting this sparse error from the Intel CI. Do you think these 
errors are introduced by these patches?


Thanks,
Arun.




Thanks,
Arun.




Thanks,
Arun.

On 7/9/2024 1:42 AM, Alex Deucher wrote:

On Thu, Jul 4, 2024 at 4:40 AM Arunpravin Paneer Selvam wrote:

- Add a new start parameter to the trim function to specify the exact
   address from which to start trimming. This helps in situations
   where drivers need to enforce a specific address alignment.

- Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this
   flag to disable the allocator's trimming step. This patch gives
   drivers control over trimming so they can do it themselves based
   on application requirements.

v1:(Matthew)
   - check new_start alignment with min chunk_size
   - use range_overflows()

Signed-off-by: Arunpravin Paneer Selvam 


Series is:
Acked-by: Alex Deucher 

I'd like to take this series through the amdgpu tree if there are no
objections as it's required for display buffers on some chips and I'd
like to make sure it lands in 6.11.

Thanks,

Alex


---
  drivers/gpu/drm/drm_buddy.c  | 25 
+++--

  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c |  2 +-
  include/drm/drm_buddy.h  |  2 ++
  3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c 
b/drivers/gpu/drm/drm_buddy.c

index 94f8c34fc293..8cebe1fa4e9d 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct 
drm_buddy *mm,

   * drm_buddy_block_trim - free unused pages
   *
   * @mm: DRM buddy manager
+ * @start: start address to begin the trimming.
   * @new_size: original size requested
   * @blocks: Input and output list of allocated blocks.
   * MUST contain single block as input to be trimmed.
@@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct 
drm_buddy *mm,

   * 0 on success, error code on failure.
   */
  int drm_buddy_block_trim(struct drm_buddy *mm,
+    u64 *start,
  u64 new_size,
  struct list_head *blocks)
  {
 struct drm_buddy_block *parent;
 struct drm_buddy_block *block;
+   u64 block_start, block_end;
 LIST_HEAD(dfs);
 u64 new_start;
 int err;
@@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
  struct drm_buddy_block,
  link);

+   block_start = drm_buddy_block_offset(block);
+   block_end = block_start + drm_buddy_block_size(mm, block);
+
 if (WARN_ON(!drm_buddy_block_is_allocated(block)))
 return -EINVAL;

@@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 if (new_size == drm_buddy_block_size(mm, block))
 return 0;

+   new_start = block_start;
+   if (start) {
+   new_start = *start;
+
+   if (new_start < block_start)
+   return -EINVAL;
+
+   if (!IS_ALIGNED(new_start, mm->chunk_size))
+   return -EINVAL;
+
+   if (range_overflows(new_start, new_size, block_end))
+   return -EINVAL;
+   }
+
 list_del(&block->link);
 mark_free(mm, block);
 mm->avail += drm_buddy_block_size(mm, block);
@@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 parent = block->parent;
 block->parent = NULL;

-   new_start = drm_buddy_block_offset(block);
 list_add(&block->tmp_link, &dfs);
 err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL);

 if (err) {
@@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy 
*mm,

 } while (1);

 /* Trim the allocated block to the required size */
-   if (original_size != size) {
+   if (!(flags & DRM_BUDDY_TRIM_DISABLE) &&
+   original_size != size) {
 struct list_head *trim_list;
 LIST_HEAD(temp);
 u64 tri

Re: [PATCH v5 2/2] drm/amdgpu: Add address alignment support to DCC buffers

2024-07-17 Thread Paneer Selvam, Arunpravin

Hi Christian,

Can we use the below combination of flags to kick in the hardware 
workaround while pinning BOs for this specific hw generation?


if ((place->flags & TTM_PL_FLAG_CONTIGUOUS) &&
    (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
     amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) {
}

Regards,
Arun.

On 7/17/2024 2:38 PM, Christian König wrote:

Well, that approach was discussed before and seemed to be too complicated.

But I totally agree that the AMDGPU_GEM_CREATE_GFX12_DCC flag is a bad 
idea. This isn't anything userspace should need to specify in the 
first place.


All we need is a hardware workaround which kicks in all the time while 
pinning BOs for this specific hw generation and texture channel 
configuration.


Please remove the AMDGPU_GEM_CREATE_GFX12_DCC flag again if possible, 
or explain why it is actually necessary.


Regards,
Christian.

Am 17.07.24 um 05:44 schrieb Marek Olšák:
AMDGPU_GEM_CREATE_GFX12_DCC is set on 90% of all memory allocations, 
and almost all of them are not displayable. Shouldn't we use a 
different way to indicate that we need a non-power-of-two alignment, 
such as looking at the alignment field directly?


Marek

On Tue, Jul 16, 2024, 11:45 Arunpravin Paneer Selvam wrote:


Add address alignment support to the DCC VRAM buffers.

v2:
  - adjust size based on the max_texture_channel_caches values
    only for GFX12 DCC buffers.
  - used AMDGPU_GEM_CREATE_GFX12_DCC flag to apply change only
    for DCC buffers.
  - roundup non power of two DCC buffer adjusted size to nearest
    power of two number as the buddy allocator does not support non
    power of two alignments. This applies only to the contiguous
    DCC buffers.

v3:(Alex)
  - rewrite the max texture channel caches comparison code in an
    algorithmic way to determine the alignment size.

v4:(Alex)
  - Move the logic from amdgpu_vram_mgr_dcc_alignment() to gmc_v12_0.c
    and add a new gmc func callback for dcc alignment. If the callback
    is non-NULL, call it to get the alignment, otherwise, use the default.

v5:(Alex)
  - Set the Alignment to a default value if the callback doesn't exist.
  - Add the callback to amdgpu_gmc_funcs.

v6:
  - Fix checkpatch error reported by Intel CI.

Signed-off-by: Arunpravin Paneer Selvam

Acked-by: Alex Deucher 
Acked-by: Christian König 
Reviewed-by: Frank Min 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h      |  6 
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 36 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c       | 15 
 3 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index febca3130497..654d0548a3f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -156,6 +156,8 @@ struct amdgpu_gmc_funcs {
                                      uint64_t addr, uint64_t *flags);
        /* get the amount of memory used by the vbios for pre-OS console */
        unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev);
+       /* get the DCC buffer alignment */
+       u64 (*get_dcc_alignment)(struct amdgpu_device *adev);

        enum amdgpu_memory_partition (*query_mem_partition_mode)(
                struct amdgpu_device *adev);
@@ -363,6 +365,10 @@ struct amdgpu_gmc {
        (adev)->gmc.gmc_funcs->override_vm_pte_flags               \
                ((adev), (vm), (addr), (pte_flags))
 #define amdgpu_gmc_get_vbios_fb_size(adev) (adev)->gmc.gmc_funcs->get_vbios_fb_size((adev))
+#define amdgpu_gmc_get_dcc_alignment(_adev) ({        \
+       typeof(_adev) (adev) = (_adev);        \
+       ((adev)->gmc.gmc_funcs->get_dcc_alignment((adev)));    \
+})

 /**
  * amdgpu_gmc_vram_full_visible - Check if full VRAM is visible through the BAR
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index f91cc149d06c..aa9dca12371c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -512,6 +512,16 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
                vres->flags |= DRM_BUDDY_RANGE_ALLOCATION;

        remaining_size = (u64)vres->base.size;
+       if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+           bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) {
+               u64 adjust_size;
+
+               if (adev->gmc.gmc_funcs->get_dcc_alignment) {
+                       adjust_size = amdgpu_gmc_get_dcc_alignment(adev);
+                       r

[PATCH v5 2/2] drm/amdgpu: Add address alignment support to DCC buffers

2024-07-16 Thread Arunpravin Paneer Selvam
Add address alignment support to the DCC VRAM buffers.

v2:
  - adjust size based on the max_texture_channel_caches values
only for GFX12 DCC buffers.
  - used AMDGPU_GEM_CREATE_GFX12_DCC flag to apply change only
for DCC buffers.
  - roundup non power of two DCC buffer adjusted size to nearest
power of two number as the buddy allocator does not support non
power of two alignments. This applies only to the contiguous
DCC buffers.

v3:(Alex)
  - rewrite the max texture channel caches comparison code in an
algorithmic way to determine the alignment size.

v4:(Alex)
  - Move the logic from amdgpu_vram_mgr_dcc_alignment() to gmc_v12_0.c
and add a new gmc func callback for dcc alignment. If the callback
is non-NULL, call it to get the alignment, otherwise, use the default.

v5:(Alex)
  - Set the Alignment to a default value if the callback doesn't exist.
  - Add the callback to amdgpu_gmc_funcs.

v6:
  - Fix checkpatch error reported by Intel CI.

Signed-off-by: Arunpravin Paneer Selvam 
Acked-by: Alex Deucher 
Acked-by: Christian König 
Reviewed-by: Frank Min 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h  |  6 
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 36 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c   | 15 
 3 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index febca3130497..654d0548a3f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -156,6 +156,8 @@ struct amdgpu_gmc_funcs {
  uint64_t addr, uint64_t *flags);
/* get the amount of memory used by the vbios for pre-OS console */
unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev);
+   /* get the DCC buffer alignment */
+   u64 (*get_dcc_alignment)(struct amdgpu_device *adev);
 
enum amdgpu_memory_partition (*query_mem_partition_mode)(
struct amdgpu_device *adev);
@@ -363,6 +365,10 @@ struct amdgpu_gmc {
(adev)->gmc.gmc_funcs->override_vm_pte_flags\
((adev), (vm), (addr), (pte_flags))
 #define amdgpu_gmc_get_vbios_fb_size(adev) 
(adev)->gmc.gmc_funcs->get_vbios_fb_size((adev))
+#define amdgpu_gmc_get_dcc_alignment(_adev) ({ \
+   typeof(_adev) (adev) = (_adev); \
+   ((adev)->gmc.gmc_funcs->get_dcc_alignment((adev))); \
+})
 
 /**
  * amdgpu_gmc_vram_full_visible - Check if full VRAM is visible through the BAR
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index f91cc149d06c..aa9dca12371c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -512,6 +512,16 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
vres->flags |= DRM_BUDDY_RANGE_ALLOCATION;
 
remaining_size = (u64)vres->base.size;
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+   bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) {
+   u64 adjust_size;
+
+   if (adev->gmc.gmc_funcs->get_dcc_alignment) {
+   adjust_size = amdgpu_gmc_get_dcc_alignment(adev);
+   remaining_size = roundup_pow_of_two(remaining_size + 
adjust_size);
+   vres->flags |= DRM_BUDDY_TRIM_DISABLE;
+   }
+   }
 
mutex_lock(&mgr->lock);
while (remaining_size) {
@@ -521,8 +531,12 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
min_block_size = mgr->default_page_size;
 
size = remaining_size;
-   if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
-   !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
+
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+   bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC)
+   min_block_size = size;
+   else if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
+!(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
min_block_size = (u64)pages_per_block << PAGE_SHIFT;
 
BUG_ON(min_block_size < mm->chunk_size);
@@ -553,6 +567,24 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
}
mutex_unlock(&mgr->lock);
 
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+   bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) {
+   struct drm_buddy_block *dcc_block;
+   u64 dcc_start, alignment;
+
+   dcc_block = amdgpu_vram_mgr_first_block(&vres->blocks);
+   dcc_star
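
The hunk is truncated above; a sketch of the trim-with-aligned-start
step it leads into, using the names declared in the visible context
(everything beyond those names is an assumption, not the verbatim
patch):

	dcc_start = amdgpu_vram_mgr_block_start(dcc_block);

	if (adev->gmc.gmc_funcs->get_dcc_alignment) {
		alignment = amdgpu_gmc_get_dcc_alignment(adev);
		/* round the start up to the DCC alignment, then trim the
		 * over-allocation back to the requested size from there
		 */
		dcc_start = round_up(dcc_start, alignment);
		drm_buddy_block_trim(mm, &dcc_start,
				     (u64)vres->base.size,
				     &vres->blocks);
	}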

[PATCH v5 1/2] drm/buddy: Add start address support to trim function

2024-07-16 Thread Arunpravin Paneer Selvam
- Add a new start parameter to the trim function to specify the exact
  address from which to start trimming. This helps in situations
  where drivers need to enforce a specific address alignment.

- Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this
  flag to disable the allocator's trimming step. This patch gives
  drivers control over trimming so they can do it themselves based
  on application requirements.

v1:(Matthew)
  - check new_start alignment with min chunk_size
  - use range_overflows()

Signed-off-by: Arunpravin Paneer Selvam 
Acked-by: Alex Deucher 
Acked-by: Christian König 
---
 drivers/gpu/drm/drm_buddy.c  | 25 +++--
 drivers/gpu/drm/xe/xe_ttm_vram_mgr.c |  2 +-
 include/drm/drm_buddy.h  |  2 ++
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 6a8e45e9d0ec..103c185bb1c8 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
  * drm_buddy_block_trim - free unused pages
  *
  * @mm: DRM buddy manager
+ * @start: start address to begin the trimming.
  * @new_size: original size requested
  * @blocks: Input and output list of allocated blocks.
  * MUST contain single block as input to be trimmed.
@@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
  * 0 on success, error code on failure.
  */
 int drm_buddy_block_trim(struct drm_buddy *mm,
+u64 *start,
 u64 new_size,
 struct list_head *blocks)
 {
struct drm_buddy_block *parent;
struct drm_buddy_block *block;
+   u64 block_start, block_end;
LIST_HEAD(dfs);
u64 new_start;
int err;
@@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 struct drm_buddy_block,
 link);
 
+   block_start = drm_buddy_block_offset(block);
+   block_end = block_start + drm_buddy_block_size(mm, block);
+
if (WARN_ON(!drm_buddy_block_is_allocated(block)))
return -EINVAL;
 
@@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
if (new_size == drm_buddy_block_size(mm, block))
return 0;
 
+   new_start = block_start;
+   if (start) {
+   new_start = *start;
+
+   if (new_start < block_start)
+   return -EINVAL;
+
+   if (!IS_ALIGNED(new_start, mm->chunk_size))
+   return -EINVAL;
+
+   if (range_overflows(new_start, new_size, block_end))
+   return -EINVAL;
+   }
+
list_del(&block->link);
mark_free(mm, block);
mm->avail += drm_buddy_block_size(mm, block);
@@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
parent = block->parent;
block->parent = NULL;
 
-   new_start = drm_buddy_block_offset(block);
list_add(&block->tmp_link, &dfs);
err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL);
if (err) {
@@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
} while (1);
 
/* Trim the allocated block to the required size */
-   if (original_size != size) {
+   if (!(flags & DRM_BUDDY_TRIM_DISABLE) &&
+   original_size != size) {
struct list_head *trim_list;
LIST_HEAD(temp);
u64 trim_size;
@@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
}
 
drm_buddy_block_trim(mm,
+NULL,
 trim_size,
 trim_list);
 
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c 
b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
index fe3779fdba2c..423b261ea743 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
@@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager 
*man,
} while (remaining_size);
 
if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
-   if (!drm_buddy_block_trim(mm, vres->base.size, &vres->blocks))
+   if (!drm_buddy_block_trim(mm, NULL, vres->base.size, &vres->blocks))
size = vres->base.size;
}
 
diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h
index 2a74fa9d0ce5..9689a7c5dd36 100644
--- a/include/drm/drm_buddy.h
+++ b/include/drm/drm_buddy.h
@@ -27,6 +27,7 @@
 #define DRM_BUDDY_CONTIGUOUS_ALLOCATIONBIT(2)
 #define DRM_BUDDY_CLEAR_ALLOCATION BIT(3)
 #define DRM_BUDDY_CLEARED  BIT(4)
+#define DRM_BUDDY_TRIM_DISABLE BIT(5)
 
 struct drm_buddy_block {
 #define DRM_BUDDY_HE

Re: [PATCH] drm/buddy: Add start address support to trim function

2024-07-16 Thread Paneer Selvam, Arunpravin

Hi Matthew,

On 7/10/2024 6:20 PM, Matthew Auld wrote:

On 10/07/2024 07:03, Paneer Selvam, Arunpravin wrote:

Thanks Alex.

Hi Matthew,
Any comments?


Do we not pass the required address alignment when allocating the 
pages in the first place?
If address alignment is really useful, we can add that in the 
drm_buddy_alloc_blocks() function.


Thanks,
Arun.




Thanks,
Arun.

On 7/9/2024 1:42 AM, Alex Deucher wrote:

On Thu, Jul 4, 2024 at 4:40 AM Arunpravin Paneer Selvam wrote:

- Add a new start parameter to the trim function to specify the exact
   address from which to start trimming. This helps in situations
   where drivers need to enforce a specific address alignment.

- Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this
   flag to disable the allocator's trimming step. This patch gives
   drivers control over trimming so they can do it themselves based
   on application requirements.

v1:(Matthew)
   - check new_start alignment with min chunk_size
   - use range_overflows()

Signed-off-by: Arunpravin Paneer Selvam 


Series is:
Acked-by: Alex Deucher 

I'd like to take this series through the amdgpu tree if there are no
objections as it's required for display buffers on some chips and I'd
like to make sure it lands in 6.11.

Thanks,

Alex


---
  drivers/gpu/drm/drm_buddy.c  | 25 +++--
  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c |  2 +-
  include/drm/drm_buddy.h  |  2 ++
  3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 94f8c34fc293..8cebe1fa4e9d 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct 
drm_buddy *mm,

   * drm_buddy_block_trim - free unused pages
   *
   * @mm: DRM buddy manager
+ * @start: start address to begin the trimming.
   * @new_size: original size requested
   * @blocks: Input and output list of allocated blocks.
   * MUST contain single block as input to be trimmed.
@@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct 
drm_buddy *mm,

   * 0 on success, error code on failure.
   */
  int drm_buddy_block_trim(struct drm_buddy *mm,
+    u64 *start,
  u64 new_size,
  struct list_head *blocks)
  {
 struct drm_buddy_block *parent;
 struct drm_buddy_block *block;
+   u64 block_start, block_end;
 LIST_HEAD(dfs);
 u64 new_start;
 int err;
@@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
  struct drm_buddy_block,
  link);

+   block_start = drm_buddy_block_offset(block);
+   block_end = block_start + drm_buddy_block_size(mm, block);
+
 if (WARN_ON(!drm_buddy_block_is_allocated(block)))
 return -EINVAL;

@@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 if (new_size == drm_buddy_block_size(mm, block))
 return 0;

+   new_start = block_start;
+   if (start) {
+   new_start = *start;
+
+   if (new_start < block_start)
+   return -EINVAL;
+
+   if (!IS_ALIGNED(new_start, mm->chunk_size))
+   return -EINVAL;
+
+   if (range_overflows(new_start, new_size, block_end))
+   return -EINVAL;
+   }
+
 list_del(&block->link);
 mark_free(mm, block);
 mm->avail += drm_buddy_block_size(mm, block);
@@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 parent = block->parent;
 block->parent = NULL;

-   new_start = drm_buddy_block_offset(block);
 list_add(&block->tmp_link, &dfs);
 err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL);

 if (err) {
@@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
 } while (1);

 /* Trim the allocated block to the required size */
-   if (original_size != size) {
+   if (!(flags & DRM_BUDDY_TRIM_DISABLE) &&
+   original_size != size) {
 struct list_head *trim_list;
 LIST_HEAD(temp);
 u64 trim_size;
@@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
 }

 drm_buddy_block_trim(mm,
+    NULL,
  trim_size,
  trim_list);

diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c 
b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c

index fe3779fdba2c..423b261ea743 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
@@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct 
ttm_resource_manager *man,

 } while (remaining_size);

 if (plac

[PATCH 1/2] drm/buddy: Add start address support to trim function

2024-07-16 Thread Arunpravin Paneer Selvam
- Add a new start parameter to the trim function to specify the exact
  address from which to start trimming. This helps in situations
  where drivers need to enforce a specific address alignment.

- Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this
  flag to disable the allocator's trimming step. This patch gives
  drivers control over trimming so they can do it themselves based
  on application requirements.

v1:(Matthew)
  - check new_start alignment with min chunk_size
  - use range_overflows()

Signed-off-by: Arunpravin Paneer Selvam 
Acked-by: Alex Deucher 
Acked-by: Christian König 
---
 drivers/gpu/drm/drm_buddy.c  | 25 +++--
 drivers/gpu/drm/xe/xe_ttm_vram_mgr.c |  2 +-
 include/drm/drm_buddy.h  |  2 ++
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 6a8e45e9d0ec..103c185bb1c8 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
  * drm_buddy_block_trim - free unused pages
  *
  * @mm: DRM buddy manager
+ * @start: start address to begin the trimming.
  * @new_size: original size requested
  * @blocks: Input and output list of allocated blocks.
  * MUST contain single block as input to be trimmed.
@@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
  * 0 on success, error code on failure.
  */
 int drm_buddy_block_trim(struct drm_buddy *mm,
+u64 *start,
 u64 new_size,
 struct list_head *blocks)
 {
struct drm_buddy_block *parent;
struct drm_buddy_block *block;
+   u64 block_start, block_end;
LIST_HEAD(dfs);
u64 new_start;
int err;
@@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 struct drm_buddy_block,
 link);
 
+   block_start = drm_buddy_block_offset(block);
+   block_end = block_start + drm_buddy_block_size(mm, block);
+
if (WARN_ON(!drm_buddy_block_is_allocated(block)))
return -EINVAL;
 
@@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
if (new_size == drm_buddy_block_size(mm, block))
return 0;
 
+   new_start = block_start;
+   if (start) {
+   new_start = *start;
+
+   if (new_start < block_start)
+   return -EINVAL;
+
+   if (!IS_ALIGNED(new_start, mm->chunk_size))
+   return -EINVAL;
+
+   if (range_overflows(new_start, new_size, block_end))
+   return -EINVAL;
+   }
+
list_del(&block->link);
mark_free(mm, block);
mm->avail += drm_buddy_block_size(mm, block);
@@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
parent = block->parent;
block->parent = NULL;
 
-   new_start = drm_buddy_block_offset(block);
list_add(&block->tmp_link, &dfs);
err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL);
if (err) {
@@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
} while (1);
 
/* Trim the allocated block to the required size */
-   if (original_size != size) {
+   if (!(flags & DRM_BUDDY_TRIM_DISABLE) &&
+   original_size != size) {
struct list_head *trim_list;
LIST_HEAD(temp);
u64 trim_size;
@@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
}
 
drm_buddy_block_trim(mm,
+NULL,
 trim_size,
 trim_list);
 
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c 
b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
index fe3779fdba2c..423b261ea743 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
@@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager 
*man,
} while (remaining_size);
 
if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
-   if (!drm_buddy_block_trim(mm, vres->base.size, &vres->blocks))
+   if (!drm_buddy_block_trim(mm, NULL, vres->base.size, &vres->blocks))
size = vres->base.size;
}
 
diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h
index 2a74fa9d0ce5..9689a7c5dd36 100644
--- a/include/drm/drm_buddy.h
+++ b/include/drm/drm_buddy.h
@@ -27,6 +27,7 @@
 #define DRM_BUDDY_CONTIGUOUS_ALLOCATIONBIT(2)
 #define DRM_BUDDY_CLEAR_ALLOCATION BIT(3)
 #define DRM_BUDDY_CLEARED  BIT(4)
+#define DRM_BUDDY_TRIM_DISABLE BIT(5)
 
 struct drm_buddy_block {
#define DRM_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12)

[PATCH 2/2] drm/amdgpu: Add address alignment support to DCC buffers

2024-07-16 Thread Arunpravin Paneer Selvam
Add address alignment support to the DCC VRAM buffers.

v2:
  - adjust size based on the max_texture_channel_caches values
only for GFX12 DCC buffers.
  - used AMDGPU_GEM_CREATE_GFX12_DCC flag to apply change only
for DCC buffers.
  - round up a non-power-of-two DCC buffer's adjusted size to the
nearest power-of-two, as the buddy allocator does not support
non-power-of-two alignments. This applies only to the contiguous
DCC buffers.
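
For example, under this scheme a 96 MiB contiguous DCC request
adjusted by a 64 KiB alignment becomes
roundup_pow_of_two(96 MiB + 64 KiB) = 128 MiB before allocation
(the sizes here are illustrative, not taken from the patch).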

v3:(Alex)
  - rewrite the max texture channel caches comparison code in an
algorithmic way to determine the alignment size.

v4:(Alex)
  - Move the logic from amdgpu_vram_mgr_dcc_alignment() to gmc_v12_0.c
and add a new gmc func callback for dcc alignment. If the callback
is non-NULL, call it to get the alignment, otherwise, use the default.

v5:(Alex)
  - Set the Alignment to a default value if the callback doesn't exist.
  - Add the callback to amdgpu_gmc_funcs.
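
As a sketch of the callback-with-default pattern described in v4/v5
(the default value below is illustrative, not taken from the patch):

	u64 alignment = PAGE_SIZE; /* hypothetical default */

	/* Prefer the per-ASIC callback when the GMC provides one. */
	if (adev->gmc.gmc_funcs->get_dcc_alignment)
		alignment = amdgpu_gmc_get_dcc_alignment(adev);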

Signed-off-by: Arunpravin Paneer Selvam 
Acked-by: Alex Deucher 
Acked-by: Christian König 
Reviewed-by: Frank Min 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h  |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 36 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c   | 15 
 3 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index febca3130497..49dfcf112ac1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -156,6 +156,8 @@ struct amdgpu_gmc_funcs {
  uint64_t addr, uint64_t *flags);
/* get the amount of memory used by the vbios for pre-OS console */
unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev);
+   /* get the DCC buffer alignment */
+   u64 (*get_dcc_alignment)(struct amdgpu_device *adev);
 
enum amdgpu_memory_partition (*query_mem_partition_mode)(
struct amdgpu_device *adev);
@@ -363,6 +365,7 @@ struct amdgpu_gmc {
(adev)->gmc.gmc_funcs->override_vm_pte_flags\
((adev), (vm), (addr), (pte_flags))
 #define amdgpu_gmc_get_vbios_fb_size(adev) 
(adev)->gmc.gmc_funcs->get_vbios_fb_size((adev))
+#define amdgpu_gmc_get_dcc_alignment(adev) 
((adev)->gmc.gmc_funcs->get_dcc_alignment((adev)))
 
 /**
  * amdgpu_gmc_vram_full_visible - Check if full VRAM is visible through the BAR
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index f91cc149d06c..aa9dca12371c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -512,6 +512,16 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
vres->flags |= DRM_BUDDY_RANGE_ALLOCATION;
 
remaining_size = (u64)vres->base.size;
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+   bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) {
+   u64 adjust_size;
+
+   if (adev->gmc.gmc_funcs->get_dcc_alignment) {
+   adjust_size = amdgpu_gmc_get_dcc_alignment(adev);
+   remaining_size = roundup_pow_of_two(remaining_size + 
adjust_size);
+   vres->flags |= DRM_BUDDY_TRIM_DISABLE;
+   }
+   }
 
	mutex_lock(&mgr->lock);
while (remaining_size) {
@@ -521,8 +531,12 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
min_block_size = mgr->default_page_size;
 
size = remaining_size;
-   if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
-   !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
+
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+   bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC)
+   min_block_size = size;
+   else if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
+!(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
min_block_size = (u64)pages_per_block << PAGE_SHIFT;
 
BUG_ON(min_block_size < mm->chunk_size);
@@ -553,6 +567,24 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
}
	mutex_unlock(&mgr->lock);
 
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+   bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) {
+   struct drm_buddy_block *dcc_block;
+   u64 dcc_start, alignment;
+
+   dcc_block = amdgpu_vram_mgr_first_block(&vres->blocks);
+   dcc_start = amdgpu_vram_mgr_block_start(dcc_block);
+
+   if (adev->gmc.gmc_funcs->get_dcc_alignment) {
+   alignment = amdgpu_gmc_get_dcc_alignment(adev);

Re: [PATCH] drm/buddy: Add start address support to trim function

2024-07-10 Thread Paneer Selvam, Arunpravin

Thanks Alex.

Hi Matthew,
Any comments?

Thanks,
Arun.

On 7/9/2024 1:42 AM, Alex Deucher wrote:

On Thu, Jul 4, 2024 at 4:40 AM Arunpravin Paneer Selvam
 wrote:

- Add a new start parameter to the trim function to specify the
   exact address from which to start trimming. This helps in
   situations where drivers would like to do address alignment
   for specific requirements.

- Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this
   flag to disable the allocator's trimming step. This patch hands
   control of trimming to the drivers so they can do it themselves
   based on the application requirements.

v1:(Matthew)
   - check new_start alignment with min chunk_size
   - use range_overflows()

Signed-off-by: Arunpravin Paneer Selvam 

Series is:
Acked-by: Alex Deucher 

I'd like to take this series through the amdgpu tree if there are no
objections as it's required for display buffers on some chips and I'd
like to make sure it lands in 6.11.

Thanks,

Alex


---
  drivers/gpu/drm/drm_buddy.c  | 25 +++--
  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c |  2 +-
  include/drm/drm_buddy.h  |  2 ++
  3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 94f8c34fc293..8cebe1fa4e9d 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
   * drm_buddy_block_trim - free unused pages
   *
   * @mm: DRM buddy manager
+ * @start: start address to begin the trimming.
   * @new_size: original size requested
   * @blocks: Input and output list of allocated blocks.
   * MUST contain single block as input to be trimmed.
@@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
   * 0 on success, error code on failure.
   */
  int drm_buddy_block_trim(struct drm_buddy *mm,
+u64 *start,
  u64 new_size,
  struct list_head *blocks)
  {
 struct drm_buddy_block *parent;
 struct drm_buddy_block *block;
+   u64 block_start, block_end;
 LIST_HEAD(dfs);
 u64 new_start;
 int err;
@@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
  struct drm_buddy_block,
  link);

+   block_start = drm_buddy_block_offset(block);
+   block_end = block_start + drm_buddy_block_size(mm, block);
+
 if (WARN_ON(!drm_buddy_block_is_allocated(block)))
 return -EINVAL;

@@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 if (new_size == drm_buddy_block_size(mm, block))
 return 0;

+   new_start = block_start;
+   if (start) {
+   new_start = *start;
+
+   if (new_start < block_start)
+   return -EINVAL;
+
+   if (!IS_ALIGNED(new_start, mm->chunk_size))
+   return -EINVAL;
+
+   if (range_overflows(new_start, new_size, block_end))
+   return -EINVAL;
+   }
+
 	list_del(&block->link);
 mark_free(mm, block);
 mm->avail += drm_buddy_block_size(mm, block);
@@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 parent = block->parent;
 block->parent = NULL;

-   new_start = drm_buddy_block_offset(block);
 	list_add(&block->tmp_link, &dfs);
 	err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL);
 if (err) {
@@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
 } while (1);

 /* Trim the allocated block to the required size */
-   if (original_size != size) {
+   if (!(flags & DRM_BUDDY_TRIM_DISABLE) &&
+   original_size != size) {
 struct list_head *trim_list;
 LIST_HEAD(temp);
 u64 trim_size;
@@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
 }

 drm_buddy_block_trim(mm,
+NULL,
  trim_size,
  trim_list);

diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c 
b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
index fe3779fdba2c..423b261ea743 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
@@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager 
*man,
 } while (remaining_size);

 if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
-   if (!drm_buddy_block_trim(mm, vres->base.size, &vres->blocks))
+   if (!drm_buddy_block_trim(mm, NULL, vres->base.size,
+ &vres->blocks))
 size = vres->base.size;
 }

diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h
index 82570f77e817..0c2f735f0265 100644

[PATCH 2/2] drm/amdgpu: Add address alignment support to DCC buffers

2024-07-04 Thread Arunpravin Paneer Selvam
Add address alignment support to the DCC VRAM buffers.

v2:
  - adjust size based on the max_texture_channel_caches values
only for GFX12 DCC buffers.
  - used AMDGPU_GEM_CREATE_GFX12_DCC flag to apply change only
for DCC buffers.
  - round up a non-power-of-two DCC buffer's adjusted size to the
nearest power-of-two, as the buddy allocator does not support
non-power-of-two alignments. This applies only to the contiguous
DCC buffers.

v3:(Alex)
  - rewrite the max texture channel caches comparison code in an
algorithmic way to determine the alignment size.

v4:(Alex)
  - Move the logic from amdgpu_vram_mgr_dcc_alignment() to gmc_v12_0.c
and add a new gmc func callback for dcc alignment. If the callback
is non-NULL, call it to get the alignment, otherwise, use the default.

v5:(Alex)
  - Set the Alignment to a default value if the callback doesn't exist.
  - Add the callback to amdgpu_gmc_funcs.

Signed-off-by: Arunpravin Paneer Selvam 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h  |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 36 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c   | 15 
 3 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index febca3130497..49dfcf112ac1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -156,6 +156,8 @@ struct amdgpu_gmc_funcs {
  uint64_t addr, uint64_t *flags);
/* get the amount of memory used by the vbios for pre-OS console */
unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev);
+   /* get the DCC buffer alignment */
+   u64 (*get_dcc_alignment)(struct amdgpu_device *adev);
 
enum amdgpu_memory_partition (*query_mem_partition_mode)(
struct amdgpu_device *adev);
@@ -363,6 +365,7 @@ struct amdgpu_gmc {
(adev)->gmc.gmc_funcs->override_vm_pte_flags\
((adev), (vm), (addr), (pte_flags))
 #define amdgpu_gmc_get_vbios_fb_size(adev) 
(adev)->gmc.gmc_funcs->get_vbios_fb_size((adev))
+#define amdgpu_gmc_get_dcc_alignment(adev) 
((adev)->gmc.gmc_funcs->get_dcc_alignment((adev)))
 
 /**
  * amdgpu_gmc_vram_full_visible - Check if full VRAM is visible through the BAR
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index eb94f943b28e..756d1c8cbffd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -509,6 +509,16 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
vres->flags |= DRM_BUDDY_RANGE_ALLOCATION;
 
remaining_size = (u64)vres->base.size;
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+   bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) {
+   u64 adjust_size;
+
+   if (adev->gmc.gmc_funcs->get_dcc_alignment) {
+   adjust_size = amdgpu_gmc_get_dcc_alignment(adev);
+   remaining_size = roundup_pow_of_two(remaining_size + 
adjust_size);
+   vres->flags |= DRM_BUDDY_TRIM_DISABLE;
+   }
+   }
 
	mutex_lock(&mgr->lock);
while (remaining_size) {
@@ -518,8 +528,12 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
min_block_size = mgr->default_page_size;
 
size = remaining_size;
-   if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
-   !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
+
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+   bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC)
+   min_block_size = size;
+   else if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
+!(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
min_block_size = (u64)pages_per_block << PAGE_SHIFT;
 
BUG_ON(min_block_size < mm->chunk_size);
@@ -550,6 +564,24 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
}
	mutex_unlock(&mgr->lock);
 
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+   bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) {
+   struct drm_buddy_block *dcc_block;
+   u64 dcc_start, alignment;
+
+   dcc_block = amdgpu_vram_mgr_first_block(&vres->blocks);
+   dcc_start = amdgpu_vram_mgr_block_start(dcc_block);
+
+   if (adev->gmc.gmc_funcs->get_dcc_alignment) {
+   alignment = amdgpu_gmc_get_dcc_alignment(adev);
+  

[PATCH] drm/buddy: Add start address support to trim function

2024-07-04 Thread Arunpravin Paneer Selvam
- Add a new start parameter to the trim function to specify the
  exact address from which to start trimming. This helps in
  situations where drivers would like to do address alignment
  for specific requirements.

- Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this
  flag to disable the allocator's trimming step. This patch hands
  control of trimming to the drivers so they can do it themselves
  based on the application requirements.

v1:(Matthew)
  - check new_start alignment with min chunk_size
  - use range_overflows()
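
For reference, a minimal sketch of the overflow-safe check an
i915-style range_overflows() helper performs (an illustration of the
semantics, not a copy of the actual macro):

	/* True if [start, start + size) does not fit below max, without
	 * the start + size addition ever wrapping around. */
	static inline bool range_overflows_u64(u64 start, u64 size, u64 max)
	{
		return start >= max || size > max - start;
	}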

Signed-off-by: Arunpravin Paneer Selvam 
---
 drivers/gpu/drm/drm_buddy.c  | 25 +++--
 drivers/gpu/drm/xe/xe_ttm_vram_mgr.c |  2 +-
 include/drm/drm_buddy.h  |  2 ++
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 94f8c34fc293..8cebe1fa4e9d 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
  * drm_buddy_block_trim - free unused pages
  *
  * @mm: DRM buddy manager
+ * @start: start address to begin the trimming.
  * @new_size: original size requested
  * @blocks: Input and output list of allocated blocks.
  * MUST contain single block as input to be trimmed.
@@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
  * 0 on success, error code on failure.
  */
 int drm_buddy_block_trim(struct drm_buddy *mm,
+u64 *start,
 u64 new_size,
 struct list_head *blocks)
 {
struct drm_buddy_block *parent;
struct drm_buddy_block *block;
+   u64 block_start, block_end;
LIST_HEAD(dfs);
u64 new_start;
int err;
@@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 struct drm_buddy_block,
 link);
 
+   block_start = drm_buddy_block_offset(block);
+   block_end = block_start + drm_buddy_block_size(mm, block);
+
if (WARN_ON(!drm_buddy_block_is_allocated(block)))
return -EINVAL;
 
@@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
if (new_size == drm_buddy_block_size(mm, block))
return 0;
 
+   new_start = block_start;
+   if (start) {
+   new_start = *start;
+
+   if (new_start < block_start)
+   return -EINVAL;
+
+   if (!IS_ALIGNED(new_start, mm->chunk_size))
+   return -EINVAL;
+
+   if (range_overflows(new_start, new_size, block_end))
+   return -EINVAL;
+   }
+
	list_del(&block->link);
mark_free(mm, block);
mm->avail += drm_buddy_block_size(mm, block);
@@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
parent = block->parent;
block->parent = NULL;
 
-   new_start = drm_buddy_block_offset(block);
	list_add(&block->tmp_link, &dfs);
	err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL);
if (err) {
@@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
} while (1);
 
/* Trim the allocated block to the required size */
-   if (original_size != size) {
+   if (!(flags & DRM_BUDDY_TRIM_DISABLE) &&
+   original_size != size) {
struct list_head *trim_list;
LIST_HEAD(temp);
u64 trim_size;
@@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
}
 
drm_buddy_block_trim(mm,
+NULL,
 trim_size,
 trim_list);
 
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c 
b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
index fe3779fdba2c..423b261ea743 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
@@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager 
*man,
} while (remaining_size);
 
if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
-   if (!drm_buddy_block_trim(mm, vres->base.size, &vres->blocks))
+   if (!drm_buddy_block_trim(mm, NULL, vres->base.size,
+ &vres->blocks))
size = vres->base.size;
}
 
diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h
index 82570f77e817..0c2f735f0265 100644
--- a/include/drm/drm_buddy.h
+++ b/include/drm/drm_buddy.h
@@ -27,6 +27,7 @@
 #define DRM_BUDDY_CONTIGUOUS_ALLOCATIONBIT(2)
 #define DRM_BUDDY_CLEAR_ALLOCATION BIT(3)
 #define DRM_BUDDY_CLEARED  BIT(4)
+#define DRM_BUDDY_TRIM_DISABLE BIT(5)
 
 struct drm_buddy_block {
 #define DRM_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12)
@@ -155,6 +156,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,

[PATCH] drm/buddy: Add start address support to trim function

2024-06-20 Thread Arunpravin Paneer Selvam
- Add a new start parameter to the trim function to specify the
  exact address from which to start trimming. This helps in
  situations where drivers would like to do address alignment
  for specific requirements.

- Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this
  flag to disable the allocator's trimming step. This patch hands
  control of trimming to the drivers so they can do it themselves
  based on the application requirements.
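
As a usage illustration (a hypothetical caller, not from this patch;
every name except the flags is a placeholder), a driver opting out of
the allocator's trimming would pass the new flag at allocation time and
trim the result itself later:

	unsigned long flags = DRM_BUDDY_RANGE_ALLOCATION |
			      DRM_BUDDY_TRIM_DISABLE;
	int err = drm_buddy_alloc_blocks(mm, start, end, size,
					 min_block_size, &blocks, flags);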

Signed-off-by: Arunpravin Paneer Selvam 
---
 drivers/gpu/drm/drm_buddy.c  | 22 --
 drivers/gpu/drm/xe/xe_ttm_vram_mgr.c |  2 +-
 include/drm/drm_buddy.h  |  2 ++
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 6a8e45e9d0ec..287b6acb1637 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
  * drm_buddy_block_trim - free unused pages
  *
  * @mm: DRM buddy manager
+ * @start: start address to begin the trimming.
  * @new_size: original size requested
  * @blocks: Input and output list of allocated blocks.
  * MUST contain single block as input to be trimmed.
@@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
  * 0 on success, error code on failure.
  */
 int drm_buddy_block_trim(struct drm_buddy *mm,
+u64 *start,
 u64 new_size,
 struct list_head *blocks)
 {
struct drm_buddy_block *parent;
struct drm_buddy_block *block;
+   u64 block_start, block_end;
LIST_HEAD(dfs);
u64 new_start;
int err;
@@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 struct drm_buddy_block,
 link);
 
+   block_start = drm_buddy_block_offset(block);
+   block_end = block_start + drm_buddy_block_size(mm, block) - 1;
+
if (WARN_ON(!drm_buddy_block_is_allocated(block)))
return -EINVAL;
 
@@ -894,6 +900,17 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
if (new_size == drm_buddy_block_size(mm, block))
return 0;
 
+   new_start = block_start;
+   if (start) {
+   new_start = *start;
+
+   if (new_start < block_start)
+   return -EINVAL;
+
+   if ((new_start + new_size) > block_end)
+   return -EINVAL;
+   }
+
	list_del(&block->link);
mark_free(mm, block);
mm->avail += drm_buddy_block_size(mm, block);
@@ -904,7 +921,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
parent = block->parent;
block->parent = NULL;
 
-   new_start = drm_buddy_block_offset(block);
	list_add(&block->tmp_link, &dfs);
	err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL);
if (err) {
@@ -1066,7 +1082,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
} while (1);
 
/* Trim the allocated block to the required size */
-   if (original_size != size) {
+   if (!(flags & DRM_BUDDY_TRIM_DISABLE) &&
+   original_size != size) {
struct list_head *trim_list;
LIST_HEAD(temp);
u64 trim_size;
@@ -1083,6 +1100,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
}
 
drm_buddy_block_trim(mm,
+NULL,
 trim_size,
 trim_list);
 
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c 
b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
index fe3779fdba2c..423b261ea743 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
@@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager 
*man,
} while (remaining_size);
 
if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
-   if (!drm_buddy_block_trim(mm, vres->base.size, &vres->blocks))
+   if (!drm_buddy_block_trim(mm, NULL, vres->base.size,
+ &vres->blocks))
size = vres->base.size;
}
 
diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h
index 2a74fa9d0ce5..9689a7c5dd36 100644
--- a/include/drm/drm_buddy.h
+++ b/include/drm/drm_buddy.h
@@ -27,6 +27,7 @@
 #define DRM_BUDDY_CONTIGUOUS_ALLOCATIONBIT(2)
 #define DRM_BUDDY_CLEAR_ALLOCATION BIT(3)
 #define DRM_BUDDY_CLEARED  BIT(4)
+#define DRM_BUDDY_TRIM_DISABLE BIT(5)
 
 struct drm_buddy_block {
 #define DRM_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12)
@@ -155,6 +156,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
   unsigned long flags);
 
 int drm_buddy_block_trim(struct drm_buddy *mm,
+u64 *start,
   

Re: [PATCH v3] drm/amdgpu: Fix the BO release clear memory warning

2024-06-10 Thread Paneer Selvam, Arunpravin

Hi Christian,

On 6/10/2024 11:35 PM, Christian König wrote:

Am 10.06.24 um 20:04 schrieb Arunpravin Paneer Selvam:

This happens when amdgpu_bo_release_notify() runs before
amdgpu_ttm_set_buffer_funcs_status() has set the buffer
funcs to enabled.

Check the buffer funcs enablement before calling the
fill-buffer routine.

v2:(Christian)
   - Apply it only for GEM buffers and since GEM buffers are only
 allocated/freed while the driver is loaded we never run into
 the issue to clear with buffer funcs disabled.

v3:(Mario)
   - drop the stable tag as this will presumably go into a
 -fixes PR for 6.10

Log snip:
*ERROR* Trying to clear memory with ring turned off.
RIP: 0010:amdgpu_bo_release_notify+0x201/0x220 [amdgpu]

Fixes: a68c7eaa7a8f ("drm/amdgpu: Enable clear page functionality")
Signed-off-by: Arunpravin Paneer Selvam 


Reviewed-by: Christian König 
Tested-by: Mikhail Gavrilov 
Tested-by: Richard Gong 
Suggested-by: Christian König 


Please push to drm-misc-fixes ASAP.

I pushed into drm-misc-fixes.

Thanks,
Arun


Thanks,
Christian.


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 --
  2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c

index 67c234bcf89f..3adaa4670103 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -108,6 +108,7 @@ int amdgpu_gem_object_create(struct amdgpu_device 
*adev, unsigned long size,

  memset(&bp, 0, sizeof(bp));
  *obj = NULL;
+    flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
    bp.size = size;
  bp.byte_align = alignment;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index 8d8c39be6129..c556c8b653fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -604,8 +604,6 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
  if (!amdgpu_bo_support_uswc(bo->flags))
  bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC;
  -    bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
-
  bo->tbo.bdev = &adev->mman.bdev;
  if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA |
    AMDGPU_GEM_DOMAIN_GDS))






[PATCH v3] drm/amdgpu: Fix the BO release clear memory warning

2024-06-10 Thread Arunpravin Paneer Selvam
This happens when amdgpu_bo_release_notify() runs before
amdgpu_ttm_set_buffer_funcs_status() has set the buffer
funcs to enabled.

Check the buffer funcs enablement before calling the
fill-buffer routine.

v2:(Christian)
  - Apply it only for GEM buffers and since GEM buffers are only
allocated/freed while the driver is loaded we never run into
the issue to clear with buffer funcs disabled.

v3:(Mario)
  - drop the stable tag as this will presumably go into a
-fixes PR for 6.10

Log snip:
*ERROR* Trying to clear memory with ring turned off.
RIP: 0010:amdgpu_bo_release_notify+0x201/0x220 [amdgpu]

Fixes: a68c7eaa7a8f ("drm/amdgpu: Enable clear page functionality")
Signed-off-by: Arunpravin Paneer Selvam 
Reviewed-by: Christian König 
Tested-by: Mikhail Gavrilov 
Tested-by: Richard Gong 
Suggested-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c| 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 67c234bcf89f..3adaa4670103 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -108,6 +108,7 @@ int amdgpu_gem_object_create(struct amdgpu_device *adev, 
unsigned long size,
 
	memset(&bp, 0, sizeof(bp));
*obj = NULL;
+   flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
 
bp.size = size;
bp.byte_align = alignment;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8d8c39be6129..c556c8b653fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -604,8 +604,6 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
if (!amdgpu_bo_support_uswc(bo->flags))
bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC;
 
-   bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
-
	bo->tbo.bdev = &adev->mman.bdev;
if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA |
  AMDGPU_GEM_DOMAIN_GDS))
-- 
2.25.1



[PATCH v2] drm/amdgpu: Fix the BO release clear memory warning

2024-06-06 Thread Arunpravin Paneer Selvam
[6.038632]  do_syscall_64+0x84/0x1a0
[6.038634]  ? do_syscall_64+0x90/0x1a0
[6.038635]  ? srso_alias_return_thunk+0x5/0xfbef5
[6.038637]  ? do_syscall_64+0x90/0x1a0
[6.038638]  ? srso_alias_return_thunk+0x5/0xfbef5
[6.038639]  ? do_syscall_64+0x90/0x1a0
[6.038640]  ? srso_alias_return_thunk+0x5/0xfbef5
[6.038642]  ? srso_alias_return_thunk+0x5/0xfbef5
[6.038644]  entry_SYSCALL_64_after_hwframe+0x78/0x80
[6.038645] RIP: 0033:0x7f2bd1e9d059
[6.038647] Code: 08 89 e8 5b 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 
f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 
f0 ff ff 73 01 c3 48 8b 0d 8f 1d 0d 00 f7 d8 64 89 01 48
[6.038648] RSP: 002b:7fffaf804878 EFLAGS: 0246 ORIG_RAX: 
0139
[6.038650] RAX: ffda RBX: 55a9b2328d60 RCX: 7f2bd1e9d059
[6.038650] RDX:  RSI: 7f2bd1fd0509 RDI: 0024
[6.038651] RBP:  R08: 0040 R09: 55a9b23000a0
[6.038652] R10: 0038 R11: 0246 R12: 7f2bd1fd0509
[6.038652] R13: 0002 R14: 55a9b2326f90 R15: 
[6.038655]  
[6.038656] ---[ end trace  ]---

Cc:  # 6.10+
Fixes: a68c7eaa7a8f ("drm/amdgpu: Enable clear page functionality")
Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c| 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 67c234bcf89f..3adaa4670103 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -108,6 +108,7 @@ int amdgpu_gem_object_create(struct amdgpu_device *adev, 
unsigned long size,
 
	memset(&bp, 0, sizeof(bp));
*obj = NULL;
+   flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
 
bp.size = size;
bp.byte_align = alignment;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8d8c39be6129..c556c8b653fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -604,8 +604,6 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
if (!amdgpu_bo_support_uswc(bo->flags))
bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC;
 
-   bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
-
	bo->tbo.bdev = &adev->mman.bdev;
if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA |
  AMDGPU_GEM_DOMAIN_GDS))
-- 
2.25.1



Re: [PATCH] drm/amdgpu: Fix the BO release clear memory warning

2024-06-05 Thread Paneer Selvam, Arunpravin

Hi Christian,

On 5/7/2024 8:21 PM, Christian König wrote:

Am 06.05.24 um 15:48 schrieb Arunpravin Paneer Selvam:

This happens when amdgpu_bo_release_notify() runs before
amdgpu_ttm_set_buffer_funcs_status() has set the buffer
funcs to enabled.

Check the buffer funcs enablement before calling the
fill-buffer routine.

Log snip:
[    6.036477] [drm:amdgpu_fill_buffer [amdgpu]] *ERROR* Trying to 
clear memory with ring turned off.

[    6.036667] [ cut here ]
[    6.036668] WARNING: CPU: 3 PID: 370 at 
drivers/gpu/drm/amd/amdgpu/amdgpu_object.c:1355 
amdgpu_bo_release_notify+0x201/0x220 [amdgpu]
[    6.036767] Modules linked in: hid_generic amdgpu(+) amdxcp 
drm_exec gpu_sched drm_buddy i2c_algo_bit usbhid drm_suballoc_helper 
drm_display_helper hid sd_mod cec rc_core drm_ttm_helper ahci ttm 
nvme libahci drm_kms_helper nvme_core r8169 xhci_pci libata t10_pi 
xhci_hcd realtek crc32_pclmul crc64_rocksoft mdio_devres crc64 drm 
crc32c_intel scsi_mod usbcore thunderbolt crc_t10dif i2c_piix4 libphy 
crct10dif_generic crct10dif_pclmul crct10dif_common scsi_common 
usb_common video wmi gpio_amdpt gpio_generic button
[    6.036793] CPU: 3 PID: 370 Comm: (udev-worker) Not tainted 
6.8.7-dirty #1
[    6.036795] Hardware name: ASRock X670E Taichi/X670E Taichi, BIOS 
2.10 03/26/2024

[    6.036796] RIP: 0010:amdgpu_bo_release_notify+0x201/0x220 [amdgpu]
[    6.036891] Code: 0b e9 af fe ff ff 48 ba ff ff ff ff ff ff ff 7f 
31 f6 4c 89 e7 e8 7f 2f 7a d8 eb 98 e8 18 28 7a d8 eb b2 0f 0b e9 58 
fe ff ff <0f> 0b eb a7 be 03 00 00 00 e8 e1 89 4e d8 eb 9b e8 aa 4d 
ad d8 66

[    6.036892] RSP: 0018:bbe140d1f638 EFLAGS: 00010282
[    6.036894] RAX: ffea RBX: 90cba9e4e858 RCX: 
90dabde38c28
[    6.036895] RDX:  RSI: dfff RDI: 
0001
[    6.036896] RBP: 90cba980ef40 R08:  R09: 
bbe140d1f3c0
[    6.036896] R10: bbe140d1f3b8 R11: 0003 R12: 
90cba9e4e800
[    6.036897] R13: 90cba9e4e958 R14: 90cba980ef40 R15: 
0258
[    6.036898] FS:  7f2bd1679d00() GS:90da7e2c() 
knlGS:

[    6.036899] CS:  0010 DS:  ES:  CR0: 80050033
[    6.036900] CR2: 55a9b0f7299d CR3: 00011bb6e000 CR4: 
00750ef0

[    6.036901] PKRU: 5554
[    6.036901] Call Trace:
[    6.036903]  
[    6.036904]  ? amdgpu_bo_release_notify+0x201/0x220 [amdgpu]
[    6.036998]  ? __warn+0x81/0x130
[    6.037002]  ? amdgpu_bo_release_notify+0x201/0x220 [amdgpu]
[    6.037095]  ? report_bug+0x171/0x1a0
[    6.037099]  ? handle_bug+0x3c/0x80
[    6.037101]  ? exc_invalid_op+0x17/0x70
[    6.037103]  ? asm_exc_invalid_op+0x1a/0x20
[    6.037107]  ? amdgpu_bo_release_notify+0x201/0x220 [amdgpu]
[    6.037199]  ? amdgpu_bo_release_notify+0x14a/0x220 [amdgpu]
[    6.037292]  ttm_bo_release+0xff/0x2e0 [ttm]
[    6.037297]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.037299]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.037301]  ? ttm_resource_move_to_lru_tail+0x140/0x1e0 [ttm]
[    6.037306]  amdgpu_bo_free_kernel+0xcb/0x120 [amdgpu]
[    6.037399]  dm_helpers_free_gpu_mem+0x41/0x80 [amdgpu]
[    6.037544]  dcn315_clk_mgr_construct+0x198/0x7e0 [amdgpu]
[    6.037692]  dc_clk_mgr_create+0x16e/0x5f0 [amdgpu]
[    6.037826]  dc_create+0x28a/0x650 [amdgpu]
[    6.037958]  amdgpu_dm_init.isra.0+0x2d5/0x1ec0 [amdgpu]
[    6.038085]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.038087]  ? prb_read_valid+0x1b/0x30
[    6.038089]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.038090]  ? console_unlock+0x78/0x120
[    6.038092]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.038094]  ? vprintk_emit+0x175/0x2c0
[    6.038095]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.038097]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.038098]  ? dev_printk_emit+0xa5/0xd0
[    6.038104]  dm_hw_init+0x12/0x30 [amdgpu]
[    6.038209]  amdgpu_device_init+0x1e50/0x2500 [amdgpu]
[    6.038308]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.038310]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.038313]  amdgpu_driver_load_kms+0x19/0x190 [amdgpu]
[    6.038409]  amdgpu_pci_probe+0x18b/0x510 [amdgpu]
[    6.038505]  local_pci_probe+0x42/0xa0
[    6.038508]  pci_device_probe+0xc7/0x240
[    6.038510]  really_probe+0x19b/0x3e0
[    6.038513]  ? __pfx___driver_attach+0x10/0x10
[    6.038514]  __driver_probe_device+0x78/0x160
[    6.038516]  driver_probe_device+0x1f/0x90
[    6.038517]  __driver_attach+0xd2/0x1c0
[    6.038519]  bus_for_each_dev+0x85/0xd0
[    6.038521]  bus_add_driver+0x116/0x220
[    6.038523]  driver_register+0x59/0x100
[    6.038525]  ? __pfx_amdgpu_init+0x10/0x10 [amdgpu]
[    6.038618]  do_one_initcall+0x58/0x320
[    6.038621]  do_init_module+0x60/0x230
[    6.038624]  init_module_from_file+0x89/0xe0
[    6.038628]  idempotent_init_module+0x120/0x2b0
[    6.038630]  __x64_sys_finit_module+0x5e/0xb0
[    6.038632]  do_syscall_64+0x84/0x1a0
[    6.038634]  ? do_syscall_6

Re: [PATCH v2] drm/buddy: Fix the warn on's during force merge

2024-05-17 Thread Paneer Selvam, Arunpravin

Hi Dave,
Please help pull this patch into drm-next. This will fix any unnecessary
warnings in memory pressure situations.

Thanks for the help.

Regards,
Arun.

On 5/17/2024 8:03 PM, Arunpravin Paneer Selvam wrote:

Move the fallback and block-incompatible checks
above, so that we don't unnecessarily split the blocks
and leave them unmerged. This resolves the unnecessary
WARN_ON()s thrown during the force_merge call.

v2:(Matthew)
   - Move the fallback and block incompatible checks above
 the contains check.

Signed-off-by: Arunpravin Paneer Selvam 
Reviewed-by: Matthew Auld 
Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature")
Link: 
https://patchwork.kernel.org/project/dri-devel/patch/20240517135015.17565-1-arunpravin.paneersel...@amd.com/
---
  drivers/gpu/drm/drm_buddy.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 1daf778cf6fa..94f8c34fc293 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -524,11 +524,11 @@ __alloc_range_bias(struct drm_buddy *mm,
continue;
}
  
+		if (!fallback && block_incompatible(block, flags))

+   continue;
+
if (contains(start, end, block_start, block_end) &&
order == drm_buddy_block_order(block)) {
-   if (!fallback && block_incompatible(block, flags))
-   continue;
-
/*
 * Find the free block within the range.
 */




Re: 6.10/regression/bisected - commit a68c7eaa7a8f cause *ERROR* Trying to clear memory with ring turned off in amdgpu_fill_buffer.

2024-05-17 Thread Paneer Selvam, Arunpravin
[   13.729714]  __driver_probe_device+0x18c/0x370
[   13.729718]  driver_probe_device+0x4a/0x120
[   13.729721]  __driver_attach+0x194/0x4a0
[   13.729724]  ? __pfx___driver_attach+0x10/0x10
[   13.729726]  bus_for_each_dev+0x106/0x190
[   13.729730]  ? __pfx_bus_for_each_dev+0x10/0x10
[   13.729737]  bus_add_driver+0x2ff/0x530
[   13.729742]  driver_register+0x1a5/0x360
[   13.729745]  ? __pfx_amdgpu_init+0x10/0x10 [amdgpu]
[   13.729976]  do_one_initcall+0xd6/0x460
[   13.729980]  ? __pfx_do_one_initcall+0x10/0x10
[   13.729987]  ? kasan_unpoison+0x44/0x70
[   13.729992]  do_init_module+0x296/0x7c0
[   13.72]  load_module+0x576d/0x7480
[   13.730013]  ? __pfx_load_module+0x10/0x10
[   13.730019]  ? __might_fault+0x9d/0x120
[   13.730022]  ? local_clock_noinstr+0xd/0x100
[   13.730032]  ? ima_post_load_data+0x68/0x80
[   13.730038]  ? __do_sys_init_module+0x1ef/0x220
[   13.730040]  __do_sys_init_module+0x1ef/0x220
[   13.730042]  ? __pfx___do_sys_init_module+0x10/0x10
[   13.730054]  do_syscall_64+0x97/0x190
[   13.730057]  ? __lock_acquire+0xf65/0x5c70
[   13.730061]  ? __pfx___lock_acquire+0x10/0x10
[   13.730067]  ? do_user_addr_fault+0x4d5/0xbc0
[   13.730071]  ? reacquire_held_locks+0x219/0x4f0
[   13.730073]  ? do_user_addr_fault+0x4d5/0xbc0
[   13.730079]  ? do_user_addr_fault+0x558/0xbc0
[   13.730081]  ? local_clock_noinstr+0xd/0x100
[   13.730086]  ? __pfx_lock_release+0x10/0x10
[   13.730090]  ? handle_mm_fault+0x47d/0x8d0
[   13.730099]  ? lockdep_hardirqs_on_prepare+0x171/0x400
[   13.730103]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   13.730106] RIP: 0033:0x7f22a1fbc5ae
[   13.730113] Code: 48 8b 0d 85 a8 0c 00 f7 d8 64 89 01 48 83 c8 ff
c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 af 00 00
00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 52 a8 0c 00 f7 d8 64 89
01 48
[   13.730115] RSP: 002b:7ffce615c888 EFLAGS: 0246 ORIG_RAX:
00af
[   13.730118] RAX: ffda RBX: 55f90368f520 RCX: 7f22a1fbc5ae
[   13.730119] RDX: 55f9036c2e40 RSI: 05749496 RDI: 7f2299c00010
[   13.730120] RBP: 7ffce615c940 R08: 55f903645010 R09: 0007
[   13.730122] R10: 0006 R11: 0246 R12: 55f9036c2e40
[   13.730123] R13: 0002 R14: 55f9036b3560 R15: 55f9036d2f70
[   13.730131]  
[   13.730132] irq event stamp: 2745383
[   13.730134] hardirqs last  enabled at (2745389):
[] vprintk_emit.part.0+0x44a/0x4b0
[   13.730136] hardirqs last disabled at (2745394):
[] vprintk_emit.part.0+0x3fd/0x4b0
[   13.730138] softirqs last  enabled at (2744940):
[] __irq_exit_rcu+0xbb/0x1c0
[   13.730141] softirqs last disabled at (2744929):
[] __irq_exit_rcu+0xbb/0x1c0
[   13.730143] ---[ end trace  ]---
[   13.730605] [drm] Display Core v3.2.281 initialized on DCN 3.1.5

Of course, I immediately wanted to find the first bad commit.
And the bad commit has already been found:
a68c7eaa7a8ffdec9287ba1561a668d674c20a13 is the first bad commit
commit a68c7eaa7a8ffdec9287ba1561a668d674c20a13 (HEAD)
Author: Arunpravin Paneer Selvam 
Date:   Fri Apr 19 12:05:37 2024 +0530

 drm/amdgpu: Enable clear page functionality

 Add clear page support in vram memory region.

 v1(Christian):
   - Dont handle clear page as TTM flag since when moving the BO back
 in from GTT again we don't need that.
   - Make a specialized version of amdgpu_fill_buffer() which only
 clears the VRAM areas which are not already cleared
   - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in
 amdgpu_object.c

 v2:
   - Modify the function name amdgpu_ttm_* (Alex)
   - Drop the delayed parameter (Christian)
   - handle amdgpu_res_cleared() just above the size
 calculation (Christian)
   - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers
 in the free path to properly wait for fences etc.. (Christian)

 v3(Christian):
   - Remove buffer clear code in VRAM manager instead change the
 AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set
 the DRM_BUDDY_CLEARED flag.
   - Remove ! from amdgpu_res_cleared() check.

 v4(Christian):
   - vres flag setting move to vram manager file
   - use dma_fence_get_stub in amdgpu_ttm_clear_buffer function
   - make fence a mandatory parameter and drop the if and the get/put dance

 Signed-off-by: Arunpravin Paneer Selvam 
 Suggested-by: Christian König 
 Acked-by: Felix Kuehling 
 Link: 
https://patchwork.freedesktop.org/patch/msgid/20240419063538.11957-2-arunpravin.paneersel...@amd.com
 Signed-off-by: Christian König 

  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c |  9 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h | 25 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c| 70
--
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h|  5 +++--
  dri

Re: [PATCH v2] drm/buddy: Fix the warn on's during force merge

2024-05-17 Thread Paneer Selvam, Arunpravin




On 5/17/2024 7:30 PM, Matthew Auld wrote:

On 17/05/2024 14:50, Arunpravin Paneer Selvam wrote:

Move the fallback and block-incompatible checks
above, so that we don't unnecessarily split the blocks
and leave them unmerged. This resolves the unnecessary
WARN_ON()s thrown during the force_merge call.

v2:(Matthew)
   - Move the fallback and block incompatible checks above
 the contains check.

Signed-off-by: Arunpravin Paneer Selvam 


Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature")

Reviewed-by: Matthew Auld 

A follow up unit test to catch this edge case would be lovely.

Yes, I will follow up on this.

Thanks,
Arun.



---
  drivers/gpu/drm/drm_buddy.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 1daf778cf6fa..94f8c34fc293 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -524,11 +524,11 @@ __alloc_range_bias(struct drm_buddy *mm,
  continue;
  }
  +    if (!fallback && block_incompatible(block, flags))
+    continue;
+
  if (contains(start, end, block_start, block_end) &&
  order == drm_buddy_block_order(block)) {
-    if (!fallback && block_incompatible(block, flags))
-    continue;
-
  /*
   * Find the free block within the range.
   */




[PATCH v2] drm/buddy: Fix the warn on's during force merge

2024-05-17 Thread Arunpravin Paneer Selvam
Move the fallback and block-incompatible checks
above, so that we don't unnecessarily split the blocks
and leave them unmerged. This resolves the unnecessary
WARN_ON()s thrown during the force_merge call.

v2:(Matthew)
  - Move the fallback and block incompatible checks above
the contains check.

Signed-off-by: Arunpravin Paneer Selvam 
Reviewed-by: Matthew Auld 
Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature")
Link: 
https://patchwork.kernel.org/project/dri-devel/patch/20240517135015.17565-1-arunpravin.paneersel...@amd.com/
---
 drivers/gpu/drm/drm_buddy.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 1daf778cf6fa..94f8c34fc293 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -524,11 +524,11 @@ __alloc_range_bias(struct drm_buddy *mm,
continue;
}
 
+   if (!fallback && block_incompatible(block, flags))
+   continue;
+
if (contains(start, end, block_start, block_end) &&
order == drm_buddy_block_order(block)) {
-   if (!fallback && block_incompatible(block, flags))
-   continue;
-
/*
 * Find the free block within the range.
 */
-- 
2.25.1



Re: [PATCH v2] drm/buddy: Fix the warn on's during force merge

2024-05-17 Thread Paneer Selvam, Arunpravin

Hi Matthew,
This fixes the problem.

Regards,
Arun.

On 5/17/2024 7:20 PM, Arunpravin Paneer Selvam wrote:

Move the fallback and block-incompatible checks
above, so that we don't unnecessarily split the blocks
and leave them unmerged. This resolves the unnecessary
WARN_ON()s thrown during the force_merge call.

v2:(Matthew)
   - Move the fallback and block incompatible checks above
 the contains check.

Signed-off-by: Arunpravin Paneer Selvam 
Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature")
---
  drivers/gpu/drm/drm_buddy.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 1daf778cf6fa..94f8c34fc293 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -524,11 +524,11 @@ __alloc_range_bias(struct drm_buddy *mm,
continue;
}
  
+		if (!fallback && block_incompatible(block, flags))

+   continue;
+
if (contains(start, end, block_start, block_end) &&
order == drm_buddy_block_order(block)) {
-   if (!fallback && block_incompatible(block, flags))
-   continue;
-
/*
 * Find the free block within the range.
 */




[PATCH v2] drm/buddy: Fix the warn on's during force merge

2024-05-17 Thread Arunpravin Paneer Selvam
Move the fallback and block-incompatible checks
above, so that we don't unnecessarily split the blocks
and leave them unmerged. This resolves the unnecessary
WARN_ON()s thrown during the force_merge call.

v2:(Matthew)
  - Move the fallback and block incompatible checks above
the contains check.

Signed-off-by: Arunpravin Paneer Selvam 
Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature")
---
 drivers/gpu/drm/drm_buddy.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 1daf778cf6fa..94f8c34fc293 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -524,11 +524,11 @@ __alloc_range_bias(struct drm_buddy *mm,
continue;
}
 
+   if (!fallback && block_incompatible(block, flags))
+   continue;
+
if (contains(start, end, block_start, block_end) &&
order == drm_buddy_block_order(block)) {
-   if (!fallback && block_incompatible(block, flags))
-   continue;
-
/*
 * Find the free block within the range.
 */
-- 
2.25.1



Re: [PATCH] drm/buddy: Merge back blocks in bias range function

2024-05-17 Thread Paneer Selvam, Arunpravin

Hi Matthew,
Could you help review this patch quickly?

Hi Dave
This patch just fixes the unnecessary WARN_ON()s triggered during
the force_merge call.

Regards,
Arun.

On 5/17/2024 6:08 PM, Arunpravin Paneer Selvam wrote:

In bias range allocation, when we don't find the required
blocks (i.e. on returning -ENOSPC), we should merge back the
split blocks. Otherwise, during force_merge we are flooded with
WARN_ON()s because a block and its buddy are in the same clear
state (dirty or clear).

Hence, rename force_merge to merge_blocks and pass force_merge
as a bool function parameter. In any normal situation we can
call merge_blocks with force_merge set to false, and in any
memory crunch situation we can call merge_blocks with
force_merge set to true. This resolves the unnecessary
WARN_ON()s thrown during the force_merge call.

Signed-off-by: Arunpravin Paneer Selvam 
Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature")
---
  drivers/gpu/drm/drm_buddy.c | 32 ++--
  1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 1daf778cf6fa..111f602f1359 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -161,10 +161,11 @@ static unsigned int __drm_buddy_free(struct drm_buddy *mm,
return order;
  }
  
-static int __force_merge(struct drm_buddy *mm,

-u64 start,
-u64 end,
-unsigned int min_order)
+static int __merge_blocks(struct drm_buddy *mm,
+ u64 start,
+ u64 end,
+ unsigned int min_order,
+ bool force_merge)
  {
unsigned int order;
int i;
@@ -195,8 +196,9 @@ static int __force_merge(struct drm_buddy *mm,
if (!drm_buddy_block_is_free(buddy))
continue;
  
-			WARN_ON(drm_buddy_block_is_clear(block) ==

-   drm_buddy_block_is_clear(buddy));
+   if (force_merge)
+   WARN_ON(drm_buddy_block_is_clear(block) ==
+   drm_buddy_block_is_clear(buddy));
  
  			/*

 * If the prev block is same as buddy, don't access the
@@ -210,7 +212,7 @@ static int __force_merge(struct drm_buddy *mm,
if (drm_buddy_block_is_clear(block))
mm->clear_avail -= drm_buddy_block_size(mm, 
block);
  
-			order = __drm_buddy_free(mm, block, true);

+   order = __drm_buddy_free(mm, block, force_merge);
if (order >= min_order)
return 0;
}
@@ -332,7 +334,7 @@ void drm_buddy_fini(struct drm_buddy *mm)
  
  	for (i = 0; i < mm->n_roots; ++i) {

order = ilog2(size) - ilog2(mm->chunk_size);
-   __force_merge(mm, 0, size, order);
+   __merge_blocks(mm, 0, size, order, true);
  
  		WARN_ON(!drm_buddy_block_is_free(mm->roots[i]));

drm_block_free(mm, mm->roots[i]);
@@ -479,7 +481,7 @@ __alloc_range_bias(struct drm_buddy *mm,
   unsigned long flags,
   bool fallback)
  {
-   u64 req_size = mm->chunk_size << order;
+   u64 size, root_size, req_size = mm->chunk_size << order;
struct drm_buddy_block *block;
struct drm_buddy_block *buddy;
LIST_HEAD(dfs);
@@ -487,6 +489,7 @@ __alloc_range_bias(struct drm_buddy *mm,
int i;
  
  	end = end - 1;

+   size = mm->size;
  
  	for (i = 0; i < mm->n_roots; ++i)

list_add_tail(&mm->roots[i]->tmp_link, &dfs);
@@ -548,6 +551,15 @@ __alloc_range_bias(struct drm_buddy *mm,
list_add(&block->left->tmp_link, &dfs);
} while (1);
  
+	/* Merge back the split blocks */

+   for (i = 0; i < mm->n_roots; ++i) {
+   order = ilog2(size) - ilog2(mm->chunk_size);
+   __merge_blocks(mm, start, end, order, false);
+
+   root_size = mm->chunk_size << order;
+   size -= root_size;
+   }
+
return ERR_PTR(-ENOSPC);
  
  err_undo:

@@ -1026,7 +1038,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
if (order-- == min_order) {
/* Try allocation through force merge method */
if (mm->clear_avail &&
-   !__force_merge(mm, start, end, min_order)) {
+   !__merge_blocks(mm, start, end, min_order, 
true)) {
block = __drm_buddy_alloc_blocks(mm, 
start,
 end,
 
min_order,




[PATCH] drm/buddy: Merge back blocks in bias range function

2024-05-17 Thread Arunpravin Paneer Selvam
In bias range allocation, when we don't find the required
blocks (i.e. on returning -ENOSPC), we should merge back the
split blocks. Otherwise, during force_merge we are flooded with
WARN_ON()s because a block and its buddy are in the same clear
state (dirty or clear).

Hence, rename force_merge to merge_blocks and pass force_merge
as a bool function parameter. In any normal situation we can
call merge_blocks with force_merge set to false, and in any
memory crunch situation we can call merge_blocks with
force_merge set to true. This resolves the unnecessary
WARN_ON()s thrown during the force_merge call.

Signed-off-by: Arunpravin Paneer Selvam 
Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature")
---
 drivers/gpu/drm/drm_buddy.c | 32 ++--
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 1daf778cf6fa..111f602f1359 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -161,10 +161,11 @@ static unsigned int __drm_buddy_free(struct drm_buddy *mm,
return order;
 }
 
-static int __force_merge(struct drm_buddy *mm,
-u64 start,
-u64 end,
-unsigned int min_order)
+static int __merge_blocks(struct drm_buddy *mm,
+ u64 start,
+ u64 end,
+ unsigned int min_order,
+ bool force_merge)
 {
unsigned int order;
int i;
@@ -195,8 +196,9 @@ static int __force_merge(struct drm_buddy *mm,
if (!drm_buddy_block_is_free(buddy))
continue;
 
-   WARN_ON(drm_buddy_block_is_clear(block) ==
-   drm_buddy_block_is_clear(buddy));
+   if (force_merge)
+   WARN_ON(drm_buddy_block_is_clear(block) ==
+   drm_buddy_block_is_clear(buddy));
 
/*
 * If the prev block is same as buddy, don't access the
@@ -210,7 +212,7 @@ static int __force_merge(struct drm_buddy *mm,
if (drm_buddy_block_is_clear(block))
mm->clear_avail -= drm_buddy_block_size(mm, 
block);
 
-   order = __drm_buddy_free(mm, block, true);
+   order = __drm_buddy_free(mm, block, force_merge);
if (order >= min_order)
return 0;
}
@@ -332,7 +334,7 @@ void drm_buddy_fini(struct drm_buddy *mm)
 
for (i = 0; i < mm->n_roots; ++i) {
order = ilog2(size) - ilog2(mm->chunk_size);
-   __force_merge(mm, 0, size, order);
+   __merge_blocks(mm, 0, size, order, true);
 
WARN_ON(!drm_buddy_block_is_free(mm->roots[i]));
drm_block_free(mm, mm->roots[i]);
@@ -479,7 +481,7 @@ __alloc_range_bias(struct drm_buddy *mm,
   unsigned long flags,
   bool fallback)
 {
-   u64 req_size = mm->chunk_size << order;
+   u64 size, root_size, req_size = mm->chunk_size << order;
struct drm_buddy_block *block;
struct drm_buddy_block *buddy;
LIST_HEAD(dfs);
@@ -487,6 +489,7 @@ __alloc_range_bias(struct drm_buddy *mm,
int i;
 
end = end - 1;
+   size = mm->size;
 
for (i = 0; i < mm->n_roots; ++i)
	list_add_tail(&mm->roots[i]->tmp_link, &dfs);
@@ -548,6 +551,15 @@ __alloc_range_bias(struct drm_buddy *mm,
	list_add(&block->left->tmp_link, &dfs);
} while (1);
 
+   /* Merge back the split blocks */
+   for (i = 0; i < mm->n_roots; ++i) {
+   order = ilog2(size) - ilog2(mm->chunk_size);
+   __merge_blocks(mm, start, end, order, false);
+
+   root_size = mm->chunk_size << order;
+   size -= root_size;
+   }
+
return ERR_PTR(-ENOSPC);
 
 err_undo:
@@ -1026,7 +1038,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
if (order-- == min_order) {
/* Try allocation through force merge method */
if (mm->clear_avail &&
-   !__force_merge(mm, start, end, min_order)) {
+   !__merge_blocks(mm, start, end, min_order, 
true)) {
block = __drm_buddy_alloc_blocks(mm, 
start,
 end,
 
min_order,
-- 
2.25.1



[PATCH v3 1/2] drm/buddy: Fix the range bias clear memory allocation issue

2024-05-14 Thread Arunpravin Paneer Selvam
Problem statement: At system boot time, when an application requests
a bulk volume of cleared range-bias memory while clear_avail is zero,
we don't fall back to the normal allocation method because an
unnecessary clear_avail check prevents the fallback. This leads to an
fb allocation failure, after which the system becomes unresponsive.

Solution: Remove the unnecessary clear_avail check in the range bias
allocation function.

v2: add a kunit for this corner case (Daniel Vetter)

Signed-off-by: Arunpravin Paneer Selvam 
Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature")
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/drm_buddy.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 284ebae71cc4..1daf778cf6fa 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -249,6 +249,7 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 
chunk_size)
 
mm->size = size;
mm->avail = size;
+   mm->clear_avail = 0;
mm->chunk_size = chunk_size;
mm->max_order = ilog2(size) - ilog2(chunk_size);
 
@@ -574,7 +575,7 @@ __drm_buddy_alloc_range_bias(struct drm_buddy *mm,
 
block = __alloc_range_bias(mm, start, end, order,
   flags, fallback);
-   if (IS_ERR(block) && mm->clear_avail)
+   if (IS_ERR(block))
return __alloc_range_bias(mm, start, end, order,
  flags, !fallback);
 
-- 
2.25.1



[PATCH v3 2/2] drm/tests: Add a unit test for range bias allocation

2024-05-14 Thread Arunpravin Paneer Selvam
Allocate cleared blocks in the bias range when the DRM
buddy's clear avail is zero. This will validate the bias
range allocation in scenarios like system boot when no
cleared blocks are available and exercise the fallback
path too. The resulting blocks should always be dirty.

v1:(Matthew)
  - move the size to the variable declaration section.
  - move the mm.clear_avail init to allocator init.

Signed-off-by: Arunpravin Paneer Selvam 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/tests/drm_buddy_test.c | 36 +-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c 
b/drivers/gpu/drm/tests/drm_buddy_test.c
index e3b50e240d36..b3be68b03610 100644
--- a/drivers/gpu/drm/tests/drm_buddy_test.c
+++ b/drivers/gpu/drm/tests/drm_buddy_test.c
@@ -23,9 +23,11 @@ static inline u64 get_size(int order, u64 chunk_size)
 
 static void drm_test_buddy_alloc_range_bias(struct kunit *test)
 {
-   u32 mm_size, ps, bias_size, bias_start, bias_end, bias_rem;
+   u32 mm_size, size, ps, bias_size, bias_start, bias_end, bias_rem;
DRM_RND_STATE(prng, random_seed);
unsigned int i, count, *order;
+   struct drm_buddy_block *block;
+   unsigned long flags;
struct drm_buddy mm;
LIST_HEAD(allocated);
 
@@ -222,6 +224,38 @@ static void drm_test_buddy_alloc_range_bias(struct kunit 
*test)
 
	drm_buddy_free_list(&mm, &allocated, 0);
	drm_buddy_fini(&mm);
+
+   /*
+* Allocate cleared blocks in the bias range when the DRM buddy's clear 
avail is
+* zero. This will validate the bias range allocation in scenarios like 
system boot
+* when no cleared blocks are available and exercise the fallback path 
too. The resulting
+* blocks should always be dirty.
+*/
+
+   KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps),
+  "buddy_init failed\n");
+
+   bias_start = round_up(prandom_u32_state(&prng) % (mm_size - ps), ps);
+   bias_end = round_up(bias_start + prandom_u32_state(&prng) % (mm_size - bias_start), ps);
+   bias_end = max(bias_end, bias_start + ps);
+   bias_rem = bias_end - bias_start;
+
+   flags = DRM_BUDDY_CLEAR_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION;
+   size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps);
+
+   KUNIT_ASSERT_FALSE_MSG(test,
+  drm_buddy_alloc_blocks(&mm, bias_start,
+ bias_end, size, ps,
+ &allocated,
+ flags),
+  "buddy_alloc failed with bias(%x-%x), size=%u, 
ps=%u\n",
+  bias_start, bias_end, size, ps);
+
+   list_for_each_entry(block, &allocated, link)
+   KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false);
+
+   drm_buddy_free_list(&mm, &allocated, 0);
+   drm_buddy_fini(&mm);
 }
 
 static void drm_test_buddy_alloc_clear(struct kunit *test)
-- 
2.25.1



Re: [PATCH v2 2/2] drm/tests: Add a unit test for range bias allocation

2024-05-13 Thread Paneer Selvam, Arunpravin

Hi Matthew,

On 5/13/2024 1:49 PM, Matthew Auld wrote:

On 12/05/2024 08:59, Arunpravin Paneer Selvam wrote:

Allocate cleared blocks in the bias range when the DRM
buddy's clear avail is zero. This will validate the bias
range allocation in scenarios like system boot when no
cleared blocks are available and exercise the fallback
path too. The resulting blocks should always be dirty.

Signed-off-by: Arunpravin Paneer Selvam 


---
  drivers/gpu/drm/tests/drm_buddy_test.c | 35 ++
  1 file changed, 35 insertions(+)

diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c 
b/drivers/gpu/drm/tests/drm_buddy_test.c

index e3b50e240d36..a194f271bc55 100644
--- a/drivers/gpu/drm/tests/drm_buddy_test.c
+++ b/drivers/gpu/drm/tests/drm_buddy_test.c
@@ -26,6 +26,8 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test)

  u32 mm_size, ps, bias_size, bias_start, bias_end, bias_rem;
  DRM_RND_STATE(prng, random_seed);
  unsigned int i, count, *order;
+    struct drm_buddy_block *block;
+    unsigned long flags;
  struct drm_buddy mm;
  LIST_HEAD(allocated);
  @@ -222,6 +224,39 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test)

    drm_buddy_free_list(&mm, &allocated, 0);
  drm_buddy_fini(&mm);
+
+    /*
+ * Allocate cleared blocks in the bias range when the DRM buddy's clear avail is
+ * zero. This will validate the bias range allocation in scenarios like system boot
+ * when no cleared blocks are available and exercise the fallback path too. The
+ * resulting blocks should always be dirty.
+ */
+
+    KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps),
+   "buddy_init failed\n");
+    mm.clear_avail = 0;


Should already be zero, right? Maybe make this an assert instead?
No. Since mm is declared as a local variable in the test case,
mm.clear_avail is not guaranteed to be zero.
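
A minimal illustration of the point (not from the thread), assuming mm has
automatic storage inside the test function:

	struct drm_buddy mm;	/* stack variable: fields are indeterminate */
	/* mm.clear_avail may hold garbage until something assigns it, */
	/* hence the explicit zeroing here (moved into drm_buddy_init() in v3). */
	mm.clear_avail = 0;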



+
+    bias_start = round_up(prandom_u32_state(&prng) % (mm_size - ps), ps);
+    bias_end = round_up(bias_start + prandom_u32_state(&prng) % (mm_size - bias_start), ps);
+    bias_end = max(bias_end, bias_start + ps);
+    bias_rem = bias_end - bias_start;
+
+    flags = DRM_BUDDY_CLEAR_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION;
+    u32 size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps);


u32 declaration should be moved to above?

Sure.

Thanks,
Arun.


Otherwise,
Reviewed-by: Matthew Auld 


+
+    KUNIT_ASSERT_FALSE_MSG(test,
+   drm_buddy_alloc_blocks(&mm, bias_start,
+  bias_end, size, ps,
+  &allocated,
+  flags),
+   "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n",
+   bias_start, bias_end, size, ps);
+
+    list_for_each_entry(block, &allocated, link)
+    KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false);
+
+    drm_buddy_free_list(&mm, &allocated, 0);
+    drm_buddy_fini(&mm);
  }
    static void drm_test_buddy_alloc_clear(struct kunit *test)




Re: [PATCH] drm/buddy: Fix the range bias clear memory allocation issue

2024-05-12 Thread Paneer Selvam, Arunpravin

Hi Daniel,

On 5/8/2024 2:11 PM, Daniel Vetter wrote:

On Wed, May 08, 2024 at 12:27:20PM +0530, Arunpravin Paneer Selvam wrote:

Problem statement: During system boot, an application requests a bulk
volume of cleared range bias memory while clear_avail is zero. We don't
fall back into the normal allocation method because of an unnecessary
clear_avail check; this prevents the fallback and leads to an fb
allocation failure, after which the system becomes unresponsive.

Solution: Remove the unnecessary clear_avail check in the range bias
allocation function.

Signed-off-by: Arunpravin Paneer Selvam 
Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature")
Reviewed-by: Matthew Auld 
---
  drivers/gpu/drm/drm_buddy.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

Can you please also add a kunit test case to exercise this corner case and
make sure it stays fixed?

I have sent the v2 along with a kunit test case for this corner case.

Thanks,
Arun.


Thanks, Sima

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 284ebae71cc4..831929ac95eb 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -574,7 +574,7 @@ __drm_buddy_alloc_range_bias(struct drm_buddy *mm,
  
  	block = __alloc_range_bias(mm, start, end, order,

   flags, fallback);
-   if (IS_ERR(block) && mm->clear_avail)
+   if (IS_ERR(block))
return __alloc_range_bias(mm, start, end, order,
  flags, !fallback);
  
--

2.25.1





[PATCH v2 1/2] drm/buddy: Fix the range bias clear memory allocation issue

2024-05-12 Thread Arunpravin Paneer Selvam
Problem statement: During system boot, an application requests a bulk
volume of cleared range bias memory while clear_avail is zero. We don't
fall back into the normal allocation method because of an unnecessary
clear_avail check; this prevents the fallback and leads to an fb
allocation failure, after which the system becomes unresponsive.

Solution: Remove the unnecessary clear_avail check in the range bias
allocation function.

v2: add a kunit for this corner case (Daniel Vetter)

Signed-off-by: Arunpravin Paneer Selvam 
Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature")
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/drm_buddy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 284ebae71cc4..831929ac95eb 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -574,7 +574,7 @@ __drm_buddy_alloc_range_bias(struct drm_buddy *mm,
 
block = __alloc_range_bias(mm, start, end, order,
   flags, fallback);
-   if (IS_ERR(block) && mm->clear_avail)
+   if (IS_ERR(block))
return __alloc_range_bias(mm, start, end, order,
  flags, !fallback);
 
-- 
2.25.1



[PATCH v2 2/2] drm/tests: Add a unit test for range bias allocation

2024-05-12 Thread Arunpravin Paneer Selvam
Allocate cleared blocks in the bias range when the DRM
buddy's clear avail is zero. This will validate the bias
range allocation in scenarios like system boot when no
cleared blocks are available and exercise the fallback
path too. The resulting blocks should always be dirty.

Signed-off-by: Arunpravin Paneer Selvam 
---
 drivers/gpu/drm/tests/drm_buddy_test.c | 35 ++
 1 file changed, 35 insertions(+)

diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c 
b/drivers/gpu/drm/tests/drm_buddy_test.c
index e3b50e240d36..a194f271bc55 100644
--- a/drivers/gpu/drm/tests/drm_buddy_test.c
+++ b/drivers/gpu/drm/tests/drm_buddy_test.c
@@ -26,6 +26,8 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test)
u32 mm_size, ps, bias_size, bias_start, bias_end, bias_rem;
DRM_RND_STATE(prng, random_seed);
unsigned int i, count, *order;
+   struct drm_buddy_block *block;
+   unsigned long flags;
struct drm_buddy mm;
LIST_HEAD(allocated);
 
@@ -222,6 +224,39 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test)
 
	drm_buddy_free_list(&mm, &allocated, 0);
	drm_buddy_fini(&mm);
+
+	/*
+	 * Allocate cleared blocks in the bias range when the DRM buddy's clear avail is
+	 * zero. This will validate the bias range allocation in scenarios like system boot
+	 * when no cleared blocks are available and exercise the fallback path too. The
+	 * resulting blocks should always be dirty.
+	 */
+
+	KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps),
+			       "buddy_init failed\n");
+	mm.clear_avail = 0;
+
+	bias_start = round_up(prandom_u32_state(&prng) % (mm_size - ps), ps);
+	bias_end = round_up(bias_start + prandom_u32_state(&prng) % (mm_size - bias_start), ps);
+	bias_end = max(bias_end, bias_start + ps);
+	bias_rem = bias_end - bias_start;
+
+	flags = DRM_BUDDY_CLEAR_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION;
+	u32 size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps);
+
+	KUNIT_ASSERT_FALSE_MSG(test,
+			       drm_buddy_alloc_blocks(&mm, bias_start,
+						      bias_end, size, ps,
+						      &allocated,
+						      flags),
+			       "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n",
+			       bias_start, bias_end, size, ps);
+
+	list_for_each_entry(block, &allocated, link)
+		KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false);
+
+	drm_buddy_free_list(&mm, &allocated, 0);
+	drm_buddy_fini(&mm);
 }
 
 static void drm_test_buddy_alloc_clear(struct kunit *test)
-- 
2.25.1



[PATCH v10 6/6] drm/amdgpu: Enable userq fence interrupt support

2024-05-10 Thread Arunpravin Paneer Selvam
Add support to handle the userqueue protected fence signal hardware
interrupt.

Create an xarray which maps the doorbell index to the fence driver address.
This helps to retrieve the fence driver information when a userq fence
interrupt is triggered. Firmware sends the doorbell offset value, and
this info is compared with the queue's mqd doorbell offset value.
If they are the same, we process the userq fence interrupt.
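
Not part of the patch: a minimal sketch of the lookup described above,
assuming the xarray is keyed by the doorbell index (the xa_load() approach
noted in v1 below):

	/* Hypothetical helper: resolve a doorbell offset to its fence driver. */
	static struct amdgpu_userq_fence_driver *
	userq_fence_drv_lookup(struct amdgpu_device *adev, u32 doorbell_offset)
	{
		/* single keyed lookup instead of walking all xarray entries */
		return xa_load(&adev->userq_xa, doorbell_offset);
	}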

v1:(Christian)
  - use xa_load() instead of going over all entries
  - keep the xa_lock until the fence driver process completes
  - create a separate patch to remove the MES self test function
call.

Signed-off-by: Arunpravin Paneer Selvam 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  2 ++
 .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   | 15 
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c| 23 +--
 4 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4ca14b02668b..2d5ef2e74c71 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1043,6 +1043,8 @@ struct amdgpu_device {
struct amdgpu_mqd   mqds[AMDGPU_HW_IP_NUM];
const struct amdgpu_userq_funcs *userq_funcs[AMDGPU_HW_IP_NUM];
 
+   struct xarray   userq_xa;
+
/* df */
struct amdgpu_dfdf;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2d9fa3d0d4a4..fd919105a181 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3982,6 +3982,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);
 
+	xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);
+
	INIT_LIST_HEAD(&adev->shadow_list);
	mutex_init(&adev->shadow_list_lock);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index 339d82d5808f..4cbc25595226 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -70,6 +70,7 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev,
struct amdgpu_usermode_queue *userq)
 {
struct amdgpu_userq_fence_driver *fence_drv;
+   unsigned long flags;
int r;
 
fence_drv = kzalloc(sizeof(*fence_drv), GFP_KERNEL);
@@ -97,6 +98,11 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev,
fence_drv->context = dma_fence_context_alloc(1);
get_task_comm(fence_drv->timeline_name, current);
 
+   xa_lock_irqsave(&adev->userq_xa, flags);
+   __xa_store(&adev->userq_xa, userq->doorbell_index,
+  fence_drv, GFP_KERNEL);
+   xa_unlock_irqrestore(&adev->userq_xa, flags);
+
userq->fence_drv = fence_drv;
 
return 0;
@@ -147,8 +153,11 @@ void amdgpu_userq_fence_driver_destroy(struct kref *ref)
struct amdgpu_userq_fence_driver *fence_drv = container_of(ref,
 struct amdgpu_userq_fence_driver,
 refcount);
+   struct amdgpu_userq_fence_driver *xa_fence_drv;
struct amdgpu_device *adev = fence_drv->adev;
struct amdgpu_userq_fence *fence, *tmp;
+   struct xarray *xa = &adev->userq_xa;
+   unsigned long index;
struct dma_fence *f;
 
	spin_lock(&fence_drv->fence_list_lock);
@@ -165,6 +174,12 @@ void amdgpu_userq_fence_driver_destroy(struct kref *ref)
}
	spin_unlock(&fence_drv->fence_list_lock);
 
+   xa_lock(xa);
+   xa_for_each(xa, index, xa_fence_drv)
+   if (xa_fence_drv == fence_drv)
+   __xa_erase(xa, index);
+   xa_unlock(xa);
+
/* Free seq64 memory */
amdgpu_seq64_free(adev, fence_drv->gpu_addr);
kfree(fence_drv);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index a786e25432ae..0a206f484240 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -49,6 +49,7 @@
 #include "gfx_v11_0_3.h"
 #include "nbio_v4_3.h"
 #include "mes_v11_0.h"
+#include "amdgpu_userq_fence.h"
 
 #define GFX11_NUM_GFX_RINGS1
 #define GFX11_MEC_HPD_SIZE 2048
@@ -5939,25 +5940,23 @@ static int gfx_v11_0_eop_irq(struct amdgpu_device *adev,
 struct amdgpu_irq_src *source,
 struct amdgpu_iv_entry *entry)
 {
-   int i;
+   u32 doorbell_offset = entry->src_data[0];
u8 me_id, pipe_id, queue_id;
struct amdgpu_ring *ring;
-   uint32_t mes_queue_id = entry->src_data[0];
+ 

[PATCH v10 1/6] drm/amdgpu: Implement a new userqueue fence driver

2024-05-10 Thread Arunpravin Paneer Selvam
Developed a userqueue fence driver for the userqueue process shared
BO synchronization.

Create a dma fence having the write pointer as the seqno and allocate a
seq64 memory for each user queue process and feed this memory address
into the firmware/hardware, so that the firmware writes the read pointer
into the given address when the process completes its execution.
Compare wptr and rptr; if rptr >= wptr, signal the fences so that the
waiting process can consume the buffers.
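
Not part of the patch: a minimal sketch of the signaling rule described
above, with hypothetical parameter names:

	/* A userq fence counts as signaled once the read pointer written by
	 * the firmware has caught up with the write pointer used as seqno. */
	static bool userq_fence_done(u64 rptr, u64 wptr_seqno)
	{
		return rptr >= wptr_seqno;
	}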

v2: Worked on review comments from Christian for the following
modifications

- Add wptr as sequence number into the fence
- Add a reference count for the fence driver
- Add dma_fence_put below the list_del as it might
  free the userq fence.
- Trim unnecessary code in interrupt handler.
- Check dma fence signaled state in dma fence creation
  function for a potential problem of hardware completing
  the job processing beforehand.
- Add necessary locks.
- Create a list and process all the unsignaled fences.
- clean up fences in destroy function.
- implement .signaled callback function

v3: Worked on review comments from Christian
- Modify naming convention for reference counted objects
- Fix fence driver reference drop issue
- Drop amdgpu_userq_fence_driver_process() function return value

v4: Worked on review comments from Christian
- Moved fence driver allocation into amdgpu_userq_fence_driver_alloc()
- Added detail doc mentioning the differences b/w
  two spinlocks declared.

v5: Worked on review comments from Christian
- Check before upcast and remove local variable
- Add error handling in fence_drv alloc function.
- Move rptr read fn outside of the loop and remove WARN_ON in
  destroy function.

v6:
  - clear the seq64 memory in user fence driver(Christian)
  - fix for the wptr va bo mapping(Christian)
  - move the fence_drv xa entry erase code from the interrupt handler
into user fence destroy function

Signed-off-by: Arunpravin Paneer Selvam 
Reviewed-by: Christian König 
Suggested-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/Makefile   |   3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |   6 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c |   4 +-
 .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   | 257 ++
 .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.h   |  69 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c |  12 +-
 .../gpu/drm/amd/include/amdgpu_userqueue.h|   5 +-
 7 files changed, 349 insertions(+), 7 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index a640bfa468ad..f8dd808bd71c 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -80,7 +80,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
amdgpu_fw_attestation.o amdgpu_securedisplay.o \
amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
-   amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o
+   amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o \
+   amdgpu_userq_fence.o
 
 amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index acee1c279abb..844f7b5f90db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -51,6 +51,7 @@
 #include "amdgpu_sched.h"
 #include "amdgpu_xgmi.h"
 #include "amdgpu_userqueue.h"
+#include "amdgpu_userq_fence.h"
 #include "../amdxcp/amdgpu_xcp_drv.h"
 
 /*
@@ -3012,6 +3013,10 @@ static int __init amdgpu_init(void)
if (r)
goto error_fence;
 
+   r = amdgpu_userq_fence_slab_init();
+   if (r)
+   goto error_fence;
+
DRM_INFO("amdgpu kernel modesetting enabled.\n");
amdgpu_register_atpx_handler();
amdgpu_acpi_detect();
@@ -3037,6 +3042,7 @@ static void __exit amdgpu_exit(void)
amdgpu_acpi_release();
amdgpu_sync_fini();
amdgpu_fence_slab_fini();
+   amdgpu_userq_fence_slab_fini();
mmu_notifier_synchronize();
amdgpu_xcp_drv_release();
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
index e22cb2b5cd92..3ef1d7f4fd23 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
@@ -45,7 +45,7 @@
  */
 static inline u64 amdgpu_seq64_get_va_base(struct amdgpu_device *adev)
 {
-   return AMDGPU_VA_RESERVED_SEQ64_START(adev);
+   return AMDGPU_VA_RESERVED_BOTTOM;
 }
 
 /**
@@ -90,7 +90,7 @@ int amdgpu_seq64_map(struct amdgpu_device *adev, struct 

[PATCH v10 4/6] drm/amdgpu: Implement userqueue signal/wait IOCTL

2024-05-10 Thread Arunpravin Paneer Selvam
This patch introduces new IOCTLs for the userqueue secure semaphore.

The signal IOCTL called from the userspace application creates a drm
syncobj and an array of bo GEM handles and passes them in as parameters
to the driver, which installs the fence into them.

The wait IOCTL gets an array of drm syncobjs, finds the fences
attached to the drm syncobjs and obtains the array of
memory_address/fence_value combinations, which is returned to
userspace.
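
Not part of the patch: a rough sketch of how userspace might drive the
signal IOCTL, assuming a drm fd plus queue_id, syncobj_handle and bo_handle
obtained elsewhere (UAPI structs from patch 3/6; needs xf86drm.h and
amdgpu_drm.h):

	/* Hypothetical usage: install the userq fence into one syncobj and one BO. */
	struct drm_amdgpu_userq_signal args = {
		.queue_id = queue_id,
		.syncobj_handles_array = (__u64)(uintptr_t)&syncobj_handle,
		.num_syncobj_handles = 1,
		.bo_handles_array = (__u64)(uintptr_t)&bo_handle,
		.num_bo_handles = 1,
		.bo_flags = AMDGPU_USERQ_BO_WRITE,
	};

	if (drmIoctl(fd, DRM_IOCTL_AMDGPU_USERQ_SIGNAL, &args))
		/* check errno on failure */;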

v2: (Christian)
- Install fence into GEM BO object.
- Lock all BO's using the dma resv subsystem
- Reorder the sequence in signal IOCTL function.
- Get write pointer from the shadow wptr
- use userq_fence to fetch the va/value in wait IOCTL.

v3: (Christian)
- Use drm_exec helper for the proper BO drm reserve and avoid BO
  lock/unlock issues.
- fence/fence driver reference count logic for signal/wait IOCTLs.

v4: (Christian)
- Fixed the drm_exec calling sequence
- use dma_resv_for_each_fence_unlock if BO's are not locked
- Modified the fence_info array storing logic.

v5: (Christian)
- Keep fence_drv until wait queue execution.
- Add dma_fence_wait for other fences.
- Lock BO's using drm_exec as the number of fences in them could
  change.
- Install signaled fences as well into BO/Syncobj.
- Move Syncobj fence installation code after the drm_exec_prepare_array.
- Directly add dma_resv_usage_rw(args->bo_flags
- remove unnecessary dma_fence_put.

v6: (Christian)
- Add xarray stuff to store the fence_drv
- Implement a function to iterate over the xarray and drop
  the fence_drv references.
- Add drm_exec_until_all_locked() wrapper
- Add a check that if we haven't exceeded the user allocated num_fences
  before adding dma_fence to the fences array.

v7: (Christian)
- Use memdup_user() for kmalloc_array + copy_from_user
- Move the fence_drv references from the xarray into the newly created fence
  and drop the fence_drv references when we signal this fence.
- Move this locking of BOs before the "if (!wait_info->num_fences)",
  this way you need this code block only once.
- Merge the error handling code and the cleanup + return 0 code.
- Initializing the xa should probably be done in the userq code.
- Remove the userq back pointer stored in fence_drv.
- Pass xarray as parameter in amdgpu_userq_walk_and_drop_fence_drv()

v8: (Christian)
- Move fence_drv references must come before adding the fence to the list.
- Use xa_lock_irqsave_nested for nested spinlock operations.
- userq_mgr should be per fpriv and not one per device.
- Restructure the interrupt process code for the early exit of the loop.
- The reference acquired in the syncobj fence replace code needs to be
  kept around.
- Modify the dma_fence acquire placement in wait IOCTL.
- Move USERQ_BO_WRITE flag to UAPI header file.
- drop the fence drv reference after telling the hw to stop accessing it.
- Add multi sync object support to userq signal IOCTL.

v9: (Christian)
- Store all the fence_drv ref to other drivers and not ourself.
- Iterate over uq_fence_drv_xa without holding a lock.

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |   2 +
 .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   | 431 +-
 .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.h   |   6 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c |  29 +-
 .../gpu/drm/amd/include/amdgpu_userqueue.h|   1 +
 5 files changed, 462 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 844f7b5f90db..5892a4c1a92e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2918,6 +2918,8 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = {
DRM_IOCTL_DEF_DRV(AMDGPU_GEM_OP, amdgpu_gem_op_ioctl, 
DRM_AUTH|DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(AMDGPU_GEM_USERPTR, amdgpu_gem_userptr_ioctl, 
DRM_AUTH|DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(AMDGPU_USERQ, amdgpu_userq_ioctl, 
DRM_AUTH|DRM_RENDER_ALLOW),
+   DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, amdgpu_userq_signal_ioctl, 
DRM_AUTH|DRM_RENDER_ALLOW),
+   DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl, 
DRM_AUTH|DRM_RENDER_ALLOW),
 };
 
 static const struct drm_driver amdgpu_kms_driver = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index f7baea2c67ab..339d82d5808f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 
+#include 
 #include 
 
 #include "amdgpu.h"
@@ -92,6 +93,7 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device 
*adev,
	spin_lock_init(&fence_drv->fence_list_lock);
 
fence_drv-&g

[PATCH v10 5/6] drm/amdgpu: Remove the MES self test

2024-05-10 Thread Arunpravin Paneer Selvam
Remove the MES self test as it conflicts with the userqueue fence
interrupts.

Signed-off-by: Arunpravin Paneer Selvam 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  3 ---
 drivers/gpu/drm/amd/amdgpu/mes_v10_1.c | 12 +---
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 14 +-
 3 files changed, 2 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7753a2e64d41..2d9fa3d0d4a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4719,9 +4719,6 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
}
adev->in_suspend = false;
 
-   if (adev->enable_mes)
-   amdgpu_mes_self_test(adev);
-
if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
DRM_WARN("smart shift update failed\n");
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c
index 4d1121d1a1e7..b7bfb3185a30 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c
@@ -1161,20 +1161,10 @@ static int mes_v10_0_early_init(void *handle)
return 0;
 }
 
-static int mes_v10_0_late_init(void *handle)
-{
-   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
-   if (!amdgpu_in_reset(adev))
-   amdgpu_mes_self_test(adev);
-
-   return 0;
-}
-
 static const struct amd_ip_funcs mes_v10_1_ip_funcs = {
.name = "mes_v10_1",
.early_init = mes_v10_0_early_init,
-   .late_init = mes_v10_0_late_init,
+   .late_init = NULL,
.sw_init = mes_v10_1_sw_init,
.sw_fini = mes_v10_1_sw_fini,
.hw_init = mes_v10_1_hw_init,
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index feb7fa2c304c..5923b7b0bd95 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -1274,22 +1274,10 @@ static int mes_v11_0_early_init(void *handle)
return 0;
 }
 
-static int mes_v11_0_late_init(void *handle)
-{
-   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
-   /* it's only intended for use in mes_self_test case, not for s0ix and reset */
-   if (!amdgpu_in_reset(adev) && !adev->in_s0ix && !adev->in_suspend &&
-   (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(11, 0, 3)))
-   amdgpu_mes_self_test(adev);
-
-   return 0;
-}
-
 static const struct amd_ip_funcs mes_v11_0_ip_funcs = {
.name = "mes_v11_0",
.early_init = mes_v11_0_early_init,
-   .late_init = mes_v11_0_late_init,
+   .late_init = NULL,
.sw_init = mes_v11_0_sw_init,
.sw_fini = mes_v11_0_sw_fini,
.hw_init = mes_v11_0_hw_init,
-- 
2.25.1



[PATCH v10 3/6] drm/amdgpu: UAPI headers for userqueue secure semaphore

2024-05-10 Thread Arunpravin Paneer Selvam
Add UAPI header support for userqueue Secure semaphore

v2: Worked on review comments from Christian for the following
modifications

- Add bo handles, bo flags and padding fields.
- Include value/va in a combined array.

v3: Worked on review comments from Christian

- Add num_fences field to obtain the number of objects required
  to allocate memory for userq_fence_info.
- Replace obj_handle name with syncobj_handle.
- Replace point name with syncobj_point.
- Replace count_handles name with num_syncobj_handles.
- Fix structure padding related issues.

v4: Worked on review comments from Christian
- Modify the bo flags description.

Signed-off-by: Alex Deucher 
Signed-off-by: Arunpravin Paneer Selvam 
Reviewed-by: Christian König 
---
 include/uapi/drm/amdgpu_drm.h | 110 ++
 1 file changed, 110 insertions(+)

diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index def6874a588c..4ab965acfe06 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -55,6 +55,8 @@ extern "C" {
 #define DRM_AMDGPU_FENCE_TO_HANDLE 0x14
 #define DRM_AMDGPU_SCHED   0x15
 #define DRM_AMDGPU_USERQ   0x16
+#define DRM_AMDGPU_USERQ_SIGNAL0x17
+#define DRM_AMDGPU_USERQ_WAIT  0x18
 
 #define DRM_IOCTL_AMDGPU_GEM_CREATE	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create)
 #define DRM_IOCTL_AMDGPU_GEM_MMAP	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap)
@@ -73,6 +75,8 @@ extern "C" {
 #define DRM_IOCTL_AMDGPU_FENCE_TO_HANDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_FENCE_TO_HANDLE, union drm_amdgpu_fence_to_handle)
 #define DRM_IOCTL_AMDGPU_SCHED	DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_SCHED, union drm_amdgpu_sched)
 #define DRM_IOCTL_AMDGPU_USERQ	DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ, union drm_amdgpu_userq)
+#define DRM_IOCTL_AMDGPU_USERQ_SIGNAL	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal)
+#define DRM_IOCTL_AMDGPU_USERQ_WAIT	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait)
 
 /**
  * DOC: memory domains
@@ -431,6 +435,112 @@ union drm_amdgpu_userq {
struct drm_amdgpu_userq_out out;
 };
 
+/* dma_resv usage flag */
+#define AMDGPU_USERQ_BO_WRITE  1
+
+/* userq signal/wait ioctl */
+struct drm_amdgpu_userq_signal {
+   /**
+* @queue_id: Queue handle used by the userq fence creation function
+* to retrieve the WPTR.
+*/
+   __u32   queue_id;
+   /**
+* @flags: flags to indicate special function for userq fence creation.
+* Unused for now.
+*/
+   __u32   flags;
+   /**
+* @syncobj_handles_array: An array of syncobj handles used by the userq fence
+* creation IOCTL to install the created dma_fence object which can be
+* utilized by userspace to explicitly synchronize GPU commands.
+*/
+   __u64   syncobj_handles_array;
+   /**
+* @num_syncobj_handles: A count that represents the number of syncobj handles in
+* @syncobj_handles_array.
+*/
+   __u64   num_syncobj_handles;
+   /**
+* @syncobj_point: A given point on the timeline to be signaled.
+* Unused for now.
+*/
+   __u64   syncobj_point;
+   /**
+* @bo_handles_array: An array of GEM BO handles used by the userq fence creation
+* IOCTL to install the created dma_fence object which can be utilized by
+* userspace to synchronize the BO usage between user processes.
+*/
+   __u64   bo_handles_array;
+   /**
+* @num_bo_handles: A count that represents the number of GEM BO handles in
+* @bo_handles_array.
+*/
+   __u32   num_bo_handles;
+   /**
+* @bo_flags: flags to indicate BOs synchronize for READ or WRITE
+*/
+   __u32   bo_flags;
+};
+
+struct drm_amdgpu_userq_fence_info {
+   /**
+* @va: A gpu address allocated for each queue which stores the
+* read pointer (RPTR) value.
+*/
+   __u64   va;
+   /**
+* @value: A 64 bit value represents the write pointer (WPTR) of the
+* queue commands which compared with the RPTR value to signal the
+* fences.
+*/
+   __u64   value;
+};
+
+struct drm_amdgpu_userq_wait {
+   /**
+* @flags: flags to specify special function for userq wait information.
+* Unused for now.
+*/
+   __u32   flags;
+   /**
+* @bo_wait_flags: flags to define the BOs for READ or WRITE to store the
+* matching fence wait info pair in @userq_fence_info.
+*/
+   __u32   bo_wait_flags;
+   __u32   pad;
+   /**
+* @syncobj_handles_array: An array of syncobj handles defined to get the
+* fence wait 

[PATCH v10 2/6] drm/amdgpu: Add mqd support for the fence address

2024-05-10 Thread Arunpravin Paneer Selvam
- Add a field in struct v11_gfx_mqd for userqueue
  fence address.

- Assign fence gpu VA address to the userqueue mqd
  fence address fields.

v2: Remove the mask and replace with lower_32_bits (Christian)

Signed-off-by: Arunpravin Paneer Selvam 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c | 11 +++
 drivers/gpu/drm/amd/include/v11_structs.h|  4 ++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c
index 6369a26fb07e..d8592cfef534 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c
@@ -25,6 +25,7 @@
 #include "v11_structs.h"
 #include "mes_v11_0.h"
 #include "amdgpu_userqueue.h"
+#include "amdgpu_userq_fence.h"
 
 #define AMDGPU_USERQ_PROC_CTX_SZ PAGE_SIZE
 #define AMDGPU_USERQ_GANG_CTX_SZ PAGE_SIZE
@@ -204,6 +205,14 @@ static int mes_v11_0_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr,
return 0;
 }
 
+static void mes_v11_0_userq_set_fence_space(struct amdgpu_usermode_queue *queue)
+{
+   struct v11_gfx_mqd *mqd = queue->mqd.cpu_ptr;
+
+   mqd->fenceaddress_lo = lower_32_bits(queue->fence_drv->gpu_addr);
+   mqd->fenceaddress_hi = upper_32_bits(queue->fence_drv->gpu_addr);
+}
+
 static int mes_v11_0_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr,
  struct drm_amdgpu_userq_in *args_in,
  struct amdgpu_usermode_queue *queue)
@@ -274,6 +283,8 @@ static int mes_v11_0_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr,
goto free_mqd;
}
 
+   mes_v11_0_userq_set_fence_space(queue);
+
/* FW expects WPTR BOs to be mapped into GART */
r = mes_v11_0_create_wptr_mapping(uq_mgr, queue, 
userq_props->wptr_gpu_addr);
if (r) {
diff --git a/drivers/gpu/drm/amd/include/v11_structs.h 
b/drivers/gpu/drm/amd/include/v11_structs.h
index f8008270f813..797ce6a1e56e 100644
--- a/drivers/gpu/drm/amd/include/v11_structs.h
+++ b/drivers/gpu/drm/amd/include/v11_structs.h
@@ -535,8 +535,8 @@ struct v11_gfx_mqd {
uint32_t reserved_507; // offset: 507  (0x1FB)
uint32_t reserved_508; // offset: 508  (0x1FC)
uint32_t reserved_509; // offset: 509  (0x1FD)
-   uint32_t reserved_510; // offset: 510  (0x1FE)
-   uint32_t reserved_511; // offset: 511  (0x1FF)
+   uint32_t fenceaddress_lo; // offset: 510  (0x1FE)
+   uint32_t fenceaddress_hi; // offset: 511  (0x1FF)
 };
 
 struct v11_sdma_mqd {
-- 
2.25.1



[PATCH] drm/buddy: Fix the range bias clear memory allocation issue

2024-05-08 Thread Arunpravin Paneer Selvam
Problem statement: During system boot, an application requests a bulk
volume of cleared range bias memory while clear_avail is zero. We don't
fall back into the normal allocation method because of an unnecessary
clear_avail check; this prevents the fallback and leads to an fb
allocation failure, after which the system becomes unresponsive.

Solution: Remove the unnecessary clear_avail check in the range bias
allocation function.

Signed-off-by: Arunpravin Paneer Selvam 
Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature")
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/drm_buddy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 284ebae71cc4..831929ac95eb 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -574,7 +574,7 @@ __drm_buddy_alloc_range_bias(struct drm_buddy *mm,
 
block = __alloc_range_bias(mm, start, end, order,
   flags, fallback);
-   if (IS_ERR(block) && mm->clear_avail)
+   if (IS_ERR(block))
return __alloc_range_bias(mm, start, end, order,
  flags, !fallback);
 
-- 
2.25.1



[PATCH] drm/amdgpu: Fix the BO release clear memory warning

2024-05-06 Thread Arunpravin Paneer Selvam
038638]  ? srso_alias_return_thunk+0x5/0xfbef5
[6.038639]  ? do_syscall_64+0x90/0x1a0
[6.038640]  ? srso_alias_return_thunk+0x5/0xfbef5
[6.038642]  ? srso_alias_return_thunk+0x5/0xfbef5
[6.038644]  entry_SYSCALL_64_after_hwframe+0x78/0x80
[6.038645] RIP: 0033:0x7f2bd1e9d059
[6.038647] Code: 08 89 e8 5b 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 
f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 
f0 ff ff 73 01 c3 48 8b 0d 8f 1d 0d 00 f7 d8 64 89 01 48
[6.038648] RSP: 002b:7fffaf804878 EFLAGS: 0246 ORIG_RAX: 
0139
[6.038650] RAX: ffda RBX: 55a9b2328d60 RCX: 7f2bd1e9d059
[6.038650] RDX:  RSI: 7f2bd1fd0509 RDI: 0024
[6.038651] RBP:  R08: 0040 R09: 55a9b23000a0
[6.038652] R10: 0038 R11: 0246 R12: 7f2bd1fd0509
[6.038652] R13: 0002 R14: 55a9b2326f90 R15: 
[6.038655]  
[6.038656] ---[ end trace  ]---

Signed-off-by: Arunpravin Paneer Selvam 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index b2a83c802bbd..ecd0cf6cdd2d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1371,6 +1371,9 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object 
*bo)
adev->in_suspend || drm_dev_is_unplugged(adev_to_drm(adev)))
return;
 
+   if (!adev->mman.buffer_funcs_enabled)
+   return;
+
if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
return;
 
-- 
2.25.1



[PATCH] drm/buddy: Fix the range bias clear memory allocation issue

2024-05-06 Thread Arunpravin Paneer Selvam
Problem statement: During system boot, an application requests a bulk
volume of cleared range bias memory while clear_avail is zero. We don't
fall back into the normal allocation method because of an unnecessary
clear_avail check; this prevents the fallback and leads to an fb
allocation failure, after which the system becomes unresponsive.

Solution: Remove the unnecessary clear_avail check in the range bias
allocation function.

Signed-off-by: Arunpravin Paneer Selvam 
Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature")
---
 drivers/gpu/drm/drm_buddy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 284ebae71cc4..831929ac95eb 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -574,7 +574,7 @@ __drm_buddy_alloc_range_bias(struct drm_buddy *mm,
 
block = __alloc_range_bias(mm, start, end, order,
   flags, fallback);
-   if (IS_ERR(block) && mm->clear_avail)
+   if (IS_ERR(block))
return __alloc_range_bias(mm, start, end, order,
  flags, !fallback);
 
-- 
2.25.1



Re: Splat during driver probe

2024-05-03 Thread Paneer Selvam, Arunpravin

Hi Alex,

yes, this is related to memory clearing changes. This is a known issue.
I am working on a fix.

Thanks,
Arun.

On 5/3/2024 6:46 PM, Alex Deucher wrote:

Possibly related to Arun's new memory clearing changes.  @Arunpravin
can you take a look?

Alex

On Fri, May 3, 2024 at 9:10 AM Tvrtko Ursulin  wrote:


Hi,

I tried asking on #radeon yesterday but no takers. I was wondering if
the below splat is already known or not.

Appears to happen due to amdgpu_bo_release_notify genuinely running
before amdgpu_ttm_set_buffer_funcs_status sets the buffer funcs to
enabled during device probe.
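
A minimal sketch of the guard that addresses this, matching the
"drm/amdgpu: Fix the BO release clear memory warning" patch elsewhere in
this archive:

	/* In amdgpu_bo_release_notify(), before attempting the clear: */
	if (!adev->mman.buffer_funcs_enabled)
		return;	/* clear engine not ready yet (early probe/teardown) */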

I couldn't easily figure out what changed.

Regards,

Tvrtko

[   11.802030] [drm:amdgpu_fill_buffer [amdgpu]] *ERROR* Trying to clear
memory with ring turned off.
[   11.802519] [ cut here ]
[   11.802530] WARNING: CPU: 5 PID: 435 at
drivers/gpu/drm/amd/amdgpu/amdgpu_object.c:1378
amdgpu_bo_release_notify+0x20c/0x230 [amdgpu]
[   11.802916] Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6
nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct
amdgpu(E+) nft_chain_nat nf_tables ip6table_nat ip6table_mangle
ip6table_raw ip6table_security iptable_nat nf_nat nf_conntrack
nf_defrag_ipv6 nf_defrag_ipv4 iptable_mangle iptable_raw
iptable_security nfnetlink ip6table_filter ip6_tables iptable_filter
cmac algif_hash ramoops algif_skcipher reed_solomon af_alg bnep qrtr
mousedev intel_rapl_msr ath11k_pci(+) intel_rapl_common mhi
snd_acp_sof_mach snd_acp_mach snd_sof_probes ath11k snd_soc_dmic kvm_amd
cdc_mbim qmi_helpers joydev hid_multitouch cdc_wdm snd_sof_amd_vangogh
snd_sof_pci kvm mac80211 snd_hda_codec_hdmi hci_uart crct10dif_pclmul
amdxcp i2c_algo_bit snd_sof_amd_acp btqca drm_ttm_helper libarc4
crc32_pclmul btrtl snd_hda_intel snd_sof_xtensa_dsp btbcm ttm
snd_intel_dspcfg btintel polyval_clmulni snd_sof drm_exec
polyval_generic snd_hda_codec snd_sof_utils gpu_sched cfg80211 bluetooth
snd_pci_acp5x gf128mul cdc_ncm
[   11.803070]  sp5100_tco snd_hwdep ghash_clmulni_intel snd_soc_nau8821
snd_soc_max98388 drm_suballoc_helper sha512_ssse3 cdc_ether
snd_acp_config drm_buddy atkbd snd_soc_core aesni_intel snd_hda_core
video ecdh_generic crypto_simd libps2 usbnet cryptd rapl mii wdat_wdt
vivaldi_fmap pcspkr snd_compress i2c_piix4 snd_pcm drm_display_helper
ccp snd_soc_acpi cdc_acm rfkill wmi ltrf216a 8250_dw i2c_hid_acpi
snd_timer snd i2c_hid industrialio soundcore hid_steam pkcs8_key_parser
crypto_user loop fuse dm_mod ip_tables x_tables overlay ext4 crc16
mbcache jbd2 usbhid vfat fat btrfs blake2b_generic libcrc32c
crc32c_generic xor raid6_pq sdhci_pci cqhci nvme serio_raw sdhci
xhci_pci xhci_pci_renesas crc32c_intel mmc_core nvme_core i8042 serio
spi_amd
[   11.803448] CPU: 5 PID: 435 Comm: (udev-worker) Tainted: GW
E  6.9.0-rc6 #3 dd3fb65c639fd86ff89731c604b1e804996e8989
[   11.803471] Hardware name: Valve Galileo/Galileo, BIOS F7G0107 12/01/2023
[   11.803486] RIP: 0010:amdgpu_bo_release_notify+0x20c/0x230 [amdgpu]
[   11.803857] Code: 0b e9 a4 fe ff ff 48 ba ff ff ff ff ff ff ff 7f 31
f6 4c 89 e7 e8 44 e5 33 e6 eb 8d e8 dd dd 33 e6 eb a7 0f 0b e9 4d fe ff
ff <0f> 0b eb 9c be 03 00 00 00 e8 f6 04 00 e6 eb 90 e8 8f 64 79 e6 66
[   11.803890] RSP: 0018:a77bc1537568 EFLAGS: 00010282
[   11.803904] RAX: ffea RBX: 8e1f0da89048 RCX:

[   11.803919] RDX:  RSI: 0027 RDI:

[   11.803934] RBP: 8e1f6ec0ffb0 R08:  R09:
0003
[   11.803948] R10: a77bc15372f0 R11: 8e223ef7ffe8 R12:
8e1f0da89000
[   11.803963] R13: 8e1f0da89180 R14: 8e1f6ec0ffb0 R15:
8e1f6ec0
[   11.803977] FS:  7fa2b600b200() GS:8e222ee8()
knlGS:
[   11.803994] CS:  0010 DS:  ES:  CR0: 80050033
[   11.804007] CR2: 55cac2aa7080 CR3: 00010f818000 CR4:
00350ef0
[   11.804021] Call Trace:
[   11.804031]  
[   11.804042]  ? __warn+0x8c/0x180
[   11.804057]  ? amdgpu_bo_release_notify+0x20c/0x230 [amdgpu
03b1dca7e09918e3f45efb1acdfc90682f73e4c1]
[   11.804456]  ? report_bug+0x191/0x1c0
[   11.804473]  ? handle_bug+0x3a/0x70
[   11.804485]  ? exc_invalid_op+0x17/0x70
[   11.804497]  ? asm_exc_invalid_op+0x1a/0x20
[   11.804517]  ? amdgpu_bo_release_notify+0x20c/0x230 [amdgpu
03b1dca7e09918e3f45efb1acdfc90682f73e4c1]
[   11.804894]  ? amdgpu_bo_release_notify+0x14a/0x230 [amdgpu
03b1dca7e09918e3f45efb1acdfc90682f73e4c1]
[   11.805277]  ttm_bo_release+0x115/0x330 [ttm
39c917822ce73b2a754d4183af7a0990991c66be]
[   11.805303]  ? srso_return_thunk+0x5/0x5f
[   11.805315]  ? __mutex_unlock_slowpath+0x3a/0x290
[   11.805332]  amdgpu_bo_free_kernel+0xd6/0x130 [amdgpu
03b1dca7e09918e3f45efb1acdfc90682f73e4c1]
[   11.805709]  dm_helpers_free_gpu_mem+0x41/0x80 [amdgpu
03b1dca7e09918e3f45efb1acdfc90682f73e4c1]
[   11.806172]  vg_clk_mgr_construct+0x2c4/0x490 [amdgpu
03b1dca7e09918e3f45efb1acdfc90682f73e4c1]
[   11.

[PATCH v9 5/5] drm/amdgpu: Enable userq fence interrupt support

2024-04-29 Thread Arunpravin Paneer Selvam
Add support to handle the userqueue protected fence signal hardware
interrupt.

Create an xarray which maps the doorbell index to the fence driver address.
This helps to retrieve the fence driver information when a userq fence
interrupt is triggered. Firmware sends the doorbell offset value, and
this info is compared with the queue's mqd doorbell offset value.
If they are the same, we process the userq fence interrupt.

Signed-off-by: Arunpravin Paneer Selvam 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  5 ++--
 .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   |  6 +
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c| 25 ++-
 drivers/gpu/drm/amd/amdgpu/mes_v10_1.c|  5 
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c|  7 --
 6 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4ca14b02668b..2d5ef2e74c71 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1043,6 +1043,8 @@ struct amdgpu_device {
struct amdgpu_mqd   mqds[AMDGPU_HW_IP_NUM];
const struct amdgpu_userq_funcs *userq_funcs[AMDGPU_HW_IP_NUM];
 
+   struct xarray   userq_xa;
+
/* df */
struct amdgpu_dfdf;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7753a2e64d41..fd919105a181 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3982,6 +3982,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);
 
+	xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);
+
	INIT_LIST_HEAD(&adev->shadow_list);
	mutex_init(&adev->shadow_list_lock);
 
@@ -4719,9 +4721,6 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
}
adev->in_suspend = false;
 
-   if (adev->enable_mes)
-   amdgpu_mes_self_test(adev);
-
if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
DRM_WARN("smart shift update failed\n");
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index 6fb75cc1d20c..614953b0fc19 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -70,6 +70,7 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev,
struct amdgpu_usermode_queue *userq)
 {
struct amdgpu_userq_fence_driver *fence_drv;
+   unsigned long flags;
int r;
 
fence_drv = kzalloc(sizeof(*fence_drv), GFP_KERNEL);
@@ -96,6 +97,11 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev,
fence_drv->context = dma_fence_context_alloc(1);
get_task_comm(fence_drv->timeline_name, current);
 
+   xa_lock_irqsave(&adev->userq_xa, flags);
+   __xa_store(&adev->userq_xa, userq->doorbell_index,
+  fence_drv, GFP_KERNEL);
+   xa_unlock_irqrestore(&adev->userq_xa, flags);
+
userq->fence_drv = fence_drv;
 
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index a786e25432ae..d6cdca0a652f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -49,6 +49,7 @@
 #include "gfx_v11_0_3.h"
 #include "nbio_v4_3.h"
 #include "mes_v11_0.h"
+#include "amdgpu_userq_fence.h"
 
 #define GFX11_NUM_GFX_RINGS1
 #define GFX11_MEC_HPD_SIZE 2048
@@ -5939,25 +5940,25 @@ static int gfx_v11_0_eop_irq(struct amdgpu_device *adev,
 struct amdgpu_irq_src *source,
 struct amdgpu_iv_entry *entry)
 {
-   int i;
+   u32 doorbell_offset = entry->src_data[0];
u8 me_id, pipe_id, queue_id;
struct amdgpu_ring *ring;
-   uint32_t mes_queue_id = entry->src_data[0];
+   int i;
 
DRM_DEBUG("IH: CP EOP\n");
 
-   if (adev->enable_mes && (mes_queue_id & AMDGPU_FENCE_MES_QUEUE_FLAG)) {
-   struct amdgpu_mes_queue *queue;
+   if (adev->enable_mes && doorbell_offset) {
+   struct amdgpu_userq_fence_driver *fence_drv = NULL;
	struct xarray *xa = &adev->userq_xa;
+   unsigned long index, flags;
 
-   mes_queue_id &= AMDGPU_FENCE_MES_QUEUE_ID_MASK;
+   xa_lock_irqsave(xa, flags);
+   xa_for_each(xa, index, fence_drv)
+   if (doorbell_offset == index)
+   break;
+   xa_unlock_irqrestore(xa, flags);
 
	spin_lock(&adev->mes.queue_id_lock);
-   

[PATCH v9 3/5] drm/amdgpu: UAPI headers for userqueue Secure semaphore

2024-04-29 Thread Arunpravin Paneer Selvam
Add UAPI header support for userqueue Secure semaphore

v2: Worked on review comments from Christian for the following
modifications

- Add bo handles, bo flags and padding fields.
- Include value/va in a combined array.

v3: Worked on review comments from Christian

- Add num_fences field to obtain the number of objects required
  to allocate memory for userq_fence_info.
- Replace obj_handle name with syncobj_handle.
- Replace point name with syncobj_point.
- Replace count_handles name with num_syncobj_handles.
- Fix structure padding related issues.

v4: Worked on review comments from Christian
- Modify the bo flags description.

Signed-off-by: Alex Deucher 
Signed-off-by: Arunpravin Paneer Selvam 
Reviewed-by: Christian König 
---
 include/uapi/drm/amdgpu_drm.h | 115 ++
 1 file changed, 115 insertions(+)

diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index def6874a588c..1217ae30a8d8 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -55,6 +55,8 @@ extern "C" {
 #define DRM_AMDGPU_FENCE_TO_HANDLE 0x14
 #define DRM_AMDGPU_SCHED   0x15
 #define DRM_AMDGPU_USERQ   0x16
+#define DRM_AMDGPU_USERQ_SIGNAL0x17
+#define DRM_AMDGPU_USERQ_WAIT  0x18
 
 #define DRM_IOCTL_AMDGPU_GEM_CREATE	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create)
 #define DRM_IOCTL_AMDGPU_GEM_MMAP	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap)
@@ -73,6 +75,8 @@ extern "C" {
 #define DRM_IOCTL_AMDGPU_FENCE_TO_HANDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_FENCE_TO_HANDLE, union drm_amdgpu_fence_to_handle)
 #define DRM_IOCTL_AMDGPU_SCHED	DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_SCHED, union drm_amdgpu_sched)
 #define DRM_IOCTL_AMDGPU_USERQ	DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ, union drm_amdgpu_userq)
+#define DRM_IOCTL_AMDGPU_USERQ_SIGNAL	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal)
+#define DRM_IOCTL_AMDGPU_USERQ_WAIT	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait)
 
 /**
  * DOC: memory domains
@@ -431,6 +435,117 @@ union drm_amdgpu_userq {
struct drm_amdgpu_userq_out out;
 };
 
+/* dma_resv usage flag */
+#define AMDGPU_USERQ_BO_WRITE  1
+
+/* userq signal/wait ioctl */
+struct drm_amdgpu_userq_signal {
+   /**
+* @queue_id: Queue handle used by the userq fence creation function
+* to retrieve the WPTR.
+*/
+   __u32   queue_id;
+   /**
+* @flags: flags to indicate special function for userq fence creation.
+* Unused for now.
+*/
+   __u32   flags;
+   /**
+* @syncobj_handles_array: An array of syncobj handles used by the userq fence
+* creation IOCTL to install the created dma_fence object which can be
+* utilized by userspace to explicitly synchronize GPU commands.
+*/
+   __u64   syncobj_handles_array;
+   /**
+* @num_syncobj_handles: A count that represents the number of syncobj handles in
+* @syncobj_handles_array.
+*/
+   __u64   num_syncobj_handles;
+   /**
+* @syncobj_point: A given point on the timeline to be signaled.
+* Unused for now.
+*/
+   __u64   syncobj_point;
+   /**
+* @bo_handles_array: An array of GEM BO handles used by the userq fence creation
+* IOCTL to install the created dma_fence object which can be utilized by
+* userspace to synchronize the BO usage between user processes.
+*/
+   __u64   bo_handles_array;
+   /**
+* @num_bo_handles: A count that represents the number of GEM BO handles in
+* @bo_handles_array.
+*/
+   __u32   num_bo_handles;
+   /**
+* @bo_flags: flags to indicate BOs synchronize for READ or WRITE
+*/
+   __u32   bo_flags;
+};
+
+struct drm_amdgpu_userq_fence_info {
+   /**
+* @va: A gpu address allocated for each queue which stores the
+* read pointer (RPTR) value.
+*/
+   __u64   va;
+   /**
+* @value: A 64 bit value represents the write pointer (WPTR) of the
+* queue commands which compared with the RPTR value to signal the
+* fences.
+*/
+   __u64   value;
+};
+
+struct drm_amdgpu_userq_wait {
+   /**
+* @waitq_id: Queue handle used to retrieve the queue information to store
+* the fence driver references in the wait user queue structure.
+*/
+   __u32   waitq_id;
+   /**
+* @flags: flags to specify special function for userq wait information.
+* Unused for now.
+*/
+   __u32   flags;
+   /**
+* @bo_wait_flags: flags to define the BOs for READ or WRITE to store the
+* matching 

[PATCH v9 4/5] drm/amdgpu: Implement userqueue signal/wait IOCTL

2024-04-29 Thread Arunpravin Paneer Selvam
This patch introduces new IOCTLs for the userqueue secure semaphore.

The signal IOCTL called from the userspace application creates a drm
syncobj and an array of bo GEM handles and passes them in as parameters
to the driver, which installs the fence into them.

The wait IOCTL gets an array of drm syncobjs, finds the fences
attached to the drm syncobjs and obtains the array of
memory_address/fence_value combinations, which is returned to
userspace.

v2: (Christian)
- Install fence into GEM BO object.
- Lock all BO's using the dma resv subsystem
- Reorder the sequence in signal IOCTL function.
- Get write pointer from the shadow wptr
- use userq_fence to fetch the va/value in wait IOCTL.

v3: (Christian)
- Use drm_exec helper for the proper BO drm reserve and avoid BO
  lock/unlock issues.
- fence/fence driver reference count logic for signal/wait IOCTLs.

v4: (Christian)
- Fixed the drm_exec calling sequence
- use dma_resv_for_each_fence_unlock if BO's are not locked
- Modified the fence_info array storing logic.

v5: (Christian)
- Keep fence_drv until wait queue execution.
- Add dma_fence_wait for other fences.
- Lock BO's using drm_exec as the number of fences in them could
  change.
- Install signaled fences as well into BO/Syncobj.
- Move Syncobj fence installation code after the drm_exec_prepare_array.
- Directly add dma_resv_usage_rw(args->bo_flags
- remove unnecessary dma_fence_put.

v6: (Christian)
- Add xarray stuff to store the fence_drv
- Implement a function to iterate over the xarray and drop
  the fence_drv references.
- Add drm_exec_until_all_locked() wrapper
- Add a check that if we haven't exceeded the user allocated num_fences
  before adding dma_fence to the fences array.

v7: (Christian)
- Use memdup_user() for kmalloc_array + copy_from_user
- Move the fence_drv references from the xarray into the newly created fence
  and drop the fence_drv references when we signal this fence.
- Move this locking of BOs before the "if (!wait_info->num_fences)",
  this way you need this code block only once.
- Merge the error handling code and the cleanup + return 0 code.
- Initializing the xa should probably be done in the userq code.
- Remove the userq back pointer stored in fence_drv.
- Pass xarray as parameter in amdgpu_userq_walk_and_drop_fence_drv()

v8: (Christian)
- Move fence_drv references must come before adding the fence to the list.
- Use xa_lock_irqsave_nested for nested spinlock operations.
- userq_mgr should be per fpriv and not one per device.
- Restructure the interrupt process code for the early exit of the loop.
- The reference acquired in the syncobj fence replace code needs to be
  kept around.
- Modify the dma_fence acquire placement in wait IOCTL.
- Move USERQ_BO_WRITE flag to UAPI header file.
- drop the fence drv reference after telling the hw to stop accessing it.
- Add multi sync object support to userq signal IOCTL.

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |   2 +
 .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   | 454 +-
 .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.h   |   5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c |  29 +-
 .../gpu/drm/amd/include/amdgpu_userqueue.h|   1 +
 5 files changed, 484 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 844f7b5f90db..5892a4c1a92e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2918,6 +2918,8 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = {
DRM_IOCTL_DEF_DRV(AMDGPU_GEM_OP, amdgpu_gem_op_ioctl, 
DRM_AUTH|DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(AMDGPU_GEM_USERPTR, amdgpu_gem_userptr_ioctl, 
DRM_AUTH|DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(AMDGPU_USERQ, amdgpu_userq_ioctl, 
DRM_AUTH|DRM_RENDER_ALLOW),
+   DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, amdgpu_userq_signal_ioctl, 
DRM_AUTH|DRM_RENDER_ALLOW),
+   DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl, 
DRM_AUTH|DRM_RENDER_ALLOW),
 };
 
 static const struct drm_driver amdgpu_kms_driver = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index f7baea2c67ab..6fb75cc1d20c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 
+#include 
 #include 
 
 #include "amdgpu.h"
@@ -102,8 +103,11 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev,
 
 void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver 
*fence_drv)
 {
+   struct amdgpu_userq_fence_driver *stored_fence_drv;
struct amdgpu_userq_fence *userq_fence, *tmp;
+   

[PATCH v9 1/5] drm/amdgpu: Implement a new userqueue fence driver

2024-04-29 Thread Arunpravin Paneer Selvam
Developed a userqueue fence driver for the userqueue process shared
BO synchronization.

Create a dma fence having the write pointer as the seqno and allocate a
seq64 memory for each user queue process and feed this memory address
into the firmware/hardware, so that the firmware writes the read pointer
into the given address when the process completes its execution.
Compare wptr and rptr; if rptr >= wptr, signal the fences so that the
waiting process can consume the buffers.

v2: Worked on review comments from Christian for the following
modifications

- Add wptr as sequence number into the fence
- Add a reference count for the fence driver
- Add dma_fence_put below the list_del as it might
  free the userq fence.
- Trim unnecessary code in interrupt handler.
- Check dma fence signaled state in dma fence creation
  function for a potential problem of hardware completing
  the job processing beforehand.
- Add necessary locks.
- Create a list and process all the unsignaled fences.
- clean up fences in destroy function.
- implement .signaled callback function

v3: Worked on review comments from Christian
- Modify naming convention for reference counted objects
- Fix fence driver reference drop issue
- Drop amdgpu_userq_fence_driver_process() function return value

v4: Worked on review comments from Christian
- Moved fence driver allocation into amdgpu_userq_fence_driver_alloc()
- Added detail doc mentioning the differences b/w
  two spinlocks declared.

v5: Worked on review comments from Christian
- Check before upcast and remove local variable
- Add error handling in fence_drv alloc function.
- Move rptr read fn outside of the loop and remove WARN_ON in
  destroy function.

v6:
  - clear the seq64 memory in user fence driver(Christian)
  - fix for the wptr va bo mapping(Christian)
  - move the fence_drv xa entry erase code from the interrupt handler
into user fence destroy function

Signed-off-by: Arunpravin Paneer Selvam 
Reviewed-by: Christian König 
Suggested-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/Makefile   |   3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |   6 +
 .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   | 257 ++
 .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.h   |  69 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c |  12 +-
 .../gpu/drm/amd/include/amdgpu_userqueue.h|   5 +-
 6 files changed, 347 insertions(+), 5 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index a640bfa468ad..f8dd808bd71c 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -80,7 +80,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o 
amdgpu_kms.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
amdgpu_fw_attestation.o amdgpu_securedisplay.o \
amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
-   amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o
+   amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o \
+   amdgpu_userq_fence.o
 
 amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index acee1c279abb..844f7b5f90db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -51,6 +51,7 @@
 #include "amdgpu_sched.h"
 #include "amdgpu_xgmi.h"
 #include "amdgpu_userqueue.h"
+#include "amdgpu_userq_fence.h"
 #include "../amdxcp/amdgpu_xcp_drv.h"
 
 /*
@@ -3012,6 +3013,10 @@ static int __init amdgpu_init(void)
if (r)
goto error_fence;
 
+   r = amdgpu_userq_fence_slab_init();
+   if (r)
+   goto error_fence;
+
DRM_INFO("amdgpu kernel modesetting enabled.\n");
amdgpu_register_atpx_handler();
amdgpu_acpi_detect();
@@ -3037,6 +3042,7 @@ static void __exit amdgpu_exit(void)
amdgpu_acpi_release();
amdgpu_sync_fini();
amdgpu_fence_slab_fini();
+   amdgpu_userq_fence_slab_fini();
mmu_notifier_synchronize();
amdgpu_xcp_drv_release();
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
new file mode 100644
index ..f7baea2c67ab
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, includi

[PATCH v9 2/5] drm/amdgpu: Add mqd support for the fence address

2024-04-29 Thread Arunpravin Paneer Selvam
- Add a field in struct v11_gfx_mqd for userqueue
  fence address.

- Assign fence gpu VA address to the userqueue mqd
  fence address fields.

v2: Remove the mask and replace with lower_32_bits (Christian)
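
The split itself is the usual 64-bit halving; a stand-alone equivalent
of the kernel's lower_32_bits()/upper_32_bits() helpers for reference:

#include <stdint.h>

static inline uint32_t lo_32(uint64_t v) { return (uint32_t)v; }         /* lower_32_bits() */
static inline uint32_t hi_32(uint64_t v) { return (uint32_t)(v >> 32); } /* upper_32_bits() */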

Signed-off-by: Arunpravin Paneer Selvam 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c | 11 +++
 drivers/gpu/drm/amd/include/v11_structs.h|  4 ++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c
index 6369a26fb07e..d8592cfef534 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c
@@ -25,6 +25,7 @@
 #include "v11_structs.h"
 #include "mes_v11_0.h"
 #include "amdgpu_userqueue.h"
+#include "amdgpu_userq_fence.h"
 
 #define AMDGPU_USERQ_PROC_CTX_SZ PAGE_SIZE
 #define AMDGPU_USERQ_GANG_CTX_SZ PAGE_SIZE
@@ -204,6 +205,14 @@ static int mes_v11_0_userq_create_ctx_space(struct 
amdgpu_userq_mgr *uq_mgr,
return 0;
 }
 
+static void mes_v11_0_userq_set_fence_space(struct amdgpu_usermode_queue 
*queue)
+{
+   struct v11_gfx_mqd *mqd = queue->mqd.cpu_ptr;
+
+   mqd->fenceaddress_lo = lower_32_bits(queue->fence_drv->gpu_addr);
+   mqd->fenceaddress_hi = upper_32_bits(queue->fence_drv->gpu_addr);
+}
+
 static int mes_v11_0_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr,
  struct drm_amdgpu_userq_in *args_in,
  struct amdgpu_usermode_queue *queue)
@@ -274,6 +283,8 @@ static int mes_v11_0_userq_mqd_create(struct 
amdgpu_userq_mgr *uq_mgr,
goto free_mqd;
}
 
+   mes_v11_0_userq_set_fence_space(queue);
+
/* FW expects WPTR BOs to be mapped into GART */
r = mes_v11_0_create_wptr_mapping(uq_mgr, queue, 
userq_props->wptr_gpu_addr);
if (r) {
diff --git a/drivers/gpu/drm/amd/include/v11_structs.h 
b/drivers/gpu/drm/amd/include/v11_structs.h
index f8008270f813..797ce6a1e56e 100644
--- a/drivers/gpu/drm/amd/include/v11_structs.h
+++ b/drivers/gpu/drm/amd/include/v11_structs.h
@@ -535,8 +535,8 @@ struct v11_gfx_mqd {
uint32_t reserved_507; // offset: 507  (0x1FB)
uint32_t reserved_508; // offset: 508  (0x1FC)
uint32_t reserved_509; // offset: 509  (0x1FD)
-   uint32_t reserved_510; // offset: 510  (0x1FE)
-   uint32_t reserved_511; // offset: 511  (0x1FF)
+   uint32_t fenceaddress_lo; // offset: 510  (0x1FE)
+   uint32_t fenceaddress_hi; // offset: 511  (0x1FF)
 };
 
 struct v11_sdma_mqd {
-- 
2.25.1



[PATCH v5] drm/amdgpu: Modify the contiguous flags behaviour

2024-04-25 Thread Arunpravin Paneer Selvam
Now we have two flags for contiguous VRAM buffer allocation.
If the application requests AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.

When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.
- we will try to allocate the contiguous buffer. If the
  allocation fails, we fall back to allocating the individual pages.

When we set TTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
  and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously;
  if the allocation fails, we return -ENOSPC.
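
In rough stand-alone C, the resulting policy looks like this (the helper
names are placeholders, not the actual vram manager code):

#include <errno.h>
#include <stdbool.h>

/* stand-ins for the buddy allocator; pretend VRAM is fragmented */
static int alloc_vram_contiguous(void) { return -ENOSPC; }
static int alloc_vram_pages(void) { return 0; }

static int vram_alloc(bool gem_contiguous, bool ttm_contiguous)
{
	if (gem_contiguous || ttm_contiguous) {
		if (alloc_vram_contiguous() == 0)
			return 0;        /* got a contiguous block */
		if (ttm_contiguous)
			return -ENOSPC;  /* TTM_PL_FLAG_CONTIGUOUS is mandatory */
	}
	return alloc_vram_pages();       /* GEM flag only: fall back to pages */
}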

v2:
  - keep the mem_flags and bo->flags check as is(Christian)
  - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the
amdgpu_bo_pin_restricted function placement range iteration
loop(Christian)
  - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block
(Christian)
  - Keep the kernel BO allocation as is (Christian)
  - If BO pin vram allocation failed, we need to return -ENOSPC as
RDMA cannot work with scattered VRAM pages(Philip)

v3(Christian):
  - keep contiguous flag handling outside of pages_per_block
calculation
  - remove the hacky implementation in contiguous flag error
handling code

v4(Christian):
  - use any variable and return value for non-contiguous
fallback

v5: rebase to amd-staging-drm-next branch

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Christian König 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   |  8 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 23 +++-
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 331b9ed8062c..316a9f897f2b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo 
*abo, u32 domain)
else
places[c].flags |= TTM_PL_FLAG_TOPDOWN;
 
-   if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
+   if (abo->tbo.type == ttm_bo_type_kernel &&
+   flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+
c++;
}
 
@@ -964,6 +966,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 
domain,
if (!bo->placements[i].lpfn ||
(lpfn && lpfn < bo->placements[i].lpfn))
bo->placements[i].lpfn = lpfn;
+
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+   bo->placements[i].mem_type == TTM_PL_VRAM)
+   bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
}
 
r = ttm_bo_validate(>tbo, >placement, );
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..f23002ed2b42 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -450,6 +450,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
 {
struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
struct amdgpu_device *adev = to_amdgpu_device(mgr);
+   struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
u64 vis_usage = 0, max_bytes, min_block_size;
struct amdgpu_vram_mgr_resource *vres;
u64 size, remaining_size, lpfn, fpfn;
@@ -468,7 +469,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
if (tbo->type != ttm_bo_type_kernel)
max_bytes -= AMDGPU_VM_RESERVED_VRAM;
 
-   if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
pages_per_block = ~0ul;
} else {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -477,7 +478,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
/* default to 2MB */
pages_per_block = 2UL << (20UL - PAGE_SHIFT);
 #endif
-   pages_per_block = max_t(uint32_t, pages_per_block,
+   pages_per_block = max_t(u32, pages_per_block,
tbo->page_alignment);
}
 
@@ -498,7 +499,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
if (place->flags & TTM_PL_FLAG_TOPDOWN)
vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
 
-   if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
vres-

Re: [PATCH v3] drm/amdgpu: Modify the contiguous flags behaviour

2024-04-25 Thread Paneer Selvam, Arunpravin

Hi Christian,

On 4/24/2024 2:02 PM, Christian König wrote:

Am 24.04.24 um 09:13 schrieb Arunpravin Paneer Selvam:

Now we have two flags for contiguous VRAM buffer allocation.
If the application requests AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.

When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.
- we will try to allocate the contiguous buffer. If the
   allocation fails, we fall back to allocating the individual pages.

When we set TTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
   and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously;
   if the allocation fails, we return -ENOSPC.

v2:
   - keep the mem_flags and bo->flags check as is(Christian)
   - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the
 amdgpu_bo_pin_restricted function placement range iteration
 loop(Christian)
   - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block
 (Christian)
   - Keep the kernel BO allocation as is (Christian)
   - If BO pin vram allocation failed, we need to return -ENOSPC as
 RDMA cannot work with scattered VRAM pages(Philip)

v3(Christian):
   - keep contiguous flag handling outside of pages_per_block
 calculation
   - remove the hacky implementation in contiguous flag error
 handling code

Signed-off-by: Arunpravin Paneer Selvam 


Suggested-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   |  8 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 83 ++--
  2 files changed, 65 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index 492aebc44e51..c594d2a5978e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -154,8 +154,10 @@ void amdgpu_bo_placement_from_domain(struct 
amdgpu_bo *abo, u32 domain)

  else
  places[c].flags |= TTM_PL_FLAG_TOPDOWN;
  -    if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
+    if (abo->tbo.type == ttm_bo_type_kernel &&
+    flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
  places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+
  c++;
  }
  @@ -965,6 +967,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
*bo, u32 domain,

  if (!bo->placements[i].lpfn ||
  (lpfn && lpfn < bo->placements[i].lpfn))
  bo->placements[i].lpfn = lpfn;
+
+    if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+    bo->placements[i].mem_type == TTM_PL_VRAM)
+    bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
  }
    r = ttm_bo_validate(>tbo, >placement, );
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c

index e494f5bf136a..17c5d9ce9927 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -88,6 +88,23 @@ static inline u64 
amdgpu_vram_mgr_blocks_size(struct list_head *head)

  return size;
  }
  +static inline void amdgpu_vram_mgr_limit_min_block_size(unsigned 
long pages_per_block,

+    u64 size,
+    u64 *min_block_size,
+    bool contiguous_enabled)
+{
+    if (contiguous_enabled)
+    return;
+
+    /*
+ * if size >= 2MiB, limit the min_block_size to 2MiB
+ * for better TLB usage.
+ */
+    if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
+    !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
+    *min_block_size = (u64)pages_per_block << PAGE_SHIFT;
+}
+
  /**
   * DOC: mem_info_vram_total
   *
@@ -452,11 +469,12 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,

  struct amdgpu_device *adev = to_amdgpu_device(mgr);
  struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
  u64 vis_usage = 0, max_bytes, min_block_size;
+    struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
  struct amdgpu_vram_mgr_resource *vres;
  u64 size, remaining_size, lpfn, fpfn;
  struct drm_buddy *mm = >mm;
-    struct drm_buddy_block *block;
  unsigned long pages_per_block;
+    struct drm_buddy_block *block;
  int r;
    lpfn = (u64)place->lpfn << PAGE_SHIFT;
@@ -469,18 +487,14 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,

  if (tbo->type != ttm_bo_type_kernel)
  max_bytes -= AMDGPU_VM_RESERVED_VRAM;
  -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
-    pages_per_block = ~0ul;
-    } else {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+    if

[PATCH v4] drm/amdgpu: Modify the contiguous flags behaviour

2024-04-25 Thread Arunpravin Paneer Selvam
Now we have two flags for contiguous VRAM buffer allocation.
If the application requests AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.

When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.
- we will try to allocate the contiguous buffer. If the
  allocation fails, we fall back to allocating the individual pages.

When we set TTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
  and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously;
  if the allocation fails, we return -ENOSPC.

v2:
  - keep the mem_flags and bo->flags check as is(Christian)
  - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the
amdgpu_bo_pin_restricted function placement range iteration
loop(Christian)
  - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block
(Christian)
  - Keep the kernel BO allocation as is (Christian)
  - If BO pin vram allocation failed, we need to return -ENOSPC as
RDMA cannot work with scattered VRAM pages(Philip)

v3(Christian):
  - keep contiguous flag handling outside of pages_per_block
calculation
  - remove the hacky implementation in contiguous flag error
handling code

v4(Christian):
  - use any variable and return value for non-contiguous
fallback

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   |  8 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 22 ++--
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 492aebc44e51..c594d2a5978e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -154,8 +154,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo 
*abo, u32 domain)
else
places[c].flags |= TTM_PL_FLAG_TOPDOWN;
 
-   if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
+   if (abo->tbo.type == ttm_bo_type_kernel &&
+   flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+
c++;
}
 
@@ -965,6 +967,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 
domain,
if (!bo->placements[i].lpfn ||
(lpfn && lpfn < bo->placements[i].lpfn))
bo->placements[i].lpfn = lpfn;
+
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+   bo->placements[i].mem_type == TTM_PL_VRAM)
+   bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
}
 
r = ttm_bo_validate(>tbo, >placement, );
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index e494f5bf136a..6c30eceec896 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -469,7 +469,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
if (tbo->type != ttm_bo_type_kernel)
max_bytes -= AMDGPU_VM_RESERVED_VRAM;
 
-   if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
pages_per_block = ~0ul;
} else {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -478,7 +478,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
/* default to 2MB */
pages_per_block = 2UL << (20UL - PAGE_SHIFT);
 #endif
-   pages_per_block = max_t(uint32_t, pages_per_block,
+   pages_per_block = max_t(u32, pages_per_block,
tbo->page_alignment);
}
 
@@ -499,7 +499,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
if (place->flags & TTM_PL_FLAG_TOPDOWN)
vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
 
-   if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;
 
if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED)
@@ -518,21 +518,31 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,
else
min_block_size = mgr->default_page_size;
 
-   BUG_ON(min_block_size < mm->chunk_size);
-
/* Limit maximum size to 2GiB due to SG table limitations */
size = min(remaining_size, 2ULL <

[PATCH v3] drm/amdgpu: Modify the contiguous flags behaviour

2024-04-24 Thread Arunpravin Paneer Selvam
Now we have two flags for contiguous VRAM buffer allocation.
If the application requests AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.

When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.
- we will try to allocate the contiguous buffer. If the
  allocation fails, we fall back to allocating the individual pages.

When we set TTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
  and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously;
  if the allocation fails, we return -ENOSPC.

v2:
  - keep the mem_flags and bo->flags check as is(Christian)
  - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the
amdgpu_bo_pin_restricted function placement range iteration
loop(Christian)
  - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block
(Christian)
  - Keep the kernel BO allocation as is (Christian)
  - If BO pin vram allocation failed, we need to return -ENOSPC as
RDMA cannot work with scattered VRAM pages(Philip)

v3(Christian):
  - keep contiguous flag handling outside of pages_per_block
calculation
  - remove the hacky implementation in contiguous flag error
handling code

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   |  8 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 83 ++--
 2 files changed, 65 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 492aebc44e51..c594d2a5978e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -154,8 +154,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo 
*abo, u32 domain)
else
places[c].flags |= TTM_PL_FLAG_TOPDOWN;
 
-   if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
+   if (abo->tbo.type == ttm_bo_type_kernel &&
+   flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+
c++;
}
 
@@ -965,6 +967,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 
domain,
if (!bo->placements[i].lpfn ||
(lpfn && lpfn < bo->placements[i].lpfn))
bo->placements[i].lpfn = lpfn;
+
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+   bo->placements[i].mem_type == TTM_PL_VRAM)
+   bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
}
 
r = ttm_bo_validate(>tbo, >placement, );
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index e494f5bf136a..17c5d9ce9927 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -88,6 +88,23 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct 
list_head *head)
return size;
 }
 
+static inline void amdgpu_vram_mgr_limit_min_block_size(unsigned long 
pages_per_block,
+   u64 size,
+   u64 *min_block_size,
+   bool contiguous_enabled)
+{
+   if (contiguous_enabled)
+   return;
+
+   /*
+* if size >= 2MiB, limit the min_block_size to 2MiB
+* for better TLB usage.
+*/
+   if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
+   !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
+   *min_block_size = (u64)pages_per_block << PAGE_SHIFT;
+}
+
 /**
  * DOC: mem_info_vram_total
  *
@@ -452,11 +469,12 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,
struct amdgpu_device *adev = to_amdgpu_device(mgr);
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
u64 vis_usage = 0, max_bytes, min_block_size;
+   struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
struct amdgpu_vram_mgr_resource *vres;
u64 size, remaining_size, lpfn, fpfn;
struct drm_buddy *mm = >mm;
-   struct drm_buddy_block *block;
unsigned long pages_per_block;
+   struct drm_buddy_block *block;
int r;
 
lpfn = (u64)place->lpfn << PAGE_SHIFT;
@@ -469,18 +487,14 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,
if (tbo->type != ttm_bo_type_kernel)
max_bytes -= AMDGPU_VM_RESERVED_VRAM;
 
-   if (place->fla

Re: [PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour

2024-04-17 Thread Paneer Selvam, Arunpravin

Hi Philip,

On 4/17/2024 8:58 PM, Philip Yang wrote:



On 2024-04-17 10:32, Paneer Selvam, Arunpravin wrote:

Hi Christian,

On 4/17/2024 6:57 PM, Paneer Selvam, Arunpravin wrote:

Hi Christian,

On 4/17/2024 12:19 PM, Christian König wrote:

Am 17.04.24 um 08:21 schrieb Arunpravin Paneer Selvam:

Now we have two flags for contiguous VRAM buffer allocation.
If the application requests AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.

When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.
- we will try to allocate the contiguous buffer. If the
   allocation fails, we fall back to allocating the individual pages.

When we set TTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
   and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously;
   if the allocation fails, we return -ENOSPC.

v2:
   - keep the mem_flags and bo->flags check as is(Christian)
   - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the
 amdgpu_bo_pin_restricted function placement range iteration
 loop(Christian)
   - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block
 (Christian)
   - Keep the kernel BO allocation as is (Christian)
   - If BO pin vram allocation failed, we need to return -ENOSPC as
 RDMA cannot work with scattered VRAM pages(Philip)

Signed-off-by: Arunpravin Paneer Selvam 


Suggested-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   |  8 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 
+++-

  2 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index 8bc79924d171..caaef7b1df49 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct 
amdgpu_bo *abo, u32 domain)

  else
  places[c].flags |= TTM_PL_FLAG_TOPDOWN;
  -    if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
+    if (abo->tbo.type == ttm_bo_type_kernel &&
+    flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
  places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+
  c++;
  }
  @@ -966,6 +968,10 @@ int amdgpu_bo_pin_restricted(struct 
amdgpu_bo *bo, u32 domain,

  if (!bo->placements[i].lpfn ||
  (lpfn && lpfn < bo->placements[i].lpfn))
  bo->placements[i].lpfn = lpfn;
+
+    if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+    bo->placements[i].mem_type == TTM_PL_VRAM)
+    bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
  }
    r = ttm_bo_validate(>tbo, >placement, );


Nice work, up till here that looks exactly right as far as I can see.

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c

index 8db880244324..4be8b091099a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -88,6 +88,29 @@ static inline u64 
amdgpu_vram_mgr_blocks_size(struct list_head *head)

  return size;
  }
  +static inline unsigned long
+amdgpu_vram_mgr_calculate_pages_per_block(struct 
ttm_buffer_object *tbo,

+  const struct ttm_place *place,
+  unsigned long bo_flags)
+{
+    unsigned long pages_per_block;
+
+    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+    pages_per_block = ~0ul;


If I understand it correctly this here enforces the allocation of a 
contiguous buffer in the way that it says we should have only one 
giant page for the whole BO.
yes, for contiguous we don't have the 2MB limit, hence we set it to 
the maximum to avoid restricting min_block_size to the 2MB limit.
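
For reference, with 4 KiB pages the non-contiguous default works out to
512 pages, i.e. 2 MiB per block (stand-alone check, PAGE_SHIFT assumed
to be 12):

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12;  /* assumed 4 KiB pages */
	unsigned long pages_per_block = 2UL << (20UL - page_shift);

	/* prints 512: 512 pages * 4 KiB = 2 MiB per block */
	printf("%lu\n", pages_per_block);
	return 0;
}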



+    } else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+    pages_per_block = HPAGE_PMD_NR;
+#else
+    /* default to 2MB */
+    pages_per_block = 2UL << (20UL - PAGE_SHIFT);
+#endif
+    pages_per_block = max_t(uint32_t, pages_per_block,
+    tbo->page_alignment);
+    }
+
+    return pages_per_block;
+}
+
  /**
   * DOC: mem_info_vram_total
   *
@@ -451,8 +474,10 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,

  struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
  struct amdgpu_device *adev = to_amdgpu_device(mgr);
  u64 vis_usage = 0, max_bytes, min_block_size;
+    struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
  struct amdgpu_vram_mgr_resource *vres;
  u64 size, remaining_size, lpfn, fpfn;
+    unsigned long bo_flags = bo->flags;
  struct drm_buddy *mm = >

Re: [PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour

2024-04-17 Thread Paneer Selvam, Arunpravin

Hi Christian,

On 4/17/2024 6:57 PM, Paneer Selvam, Arunpravin wrote:

Hi Christian,

On 4/17/2024 12:19 PM, Christian König wrote:

Am 17.04.24 um 08:21 schrieb Arunpravin Paneer Selvam:

Now we have two flags for contiguous VRAM buffer allocation.
If the application requests AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.

When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.
- we will try to allocate the contiguous buffer. If the
   allocation fails, we fall back to allocating the individual pages.

When we set TTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
   and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously;
   if the allocation fails, we return -ENOSPC.

v2:
   - keep the mem_flags and bo->flags check as is(Christian)
   - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the
 amdgpu_bo_pin_restricted function placement range iteration
 loop(Christian)
   - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block
 (Christian)
   - Keep the kernel BO allocation as is (Christian)
   - If BO pin vram allocation failed, we need to return -ENOSPC as
 RDMA cannot work with scattered VRAM pages(Philip)

Signed-off-by: Arunpravin Paneer Selvam 


Suggested-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   |  8 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 
+++-

  2 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index 8bc79924d171..caaef7b1df49 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct 
amdgpu_bo *abo, u32 domain)

  else
  places[c].flags |= TTM_PL_FLAG_TOPDOWN;
  -    if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
+    if (abo->tbo.type == ttm_bo_type_kernel &&
+    flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
  places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+
  c++;
  }
  @@ -966,6 +968,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
*bo, u32 domain,

  if (!bo->placements[i].lpfn ||
  (lpfn && lpfn < bo->placements[i].lpfn))
  bo->placements[i].lpfn = lpfn;
+
+    if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+    bo->placements[i].mem_type == TTM_PL_VRAM)
+    bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
  }
    r = ttm_bo_validate(>tbo, >placement, );


Nice work, up till here that looks exactly right as far as I can see.

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c

index 8db880244324..4be8b091099a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -88,6 +88,29 @@ static inline u64 
amdgpu_vram_mgr_blocks_size(struct list_head *head)

  return size;
  }
  +static inline unsigned long
+amdgpu_vram_mgr_calculate_pages_per_block(struct ttm_buffer_object 
*tbo,

+  const struct ttm_place *place,
+  unsigned long bo_flags)
+{
+    unsigned long pages_per_block;
+
+    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+    pages_per_block = ~0ul;


If I understand it correctly this here enforces the allocation of a 
contiguous buffer in the way that it says we should have only one 
giant page for the whole BO.
yes, for contiguous we don't have the 2MB limit, hence we set it to the 
maximum to avoid restricting min_block_size to the 2MB limit.



+    } else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+    pages_per_block = HPAGE_PMD_NR;
+#else
+    /* default to 2MB */
+    pages_per_block = 2UL << (20UL - PAGE_SHIFT);
+#endif
+    pages_per_block = max_t(uint32_t, pages_per_block,
+    tbo->page_alignment);
+    }
+
+    return pages_per_block;
+}
+
  /**
   * DOC: mem_info_vram_total
   *
@@ -451,8 +474,10 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,

  struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
  struct amdgpu_device *adev = to_amdgpu_device(mgr);
  u64 vis_usage = 0, max_bytes, min_block_size;
+    struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
  struct amdgpu_vram_mgr_resource *vres;
  u64 size, remaining_size, lpfn, fpfn;
+    unsigned long bo_flags = bo->flags;
  struct drm_buddy *mm = >mm;
  struct drm_buddy_block *block;
  unsigned long pages_per_block;
@@ -468,18 +493,9 @

Re: [PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour

2024-04-17 Thread Paneer Selvam, Arunpravin

Hi Christian,

On 4/17/2024 12:19 PM, Christian König wrote:

Am 17.04.24 um 08:21 schrieb Arunpravin Paneer Selvam:

Now we have two flags for contiguous VRAM buffer allocation.
If the application requests AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.

When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.
- we will try to allocate the contiguous buffer. If the
   allocation fails, we fall back to allocating the individual pages.

When we set TTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
   and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously;
   if the allocation fails, we return -ENOSPC.

v2:
   - keep the mem_flags and bo->flags check as is(Christian)
   - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the
 amdgpu_bo_pin_restricted function placement range iteration
 loop(Christian)
   - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block
 (Christian)
   - Keep the kernel BO allocation as is (Christian)
   - If BO pin vram allocation failed, we need to return -ENOSPC as
 RDMA cannot work with scattered VRAM pages(Philip)

Signed-off-by: Arunpravin Paneer Selvam 


Suggested-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   |  8 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++-
  2 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index 8bc79924d171..caaef7b1df49 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct 
amdgpu_bo *abo, u32 domain)

  else
  places[c].flags |= TTM_PL_FLAG_TOPDOWN;
  -    if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
+    if (abo->tbo.type == ttm_bo_type_kernel &&
+    flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
  places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+
  c++;
  }
  @@ -966,6 +968,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
*bo, u32 domain,

  if (!bo->placements[i].lpfn ||
  (lpfn && lpfn < bo->placements[i].lpfn))
  bo->placements[i].lpfn = lpfn;
+
+    if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+    bo->placements[i].mem_type == TTM_PL_VRAM)
+    bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
  }
    r = ttm_bo_validate(>tbo, >placement, );


Nice work, up till here that looks exactly right as far as I can see.

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c

index 8db880244324..4be8b091099a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -88,6 +88,29 @@ static inline u64 
amdgpu_vram_mgr_blocks_size(struct list_head *head)

  return size;
  }
  +static inline unsigned long
+amdgpu_vram_mgr_calculate_pages_per_block(struct ttm_buffer_object 
*tbo,

+  const struct ttm_place *place,
+  unsigned long bo_flags)
+{
+    unsigned long pages_per_block;
+
+    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+    pages_per_block = ~0ul;


If I understand it correctly this here enforces the allocation of a 
contiguous buffer in the way that it says we should have only one 
giant page for the whole BO.
yes, for contiguous we don't have the 2MB limit, hence we set it to the 
maximum to avoid restricting min_block_size to the 2MB limit.



+    } else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+    pages_per_block = HPAGE_PMD_NR;
+#else
+    /* default to 2MB */
+    pages_per_block = 2UL << (20UL - PAGE_SHIFT);
+#endif
+    pages_per_block = max_t(uint32_t, pages_per_block,
+    tbo->page_alignment);
+    }
+
+    return pages_per_block;
+}
+
  /**
   * DOC: mem_info_vram_total
   *
@@ -451,8 +474,10 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,

  struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
  struct amdgpu_device *adev = to_amdgpu_device(mgr);
  u64 vis_usage = 0, max_bytes, min_block_size;
+    struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
  struct amdgpu_vram_mgr_resource *vres;
  u64 size, remaining_size, lpfn, fpfn;
+    unsigned long bo_flags = bo->flags;
  struct drm_buddy *mm = >mm;
  struct drm_buddy_block *block;
  unsigned long pages_per_block;
@@ -468,18 +493,9 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,

  if (tbo->t

[PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour

2024-04-17 Thread Arunpravin Paneer Selvam
Now we have two flags for contiguous VRAM buffer allocation.
If the application requests AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.

When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.
- we will try to allocate the contiguous buffer. If the
  allocation fails, we fall back to allocating the individual pages.

When we set TTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
  and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously;
  if the allocation fails, we return -ENOSPC.

v2:
  - keep the mem_flags and bo->flags check as is(Christian)
  - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the
amdgpu_bo_pin_restricted function placement range iteration
loop(Christian)
  - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block
(Christian)
  - Keep the kernel BO allocation as is (Christian)
  - If BO pin vram allocation failed, we need to return -ENOSPC as
RDMA cannot work with scattered VRAM pages(Philip)

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   |  8 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++-
 2 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8bc79924d171..caaef7b1df49 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo 
*abo, u32 domain)
else
places[c].flags |= TTM_PL_FLAG_TOPDOWN;
 
-   if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
+   if (abo->tbo.type == ttm_bo_type_kernel &&
+   flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+
c++;
}
 
@@ -966,6 +968,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 
domain,
if (!bo->placements[i].lpfn ||
(lpfn && lpfn < bo->placements[i].lpfn))
bo->placements[i].lpfn = lpfn;
+
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+   bo->placements[i].mem_type == TTM_PL_VRAM)
+   bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
}
 
r = ttm_bo_validate(>tbo, >placement, );
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..4be8b091099a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -88,6 +88,29 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct 
list_head *head)
return size;
 }
 
+static inline unsigned long
+amdgpu_vram_mgr_calculate_pages_per_block(struct ttm_buffer_object *tbo,
+ const struct ttm_place *place,
+ unsigned long bo_flags)
+{
+   unsigned long pages_per_block;
+
+   if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+   pages_per_block = ~0ul;
+   } else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+   pages_per_block = HPAGE_PMD_NR;
+#else
+   /* default to 2MB */
+   pages_per_block = 2UL << (20UL - PAGE_SHIFT);
+#endif
+   pages_per_block = max_t(uint32_t, pages_per_block,
+   tbo->page_alignment);
+   }
+
+   return pages_per_block;
+}
+
 /**
  * DOC: mem_info_vram_total
  *
@@ -451,8 +474,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
struct amdgpu_device *adev = to_amdgpu_device(mgr);
u64 vis_usage = 0, max_bytes, min_block_size;
+   struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
struct amdgpu_vram_mgr_resource *vres;
u64 size, remaining_size, lpfn, fpfn;
+   unsigned long bo_flags = bo->flags;
struct drm_buddy *mm = >mm;
struct drm_buddy_block *block;
unsigned long pages_per_block;
@@ -468,18 +493,9 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
if (tbo->type != ttm_bo_type_kernel)
max_bytes -= AMDGPU_VM_RESERVED_VRAM;
 
-   if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
-   pages_per_block = ~0ul;
-   } else {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-   pages_per_block = HPAGE_PMD_NR;
-#els

Re: [PATCH] drm/amdgpu: clear seq64 memory on free

2024-04-16 Thread Paneer Selvam, Arunpravin

Hi Christian,

On 4/16/2024 6:24 PM, Christian König wrote:

Am 16.04.24 um 14:34 schrieb Paneer Selvam, Arunpravin:

Hi Christian,

On 4/16/2024 5:47 PM, Christian König wrote:

Am 16.04.24 um 14:16 schrieb Paneer Selvam, Arunpravin:

Hi Christian,

On 4/16/2024 2:35 PM, Christian König wrote:

Am 15.04.24 um 20:48 schrieb Arunpravin Paneer Selvam:

We should clear the memory on free. Otherwise,
there is a chance that we will access the previous
application data and this would lead to abnormal
behaviour in the current application.


Mhm, I would strongly expect that we initialize the seq number 
after allocation.


It could be that we also have situations where the correct start 
value is 0x or something like that instead.


Why does this matter?
When user queue A's u64 address (fence address) is allocated to 
the newly created user queue B, we see a problem.
User queue B calls the signal IOCTL, which creates a new fence 
having the wptr as the seq number; in the
amdgpu_userq_fence_create function we have a check comparing the 
rptr and wptr values (rptr >= wptr).
Since the rptr value is read from the u64 address which still holds 
user queue A's wptr data, this rptr >= wptr condition
is satisfied and we drop the reference before the actual 
command gets processed in the hardware.

If we clear this u64 value on free, we don't see this problem.


Yeah, but that doesn't belong in the seq64 handling.

Instead the code which allocates the seq64 during userqueue creation 
needs to clear it to 0.

sure, got it.


Yeah, but fixing that aside, we should probably initialize the seq64 
array to something like 0xdeadbeef or a similar pattern to find issues 
where we forget to initialize the allocated slots.
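
A minimal sketch of that idea (the pattern value and helper name are
assumptions, not existing code):

#include <stdint.h>

#define SEQ64_POISON 0xdeadbeefdeadbeefULL  /* assumed debug pattern */

/* Poison a freshly allocated slot so that a read of a slot nobody
 * initialized stands out immediately. */
static void seq64_poison_slot(uint64_t *slot)
{
	*slot = SEQ64_POISON;
}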

yes, I will prepare a patch and send it for review.

Thanks,
Arun.


Regards,
Christian.



Thanks,
Arun.


Regards,
Christian.



Thanks,
Arun.


Regards,
Christian.



Signed-off-by: Arunpravin Paneer Selvam 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c | 6 +-
  1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c

index 4b9afc4df031..9613992c9804 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
@@ -189,10 +189,14 @@ int amdgpu_seq64_alloc(struct amdgpu_device 
*adev, u64 *va, u64 **cpu_addr)

  void amdgpu_seq64_free(struct amdgpu_device *adev, u64 va)
  {
  unsigned long bit_pos;
+    u64 *cpu_addr;
    bit_pos = (va - amdgpu_seq64_get_va_base(adev)) / 
sizeof(u64);

-    if (bit_pos < adev->seq64.num_sem)
+    if (bit_pos < adev->seq64.num_sem) {
+    cpu_addr = bit_pos + adev->seq64.cpu_base_addr;
+    memset(cpu_addr, 0, sizeof(u64));
  __clear_bit(bit_pos, adev->seq64.used);
+    }
  }
    /**

Re: [PATCH] drm/amdgpu: clear seq64 memory on free

2024-04-16 Thread Paneer Selvam, Arunpravin

Hi Christian,

On 4/16/2024 5:47 PM, Christian König wrote:

Am 16.04.24 um 14:16 schrieb Paneer Selvam, Arunpravin:

Hi Christian,

On 4/16/2024 2:35 PM, Christian König wrote:

Am 15.04.24 um 20:48 schrieb Arunpravin Paneer Selvam:

We should clear the memory on free. Otherwise,
there is a chance that we will access the previous
application data and this would lead to abnormal
behaviour in the current application.


Mhm, I would strongly expect that we initialize the seq number after 
allocation.


It could be that we also have situations where the correct start 
value is 0x or something like that instead.


Why does this matter?
When user queue A's u64 address (fence address) is allocated to 
the newly created user queue B, we see a problem.
User queue B calls the signal IOCTL, which creates a new fence having 
the wptr as the seq number; in the
amdgpu_userq_fence_create function we have a check comparing the 
rptr and wptr values (rptr >= wptr).
Since the rptr value is read from the u64 address which still holds user 
queue A's wptr data, this rptr >= wptr condition
is satisfied and we drop the reference before the actual 
command gets processed in the hardware.

If we clear this u64 value on free, we don't see this problem.


Yeah, but that doesn't belong in the seq64 handling.

Instead the code which allocates the seq64 during userqueue creation 
needs to clear it to 0.

sure, got it.

Thanks,
Arun.


Regards,
Christian.



Thanks,
Arun.


Regards,
Christian.



Signed-off-by: Arunpravin Paneer Selvam 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c | 6 +-
  1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c

index 4b9afc4df031..9613992c9804 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
@@ -189,10 +189,14 @@ int amdgpu_seq64_alloc(struct amdgpu_device 
*adev, u64 *va, u64 **cpu_addr)

  void amdgpu_seq64_free(struct amdgpu_device *adev, u64 va)
  {
  unsigned long bit_pos;
+    u64 *cpu_addr;
    bit_pos = (va - amdgpu_seq64_get_va_base(adev)) / sizeof(u64);
-    if (bit_pos < adev->seq64.num_sem)
+    if (bit_pos < adev->seq64.num_sem) {
+    cpu_addr = bit_pos + adev->seq64.cpu_base_addr;
+    memset(cpu_addr, 0, sizeof(u64));
  __clear_bit(bit_pos, adev->seq64.used);
+    }
  }
    /**

Re: [PATCH] drm/amdgpu: clear seq64 memory on free

2024-04-16 Thread Paneer Selvam, Arunpravin

Hi Christian,

On 4/16/2024 2:35 PM, Christian König wrote:

Am 15.04.24 um 20:48 schrieb Arunpravin Paneer Selvam:

We should clear the memory on free. Otherwise,
there is a chance that we will access the previous
application data and this would lead to abnormal
behaviour in the current application.


Mhm, I would strongly expect that we initialize the seq number after 
allocation.


It could be that we also have situations where the correct start value 
is 0x or something like that instead.


Why does this matter?
When user queue A's u64 address (fence address) is allocated to the 
newly created user queue B, we see a problem.
User queue B calls the signal IOCTL, which creates a new fence having the 
wptr as the seq number; in the
amdgpu_userq_fence_create function we have a check comparing the 
rptr and wptr values (rptr >= wptr).
Since the rptr value is read from the u64 address which still holds user 
queue A's wptr data, this rptr >= wptr condition
is satisfied and we drop the reference before the actual 
command gets processed in the hardware.

If we clear this u64 value on free, we don't see this problem.

Thanks,
Arun.


Regards,
Christian.



Signed-off-by: Arunpravin Paneer Selvam 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c | 6 +-
  1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c

index 4b9afc4df031..9613992c9804 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
@@ -189,10 +189,14 @@ int amdgpu_seq64_alloc(struct amdgpu_device 
*adev, u64 *va, u64 **cpu_addr)

  void amdgpu_seq64_free(struct amdgpu_device *adev, u64 va)
  {
  unsigned long bit_pos;
+    u64 *cpu_addr;
    bit_pos = (va - amdgpu_seq64_get_va_base(adev)) / sizeof(u64);
-    if (bit_pos < adev->seq64.num_sem)
+    if (bit_pos < adev->seq64.num_sem) {
+    cpu_addr = bit_pos + adev->seq64.cpu_base_addr;
+    memset(cpu_addr, 0, sizeof(u64));
  __clear_bit(bit_pos, adev->seq64.used);
+    }
  }
    /**


Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour

2024-04-16 Thread Paneer Selvam, Arunpravin

Hi Christian,

On 4/16/2024 1:16 PM, Christian König wrote:

Am 16.04.24 um 00:02 schrieb Philip Yang:

On 2024-04-14 10:57, Arunpravin Paneer Selvam wrote:

Now we have two flags for contiguous VRAM buffer allocation.
If the application requests AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.
This change will simplify the KFD best effort contiguous VRAM 
allocation, because KFD doesn't need to set a new GEM_ flag.

When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.


AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS is used in a couple of places. For page 
table BOs it is fine, as the BO size is the 4K page size. For the 64KB 
reserved BOs and F/W size related BOs, do all allocations happen at 
driver initialization, before the VRAM is fragmented?




Oh, that's a really good point, we need to keep the behavior as is for 
kernel allocations. Arun, can you take care of that?

Sure, I will take care of kernel allocations.
Thanks,
Arun.


Thanks,
Christian.


- we will try to allocate the contiguous buffer. If the
   allocation fails, we fall back to allocating the individual pages.

When we set TTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
   and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously;
   if the allocation fails, we return -ENOSPC.

Signed-off-by: Arunpravin Paneer 
Selvam

Suggested-by: Christian König
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   | 14 +++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 
+++-

  2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index 8bc79924d171..41926d631563 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct 
amdgpu_bo *abo, u32 domain)

  else
  places[c].flags |= TTM_PL_FLAG_TOPDOWN;
  -    if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
-    places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
  c++;
  }
  @@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
*bo, u32 domain,

  {
  struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
  struct ttm_operation_ctx ctx = { false, false };
+    struct ttm_place *places = bo->placements;
+    u32 c = 0;
  int r, i;
    if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
@@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
*bo, u32 domain,

    if (bo->tbo.pin_count) {
  uint32_t mem_type = bo->tbo.resource->mem_type;
-    uint32_t mem_flags = bo->tbo.resource->placement;
    if (!(domain & amdgpu_mem_type_to_domain(mem_type)))
  return -EINVAL;
  -    if ((mem_type == TTM_PL_VRAM) &&
-    (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
-    !(mem_flags & TTM_PL_FLAG_CONTIGUOUS))
-    return -EINVAL;
-
This looks like a bug before, but with this patch, the check makes 
sense and is needed.

  ttm_bo_pin(>tbo);
    if (max_offset != 0) {
@@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
*bo, u32 domain,

  bo->placements[i].lpfn = lpfn;
  }
  +    if (domain & AMDGPU_GEM_DOMAIN_VRAM &&
+    !WARN_ON(places[c].mem_type != TTM_PL_VRAM))
+    places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+


If the pinned BO is not allocated with AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, 
pinning should return scattered pages because RDMA supports 
scattered dmabuf. Christian also pointed this out.

if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
    bo->placements[i].mem_type == TTM_PL_VRAM)
        bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;

  r = ttm_bo_validate(>tbo, >placement, );
  if (unlikely(r)) {
  dev_err(adev->dev, "%p pin failed\n", bo);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c

index 8db880244324..ddbf302878f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -88,6 +88,30 @@ static inline u64 
amdgpu_vram_mgr_blocks_size(struct list_head *head)

  return size;
  }
  +static inline unsigned long
+amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo,
+ const struct ttm_place *place,
+ unsigned long bo_flags)
+{
+    unsigned long pages_per_block;
+
+    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
+    place->flags & TTM_PL_FLAG_CONTIGUOUS) {
+    pages_per_blo

Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour

2024-04-16 Thread Paneer Selvam, Arunpravin




On 4/16/2024 3:32 AM, Philip Yang wrote:



On 2024-04-14 10:57, Arunpravin Paneer Selvam wrote:

Now we have two flags for contiguous VRAM buffer allocation.
If the application requests AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.
This change will simplify the KFD best effort contiguous VRAM 
allocation, because KFD doesn't need to set a new GEM_ flag.

When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.


AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS is used in a couple of places. For page 
table BOs it is fine, as the BO size is the 4K page size. For the 64KB 
reserved BOs and F/W size related BOs, do all allocations happen at 
driver initialization, before the VRAM is fragmented?



- we will try to allocate the contiguous buffer. If the
   allocation fails, we fall back to allocating the individual pages.

When we set TTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
   and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously;
   if the allocation fails, we return -ENOSPC.

Signed-off-by: Arunpravin Paneer Selvam
Suggested-by: Christian König
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   | 14 +++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++-
  2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8bc79924d171..41926d631563 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, 
u32 domain)
else
places[c].flags |= TTM_PL_FLAG_TOPDOWN;
  
-		if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)

-   places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
c++;
}
  
@@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,

  {
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
struct ttm_operation_ctx ctx = { false, false };
+   struct ttm_place *places = bo->placements;
+   u32 c = 0;
int r, i;
  
  	if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))

@@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 
domain,
  
  	if (bo->tbo.pin_count) {

uint32_t mem_type = bo->tbo.resource->mem_type;
-   uint32_t mem_flags = bo->tbo.resource->placement;
  
  		if (!(domain & amdgpu_mem_type_to_domain(mem_type)))

return -EINVAL;
  
-		if ((mem_type == TTM_PL_VRAM) &&

-   (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
-   !(mem_flags & TTM_PL_FLAG_CONTIGUOUS))
-   return -EINVAL;
-
This looks like a bug before, but with this patch, the check makes 
sense and is needed.

ttm_bo_pin(>tbo);
  
  		if (max_offset != 0) {

@@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 
domain,
bo->placements[i].lpfn = lpfn;
}
  
+	if (domain & AMDGPU_GEM_DOMAIN_VRAM &&

+   !WARN_ON(places[c].mem_type != TTM_PL_VRAM))
+   places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+


If the pinned BO is not allocated with AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, 
pinning should return scattered pages, because RDMA supports 
scattered dmabuf. Christian also pointed this out.


if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
    bo->placements[i].mem_type == TTM_PL_VRAM)
        bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;

	r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
if (unlikely(r)) {
dev_err(adev->dev, "%p pin failed\n", bo);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..ddbf302878f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -88,6 +88,30 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct 
list_head *head)
return size;
  }
  
+static inline unsigned long

+amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo,
+const struct ttm_place *place,
+unsigned long bo_flags)
+{
+   unsigned long pages_per_block;
+
+   if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
+   place->flags & TTM_PL_FLAG_CONTIGUOUS) {
+   pages_per_block = ~0ul;
+   } else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+   pages_per_block = HPAGE_PMD_NR;

[PATCH] drm/amdgpu: clear seq64 memory on free

2024-04-15 Thread Arunpravin Paneer Selvam
We should clear the memory on free. Otherwise,
there is a chance that we will access the previous
application's data, and this would lead to abnormal
behaviour in the current application.

Signed-off-by: Arunpravin Paneer Selvam 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
index 4b9afc4df031..9613992c9804 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
@@ -189,10 +189,14 @@ int amdgpu_seq64_alloc(struct amdgpu_device *adev, u64 
*va, u64 **cpu_addr)
 void amdgpu_seq64_free(struct amdgpu_device *adev, u64 va)
 {
unsigned long bit_pos;
+   u64 *cpu_addr;
 
bit_pos = (va - amdgpu_seq64_get_va_base(adev)) / sizeof(u64);
-   if (bit_pos < adev->seq64.num_sem)
+   if (bit_pos < adev->seq64.num_sem) {
+   cpu_addr = bit_pos + adev->seq64.cpu_base_addr;
+   memset(cpu_addr, 0, sizeof(u64));
__clear_bit(bit_pos, adev->seq64.used);
+   }
 }
 
 /**
-- 
2.25.1
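
For illustration, a minimal sketch of the failure mode this fixes (the
helper names match the patch above; the WARN_ON is only for the example,
not part of the change):

	u64 va;
	u64 *cpu_addr;

	if (!amdgpu_seq64_alloc(adev, &va, &cpu_addr)) {
		/* Without the memset in amdgpu_seq64_free(), a recycled
		 * slot could still hold the previous application's fence
		 * value here instead of starting from a known zero state.
		 */
		WARN_ON(*cpu_addr != 0);
		amdgpu_seq64_free(adev, va);
	}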



Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour

2024-04-15 Thread Paneer Selvam, Arunpravin

Hi Christian,

On 4/15/2024 7:33 PM, Christian König wrote:

Am 14.04.24 um 16:57 schrieb Arunpravin Paneer Selvam:

Now we have two flags for contiguous VRAM buffer allocation.
If the application requests AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.

When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.
- we will try to allocate the contiguous buffer. If the
   allocation fails, we fall back to allocating the individual pages.

When we set TTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
   and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously.
   If the allocation fails, we return -ENOSPC.

Signed-off-by: Arunpravin Paneer Selvam 


Suggested-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   | 14 +++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++-
  2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index 8bc79924d171..41926d631563 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct 
amdgpu_bo *abo, u32 domain)

  else
  places[c].flags |= TTM_PL_FLAG_TOPDOWN;
  -    if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
-    places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
  c++;
  }
  @@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
*bo, u32 domain,

  {
  struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
  struct ttm_operation_ctx ctx = { false, false };
+    struct ttm_place *places = bo->placements;
+    u32 c = 0;
  int r, i;
    if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
@@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
*bo, u32 domain,

    if (bo->tbo.pin_count) {
  uint32_t mem_type = bo->tbo.resource->mem_type;
-    uint32_t mem_flags = bo->tbo.resource->placement;
    if (!(domain & amdgpu_mem_type_to_domain(mem_type)))
  return -EINVAL;
  -    if ((mem_type == TTM_PL_VRAM) &&
-    (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
-    !(mem_flags & TTM_PL_FLAG_CONTIGUOUS))
-    return -EINVAL;
-


I think that check here needs to stay.


  ttm_bo_pin(&bo->tbo);
    if (max_offset != 0) {
@@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
*bo, u32 domain,

  bo->placements[i].lpfn = lpfn;
  }
  +    if (domain & AMDGPU_GEM_DOMAIN_VRAM &&
+    !WARN_ON(places[c].mem_type != TTM_PL_VRAM))
+    places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+


This needs to go into the loop directly above, as something like this:


if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
    bo->placements[i].mem_type == TTM_PL_VRAM)
        bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;

This essentially replaces the removed code in 
amdgpu_bo_placement_from_domain() and only applies it during pinning.
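
For reference, a minimal sketch of how that suggestion could look inside
the placement loop of amdgpu_bo_pin_restricted() (illustrative only, reusing
the function's existing loop index i; not the final patch):

	for (i = 0; i < bo->placement.num_placement; i++) {
		/* force contiguous pages only while pinning VRAM placements */
		if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
		    bo->placements[i].mem_type == TTM_PL_VRAM)
			bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
	}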



  r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
  if (unlikely(r)) {
  dev_err(adev->dev, "%p pin failed\n", bo);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c

index 8db880244324..ddbf302878f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -88,6 +88,30 @@ static inline u64 
amdgpu_vram_mgr_blocks_size(struct list_head *head)

  return size;
  }
  +static inline unsigned long
+amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo,
+ const struct ttm_place *place,
+ unsigned long bo_flags)


Well I think we need a better name here. "find" usually means we look 
up something in a data structure. Maybe 
amdgpu_vram_mgr_calculate_pages_per_block.



+{
+    unsigned long pages_per_block;
+
+    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
+    place->flags & TTM_PL_FLAG_CONTIGUOUS) {
+    pages_per_block = ~0ul;
+    } else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+    pages_per_block = HPAGE_PMD_NR;
+#else
+    /* default to 2MB */
+    pages_per_block = 2UL << (20UL - PAGE_SHIFT);
+#endif
+    pages_per_block = max_t(uint32_t, pages_per_block,
+    tbo->page_alignment);
+    }
+
+    return pages_per_block;
+}
+
  /**
   * DOC: mem_info_vram_total
   *
@@ -451,8 +475,10 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,

Re: [PATCH v11 1/3] drm/buddy: Implement tracking clear page feature

2024-04-15 Thread Paneer Selvam, Arunpravin

Hi Christian,
Could you please push these patches into the drm branch?

Thanks,
Arun.

On 4/15/2024 2:53 AM, Arunpravin Paneer Selvam wrote:

- Add tracking clear page feature.

- Driver should enable the DRM_BUDDY_CLEARED flag if it
   successfully clears the blocks in the free path. On the other hand,
   DRM buddy marks each block as cleared.

- Track the available cleared pages size

- If the driver requests cleared memory we prefer cleared memory,
   but fall back to uncleared if we can't find the cleared blocks.
   When the driver requests uncleared memory we try to use uncleared, but
   fall back to cleared memory if necessary.

- When a block gets freed we clear it and mark the freed block as cleared,
   when there are buddies which are cleared as well we can merge them.
   Otherwise, we prefer to keep the blocks as separated.

- Add a function to support defragmentation.

v1:
   - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as
 cleared. Else, reset the clear flag for each block in the list(Christian)
   - For merging the 2 cleared blocks compare as below,
 drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian)
   - Defragment the memory beginning from min_order
 till the required memory space is available.

v2: (Matthew)
   - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks
 operation within drm buddy.
   - Write a macro block_incompatible() to allocate the required blocks.
   - Update the xe driver for the drm_buddy_free_list change in arguments.
   - add a warning if the two blocks are incompatible on
 defragmentation
   - call full defragmentation in the fini() function
   - place a condition to test if min_order is equal to 0
   - replace the list with safe_reverse() variant as we might
 remove the block from the list.

v3:
   - fix Gitlab user reported lockup issue.
   - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew)
   - modify to pass the root order instead max_order in fini()
 function(Matthew)
   - change bool 1 to true(Matthew)
   - add check if min_block_size is power of 2(Matthew)
   - modify the min_block_size datatype to u64(Matthew)

v4:
   - rename the function drm_buddy_defrag with __force_merge.
   - Include __force_merge directly in drm buddy file and remove
 the defrag use in amdgpu driver.
   - Remove list_empty() check(Matthew)
   - Remove unnecessary space, headers and placement of new variables(Matthew)
   - Add a unit test case(Matthew)

v5:
   - remove force merge support to actual range allocation and not to bail
 out when contains && split(Matthew)
   - add range support to force merge function.

v6:
   - modify the alloc_range() function clear page non merged blocks
 allocation(Matthew)
   - correct the list_insert function name(Matthew).

Signed-off-by: Arunpravin Paneer Selvam 
Signed-off-by: Matthew Auld 
Suggested-by: Christian König 
Suggested-by: Matthew Auld 
Reviewed-by: Matthew Auld 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |   6 +-
  drivers/gpu/drm/drm_buddy.c   | 427 ++
  drivers/gpu/drm/i915/i915_ttm_buddy_manager.c |   6 +-
  drivers/gpu/drm/tests/drm_buddy_test.c|  28 +-
  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c  |   4 +-
  include/drm/drm_buddy.h   |  16 +-
  6 files changed, 363 insertions(+), 124 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..c0c851409241 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
return 0;
  
  error_free_blocks:

-   drm_buddy_free_list(mm, &vres->blocks);
+   drm_buddy_free_list(mm, &vres->blocks, 0);
	mutex_unlock(&mgr->lock);
  error_fini:
	ttm_resource_fini(man, &vres->base);
@@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager 
*man,
  
  	amdgpu_vram_mgr_do_reserve(man);
  
-	drm_buddy_free_list(mm, &vres->blocks);
+	drm_buddy_free_list(mm, &vres->blocks, 0);
	mutex_unlock(&mgr->lock);

	atomic64_sub(vis_usage, &mgr->vis_usage);

@@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
kfree(rsv);
  
	list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) {

-   drm_buddy_free_list(&mgr->mm, &rsv->allocated);
+   drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0);
kfree(rsv);
}
if (!adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 5ebdd6f8f36e..284ebae71cc4 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm,
	__list_add(&block->link, node->link.prev, &node->link);
  }
  
+static void clear_reset(struct drm_buddy_block *block)

+{
+   block->header &= ~DRM_BUDDY_HEADER_CLEAR;

[PATCH v11 1/3] drm/buddy: Implement tracking clear page feature

2024-04-14 Thread Arunpravin Paneer Selvam
- Add tracking clear page feature.

- Driver should enable the DRM_BUDDY_CLEARED flag if it
  successfully clears the blocks in the free path. On the other hand,
  DRM buddy marks each block as cleared.

- Track the available cleared pages size

- If the driver requests cleared memory we prefer cleared memory,
  but fall back to uncleared if we can't find the cleared blocks.
  When the driver requests uncleared memory we try to use uncleared, but
  fall back to cleared memory if necessary.

- When a block gets freed we clear it and mark the freed block as cleared,
  when there are buddies which are cleared as well we can merge them.
  Otherwise, we prefer to keep the blocks as separated.

- Add a function to support defragmentation.
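
As an illustration of the driver-facing flow described above, a minimal
sketch using only the interfaces introduced by this series (start, end,
size and min_block_size are placeholders; error handling elided):

	LIST_HEAD(blocks);
	int err;

	/* prefer cleared blocks; the allocator falls back to dirty ones */
	err = drm_buddy_alloc_blocks(mm, start, end, size, min_block_size,
				     &blocks, DRM_BUDDY_CLEAR_ALLOCATION);

	/* if the driver zeroed the memory before freeing, report that so
	 * the blocks are tracked (and merged) as cleared */
	drm_buddy_free_list(mm, &blocks, DRM_BUDDY_CLEARED);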

v1:
  - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as
cleared. Else, reset the clear flag for each block in the list(Christian)
  - For merging the 2 cleared blocks compare as below,
drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian)
  - Defragment the memory beginning from min_order
till the required memory space is available.

v2: (Matthew)
  - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks
operation within drm buddy.
  - Write a macro block_incompatible() to allocate the required blocks.
  - Update the xe driver for the drm_buddy_free_list change in arguments.
  - add a warning if the two blocks are incompatible on
defragmentation
  - call full defragmentation in the fini() function
  - place a condition to test if min_order is equal to 0
  - replace the list with safe_reverse() variant as we might
remove the block from the list.

v3:
  - fix Gitlab user reported lockup issue.
  - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew)
  - modify to pass the root order instead max_order in fini()
function(Matthew)
  - change bool 1 to true(Matthew)
  - add check if min_block_size is power of 2(Matthew)
  - modify the min_block_size datatype to u64(Matthew)

v4:
  - rename the function drm_buddy_defrag with __force_merge.
  - Include __force_merge directly in drm buddy file and remove
the defrag use in amdgpu driver.
  - Remove list_empty() check(Matthew)
  - Remove unnecessary space, headers and placement of new variables(Matthew)
  - Add a unit test case(Matthew)

v5:
  - remove force merge support to actual range allocation and not to bail
out when contains && split(Matthew)
  - add range support to force merge function.

v6:
  - modify the alloc_range() function clear page non merged blocks
allocation(Matthew)
  - correct the list_insert function name(Matthew).

Signed-off-by: Arunpravin Paneer Selvam 
Signed-off-by: Matthew Auld 
Suggested-by: Christian König 
Suggested-by: Matthew Auld 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |   6 +-
 drivers/gpu/drm/drm_buddy.c   | 427 ++
 drivers/gpu/drm/i915/i915_ttm_buddy_manager.c |   6 +-
 drivers/gpu/drm/tests/drm_buddy_test.c|  28 +-
 drivers/gpu/drm/xe/xe_ttm_vram_mgr.c  |   4 +-
 include/drm/drm_buddy.h   |  16 +-
 6 files changed, 363 insertions(+), 124 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..c0c851409241 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
return 0;
 
 error_free_blocks:
-   drm_buddy_free_list(mm, &vres->blocks);
+   drm_buddy_free_list(mm, &vres->blocks, 0);
	mutex_unlock(&mgr->lock);
 error_fini:
	ttm_resource_fini(man, &vres->base);
@@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager 
*man,
 
amdgpu_vram_mgr_do_reserve(man);
 
-   drm_buddy_free_list(mm, &vres->blocks);
+   drm_buddy_free_list(mm, &vres->blocks, 0);
	mutex_unlock(&mgr->lock);
 
	atomic64_sub(vis_usage, &mgr->vis_usage);
@@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
kfree(rsv);
 
	list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) {
-   drm_buddy_free_list(&mgr->mm, &rsv->allocated);
+   drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0);
kfree(rsv);
}
if (!adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 5ebdd6f8f36e..284ebae71cc4 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm,
	__list_add(&block->link, node->link.prev, &node->link);
 }
 
+static void clear_reset(struct drm_buddy_block *block)
+{
+   block->header &= ~DRM_BUDDY_HEADER_CLEAR;
+}
+
+static void mark_cleared(struct drm_buddy_block *block)
+{
+   block->header |= DRM_BUDDY_HEADER_CLEAR;
+}
+
 stati
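
The merge rule described in the log above (only buddies with matching
clear state are merged back) boils down to roughly this predicate; a
sketch with a hypothetical helper name, not the exact patch code:

	static bool buddies_mergeable(struct drm_buddy_block *block,
				      struct drm_buddy_block *buddy)
	{
		/* keep a cleared block and a dirty buddy separated rather
		 * than losing track of which half is already zeroed */
		return drm_buddy_block_is_clear(block) ==
		       drm_buddy_block_is_clear(buddy);
	}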

[PATCH v11 3/3] drm/tests: Add a test case for drm buddy clear allocation

2024-04-14 Thread Arunpravin Paneer Selvam
Add a new test case for the drm buddy clear and dirty
allocation.

v2:(Matthew)
  - make size as u32
  - rename PAGE_SIZE with SZ_4K
  - dont fragment the address space for all the order allocation
iterations. we can do it once and just increment and allocate
the size.
  - create new mm with non power-of-two size to ensure the multi-root
force_merge during fini.

v3:
  - add randomness in size calculation(Matthew)

Signed-off-by: Arunpravin Paneer Selvam 
Reviewed-by: Matthew Auld 
Suggested-by: Matthew Auld 
---
 drivers/gpu/drm/tests/drm_buddy_test.c | 143 +
 1 file changed, 143 insertions(+)

diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c 
b/drivers/gpu/drm/tests/drm_buddy_test.c
index 4621a860cb05..e3b50e240d36 100644
--- a/drivers/gpu/drm/tests/drm_buddy_test.c
+++ b/drivers/gpu/drm/tests/drm_buddy_test.c
@@ -224,6 +224,148 @@ static void drm_test_buddy_alloc_range_bias(struct kunit 
*test)
drm_buddy_fini();
 }
 
+static void drm_test_buddy_alloc_clear(struct kunit *test)
+{
+   unsigned long n_pages, total, i = 0;
+   DRM_RND_STATE(prng, random_seed);
+   const unsigned long ps = SZ_4K;
+   struct drm_buddy_block *block;
+   const int max_order = 12;
+   LIST_HEAD(allocated);
+   struct drm_buddy mm;
+   unsigned int order;
+   u32 mm_size, size;
+   LIST_HEAD(dirty);
+   LIST_HEAD(clean);
+
+   mm_size = SZ_4K << max_order;
+   KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps));
+
+   KUNIT_EXPECT_EQ(test, mm.max_order, max_order);
+
+   /*
+    * Idea is to allocate and free some random portion of the address space,
+    * returning those pages as non-dirty and randomly alternate between
+    * requesting dirty and non-dirty pages (not going over the limit
+    * we freed as non-dirty), putting that into two separate lists.
+    * Loop over both lists at the end checking that the dirty list
+    * is indeed all dirty pages and vice versa. Free it all again,
+    * keeping the dirty/clear status.
+    */
+   KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size,
+                                                       5 * ps, ps, &allocated,
+                                                       DRM_BUDDY_TOPDOWN_ALLOCATION),
+                          "buddy_alloc hit an error size=%lu\n", 5 * ps);
+   drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED);
+
+   n_pages = 10;
+   do {
+   unsigned long flags;
+   struct list_head *list;
+   int slot = i % 2;
+
+   if (slot == 0) {
+   list = &dirty;
+   flags = 0;
+   } else {
+   list = &clean;
+   flags = DRM_BUDDY_CLEAR_ALLOCATION;
+   }
+
+   KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size,
+                                                       ps, ps, list,
+                                                       flags),
+                          "buddy_alloc hit an error size=%lu\n", ps);
+   } while (++i < n_pages);
+
+   list_for_each_entry(block, &clean, link)
+   KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), true);
+
+   list_for_each_entry(block, &dirty, link)
+   KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false);
+
+   drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED);
+
+   /*
+* Trying to go over the clear limit for some allocation.
+* The allocation should never fail with reasonable page-size.
+*/
+   KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size,
+                                                       10 * ps, ps, &clean,
+                                                       DRM_BUDDY_CLEAR_ALLOCATION),
+                          "buddy_alloc hit an error size=%lu\n", 10 * ps);
+
+   drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED);
+   drm_buddy_free_list(&mm, &dirty, 0);
+   drm_buddy_fini(&mm);
+
+   KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps));
+
+   /*
+    * Create a new mm. Intentionally fragment the address space by creating
+    * two alternating lists. Free both lists, one as dirty the other as clean.
+    * Try to allocate double the previous size with matching min_page_size. The
+    * allocation should never fail as it calls the force_merge. Also check that
+    * the page is always dirty after force_merge. Free the page as dirty, then
+    * repeat the whole thing, increment the order until we hit the max_order.
+    */
+
+   i = 0;
+   n_pages = mm_size / ps;
+   do {
+   struct list_head *list;
+   int slot = i % 2;
+
+   if (slot == 0)
+   list = &dirty;
+ 
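
Based on the test's own comment above, the remainder of this loop runs
roughly as follows (a sketch, not the verbatim test code):

	else
		list = &clean;

	KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size,
							    ps, ps, list, 0),
			       "buddy_alloc hit an error size=%lu\n", ps);
	} while (++i < n_pages);

	/* free one list as clean, the other as dirty */
	drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED);
	drm_buddy_free_list(&mm, &dirty, 0);

	/* per order: expect the allocation to succeed via force_merge and
	 * the merged block to come back dirty, then free it as dirty */
	order = 1;
	do {
		size = SZ_4K << order;
		KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size,
								    size, size, &allocated,
								    DRM_BUDDY_CLEAR_ALLOCATION),
				       "buddy_alloc hit an error order=%d\n", order);
		block = list_first_entry(&allocated, struct drm_buddy_block, link);
		KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false);
		drm_buddy_free_list(&mm, &allocated, 0);
	} while (++order <= max_order);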

[PATCH v11 2/3] drm/amdgpu: Enable clear page functionality

2024-04-14 Thread Arunpravin Paneer Selvam
Add clear page support in vram memory region.

v1(Christian):
  - Dont handle clear page as TTM flag since when moving the BO back
in from GTT again we don't need that.
  - Make a specialized version of amdgpu_fill_buffer() which only
clears the VRAM areas which are not already cleared
  - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in
amdgpu_object.c

v2:
  - Modify the function name amdgpu_ttm_* (Alex)
  - Drop the delayed parameter (Christian)
  - handle amdgpu_res_cleared() just above the size
calculation (Christian)
  - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers
in the free path to properly wait for fences etc.. (Christian)

v3(Christian):
  - Remove buffer clear code in VRAM manager instead change the
AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set
the DRM_BUDDY_CLEARED flag.
  - Remove ! from amdgpu_res_cleared() check.

v4(Christian):
  - vres flag setting move to vram manager file
  - use dma_fence_get_stub in amdgpu_ttm_clear_buffer function
  - make fence a mandatory parameter and drop the if and the get/put dance

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Christian König 
Acked-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c|  9 +--
 .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h| 25 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   | 70 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h   |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |  6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h  | 10 +++
 6 files changed, 116 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8bc79924d171..e011a326fe86 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -39,6 +39,7 @@
 #include "amdgpu.h"
 #include "amdgpu_trace.h"
 #include "amdgpu_amdkfd.h"
+#include "amdgpu_vram_mgr.h"
 
 /**
  * DOC: amdgpu_object
@@ -601,8 +602,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
if (!amdgpu_bo_support_uswc(bo->flags))
bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC;
 
-   if (adev->ras_enabled)
-   bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
+   bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
 
	bo->tbo.bdev = &adev->mman.bdev;
if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA |
@@ -634,7 +634,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
bo->tbo.resource->mem_type == TTM_PL_VRAM) {
struct dma_fence *fence;
 
-   r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, &fence, true);
+   r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, &fence);
if (unlikely(r))
goto fail_unreserve;
 
@@ -1365,8 +1365,9 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object 
*bo)
if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
return;
 
-   r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, &fence, true);
+   r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true);
if (!WARN_ON(r)) {
+   amdgpu_vram_mgr_set_cleared(bo->resource);
amdgpu_bo_fence(abo, fence, false);
dma_fence_put(fence);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
index 381101d2bf05..50fcd86e1033 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
@@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct 
amdgpu_res_cursor *cur, uint64_t size)
}
 }
 
+/**
+ * amdgpu_res_cleared - check if blocks are cleared
+ *
+ * @cur: the cursor to extract the block
+ *
+ * Check if the @cur block is cleared
+ */
+static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur)
+{
+   struct drm_buddy_block *block;
+
+   switch (cur->mem_type) {
+   case TTM_PL_VRAM:
+   block = cur->node;
+
+   if (!amdgpu_vram_mgr_is_cleared(block))
+   return false;
+   break;
+   default:
+   return false;
+   }
+
+   return true;
+}
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index fc418e670fda..f0b42b567357 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -378,11 +378,12 @@ static int amdgpu_move_blit(struct ttm_buffer_object *bo,
(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) {
struct dma_fence *wipe_fence = NULL;
 
-   r = amdgpu_fill_buffer(abo, AMDGPU_POISON, NULL, &wipe_fence,
-                          false);
+   r = 
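
For context, the consumer side of amdgpu_res_cleared() would look roughly
like this cursor walk (a sketch assuming the existing amdgpu_res_first()/
amdgpu_res_next() helpers; not the literal patch code):

	struct amdgpu_res_cursor cursor;

	amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor);
	while (cursor.remaining) {
		/* only emit a fill job for ranges not already cleared */
		if (!amdgpu_res_cleared(&cursor))
			/* ... queue a clear of cursor.size bytes at
			 * cursor.start ... */;
		amdgpu_res_next(&cursor, cursor.size);
	}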

Re: [PATCH v10 1/3] drm/buddy: Implement tracking clear page feature

2024-04-14 Thread Paneer Selvam, Arunpravin

Hi Matthew,

On 4/10/2024 6:22 PM, Matthew Auld wrote:

On 08/04/2024 16:16, Arunpravin Paneer Selvam wrote:

- Add tracking clear page feature.

- Driver should enable the DRM_BUDDY_CLEARED flag if it
   successfully clears the blocks in the free path. On the other hand,
   DRM buddy marks each block as cleared.

- Track the available cleared pages size

- If the driver requests cleared memory we prefer cleared memory,
   but fall back to uncleared if we can't find the cleared blocks.
   When the driver requests uncleared memory we try to use uncleared, but
   fall back to cleared memory if necessary.

- When a block gets freed we clear it and mark the freed block as 
cleared,

   when there are buddies which are cleared as well we can merge them.
   Otherwise, we prefer to keep the blocks as separated.

- Add a function to support defragmentation.

v1:
   - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as
 cleared. Else, reset the clear flag for each block in the 
list(Christian)

   - For merging the 2 cleared blocks compare as below,
 drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian)
   - Defragment the memory beginning from min_order
 till the required memory space is available.

v2: (Matthew)
   - Add a wrapper drm_buddy_free_list_internal for the freeing of 
blocks

 operation within drm buddy.
   - Write a macro block_incompatible() to allocate the required blocks.
   - Update the xe driver for the drm_buddy_free_list change in 
arguments.

   - add a warning if the two blocks are incompatible on
 defragmentation
   - call full defragmentation in the fini() function
   - place a condition to test if min_order is equal to 0
   - replace the list with safe_reverse() variant as we might
 remove the block from the list.

v3:
   - fix Gitlab user reported lockup issue.
   - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew)
   - modify to pass the root order instead max_order in fini()
 function(Matthew)
   - change bool 1 to true(Matthew)
   - add check if min_block_size is power of 2(Matthew)
   - modify the min_block_size datatype to u64(Matthew)

v4:
   - rename the function drm_buddy_defrag with __force_merge.
   - Include __force_merge directly in drm buddy file and remove
 the defrag use in amdgpu driver.
   - Remove list_empty() check(Matthew)
   - Remove unnecessary space, headers and placement of new 
variables(Matthew)

   - Add a unit test case(Matthew)

v5:
   - remove force merge support to actual range allocation and not to 
bail

 out when contains && split(Matthew)
   - add range support to force merge function.

Signed-off-by: Arunpravin Paneer Selvam 


Signed-off-by: Matthew Auld 
Suggested-by: Christian König 
Suggested-by: Matthew Auld 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |   6 +-
  drivers/gpu/drm/drm_buddy.c   | 430 ++
  drivers/gpu/drm/i915/i915_ttm_buddy_manager.c |   6 +-
  drivers/gpu/drm/tests/drm_buddy_test.c    |  28 +-
  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c  |   4 +-
  include/drm/drm_buddy.h   |  16 +-
  6 files changed, 368 insertions(+), 122 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c

index 8db880244324..c0c851409241 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,

  return 0;
    error_free_blocks:
-    drm_buddy_free_list(mm, &vres->blocks);
+    drm_buddy_free_list(mm, &vres->blocks, 0);
  mutex_unlock(&mgr->lock);
  error_fini:
  ttm_resource_fini(man, &vres->base);
@@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct 
ttm_resource_manager *man,

    amdgpu_vram_mgr_do_reserve(man);
  -    drm_buddy_free_list(mm, &vres->blocks);
+    drm_buddy_free_list(mm, &vres->blocks, 0);
  mutex_unlock(&mgr->lock);
    atomic64_sub(vis_usage, &mgr->vis_usage);
@@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device 
*adev)

  kfree(rsv);
    list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) {

-    drm_buddy_free_list(&mgr->mm, &rsv->allocated);
+    drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0);
  kfree(rsv);
  }
  if (!adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 5ebdd6f8f36e..83dbe252f727 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -38,8 +38,8 @@ static void drm_block_free(struct drm_buddy *mm,
  kmem_cache_free(slab_blocks, block);
  }
  -static void list_insert_sorted(struct drm_buddy *mm,
-   struct drm_buddy_block *block)
+static void list_insert(struct drm_buddy *mm,
+    struct drm_buddy_block *block)


Why the change here?


  {
  struct drm_buddy_block *node;
  struct list_head *head;
@@ -57,6 +57,16 @@ s

[PATCH] drm/amdgpu: Modify the contiguous flags behaviour

2024-04-14 Thread Arunpravin Paneer Selvam
Now we have two flags for contiguous VRAM buffer allocation.
If the application requests AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.

When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.
- we will try to allocate the contiguous buffer. If the
  allocation fails, we fall back to allocating the individual pages.

When we set TTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
  and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously.
  If the allocation fails, we return -ENOSPC.
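
For illustration, the best-effort path then stays a plain create-time flag;
a sketch, with field names following the usual amdgpu_bo_param usage rather
than anything added by this patch:

	struct amdgpu_bo_param bp = {};
	struct amdgpu_bo *bo;
	int r;

	bp.size = size;
	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
	/* best effort: try contiguous first, fall back to individual pages */
	bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;

	r = amdgpu_bo_create(adev, &bp, &bo);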

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   | 14 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++-
 2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8bc79924d171..41926d631563 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, 
u32 domain)
else
places[c].flags |= TTM_PL_FLAG_TOPDOWN;
 
-   if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
-   places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
c++;
}
 
@@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 
domain,
 {
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
struct ttm_operation_ctx ctx = { false, false };
+   struct ttm_place *places = bo->placements;
+   u32 c = 0;
int r, i;
 
if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
@@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 
domain,
 
if (bo->tbo.pin_count) {
uint32_t mem_type = bo->tbo.resource->mem_type;
-   uint32_t mem_flags = bo->tbo.resource->placement;
 
if (!(domain & amdgpu_mem_type_to_domain(mem_type)))
return -EINVAL;
 
-   if ((mem_type == TTM_PL_VRAM) &&
-   (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
-   !(mem_flags & TTM_PL_FLAG_CONTIGUOUS))
-   return -EINVAL;
-
	ttm_bo_pin(&bo->tbo);
 
if (max_offset != 0) {
@@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 
domain,
bo->placements[i].lpfn = lpfn;
}
 
+   if (domain & AMDGPU_GEM_DOMAIN_VRAM &&
+   !WARN_ON(places[c].mem_type != TTM_PL_VRAM))
+   places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+
	r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
if (unlikely(r)) {
dev_err(adev->dev, "%p pin failed\n", bo);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..ddbf302878f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -88,6 +88,30 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct 
list_head *head)
return size;
 }
 
+static inline unsigned long
+amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo,
+const struct ttm_place *place,
+unsigned long bo_flags)
+{
+   unsigned long pages_per_block;
+
+   if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
+   place->flags & TTM_PL_FLAG_CONTIGUOUS) {
+   pages_per_block = ~0ul;
+   } else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+   pages_per_block = HPAGE_PMD_NR;
+#else
+   /* default to 2MB */
+   pages_per_block = 2UL << (20UL - PAGE_SHIFT);
+#endif
+   pages_per_block = max_t(uint32_t, pages_per_block,
+   tbo->page_alignment);
+   }
+
+   return pages_per_block;
+}
+
 /**
  * DOC: mem_info_vram_total
  *
@@ -451,8 +475,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
struct amdgpu_device *adev = to_amdgpu_device(mgr);
u64 vis_usage = 0, max_bytes, min_block_size;
+   struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
struct amdgpu_vram_mgr_resource *vres;
u64 size, remaining_size, lpfn, fpfn;
+   unsigned long bo_flags = bo->flags;
struct drm_buddy *mm = >mm;
struct drm_buddy_block *block;
	unsigned long pages_per_block;
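
For orientation, the helper's result typically feeds the buddy allocator's
minimum block size, roughly like this (a sketch under that assumption, not
the exact patch code):

	pages_per_block = amdgpu_vram_find_pages_per_block(tbo, place,
							   bo_flags);

	/* a contiguous BO (~0ul above) effectively removes the cap, while
	 * huge-page sized blocks are used otherwise */
	if (size >= (u64)pages_per_block << PAGE_SHIFT)
		min_block_size = (u64)pages_per_block << PAGE_SHIFT;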

Re: [PATCH v10 1/3] drm/buddy: Implement tracking clear page feature

2024-04-08 Thread Paneer Selvam, Arunpravin

Hi Matthew,
Could you please review these changes? A few clients are waiting for 
these patches.


Thanks,
Arun.
On 4/8/2024 8:46 PM, Arunpravin Paneer Selvam wrote:

- Add tracking clear page feature.

- Driver should enable the DRM_BUDDY_CLEARED flag if it
   successfully clears the blocks in the free path. On the other hand,
   DRM buddy marks each block as cleared.

- Track the available cleared pages size

- If the driver requests cleared memory we prefer cleared memory,
   but fall back to uncleared if we can't find the cleared blocks.
   When the driver requests uncleared memory we try to use uncleared, but
   fall back to cleared memory if necessary.

- When a block gets freed we clear it and mark the freed block as cleared,
   when there are buddies which are cleared as well we can merge them.
   Otherwise, we prefer to keep the blocks as separated.

- Add a function to support defragmentation.

v1:
   - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as
 cleared. Else, reset the clear flag for each block in the list(Christian)
   - For merging the 2 cleared blocks compare as below,
 drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian)
   - Defragment the memory beginning from min_order
 till the required memory space is available.

v2: (Matthew)
   - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks
 operation within drm buddy.
   - Write a macro block_incompatible() to allocate the required blocks.
   - Update the xe driver for the drm_buddy_free_list change in arguments.
   - add a warning if the two blocks are incompatible on
 defragmentation
   - call full defragmentation in the fini() function
   - place a condition to test if min_order is equal to 0
   - replace the list with safe_reverse() variant as we might
 remove the block from the list.

v3:
   - fix Gitlab user reported lockup issue.
   - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew)
   - modify to pass the root order instead max_order in fini()
 function(Matthew)
   - change bool 1 to true(Matthew)
   - add check if min_block_size is power of 2(Matthew)
   - modify the min_block_size datatype to u64(Matthew)

v4:
   - rename the function drm_buddy_defrag with __force_merge.
   - Include __force_merge directly in drm buddy file and remove
 the defrag use in amdgpu driver.
   - Remove list_empty() check(Matthew)
   - Remove unnecessary space, headers and placement of new variables(Matthew)
   - Add a unit test case(Matthew)

v5:
   - remove force merge support to actual range allocation and not to bail
 out when contains && split(Matthew)
   - add range support to force merge function.

Signed-off-by: Arunpravin Paneer Selvam 
Signed-off-by: Matthew Auld 
Suggested-by: Christian König 
Suggested-by: Matthew Auld 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |   6 +-
  drivers/gpu/drm/drm_buddy.c   | 430 ++
  drivers/gpu/drm/i915/i915_ttm_buddy_manager.c |   6 +-
  drivers/gpu/drm/tests/drm_buddy_test.c|  28 +-
  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c  |   4 +-
  include/drm/drm_buddy.h   |  16 +-
  6 files changed, 368 insertions(+), 122 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..c0c851409241 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
return 0;
  
  error_free_blocks:

-   drm_buddy_free_list(mm, &vres->blocks);
+   drm_buddy_free_list(mm, &vres->blocks, 0);
	mutex_unlock(&mgr->lock);
  error_fini:
	ttm_resource_fini(man, &vres->base);
@@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager 
*man,
  
  	amdgpu_vram_mgr_do_reserve(man);
  
-	drm_buddy_free_list(mm, &vres->blocks);
+	drm_buddy_free_list(mm, &vres->blocks, 0);
	mutex_unlock(&mgr->lock);

	atomic64_sub(vis_usage, &mgr->vis_usage);

@@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
kfree(rsv);
  
	list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) {

-   drm_buddy_free_list(&mgr->mm, &rsv->allocated);
+   drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0);
kfree(rsv);
}
if (!adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 5ebdd6f8f36e..83dbe252f727 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -38,8 +38,8 @@ static void drm_block_free(struct drm_buddy *mm,
kmem_cache_free(slab_blocks, block);
  }
  
-static void list_insert_sorted(struct drm_buddy *mm,

-  struct drm_buddy_block *block)
+static void list_insert(struct drm_buddy *mm,
+   struct drm_buddy_block *block)
  {
stru

[PATCH v10 3/3] drm/tests: Add a test case for drm buddy clear allocation

2024-04-08 Thread Arunpravin Paneer Selvam
Add a new test case for the drm buddy clear and dirty
allocation.

v2:(Matthew)
  - make size as u32
  - rename PAGE_SIZE with SZ_4K
  - dont fragment the address space for all the order allocation
iterations. we can do it once and just increment and allocate
the size.
  - create new mm with non power-of-two size to ensure the multi-root
force_merge during fini.

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Matthew Auld 
---
 drivers/gpu/drm/tests/drm_buddy_test.c | 141 +
 1 file changed, 141 insertions(+)

diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c 
b/drivers/gpu/drm/tests/drm_buddy_test.c
index 4621a860cb05..b07f132f2835 100644
--- a/drivers/gpu/drm/tests/drm_buddy_test.c
+++ b/drivers/gpu/drm/tests/drm_buddy_test.c
@@ -224,6 +224,146 @@ static void drm_test_buddy_alloc_range_bias(struct kunit 
*test)
drm_buddy_fini();
 }
 
+static void drm_test_buddy_alloc_clear(struct kunit *test)
+{
+   unsigned long n_pages, total, i = 0;
+   const unsigned long ps = SZ_4K;
+   struct drm_buddy_block *block;
+   const int max_order = 12;
+   LIST_HEAD(allocated);
+   struct drm_buddy mm;
+   unsigned int order;
+   u32 mm_size, size;
+   LIST_HEAD(dirty);
+   LIST_HEAD(clean);
+
+   mm_size = SZ_4K << max_order;
+   KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps));
+
+   KUNIT_EXPECT_EQ(test, mm.max_order, max_order);
+
+   /*
+    * Idea is to allocate and free some random portion of the address space,
+    * returning those pages as non-dirty and randomly alternate between
+    * requesting dirty and non-dirty pages (not going over the limit
+    * we freed as non-dirty), putting that into two separate lists.
+    * Loop over both lists at the end checking that the dirty list
+    * is indeed all dirty pages and vice versa. Free it all again,
+    * keeping the dirty/clear status.
+    */
+   KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size,
+                                                       5 * ps, ps, &allocated,
+                                                       DRM_BUDDY_TOPDOWN_ALLOCATION),
+                          "buddy_alloc hit an error size=%lu\n", 5 * ps);
+   drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED);
+
+   n_pages = 10;
+   do {
+   unsigned long flags;
+   struct list_head *list;
+   int slot = i % 2;
+
+   if (slot == 0) {
+   list = &dirty;
+   flags = 0;
+   } else {
+   list = &clean;
+   flags = DRM_BUDDY_CLEAR_ALLOCATION;
+   }
+
+   KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size,
+                                                       ps, ps, list,
+                                                       flags),
+                          "buddy_alloc hit an error size=%lu\n", ps);
+   } while (++i < n_pages);
+
+   list_for_each_entry(block, &clean, link)
+   KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), true);
+
+   list_for_each_entry(block, &dirty, link)
+   KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false);
+
+   drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED);
+
+   /*
+* Trying to go over the clear limit for some allocation.
+* The allocation should never fail with reasonable page-size.
+*/
+   KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size,
+                                                       10 * ps, ps, &clean,
+                                                       DRM_BUDDY_CLEAR_ALLOCATION),
+                          "buddy_alloc hit an error size=%lu\n", 10 * ps);
+
+   drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED);
+   drm_buddy_free_list(&mm, &dirty, 0);
+   drm_buddy_fini(&mm);
+
+   KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps));
+
+   /*
+    * Create a new mm. Intentionally fragment the address space by creating
+    * two alternating lists. Free both lists, one as dirty the other as clean.
+    * Try to allocate double the previous size with matching min_page_size. The
+    * allocation should never fail as it calls the force_merge. Also check that
+    * the page is always dirty after force_merge. Free the page as dirty, then
+    * repeat the whole thing, increment the order until we hit the max_order.
+    */
+
+   i = 0;
+   n_pages = mm_size / ps;
+   do {
+   struct list_head *list;
+   int slot = i % 2;
+
+   if (slot == 0)
+   list = &dirty;
+   else
+   list = &clean;
+
+   KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size,
+  

[PATCH v10 1/3] drm/buddy: Implement tracking clear page feature

2024-04-08 Thread Arunpravin Paneer Selvam
- Add tracking clear page feature.

- Driver should enable the DRM_BUDDY_CLEARED flag if it
  successfully clears the blocks in the free path. On the other hand,
  DRM buddy marks each block as cleared.

- Track the available cleared pages size

- If the driver requests cleared memory we prefer cleared memory,
  but fall back to uncleared if we can't find the cleared blocks.
  When the driver requests uncleared memory we try to use uncleared, but
  fall back to cleared memory if necessary.

- When a block gets freed we clear it and mark the freed block as cleared,
  when there are buddies which are cleared as well we can merge them.
  Otherwise, we prefer to keep the blocks as separated.

- Add a function to support defragmentation.

v1:
  - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as
cleared. Else, reset the clear flag for each block in the list(Christian)
  - For merging the 2 cleared blocks compare as below,
drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian)
  - Defragment the memory beginning from min_order
till the required memory space is available.

v2: (Matthew)
  - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks
operation within drm buddy.
  - Write a macro block_incompatible() to allocate the required blocks.
  - Update the xe driver for the drm_buddy_free_list change in arguments.
  - add a warning if the two blocks are incompatible on
defragmentation
  - call full defragmentation in the fini() function
  - place a condition to test if min_order is equal to 0
  - replace the list with safe_reverse() variant as we might
remove the block from the list.

v3:
  - fix Gitlab user reported lockup issue.
  - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew)
  - modify to pass the root order instead max_order in fini()
function(Matthew)
  - change bool 1 to true(Matthew)
  - add check if min_block_size is power of 2(Matthew)
  - modify the min_block_size datatype to u64(Matthew)

v4:
  - rename the function drm_buddy_defrag with __force_merge.
  - Include __force_merge directly in drm buddy file and remove
the defrag use in amdgpu driver.
  - Remove list_empty() check(Matthew)
  - Remove unnecessary space, headers and placement of new variables(Matthew)
  - Add a unit test case(Matthew)

v5:
  - remove force merge support to actual range allocation and not to bail
out when contains && split(Matthew)
  - add range support to force merge function.

Signed-off-by: Arunpravin Paneer Selvam 
Signed-off-by: Matthew Auld 
Suggested-by: Christian König 
Suggested-by: Matthew Auld 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |   6 +-
 drivers/gpu/drm/drm_buddy.c   | 430 ++
 drivers/gpu/drm/i915/i915_ttm_buddy_manager.c |   6 +-
 drivers/gpu/drm/tests/drm_buddy_test.c|  28 +-
 drivers/gpu/drm/xe/xe_ttm_vram_mgr.c  |   4 +-
 include/drm/drm_buddy.h   |  16 +-
 6 files changed, 368 insertions(+), 122 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..c0c851409241 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
return 0;
 
 error_free_blocks:
-   drm_buddy_free_list(mm, &vres->blocks);
+   drm_buddy_free_list(mm, &vres->blocks, 0);
	mutex_unlock(&mgr->lock);
 error_fini:
	ttm_resource_fini(man, &vres->base);
@@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager 
*man,
 
amdgpu_vram_mgr_do_reserve(man);
 
-   drm_buddy_free_list(mm, &vres->blocks);
+   drm_buddy_free_list(mm, &vres->blocks, 0);
	mutex_unlock(&mgr->lock);
 
	atomic64_sub(vis_usage, &mgr->vis_usage);
@@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
kfree(rsv);
 
	list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) {
-   drm_buddy_free_list(&mgr->mm, &rsv->allocated);
+   drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0);
kfree(rsv);
}
if (!adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 5ebdd6f8f36e..83dbe252f727 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -38,8 +38,8 @@ static void drm_block_free(struct drm_buddy *mm,
kmem_cache_free(slab_blocks, block);
 }
 
-static void list_insert_sorted(struct drm_buddy *mm,
-  struct drm_buddy_block *block)
+static void list_insert(struct drm_buddy *mm,
+   struct drm_buddy_block *block)
 {
struct drm_buddy_block *node;
struct list_head *head;
@@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm,
__list_add(>link, node->link.prev, >link);
 }
 
+static void 

[PATCH v10 2/3] drm/amdgpu: Enable clear page functionality

2024-04-08 Thread Arunpravin Paneer Selvam
Add clear page support in vram memory region.

v1(Christian):
  - Dont handle clear page as TTM flag since when moving the BO back
in from GTT again we don't need that.
  - Make a specialized version of amdgpu_fill_buffer() which only
clears the VRAM areas which are not already cleared
  - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in
amdgpu_object.c

v2:
  - Modify the function name amdgpu_ttm_* (Alex)
  - Drop the delayed parameter (Christian)
  - handle amdgpu_res_cleared() just above the size
calculation (Christian)
  - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers
in the free path to properly wait for fences etc.. (Christian)

v3(Christian):
  - Remove buffer clear code in VRAM manager instead change the
AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set
the DRM_BUDDY_CLEARED flag.
  - Remove ! from amdgpu_res_cleared() check.

v4(Christian):
  - vres flag setting move to vram manager file
  - use dma_fence_get_stub in amdgpu_ttm_clear_buffer function
  - make fence a mandatory parameter and drop the if and the get/put dance

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Christian König 
Acked-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c|  9 +--
 .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h| 25 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   | 70 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h   |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |  6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h  | 10 +++
 6 files changed, 116 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8bc79924d171..e011a326fe86 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -39,6 +39,7 @@
 #include "amdgpu.h"
 #include "amdgpu_trace.h"
 #include "amdgpu_amdkfd.h"
+#include "amdgpu_vram_mgr.h"
 
 /**
  * DOC: amdgpu_object
@@ -601,8 +602,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
if (!amdgpu_bo_support_uswc(bo->flags))
bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC;
 
-   if (adev->ras_enabled)
-   bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
+   bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
 
	bo->tbo.bdev = &adev->mman.bdev;
if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA |
@@ -634,7 +634,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
bo->tbo.resource->mem_type == TTM_PL_VRAM) {
struct dma_fence *fence;
 
-   r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, &fence, true);
+   r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, &fence);
if (unlikely(r))
goto fail_unreserve;
 
@@ -1365,8 +1365,9 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object 
*bo)
if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
return;
 
-   r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, &fence, true);
+   r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true);
if (!WARN_ON(r)) {
+   amdgpu_vram_mgr_set_cleared(bo->resource);
amdgpu_bo_fence(abo, fence, false);
dma_fence_put(fence);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
index 381101d2bf05..50fcd86e1033 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
@@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct 
amdgpu_res_cursor *cur, uint64_t size)
}
 }
 
+/**
+ * amdgpu_res_cleared - check if blocks are cleared
+ *
+ * @cur: the cursor to extract the block
+ *
+ * Check if the @cur block is cleared
+ */
+static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur)
+{
+   struct drm_buddy_block *block;
+
+   switch (cur->mem_type) {
+   case TTM_PL_VRAM:
+   block = cur->node;
+
+   if (!amdgpu_vram_mgr_is_cleared(block))
+   return false;
+   break;
+   default:
+   return false;
+   }
+
+   return true;
+}
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index fc418e670fda..f0b42b567357 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -378,11 +378,12 @@ static int amdgpu_move_blit(struct ttm_buffer_object *bo,
(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) {
struct dma_fence *wipe_fence = NULL;
 
-   r = amdgpu_fill_buffer(abo, AMDGPU_POISON, NULL, &wipe_fence,
-                          false);
+   r = 

Re: [PATCH v9 1/3] drm/buddy: Implement tracking clear page feature

2024-04-01 Thread Paneer Selvam, Arunpravin

Hi Matthew,

On 3/28/2024 10:18 PM, Matthew Auld wrote:

On 28/03/2024 16:07, Paneer Selvam, Arunpravin wrote:

Hi Matthew,

On 3/26/2024 11:39 PM, Matthew Auld wrote:

On 18/03/2024 21:40, Arunpravin Paneer Selvam wrote:

- Add tracking clear page feature.

- Driver should enable the DRM_BUDDY_CLEARED flag if it
   successfully clears the blocks in the free path. On the other hand,
   DRM buddy marks each block as cleared.

- Track the available cleared pages size

- If the driver requests cleared memory we prefer cleared memory,
   but fall back to uncleared if we can't find the cleared blocks.
   When the driver requests uncleared memory we try to use uncleared, but
   fall back to cleared memory if necessary.

- When a block gets freed we clear it and mark the freed block as 
cleared,

   when there are buddies which are cleared as well we can merge them.
   Otherwise, we prefer to keep the blocks as separated.

- Add a function to support defragmentation.

v1:
   - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as
 cleared. Else, reset the clear flag for each block in the 
list(Christian)

   - For merging the 2 cleared blocks compare as below,
 drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian)
   - Defragment the memory beginning from min_order
 till the required memory space is available.

v2: (Matthew)
   - Add a wrapper drm_buddy_free_list_internal for the freeing of 
blocks

 operation within drm buddy.
   - Write a macro block_incompatible() to allocate the required 
blocks.
   - Update the xe driver for the drm_buddy_free_list change in 
arguments.

   - add a warning if the two blocks are incompatible on
 defragmentation
   - call full defragmentation in the fini() function
   - place a condition to test if min_order is equal to 0
   - replace the list with safe_reverse() variant as we might
 remove the block from the list.

v3:
   - fix Gitlab user reported lockup issue.
   - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew)
   - modify to pass the root order instead max_order in fini()
 function(Matthew)
   - change bool 1 to true(Matthew)
   - add check if min_block_size is power of 2(Matthew)
   - modify the min_block_size datatype to u64(Matthew)

v4:
   - rename the function drm_buddy_defrag with __force_merge.
   - Include __force_merge directly in drm buddy file and remove
 the defrag use in amdgpu driver.
   - Remove list_empty() check(Matthew)
   - Remove unnecessary space, headers and placement of new 
variables(Matthew)

   - Add a unit test case(Matthew)

Signed-off-by: Arunpravin Paneer Selvam 


Signed-off-by: Matthew Auld 
Suggested-by: Christian König 
Suggested-by: Matthew Auld 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |   6 +-
  drivers/gpu/drm/drm_buddy.c   | 427 
++

  drivers/gpu/drm/i915/i915_ttm_buddy_manager.c |   6 +-
  drivers/gpu/drm/tests/drm_buddy_test.c    |  18 +-
  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c  |   4 +-
  include/drm/drm_buddy.h   |  16 +-
  6 files changed, 360 insertions(+), 117 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c

index 8db880244324..c0c851409241 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,

  return 0;
    error_free_blocks:
-    drm_buddy_free_list(mm, &vres->blocks);
+    drm_buddy_free_list(mm, &vres->blocks, 0);
  mutex_unlock(&mgr->lock);
  error_fini:
  ttm_resource_fini(man, &vres->base);
@@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct 
ttm_resource_manager *man,

    amdgpu_vram_mgr_do_reserve(man);
-    drm_buddy_free_list(mm, &vres->blocks);
+    drm_buddy_free_list(mm, &vres->blocks, 0);
  mutex_unlock(&mgr->lock);
    atomic64_sub(vis_usage, &mgr->vis_usage);
@@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device 
*adev)

  kfree(rsv);
    list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) {

-    drm_buddy_free_list(&mgr->mm, &rsv->allocated);
+    drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0);
  kfree(rsv);
  }
  if (!adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index c4222b886db7..625a30a6b855 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -38,8 +38,8 @@ static void drm_block_free(struct drm_buddy *mm,
  kmem_cache_free(slab_blocks, block);
  }
  -static void list_insert_sorted(struct drm_buddy *mm,
-   struct drm_buddy_block *block)
+static void list_insert(struct drm_buddy *mm,
+    struct drm_buddy_block *block)
  {
  struct drm_buddy_block *node;
  struct list_head *head;
@@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy 
*mm,

  __list_add(>link, node->l

Re: [PATCH v9 1/3] drm/buddy: Implement tracking clear page feature

2024-03-28 Thread Paneer Selvam, Arunpravin

Hi Matthew,

On 3/26/2024 11:39 PM, Matthew Auld wrote:

On 18/03/2024 21:40, Arunpravin Paneer Selvam wrote:

- Add tracking clear page feature.

- Driver should enable the DRM_BUDDY_CLEARED flag if it
   successfully clears the blocks in the free path. On the other hand,
   DRM buddy marks each block as cleared.

- Track the available cleared pages size

- If the driver requests cleared memory we prefer cleared memory,
   but fall back to uncleared if we can't find the cleared blocks.
   When the driver requests uncleared memory we try to use uncleared,
   but fall back to cleared memory if necessary.

- When a block gets freed we clear it and mark the freed block as cleared;
   when there are buddies which are cleared as well we can merge them.
   Otherwise, we prefer to keep the blocks separate.

- Add a function to support defragmentation (a usage sketch follows below).
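
As an illustration of the new flags (an editor's sketch, not part of the
patch; the drm_buddy instance and sizes are assumed to come from the caller):

static int example_alloc_cleared(struct drm_buddy *mm, u64 size,
				 struct list_head *blocks)
{
	/* Prefer cleared blocks; the allocator transparently falls back
	 * to dirty blocks when not enough cleared memory is available.
	 */
	return drm_buddy_alloc_blocks(mm, 0, mm->size, size,
				      mm->chunk_size, blocks,
				      DRM_BUDDY_CLEAR_ALLOCATION);
}

static void example_free_cleared(struct drm_buddy *mm,
				 struct list_head *blocks)
{
	/* The driver zeroed this memory on the free path, so keep
	 * tracking it as cleared inside the allocator.
	 */
	drm_buddy_free_list(mm, blocks, DRM_BUDDY_CLEARED);
}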

v1:
   - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as
 cleared. Else, reset the clear flag for each block in the 
list(Christian)

   - For merging the 2 cleared blocks compare as below,
 drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian)
   - Defragment the memory beginning from min_order
 till the required memory space is available.

v2: (Matthew)
   - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks
     operation within drm buddy.
   - Write a macro block_incompatible() to allocate the required blocks
     (sketched below).
   - Update the xe driver for the drm_buddy_free_list change in arguments.
   - add a warning if the two blocks are incompatible on
     defragmentation
   - call full defragmentation in the fini() function
   - place a condition to test if min_order is equal to 0
   - replace the list with safe_reverse() variant as we might
     remove the block from the list.
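
For reference, the block_incompatible() check named in v2 boils down to
comparing the requested clear state with a block's clear state; a
reconstruction sketch (not the exact hunk from the patch):

static bool block_incompatible(struct drm_buddy_block *block,
			       unsigned int flags)
{
	bool needs_clear = flags & DRM_BUDDY_CLEAR_ALLOCATION;

	/* The block is unusable for this request when its clear state
	 * does not match what the caller asked for.
	 */
	return needs_clear != drm_buddy_block_is_clear(block);
}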

v3:
   - fix Gitlab user reported lockup issue.
   - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew)
   - modify to pass the root order instead max_order in fini()
 function(Matthew)
   - change bool 1 to true(Matthew)
   - add check if min_block_size is power of 2(Matthew)
   - modify the min_block_size datatype to u64(Matthew)

v4:
   - rename the function drm_buddy_defrag with __force_merge.
   - Include __force_merge directly in drm buddy file and remove
 the defrag use in amdgpu driver.
   - Remove list_empty() check(Matthew)
   - Remove unnecessary space, headers and placement of new variables(Matthew)

   - Add a unit test case(Matthew)

Signed-off-by: Arunpravin Paneer Selvam 


Signed-off-by: Matthew Auld 
Suggested-by: Christian König 
Suggested-by: Matthew Auld 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |   6 +-
  drivers/gpu/drm/drm_buddy.c   | 427 ++
  drivers/gpu/drm/i915/i915_ttm_buddy_manager.c |   6 +-
  drivers/gpu/drm/tests/drm_buddy_test.c    |  18 +-
  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c  |   4 +-
  include/drm/drm_buddy.h   |  16 +-
  6 files changed, 360 insertions(+), 117 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..c0c851409241 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 	return 0;
 
 error_free_blocks:
-	drm_buddy_free_list(mm, &vres->blocks);
+	drm_buddy_free_list(mm, &vres->blocks, 0);
 	mutex_unlock(&mgr->lock);
 error_fini:
 	ttm_resource_fini(man, &vres->base);
@@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man,
 
 	amdgpu_vram_mgr_do_reserve(man);
 
-	drm_buddy_free_list(mm, &vres->blocks);
+	drm_buddy_free_list(mm, &vres->blocks, 0);
 	mutex_unlock(&mgr->lock);
 
 	atomic64_sub(vis_usage, &mgr->vis_usage);
@@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
 	kfree(rsv);
 
 	list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) {
-		drm_buddy_free_list(&mgr->mm, &rsv->allocated);
+		drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0);
 		kfree(rsv);
 	}
 	if (!adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index c4222b886db7..625a30a6b855 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -38,8 +38,8 @@ static void drm_block_free(struct drm_buddy *mm,
 	kmem_cache_free(slab_blocks, block);
 }
 
-static void list_insert_sorted(struct drm_buddy *mm,
-			       struct drm_buddy_block *block)
+static void list_insert(struct drm_buddy *mm,
+			struct drm_buddy_block *block)
 {
 	struct drm_buddy_block *node;
 	struct list_head *head;
@@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm,
 	__list_add(&block->link, node->link.prev, &node->link);
 }
 
+static void clear_reset(struct drm_buddy_block *block)
+{
+	block->header &= ~DRM_BUDDY_HEADER_CLEAR

Re: [PATCH v9 2/3] drm/amdgpu: Enable clear page functionality

2024-03-27 Thread Paneer Selvam, Arunpravin

Hi Alex,

On 3/26/2024 8:23 PM, Alex Deucher wrote:

On Tue, Mar 26, 2024 at 10:01 AM Alex Deucher  wrote:

On Tue, Mar 26, 2024 at 9:59 AM Paneer Selvam, Arunpravin
 wrote:

Hi Alex,

On 3/26/2024 7:08 PM, Alex Deucher wrote:

On Mon, Mar 18, 2024 at 5:47 PM Arunpravin Paneer Selvam
 wrote:

Add clear page support in vram memory region.

v1(Christian):
- Don't handle clear page as TTM flag since when moving the BO back
  in from GTT again we don't need that.
- Make a specialized version of amdgpu_fill_buffer() which only
  clears the VRAM areas which are not already cleared
- Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in
  amdgpu_object.c

v2:
- Modify the function name amdgpu_ttm_* (Alex)
- Drop the delayed parameter (Christian)
- handle amdgpu_res_cleared() just above the size
  calculation (Christian)
- Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers
  in the free path to properly wait for fences etc.. (Christian)

v3(Christian):
- Remove buffer clear code in VRAM manager instead change the
  AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set
  the DRM_BUDDY_CLEARED flag.
- Remove ! from amdgpu_res_cleared() check.

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Christian König 
Acked-by: Felix Kuehling 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c| 22 ---
   .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h| 25 
   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   | 61 ++-
   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h   |  5 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |  6 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h  |  5 ++
   6 files changed, 111 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8bc79924d171..c92d92b28a57 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -39,6 +39,7 @@
   #include "amdgpu.h"
   #include "amdgpu_trace.h"
   #include "amdgpu_amdkfd.h"
+#include "amdgpu_vram_mgr.h"

   /**
* DOC: amdgpu_object
@@ -601,8 +602,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
  if (!amdgpu_bo_support_uswc(bo->flags))
  bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC;

-   if (adev->ras_enabled)
-   bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
+   bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;

 	bo->tbo.bdev = &adev->mman.bdev;
 	if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA |
@@ -632,15 +632,17 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
 
 	if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED &&
 	    bo->tbo.resource->mem_type == TTM_PL_VRAM) {
-		struct dma_fence *fence;
+		struct dma_fence *fence = NULL;
 
-		r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, &fence, true);
+		r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, &fence);
 		if (unlikely(r))
 			goto fail_unreserve;
 
-		dma_resv_add_fence(bo->tbo.base.resv, fence,
-				   DMA_RESV_USAGE_KERNEL);
-		dma_fence_put(fence);
+		if (fence) {
+			dma_resv_add_fence(bo->tbo.base.resv, fence,
+					   DMA_RESV_USAGE_KERNEL);
+			dma_fence_put(fence);
+		}
 	}
 	if (!bp->resv)
 		amdgpu_bo_unreserve(bo);
@@ -1365,8 +1367,12 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)
 	if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
 		return;
 
-	r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, &fence, true);
+	r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true);
 	if (!WARN_ON(r)) {
+		struct amdgpu_vram_mgr_resource *vres;
+
+		vres = to_amdgpu_vram_mgr_resource(bo->resource);
+		vres->flags |= DRM_BUDDY_CLEARED;
 		amdgpu_bo_fence(abo, fence, false);
 		dma_fence_put(fence);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
index 381101d2bf05..50fcd86e1033 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
@@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size)
 	}
 }
 
+/**
+ * amdgpu_res_cleared - check if blocks are cleared
+ *
+ * @cur: the cursor to extract the block
+ *
+ * Check if the @cur block is cleared
+ */
+static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur)
+{
+	struct drm_buddy_block *block;
+
+	switch (cur->mem_type) {

Re: [PATCH v9 2/3] drm/amdgpu: Enable clear page functionality

2024-03-26 Thread Paneer Selvam, Arunpravin

Hi Alex,

On 3/26/2024 7:08 PM, Alex Deucher wrote:

On Mon, Mar 18, 2024 at 5:47 PM Arunpravin Paneer Selvam
 wrote:

Add clear page support in vram memory region.

v1(Christian):
   - Don't handle clear page as TTM flag since when moving the BO back
 in from GTT again we don't need that.
   - Make a specialized version of amdgpu_fill_buffer() which only
 clears the VRAM areas which are not already cleared
   - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in
 amdgpu_object.c

v2:
   - Modify the function name amdgpu_ttm_* (Alex)
   - Drop the delayed parameter (Christian)
   - handle amdgpu_res_cleared() just above the size
 calculation (Christian)
   - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers
 in the free path to properly wait for fences etc.. (Christian)

v3(Christian):
   - Remove buffer clear code in VRAM manager instead change the
 AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set
 the DRM_BUDDY_CLEARED flag.
   - Remove ! from amdgpu_res_cleared() check.

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Christian König 
Acked-by: Felix Kuehling 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c| 22 ---
  .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h| 25 
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   | 61 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h   |  5 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |  6 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h  |  5 ++
  6 files changed, 111 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8bc79924d171..c92d92b28a57 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -39,6 +39,7 @@
  #include "amdgpu.h"
  #include "amdgpu_trace.h"
  #include "amdgpu_amdkfd.h"
+#include "amdgpu_vram_mgr.h"

  /**
   * DOC: amdgpu_object
@@ -601,8 +602,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
 if (!amdgpu_bo_support_uswc(bo->flags))
 bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC;

-   if (adev->ras_enabled)
-   bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
+   bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;

 	bo->tbo.bdev = &adev->mman.bdev;
 	if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA |
@@ -632,15 +632,17 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
 
 	if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED &&
 	    bo->tbo.resource->mem_type == TTM_PL_VRAM) {
-		struct dma_fence *fence;
+		struct dma_fence *fence = NULL;
 
-		r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, &fence, true);
+		r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, &fence);
 		if (unlikely(r))
 			goto fail_unreserve;
 
-		dma_resv_add_fence(bo->tbo.base.resv, fence,
-				   DMA_RESV_USAGE_KERNEL);
-		dma_fence_put(fence);
+		if (fence) {
+			dma_resv_add_fence(bo->tbo.base.resv, fence,
+					   DMA_RESV_USAGE_KERNEL);
+			dma_fence_put(fence);
+		}
 	}
 	if (!bp->resv)
 		amdgpu_bo_unreserve(bo);
@@ -1365,8 +1367,12 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)
 	if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
 		return;
 
-	r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, &fence, true);
+	r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true);
 	if (!WARN_ON(r)) {
+		struct amdgpu_vram_mgr_resource *vres;
+
+		vres = to_amdgpu_vram_mgr_resource(bo->resource);
+		vres->flags |= DRM_BUDDY_CLEARED;
 		amdgpu_bo_fence(abo, fence, false);
 		dma_fence_put(fence);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
index 381101d2bf05..50fcd86e1033 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
@@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size)
 	}
 }
 
+/**
+ * amdgpu_res_cleared - check if blocks are cleared
+ *
+ * @cur: the cursor to extract the block
+ *
+ * Check if the @cur block is cleared
+ */
+static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur)
+{
+	struct drm_buddy_block *block;
+
+	switch (cur->mem_type) {
+	case TTM_PL_VRAM:
+		block = cur->node;
+
+		if (!amdgpu_vram_mgr_is_cleared(block))
+			return false;
+		break;
+	default:
+		return false;

Re: [PATCH v9 1/3] drm/buddy: Implement tracking clear page feature

2024-03-25 Thread Paneer Selvam, Arunpravin

Hi Matthew,
ping?

Thanks,
Arun.
On 3/19/2024 3:10 AM, Arunpravin Paneer Selvam wrote:

- Add tracking clear page feature.

- Driver should enable the DRM_BUDDY_CLEARED flag if it
   successfully clears the blocks in the free path. On the other hand,
   DRM buddy marks each block as cleared.

- Track the available cleared pages size

- If the driver requests cleared memory we prefer cleared memory,
   but fall back to uncleared if we can't find the cleared blocks.
   When the driver requests uncleared memory we try to use uncleared,
   but fall back to cleared memory if necessary.

- When a block gets freed we clear it and mark the freed block as cleared;
   when there are buddies which are cleared as well we can merge them.
   Otherwise, we prefer to keep the blocks separate.

- Add a function to support defragmentation.

v1:
   - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as
 cleared. Else, reset the clear flag for each block in the list(Christian)
   - For merging the 2 cleared blocks compare as below,
 drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian)
   - Defragment the memory beginning from min_order
 till the required memory space is available.

v2: (Matthew)
   - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks
 operation within drm buddy.
   - Write a macro block_incompatible() to allocate the required blocks.
   - Update the xe driver for the drm_buddy_free_list change in arguments.
   - add a warning if the two blocks are incompatible on
 defragmentation
   - call full defragmentation in the fini() function
   - place a condition to test if min_order is equal to 0
   - replace the list with safe_reverse() variant as we might
 remove the block from the list.

v3:
   - fix Gitlab user reported lockup issue.
   - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew)
   - modify to pass the root order instead max_order in fini()
 function(Matthew)
   - change bool 1 to true(Matthew)
   - add check if min_block_size is power of 2(Matthew)
   - modify the min_block_size datatype to u64(Matthew)

v4:
   - rename the function drm_buddy_defrag with __force_merge.
   - Include __force_merge directly in drm buddy file and remove
 the defrag use in amdgpu driver.
   - Remove list_empty() check(Matthew)
   - Remove unnecessary space, headers and placement of new variables(Matthew)
   - Add a unit test case(Matthew)

Signed-off-by: Arunpravin Paneer Selvam 
Signed-off-by: Matthew Auld 
Suggested-by: Christian König 
Suggested-by: Matthew Auld 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |   6 +-
  drivers/gpu/drm/drm_buddy.c   | 427 ++
  drivers/gpu/drm/i915/i915_ttm_buddy_manager.c |   6 +-
  drivers/gpu/drm/tests/drm_buddy_test.c|  18 +-
  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c  |   4 +-
  include/drm/drm_buddy.h   |  16 +-
  6 files changed, 360 insertions(+), 117 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..c0c851409241 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
return 0;
  
  error_free_blocks:

-   drm_buddy_free_list(mm, >blocks);
+   drm_buddy_free_list(mm, >blocks, 0);
mutex_unlock(>lock);
  error_fini:
ttm_resource_fini(man, >base);
@@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager 
*man,
  
  	amdgpu_vram_mgr_do_reserve(man);
  
-	drm_buddy_free_list(mm, >blocks);

+   drm_buddy_free_list(mm, >blocks, 0);
mutex_unlock(>lock);
  
  	atomic64_sub(vis_usage, >vis_usage);

@@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
kfree(rsv);
  
  	list_for_each_entry_safe(rsv, temp, >reserved_pages, blocks) {

-   drm_buddy_free_list(>mm, >allocated);
+   drm_buddy_free_list(>mm, >allocated, 0);
kfree(rsv);
}
if (!adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index c4222b886db7..625a30a6b855 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -38,8 +38,8 @@ static void drm_block_free(struct drm_buddy *mm,
kmem_cache_free(slab_blocks, block);
  }
  
-static void list_insert_sorted(struct drm_buddy *mm,

-  struct drm_buddy_block *block)
+static void list_insert(struct drm_buddy *mm,
+   struct drm_buddy_block *block)
  {
struct drm_buddy_block *node;
struct list_head *head;
@@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm,
__list_add(>link, node->link.prev, >link);
  }
  
+static void clear_reset(struct drm_buddy_block *block)


Re: [PATCH v9 2/3] drm/amdgpu: Enable clear page functionality

2024-03-19 Thread Paneer Selvam, Arunpravin

Hi Christian,

On 3/19/2024 7:17 PM, Christian König wrote:

Am 19.03.24 um 12:41 schrieb Paneer Selvam, Arunpravin:

Hi Christian,

On 3/19/2024 3:58 PM, Christian König wrote:



Am 18.03.24 um 22:40 schrieb Arunpravin Paneer Selvam:

Add clear page support in vram memory region.

v1(Christian):
   - Don't handle clear page as TTM flag since when moving the BO back
 in from GTT again we don't need that.
   - Make a specialized version of amdgpu_fill_buffer() which only
 clears the VRAM areas which are not already cleared
   - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in
 amdgpu_object.c

v2:
   - Modify the function name amdgpu_ttm_* (Alex)
   - Drop the delayed parameter (Christian)
   - handle amdgpu_res_cleared() just above the size
 calculation (Christian)
   - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the 
buffers

 in the free path to properly wait for fences etc.. (Christian)

v3(Christian):
   - Remove buffer clear code in VRAM manager instead change the
 AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set
 the DRM_BUDDY_CLEARED flag.
   - Remove ! from amdgpu_res_cleared() check.

Signed-off-by: Arunpravin Paneer Selvam 


Suggested-by: Christian König 
Acked-by: Felix Kuehling 


Just a few nit picks below, but in general already looks good to me.


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c    | 22 ---
  .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h    | 25 
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   | 61 
++-

  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h   |  5 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |  6 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h  |  5 ++
  6 files changed, 111 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8bc79924d171..c92d92b28a57 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -39,6 +39,7 @@
  #include "amdgpu.h"
  #include "amdgpu_trace.h"
  #include "amdgpu_amdkfd.h"
+#include "amdgpu_vram_mgr.h"
    /**
   * DOC: amdgpu_object
@@ -601,8 +602,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
  if (!amdgpu_bo_support_uswc(bo->flags))
  bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC;
  -    if (adev->ras_enabled)
-    bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
+    bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
 	bo->tbo.bdev = &adev->mman.bdev;
 	if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA |
@@ -632,15 +632,17 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
 	if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED &&
 	    bo->tbo.resource->mem_type == TTM_PL_VRAM) {
-		struct dma_fence *fence;
+		struct dma_fence *fence = NULL;
-		r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, &fence, true);
+		r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, &fence);
 		if (unlikely(r))
 			goto fail_unreserve;
-		dma_resv_add_fence(bo->tbo.base.resv, fence,
-				   DMA_RESV_USAGE_KERNEL);
-		dma_fence_put(fence);
+		if (fence) {
+			dma_resv_add_fence(bo->tbo.base.resv, fence,
+					   DMA_RESV_USAGE_KERNEL);
+			dma_fence_put(fence);
+		}
 	}
 	if (!bp->resv)
 		amdgpu_bo_unreserve(bo);
@@ -1365,8 +1367,12 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)
 	if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
 		return;
-	r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, &fence, true);
+	r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true);
  if (!WARN_ON(r)) {
+    struct amdgpu_vram_mgr_resource *vres;
+
+    vres = to_amdgpu_vram_mgr_resource(bo->resource);
+    vres->flags |= DRM_BUDDY_CLEARED;


Those lines should probably be in the VRAM manager.
This flag is set based on the amdgpu_fill_buffer() return value. Can we
move the amdgpu_fill_buffer() call and the DRM_BUDDY_CLEARED flag
setting into amdgpu_vram_mgr_del() in the VRAM manager, and do we still
need to wipe the VRAM buffers here without setting DRM_BUDDY_CLEARED?


No, that won't work. The call to amdgpu_fill_buffer() *must* be here 
because this runs before the resource is actually freed.


Only setting the vres->flags should probably be moved into 
amdgpu_vram_mgr.c.


So instead of calling that manually here, just make a function for it
to keep the code related to the buddy allocator in one place (one
possible shape follows below).
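
One possible shape for such a helper, assuming it sits next to
to_amdgpu_vram_mgr_resource() in amdgpu_vram_mgr.h (an editor's sketch,
not the final hunk):

static inline void amdgpu_vram_mgr_set_cleared(struct ttm_resource *res)
{
	struct amdgpu_vram_mgr_resource *ares = to_amdgpu_vram_mgr_resource(res);

	/* Tell the buddy allocator the backing blocks were wiped. */
	ares->flags |= DRM_BUDDY_CLEARED;
}

amdgpu_bo_release_notify() would then call
amdgpu_vram_mgr_set_cleared(bo->resource) after the successful fill
instead of touching vres->flags directly.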

I got it, Thanks.

Regards,
Arun.


Regards,
Christian.




  amdgpu_bo_fence(abo, fence, false);
  dma_fence_put(fence);
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
index 381101d2bf05..50fcd86e1033 100644

Re: [PATCH v9 2/3] drm/amdgpu: Enable clear page functionality

2024-03-19 Thread Paneer Selvam, Arunpravin

Hi Christian,

On 3/19/2024 3:58 PM, Christian König wrote:



Am 18.03.24 um 22:40 schrieb Arunpravin Paneer Selvam:

Add clear page support in vram memory region.

v1(Christian):
   - Don't handle clear page as TTM flag since when moving the BO back
 in from GTT again we don't need that.
   - Make a specialized version of amdgpu_fill_buffer() which only
 clears the VRAM areas which are not already cleared
   - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in
 amdgpu_object.c

v2:
   - Modify the function name amdgpu_ttm_* (Alex)
   - Drop the delayed parameter (Christian)
   - handle amdgpu_res_cleared() just above the size
 calculation (Christian)
   - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers
 in the free path to properly wait for fences etc.. (Christian)

v3(Christian):
   - Remove buffer clear code in VRAM manager instead change the
 AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set
 the DRM_BUDDY_CLEARED flag.
   - Remove ! from amdgpu_res_cleared() check.

Signed-off-by: Arunpravin Paneer Selvam 


Suggested-by: Christian König 
Acked-by: Felix Kuehling 


Just a few nit picks below, but in general already looks good to me.


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c    | 22 ---
  .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h    | 25 
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   | 61 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h   |  5 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |  6 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h  |  5 ++
  6 files changed, 111 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8bc79924d171..c92d92b28a57 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -39,6 +39,7 @@
  #include "amdgpu.h"
  #include "amdgpu_trace.h"
  #include "amdgpu_amdkfd.h"
+#include "amdgpu_vram_mgr.h"
    /**
   * DOC: amdgpu_object
@@ -601,8 +602,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
  if (!amdgpu_bo_support_uswc(bo->flags))
  bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC;
  -    if (adev->ras_enabled)
-    bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
+    bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
 	bo->tbo.bdev = &adev->mman.bdev;
 	if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA |
@@ -632,15 +632,17 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
 	if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED &&
 	    bo->tbo.resource->mem_type == TTM_PL_VRAM) {
-		struct dma_fence *fence;
+		struct dma_fence *fence = NULL;
-		r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, &fence, true);
+		r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, &fence);
 		if (unlikely(r))
 			goto fail_unreserve;
-		dma_resv_add_fence(bo->tbo.base.resv, fence,
-				   DMA_RESV_USAGE_KERNEL);
-		dma_fence_put(fence);
+		if (fence) {
+			dma_resv_add_fence(bo->tbo.base.resv, fence,
+					   DMA_RESV_USAGE_KERNEL);
+			dma_fence_put(fence);
+		}
 	}
 	if (!bp->resv)
 		amdgpu_bo_unreserve(bo);
@@ -1365,8 +1367,12 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)
 	if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
 		return;
-	r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, &fence, true);
+	r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true);
  if (!WARN_ON(r)) {
+    struct amdgpu_vram_mgr_resource *vres;
+
+    vres = to_amdgpu_vram_mgr_resource(bo->resource);
+    vres->flags |= DRM_BUDDY_CLEARED;


Those lines should probably be in the VRAM manager.
This flag is set based on the amdgpu_fill_buffer() return value. Can we
move the amdgpu_fill_buffer() call and the DRM_BUDDY_CLEARED flag
setting into amdgpu_vram_mgr_del() in the VRAM manager, and do we still
need to wipe the VRAM buffers here without setting DRM_BUDDY_CLEARED?



  amdgpu_bo_fence(abo, fence, false);
  dma_fence_put(fence);
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
index 381101d2bf05..50fcd86e1033 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
@@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct 
amdgpu_res_cursor *cur, uint64_t size)

  }
  }
  +/**
+ * amdgpu_res_cleared - check if blocks are cleared
+ *
+ * @cur: the cursor to extract the block
+ *
+ * Check if the @cur block is cleared
+ */
+static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur)
+{
+    struct drm_buddy_block *block;
+
+    switch (cur->mem_type) {
+

[PATCH v9 3/3] drm/tests: Add a test case for drm buddy clear allocation

2024-03-18 Thread Arunpravin Paneer Selvam
Add a new test case for the drm buddy clear and dirty
allocation.

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Matthew Auld 
---
 drivers/gpu/drm/tests/drm_buddy_test.c | 127 +
 1 file changed, 127 insertions(+)

diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c b/drivers/gpu/drm/tests/drm_buddy_test.c
index 454ad9952f56..d355a6e61893 100644
--- a/drivers/gpu/drm/tests/drm_buddy_test.c
+++ b/drivers/gpu/drm/tests/drm_buddy_test.c
@@ -19,6 +19,132 @@ static inline u64 get_size(int order, u64 chunk_size)
 	return (1 << order) * chunk_size;
 }
 
+static void drm_test_buddy_alloc_clear(struct kunit *test)
+{
+	unsigned long n_pages, total, i = 0;
+	const unsigned long ps = SZ_4K;
+	struct drm_buddy_block *block;
+	const int max_order = 12;
+	LIST_HEAD(allocated);
+	struct drm_buddy mm;
+	unsigned int order;
+	u64 mm_size, size;
+	LIST_HEAD(dirty);
+	LIST_HEAD(clean);
+
+	mm_size = PAGE_SIZE << max_order;
+	KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps));
+
+	KUNIT_EXPECT_EQ(test, mm.max_order, max_order);
+
+	/**
+	 * Idea is to allocate and free some random portion of the address space,
+	 * returning those pages as non-dirty and randomly alternate between
+	 * requesting dirty and non-dirty pages (not going over the limit
+	 * we freed as non-dirty), putting that into two separate lists.
+	 * Loop over both lists at the end checking that the dirty list
+	 * is indeed all dirty pages and vice versa. Free it all again,
+	 * keeping the dirty/clear status.
+	 */
+	KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size,
+							    5 * ps, ps, &allocated,
+							    DRM_BUDDY_TOPDOWN_ALLOCATION),
+				"buddy_alloc hit an error size=%u\n", 5 * ps);
+	drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED);
+
+	n_pages = 10;
+	do {
+		unsigned long flags;
+		struct list_head *list;
+		int slot = i % 2;
+
+		if (slot == 0) {
+			list = &dirty;
+			flags = 0;
+		} else if (slot == 1) {
+			list = &clean;
+			flags = DRM_BUDDY_CLEAR_ALLOCATION;
+		}
+
+		KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size,
+								    ps, ps, list,
+								    flags),
+					"buddy_alloc hit an error size=%u\n", ps);
+	} while (++i < n_pages);
+
+	list_for_each_entry(block, &clean, link)
+		KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), true);
+
+	list_for_each_entry(block, &dirty, link)
+		KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false);
+
+	drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED);
+
+	/**
+	 * Trying to go over the clear limit for some allocation.
+	 * The allocation should never fail with reasonable page-size.
+	 */
+	KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size,
+							    10 * ps, ps, &clean,
+							    DRM_BUDDY_CLEAR_ALLOCATION),
+				"buddy_alloc hit an error size=%u\n", 10 * ps);
+
+	drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED);
+	drm_buddy_free_list(&mm, &dirty, 0);
+	drm_buddy_fini(&mm);
+
+	KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps));
+
+	/**
+	 * Create a new mm. Intentionally fragment the address space by creating
+	 * two alternating lists. Free both lists, one as dirty the other as clean.
+	 * Try to allocate double the previous size with matching min_page_size. The
+	 * allocation should never fail as it calls the force_merge. Also check that
+	 * the page is always dirty after force_merge. Free the page as dirty, then
+	 * repeat the whole thing, increment the order until we hit the max_order.
+	 */
+
+	order = 1;
+	do {
+		size = PAGE_SIZE << order;
+		i = 0;
+		n_pages = mm_size / ps;
+		do {
+			struct list_head *list;
+			int slot = i % 2;
+
+			if (slot == 0)
+				list = &dirty;
+			else if (slot == 1)
+				list = &clean;
+
+			KUNIT_ASSERT_FALSE_MSG(test,
+					       drm_buddy_alloc_blocks(&mm, 0, mm_size,
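
(The message is truncated here. For completeness, a new kunit case is
wired up by adding it to the suite's case array in drm_buddy_test.c;
the array name below follows the existing file, and the exact hunk is
not shown in this excerpt.)

static struct kunit_case drm_buddy_tests[] = {
	/* ...existing cases... */
	KUNIT_CASE(drm_test_buddy_alloc_clear),
	{}
};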

[PATCH v9 2/3] drm/amdgpu: Enable clear page functionality

2024-03-18 Thread Arunpravin Paneer Selvam
Add clear page support in vram memory region.

v1(Christian):
  - Don't handle clear page as TTM flag since when moving the BO back
in from GTT again we don't need that.
  - Make a specialized version of amdgpu_fill_buffer() which only
clears the VRAM areas which are not already cleared
  - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in
amdgpu_object.c

v2:
  - Modify the function name amdgpu_ttm_* (Alex)
  - Drop the delayed parameter (Christian)
  - handle amdgpu_res_cleared() just above the size
calculation (Christian)
  - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers
in the free path to properly wait for fences etc.. (Christian)

v3(Christian):
  - Remove buffer clear code in VRAM manager instead change the
AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set
the DRM_BUDDY_CLEARED flag.
  - Remove ! from amdgpu_res_cleared() check.

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Christian König 
Acked-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c| 22 ---
 .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h| 25 
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   | 61 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h   |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |  6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h  |  5 ++
 6 files changed, 111 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8bc79924d171..c92d92b28a57 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -39,6 +39,7 @@
 #include "amdgpu.h"
 #include "amdgpu_trace.h"
 #include "amdgpu_amdkfd.h"
+#include "amdgpu_vram_mgr.h"
 
 /**
  * DOC: amdgpu_object
@@ -601,8 +602,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
if (!amdgpu_bo_support_uswc(bo->flags))
bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC;
 
-   if (adev->ras_enabled)
-   bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
+   bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
 
 	bo->tbo.bdev = &adev->mman.bdev;
 	if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA |
@@ -632,15 +632,17 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
 
 	if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED &&
 	    bo->tbo.resource->mem_type == TTM_PL_VRAM) {
-		struct dma_fence *fence;
+		struct dma_fence *fence = NULL;
 
-		r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, &fence, true);
+		r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, &fence);
 		if (unlikely(r))
 			goto fail_unreserve;
 
-		dma_resv_add_fence(bo->tbo.base.resv, fence,
-				   DMA_RESV_USAGE_KERNEL);
-		dma_fence_put(fence);
+		if (fence) {
+			dma_resv_add_fence(bo->tbo.base.resv, fence,
+					   DMA_RESV_USAGE_KERNEL);
+			dma_fence_put(fence);
+		}
 	}
 	if (!bp->resv)
 		amdgpu_bo_unreserve(bo);
@@ -1365,8 +1367,12 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)
 	if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
 		return;
 
-	r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, &fence, true);
+	r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true);
 	if (!WARN_ON(r)) {
+		struct amdgpu_vram_mgr_resource *vres;
+
+		vres = to_amdgpu_vram_mgr_resource(bo->resource);
+		vres->flags |= DRM_BUDDY_CLEARED;
 		amdgpu_bo_fence(abo, fence, false);
 		dma_fence_put(fence);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
index 381101d2bf05..50fcd86e1033 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
@@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size)
}
 }
 
+/**
+ * amdgpu_res_cleared - check if blocks are cleared
+ *
+ * @cur: the cursor to extract the block
+ *
+ * Check if the @cur block is cleared
+ */
+static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur)
+{
+   struct drm_buddy_block *block;
+
+   switch (cur->mem_type) {
+   case TTM_PL_VRAM:
+   block = cur->node;
+
+   if (!amdgpu_vram_mgr_is_cleared(block))
+   return false;
+   break;
+   default:
+   return false;
+   }
+
+   return true;
+}
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/am
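
(The amdgpu_ttm.c hunk is truncated above.) The intended consumer of
amdgpu_res_cleared() is a cursor walk that skips ranges which are
already zeroed; a sketch of that pattern, with the actual clear
submission elided (an editor's illustration, not the patch itself):

	struct amdgpu_res_cursor cursor;

	amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor);
	while (cursor.remaining) {
		if (amdgpu_res_cleared(&cursor)) {
			/* Already cleared, nothing to do for this range. */
			amdgpu_res_next(&cursor, cursor.size);
			continue;
		}
		/* ...queue a fill of [cursor.start, cursor.start + cursor.size)... */
		amdgpu_res_next(&cursor, cursor.size);
	}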

[PATCH v9 1/3] drm/buddy: Implement tracking clear page feature

2024-03-18 Thread Arunpravin Paneer Selvam
- Add tracking clear page feature.

- Driver should enable the DRM_BUDDY_CLEARED flag if it
  successfully clears the blocks in the free path. On the other hand,
  DRM buddy marks each block as cleared.

- Track the available cleared pages size

- If the driver requests cleared memory we prefer cleared memory,
  but fall back to uncleared if we can't find the cleared blocks.
  When the driver requests uncleared memory we try to use uncleared,
  but fall back to cleared memory if necessary.

- When a block gets freed we clear it and mark the freed block as cleared;
  when there are buddies which are cleared as well we can merge them.
  Otherwise, we prefer to keep the blocks separate.

- Add a function to support defragmentation.

v1:
  - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as
cleared. Else, reset the clear flag for each block in the list(Christian)
  - For merging the 2 cleared blocks compare as below,
drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian)
  - Defragment the memory beginning from min_order
till the required memory space is available.

v2: (Matthew)
  - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks
operation within drm buddy.
  - Write a macro block_incompatible() to allocate the required blocks.
  - Update the xe driver for the drm_buddy_free_list change in arguments.
  - add a warning if the two blocks are incompatible on
defragmentation
  - call full defragmentation in the fini() function
  - place a condition to test if min_order is equal to 0
  - replace the list with safe_reverse() variant as we might
remove the block from the list.

v3:
  - fix Gitlab user reported lockup issue.
  - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew)
  - modify to pass the root order instead max_order in fini()
function(Matthew)
  - change bool 1 to true(Matthew)
  - add check if min_block_size is power of 2(Matthew)
  - modify the min_block_size datatype to u64(Matthew)

v4:
  - rename the function drm_buddy_defrag with __force_merge.
  - Include __force_merge directly in drm buddy file and remove
the defrag use in amdgpu driver.
  - Remove list_empty() check(Matthew)
  - Remove unnecessary space, headers and placement of new variables(Matthew)
  - Add a unit test case(Matthew)

Signed-off-by: Arunpravin Paneer Selvam 
Signed-off-by: Matthew Auld 
Suggested-by: Christian König 
Suggested-by: Matthew Auld 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |   6 +-
 drivers/gpu/drm/drm_buddy.c   | 427 ++
 drivers/gpu/drm/i915/i915_ttm_buddy_manager.c |   6 +-
 drivers/gpu/drm/tests/drm_buddy_test.c|  18 +-
 drivers/gpu/drm/xe/xe_ttm_vram_mgr.c  |   4 +-
 include/drm/drm_buddy.h   |  16 +-
 6 files changed, 360 insertions(+), 117 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..c0c851409241 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 	return 0;
 
 error_free_blocks:
-	drm_buddy_free_list(mm, &vres->blocks);
+	drm_buddy_free_list(mm, &vres->blocks, 0);
 	mutex_unlock(&mgr->lock);
 error_fini:
 	ttm_resource_fini(man, &vres->base);
@@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man,
 
 	amdgpu_vram_mgr_do_reserve(man);
 
-	drm_buddy_free_list(mm, &vres->blocks);
+	drm_buddy_free_list(mm, &vres->blocks, 0);
 	mutex_unlock(&mgr->lock);
 
 	atomic64_sub(vis_usage, &mgr->vis_usage);
@@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
 	kfree(rsv);
 
 	list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) {
-		drm_buddy_free_list(&mgr->mm, &rsv->allocated);
+		drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0);
 		kfree(rsv);
 	}
 	if (!adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index c4222b886db7..625a30a6b855 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -38,8 +38,8 @@ static void drm_block_free(struct drm_buddy *mm,
 	kmem_cache_free(slab_blocks, block);
 }
 
-static void list_insert_sorted(struct drm_buddy *mm,
-			       struct drm_buddy_block *block)
+static void list_insert(struct drm_buddy *mm,
+			struct drm_buddy_block *block)
 {
 	struct drm_buddy_block *node;
 	struct list_head *head;
@@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm,
 	__list_add(&block->link, node->link.prev, &node->link);
 }
 
+static void clear_reset(struct drm_buddy_block *block)
+{
+	block->header &= ~DRM_BUDDY_HEADER_CLEAR;
+}
+
+static void mark_cleared(struct drm_buddy_block *block)
+{
+	block->header |= DRM_BUDDY_HEADER_CLEAR;
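
The query side of these header bits is the drm_buddy_block_is_clear()
helper the changelog refers to (via drm_buddy_is_clear()); a sketch of
the obvious implementation, reconstructed by the editor rather than
quoted from the hunk:

static inline bool drm_buddy_block_is_clear(struct drm_buddy_block *block)
{
	/* Set by mark_cleared(), removed again by clear_reset(). */
	return block->header & DRM_BUDDY_HEADER_CLEAR;
}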

Re: [PATCH v8 1/3] drm/buddy: Implement tracking clear page feature

2024-03-07 Thread Paneer Selvam, Arunpravin

Hi Matthew,

On 3/6/2024 11:19 PM, Matthew Auld wrote:

On 04/03/2024 16:32, Arunpravin Paneer Selvam wrote:

- Add tracking clear page feature.

- Driver should enable the DRM_BUDDY_CLEARED flag if it
   successfully clears the blocks in the free path. On the other hand,
   DRM buddy marks each block as cleared.

- Track the available cleared pages size

- If the driver requests cleared memory we prefer cleared memory,
   but fall back to uncleared if we can't find the cleared blocks.
   When the driver requests uncleared memory we try to use uncleared,
   but fall back to cleared memory if necessary.

- When a block gets freed we clear it and mark the freed block as cleared;
   when there are buddies which are cleared as well we can merge them.
   Otherwise, we prefer to keep the blocks separate.

- Add a function to support defragmentation.

v1:
   - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as
     cleared. Else, reset the clear flag for each block in the list(Christian)
   - For merging the 2 cleared blocks compare as below,
     drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian)
   - Defragment the memory beginning from min_order
     till the required memory space is available.

v2: (Matthew)
   - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks
     operation within drm buddy.
   - Write a macro block_incompatible() to allocate the required blocks.
   - Update the xe driver for the drm_buddy_free_list change in arguments.

   - add a warning if the two blocks are incompatible on
 defragmentation
   - call full defragmentation in the fini() function
   - place a condition to test if min_order is equal to 0
   - replace the list with safe_reverse() variant as we might
 remove the block from the list.

v3:
   - fix Gitlab user reported lockup issue.
   - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew)
   - modify to pass the root order instead max_order in fini()
 function(Matthew)
   - change bool 1 to true(Matthew)
   - add check if min_block_size is power of 2(Matthew)
   - modify the min_block_size datatype to u64(Matthew)

Signed-off-by: Arunpravin Paneer Selvam 


Signed-off-by: Matthew Auld 
Suggested-by: Christian König 
Suggested-by: Matthew Auld 


Is there a unit test for this? What about maybe something roughly like:

- Pick small random mm_size which is not always power-of-two.
- Allocate and free some random portion of the address space, 
returning those pages as non-dirty. Then do another cycle and randomly 
alternate between requesting dirty and non-dirty pages (not going over 
the limit you freed as non-dirty), putting that into two separate 
lists. Loop over both lists at the end checking that the dirty list is 
indeed all dirty pages and vice versa. Free it all again, keeping the 
dirty/clear status.
- Also try to go over the clear limit for some allocation. The 
allocation should never fail with reasonable page-size.
- Test the defrag/force_merge interface. Clean the mm or create new 
one. Intentionally fragment the address space, by creating two 
alternating lists. Free both lists, one as dirty the other as clean. 
Try to allocate double the previous size with matching min_page_size. 
Should fail. Call force_merge. Should now succeed. Also check that the 
page is always dirty after force_merge. Free the page as dirty, then 
repeat the whole thing, doubling the page-size until you hit max_order.
- Make sure we also call fini() with some part of the address space 
left as non-dirty. Should not trigger any warnings.


I think would be good to consider, but not a blocker or anything. Some 
comments below, otherwise I think looks good.

Yes. It is good to have a unit test for this feature. I will send the patch.
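
A compact way to cover the last point in that list, calling fini() with
some of the space left non-dirty (hypothetical test name, using only
the interfaces from the patch):

static void drm_test_buddy_fini_nondirty(struct kunit *test)
{
	struct drm_buddy mm;
	LIST_HEAD(blocks);

	KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, SZ_8M, SZ_4K));
	KUNIT_ASSERT_FALSE(test, drm_buddy_alloc_blocks(&mm, 0, SZ_8M, SZ_1M,
							SZ_4K, &blocks, 0));
	/* Hand part of the space back as non-dirty (cleared). */
	drm_buddy_free_list(&mm, &blocks, DRM_BUDDY_CLEARED);
	/* fini() has to force-merge mixed buddies without warning. */
	drm_buddy_fini(&mm);
}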



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |   6 +-
  drivers/gpu/drm/drm_buddy.c   | 294 +++---
  drivers/gpu/drm/i915/i915_ttm_buddy_manager.c |   6 +-
  drivers/gpu/drm/tests/drm_buddy_test.c    |  18 +-
  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c  |   4 +-
  include/drm/drm_buddy.h   |  22 +-
  6 files changed, 290 insertions(+), 60 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..c0c851409241 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 	return 0;
 
 error_free_blocks:
-	drm_buddy_free_list(mm, &vres->blocks);
+	drm_buddy_free_list(mm, &vres->blocks, 0);
 	mutex_unlock(&mgr->lock);
 error_fini:
 	ttm_resource_fini(man, &vres->base);
@@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man,
 
 	amdgpu_vram_mgr_do_reserve(man);
 
-	drm_buddy_free_list(mm, &vres->blocks);
+	drm_buddy_free_list(mm, &vres->blocks, 0);
 	mutex_unlock(&mgr->lock);
 
 	atomic64_sub(vis_usage, &mgr->vis_usage);

Re: [PATCH v8 1/3] drm/buddy: Implement tracking clear page feature

2024-03-06 Thread Paneer Selvam, Arunpravin

Hi Matthew,
Ping?

Thanks,
Arun.

On 3/4/2024 10:02 PM, Arunpravin Paneer Selvam wrote:

- Add tracking clear page feature.

- Driver should enable the DRM_BUDDY_CLEARED flag if it
   successfully clears the blocks in the free path. On the other hand,
   DRM buddy marks each block as cleared.

- Track the available cleared pages size

- If the driver requests cleared memory we prefer cleared memory,
   but fall back to uncleared if we can't find the cleared blocks.
   When the driver requests uncleared memory we try to use uncleared,
   but fall back to cleared memory if necessary.

- When a block gets freed we clear it and mark the freed block as cleared;
   when there are buddies which are cleared as well we can merge them.
   Otherwise, we prefer to keep the blocks separate.

- Add a function to support defragmentation.

v1:
   - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as
 cleared. Else, reset the clear flag for each block in the list(Christian)
   - For merging the 2 cleared blocks compare as below,
 drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian)
   - Defragment the memory beginning from min_order
 till the required memory space is available.

v2: (Matthew)
   - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks
 operation within drm buddy.
   - Write a macro block_incompatible() to allocate the required blocks.
   - Update the xe driver for the drm_buddy_free_list change in arguments.
   - add a warning if the two blocks are incompatible on
 defragmentation
   - call full defragmentation in the fini() function
   - place a condition to test if min_order is equal to 0
   - replace the list with safe_reverse() variant as we might
 remove the block from the list.

v3:
   - fix Gitlab user reported lockup issue.
   - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew)
   - modify to pass the root order instead max_order in fini()
 function(Matthew)
   - change bool 1 to true(Matthew)
   - add check if min_block_size is power of 2(Matthew)
   - modify the min_block_size datatype to u64(Matthew)

Signed-off-by: Arunpravin Paneer Selvam 
Signed-off-by: Matthew Auld 
Suggested-by: Christian König 
Suggested-by: Matthew Auld 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |   6 +-
  drivers/gpu/drm/drm_buddy.c   | 294 +++---
  drivers/gpu/drm/i915/i915_ttm_buddy_manager.c |   6 +-
  drivers/gpu/drm/tests/drm_buddy_test.c|  18 +-
  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c  |   4 +-
  include/drm/drm_buddy.h   |  22 +-
  6 files changed, 290 insertions(+), 60 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..c0c851409241 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 	return 0;
 
 error_free_blocks:
-	drm_buddy_free_list(mm, &vres->blocks);
+	drm_buddy_free_list(mm, &vres->blocks, 0);
 	mutex_unlock(&mgr->lock);
 error_fini:
 	ttm_resource_fini(man, &vres->base);
@@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man,
 
 	amdgpu_vram_mgr_do_reserve(man);
 
-	drm_buddy_free_list(mm, &vres->blocks);
+	drm_buddy_free_list(mm, &vres->blocks, 0);
 	mutex_unlock(&mgr->lock);
 
 	atomic64_sub(vis_usage, &mgr->vis_usage);
@@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
 	kfree(rsv);
 
 	list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) {
-		drm_buddy_free_list(&mgr->mm, &rsv->allocated);
+		drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0);
 		kfree(rsv);
 	}
 	if (!adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index c4222b886db7..40131ed9b0cd 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm,
 	__list_add(&block->link, node->link.prev, &node->link);
 }
 
+static void clear_reset(struct drm_buddy_block *block)
+{
+	block->header &= ~DRM_BUDDY_HEADER_CLEAR;
+}
+
+static void mark_cleared(struct drm_buddy_block *block)
+{
+	block->header |= DRM_BUDDY_HEADER_CLEAR;
+}
+
 static void mark_allocated(struct drm_buddy_block *block)
 {
 	block->header &= ~DRM_BUDDY_HEADER_STATE;
@@ -186,11 +196,21 @@ EXPORT_SYMBOL(drm_buddy_init);
  */
 void drm_buddy_fini(struct drm_buddy *mm)
 {
+	u64 root_size, size;
+	unsigned int order;
 	int i;
 
+	size = mm->size;
+
 	for (i = 0; i < mm->n_roots; ++i) {
+		order = ilog2(size) - ilog2(mm->chunk_size);
+		root_size = mm->chunk_size << order;
+		drm_bu
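
(Message truncated.) For a non-power-of-two mm, the per-root order math
above walks down the size like this (worked example by the editor, not
code from the patch):

	/* e.g. a 12MiB mm with 4KiB chunks has two roots */
	u64 size = SZ_8M + SZ_4M;

	/* pass 1: order = ilog2(3072) = 11 -> root_size = 4K << 11 = 8MiB */
	/* pass 2: order = ilog2(1024) = 10 -> root_size = 4K << 10 = 4MiB */
	unsigned int order = ilog2(size) - ilog2(SZ_4K);
	u64 root_size = SZ_4K << order;

	size -= root_size;	/* continue with the remaining 4MiB */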

Re: [PATCH v8 3/3] drm/buddy: Add user for defragmentation

2024-03-06 Thread Paneer Selvam, Arunpravin

Hi Christian,

On 3/5/2024 5:41 PM, Christian König wrote:

Am 05.03.24 um 12:14 schrieb Paneer Selvam, Arunpravin:

On 3/5/2024 4:33 PM, Paneer Selvam, Arunpravin wrote:

Hi Christian,

On 3/4/2024 10:09 PM, Christian König wrote:

Am 04.03.24 um 17:32 schrieb Arunpravin Paneer Selvam:

Add amdgpu driver as user for the drm buddy
defragmentation.

Signed-off-by: Arunpravin Paneer Selvam 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 +++--
  drivers/gpu/drm/drm_buddy.c  |  1 +
  2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index e494f5bf136a..cff8a526c622 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -533,8 +533,21 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 					   min_block_size,
 					   &vres->blocks,
 					   vres->flags);
-		if (unlikely(r))
-			goto error_free_blocks;
+		if (unlikely(r)) {
+			if (r == -ENOSPC) {
+				drm_buddy_defrag(mm, min_block_size);
+				r = drm_buddy_alloc_blocks(mm, fpfn,
+							   lpfn,
+							   size,
+							   min_block_size,
+							   &vres->blocks,
+							   vres->flags);


That doesn't look like something we should do.

We might fall back when contiguous memory is requested, but certainly
not on normal allocation failure.

Yes, defrag here is not useful for normal allocations, but I am worried
about normal allocations with a bigger min_block_size. In such cases, I
think we should move this drm_buddy_defrag() call into the buddy
allocator file. For example, if the required size is 1024KiB and
min_block_size is 256KiB, the allocator first tries to find a single
1024KiB block; when there is no single 1024KiB block, it goes one level
below in the freelist and searches for two 512KiB blocks, and so on. At
some point, if we have little space left, we may go further levels down
and search for four 256KiB blocks to satisfy the request.

If the allocator cannot find the first 256KiB block, I think we might
need to merge two 128KiB blocks through the defragmentation function.
And again, for the second 256KiB block, we might need to call the
defragmentation again to merge two 128KiB blocks or four 64KiB blocks
to form the minimum alignment size of 256KiB. This goes on for the
third and fourth 256KiB blocks to complete the required 1024KiB
allocation (the order arithmetic is sketched below). Please let me know
if my understanding is not correct.
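
To make that arithmetic concrete (a worked example by the editor, not
code from the series): with chunk_size = 4KiB and min_block_size =
256KiB, the smallest order the allocator may hand out is

	unsigned int min_order = ilog2(SZ_256K) - ilog2(SZ_4K);	/* 18 - 12 = 6 */

so a 1024KiB request needs at most four order-6 blocks, and a merge
step only ever has to rebuild order-6 blocks out of smaller free
buddies.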


I don't think we should do that. We essentially have to support two 
different use cases:


1. Non contiguous allocation with 2MiB min_block_size for everything 
larger than 2MiB. Using a block size as large as possible is 
desirable, but not something we enforce.


2. Contiguous allocations for display, firmware etc. Here we need to
enforce a large block size and can live with the additional overhead
caused by force merging (see the sketch below).
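
In terms of the allocator interface the two cases roughly look like
this (an editor's sketch; mm, size, r and blocks are assumed to come
from the caller):

	/* 1. Large non-contiguous allocation, 2MiB minimum block size. */
	r = drm_buddy_alloc_blocks(mm, 0, mm->size, size, SZ_2M,
				   &blocks, 0);

	/* 2. Contiguous allocation (display, firmware): a single block
	 * is enforced, so the overhead of force merging is acceptable.
	 */
	r = drm_buddy_alloc_blocks(mm, 0, mm->size, size, size,
				   &blocks, DRM_BUDDY_CONTIGUOUS_ALLOCATION);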

Thanks. I will make the changes accordingly.




As you have suggested we can also rename this as force merge or some 
other names.


Yeah, but just an suggestion. You are way deeper in the code and 
handling than I'm, so feel free to name it whatever you think fits best.

Sure :)

Thanks,
Arun.


Regards,
Christian.




Thanks,
Arun.


Thanks,
Arun.


Regards,
Christian.


+				if (unlikely(r))
+					goto error_free_blocks;
+			} else {
+				goto error_free_blocks;
+			}
+		}
 
 	if (size > remaining_size)
 		remaining_size = 0;
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 40131ed9b0cd..19440f8caec0 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -396,6 +396,7 @@ void drm_buddy_defrag(struct drm_buddy *mm,
  }
  }
  }
+EXPORT_SYMBOL(drm_buddy_defrag);
    /**
   * drm_buddy_free_block - free a block












Re: [PATCH v8 3/3] drm/buddy: Add user for defragmentation

2024-03-05 Thread Paneer Selvam, Arunpravin




On 3/5/2024 4:33 PM, Paneer Selvam, Arunpravin wrote:

Hi Christian,

On 3/4/2024 10:09 PM, Christian König wrote:

Am 04.03.24 um 17:32 schrieb Arunpravin Paneer Selvam:

Add amdgpu driver as user for the drm buddy
defragmentation.

Signed-off-by: Arunpravin Paneer Selvam 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 +++--
  drivers/gpu/drm/drm_buddy.c  |  1 +
  2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index e494f5bf136a..cff8a526c622 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -533,8 +533,21 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 					   min_block_size,
 					   &vres->blocks,
 					   vres->flags);
-		if (unlikely(r))
-			goto error_free_blocks;
+		if (unlikely(r)) {
+			if (r == -ENOSPC) {
+				drm_buddy_defrag(mm, min_block_size);
+				r = drm_buddy_alloc_blocks(mm, fpfn,
+							   lpfn,
+							   size,
+							   min_block_size,
+							   &vres->blocks,
+							   vres->flags);


That doesn't look like something we should do.

We might fall back when contiguous memory is requested, but certainly
not on normal allocation failure.

Yes, defrag here is not useful for normal allocations, but I am worried
about normal allocations with a bigger min_block_size. In such cases, I
think we should move this drm_buddy_defrag() call into the buddy
allocator file. For example, if the required size is 1024KiB and
min_block_size is 256KiB, the allocator first tries to find a single
1024KiB block; when there is no single 1024KiB block, it goes one level
below in the freelist and searches for two 512KiB blocks, and so on. At
some point, if we have little space left, we may go further levels down
and search for four 256KiB blocks to satisfy the request.

If the allocator cannot find the first 256KiB block, I think we might
need to merge two 128KiB blocks through the defragmentation function.
And again, for the second 256KiB block, we might need to call the
defragmentation again to merge two 128KiB blocks or four 64KiB blocks
to form the minimum alignment size of 256KiB. This goes on for the
third and fourth 256KiB blocks to complete the required 1024KiB
allocation. Please let me know if my understanding is not correct.


As you have suggested we can also rename this as force merge or some 
other names.


Thanks,
Arun.


Thanks,
Arun.


Regards,
Christian.


+    if (unlikely(r))
+    goto error_free_blocks;
+    } else {
+    goto error_free_blocks;
+    }
+    }
    if (size > remaining_size)
  remaining_size = 0;
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 40131ed9b0cd..19440f8caec0 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -396,6 +396,7 @@ void drm_buddy_defrag(struct drm_buddy *mm,
  }
  }
  }
+EXPORT_SYMBOL(drm_buddy_defrag);
    /**
   * drm_buddy_free_block - free a block








Re: [PATCH v8 3/3] drm/buddy: Add user for defragmentation

2024-03-05 Thread Paneer Selvam, Arunpravin

Hi Christian,

On 3/4/2024 10:09 PM, Christian König wrote:

Am 04.03.24 um 17:32 schrieb Arunpravin Paneer Selvam:

Add amdgpu driver as user for the drm buddy
defragmentation.

Signed-off-by: Arunpravin Paneer Selvam 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 +++--
  drivers/gpu/drm/drm_buddy.c  |  1 +
  2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c

index e494f5bf136a..cff8a526c622 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -533,8 +533,21 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,

 min_block_size,
 &vres->blocks,
 vres->flags);
-    if (unlikely(r))
-    goto error_free_blocks;
+    if (unlikely(r)) {
+    if (r == -ENOSPC) {
+    drm_buddy_defrag(mm, min_block_size);
+    r = drm_buddy_alloc_blocks(mm, fpfn,
+   lpfn,
+   size,
+   min_block_size,
+   &vres->blocks,
+   vres->flags);


That doesn't look like something we should do.

We might fall back when contiguous memory is requested, but certainly 
not on normal allocation failure.
Yes, defrag here is not useful for normal allocations. But I am worried 
about normal allocations with a bigger min_block_size. In such cases, I 
think we should move this drm_buddy_defrag() call into the buddy 
allocator file. For example, if the required size is 1024KiB and 
min_block_size is 256KiB, the allocator first tries to find a single 
1024KiB block; when there is no 1024KiB block, it goes one level down 
in the freelist and searches for two 512KiB blocks, and so on. At some 
point, if we have little space left, we may go further levels down and 
search for four 256KiB blocks to satisfy the request.


If the allocator cannot even find the first 256KiB block, I think we 
then need to merge two 128KiB blocks through the defragmentation 
function. And again, for the second 256KiB block, we might need to call 
the defragmentation function once more to merge two 128KiB blocks or 
four 64KiB blocks into the minimum alignment size of 256KiB. The same 
goes for the third and fourth 256KiB blocks until the required 1024KiB 
allocation is complete. Please let me know if my understanding is 
incorrect.


Thanks,
Arun.


Regards,
Christian.


+    if (unlikely(r))
+    goto error_free_blocks;
+    } else {
+    goto error_free_blocks;
+    }
+    }
    if (size > remaining_size)
  remaining_size = 0;
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 40131ed9b0cd..19440f8caec0 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -396,6 +396,7 @@ void drm_buddy_defrag(struct drm_buddy *mm,
  }
  }
  }
+EXPORT_SYMBOL(drm_buddy_defrag);
    /**
   * drm_buddy_free_block - free a block






[PATCH v8 2/3] drm/amdgpu: Enable clear page functionality

2024-03-04 Thread Arunpravin Paneer Selvam
Add clear page support in vram memory region.

v1(Christian):
  - Don't handle clear page as a TTM flag since when moving the BO back
in from GTT again we don't need that.
  - Make a specialized version of amdgpu_fill_buffer() which only
clears the VRAM areas which are not already cleared
  - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in
amdgpu_object.c

v2:
  - Modify the function name amdgpu_ttm_* (Alex)
  - Drop the delayed parameter (Christian)
  - handle amdgpu_res_cleared() just above the size
calculation (Christian)
  - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers
in the free path to properly wait for fences etc.. (Christian)

v3(Christian):
  - Remove buffer clear code in VRAM manager instead change the
AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set
the DRM_BUDDY_CLEARED flag.
  - Remove ! from amdgpu_res_cleared() check.

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Christian König 
Acked-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c| 22 ---
 .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h| 25 
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   | 61 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h   |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |  6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h  |  5 ++
 6 files changed, 111 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8bc79924d171..c92d92b28a57 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -39,6 +39,7 @@
 #include "amdgpu.h"
 #include "amdgpu_trace.h"
 #include "amdgpu_amdkfd.h"
+#include "amdgpu_vram_mgr.h"
 
 /**
  * DOC: amdgpu_object
@@ -601,8 +602,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
if (!amdgpu_bo_support_uswc(bo->flags))
bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC;
 
-   if (adev->ras_enabled)
-   bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
+   bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
 
	bo->tbo.bdev = &adev->mman.bdev;
if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA |
@@ -632,15 +632,17 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
 
if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED &&
bo->tbo.resource->mem_type == TTM_PL_VRAM) {
-   struct dma_fence *fence;
+   struct dma_fence *fence = NULL;
 
-   r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, &fence, true);
+   r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, &fence);
if (unlikely(r))
goto fail_unreserve;
 
-   dma_resv_add_fence(bo->tbo.base.resv, fence,
-  DMA_RESV_USAGE_KERNEL);
-   dma_fence_put(fence);
+   if (fence) {
+   dma_resv_add_fence(bo->tbo.base.resv, fence,
+  DMA_RESV_USAGE_KERNEL);
+   dma_fence_put(fence);
+   }
}
if (!bp->resv)
amdgpu_bo_unreserve(bo);
@@ -1365,8 +1367,12 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object 
*bo)
if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
return;
 
-   r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, &fence, true);
+   r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true);
if (!WARN_ON(r)) {
+   struct amdgpu_vram_mgr_resource *vres;
+
+   vres = to_amdgpu_vram_mgr_resource(bo->resource);
+   vres->flags |= DRM_BUDDY_CLEARED;
amdgpu_bo_fence(abo, fence, false);
dma_fence_put(fence);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
index 381101d2bf05..50fcd86e1033 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
@@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct 
amdgpu_res_cursor *cur, uint64_t size)
}
 }
 
+/**
+ * amdgpu_res_cleared - check if blocks are cleared
+ *
+ * @cur: the cursor to extract the block
+ *
+ * Check if the @cur block is cleared
+ */
+static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur)
+{
+   struct drm_buddy_block *block;
+
+   switch (cur->mem_type) {
+   case TTM_PL_VRAM:
+   block = cur->node;
+
+   if (!amdgpu_vram_mgr_is_cleared(block))
+   return false;
+   break;
+   default:
+   return false;
+   }
+
+   return true;
+}
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/am
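
As an aside, a toy, self-contained model of how a cleared check like 
amdgpu_res_cleared() above is meant to be used per the commit message: 
only ranges that are not already cleared get a fill submitted (my 
illustration, not code from the patch):

/* skip_cleared.c - models a clear pass that skips already-cleared ranges. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_range {
	uint64_t start;
	uint64_t size;
	bool cleared;	/* stand-in for amdgpu_res_cleared() */
};

int main(void)
{
	const struct toy_range res[] = {
		{ 0x000000, 1 << 20, true  },	/* freed and already wiped */
		{ 0x100000, 2 << 20, false },	/* dirty, needs a fill job */
		{ 0x300000, 1 << 20, true  },
	};
	uint64_t filled = 0;
	size_t i;

	for (i = 0; i < sizeof(res) / sizeof(res[0]); i++) {
		if (res[i].cleared)
			continue;	/* nothing to do for this range */
		printf("submit clear for [0x%llx, 0x%llx)\n",
		       (unsigned long long)res[i].start,
		       (unsigned long long)(res[i].start + res[i].size));
		filled += res[i].size;
	}
	printf("cleared %lluMiB instead of the full 4MiB\n",
	       (unsigned long long)(filled >> 20));
	return 0;
}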

[PATCH v8 3/3] drm/buddy: Add user for defragmentation

2024-03-04 Thread Arunpravin Paneer Selvam
Add amdgpu driver as user for the drm buddy
defragmentation.

Signed-off-by: Arunpravin Paneer Selvam 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 +++--
 drivers/gpu/drm/drm_buddy.c  |  1 +
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index e494f5bf136a..cff8a526c622 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -533,8 +533,21 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
   min_block_size,
   &vres->blocks,
   vres->flags);
-   if (unlikely(r))
-   goto error_free_blocks;
+   if (unlikely(r)) {
+   if (r == -ENOSPC) {
+   drm_buddy_defrag(mm, min_block_size);
+   r = drm_buddy_alloc_blocks(mm, fpfn,
+  lpfn,
+  size,
+  min_block_size,
+  &vres->blocks,
+  vres->flags);
+   if (unlikely(r))
+   goto error_free_blocks;
+   } else {
+   goto error_free_blocks;
+   }
+   }
 
if (size > remaining_size)
remaining_size = 0;
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 40131ed9b0cd..19440f8caec0 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -396,6 +396,7 @@ void drm_buddy_defrag(struct drm_buddy *mm,
}
}
 }
+EXPORT_SYMBOL(drm_buddy_defrag);
 
 /**
  * drm_buddy_free_block - free a block
-- 
2.25.1



[PATCH v8 1/3] drm/buddy: Implement tracking clear page feature

2024-03-04 Thread Arunpravin Paneer Selvam
- Add tracking clear page feature.

- Driver should enable the DRM_BUDDY_CLEARED flag if it
  successfully clears the blocks in the free path. On the other hand,
  DRM buddy marks each block as cleared.

- Track the available cleared pages size

- If the driver requests cleared memory we prefer cleared memory,
  but fall back to uncleared if we can't find cleared blocks. When the
  driver requests uncleared memory we try to use uncleared, but fall
  back to cleared memory if necessary.

- When a block gets freed we clear it and mark the freed block as cleared,
  when there are buddies which are cleared as well we can merge them.
  Otherwise, we prefer to keep the blocks as separated.

- Add a function to support defragmentation.

v1:
  - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as
cleared. Else, reset the clear flag for each block in the list(Christian)
  - For merging the 2 cleared blocks compare as below,
drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian)
  - Defragment the memory beginning from min_order
till the required memory space is available.

v2: (Matthew)
  - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks
operation within drm buddy.
  - Write a macro block_incompatible() to allocate the required blocks.
  - Update the xe driver for the drm_buddy_free_list change in arguments.
  - add a warning if the two blocks are incompatible on
defragmentation
  - call full defragmentation in the fini() function
  - place a condition to test if min_order is equal to 0
  - replace the list with safe_reverse() variant as we might
remove the block from the list.

v3:
  - fix a lockup issue reported by a GitLab user.
  - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew)
  - modify to pass the root order instead max_order in fini()
function(Matthew)
  - change bool 1 to true(Matthew)
  - add check if min_block_size is power of 2(Matthew)
  - modify the min_block_size datatype to u64(Matthew)

Signed-off-by: Arunpravin Paneer Selvam 
Signed-off-by: Matthew Auld 
Suggested-by: Christian König 
Suggested-by: Matthew Auld 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |   6 +-
 drivers/gpu/drm/drm_buddy.c   | 294 +++---
 drivers/gpu/drm/i915/i915_ttm_buddy_manager.c |   6 +-
 drivers/gpu/drm/tests/drm_buddy_test.c|  18 +-
 drivers/gpu/drm/xe/xe_ttm_vram_mgr.c  |   4 +-
 include/drm/drm_buddy.h   |  22 +-
 6 files changed, 290 insertions(+), 60 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..c0c851409241 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
return 0;
 
 error_free_blocks:
-   drm_buddy_free_list(mm, &vres->blocks);
+   drm_buddy_free_list(mm, &vres->blocks, 0);
	mutex_unlock(&mgr->lock);
 error_fini:
ttm_resource_fini(man, >base);
@@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager 
*man,
 
amdgpu_vram_mgr_do_reserve(man);
 
-   drm_buddy_free_list(mm, &vres->blocks);
+   drm_buddy_free_list(mm, &vres->blocks, 0);
	mutex_unlock(&mgr->lock);
 
	atomic64_sub(vis_usage, &mgr->vis_usage);
@@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
kfree(rsv);
 
	list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) {
-   drm_buddy_free_list(&mgr->mm, &rsv->allocated);
+   drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0);
kfree(rsv);
}
if (!adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index c4222b886db7..40131ed9b0cd 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm,
	__list_add(&block->link, node->link.prev, &node->link);
 }
 
+static void clear_reset(struct drm_buddy_block *block)
+{
+   block->header &= ~DRM_BUDDY_HEADER_CLEAR;
+}
+
+static void mark_cleared(struct drm_buddy_block *block)
+{
+   block->header |= DRM_BUDDY_HEADER_CLEAR;
+}
+
 static void mark_allocated(struct drm_buddy_block *block)
 {
block->header &= ~DRM_BUDDY_HEADER_STATE;
@@ -186,11 +196,21 @@ EXPORT_SYMBOL(drm_buddy_init);
  */
 void drm_buddy_fini(struct drm_buddy *mm)
 {
+   u64 root_size, size;
+   unsigned int order;
int i;
 
+   size = mm->size;
+
for (i = 0; i < mm->n_roots; ++i) {
+   order = ilog2(size) - ilog2(mm->chunk_size);
+   root_size = mm->chunk_size << order;
+   drm_buddy_defrag(mm, root_size);
+
WARN_ON(!drm_buddy_block_is_free(mm->roots[i]));
drm_block_free(mm, mm->roots[i]);
+
+

Re: [PATCH v7 3/3] drm/buddy: Add defragmentation support

2024-03-04 Thread Paneer Selvam, Arunpravin

Hi Matthew,

On 2/22/2024 12:12 AM, Matthew Auld wrote:

On 21/02/2024 12:18, Arunpravin Paneer Selvam wrote:

Add a function to support defragmentation.

v1:
   - Defragment the memory beginning from min_order
 till the required memory space is available.

v2(Matthew):
   - add amdgpu user for defragmentation
   - add a warning if the two blocks are incompatible on
 defragmentation
   - call full defragmentation in the fini() function
   - place a condition to test if min_order is equal to 0
   - replace the list with safe_reverse() variant as we might
 remove the block from the list.

Signed-off-by: Arunpravin Paneer Selvam 


Suggested-by: Matthew Auld 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 +++-
  drivers/gpu/drm/drm_buddy.c  | 93 +---
  include/drm/drm_buddy.h  |  3 +
  3 files changed, 97 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c

index e494f5bf136a..cff8a526c622 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -533,8 +533,21 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,

 min_block_size,
 &vres->blocks,
 vres->flags);
-    if (unlikely(r))
-    goto error_free_blocks;
+    if (unlikely(r)) {
+    if (r == -ENOSPC) {
+    drm_buddy_defrag(mm, min_block_size);
+    r = drm_buddy_alloc_blocks(mm, fpfn,
+   lpfn,
+   size,
+   min_block_size,
+   &vres->blocks,
+   vres->flags);
+    if (unlikely(r))
+    goto error_free_blocks;
+    } else {
+    goto error_free_blocks;
+    }
+    }
    if (size > remaining_size)
  remaining_size = 0;
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 18e004fa39d3..56bd1560fbcd 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -203,6 +203,8 @@ void drm_buddy_fini(struct drm_buddy *mm)
  drm_block_free(mm, mm->roots[i]);
  }
  +    drm_buddy_defrag(mm, mm->chunk_size << mm->max_order);


I think this needs to be called higher up, otherwise we blow up with 
the WARN, plus we just freed the root(s). There is also the case of a 
non-power-of-two VRAM size, in which case you get multiple roots and 
max_order is just the largest root, not the entire address space. I 
guess do this in the loop above and use the root order instead?
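
For reference, a minimal sketch of that multi-root decomposition (my 
illustration under an assumed 4KiB chunk_size, mirroring what 
drm_buddy_init() does for a non-power-of-two size):

/* roots.c - peel off the largest power-of-two root that fits, repeat. */
#include <stdio.h>
#include <stdint.h>

static unsigned int ilog2_u64(uint64_t v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	const uint64_t chunk_size = 4096;	/* assumed 4KiB */
	uint64_t size = 768ULL << 20;		/* 768MiB, not a power of two */

	while (size >= chunk_size) {
		unsigned int order = ilog2_u64(size) - ilog2_u64(chunk_size);

		printf("root of order %u (%lluMiB)\n", order,
		       (unsigned long long)((chunk_size << order) >> 20));
		size -= chunk_size << order;
	}
	return 0;
}

For 768MiB this yields a 512MiB root (order 17) and a 256MiB root 
(order 16), so max_order only describes the largest root, not the whole 
address space.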


Also this should be done as part of the first patch and then in this 
patch it is just a case of exporting it. Every commit should ideally 
be functional by itself.
You mean we should move the above change in the drm_buddy_fini() 
function, together with the drm_buddy_defrag() function, into the first 
patch, and in this patch just add the export and the amdgpu user. Is my 
understanding correct?


Thanks,
Arun.



+
  WARN_ON(mm->avail != mm->size);
    kfree(mm->roots);
@@ -276,25 +278,39 @@ drm_get_buddy(struct drm_buddy_block *block)
  }
  EXPORT_SYMBOL(drm_get_buddy);
  -static void __drm_buddy_free(struct drm_buddy *mm,
- struct drm_buddy_block *block)
+static unsigned int __drm_buddy_free(struct drm_buddy *mm,
+ struct drm_buddy_block *block,
+ bool defrag)
  {
+    unsigned int order, block_order;
  struct drm_buddy_block *parent;
  +    block_order = drm_buddy_block_order(block);
+
  while ((parent = block->parent)) {
-    struct drm_buddy_block *buddy;
+    struct drm_buddy_block *buddy = NULL;
    buddy = __get_buddy(block);
    if (!drm_buddy_block_is_free(buddy))
  break;
  -    if (drm_buddy_block_is_clear(block) !=
-    drm_buddy_block_is_clear(buddy))
-    break;
+    if (!defrag) {
+    /*
+ * Check the block and its buddy clear state and exit
+ * the loop if they both have the dissimilar state.
+ */
+    if (drm_buddy_block_is_clear(block) !=
+    drm_buddy_block_is_clear(buddy))
+    break;
  -    if (drm_buddy_block_is_clear(block))
-    mark_cleared(parent);
+    if (drm_buddy_block_is_clear(block))
+    mark_cleared(parent);
+    }
+
+    WARN_ON(defrag &&
+    (drm_buddy_block_is_clear(block) ==
+ drm_buddy_block_is_clear(buddy)));
    list_del(&buddy->link);
  @@ -304,8 +320,57 @@ static void __drm_buddy_free(struct drm_buddy 
*mm,

  block = parent;
  }
  -    mark_free(mm, block);
+    order = drm_buddy_block_order(block);
+    if (block_order != orde

Re: [PATCH v7 3/3] drm/buddy: Add defragmentation support

2024-02-22 Thread Arunpravin Paneer Selvam

Hi Christian,

On 2/21/2024 7:58 PM, Christian König wrote:

Am 21.02.24 um 13:18 schrieb Arunpravin Paneer Selvam:

Add a function to support defragmentation.


Thinking more about it, maybe you want to call this function differently.

Essentially we are force merging pages even if their cleared flag 
doesn't match; that is something different from defragmentation, I think.

Maybe rename it to force_merge or something similar. Not mandatory, 
just an idea to improve the readability of the code.
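
A toy model of that distinction (a sketch, not code from the series): 
the regular free path refuses to merge buddies whose clear state 
differs, while the force-merge path ignores that check:

/* merge_rule.c - models the buddy merge decision discussed above. */
#include <stdbool.h>
#include <stdio.h>

static bool may_merge(bool block_clear, bool buddy_clear, bool force)
{
	/* The regular free path keeps cleared and dirty buddies apart so
	 * the cleared-memory accounting stays exact; the force path trades
	 * that precision for a larger contiguous block. */
	return force || block_clear == buddy_clear;
}

int main(void)
{
	printf("free path,  clear vs dirty: %s\n",
	       may_merge(true, false, false) ? "merge" : "keep split");
	printf("force path, clear vs dirty: %s\n",
	       may_merge(true, false, true) ? "merge" : "keep split");
	printf("free path,  both cleared:   %s\n",
	       may_merge(true, true, false) ? "merge" : "keep split");
	return 0;
}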


Apart from that just let me know when I can push it to drm-misc-next.

Christian.
I am fixing a few issues reported by a user and addressing Matthew's 
suggestions on the v7 version. I will post v8 incorporating all the 
changes. I hope we can then push v8 into drm-misc-next.

Thanks,
Arun.




v1:
   - Defragment the memory beginning from min_order
 till the required memory space is available.

v2(Matthew):
   - add amdgpu user for defragmentation
   - add a warning if the two blocks are incompatible on
 defragmentation
   - call full defragmentation in the fini() function
   - place a condition to test if min_order is equal to 0
   - replace the list with safe_reverse() variant as we might
 remove the block from the list.

Signed-off-by: Arunpravin Paneer Selvam 


Suggested-by: Matthew Auld 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 +++-
  drivers/gpu/drm/drm_buddy.c  | 93 +---
  include/drm/drm_buddy.h  |  3 +
  3 files changed, 97 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c

index e494f5bf136a..cff8a526c622 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -533,8 +533,21 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,

 min_block_size,
 &vres->blocks,
 vres->flags);
-    if (unlikely(r))
-    goto error_free_blocks;
+    if (unlikely(r)) {
+    if (r == -ENOSPC) {
+    drm_buddy_defrag(mm, min_block_size);
+    r = drm_buddy_alloc_blocks(mm, fpfn,
+   lpfn,
+   size,
+   min_block_size,
+   &vres->blocks,
+   vres->flags);
+    if (unlikely(r))
+    goto error_free_blocks;
+    } else {
+    goto error_free_blocks;
+    }
+    }
    if (size > remaining_size)
  remaining_size = 0;
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 18e004fa39d3..56bd1560fbcd 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -203,6 +203,8 @@ void drm_buddy_fini(struct drm_buddy *mm)
  drm_block_free(mm, mm->roots[i]);
  }
  +    drm_buddy_defrag(mm, mm->chunk_size << mm->max_order);
+
  WARN_ON(mm->avail != mm->size);
    kfree(mm->roots);
@@ -276,25 +278,39 @@ drm_get_buddy(struct drm_buddy_block *block)
  }
  EXPORT_SYMBOL(drm_get_buddy);
  -static void __drm_buddy_free(struct drm_buddy *mm,
- struct drm_buddy_block *block)
+static unsigned int __drm_buddy_free(struct drm_buddy *mm,
+ struct drm_buddy_block *block,
+ bool defrag)
  {
+    unsigned int order, block_order;
  struct drm_buddy_block *parent;
  +    block_order = drm_buddy_block_order(block);
+
  while ((parent = block->parent)) {
-    struct drm_buddy_block *buddy;
+    struct drm_buddy_block *buddy = NULL;
    buddy = __get_buddy(block);
    if (!drm_buddy_block_is_free(buddy))
  break;
  -    if (drm_buddy_block_is_clear(block) !=
-    drm_buddy_block_is_clear(buddy))
-    break;
+    if (!defrag) {
+    /*
+ * Check the block and its buddy clear state and exit
+ * the loop if they both have the dissimilar state.
+ */
+    if (drm_buddy_block_is_clear(block) !=
+    drm_buddy_block_is_clear(buddy))
+    break;
  -    if (drm_buddy_block_is_clear(block))
-    mark_cleared(parent);
+    if (drm_buddy_block_is_clear(block))
+    mark_cleared(parent);
+    }
+
+    WARN_ON(defrag &&
+    (drm_buddy_block_is_clear(block) ==
+ drm_buddy_block_is_clear(buddy)));
    list_del(&buddy->link);
  @@ -304,8 +320,57 @@ static void __drm_buddy_free(struct drm_buddy 
*mm,

  block = parent;
  }
  -    mark_free(mm, block);
+    order = drm_buddy_block_order(block);
+    if (block_order != order)
+    mark_free(mm, block);
+
+    return order;
+}
+
+/**
+ * drm_buddy_defrag - Defragmentation routi

Re: [PATCH v6 1/3] drm/buddy: Implement tracking clear page feature

2024-02-21 Thread Paneer Selvam, Arunpravin



On 2/16/2024 5:33 PM, Matthew Auld wrote:

On 08/02/2024 15:49, Arunpravin Paneer Selvam wrote:

- Add tracking clear page feature.

- Driver should enable the DRM_BUDDY_CLEARED flag if it
   successfully clears the blocks in the free path. On the other hand,
   DRM buddy marks each block as cleared.

- Track the available cleared pages size

- If the driver requests cleared memory we prefer cleared memory,
   but fall back to uncleared if we can't find cleared blocks. When the
   driver requests uncleared memory we try to use uncleared, but fall
   back to cleared memory if necessary.

- When a block gets freed we clear it and mark the freed block as cleared,
   when there are buddies which are cleared as well we can merge them.
   Otherwise, we prefer to keep the blocks as separated.

v1: (Christian)
   - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as
 cleared. Else, reset the clear flag for each block in the list.

   - For merging the 2 cleared blocks compare as below,
 drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)

v2: (Matthew)
   - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks
     operation within drm buddy.
   - Write a macro block_incompatible() to allocate the required blocks.
   - Update the xe driver for the drm_buddy_free_list change in arguments.


Signed-off-by: Arunpravin Paneer Selvam 


Signed-off-by: Matthew Auld 
Suggested-by: Christian König 


Probably needs a new unit test.

I think we are missing something to forcefully re-merge everything at 
fini()? In theory we can just call the defrag routine. Otherwise we 
might trigger various warnings since the root(s) might still be split.


Also one nit below. Otherwise I think looks good.


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |   6 +-
  drivers/gpu/drm/drm_buddy.c   | 192 ++
  drivers/gpu/drm/i915/i915_ttm_buddy_manager.c |   6 +-
  drivers/gpu/drm/tests/drm_buddy_test.c    |  10 +-
  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c  |   4 +-
  include/drm/drm_buddy.h   |  18 +-
  6 files changed, 187 insertions(+), 49 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c

index 8db880244324..c0c851409241 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,

  return 0;
    error_free_blocks:
-    drm_buddy_free_list(mm, &vres->blocks);
+    drm_buddy_free_list(mm, &vres->blocks, 0);
  mutex_unlock(&mgr->lock);
  error_fini:
  ttm_resource_fini(man, >base);
@@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct 
ttm_resource_manager *man,

    amdgpu_vram_mgr_do_reserve(man);
  -    drm_buddy_free_list(mm, &vres->blocks);
+    drm_buddy_free_list(mm, &vres->blocks, 0);
  mutex_unlock(&mgr->lock);
    atomic64_sub(vis_usage, &mgr->vis_usage);
@@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device 
*adev)

  kfree(rsv);
    list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) {
-    drm_buddy_free_list(&mgr->mm, &rsv->allocated);
+    drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0);
  kfree(rsv);
  }
  if (!adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index f57e6d74fb0e..33ad0cfbd54c 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm,
  __list_add(&block->link, node->link.prev, &node->link);
  }
  +static void clear_reset(struct drm_buddy_block *block)
+{
+    block->header &= ~DRM_BUDDY_HEADER_CLEAR;
+}
+
+static void mark_cleared(struct drm_buddy_block *block)
+{
+    block->header |= DRM_BUDDY_HEADER_CLEAR;
+}
+
  static void mark_allocated(struct drm_buddy_block *block)
  {
  block->header &= ~DRM_BUDDY_HEADER_STATE;
@@ -223,6 +233,12 @@ static int split_block(struct drm_buddy *mm,
  mark_free(mm, block->left);
  mark_free(mm, block->right);
  +    if (drm_buddy_block_is_clear(block)) {
+    mark_cleared(block->left);
+    mark_cleared(block->right);
+    clear_reset(block);
+    }
+
  mark_split(block);
    return 0;
@@ -273,6 +289,13 @@ static void __drm_buddy_free(struct drm_buddy *mm,
  if (!drm_buddy_block_is_free(buddy))
  break;
  +    if (drm_buddy_block_is_clear(block) !=
+    drm_buddy_block_is_clear(buddy))
+    break;
+
+    if (drm_buddy_block_is_clear(block))
+    mark_cleared(parent);
+
  list_del(&buddy->link);
    drm_block_free(mm, block);
@@ -295,26 +318,61 @@ void drm_buddy_free_block(struct drm_buddy *mm,
  {
  BUG_ON(!drm_buddy_block_is_allocated(block));
  mm->avail += drm_buddy_block_size(mm, block);
+    if (drm_buddy_block_is_clear(block))

Re: [PATCH v6 1/3] drm/buddy: Implement tracking clear page feature

2024-02-21 Thread Paneer Selvam, Arunpravin



On 2/16/2024 5:33 PM, Matthew Auld wrote:

On 08/02/2024 15:49, Arunpravin Paneer Selvam wrote:

- Add tracking clear page feature.

- Driver should enable the DRM_BUDDY_CLEARED flag if it
   successfully clears the blocks in the free path. On the other hand,
   DRM buddy marks each block as cleared.

- Track the available cleared pages size

- If the driver requests cleared memory we prefer cleared memory,
   but fall back to uncleared if we can't find cleared blocks. When the
   driver requests uncleared memory we try to use uncleared, but fall
   back to cleared memory if necessary.

- When a block gets freed we clear it and mark the freed block as cleared,
   when there are buddies which are cleared as well we can merge them.
   Otherwise, we prefer to keep the blocks as separated.

v1: (Christian)
   - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as
 cleared. Else, reset the clear flag for each block in the list.

   - For merging the 2 cleared blocks compare as below,
 drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)

v2: (Matthew)
   - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks
     operation within drm buddy.
   - Write a macro block_incompatible() to allocate the required blocks.
   - Update the xe driver for the drm_buddy_free_list change in arguments.


Signed-off-by: Arunpravin Paneer Selvam 


Signed-off-by: Matthew Auld 
Suggested-by: Christian König 


Probably needs a new unit test.

Sure, I am working on it. I will send it as a separate patch.


I think we are missing something to forcefully re-merge everything at 
fini()? In theory we can just call the defrag routine. Otherwise we 
might trigger various warnings since the root(s) might still be split.


I have added the full defrag in the fini() function. Please review 
patch 3.


Thanks,

Arun.



Also one nit below. Otherwise I think looks good.


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |   6 +-
  drivers/gpu/drm/drm_buddy.c   | 192 ++
  drivers/gpu/drm/i915/i915_ttm_buddy_manager.c |   6 +-
  drivers/gpu/drm/tests/drm_buddy_test.c    |  10 +-
  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c  |   4 +-
  include/drm/drm_buddy.h   |  18 +-
  6 files changed, 187 insertions(+), 49 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c

index 8db880244324..c0c851409241 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man,

  return 0;
    error_free_blocks:
-    drm_buddy_free_list(mm, &vres->blocks);
+    drm_buddy_free_list(mm, &vres->blocks, 0);
  mutex_unlock(&mgr->lock);
  error_fini:
  ttm_resource_fini(man, >base);
@@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct 
ttm_resource_manager *man,

    amdgpu_vram_mgr_do_reserve(man);
  -    drm_buddy_free_list(mm, &vres->blocks);
+    drm_buddy_free_list(mm, &vres->blocks, 0);
  mutex_unlock(&mgr->lock);
    atomic64_sub(vis_usage, &mgr->vis_usage);
@@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device 
*adev)

  kfree(rsv);
    list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) {
-    drm_buddy_free_list(&mgr->mm, &rsv->allocated);
+    drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0);
  kfree(rsv);
  }
  if (!adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index f57e6d74fb0e..33ad0cfbd54c 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm,
  __list_add(&block->link, node->link.prev, &node->link);
  }
  +static void clear_reset(struct drm_buddy_block *block)
+{
+    block->header &= ~DRM_BUDDY_HEADER_CLEAR;
+}
+
+static void mark_cleared(struct drm_buddy_block *block)
+{
+    block->header |= DRM_BUDDY_HEADER_CLEAR;
+}
+
  static void mark_allocated(struct drm_buddy_block *block)
  {
  block->header &= ~DRM_BUDDY_HEADER_STATE;
@@ -223,6 +233,12 @@ static int split_block(struct drm_buddy *mm,
  mark_free(mm, block->left);
  mark_free(mm, block->right);
  +    if (drm_buddy_block_is_clear(block)) {
+    mark_cleared(block->left);
+    mark_cleared(block->right);
+    clear_reset(block);
+    }
+
  mark_split(block);
    return 0;
@@ -273,6 +289,13 @@ static void __drm_buddy_free(struct drm_buddy *mm,
  if (!drm_buddy_block_is_free(buddy))
  break;
  +    if (drm_buddy_block_is_clear(block) !=
+    drm_buddy_block_is_clear(buddy))
+    break;
+
+    if (drm_buddy_block_is_clear(block))
+    mark_cleared(parent);
+
  list_del(&buddy->link);
    drm_block_free(mm, block);
@@ -295,26 +318,61 @@ void drm_buddy_free_block(str

[PATCH v7 3/3] drm/buddy: Add defragmentation support

2024-02-21 Thread Arunpravin Paneer Selvam
Add a function to support defragmentation.

v1:
  - Defragment the memory beginning from min_order
till the required memory space is available.

v2(Matthew):
  - add amdgpu user for defragmentation
  - add a warning if the two blocks are incompatible on
defragmentation
  - call full defragmentation in the fini() function
  - place a condition to test if min_order is equal to 0
  - replace the list with safe_reverse() variant as we might
remove the block from the list.

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Matthew Auld 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 +++-
 drivers/gpu/drm/drm_buddy.c  | 93 +---
 include/drm/drm_buddy.h  |  3 +
 3 files changed, 97 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index e494f5bf136a..cff8a526c622 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -533,8 +533,21 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
   min_block_size,
   &vres->blocks,
   vres->flags);
-   if (unlikely(r))
-   goto error_free_blocks;
+   if (unlikely(r)) {
+   if (r == -ENOSPC) {
+   drm_buddy_defrag(mm, min_block_size);
+   r = drm_buddy_alloc_blocks(mm, fpfn,
+  lpfn,
+  size,
+  min_block_size,
+  &vres->blocks,
+  vres->flags);
+   if (unlikely(r))
+   goto error_free_blocks;
+   } else {
+   goto error_free_blocks;
+   }
+   }
 
if (size > remaining_size)
remaining_size = 0;
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 18e004fa39d3..56bd1560fbcd 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -203,6 +203,8 @@ void drm_buddy_fini(struct drm_buddy *mm)
drm_block_free(mm, mm->roots[i]);
}
 
+   drm_buddy_defrag(mm, mm->chunk_size << mm->max_order);
+
WARN_ON(mm->avail != mm->size);
 
kfree(mm->roots);
@@ -276,25 +278,39 @@ drm_get_buddy(struct drm_buddy_block *block)
 }
 EXPORT_SYMBOL(drm_get_buddy);
 
-static void __drm_buddy_free(struct drm_buddy *mm,
-struct drm_buddy_block *block)
+static unsigned int __drm_buddy_free(struct drm_buddy *mm,
+struct drm_buddy_block *block,
+bool defrag)
 {
+   unsigned int order, block_order;
struct drm_buddy_block *parent;
 
+   block_order = drm_buddy_block_order(block);
+
while ((parent = block->parent)) {
-   struct drm_buddy_block *buddy;
+   struct drm_buddy_block *buddy = NULL;
 
buddy = __get_buddy(block);
 
if (!drm_buddy_block_is_free(buddy))
break;
 
-   if (drm_buddy_block_is_clear(block) !=
-   drm_buddy_block_is_clear(buddy))
-   break;
+   if (!defrag) {
+   /*
+* Check the block and its buddy clear state and exit
+* the loop if they both have the dissimilar state.
+*/
+   if (drm_buddy_block_is_clear(block) !=
+   drm_buddy_block_is_clear(buddy))
+   break;
 
-   if (drm_buddy_block_is_clear(block))
-   mark_cleared(parent);
+   if (drm_buddy_block_is_clear(block))
+   mark_cleared(parent);
+   }
+
+   WARN_ON(defrag &&
+   (drm_buddy_block_is_clear(block) ==
+drm_buddy_block_is_clear(buddy)));
 
		list_del(&buddy->link);
 
@@ -304,8 +320,57 @@ static void __drm_buddy_free(struct drm_buddy *mm,
block = parent;
}
 
-   mark_free(mm, block);
+   order = drm_buddy_block_order(block);
+   if (block_order != order)
+   mark_free(mm, block);
+
+   return order;
+}
+
+/**
+ * drm_buddy_defrag - Defragmentation routine
+ *
+ * @mm: DRM buddy manager
+ * @min_block_size: minimum size in bytes to begin
+ * the defragmenta

[PATCH v7 2/3] drm/amdgpu: Enable clear page functionality

2024-02-21 Thread Arunpravin Paneer Selvam
Add clear page support in vram memory region.

v1(Christian):
  - Don't handle clear page as a TTM flag since when moving the BO back
in from GTT again we don't need that.
  - Make a specialized version of amdgpu_fill_buffer() which only
clears the VRAM areas which are not already cleared
  - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in
amdgpu_object.c

v2:
  - Modify the function name amdgpu_ttm_* (Alex)
  - Drop the delayed parameter (Christian)
  - handle amdgpu_res_cleared() just above the size
calculation (Christian)
  - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers
in the free path to properly wait for fences etc.. (Christian)

v3(Christian):
  - Remove buffer clear code in VRAM manager instead change the
AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set
the DRM_BUDDY_CLEARED flag.
  - Remove ! from amdgpu_res_cleared() check.

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Christian König 
Acked-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c| 22 ---
 .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h| 25 
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   | 61 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h   |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |  6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h  |  5 ++
 6 files changed, 111 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 010b0cb7693c..904d71b3393f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -39,6 +39,7 @@
 #include "amdgpu.h"
 #include "amdgpu_trace.h"
 #include "amdgpu_amdkfd.h"
+#include "amdgpu_vram_mgr.h"
 
 /**
  * DOC: amdgpu_object
@@ -595,8 +596,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
if (!amdgpu_bo_support_uswc(bo->flags))
bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC;
 
-   if (adev->ras_enabled)
-   bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
+   bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
 
	bo->tbo.bdev = &adev->mman.bdev;
if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA |
@@ -626,15 +626,17 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
 
if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED &&
bo->tbo.resource->mem_type == TTM_PL_VRAM) {
-   struct dma_fence *fence;
+   struct dma_fence *fence = NULL;
 
-   r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, &fence, true);
+   r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, &fence);
if (unlikely(r))
goto fail_unreserve;
 
-   dma_resv_add_fence(bo->tbo.base.resv, fence,
-  DMA_RESV_USAGE_KERNEL);
-   dma_fence_put(fence);
+   if (fence) {
+   dma_resv_add_fence(bo->tbo.base.resv, fence,
+  DMA_RESV_USAGE_KERNEL);
+   dma_fence_put(fence);
+   }
}
if (!bp->resv)
amdgpu_bo_unreserve(bo);
@@ -1359,8 +1361,12 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object 
*bo)
if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
return;
 
-   r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, &fence, true);
+   r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true);
if (!WARN_ON(r)) {
+   struct amdgpu_vram_mgr_resource *vres;
+
+   vres = to_amdgpu_vram_mgr_resource(bo->resource);
+   vres->flags |= DRM_BUDDY_CLEARED;
amdgpu_bo_fence(abo, fence, false);
dma_fence_put(fence);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
index 381101d2bf05..50fcd86e1033 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
@@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct 
amdgpu_res_cursor *cur, uint64_t size)
}
 }
 
+/**
+ * amdgpu_res_cleared - check if blocks are cleared
+ *
+ * @cur: the cursor to extract the block
+ *
+ * Check if the @cur block is cleared
+ */
+static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur)
+{
+   struct drm_buddy_block *block;
+
+   switch (cur->mem_type) {
+   case TTM_PL_VRAM:
+   block = cur->node;
+
+   if (!amdgpu_vram_mgr_is_cleared(block))
+   return false;
+   break;
+   default:
+   return false;
+   }
+
+   return true;
+}
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/am
