Re: [PATCH v7 1/2] drm/buddy: Add start address support to trim function
On 7/24/2024 8:42 PM, Jani Nikula wrote: On Tue, 23 Jul 2024, Arunpravin Paneer Selvam wrote: - Add a new start parameter in trim function to specify exact address from where to start the trimming. This would help us in situations like if drivers would like to do address alignment for specific requirements. - Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this flag to disable the allocator trimming part. This patch enables the drivers control trimming and they can do it themselves based on the application requirements. v1:(Matthew) - check new_start alignment with min chunk_size - use range_overflows() Signed-off-by: Arunpravin Paneer Selvam Acked-by: Alex Deucher Acked-by: Christian König --- drivers/gpu/drm/drm_buddy.c | 25 +++-- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 2 +- include/drm/drm_buddy.h | 2 ++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 6a8e45e9d0ec..103c185bb1c8 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * drm_buddy_block_trim - free unused pages * * @mm: DRM buddy manager + * @start: start address to begin the trimming. * @new_size: original size requested * @blocks: Input and output list of allocated blocks. * MUST contain single block as input to be trimmed. @@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * 0 on success, error code on failure. */ int drm_buddy_block_trim(struct drm_buddy *mm, +u64 *start, Is this a pointer only to allow passing NULL as "don't specify"? It's not used for returning anything in *start. Maybe it should be a const pointer? Not the prettiest of interfaces, and the kernel-doc does not reflect any of this. We have a plan to improve the interface and add multiple block trim support as well. I will modify in the follow-up patch. Thanks, Arun. BR, Jani. u64 new_size, struct list_head *blocks) { struct drm_buddy_block *parent; struct drm_buddy_block *block; + u64 block_start, block_end; LIST_HEAD(dfs); u64 new_start; int err; @@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm, struct drm_buddy_block, link); + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block); + if (WARN_ON(!drm_buddy_block_is_allocated(block))) return -EINVAL; @@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm, if (new_size == drm_buddy_block_size(mm, block)) return 0; + new_start = block_start; + if (start) { + new_start = *start; + + if (new_start < block_start) + return -EINVAL; + + if (!IS_ALIGNED(new_start, mm->chunk_size)) + return -EINVAL; + + if (range_overflows(new_start, new_size, block_end)) + return -EINVAL; + } + list_del(>link); mark_free(mm, block); mm->avail += drm_buddy_block_size(mm, block); @@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm, parent = block->parent; block->parent = NULL; - new_start = drm_buddy_block_offset(block); list_add(>tmp_link, ); err = __alloc_range(mm, , new_start, new_size, blocks, NULL); if (err) { @@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } while (1); /* Trim the allocated block to the required size */ - if (original_size != size) { + if (!(flags & DRM_BUDDY_TRIM_DISABLE) && + original_size != size) { struct list_head *trim_list; LIST_HEAD(temp); u64 trim_size; @@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } drm_buddy_block_trim(mm, +NULL, trim_size, trim_list); diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c index fe3779fdba2c..423b261ea743 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c @@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, } while (remaining_size); if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { - if (!drm_buddy_block_trim(mm, vres->base.size, >blocks)) + if (!drm_buddy_block_trim(mm, NULL, vres->base.size, >blocks))
Re: [PATCH v7 1/2] drm/buddy: Add start address support to trim function
Hi Matthew, Can we push this version for now as we need to mainline the DCC changes ASAP, while we continue our discussion and proceed to implement the permanent solution for address alignment? Thanks, Arun. On 7/23/2024 6:55 PM, Arunpravin Paneer Selvam wrote: - Add a new start parameter in trim function to specify exact address from where to start the trimming. This would help us in situations like if drivers would like to do address alignment for specific requirements. - Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this flag to disable the allocator trimming part. This patch enables the drivers control trimming and they can do it themselves based on the application requirements. v1:(Matthew) - check new_start alignment with min chunk_size - use range_overflows() Signed-off-by: Arunpravin Paneer Selvam Acked-by: Alex Deucher Acked-by: Christian König --- drivers/gpu/drm/drm_buddy.c | 25 +++-- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 2 +- include/drm/drm_buddy.h | 2 ++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 6a8e45e9d0ec..103c185bb1c8 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * drm_buddy_block_trim - free unused pages * * @mm: DRM buddy manager + * @start: start address to begin the trimming. * @new_size: original size requested * @blocks: Input and output list of allocated blocks. * MUST contain single block as input to be trimmed. @@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * 0 on success, error code on failure. */ int drm_buddy_block_trim(struct drm_buddy *mm, +u64 *start, u64 new_size, struct list_head *blocks) { struct drm_buddy_block *parent; struct drm_buddy_block *block; + u64 block_start, block_end; LIST_HEAD(dfs); u64 new_start; int err; @@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm, struct drm_buddy_block, link); + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block); + if (WARN_ON(!drm_buddy_block_is_allocated(block))) return -EINVAL; @@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm, if (new_size == drm_buddy_block_size(mm, block)) return 0; + new_start = block_start; + if (start) { + new_start = *start; + + if (new_start < block_start) + return -EINVAL; + + if (!IS_ALIGNED(new_start, mm->chunk_size)) + return -EINVAL; + + if (range_overflows(new_start, new_size, block_end)) + return -EINVAL; + } + list_del(>link); mark_free(mm, block); mm->avail += drm_buddy_block_size(mm, block); @@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm, parent = block->parent; block->parent = NULL; - new_start = drm_buddy_block_offset(block); list_add(>tmp_link, ); err = __alloc_range(mm, , new_start, new_size, blocks, NULL); if (err) { @@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } while (1); /* Trim the allocated block to the required size */ - if (original_size != size) { + if (!(flags & DRM_BUDDY_TRIM_DISABLE) && + original_size != size) { struct list_head *trim_list; LIST_HEAD(temp); u64 trim_size; @@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } drm_buddy_block_trim(mm, +NULL, trim_size, trim_list); diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c index fe3779fdba2c..423b261ea743 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c @@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, } while (remaining_size); if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { - if (!drm_buddy_block_trim(mm, vres->base.size, >blocks)) + if (!drm_buddy_block_trim(mm, NULL, vres->base.size, >blocks)) size = vres->base.size; } diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h index 2a74fa9d0ce5..9689a7c5dd36 100644 --- a/include/drm/drm_buddy.h +++ b/include/drm/drm_buddy.h @@ -27,6 +27,7 @@ #def
[PATCH v7 2/2] drm/amdgpu: Add address alignment support to DCC buffers
Add address alignment support to the DCC VRAM buffers. v2: - adjust size based on the max_texture_channel_caches values only for GFX12 DCC buffers. - used AMDGPU_GEM_CREATE_GFX12_DCC flag to apply change only for DCC buffers. - roundup non power of two DCC buffer adjusted size to nearest power of two number as the buddy allocator does not support non power of two alignments. This applies only to the contiguous DCC buffers. v3:(Alex) - rewrite the max texture channel caches comparison code in an algorithmic way to determine the alignment size. v4:(Alex) - Move the logic from amdgpu_vram_mgr_dcc_alignment() to gmc_v12_0.c and add a new gmc func callback for dcc alignment. If the callback is non-NULL, call it to get the alignment, otherwise, use the default. v5:(Alex) - Set the Alignment to a default value if the callback doesn't exist. - Add the callback to amdgpu_gmc_funcs. v6: - Fix checkpatch warning reported by Intel CI. v7:(Christian) - remove the AMDGPU_GEM_CREATE_GFX12_DCC flag and keep a flag that checks the BO pinning and for a specific hw generation. v8:(Christian) - move this check into gmc_v12_0_get_dcc_alignment. Signed-off-by: Arunpravin Paneer Selvam Acked-by: Alex Deucher Acked-by: Christian König Reviewed-by: Frank Min --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 6 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 29 ++-- drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 19 + 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index febca3130497..654d0548a3f8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -156,6 +156,8 @@ struct amdgpu_gmc_funcs { uint64_t addr, uint64_t *flags); /* get the amount of memory used by the vbios for pre-OS console */ unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev); + /* get the DCC buffer alignment */ + u64 (*get_dcc_alignment)(struct amdgpu_device *adev); enum amdgpu_memory_partition (*query_mem_partition_mode)( struct amdgpu_device *adev); @@ -363,6 +365,10 @@ struct amdgpu_gmc { (adev)->gmc.gmc_funcs->override_vm_pte_flags\ ((adev), (vm), (addr), (pte_flags)) #define amdgpu_gmc_get_vbios_fb_size(adev) (adev)->gmc.gmc_funcs->get_vbios_fb_size((adev)) +#define amdgpu_gmc_get_dcc_alignment(_adev) ({ \ + typeof(_adev) (adev) = (_adev); \ + ((adev)->gmc.gmc_funcs->get_dcc_alignment((adev))); \ +}) /** * amdgpu_gmc_vram_full_visible - Check if full VRAM is visible through the BAR diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index f91cc149d06c..c6609f4ac3d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -450,12 +450,12 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, const struct ttm_place *place, struct ttm_resource **res) { + u64 size, remaining_size, lpfn, fpfn, adjust_dcc_size = 0; struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); struct amdgpu_device *adev = to_amdgpu_device(mgr); struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo); u64 vis_usage = 0, max_bytes, min_block_size; struct amdgpu_vram_mgr_resource *vres; - u64 size, remaining_size, lpfn, fpfn; struct drm_buddy *mm = >mm; struct drm_buddy_block *block; unsigned long pages_per_block; @@ -511,7 +511,14 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, /* Allocate blocks in desired range */ vres->flags |= DRM_BUDDY_RANGE_ALLOCATION; + if (adev->gmc.gmc_funcs->get_dcc_alignment) + adjust_dcc_size = amdgpu_gmc_get_dcc_alignment(adev); + remaining_size = (u64)vres->base.size; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && adjust_dcc_size) { + remaining_size = roundup_pow_of_two(remaining_size + adjust_dcc_size); + vres->flags |= DRM_BUDDY_TRIM_DISABLE; + } mutex_lock(>lock); while (remaining_size) { @@ -521,8 +528,11 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, min_block_size = mgr->default_page_size; size = remaining_size; - if ((size >= (u64)pages_per_block << PAGE_SHIFT) && - !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) + + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && adjust_dcc_size) +
[PATCH v7 1/2] drm/buddy: Add start address support to trim function
- Add a new start parameter in trim function to specify exact address from where to start the trimming. This would help us in situations like if drivers would like to do address alignment for specific requirements. - Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this flag to disable the allocator trimming part. This patch enables the drivers control trimming and they can do it themselves based on the application requirements. v1:(Matthew) - check new_start alignment with min chunk_size - use range_overflows() Signed-off-by: Arunpravin Paneer Selvam Acked-by: Alex Deucher Acked-by: Christian König --- drivers/gpu/drm/drm_buddy.c | 25 +++-- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 2 +- include/drm/drm_buddy.h | 2 ++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 6a8e45e9d0ec..103c185bb1c8 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * drm_buddy_block_trim - free unused pages * * @mm: DRM buddy manager + * @start: start address to begin the trimming. * @new_size: original size requested * @blocks: Input and output list of allocated blocks. * MUST contain single block as input to be trimmed. @@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * 0 on success, error code on failure. */ int drm_buddy_block_trim(struct drm_buddy *mm, +u64 *start, u64 new_size, struct list_head *blocks) { struct drm_buddy_block *parent; struct drm_buddy_block *block; + u64 block_start, block_end; LIST_HEAD(dfs); u64 new_start; int err; @@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm, struct drm_buddy_block, link); + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block); + if (WARN_ON(!drm_buddy_block_is_allocated(block))) return -EINVAL; @@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm, if (new_size == drm_buddy_block_size(mm, block)) return 0; + new_start = block_start; + if (start) { + new_start = *start; + + if (new_start < block_start) + return -EINVAL; + + if (!IS_ALIGNED(new_start, mm->chunk_size)) + return -EINVAL; + + if (range_overflows(new_start, new_size, block_end)) + return -EINVAL; + } + list_del(>link); mark_free(mm, block); mm->avail += drm_buddy_block_size(mm, block); @@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm, parent = block->parent; block->parent = NULL; - new_start = drm_buddy_block_offset(block); list_add(>tmp_link, ); err = __alloc_range(mm, , new_start, new_size, blocks, NULL); if (err) { @@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } while (1); /* Trim the allocated block to the required size */ - if (original_size != size) { + if (!(flags & DRM_BUDDY_TRIM_DISABLE) && + original_size != size) { struct list_head *trim_list; LIST_HEAD(temp); u64 trim_size; @@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } drm_buddy_block_trim(mm, +NULL, trim_size, trim_list); diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c index fe3779fdba2c..423b261ea743 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c @@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, } while (remaining_size); if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { - if (!drm_buddy_block_trim(mm, vres->base.size, >blocks)) + if (!drm_buddy_block_trim(mm, NULL, vres->base.size, >blocks)) size = vres->base.size; } diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h index 2a74fa9d0ce5..9689a7c5dd36 100644 --- a/include/drm/drm_buddy.h +++ b/include/drm/drm_buddy.h @@ -27,6 +27,7 @@ #define DRM_BUDDY_CONTIGUOUS_ALLOCATIONBIT(2) #define DRM_BUDDY_CLEAR_ALLOCATION BIT(3) #define DRM_BUDDY_CLEARED BIT(4) +#define DRM_BUDDY_TRIM_DISABLE BIT(5) struct drm_buddy_block { #define DRM_BUDDY_HE
Re: [PATCH] drm/buddy: Add start address support to trim function
Hi Matthew, On 7/19/2024 4:01 PM, Matthew Auld wrote: On 17/07/2024 16:02, Paneer Selvam, Arunpravin wrote: On 7/16/2024 3:34 PM, Matthew Auld wrote: On 16/07/2024 10:50, Paneer Selvam, Arunpravin wrote: Hi Matthew, On 7/10/2024 6:20 PM, Matthew Auld wrote: On 10/07/2024 07:03, Paneer Selvam, Arunpravin wrote: Thanks Alex. Hi Matthew, Any comments? Do we not pass the required address alignment when allocating the pages in the first place? If address alignment is really useful, we can add that in the drm_buddy_alloc_blocks() function. I mean don't we already pass the min page size, which should give us matching physical address alignment? I think we don't need to align the address to the passed min_block_size value for all the contiguous buffers, so I thought that decision we can leave it to the drivers and they can achieve that through trim function in this kind of a specific request. I would have assumed it would be simpler to use min_block_size and then trim the size, if it's too big? That would then also take care of the try_harder case? For example, if the required contiguous size is 1MiB and min_block_size is 256KiB, to perform the address alignment of 256KiB, we might need to over-allocate at least to the min_block_size (say 256KiB). Now the size becomes 1280KiB and since the contiguous flag is enabled, we will round up the size to the next power of two and the size value becomes 2MiB. Next, in trimming we should round up the block start address to the min_block_size. May be we can keep the above mentioned operations under the flag combination DRM_BUDDY_CONTIGUOUS_ALLOCATION && DRM_BUDDY_ADDRESS_ALIGNMENT?. At the moment, we cannot support address alignment for try_harder allocations since in case of try_harder allocations we first traverse RHS to allocate the maximum possible and traverse LHS (here we align the LHS size to min_block_size) to allocate the remaining size. May be in case of DRM_BUDDY_ADDRESS_ALIGNMENT, we should first allocate LHS satisfying the address alignment requirement and then traverse RHS to allocate the remaining size if required? Also how are we dealing with the multi-block try_harder case? AFAICT we only allow trimming single block atm, or is it not possible to trigger that path here? Or are we handling that somehow? not possible to trigger that path here. only when we either over-allocate the LHS size and pass the multiple blocks to the trim function or implement the above mentioned method. https://patchwork.freedesktop.org/series/136150/ We are getting this sparse error from the Intel CI. Do you think these errors are introduced with this patches? I think it's safe to ignore, there seem to be other series with the same thing. Thanks. Thanks, Arun. Thanks, Arun. Thanks, Arun. On 7/9/2024 1:42 AM, Alex Deucher wrote: On Thu, Jul 4, 2024 at 4:40 AM Arunpravin Paneer Selvam wrote: - Add a new start parameter in trim function to specify exact address from where to start the trimming. This would help us in situations like if drivers would like to do address alignment for specific requirements. - Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this flag to disable the allocator trimming part. This patch enables the drivers control trimming and they can do it themselves based on the application requirements. v1:(Matthew) - check new_start alignment with min chunk_size - use range_overflows() Signed-off-by: Arunpravin Paneer Selvam Series is: Acked-by: Alex Deucher I'd like to take this series through the amdgpu tree if there are no objections as it's required for display buffers on some chips and I'd like to make sure it lands in 6.11. Thanks, Alex --- drivers/gpu/drm/drm_buddy.c | 25 +++-- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 2 +- include/drm/drm_buddy.h | 2 ++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 94f8c34fc293..8cebe1fa4e9d 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * drm_buddy_block_trim - free unused pages * * @mm: DRM buddy manager + * @start: start address to begin the trimming. * @new_size: original size requested * @blocks: Input and output list of allocated blocks. * MUST contain single block as input to be trimmed. @@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * 0 on success, error code on failure. */ int drm_buddy_block_trim(struct drm_buddy *mm, + u64 *start, u64 new_size, struct list_head *blocks) { struct drm_buddy_block *parent; struct drm_buddy_block *block; + u64 block_start, block_end; LIST_HEAD(dfs);
[PATCH v6 2/2] drm/amdgpu: Add address alignment support to DCC buffers
Add address alignment support to the DCC VRAM buffers. v2: - adjust size based on the max_texture_channel_caches values only for GFX12 DCC buffers. - used AMDGPU_GEM_CREATE_GFX12_DCC flag to apply change only for DCC buffers. - roundup non power of two DCC buffer adjusted size to nearest power of two number as the buddy allocator does not support non power of two alignments. This applies only to the contiguous DCC buffers. v3:(Alex) - rewrite the max texture channel caches comparison code in an algorithmic way to determine the alignment size. v4:(Alex) - Move the logic from amdgpu_vram_mgr_dcc_alignment() to gmc_v12_0.c and add a new gmc func callback for dcc alignment. If the callback is non-NULL, call it to get the alignment, otherwise, use the default. v5:(Alex) - Set the Alignment to a default value if the callback doesn't exist. - Add the callback to amdgpu_gmc_funcs. v6: - Fix checkpatch warning reported by Intel CI. v7:(Christian) - remove the AMDGPU_GEM_CREATE_GFX12_DCC flag and keep a flag that checks the BO pinning and for a specific hw generation. Signed-off-by: Arunpravin Paneer Selvam Acked-by: Alex Deucher Acked-by: Christian König Reviewed-by: Frank Min --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 6 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 39 +++- drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 15 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index febca3130497..654d0548a3f8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -156,6 +156,8 @@ struct amdgpu_gmc_funcs { uint64_t addr, uint64_t *flags); /* get the amount of memory used by the vbios for pre-OS console */ unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev); + /* get the DCC buffer alignment */ + u64 (*get_dcc_alignment)(struct amdgpu_device *adev); enum amdgpu_memory_partition (*query_mem_partition_mode)( struct amdgpu_device *adev); @@ -363,6 +365,10 @@ struct amdgpu_gmc { (adev)->gmc.gmc_funcs->override_vm_pte_flags\ ((adev), (vm), (addr), (pte_flags)) #define amdgpu_gmc_get_vbios_fb_size(adev) (adev)->gmc.gmc_funcs->get_vbios_fb_size((adev)) +#define amdgpu_gmc_get_dcc_alignment(_adev) ({ \ + typeof(_adev) (adev) = (_adev); \ + ((adev)->gmc.gmc_funcs->get_dcc_alignment((adev))); \ +}) /** * amdgpu_gmc_vram_full_visible - Check if full VRAM is visible through the BAR diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index f91cc149d06c..ace9d61fc512 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -512,6 +512,17 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, vres->flags |= DRM_BUDDY_RANGE_ALLOCATION; remaining_size = (u64)vres->base.size; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || +amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) { + u64 adjust_size; + + if (adev->gmc.gmc_funcs->get_dcc_alignment) { + adjust_size = amdgpu_gmc_get_dcc_alignment(adev); + remaining_size = roundup_pow_of_two(remaining_size + adjust_size); + vres->flags |= DRM_BUDDY_TRIM_DISABLE; + } + } mutex_lock(>lock); while (remaining_size) { @@ -521,8 +532,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, min_block_size = mgr->default_page_size; size = remaining_size; - if ((size >= (u64)pages_per_block << PAGE_SHIFT) && - !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) + + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || +amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) + min_block_size = size; + else if ((size >= (u64)pages_per_block << PAGE_SHIFT) && +!(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) min_block_size = (u64)pages_per_block << PAGE_SHIFT; BUG_ON(min_block_size < mm->chunk_size); @@ -553,6 +569,25 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, } mutex_unlock(>
[PATCH v6 1/2] drm/buddy: Add start address support to trim function
- Add a new start parameter in trim function to specify exact address from where to start the trimming. This would help us in situations like if drivers would like to do address alignment for specific requirements. - Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this flag to disable the allocator trimming part. This patch enables the drivers control trimming and they can do it themselves based on the application requirements. v1:(Matthew) - check new_start alignment with min chunk_size - use range_overflows() Signed-off-by: Arunpravin Paneer Selvam Acked-by: Alex Deucher Acked-by: Christian König --- drivers/gpu/drm/drm_buddy.c | 25 +++-- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 2 +- include/drm/drm_buddy.h | 2 ++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 6a8e45e9d0ec..103c185bb1c8 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * drm_buddy_block_trim - free unused pages * * @mm: DRM buddy manager + * @start: start address to begin the trimming. * @new_size: original size requested * @blocks: Input and output list of allocated blocks. * MUST contain single block as input to be trimmed. @@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * 0 on success, error code on failure. */ int drm_buddy_block_trim(struct drm_buddy *mm, +u64 *start, u64 new_size, struct list_head *blocks) { struct drm_buddy_block *parent; struct drm_buddy_block *block; + u64 block_start, block_end; LIST_HEAD(dfs); u64 new_start; int err; @@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm, struct drm_buddy_block, link); + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block); + if (WARN_ON(!drm_buddy_block_is_allocated(block))) return -EINVAL; @@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm, if (new_size == drm_buddy_block_size(mm, block)) return 0; + new_start = block_start; + if (start) { + new_start = *start; + + if (new_start < block_start) + return -EINVAL; + + if (!IS_ALIGNED(new_start, mm->chunk_size)) + return -EINVAL; + + if (range_overflows(new_start, new_size, block_end)) + return -EINVAL; + } + list_del(>link); mark_free(mm, block); mm->avail += drm_buddy_block_size(mm, block); @@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm, parent = block->parent; block->parent = NULL; - new_start = drm_buddy_block_offset(block); list_add(>tmp_link, ); err = __alloc_range(mm, , new_start, new_size, blocks, NULL); if (err) { @@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } while (1); /* Trim the allocated block to the required size */ - if (original_size != size) { + if (!(flags & DRM_BUDDY_TRIM_DISABLE) && + original_size != size) { struct list_head *trim_list; LIST_HEAD(temp); u64 trim_size; @@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } drm_buddy_block_trim(mm, +NULL, trim_size, trim_list); diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c index fe3779fdba2c..423b261ea743 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c @@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, } while (remaining_size); if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { - if (!drm_buddy_block_trim(mm, vres->base.size, >blocks)) + if (!drm_buddy_block_trim(mm, NULL, vres->base.size, >blocks)) size = vres->base.size; } diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h index 2a74fa9d0ce5..9689a7c5dd36 100644 --- a/include/drm/drm_buddy.h +++ b/include/drm/drm_buddy.h @@ -27,6 +27,7 @@ #define DRM_BUDDY_CONTIGUOUS_ALLOCATIONBIT(2) #define DRM_BUDDY_CLEAR_ALLOCATION BIT(3) #define DRM_BUDDY_CLEARED BIT(4) +#define DRM_BUDDY_TRIM_DISABLE BIT(5) struct drm_buddy_block { #define DRM_BUDDY_HE
Re: [PATCH] drm/buddy: Add start address support to trim function
On 7/16/2024 3:34 PM, Matthew Auld wrote: On 16/07/2024 10:50, Paneer Selvam, Arunpravin wrote: Hi Matthew, On 7/10/2024 6:20 PM, Matthew Auld wrote: On 10/07/2024 07:03, Paneer Selvam, Arunpravin wrote: Thanks Alex. Hi Matthew, Any comments? Do we not pass the required address alignment when allocating the pages in the first place? If address alignment is really useful, we can add that in the drm_buddy_alloc_blocks() function. I mean don't we already pass the min page size, which should give us matching physical address alignment? I think we don't need to align the address to the passed min_block_size value for all the contiguous buffers, so I thought that decision we can leave it to the drivers and they can achieve that through trim function in this kind of a specific request. https://patchwork.freedesktop.org/series/136150/ We are getting this sparse error from the Intel CI. Do you think these errors are introduced with this patches? Thanks, Arun. Thanks, Arun. Thanks, Arun. On 7/9/2024 1:42 AM, Alex Deucher wrote: On Thu, Jul 4, 2024 at 4:40 AM Arunpravin Paneer Selvam wrote: - Add a new start parameter in trim function to specify exact address from where to start the trimming. This would help us in situations like if drivers would like to do address alignment for specific requirements. - Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this flag to disable the allocator trimming part. This patch enables the drivers control trimming and they can do it themselves based on the application requirements. v1:(Matthew) - check new_start alignment with min chunk_size - use range_overflows() Signed-off-by: Arunpravin Paneer Selvam Series is: Acked-by: Alex Deucher I'd like to take this series through the amdgpu tree if there are no objections as it's required for display buffers on some chips and I'd like to make sure it lands in 6.11. Thanks, Alex --- drivers/gpu/drm/drm_buddy.c | 25 +++-- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 2 +- include/drm/drm_buddy.h | 2 ++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 94f8c34fc293..8cebe1fa4e9d 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * drm_buddy_block_trim - free unused pages * * @mm: DRM buddy manager + * @start: start address to begin the trimming. * @new_size: original size requested * @blocks: Input and output list of allocated blocks. * MUST contain single block as input to be trimmed. @@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * 0 on success, error code on failure. */ int drm_buddy_block_trim(struct drm_buddy *mm, + u64 *start, u64 new_size, struct list_head *blocks) { struct drm_buddy_block *parent; struct drm_buddy_block *block; + u64 block_start, block_end; LIST_HEAD(dfs); u64 new_start; int err; @@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm, struct drm_buddy_block, link); + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block); + if (WARN_ON(!drm_buddy_block_is_allocated(block))) return -EINVAL; @@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm, if (new_size == drm_buddy_block_size(mm, block)) return 0; + new_start = block_start; + if (start) { + new_start = *start; + + if (new_start < block_start) + return -EINVAL; + + if (!IS_ALIGNED(new_start, mm->chunk_size)) + return -EINVAL; + + if (range_overflows(new_start, new_size, block_end)) + return -EINVAL; + } + list_del(>link); mark_free(mm, block); mm->avail += drm_buddy_block_size(mm, block); @@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm, parent = block->parent; block->parent = NULL; - new_start = drm_buddy_block_offset(block); list_add(>tmp_link, ); err = __alloc_range(mm, , new_start, new_size, blocks, NULL); if (err) { @@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } while (1); /* Trim the allocated block to the required size */ - if (original_size != size) { + if (!(flags & DRM_BUDDY_TRIM_DISABLE) && + original_size != size) { struct list_head *trim_list; LIST_HEAD(temp); u64 tri
Re: [PATCH v5 2/2] drm/amdgpu: Add address alignment support to DCC buffers
Hi Christian, Can we use the below combination flags to kick in hardware workaround while pinning BO's for this specific hw generation. if (place->flags & TTM_PL_FLAG_CONTIGUOUS) && (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) { } Regards, Arun. On 7/17/2024 2:38 PM, Christian König wrote: Well that approach was discussed before and seemed to be to complicated. But I totally agree that the AMDGPU_GEM_CREATE_GFX12_DCC flag is a bad idea. This isn't anything userspace should need to specify in the first place. All we need is a hardware workaround which kicks in all the time while pinning BOs for this specific hw generation and texture channel configuration. Please remove the AMDGPU_GEM_CREATE_GFX12_DCC flag again if possible or specify why it is actually necessary? Regards, Christian. Am 17.07.24 um 05:44 schrieb Marek Olšák: AMDGPU_GEM_CREATE_GFX12_DCC is set on 90% of all memory allocations, and almost all of them are not displayable. Shouldn't we use a different way to indicate that we need a non-power-of-two alignment, such as looking at the alignment field directly? Marek On Tue, Jul 16, 2024, 11:45 Arunpravin Paneer Selvam wrote: Add address alignment support to the DCC VRAM buffers. v2: - adjust size based on the max_texture_channel_caches values only for GFX12 DCC buffers. - used AMDGPU_GEM_CREATE_GFX12_DCC flag to apply change only for DCC buffers. - roundup non power of two DCC buffer adjusted size to nearest power of two number as the buddy allocator does not support non power of two alignments. This applies only to the contiguous DCC buffers. v3:(Alex) - rewrite the max texture channel caches comparison code in an algorithmic way to determine the alignment size. v4:(Alex) - Move the logic from amdgpu_vram_mgr_dcc_alignment() to gmc_v12_0.c and add a new gmc func callback for dcc alignment. If the callback is non-NULL, call it to get the alignment, otherwise, use the default. v5:(Alex) - Set the Alignment to a default value if the callback doesn't exist. - Add the callback to amdgpu_gmc_funcs. v6: - Fix checkpatch error reported by Intel CI. Signed-off-by: Arunpravin Paneer Selvam Acked-by: Alex Deucher Acked-by: Christian König Reviewed-by: Frank Min --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 6 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 36 ++-- drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 15 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index febca3130497..654d0548a3f8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -156,6 +156,8 @@ struct amdgpu_gmc_funcs { uint64_t addr, uint64_t *flags); /* get the amount of memory used by the vbios for pre-OS console */ unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev); + /* get the DCC buffer alignment */ + u64 (*get_dcc_alignment)(struct amdgpu_device *adev); enum amdgpu_memory_partition (*query_mem_partition_mode)( struct amdgpu_device *adev); @@ -363,6 +365,10 @@ struct amdgpu_gmc { (adev)->gmc.gmc_funcs->override_vm_pte_flags \ ((adev), (vm), (addr), (pte_flags)) #define amdgpu_gmc_get_vbios_fb_size(adev) (adev)->gmc.gmc_funcs->get_vbios_fb_size((adev)) +#define amdgpu_gmc_get_dcc_alignment(_adev) ({ \ + typeof(_adev) (adev) = (_adev); \ + ((adev)->gmc.gmc_funcs->get_dcc_alignment((adev))); \ +}) /** * amdgpu_gmc_vram_full_visible - Check if full VRAM is visible through the BAR diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index f91cc149d06c..aa9dca12371c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -512,6 +512,16 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, vres->flags |= DRM_BUDDY_RANGE_ALLOCATION; remaining_size = (u64)vres->base.size; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) { + u64 adjust_size; + + if (adev->gmc.gmc_funcs->get_dcc_alignment) { + adjust_size = amdgpu_gmc_get_dcc_alignment(adev); + r
[PATCH v5 2/2] drm/amdgpu: Add address alignment support to DCC buffers
Add address alignment support to the DCC VRAM buffers. v2: - adjust size based on the max_texture_channel_caches values only for GFX12 DCC buffers. - used AMDGPU_GEM_CREATE_GFX12_DCC flag to apply change only for DCC buffers. - roundup non power of two DCC buffer adjusted size to nearest power of two number as the buddy allocator does not support non power of two alignments. This applies only to the contiguous DCC buffers. v3:(Alex) - rewrite the max texture channel caches comparison code in an algorithmic way to determine the alignment size. v4:(Alex) - Move the logic from amdgpu_vram_mgr_dcc_alignment() to gmc_v12_0.c and add a new gmc func callback for dcc alignment. If the callback is non-NULL, call it to get the alignment, otherwise, use the default. v5:(Alex) - Set the Alignment to a default value if the callback doesn't exist. - Add the callback to amdgpu_gmc_funcs. v6: - Fix checkpatch error reported by Intel CI. Signed-off-by: Arunpravin Paneer Selvam Acked-by: Alex Deucher Acked-by: Christian König Reviewed-by: Frank Min --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 6 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 36 ++-- drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 15 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index febca3130497..654d0548a3f8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -156,6 +156,8 @@ struct amdgpu_gmc_funcs { uint64_t addr, uint64_t *flags); /* get the amount of memory used by the vbios for pre-OS console */ unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev); + /* get the DCC buffer alignment */ + u64 (*get_dcc_alignment)(struct amdgpu_device *adev); enum amdgpu_memory_partition (*query_mem_partition_mode)( struct amdgpu_device *adev); @@ -363,6 +365,10 @@ struct amdgpu_gmc { (adev)->gmc.gmc_funcs->override_vm_pte_flags\ ((adev), (vm), (addr), (pte_flags)) #define amdgpu_gmc_get_vbios_fb_size(adev) (adev)->gmc.gmc_funcs->get_vbios_fb_size((adev)) +#define amdgpu_gmc_get_dcc_alignment(_adev) ({ \ + typeof(_adev) (adev) = (_adev); \ + ((adev)->gmc.gmc_funcs->get_dcc_alignment((adev))); \ +}) /** * amdgpu_gmc_vram_full_visible - Check if full VRAM is visible through the BAR diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index f91cc149d06c..aa9dca12371c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -512,6 +512,16 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, vres->flags |= DRM_BUDDY_RANGE_ALLOCATION; remaining_size = (u64)vres->base.size; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) { + u64 adjust_size; + + if (adev->gmc.gmc_funcs->get_dcc_alignment) { + adjust_size = amdgpu_gmc_get_dcc_alignment(adev); + remaining_size = roundup_pow_of_two(remaining_size + adjust_size); + vres->flags |= DRM_BUDDY_TRIM_DISABLE; + } + } mutex_lock(>lock); while (remaining_size) { @@ -521,8 +531,12 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, min_block_size = mgr->default_page_size; size = remaining_size; - if ((size >= (u64)pages_per_block << PAGE_SHIFT) && - !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) + + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) + min_block_size = size; + else if ((size >= (u64)pages_per_block << PAGE_SHIFT) && +!(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) min_block_size = (u64)pages_per_block << PAGE_SHIFT; BUG_ON(min_block_size < mm->chunk_size); @@ -553,6 +567,24 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, } mutex_unlock(>lock); + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) { + struct drm_buddy_block *dcc_block; + u64 dcc_start, alignment; + + dcc_block = amdgpu_vram_mgr_first_block(>blocks); + dcc_star
[PATCH v5 1/2] drm/buddy: Add start address support to trim function
- Add a new start parameter in trim function to specify exact address from where to start the trimming. This would help us in situations like if drivers would like to do address alignment for specific requirements. - Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this flag to disable the allocator trimming part. This patch enables the drivers control trimming and they can do it themselves based on the application requirements. v1:(Matthew) - check new_start alignment with min chunk_size - use range_overflows() Signed-off-by: Arunpravin Paneer Selvam Acked-by: Alex Deucher Acked-by: Christian König --- drivers/gpu/drm/drm_buddy.c | 25 +++-- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 2 +- include/drm/drm_buddy.h | 2 ++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 6a8e45e9d0ec..103c185bb1c8 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * drm_buddy_block_trim - free unused pages * * @mm: DRM buddy manager + * @start: start address to begin the trimming. * @new_size: original size requested * @blocks: Input and output list of allocated blocks. * MUST contain single block as input to be trimmed. @@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * 0 on success, error code on failure. */ int drm_buddy_block_trim(struct drm_buddy *mm, +u64 *start, u64 new_size, struct list_head *blocks) { struct drm_buddy_block *parent; struct drm_buddy_block *block; + u64 block_start, block_end; LIST_HEAD(dfs); u64 new_start; int err; @@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm, struct drm_buddy_block, link); + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block); + if (WARN_ON(!drm_buddy_block_is_allocated(block))) return -EINVAL; @@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm, if (new_size == drm_buddy_block_size(mm, block)) return 0; + new_start = block_start; + if (start) { + new_start = *start; + + if (new_start < block_start) + return -EINVAL; + + if (!IS_ALIGNED(new_start, mm->chunk_size)) + return -EINVAL; + + if (range_overflows(new_start, new_size, block_end)) + return -EINVAL; + } + list_del(>link); mark_free(mm, block); mm->avail += drm_buddy_block_size(mm, block); @@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm, parent = block->parent; block->parent = NULL; - new_start = drm_buddy_block_offset(block); list_add(>tmp_link, ); err = __alloc_range(mm, , new_start, new_size, blocks, NULL); if (err) { @@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } while (1); /* Trim the allocated block to the required size */ - if (original_size != size) { + if (!(flags & DRM_BUDDY_TRIM_DISABLE) && + original_size != size) { struct list_head *trim_list; LIST_HEAD(temp); u64 trim_size; @@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } drm_buddy_block_trim(mm, +NULL, trim_size, trim_list); diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c index fe3779fdba2c..423b261ea743 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c @@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, } while (remaining_size); if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { - if (!drm_buddy_block_trim(mm, vres->base.size, >blocks)) + if (!drm_buddy_block_trim(mm, NULL, vres->base.size, >blocks)) size = vres->base.size; } diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h index 2a74fa9d0ce5..9689a7c5dd36 100644 --- a/include/drm/drm_buddy.h +++ b/include/drm/drm_buddy.h @@ -27,6 +27,7 @@ #define DRM_BUDDY_CONTIGUOUS_ALLOCATIONBIT(2) #define DRM_BUDDY_CLEAR_ALLOCATION BIT(3) #define DRM_BUDDY_CLEARED BIT(4) +#define DRM_BUDDY_TRIM_DISABLE BIT(5) struct drm_buddy_block { #define DRM_BUDDY_HE
Re: [PATCH] drm/buddy: Add start address support to trim function
Hi Matthew, On 7/10/2024 6:20 PM, Matthew Auld wrote: On 10/07/2024 07:03, Paneer Selvam, Arunpravin wrote: Thanks Alex. Hi Matthew, Any comments? Do we not pass the required address alignment when allocating the pages in the first place? If address alignment is really useful, we can add that in the drm_buddy_alloc_blocks() function. Thanks, Arun. Thanks, Arun. On 7/9/2024 1:42 AM, Alex Deucher wrote: On Thu, Jul 4, 2024 at 4:40 AM Arunpravin Paneer Selvam wrote: - Add a new start parameter in trim function to specify exact address from where to start the trimming. This would help us in situations like if drivers would like to do address alignment for specific requirements. - Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this flag to disable the allocator trimming part. This patch enables the drivers control trimming and they can do it themselves based on the application requirements. v1:(Matthew) - check new_start alignment with min chunk_size - use range_overflows() Signed-off-by: Arunpravin Paneer Selvam Series is: Acked-by: Alex Deucher I'd like to take this series through the amdgpu tree if there are no objections as it's required for display buffers on some chips and I'd like to make sure it lands in 6.11. Thanks, Alex --- drivers/gpu/drm/drm_buddy.c | 25 +++-- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 2 +- include/drm/drm_buddy.h | 2 ++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 94f8c34fc293..8cebe1fa4e9d 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * drm_buddy_block_trim - free unused pages * * @mm: DRM buddy manager + * @start: start address to begin the trimming. * @new_size: original size requested * @blocks: Input and output list of allocated blocks. * MUST contain single block as input to be trimmed. @@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * 0 on success, error code on failure. */ int drm_buddy_block_trim(struct drm_buddy *mm, + u64 *start, u64 new_size, struct list_head *blocks) { struct drm_buddy_block *parent; struct drm_buddy_block *block; + u64 block_start, block_end; LIST_HEAD(dfs); u64 new_start; int err; @@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm, struct drm_buddy_block, link); + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block); + if (WARN_ON(!drm_buddy_block_is_allocated(block))) return -EINVAL; @@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm, if (new_size == drm_buddy_block_size(mm, block)) return 0; + new_start = block_start; + if (start) { + new_start = *start; + + if (new_start < block_start) + return -EINVAL; + + if (!IS_ALIGNED(new_start, mm->chunk_size)) + return -EINVAL; + + if (range_overflows(new_start, new_size, block_end)) + return -EINVAL; + } + list_del(>link); mark_free(mm, block); mm->avail += drm_buddy_block_size(mm, block); @@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm, parent = block->parent; block->parent = NULL; - new_start = drm_buddy_block_offset(block); list_add(>tmp_link, ); err = __alloc_range(mm, , new_start, new_size, blocks, NULL); if (err) { @@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } while (1); /* Trim the allocated block to the required size */ - if (original_size != size) { + if (!(flags & DRM_BUDDY_TRIM_DISABLE) && + original_size != size) { struct list_head *trim_list; LIST_HEAD(temp); u64 trim_size; @@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } drm_buddy_block_trim(mm, + NULL, trim_size, trim_list); diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c index fe3779fdba2c..423b261ea743 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c @@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, } while (remaining_size); if (plac
[PATCH 1/2] drm/buddy: Add start address support to trim function
- Add a new start parameter in trim function to specify exact address from where to start the trimming. This would help us in situations like if drivers would like to do address alignment for specific requirements. - Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this flag to disable the allocator trimming part. This patch enables the drivers control trimming and they can do it themselves based on the application requirements. v1:(Matthew) - check new_start alignment with min chunk_size - use range_overflows() Signed-off-by: Arunpravin Paneer Selvam Acked-by: Alex Deucher Acked-by: Christian König --- drivers/gpu/drm/drm_buddy.c | 25 +++-- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 2 +- include/drm/drm_buddy.h | 2 ++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 6a8e45e9d0ec..103c185bb1c8 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * drm_buddy_block_trim - free unused pages * * @mm: DRM buddy manager + * @start: start address to begin the trimming. * @new_size: original size requested * @blocks: Input and output list of allocated blocks. * MUST contain single block as input to be trimmed. @@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * 0 on success, error code on failure. */ int drm_buddy_block_trim(struct drm_buddy *mm, +u64 *start, u64 new_size, struct list_head *blocks) { struct drm_buddy_block *parent; struct drm_buddy_block *block; + u64 block_start, block_end; LIST_HEAD(dfs); u64 new_start; int err; @@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm, struct drm_buddy_block, link); + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block); + if (WARN_ON(!drm_buddy_block_is_allocated(block))) return -EINVAL; @@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm, if (new_size == drm_buddy_block_size(mm, block)) return 0; + new_start = block_start; + if (start) { + new_start = *start; + + if (new_start < block_start) + return -EINVAL; + + if (!IS_ALIGNED(new_start, mm->chunk_size)) + return -EINVAL; + + if (range_overflows(new_start, new_size, block_end)) + return -EINVAL; + } + list_del(>link); mark_free(mm, block); mm->avail += drm_buddy_block_size(mm, block); @@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm, parent = block->parent; block->parent = NULL; - new_start = drm_buddy_block_offset(block); list_add(>tmp_link, ); err = __alloc_range(mm, , new_start, new_size, blocks, NULL); if (err) { @@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } while (1); /* Trim the allocated block to the required size */ - if (original_size != size) { + if (!(flags & DRM_BUDDY_TRIM_DISABLE) && + original_size != size) { struct list_head *trim_list; LIST_HEAD(temp); u64 trim_size; @@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } drm_buddy_block_trim(mm, +NULL, trim_size, trim_list); diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c index fe3779fdba2c..423b261ea743 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c @@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, } while (remaining_size); if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { - if (!drm_buddy_block_trim(mm, vres->base.size, >blocks)) + if (!drm_buddy_block_trim(mm, NULL, vres->base.size, >blocks)) size = vres->base.size; } diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h index 2a74fa9d0ce5..9689a7c5dd36 100644 --- a/include/drm/drm_buddy.h +++ b/include/drm/drm_buddy.h @@ -27,6 +27,7 @@ #define DRM_BUDDY_CONTIGUOUS_ALLOCATIONBIT(2) #define DRM_BUDDY_CLEAR_ALLOCATION BIT(3) #define DRM_BUDDY_CLEARED BIT(4) +#define DRM_BUDDY_TRIM_DISABLE BIT(5) struct drm_buddy_block { #define DRM_BUDDY_HE
[PATCH 2/2] drm/amdgpu: Add address alignment support to DCC buffers
Add address alignment support to the DCC VRAM buffers. v2: - adjust size based on the max_texture_channel_caches values only for GFX12 DCC buffers. - used AMDGPU_GEM_CREATE_GFX12_DCC flag to apply change only for DCC buffers. - roundup non power of two DCC buffer adjusted size to nearest power of two number as the buddy allocator does not support non power of two alignments. This applies only to the contiguous DCC buffers. v3:(Alex) - rewrite the max texture channel caches comparison code in an algorithmic way to determine the alignment size. v4:(Alex) - Move the logic from amdgpu_vram_mgr_dcc_alignment() to gmc_v12_0.c and add a new gmc func callback for dcc alignment. If the callback is non-NULL, call it to get the alignment, otherwise, use the default. v5:(Alex) - Set the Alignment to a default value if the callback doesn't exist. - Add the callback to amdgpu_gmc_funcs. Signed-off-by: Arunpravin Paneer Selvam Acked-by: Alex Deucher Acked-by: Christian König Reviewed-by: Frank Min --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 3 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 36 ++-- drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 15 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index febca3130497..49dfcf112ac1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -156,6 +156,8 @@ struct amdgpu_gmc_funcs { uint64_t addr, uint64_t *flags); /* get the amount of memory used by the vbios for pre-OS console */ unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev); + /* get the DCC buffer alignment */ + u64 (*get_dcc_alignment)(struct amdgpu_device *adev); enum amdgpu_memory_partition (*query_mem_partition_mode)( struct amdgpu_device *adev); @@ -363,6 +365,7 @@ struct amdgpu_gmc { (adev)->gmc.gmc_funcs->override_vm_pte_flags\ ((adev), (vm), (addr), (pte_flags)) #define amdgpu_gmc_get_vbios_fb_size(adev) (adev)->gmc.gmc_funcs->get_vbios_fb_size((adev)) +#define amdgpu_gmc_get_dcc_alignment(adev) ((adev)->gmc.gmc_funcs->get_dcc_alignment((adev))) /** * amdgpu_gmc_vram_full_visible - Check if full VRAM is visible through the BAR diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index f91cc149d06c..aa9dca12371c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -512,6 +512,16 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, vres->flags |= DRM_BUDDY_RANGE_ALLOCATION; remaining_size = (u64)vres->base.size; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) { + u64 adjust_size; + + if (adev->gmc.gmc_funcs->get_dcc_alignment) { + adjust_size = amdgpu_gmc_get_dcc_alignment(adev); + remaining_size = roundup_pow_of_two(remaining_size + adjust_size); + vres->flags |= DRM_BUDDY_TRIM_DISABLE; + } + } mutex_lock(>lock); while (remaining_size) { @@ -521,8 +531,12 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, min_block_size = mgr->default_page_size; size = remaining_size; - if ((size >= (u64)pages_per_block << PAGE_SHIFT) && - !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) + + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) + min_block_size = size; + else if ((size >= (u64)pages_per_block << PAGE_SHIFT) && +!(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) min_block_size = (u64)pages_per_block << PAGE_SHIFT; BUG_ON(min_block_size < mm->chunk_size); @@ -553,6 +567,24 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, } mutex_unlock(>lock); + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) { + struct drm_buddy_block *dcc_block; + u64 dcc_start, alignment; + + dcc_block = amdgpu_vram_mgr_first_block(>blocks); + dcc_start = amdgpu_vram_mgr_block_start(dcc_block); + + if (adev->gmc.gmc_funcs->get_dcc_alignment) { + align
Re: [PATCH] drm/buddy: Add start address support to trim function
Thanks Alex. Hi Matthew, Any comments? Thanks, Arun. On 7/9/2024 1:42 AM, Alex Deucher wrote: On Thu, Jul 4, 2024 at 4:40 AM Arunpravin Paneer Selvam wrote: - Add a new start parameter in trim function to specify exact address from where to start the trimming. This would help us in situations like if drivers would like to do address alignment for specific requirements. - Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this flag to disable the allocator trimming part. This patch enables the drivers control trimming and they can do it themselves based on the application requirements. v1:(Matthew) - check new_start alignment with min chunk_size - use range_overflows() Signed-off-by: Arunpravin Paneer Selvam Series is: Acked-by: Alex Deucher I'd like to take this series through the amdgpu tree if there are no objections as it's required for display buffers on some chips and I'd like to make sure it lands in 6.11. Thanks, Alex --- drivers/gpu/drm/drm_buddy.c | 25 +++-- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 2 +- include/drm/drm_buddy.h | 2 ++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 94f8c34fc293..8cebe1fa4e9d 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * drm_buddy_block_trim - free unused pages * * @mm: DRM buddy manager + * @start: start address to begin the trimming. * @new_size: original size requested * @blocks: Input and output list of allocated blocks. * MUST contain single block as input to be trimmed. @@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * 0 on success, error code on failure. */ int drm_buddy_block_trim(struct drm_buddy *mm, +u64 *start, u64 new_size, struct list_head *blocks) { struct drm_buddy_block *parent; struct drm_buddy_block *block; + u64 block_start, block_end; LIST_HEAD(dfs); u64 new_start; int err; @@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm, struct drm_buddy_block, link); + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block); + if (WARN_ON(!drm_buddy_block_is_allocated(block))) return -EINVAL; @@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm, if (new_size == drm_buddy_block_size(mm, block)) return 0; + new_start = block_start; + if (start) { + new_start = *start; + + if (new_start < block_start) + return -EINVAL; + + if (!IS_ALIGNED(new_start, mm->chunk_size)) + return -EINVAL; + + if (range_overflows(new_start, new_size, block_end)) + return -EINVAL; + } + list_del(>link); mark_free(mm, block); mm->avail += drm_buddy_block_size(mm, block); @@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm, parent = block->parent; block->parent = NULL; - new_start = drm_buddy_block_offset(block); list_add(>tmp_link, ); err = __alloc_range(mm, , new_start, new_size, blocks, NULL); if (err) { @@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } while (1); /* Trim the allocated block to the required size */ - if (original_size != size) { + if (!(flags & DRM_BUDDY_TRIM_DISABLE) && + original_size != size) { struct list_head *trim_list; LIST_HEAD(temp); u64 trim_size; @@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } drm_buddy_block_trim(mm, +NULL, trim_size, trim_list); diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c index fe3779fdba2c..423b261ea743 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c @@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, } while (remaining_size); if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { - if (!drm_buddy_block_trim(mm, vres->base.size, >blocks)) + if (!drm_buddy_block_trim(mm, NULL, vres->base.size, >blocks)) size = vres->base.size; } diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h in
[PATCH 2/2] drm/amdgpu: Add address alignment support to DCC buffers
Add address alignment support to the DCC VRAM buffers. v2: - adjust size based on the max_texture_channel_caches values only for GFX12 DCC buffers. - used AMDGPU_GEM_CREATE_GFX12_DCC flag to apply change only for DCC buffers. - roundup non power of two DCC buffer adjusted size to nearest power of two number as the buddy allocator does not support non power of two alignments. This applies only to the contiguous DCC buffers. v3:(Alex) - rewrite the max texture channel caches comparison code in an algorithmic way to determine the alignment size. v4:(Alex) - Move the logic from amdgpu_vram_mgr_dcc_alignment() to gmc_v12_0.c and add a new gmc func callback for dcc alignment. If the callback is non-NULL, call it to get the alignment, otherwise, use the default. v5:(Alex) - Set the Alignment to a default value if the callback doesn't exist. - Add the callback to amdgpu_gmc_funcs. Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 3 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 36 ++-- drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 15 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index febca3130497..49dfcf112ac1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -156,6 +156,8 @@ struct amdgpu_gmc_funcs { uint64_t addr, uint64_t *flags); /* get the amount of memory used by the vbios for pre-OS console */ unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev); + /* get the DCC buffer alignment */ + u64 (*get_dcc_alignment)(struct amdgpu_device *adev); enum amdgpu_memory_partition (*query_mem_partition_mode)( struct amdgpu_device *adev); @@ -363,6 +365,7 @@ struct amdgpu_gmc { (adev)->gmc.gmc_funcs->override_vm_pte_flags\ ((adev), (vm), (addr), (pte_flags)) #define amdgpu_gmc_get_vbios_fb_size(adev) (adev)->gmc.gmc_funcs->get_vbios_fb_size((adev)) +#define amdgpu_gmc_get_dcc_alignment(adev) ((adev)->gmc.gmc_funcs->get_dcc_alignment((adev))) /** * amdgpu_gmc_vram_full_visible - Check if full VRAM is visible through the BAR diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index eb94f943b28e..756d1c8cbffd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -509,6 +509,16 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, vres->flags |= DRM_BUDDY_RANGE_ALLOCATION; remaining_size = (u64)vres->base.size; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) { + u64 adjust_size; + + if (adev->gmc.gmc_funcs->get_dcc_alignment) { + adjust_size = amdgpu_gmc_get_dcc_alignment(adev); + remaining_size = roundup_pow_of_two(remaining_size + adjust_size); + vres->flags |= DRM_BUDDY_TRIM_DISABLE; + } + } mutex_lock(>lock); while (remaining_size) { @@ -518,8 +528,12 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, min_block_size = mgr->default_page_size; size = remaining_size; - if ((size >= (u64)pages_per_block << PAGE_SHIFT) && - !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) + + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) + min_block_size = size; + else if ((size >= (u64)pages_per_block << PAGE_SHIFT) && +!(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) min_block_size = (u64)pages_per_block << PAGE_SHIFT; BUG_ON(min_block_size < mm->chunk_size); @@ -550,6 +564,24 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, } mutex_unlock(>lock); + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) { + struct drm_buddy_block *dcc_block; + u64 dcc_start, alignment; + + dcc_block = amdgpu_vram_mgr_first_block(>blocks); + dcc_start = amdgpu_vram_mgr_block_start(dcc_block); + + if (adev->gmc.gmc_funcs->get_dcc_alignment) { + alignment = amdgpu_gmc_get_dcc_alignment(adev); +
[PATCH] drm/buddy: Add start address support to trim function
- Add a new start parameter in trim function to specify exact address from where to start the trimming. This would help us in situations like if drivers would like to do address alignment for specific requirements. - Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this flag to disable the allocator trimming part. This patch enables the drivers control trimming and they can do it themselves based on the application requirements. v1:(Matthew) - check new_start alignment with min chunk_size - use range_overflows() Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/drm_buddy.c | 25 +++-- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 2 +- include/drm/drm_buddy.h | 2 ++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 94f8c34fc293..8cebe1fa4e9d 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * drm_buddy_block_trim - free unused pages * * @mm: DRM buddy manager + * @start: start address to begin the trimming. * @new_size: original size requested * @blocks: Input and output list of allocated blocks. * MUST contain single block as input to be trimmed. @@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * 0 on success, error code on failure. */ int drm_buddy_block_trim(struct drm_buddy *mm, +u64 *start, u64 new_size, struct list_head *blocks) { struct drm_buddy_block *parent; struct drm_buddy_block *block; + u64 block_start, block_end; LIST_HEAD(dfs); u64 new_start; int err; @@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm, struct drm_buddy_block, link); + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block); + if (WARN_ON(!drm_buddy_block_is_allocated(block))) return -EINVAL; @@ -894,6 +900,20 @@ int drm_buddy_block_trim(struct drm_buddy *mm, if (new_size == drm_buddy_block_size(mm, block)) return 0; + new_start = block_start; + if (start) { + new_start = *start; + + if (new_start < block_start) + return -EINVAL; + + if (!IS_ALIGNED(new_start, mm->chunk_size)) + return -EINVAL; + + if (range_overflows(new_start, new_size, block_end)) + return -EINVAL; + } + list_del(>link); mark_free(mm, block); mm->avail += drm_buddy_block_size(mm, block); @@ -904,7 +924,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm, parent = block->parent; block->parent = NULL; - new_start = drm_buddy_block_offset(block); list_add(>tmp_link, ); err = __alloc_range(mm, , new_start, new_size, blocks, NULL); if (err) { @@ -1066,7 +1085,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } while (1); /* Trim the allocated block to the required size */ - if (original_size != size) { + if (!(flags & DRM_BUDDY_TRIM_DISABLE) && + original_size != size) { struct list_head *trim_list; LIST_HEAD(temp); u64 trim_size; @@ -1083,6 +1103,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } drm_buddy_block_trim(mm, +NULL, trim_size, trim_list); diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c index fe3779fdba2c..423b261ea743 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c @@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, } while (remaining_size); if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { - if (!drm_buddy_block_trim(mm, vres->base.size, >blocks)) + if (!drm_buddy_block_trim(mm, NULL, vres->base.size, >blocks)) size = vres->base.size; } diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h index 82570f77e817..0c2f735f0265 100644 --- a/include/drm/drm_buddy.h +++ b/include/drm/drm_buddy.h @@ -27,6 +27,7 @@ #define DRM_BUDDY_CONTIGUOUS_ALLOCATIONBIT(2) #define DRM_BUDDY_CLEAR_ALLOCATION BIT(3) #define DRM_BUDDY_CLEARED BIT(4) +#define DRM_BUDDY_TRIM_DISABLE BIT(5) struct drm_buddy_block { #define DRM_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) @@ -155,6 +1
[PATCH] drm/buddy: Add start address support to trim function
- Add a new start parameter in trim function to specify exact address from where to start the trimming. This would help us in situations like if drivers would like to do address alignment for specific requirements. - Add a new flag DRM_BUDDY_TRIM_DISABLE. Drivers can use this flag to disable the allocator trimming part. This patch enables the drivers control trimming and they can do it themselves based on the application requirements. Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/drm_buddy.c | 22 -- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 2 +- include/drm/drm_buddy.h | 2 ++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 6a8e45e9d0ec..287b6acb1637 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -851,6 +851,7 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * drm_buddy_block_trim - free unused pages * * @mm: DRM buddy manager + * @start: start address to begin the trimming. * @new_size: original size requested * @blocks: Input and output list of allocated blocks. * MUST contain single block as input to be trimmed. @@ -866,11 +867,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * 0 on success, error code on failure. */ int drm_buddy_block_trim(struct drm_buddy *mm, +u64 *start, u64 new_size, struct list_head *blocks) { struct drm_buddy_block *parent; struct drm_buddy_block *block; + u64 block_start, block_end; LIST_HEAD(dfs); u64 new_start; int err; @@ -882,6 +885,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm, struct drm_buddy_block, link); + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block) - 1; + if (WARN_ON(!drm_buddy_block_is_allocated(block))) return -EINVAL; @@ -894,6 +900,17 @@ int drm_buddy_block_trim(struct drm_buddy *mm, if (new_size == drm_buddy_block_size(mm, block)) return 0; + new_start = block_start; + if (start) { + new_start = *start; + + if (new_start < block_start) + return -EINVAL; + + if ((new_start + new_size) > block_end) + return -EINVAL; + } + list_del(>link); mark_free(mm, block); mm->avail += drm_buddy_block_size(mm, block); @@ -904,7 +921,6 @@ int drm_buddy_block_trim(struct drm_buddy *mm, parent = block->parent; block->parent = NULL; - new_start = drm_buddy_block_offset(block); list_add(>tmp_link, ); err = __alloc_range(mm, , new_start, new_size, blocks, NULL); if (err) { @@ -1066,7 +1082,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } while (1); /* Trim the allocated block to the required size */ - if (original_size != size) { + if (!(flags & DRM_BUDDY_TRIM_DISABLE) && + original_size != size) { struct list_head *trim_list; LIST_HEAD(temp); u64 trim_size; @@ -1083,6 +1100,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } drm_buddy_block_trim(mm, +NULL, trim_size, trim_list); diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c index fe3779fdba2c..423b261ea743 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c @@ -150,7 +150,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, } while (remaining_size); if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { - if (!drm_buddy_block_trim(mm, vres->base.size, >blocks)) + if (!drm_buddy_block_trim(mm, NULL, vres->base.size, >blocks)) size = vres->base.size; } diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h index 2a74fa9d0ce5..9689a7c5dd36 100644 --- a/include/drm/drm_buddy.h +++ b/include/drm/drm_buddy.h @@ -27,6 +27,7 @@ #define DRM_BUDDY_CONTIGUOUS_ALLOCATIONBIT(2) #define DRM_BUDDY_CLEAR_ALLOCATION BIT(3) #define DRM_BUDDY_CLEARED BIT(4) +#define DRM_BUDDY_TRIM_DISABLE BIT(5) struct drm_buddy_block { #define DRM_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) @@ -155,6 +156,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, unsigned long flags); int drm_buddy_block_trim(struct drm_buddy *mm, +u64 *start,
Re: [PATCH v3] drm/amdgpu: Fix the BO release clear memory warning
Hi Christian, On 6/10/2024 11:35 PM, Christian König wrote: Am 10.06.24 um 20:04 schrieb Arunpravin Paneer Selvam: This happens when the amdgpu_bo_release_notify running before amdgpu_ttm_set_buffer_funcs_status set the buffer funcs to enabled. check the buffer funcs enablement before calling the fill buffer memory. v2:(Christian) - Apply it only for GEM buffers and since GEM buffers are only allocated/freed while the driver is loaded we never run into the issue to clear with buffer funcs disabled. v3:(Mario) - drop the stable tag as this will presumably go into a -fixes PR for 6.10 Log snip: *ERROR* Trying to clear memory with ring turned off. RIP: 0010:amdgpu_bo_release_notify+0x201/0x220 [amdgpu] Fixes: a68c7eaa7a8f ("drm/amdgpu: Enable clear page functionality") Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König Tested-by: Mikhail Gavrilov Tested-by: Richard Gong Suggested-by: Christian König Please push to drm-misc-fixes ASAP. I pushed into drm-misc-fixes. Thanks, Arun Thanks, Christian. --- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 67c234bcf89f..3adaa4670103 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -108,6 +108,7 @@ int amdgpu_gem_object_create(struct amdgpu_device *adev, unsigned long size, memset(, 0, sizeof(bp)); *obj = NULL; + flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; bp.size = size; bp.byte_align = alignment; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8d8c39be6129..c556c8b653fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -604,8 +604,6 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (!amdgpu_bo_support_uswc(bo->flags)) bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; - bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; - bo->tbo.bdev = >mman.bdev; if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA | AMDGPU_GEM_DOMAIN_GDS))
[PATCH v3] drm/amdgpu: Fix the BO release clear memory warning
This happens when the amdgpu_bo_release_notify running before amdgpu_ttm_set_buffer_funcs_status set the buffer funcs to enabled. check the buffer funcs enablement before calling the fill buffer memory. v2:(Christian) - Apply it only for GEM buffers and since GEM buffers are only allocated/freed while the driver is loaded we never run into the issue to clear with buffer funcs disabled. v3:(Mario) - drop the stable tag as this will presumably go into a -fixes PR for 6.10 Log snip: *ERROR* Trying to clear memory with ring turned off. RIP: 0010:amdgpu_bo_release_notify+0x201/0x220 [amdgpu] Fixes: a68c7eaa7a8f ("drm/amdgpu: Enable clear page functionality") Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König Tested-by: Mikhail Gavrilov Tested-by: Richard Gong Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c| 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 67c234bcf89f..3adaa4670103 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -108,6 +108,7 @@ int amdgpu_gem_object_create(struct amdgpu_device *adev, unsigned long size, memset(, 0, sizeof(bp)); *obj = NULL; + flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; bp.size = size; bp.byte_align = alignment; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8d8c39be6129..c556c8b653fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -604,8 +604,6 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (!amdgpu_bo_support_uswc(bo->flags)) bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; - bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; - bo->tbo.bdev = >mman.bdev; if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA | AMDGPU_GEM_DOMAIN_GDS)) -- 2.25.1
[PATCH v2] drm/amdgpu: Fix the BO release clear memory warning
038632] do_syscall_64+0x84/0x1a0 [6.038634] ? do_syscall_64+0x90/0x1a0 [6.038635] ? srso_alias_return_thunk+0x5/0xfbef5 [6.038637] ? do_syscall_64+0x90/0x1a0 [6.038638] ? srso_alias_return_thunk+0x5/0xfbef5 [6.038639] ? do_syscall_64+0x90/0x1a0 [6.038640] ? srso_alias_return_thunk+0x5/0xfbef5 [6.038642] ? srso_alias_return_thunk+0x5/0xfbef5 [6.038644] entry_SYSCALL_64_after_hwframe+0x78/0x80 [6.038645] RIP: 0033:0x7f2bd1e9d059 [6.038647] Code: 08 89 e8 5b 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 8f 1d 0d 00 f7 d8 64 89 01 48 [6.038648] RSP: 002b:7fffaf804878 EFLAGS: 0246 ORIG_RAX: 0139 [6.038650] RAX: ffda RBX: 55a9b2328d60 RCX: 7f2bd1e9d059 [6.038650] RDX: RSI: 7f2bd1fd0509 RDI: 0024 [6.038651] RBP: R08: 0040 R09: 55a9b23000a0 [6.038652] R10: 0038 R11: 0246 R12: 7f2bd1fd0509 [6.038652] R13: 0002 R14: 55a9b2326f90 R15: [6.038655] [6.038656] ---[ end trace ]--- Cc: # 6.10+ Fixes: a68c7eaa7a8f ("drm/amdgpu: Enable clear page functionality") Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c| 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 67c234bcf89f..3adaa4670103 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -108,6 +108,7 @@ int amdgpu_gem_object_create(struct amdgpu_device *adev, unsigned long size, memset(, 0, sizeof(bp)); *obj = NULL; + flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; bp.size = size; bp.byte_align = alignment; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8d8c39be6129..c556c8b653fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -604,8 +604,6 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (!amdgpu_bo_support_uswc(bo->flags)) bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; - bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; - bo->tbo.bdev = >mman.bdev; if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA | AMDGPU_GEM_DOMAIN_GDS)) -- 2.25.1
Re: [PATCH] drm/amdgpu: Fix the BO release clear memory warning
Hi Christian, On 5/7/2024 8:21 PM, Christian König wrote: Am 06.05.24 um 15:48 schrieb Arunpravin Paneer Selvam: This happens when the amdgpu_bo_release_notify running before amdgpu_ttm_set_buffer_funcs_status set the buffer funcs to enabled. check the buffer funcs enablement before calling the fill buffer memory. Log snip: [ 6.036477] [drm:amdgpu_fill_buffer [amdgpu]] *ERROR* Trying to clear memory with ring turned off. [ 6.036667] [ cut here ] [ 6.036668] WARNING: CPU: 3 PID: 370 at drivers/gpu/drm/amd/amdgpu/amdgpu_object.c:1355 amdgpu_bo_release_notify+0x201/0x220 [amdgpu] [ 6.036767] Modules linked in: hid_generic amdgpu(+) amdxcp drm_exec gpu_sched drm_buddy i2c_algo_bit usbhid drm_suballoc_helper drm_display_helper hid sd_mod cec rc_core drm_ttm_helper ahci ttm nvme libahci drm_kms_helper nvme_core r8169 xhci_pci libata t10_pi xhci_hcd realtek crc32_pclmul crc64_rocksoft mdio_devres crc64 drm crc32c_intel scsi_mod usbcore thunderbolt crc_t10dif i2c_piix4 libphy crct10dif_generic crct10dif_pclmul crct10dif_common scsi_common usb_common video wmi gpio_amdpt gpio_generic button [ 6.036793] CPU: 3 PID: 370 Comm: (udev-worker) Not tainted 6.8.7-dirty #1 [ 6.036795] Hardware name: ASRock X670E Taichi/X670E Taichi, BIOS 2.10 03/26/2024 [ 6.036796] RIP: 0010:amdgpu_bo_release_notify+0x201/0x220 [amdgpu] [ 6.036891] Code: 0b e9 af fe ff ff 48 ba ff ff ff ff ff ff ff 7f 31 f6 4c 89 e7 e8 7f 2f 7a d8 eb 98 e8 18 28 7a d8 eb b2 0f 0b e9 58 fe ff ff <0f> 0b eb a7 be 03 00 00 00 e8 e1 89 4e d8 eb 9b e8 aa 4d ad d8 66 [ 6.036892] RSP: 0018:bbe140d1f638 EFLAGS: 00010282 [ 6.036894] RAX: ffea RBX: 90cba9e4e858 RCX: 90dabde38c28 [ 6.036895] RDX: RSI: dfff RDI: 0001 [ 6.036896] RBP: 90cba980ef40 R08: R09: bbe140d1f3c0 [ 6.036896] R10: bbe140d1f3b8 R11: 0003 R12: 90cba9e4e800 [ 6.036897] R13: 90cba9e4e958 R14: 90cba980ef40 R15: 0258 [ 6.036898] FS: 7f2bd1679d00() GS:90da7e2c() knlGS: [ 6.036899] CS: 0010 DS: ES: CR0: 80050033 [ 6.036900] CR2: 55a9b0f7299d CR3: 00011bb6e000 CR4: 00750ef0 [ 6.036901] PKRU: 5554 [ 6.036901] Call Trace: [ 6.036903] [ 6.036904] ? amdgpu_bo_release_notify+0x201/0x220 [amdgpu] [ 6.036998] ? __warn+0x81/0x130 [ 6.037002] ? amdgpu_bo_release_notify+0x201/0x220 [amdgpu] [ 6.037095] ? report_bug+0x171/0x1a0 [ 6.037099] ? handle_bug+0x3c/0x80 [ 6.037101] ? exc_invalid_op+0x17/0x70 [ 6.037103] ? asm_exc_invalid_op+0x1a/0x20 [ 6.037107] ? amdgpu_bo_release_notify+0x201/0x220 [amdgpu] [ 6.037199] ? amdgpu_bo_release_notify+0x14a/0x220 [amdgpu] [ 6.037292] ttm_bo_release+0xff/0x2e0 [ttm] [ 6.037297] ? srso_alias_return_thunk+0x5/0xfbef5 [ 6.037299] ? srso_alias_return_thunk+0x5/0xfbef5 [ 6.037301] ? ttm_resource_move_to_lru_tail+0x140/0x1e0 [ttm] [ 6.037306] amdgpu_bo_free_kernel+0xcb/0x120 [amdgpu] [ 6.037399] dm_helpers_free_gpu_mem+0x41/0x80 [amdgpu] [ 6.037544] dcn315_clk_mgr_construct+0x198/0x7e0 [amdgpu] [ 6.037692] dc_clk_mgr_create+0x16e/0x5f0 [amdgpu] [ 6.037826] dc_create+0x28a/0x650 [amdgpu] [ 6.037958] amdgpu_dm_init.isra.0+0x2d5/0x1ec0 [amdgpu] [ 6.038085] ? srso_alias_return_thunk+0x5/0xfbef5 [ 6.038087] ? prb_read_valid+0x1b/0x30 [ 6.038089] ? srso_alias_return_thunk+0x5/0xfbef5 [ 6.038090] ? console_unlock+0x78/0x120 [ 6.038092] ? srso_alias_return_thunk+0x5/0xfbef5 [ 6.038094] ? vprintk_emit+0x175/0x2c0 [ 6.038095] ? srso_alias_return_thunk+0x5/0xfbef5 [ 6.038097] ? srso_alias_return_thunk+0x5/0xfbef5 [ 6.038098] ? dev_printk_emit+0xa5/0xd0 [ 6.038104] dm_hw_init+0x12/0x30 [amdgpu] [ 6.038209] amdgpu_device_init+0x1e50/0x2500 [amdgpu] [ 6.038308] ? srso_alias_return_thunk+0x5/0xfbef5 [ 6.038310] ? srso_alias_return_thunk+0x5/0xfbef5 [ 6.038313] amdgpu_driver_load_kms+0x19/0x190 [amdgpu] [ 6.038409] amdgpu_pci_probe+0x18b/0x510 [amdgpu] [ 6.038505] local_pci_probe+0x42/0xa0 [ 6.038508] pci_device_probe+0xc7/0x240 [ 6.038510] really_probe+0x19b/0x3e0 [ 6.038513] ? __pfx___driver_attach+0x10/0x10 [ 6.038514] __driver_probe_device+0x78/0x160 [ 6.038516] driver_probe_device+0x1f/0x90 [ 6.038517] __driver_attach+0xd2/0x1c0 [ 6.038519] bus_for_each_dev+0x85/0xd0 [ 6.038521] bus_add_driver+0x116/0x220 [ 6.038523] driver_register+0x59/0x100 [ 6.038525] ? __pfx_amdgpu_init+0x10/0x10 [amdgpu] [ 6.038618] do_one_initcall+0x58/0x320 [ 6.038621] do_init_module+0x60/0x230 [ 6.038624] init_module_from_file+0x89/0xe0 [ 6.038628] idempotent_init_module+0x120/0x2b0 [ 6.038630] __x64_sys_finit_module+0x5e/0xb0 [ 6.038632] do_syscall_64+0x84/0x1a0 [ 6.038634] ? do_syscall_6
Re: [PATCH v2] drm/buddy: Fix the warn on's during force merge
Hi Dave, Please help pull this patch into drm-next. This will fix any unnecessary warnings in memory pressure situation. Thanks for the help. Regards, Arun. On 5/17/2024 8:03 PM, Arunpravin Paneer Selvam wrote: Move the fallback and block incompatible checks above, so that we dont unnecessarily split the blocks and leaving the unmerged. This resolves the unnecessary warn on's thrown during force_merge call. v2:(Matthew) - Move the fallback and block incompatible checks above the contains check. Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Matthew Auld Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature") Link: https://patchwork.kernel.org/project/dri-devel/patch/20240517135015.17565-1-arunpravin.paneersel...@amd.com/ --- drivers/gpu/drm/drm_buddy.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 1daf778cf6fa..94f8c34fc293 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -524,11 +524,11 @@ __alloc_range_bias(struct drm_buddy *mm, continue; } + if (!fallback && block_incompatible(block, flags)) + continue; + if (contains(start, end, block_start, block_end) && order == drm_buddy_block_order(block)) { - if (!fallback && block_incompatible(block, flags)) - continue; - /* * Find the free block within the range. */
Re: 6.10/regression/bisected - commit a68c7eaa7a8f cause *ERROR* Trying to clear memory with ring turned off in amdgpu_fill_buffer.
729714] __driver_probe_device+0x18c/0x370 [ 13.729718] driver_probe_device+0x4a/0x120 [ 13.729721] __driver_attach+0x194/0x4a0 [ 13.729724] ? __pfx___driver_attach+0x10/0x10 [ 13.729726] bus_for_each_dev+0x106/0x190 [ 13.729730] ? __pfx_bus_for_each_dev+0x10/0x10 [ 13.729737] bus_add_driver+0x2ff/0x530 [ 13.729742] driver_register+0x1a5/0x360 [ 13.729745] ? __pfx_amdgpu_init+0x10/0x10 [amdgpu] [ 13.729976] do_one_initcall+0xd6/0x460 [ 13.729980] ? __pfx_do_one_initcall+0x10/0x10 [ 13.729987] ? kasan_unpoison+0x44/0x70 [ 13.729992] do_init_module+0x296/0x7c0 [ 13.72] load_module+0x576d/0x7480 [ 13.730013] ? __pfx_load_module+0x10/0x10 [ 13.730019] ? __might_fault+0x9d/0x120 [ 13.730022] ? local_clock_noinstr+0xd/0x100 [ 13.730032] ? ima_post_load_data+0x68/0x80 [ 13.730038] ? __do_sys_init_module+0x1ef/0x220 [ 13.730040] __do_sys_init_module+0x1ef/0x220 [ 13.730042] ? __pfx___do_sys_init_module+0x10/0x10 [ 13.730054] do_syscall_64+0x97/0x190 [ 13.730057] ? __lock_acquire+0xf65/0x5c70 [ 13.730061] ? __pfx___lock_acquire+0x10/0x10 [ 13.730067] ? do_user_addr_fault+0x4d5/0xbc0 [ 13.730071] ? reacquire_held_locks+0x219/0x4f0 [ 13.730073] ? do_user_addr_fault+0x4d5/0xbc0 [ 13.730079] ? do_user_addr_fault+0x558/0xbc0 [ 13.730081] ? local_clock_noinstr+0xd/0x100 [ 13.730086] ? __pfx_lock_release+0x10/0x10 [ 13.730090] ? handle_mm_fault+0x47d/0x8d0 [ 13.730099] ? lockdep_hardirqs_on_prepare+0x171/0x400 [ 13.730103] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 13.730106] RIP: 0033:0x7f22a1fbc5ae [ 13.730113] Code: 48 8b 0d 85 a8 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 af 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 52 a8 0c 00 f7 d8 64 89 01 48 [ 13.730115] RSP: 002b:7ffce615c888 EFLAGS: 0246 ORIG_RAX: 00af [ 13.730118] RAX: ffda RBX: 55f90368f520 RCX: 7f22a1fbc5ae [ 13.730119] RDX: 55f9036c2e40 RSI: 05749496 RDI: 7f2299c00010 [ 13.730120] RBP: 7ffce615c940 R08: 55f903645010 R09: 0007 [ 13.730122] R10: 0006 R11: 0246 R12: 55f9036c2e40 [ 13.730123] R13: 0002 R14: 55f9036b3560 R15: 55f9036d2f70 [ 13.730131] [ 13.730132] irq event stamp: 2745383 [ 13.730134] hardirqs last enabled at (2745389): [] vprintk_emit.part.0+0x44a/0x4b0 [ 13.730136] hardirqs last disabled at (2745394): [] vprintk_emit.part.0+0x3fd/0x4b0 [ 13.730138] softirqs last enabled at (2744940): [] __irq_exit_rcu+0xbb/0x1c0 [ 13.730141] softirqs last disabled at (2744929): [] __irq_exit_rcu+0xbb/0x1c0 [ 13.730143] ---[ end trace ]--- [ 13.730605] [drm] Display Core v3.2.281 initialized on DCN 3.1.5 Of course, I immediately wanted to find the first bad commit. And bad commit it has already been found: a68c7eaa7a8ffdec9287ba1561a668d674c20a13 is the first bad commit commit a68c7eaa7a8ffdec9287ba1561a668d674c20a13 (HEAD) Author: Arunpravin Paneer Selvam Date: Fri Apr 19 12:05:37 2024 +0530 drm/amdgpu: Enable clear page functionality Add clear page support in vram memory region. v1(Christian): - Dont handle clear page as TTM flag since when moving the BO back in from GTT again we don't need that. - Make a specialized version of amdgpu_fill_buffer() which only clears the VRAM areas which are not already cleared - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in amdgpu_object.c v2: - Modify the function name amdgpu_ttm_* (Alex) - Drop the delayed parameter (Christian) - handle amdgpu_res_cleared() just above the size calculation (Christian) - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers in the free path to properly wait for fences etc.. (Christian) v3(Christian): - Remove buffer clear code in VRAM manager instead change the AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set the DRM_BUDDY_CLEARED flag. - Remove ! from amdgpu_res_cleared() check. v4(Christian): - vres flag setting move to vram manager file - use dma_fence_get_stub in amdgpu_ttm_clear_buffer function - make fence a mandatory parameter and drop the if and the get/put dance Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König Acked-by: Felix Kuehling Link: https://patchwork.freedesktop.org/patch/msgid/20240419063538.11957-2-arunpravin.paneersel...@amd.com Signed-off-by: Christian König drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 9 + drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h | 25 + drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c| 70 -- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h| 5 +++-- dri
Re: [PATCH v2] drm/buddy: Fix the warn on's during force merge
On 5/17/2024 7:30 PM, Matthew Auld wrote: On 17/05/2024 14:50, Arunpravin Paneer Selvam wrote: Move the fallback and block incompatible checks above, so that we dont unnecessarily split the blocks and leaving the unmerged. This resolves the unnecessary warn on's thrown during force_merge call. v2:(Matthew) - Move the fallback and block incompatible checks above the contains check. Signed-off-by: Arunpravin Paneer Selvam Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature") Reviewed-by: Matthew Auld A follow up unit test to catch this edge case would be lovely. Yes, I will follow up on this. Thanks, Arun. --- drivers/gpu/drm/drm_buddy.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 1daf778cf6fa..94f8c34fc293 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -524,11 +524,11 @@ __alloc_range_bias(struct drm_buddy *mm, continue; } + if (!fallback && block_incompatible(block, flags)) + continue; + if (contains(start, end, block_start, block_end) && order == drm_buddy_block_order(block)) { - if (!fallback && block_incompatible(block, flags)) - continue; - /* * Find the free block within the range. */
[PATCH v2] drm/buddy: Fix the warn on's during force merge
Move the fallback and block incompatible checks above, so that we dont unnecessarily split the blocks and leaving the unmerged. This resolves the unnecessary warn on's thrown during force_merge call. v2:(Matthew) - Move the fallback and block incompatible checks above the contains check. Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Matthew Auld Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature") Link: https://patchwork.kernel.org/project/dri-devel/patch/20240517135015.17565-1-arunpravin.paneersel...@amd.com/ --- drivers/gpu/drm/drm_buddy.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 1daf778cf6fa..94f8c34fc293 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -524,11 +524,11 @@ __alloc_range_bias(struct drm_buddy *mm, continue; } + if (!fallback && block_incompatible(block, flags)) + continue; + if (contains(start, end, block_start, block_end) && order == drm_buddy_block_order(block)) { - if (!fallback && block_incompatible(block, flags)) - continue; - /* * Find the free block within the range. */ -- 2.25.1
Re: [PATCH v2] drm/buddy: Fix the warn on's during force merge
Hi Matthew, This fixes the problem. Regards, Arun. On 5/17/2024 7:20 PM, Arunpravin Paneer Selvam wrote: Move the fallback and block incompatible checks above, so that we dont unnecessarily split the blocks and leaving the unmerged. This resolves the unnecessary warn on's thrown during force_merge call. v2:(Matthew) - Move the fallback and block incompatible checks above the contains check. Signed-off-by: Arunpravin Paneer Selvam Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature") --- drivers/gpu/drm/drm_buddy.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 1daf778cf6fa..94f8c34fc293 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -524,11 +524,11 @@ __alloc_range_bias(struct drm_buddy *mm, continue; } + if (!fallback && block_incompatible(block, flags)) + continue; + if (contains(start, end, block_start, block_end) && order == drm_buddy_block_order(block)) { - if (!fallback && block_incompatible(block, flags)) - continue; - /* * Find the free block within the range. */
[PATCH v2] drm/buddy: Fix the warn on's during force merge
Move the fallback and block incompatible checks above, so that we dont unnecessarily split the blocks and leaving the unmerged. This resolves the unnecessary warn on's thrown during force_merge call. v2:(Matthew) - Move the fallback and block incompatible checks above the contains check. Signed-off-by: Arunpravin Paneer Selvam Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature") --- drivers/gpu/drm/drm_buddy.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 1daf778cf6fa..94f8c34fc293 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -524,11 +524,11 @@ __alloc_range_bias(struct drm_buddy *mm, continue; } + if (!fallback && block_incompatible(block, flags)) + continue; + if (contains(start, end, block_start, block_end) && order == drm_buddy_block_order(block)) { - if (!fallback && block_incompatible(block, flags)) - continue; - /* * Find the free block within the range. */ -- 2.25.1
Re: [PATCH] drm/buddy: Merge back blocks in bias range function
Hi Matthew, Could you help review this patch quickly. Hi Dave This patch just fixes the unnecessary warn on's triggered during the force_merge call. Regards, Arun. On 5/17/2024 6:08 PM, Arunpravin Paneer Selvam wrote: In bias range allocation, when we don't find the required blocks (i.e) on returning the -ENOSPC, we should merge back the split blocks. Otherwise, during force_merge we are flooded with warn on's due to block and its buddy are in same clear state (dirty or clear). Hence, renamed the force_merge with merge_blocks and passed a force_merge as a bool function parameter. Based on the requirement, say, in any normal situation we can call the merge_blocks passing the force_merge variable as false. And, in any memory cruch situation, we can call the merge_blocks passing the force_merge as true. This resolves the unnecessary warn on's thrown during force_merge call. Signed-off-by: Arunpravin Paneer Selvam Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature") --- drivers/gpu/drm/drm_buddy.c | 32 ++-- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 1daf778cf6fa..111f602f1359 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -161,10 +161,11 @@ static unsigned int __drm_buddy_free(struct drm_buddy *mm, return order; } -static int __force_merge(struct drm_buddy *mm, -u64 start, -u64 end, -unsigned int min_order) +static int __merge_blocks(struct drm_buddy *mm, + u64 start, + u64 end, + unsigned int min_order, + bool force_merge) { unsigned int order; int i; @@ -195,8 +196,9 @@ static int __force_merge(struct drm_buddy *mm, if (!drm_buddy_block_is_free(buddy)) continue; - WARN_ON(drm_buddy_block_is_clear(block) == - drm_buddy_block_is_clear(buddy)); + if (force_merge) + WARN_ON(drm_buddy_block_is_clear(block) == + drm_buddy_block_is_clear(buddy)); /* * If the prev block is same as buddy, don't access the @@ -210,7 +212,7 @@ static int __force_merge(struct drm_buddy *mm, if (drm_buddy_block_is_clear(block)) mm->clear_avail -= drm_buddy_block_size(mm, block); - order = __drm_buddy_free(mm, block, true); + order = __drm_buddy_free(mm, block, force_merge); if (order >= min_order) return 0; } @@ -332,7 +334,7 @@ void drm_buddy_fini(struct drm_buddy *mm) for (i = 0; i < mm->n_roots; ++i) { order = ilog2(size) - ilog2(mm->chunk_size); - __force_merge(mm, 0, size, order); + __merge_blocks(mm, 0, size, order, true); WARN_ON(!drm_buddy_block_is_free(mm->roots[i])); drm_block_free(mm, mm->roots[i]); @@ -479,7 +481,7 @@ __alloc_range_bias(struct drm_buddy *mm, unsigned long flags, bool fallback) { - u64 req_size = mm->chunk_size << order; + u64 size, root_size, req_size = mm->chunk_size << order; struct drm_buddy_block *block; struct drm_buddy_block *buddy; LIST_HEAD(dfs); @@ -487,6 +489,7 @@ __alloc_range_bias(struct drm_buddy *mm, int i; end = end - 1; + size = mm->size; for (i = 0; i < mm->n_roots; ++i) list_add_tail(>roots[i]->tmp_link, ); @@ -548,6 +551,15 @@ __alloc_range_bias(struct drm_buddy *mm, list_add(>left->tmp_link, ); } while (1); + /* Merge back the split blocks */ + for (i = 0; i < mm->n_roots; ++i) { + order = ilog2(size) - ilog2(mm->chunk_size); + __merge_blocks(mm, start, end, order, false); + + root_size = mm->chunk_size << order; + size -= root_size; + } + return ERR_PTR(-ENOSPC); err_undo: @@ -1026,7 +1038,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, if (order-- == min_order) { /* Try allocation through force merge method */ if (mm->clear_avail && - !__force_merge(mm, start, end, min_order)) { + !__merge_blocks(mm, start, end, min_order, true)) { block = __drm_buddy_alloc_blocks(mm, start, end, min_order,
[PATCH] drm/buddy: Merge back blocks in bias range function
In bias range allocation, when we don't find the required blocks (i.e) on returning the -ENOSPC, we should merge back the split blocks. Otherwise, during force_merge we are flooded with warn on's due to block and its buddy are in same clear state (dirty or clear). Hence, renamed the force_merge with merge_blocks and passed a force_merge as a bool function parameter. Based on the requirement, say, in any normal situation we can call the merge_blocks passing the force_merge variable as false. And, in any memory cruch situation, we can call the merge_blocks passing the force_merge as true. This resolves the unnecessary warn on's thrown during force_merge call. Signed-off-by: Arunpravin Paneer Selvam Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature") --- drivers/gpu/drm/drm_buddy.c | 32 ++-- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 1daf778cf6fa..111f602f1359 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -161,10 +161,11 @@ static unsigned int __drm_buddy_free(struct drm_buddy *mm, return order; } -static int __force_merge(struct drm_buddy *mm, -u64 start, -u64 end, -unsigned int min_order) +static int __merge_blocks(struct drm_buddy *mm, + u64 start, + u64 end, + unsigned int min_order, + bool force_merge) { unsigned int order; int i; @@ -195,8 +196,9 @@ static int __force_merge(struct drm_buddy *mm, if (!drm_buddy_block_is_free(buddy)) continue; - WARN_ON(drm_buddy_block_is_clear(block) == - drm_buddy_block_is_clear(buddy)); + if (force_merge) + WARN_ON(drm_buddy_block_is_clear(block) == + drm_buddy_block_is_clear(buddy)); /* * If the prev block is same as buddy, don't access the @@ -210,7 +212,7 @@ static int __force_merge(struct drm_buddy *mm, if (drm_buddy_block_is_clear(block)) mm->clear_avail -= drm_buddy_block_size(mm, block); - order = __drm_buddy_free(mm, block, true); + order = __drm_buddy_free(mm, block, force_merge); if (order >= min_order) return 0; } @@ -332,7 +334,7 @@ void drm_buddy_fini(struct drm_buddy *mm) for (i = 0; i < mm->n_roots; ++i) { order = ilog2(size) - ilog2(mm->chunk_size); - __force_merge(mm, 0, size, order); + __merge_blocks(mm, 0, size, order, true); WARN_ON(!drm_buddy_block_is_free(mm->roots[i])); drm_block_free(mm, mm->roots[i]); @@ -479,7 +481,7 @@ __alloc_range_bias(struct drm_buddy *mm, unsigned long flags, bool fallback) { - u64 req_size = mm->chunk_size << order; + u64 size, root_size, req_size = mm->chunk_size << order; struct drm_buddy_block *block; struct drm_buddy_block *buddy; LIST_HEAD(dfs); @@ -487,6 +489,7 @@ __alloc_range_bias(struct drm_buddy *mm, int i; end = end - 1; + size = mm->size; for (i = 0; i < mm->n_roots; ++i) list_add_tail(>roots[i]->tmp_link, ); @@ -548,6 +551,15 @@ __alloc_range_bias(struct drm_buddy *mm, list_add(>left->tmp_link, ); } while (1); + /* Merge back the split blocks */ + for (i = 0; i < mm->n_roots; ++i) { + order = ilog2(size) - ilog2(mm->chunk_size); + __merge_blocks(mm, start, end, order, false); + + root_size = mm->chunk_size << order; + size -= root_size; + } + return ERR_PTR(-ENOSPC); err_undo: @@ -1026,7 +1038,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, if (order-- == min_order) { /* Try allocation through force merge method */ if (mm->clear_avail && - !__force_merge(mm, start, end, min_order)) { + !__merge_blocks(mm, start, end, min_order, true)) { block = __drm_buddy_alloc_blocks(mm, start, end, min_order, -- 2.25.1
[PATCH v3 1/2] drm/buddy: Fix the range bias clear memory allocation issue
Problem statement: During the system boot time, an application request for the bulk volume of cleared range bias memory when the clear_avail is zero, we dont fallback into normal allocation method as we had an unnecessary clear_avail check which prevents the fallback method leads to fb allocation failure following system goes into unresponsive state. Solution: Remove the unnecessary clear_avail check in the range bias allocation function. v2: add a kunit for this corner case (Daniel Vetter) Signed-off-by: Arunpravin Paneer Selvam Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature") Reviewed-by: Matthew Auld --- drivers/gpu/drm/drm_buddy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 284ebae71cc4..1daf778cf6fa 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -249,6 +249,7 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) mm->size = size; mm->avail = size; + mm->clear_avail = 0; mm->chunk_size = chunk_size; mm->max_order = ilog2(size) - ilog2(chunk_size); @@ -574,7 +575,7 @@ __drm_buddy_alloc_range_bias(struct drm_buddy *mm, block = __alloc_range_bias(mm, start, end, order, flags, fallback); - if (IS_ERR(block) && mm->clear_avail) + if (IS_ERR(block)) return __alloc_range_bias(mm, start, end, order, flags, !fallback); -- 2.25.1
[PATCH v3 2/2] drm/tests: Add a unit test for range bias allocation
Allocate cleared blocks in the bias range when the DRM buddy's clear avail is zero. This will validate the bias range allocation in scenarios like system boot when no cleared blocks are available and exercise the fallback path too. The resulting blocks should always be dirty. v1:(Matthew) - move the size to the variable declaration section. - move the mm.clear_avail init to allocator init. Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Matthew Auld --- drivers/gpu/drm/tests/drm_buddy_test.c | 36 +- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c b/drivers/gpu/drm/tests/drm_buddy_test.c index e3b50e240d36..b3be68b03610 100644 --- a/drivers/gpu/drm/tests/drm_buddy_test.c +++ b/drivers/gpu/drm/tests/drm_buddy_test.c @@ -23,9 +23,11 @@ static inline u64 get_size(int order, u64 chunk_size) static void drm_test_buddy_alloc_range_bias(struct kunit *test) { - u32 mm_size, ps, bias_size, bias_start, bias_end, bias_rem; + u32 mm_size, size, ps, bias_size, bias_start, bias_end, bias_rem; DRM_RND_STATE(prng, random_seed); unsigned int i, count, *order; + struct drm_buddy_block *block; + unsigned long flags; struct drm_buddy mm; LIST_HEAD(allocated); @@ -222,6 +224,38 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) drm_buddy_free_list(, , 0); drm_buddy_fini(); + + /* +* Allocate cleared blocks in the bias range when the DRM buddy's clear avail is +* zero. This will validate the bias range allocation in scenarios like system boot +* when no cleared blocks are available and exercise the fallback path too. The resulting +* blocks should always be dirty. +*/ + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(, mm_size, ps), + "buddy_init failed\n"); + + bias_start = round_up(prandom_u32_state() % (mm_size - ps), ps); + bias_end = round_up(bias_start + prandom_u32_state() % (mm_size - bias_start), ps); + bias_end = max(bias_end, bias_start + ps); + bias_rem = bias_end - bias_start; + + flags = DRM_BUDDY_CLEAR_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION; + size = max(round_up(prandom_u32_state() % bias_rem, ps), ps); + + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(, bias_start, + bias_end, size, ps, + , + flags), + "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, size, ps); + + list_for_each_entry(block, , link) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); + + drm_buddy_free_list(, , 0); + drm_buddy_fini(); } static void drm_test_buddy_alloc_clear(struct kunit *test) -- 2.25.1
Re: [PATCH v2 2/2] drm/tests: Add a unit test for range bias allocation
Hi Matthew, On 5/13/2024 1:49 PM, Matthew Auld wrote: On 12/05/2024 08:59, Arunpravin Paneer Selvam wrote: Allocate cleared blocks in the bias range when the DRM buddy's clear avail is zero. This will validate the bias range allocation in scenarios like system boot when no cleared blocks are available and exercise the fallback path too. The resulting blocks should always be dirty. Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/tests/drm_buddy_test.c | 35 ++ 1 file changed, 35 insertions(+) diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c b/drivers/gpu/drm/tests/drm_buddy_test.c index e3b50e240d36..a194f271bc55 100644 --- a/drivers/gpu/drm/tests/drm_buddy_test.c +++ b/drivers/gpu/drm/tests/drm_buddy_test.c @@ -26,6 +26,8 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) u32 mm_size, ps, bias_size, bias_start, bias_end, bias_rem; DRM_RND_STATE(prng, random_seed); unsigned int i, count, *order; + struct drm_buddy_block *block; + unsigned long flags; struct drm_buddy mm; LIST_HEAD(allocated); @@ -222,6 +224,39 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) drm_buddy_free_list(, , 0); drm_buddy_fini(); + + /* + * Allocate cleared blocks in the bias range when the DRM buddy's clear avail is + * zero. This will validate the bias range allocation in scenarios like system boot + * when no cleared blocks are available and exercise the fallback path too. The resulting + * blocks should always be dirty. + */ + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(, mm_size, ps), + "buddy_init failed\n"); + mm.clear_avail = 0; Should already be zero, right? Maybe make this an assert instead? No, since the mm declared as a local variable in the test case, mm.clear_avail is not zero. + + bias_start = round_up(prandom_u32_state() % (mm_size - ps), ps); + bias_end = round_up(bias_start + prandom_u32_state() % (mm_size - bias_start), ps); + bias_end = max(bias_end, bias_start + ps); + bias_rem = bias_end - bias_start; + + flags = DRM_BUDDY_CLEAR_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION; + u32 size = max(round_up(prandom_u32_state() % bias_rem, ps), ps); u32 declaration should be moved to above? Sure. Thanks, Arun. Otherwise, Reviewed-by: Matthew Auld + + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(, bias_start, + bias_end, size, ps, + , + flags), + "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, size, ps); + + list_for_each_entry(block, , link) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); + + drm_buddy_free_list(, , 0); + drm_buddy_fini(); } static void drm_test_buddy_alloc_clear(struct kunit *test)
Re: [PATCH] drm/buddy: Fix the range bias clear memory allocation issue
Hi Daniel, On 5/8/2024 2:11 PM, Daniel Vetter wrote: On Wed, May 08, 2024 at 12:27:20PM +0530, Arunpravin Paneer Selvam wrote: Problem statement: During the system boot time, an application request for the bulk volume of cleared range bias memory when the clear_avail is zero, we dont fallback into normal allocation method as we had an unnecessary clear_avail check which prevents the fallback method leads to fb allocation failure following system goes into unresponsive state. Solution: Remove the unnecessary clear_avail check in the range bias allocation function. Signed-off-by: Arunpravin Paneer Selvam Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature") Reviewed-by: Matthew Auld --- drivers/gpu/drm/drm_buddy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Can you please also add a kunit test case to exercise this corner case and make sure it stays fixed? I have sent the v2 along with a kunit test case for this corner case. Thanks, Arun. Thanks, Sima diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 284ebae71cc4..831929ac95eb 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -574,7 +574,7 @@ __drm_buddy_alloc_range_bias(struct drm_buddy *mm, block = __alloc_range_bias(mm, start, end, order, flags, fallback); - if (IS_ERR(block) && mm->clear_avail) + if (IS_ERR(block)) return __alloc_range_bias(mm, start, end, order, flags, !fallback); -- 2.25.1
[PATCH v2 1/2] drm/buddy: Fix the range bias clear memory allocation issue
Problem statement: During the system boot time, an application request for the bulk volume of cleared range bias memory when the clear_avail is zero, we dont fallback into normal allocation method as we had an unnecessary clear_avail check which prevents the fallback method leads to fb allocation failure following system goes into unresponsive state. Solution: Remove the unnecessary clear_avail check in the range bias allocation function. v2: add a kunit for this corner case (Daniel Vetter) Signed-off-by: Arunpravin Paneer Selvam Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature") Reviewed-by: Matthew Auld --- drivers/gpu/drm/drm_buddy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 284ebae71cc4..831929ac95eb 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -574,7 +574,7 @@ __drm_buddy_alloc_range_bias(struct drm_buddy *mm, block = __alloc_range_bias(mm, start, end, order, flags, fallback); - if (IS_ERR(block) && mm->clear_avail) + if (IS_ERR(block)) return __alloc_range_bias(mm, start, end, order, flags, !fallback); -- 2.25.1
[PATCH v2 2/2] drm/tests: Add a unit test for range bias allocation
Allocate cleared blocks in the bias range when the DRM buddy's clear avail is zero. This will validate the bias range allocation in scenarios like system boot when no cleared blocks are available and exercise the fallback path too. The resulting blocks should always be dirty. Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/tests/drm_buddy_test.c | 35 ++ 1 file changed, 35 insertions(+) diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c b/drivers/gpu/drm/tests/drm_buddy_test.c index e3b50e240d36..a194f271bc55 100644 --- a/drivers/gpu/drm/tests/drm_buddy_test.c +++ b/drivers/gpu/drm/tests/drm_buddy_test.c @@ -26,6 +26,8 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) u32 mm_size, ps, bias_size, bias_start, bias_end, bias_rem; DRM_RND_STATE(prng, random_seed); unsigned int i, count, *order; + struct drm_buddy_block *block; + unsigned long flags; struct drm_buddy mm; LIST_HEAD(allocated); @@ -222,6 +224,39 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) drm_buddy_free_list(, , 0); drm_buddy_fini(); + + /* +* Allocate cleared blocks in the bias range when the DRM buddy's clear avail is +* zero. This will validate the bias range allocation in scenarios like system boot +* when no cleared blocks are available and exercise the fallback path too. The resulting +* blocks should always be dirty. +*/ + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(, mm_size, ps), + "buddy_init failed\n"); + mm.clear_avail = 0; + + bias_start = round_up(prandom_u32_state() % (mm_size - ps), ps); + bias_end = round_up(bias_start + prandom_u32_state() % (mm_size - bias_start), ps); + bias_end = max(bias_end, bias_start + ps); + bias_rem = bias_end - bias_start; + + flags = DRM_BUDDY_CLEAR_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION; + u32 size = max(round_up(prandom_u32_state() % bias_rem, ps), ps); + + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(, bias_start, + bias_end, size, ps, + , + flags), + "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, size, ps); + + list_for_each_entry(block, , link) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); + + drm_buddy_free_list(, , 0); + drm_buddy_fini(); } static void drm_test_buddy_alloc_clear(struct kunit *test) -- 2.25.1
[PATCH v10 6/6] drm/amdgpu: Enable userq fence interrupt support
Add support to handle the userqueue protected fence signal hardware interrupt. Create a xarray which maps the doorbell index to the fence driver address. This would help to retrieve the fence driver information when an userq fence interrupt is triggered. Firmware sends the doorbell offset value and this info is compared with the queue's mqd doorbell offset value. If they are same, we process the userq fence interrupt. v1:(Christian) - use xa_load() instead of going over all entries - keep the xa_lock until the fence driver process completes - create a separate patch to remove the MES self test function call. Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 2 ++ .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 15 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c| 23 +-- 4 files changed, 30 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 4ca14b02668b..2d5ef2e74c71 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1043,6 +1043,8 @@ struct amdgpu_device { struct amdgpu_mqd mqds[AMDGPU_HW_IP_NUM]; const struct amdgpu_userq_funcs *userq_funcs[AMDGPU_HW_IP_NUM]; + struct xarray userq_xa; + /* df */ struct amdgpu_dfdf; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2d9fa3d0d4a4..fd919105a181 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3982,6 +3982,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, spin_lock_init(>audio_endpt_idx_lock); spin_lock_init(>mm_stats.lock); + xa_init_flags(>userq_xa, XA_FLAGS_LOCK_IRQ); + INIT_LIST_HEAD(>shadow_list); mutex_init(>shadow_list_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index 339d82d5808f..4cbc25595226 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -70,6 +70,7 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, struct amdgpu_usermode_queue *userq) { struct amdgpu_userq_fence_driver *fence_drv; + unsigned long flags; int r; fence_drv = kzalloc(sizeof(*fence_drv), GFP_KERNEL); @@ -97,6 +98,11 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, fence_drv->context = dma_fence_context_alloc(1); get_task_comm(fence_drv->timeline_name, current); + xa_lock_irqsave(>userq_xa, flags); + __xa_store(>userq_xa, userq->doorbell_index, + fence_drv, GFP_KERNEL); + xa_unlock_irqrestore(>userq_xa, flags); + userq->fence_drv = fence_drv; return 0; @@ -147,8 +153,11 @@ void amdgpu_userq_fence_driver_destroy(struct kref *ref) struct amdgpu_userq_fence_driver *fence_drv = container_of(ref, struct amdgpu_userq_fence_driver, refcount); + struct amdgpu_userq_fence_driver *xa_fence_drv; struct amdgpu_device *adev = fence_drv->adev; struct amdgpu_userq_fence *fence, *tmp; + struct xarray *xa = >userq_xa; + unsigned long index; struct dma_fence *f; spin_lock(_drv->fence_list_lock); @@ -165,6 +174,12 @@ void amdgpu_userq_fence_driver_destroy(struct kref *ref) } spin_unlock(_drv->fence_list_lock); + xa_lock(xa); + xa_for_each(xa, index, xa_fence_drv) + if (xa_fence_drv == fence_drv) + __xa_erase(xa, index); + xa_unlock(xa); + /* Free seq64 memory */ amdgpu_seq64_free(adev, fence_drv->gpu_addr); kfree(fence_drv); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index a786e25432ae..0a206f484240 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -49,6 +49,7 @@ #include "gfx_v11_0_3.h" #include "nbio_v4_3.h" #include "mes_v11_0.h" +#include "amdgpu_userq_fence.h" #define GFX11_NUM_GFX_RINGS1 #define GFX11_MEC_HPD_SIZE 2048 @@ -5939,25 +5940,23 @@ static int gfx_v11_0_eop_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) { - int i; + u32 doorbell_offset = entry->src_data[0]; u8 me_id, pipe_id, queue_id; struct amdgpu_ring *ring; - uint32_t mes_queue_id = entry->src_data[0]; +
[PATCH v10 1/6] drm/amdgpu: Implement a new userqueue fence driver
Developed a userqueue fence driver for the userqueue process shared BO synchronization. Create a dma fence having write pointer as the seqno and allocate a seq64 memory for each user queue process and feed this memory address into the firmware/hardware, thus the firmware writes the read pointer into the given address when the process completes it execution. Compare wptr and rptr, if rptr >= wptr, signal the fences for the waiting process to consume the buffers. v2: Worked on review comments from Christian for the following modifications - Add wptr as sequence number into the fence - Add a reference count for the fence driver - Add dma_fence_put below the list_del as it might frees the userq fence. - Trim unnecessary code in interrupt handler. - Check dma fence signaled state in dma fence creation function for a potential problem of hardware completing the job processing beforehand. - Add necessary locks. - Create a list and process all the unsignaled fences. - clean up fences in destroy function. - implement .signaled callback function v3: Worked on review comments from Christian - Modify naming convention for reference counted objects - Fix fence driver reference drop issue - Drop amdgpu_userq_fence_driver_process() function return value v4: Worked on review comments from Christian - Moved fence driver allocation into amdgpu_userq_fence_driver_alloc() - Added detail doc mentioning the differences b/w two spinlocks declared. v5: Worked on review comments from Christian - Check before upcast and remove local variable - Add error handling in fence_drv alloc function. - Move rptr read fn outside of the loop and remove WARN_ON in destroy function. v6: - clear the seq64 memory in user fence driver(Christian) - fix for the wptr va bo mapping(Christian) - move the fence_drv xa entry erase code from the interrupt handler into user fence destroy function Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/Makefile | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 + drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c | 4 +- .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 257 ++ .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.h | 69 + drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 12 +- .../gpu/drm/amd/include/amdgpu_userqueue.h| 5 +- 7 files changed, 349 insertions(+), 7 deletions(-) create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index a640bfa468ad..f8dd808bd71c 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -80,7 +80,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \ amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \ amdgpu_fw_attestation.o amdgpu_securedisplay.o \ amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \ - amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o + amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o \ + amdgpu_userq_fence.o amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index acee1c279abb..844f7b5f90db 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -51,6 +51,7 @@ #include "amdgpu_sched.h" #include "amdgpu_xgmi.h" #include "amdgpu_userqueue.h" +#include "amdgpu_userq_fence.h" #include "../amdxcp/amdgpu_xcp_drv.h" /* @@ -3012,6 +3013,10 @@ static int __init amdgpu_init(void) if (r) goto error_fence; + r = amdgpu_userq_fence_slab_init(); + if (r) + goto error_fence; + DRM_INFO("amdgpu kernel modesetting enabled.\n"); amdgpu_register_atpx_handler(); amdgpu_acpi_detect(); @@ -3037,6 +3042,7 @@ static void __exit amdgpu_exit(void) amdgpu_acpi_release(); amdgpu_sync_fini(); amdgpu_fence_slab_fini(); + amdgpu_userq_fence_slab_fini(); mmu_notifier_synchronize(); amdgpu_xcp_drv_release(); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c index e22cb2b5cd92..3ef1d7f4fd23 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c @@ -45,7 +45,7 @@ */ static inline u64 amdgpu_seq64_get_va_base(struct amdgpu_device *adev) { - return AMDGPU_VA_RESERVED_SEQ64_START(adev); + return AMDGPU_VA_RESERVED_BOTTOM; } /** @@ -90,7 +90,7 @@ int amdgpu_seq64_map(struct amdgpu_device *adev, struct
[PATCH v10 4/6] drm/amdgpu: Implement userqueue signal/wait IOCTL
This patch introduces new IOCTL for userqueue secure semaphore. The signal IOCTL called from userspace application creates a drm syncobj and array of bo GEM handles and passed in as parameter to the driver to install the fence into it. The wait IOCTL gets an array of drm syncobjs, finds the fences attached to the drm syncobjs and obtain the array of memory_address/fence_value combintion which are returned to userspace. v2: (Christian) - Install fence into GEM BO object. - Lock all BO's using the dma resv subsystem - Reorder the sequence in signal IOCTL function. - Get write pointer from the shadow wptr - use userq_fence to fetch the va/value in wait IOCTL. v3: (Christian) - Use drm_exec helper for the proper BO drm reserve and avoid BO lock/unlock issues. - fence/fence driver reference count logic for signal/wait IOCTLs. v4: (Christian) - Fixed the drm_exec calling sequence - use dma_resv_for_each_fence_unlock if BO's are not locked - Modified the fence_info array storing logic. v5: (Christian) - Keep fence_drv until wait queue execution. - Add dma_fence_wait for other fences. - Lock BO's using drm_exec as the number of fences in them could change. - Install signaled fences as well into BO/Syncobj. - Move Syncobj fence installation code after the drm_exec_prepare_array. - Directly add dma_resv_usage_rw(args->bo_flags - remove unnecessary dma_fence_put. v6: (Christian) - Add xarray stuff to store the fence_drv - Implement a function to iterate over the xarray and drop the fence_drv references. - Add drm_exec_until_all_locked() wrapper - Add a check that if we haven't exceeded the user allocated num_fences before adding dma_fence to the fences array. v7: (Christian) - Use memdup_user() for kmalloc_array + copy_from_user - Move the fence_drv references from the xarray into the newly created fence and drop the fence_drv references when we signal this fence. - Move this locking of BOs before the "if (!wait_info->num_fences)", this way you need this code block only once. - Merge the error handling code and the cleanup + return 0 code. - Initializing the xa should probably be done in the userq code. - Remove the userq back pointer stored in fence_drv. - Pass xarray as parameter in amdgpu_userq_walk_and_drop_fence_drv() v8: (Christian) - Move fence_drv references must come before adding the fence to the list. - Use xa_lock_irqsave_nested for nested spinlock operations. - userq_mgr should be per fpriv and not one per device. - Restructure the interrupt process code for the early exit of the loop. - The reference acquired in the syncobj fence replace code needs to be kept around. - Modify the dma_fence acquire placement in wait IOCTL. - Move USERQ_BO_WRITE flag to UAPI header file. - drop the fence drv reference after telling the hw to stop accessing it. - Add multi sync object support to userq signal IOCTL. v9: (Christian) - Store all the fence_drv ref to other drivers and not ourself. - Iterate over uq_fence_drv_xa without holding a lock. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 + .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 431 +- .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.h | 6 + drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 29 +- .../gpu/drm/amd/include/amdgpu_userqueue.h| 1 + 5 files changed, 462 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 844f7b5f90db..5892a4c1a92e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2918,6 +2918,8 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { DRM_IOCTL_DEF_DRV(AMDGPU_GEM_OP, amdgpu_gem_op_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(AMDGPU_GEM_USERPTR, amdgpu_gem_userptr_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(AMDGPU_USERQ, amdgpu_userq_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, amdgpu_userq_signal_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), }; static const struct drm_driver amdgpu_kms_driver = { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index f7baea2c67ab..339d82d5808f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -25,6 +25,7 @@ #include #include +#include #include #include "amdgpu.h" @@ -92,6 +93,7 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, spin_lock_init(_drv->fence_list_lock); fence_drv-&g
[PATCH v10 5/6] drm/amdgpu: Remove the MES self test
Remove MES self test as this conflicts the userqueue fence interrupts. Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 --- drivers/gpu/drm/amd/amdgpu/mes_v10_1.c | 12 +--- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 14 +- 3 files changed, 2 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7753a2e64d41..2d9fa3d0d4a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4719,9 +4719,6 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) } adev->in_suspend = false; - if (adev->enable_mes) - amdgpu_mes_self_test(adev); - if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) DRM_WARN("smart shift update failed\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c index 4d1121d1a1e7..b7bfb3185a30 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c @@ -1161,20 +1161,10 @@ static int mes_v10_0_early_init(void *handle) return 0; } -static int mes_v10_0_late_init(void *handle) -{ - struct amdgpu_device *adev = (struct amdgpu_device *)handle; - - if (!amdgpu_in_reset(adev)) - amdgpu_mes_self_test(adev); - - return 0; -} - static const struct amd_ip_funcs mes_v10_1_ip_funcs = { .name = "mes_v10_1", .early_init = mes_v10_0_early_init, - .late_init = mes_v10_0_late_init, + .late_init = NULL, .sw_init = mes_v10_1_sw_init, .sw_fini = mes_v10_1_sw_fini, .hw_init = mes_v10_1_hw_init, diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index feb7fa2c304c..5923b7b0bd95 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -1274,22 +1274,10 @@ static int mes_v11_0_early_init(void *handle) return 0; } -static int mes_v11_0_late_init(void *handle) -{ - struct amdgpu_device *adev = (struct amdgpu_device *)handle; - - /* it's only intended for use in mes_self_test case, not for s0ix and reset */ - if (!amdgpu_in_reset(adev) && !adev->in_s0ix && !adev->in_suspend && - (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(11, 0, 3))) - amdgpu_mes_self_test(adev); - - return 0; -} - static const struct amd_ip_funcs mes_v11_0_ip_funcs = { .name = "mes_v11_0", .early_init = mes_v11_0_early_init, - .late_init = mes_v11_0_late_init, + .late_init = NULL, .sw_init = mes_v11_0_sw_init, .sw_fini = mes_v11_0_sw_fini, .hw_init = mes_v11_0_hw_init, -- 2.25.1
[PATCH v10 3/6] drm/amdgpu: UAPI headers for userqueue secure semaphore
Add UAPI header support for userqueue Secure semaphore v2: Worked on review comments from Christian for the following modifications - Add bo handles, bo flags and padding fields. - Include value/va in a combined array. v3: Worked on review comments from Christian - Add num_fences field to obtain the number of objects required to allocate memory for userq_fence_info. - Replace obj_handle name with syncobj_handle. - Replace point name with syncobj_point. - Replace count_handles name with num_syncobj_handles. - Fix structure padding related issues. v4: Worked on review comments from Christian - Modify the bo flags description. Signed-off-by: Alex Deucher Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König --- include/uapi/drm/amdgpu_drm.h | 110 ++ 1 file changed, 110 insertions(+) diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index def6874a588c..4ab965acfe06 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -55,6 +55,8 @@ extern "C" { #define DRM_AMDGPU_FENCE_TO_HANDLE 0x14 #define DRM_AMDGPU_SCHED 0x15 #define DRM_AMDGPU_USERQ 0x16 +#define DRM_AMDGPU_USERQ_SIGNAL0x17 +#define DRM_AMDGPU_USERQ_WAIT 0x18 #define DRM_IOCTL_AMDGPU_GEM_CREATEDRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create) #define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap) @@ -73,6 +75,8 @@ extern "C" { #define DRM_IOCTL_AMDGPU_FENCE_TO_HANDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_FENCE_TO_HANDLE, union drm_amdgpu_fence_to_handle) #define DRM_IOCTL_AMDGPU_SCHED DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_SCHED, union drm_amdgpu_sched) #define DRM_IOCTL_AMDGPU_USERQ DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ, union drm_amdgpu_userq) +#define DRM_IOCTL_AMDGPU_USERQ_SIGNAL DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal) +#define DRM_IOCTL_AMDGPU_USERQ_WAITDRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait) /** * DOC: memory domains @@ -431,6 +435,112 @@ union drm_amdgpu_userq { struct drm_amdgpu_userq_out out; }; +/* dma_resv usage flag */ +#define AMDGPU_USERQ_BO_WRITE 1 + +/* userq signal/wait ioctl */ +struct drm_amdgpu_userq_signal { + /** +* @queue_id: Queue handle used by the userq fence creation function +* to retrieve the WPTR. +*/ + __u32 queue_id; + /** +* @flags: flags to indicate special function for userq fence creation. +* Unused for now. +*/ + __u32 flags; + /** +* @syncobj_handles_array: An array of syncobj handles used by the userq fence +* creation IOCTL to install the created dma_fence object which can be +* utilized by userspace to explicitly synchronize GPU commands. +*/ + __u64 syncobj_handles_array; + /** +* @num_syncobj_handles: A count that represents the number of syncobj handles in +* @syncobj_handles_array. +*/ + __u64 num_syncobj_handles; + /** +* @syncobj_point: A given point on the timeline to be signaled. +* Unused for now. +*/ + __u64 syncobj_point; + /** +* @bo_handles_array: An array of GEM BO handles used by the userq fence creation +* IOCTL to install the created dma_fence object which can be utilized by +* userspace to synchronize the BO usage between user processes. +*/ + __u64 bo_handles_array; + /** +* @num_bo_handles: A count that represents the number of GEM BO handles in +* @bo_handles_array. +*/ + __u32 num_bo_handles; + /** +* @bo_flags: flags to indicate BOs synchronize for READ or WRITE +*/ + __u32 bo_flags; +}; + +struct drm_amdgpu_userq_fence_info { + /** +* @va: A gpu address allocated for each queue which stores the +* read pointer (RPTR) value. +*/ + __u64 va; + /** +* @value: A 64 bit value represents the write pointer (WPTR) of the +* queue commands which compared with the RPTR value to signal the +* fences. +*/ + __u64 value; +}; + +struct drm_amdgpu_userq_wait { + /** +* @flags: flags to specify special function for userq wait information. +* Unused for now. +*/ + __u32 flags; + /** +* @bo_wait_flags: flags to define the BOs for READ or WRITE to store the +* matching fence wait info pair in @userq_fence_info. +*/ + __u32 bo_wait_flags; + __u32 pad; + /** +* @syncobj_handles_array: An array of syncobj handles defined to get the +* fence wait
[PATCH v10 2/6] drm/amdgpu: Add mqd support for the fence address
- Add a field in struct v11_gfx_mqd for userqueue fence address. - Assign fence gpu VA address to the userqueue mqd fence address fields. v2: Remove the mask and replace with lower_32_bits (Christian) Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König --- drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c | 11 +++ drivers/gpu/drm/amd/include/v11_structs.h| 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c index 6369a26fb07e..d8592cfef534 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c @@ -25,6 +25,7 @@ #include "v11_structs.h" #include "mes_v11_0.h" #include "amdgpu_userqueue.h" +#include "amdgpu_userq_fence.h" #define AMDGPU_USERQ_PROC_CTX_SZ PAGE_SIZE #define AMDGPU_USERQ_GANG_CTX_SZ PAGE_SIZE @@ -204,6 +205,14 @@ static int mes_v11_0_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr, return 0; } +static void mes_v11_0_userq_set_fence_space(struct amdgpu_usermode_queue *queue) +{ + struct v11_gfx_mqd *mqd = queue->mqd.cpu_ptr; + + mqd->fenceaddress_lo = lower_32_bits(queue->fence_drv->gpu_addr); + mqd->fenceaddress_hi = upper_32_bits(queue->fence_drv->gpu_addr); +} + static int mes_v11_0_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr, struct drm_amdgpu_userq_in *args_in, struct amdgpu_usermode_queue *queue) @@ -274,6 +283,8 @@ static int mes_v11_0_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr, goto free_mqd; } + mes_v11_0_userq_set_fence_space(queue); + /* FW expects WPTR BOs to be mapped into GART */ r = mes_v11_0_create_wptr_mapping(uq_mgr, queue, userq_props->wptr_gpu_addr); if (r) { diff --git a/drivers/gpu/drm/amd/include/v11_structs.h b/drivers/gpu/drm/amd/include/v11_structs.h index f8008270f813..797ce6a1e56e 100644 --- a/drivers/gpu/drm/amd/include/v11_structs.h +++ b/drivers/gpu/drm/amd/include/v11_structs.h @@ -535,8 +535,8 @@ struct v11_gfx_mqd { uint32_t reserved_507; // offset: 507 (0x1FB) uint32_t reserved_508; // offset: 508 (0x1FC) uint32_t reserved_509; // offset: 509 (0x1FD) - uint32_t reserved_510; // offset: 510 (0x1FE) - uint32_t reserved_511; // offset: 511 (0x1FF) + uint32_t fenceaddress_lo; // offset: 510 (0x1FE) + uint32_t fenceaddress_hi; // offset: 511 (0x1FF) }; struct v11_sdma_mqd { -- 2.25.1
[PATCH] drm/buddy: Fix the range bias clear memory allocation issue
Problem statement: During the system boot time, an application request for the bulk volume of cleared range bias memory when the clear_avail is zero, we dont fallback into normal allocation method as we had an unnecessary clear_avail check which prevents the fallback method leads to fb allocation failure following system goes into unresponsive state. Solution: Remove the unnecessary clear_avail check in the range bias allocation function. Signed-off-by: Arunpravin Paneer Selvam Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature") Reviewed-by: Matthew Auld --- drivers/gpu/drm/drm_buddy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 284ebae71cc4..831929ac95eb 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -574,7 +574,7 @@ __drm_buddy_alloc_range_bias(struct drm_buddy *mm, block = __alloc_range_bias(mm, start, end, order, flags, fallback); - if (IS_ERR(block) && mm->clear_avail) + if (IS_ERR(block)) return __alloc_range_bias(mm, start, end, order, flags, !fallback); -- 2.25.1
[PATCH] drm/amdgpu: Fix the BO release clear memory warning
038638] ? srso_alias_return_thunk+0x5/0xfbef5 [6.038639] ? do_syscall_64+0x90/0x1a0 [6.038640] ? srso_alias_return_thunk+0x5/0xfbef5 [6.038642] ? srso_alias_return_thunk+0x5/0xfbef5 [6.038644] entry_SYSCALL_64_after_hwframe+0x78/0x80 [6.038645] RIP: 0033:0x7f2bd1e9d059 [6.038647] Code: 08 89 e8 5b 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 8f 1d 0d 00 f7 d8 64 89 01 48 [6.038648] RSP: 002b:7fffaf804878 EFLAGS: 0246 ORIG_RAX: 0139 [6.038650] RAX: ffda RBX: 55a9b2328d60 RCX: 7f2bd1e9d059 [6.038650] RDX: RSI: 7f2bd1fd0509 RDI: 0024 [6.038651] RBP: R08: 0040 R09: 55a9b23000a0 [6.038652] R10: 0038 R11: 0246 R12: 7f2bd1fd0509 [6.038652] R13: 0002 R14: 55a9b2326f90 R15: [6.038655] [6.038656] ---[ end trace ]--- Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index b2a83c802bbd..ecd0cf6cdd2d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -1371,6 +1371,9 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo) adev->in_suspend || drm_dev_is_unplugged(adev_to_drm(adev))) return; + if (!adev->mman.buffer_funcs_enabled) + return; + if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv))) return; -- 2.25.1
[PATCH] drm/buddy: Fix the range bias clear memory allocation issue
Problem statement: During the system boot time, an application request for the bulk volume of cleared range bias memory when the clear_avail is zero, we dont fallback into normal allocation method as we had an unnecessary clear_avail check which prevents the fallback method leads to fb allocation failure following system goes into unresponsive state. Solution: Remove the unnecessary clear_avail check in the range bias allocation function. Signed-off-by: Arunpravin Paneer Selvam Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature") --- drivers/gpu/drm/drm_buddy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 284ebae71cc4..831929ac95eb 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -574,7 +574,7 @@ __drm_buddy_alloc_range_bias(struct drm_buddy *mm, block = __alloc_range_bias(mm, start, end, order, flags, fallback); - if (IS_ERR(block) && mm->clear_avail) + if (IS_ERR(block)) return __alloc_range_bias(mm, start, end, order, flags, !fallback); -- 2.25.1
Re: Splat during driver probe
Hi Alex, yes, this is related to memory clearing changes. This is a known issue. I am working on a fix. Thanks, Arun. On 5/3/2024 6:46 PM, Alex Deucher wrote: Possibly related to Arun's new memory clearing changes. @Arunpravin can you take a look? Alex On Fri, May 3, 2024 at 9:10 AM Tvrtko Ursulin wrote: Hi, I tried asking on #radeon yesterday but no takers. I was wondering if the below splat is already known or not. Appears to happen due amdgpu_bo_release_notify genuniely running before amdgpu_ttm_set_buffer_funcs_status set the buffer funcs to enabled during device probe. I couldn't easily figure out what changed. Regards, Tvrtko [ 11.802030] [drm:amdgpu_fill_buffer [amdgpu]] *ERROR* Trying to clear memory with ring turned off. [ 11.802519] [ cut here ] [ 11.802530] WARNING: CPU: 5 PID: 435 at drivers/gpu/drm/amd/amdgpu/amdgpu_object.c:1378 amdgpu_bo_release_notify+0x20c/0x230 [amdgpu] [ 11.802916] Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct amdgpu(E+) nft_chain_nat nf_tables ip6table_nat ip6table_mangle ip6table_raw ip6table_security iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_mangle iptable_raw iptable_security nfnetlink ip6table_filter ip6_tables iptable_filter cmac algif_hash ramoops algif_skcipher reed_solomon af_alg bnep qrtr mousedev intel_rapl_msr ath11k_pci(+) intel_rapl_common mhi snd_acp_sof_mach snd_acp_mach snd_sof_probes ath11k snd_soc_dmic kvm_amd cdc_mbim qmi_helpers joydev hid_multitouch cdc_wdm snd_sof_amd_vangogh snd_sof_pci kvm mac80211 snd_hda_codec_hdmi hci_uart crct10dif_pclmul amdxcp i2c_algo_bit snd_sof_amd_acp btqca drm_ttm_helper libarc4 crc32_pclmul btrtl snd_hda_intel snd_sof_xtensa_dsp btbcm ttm snd_intel_dspcfg btintel polyval_clmulni snd_sof drm_exec polyval_generic snd_hda_codec snd_sof_utils gpu_sched cfg80211 bluetooth snd_pci_acp5x gf128mul cdc_ncm [ 11.803070] sp5100_tco snd_hwdep ghash_clmulni_intel snd_soc_nau8821 snd_soc_max98388 drm_suballoc_helper sha512_ssse3 cdc_ether snd_acp_config drm_buddy atkbd snd_soc_core aesni_intel snd_hda_core video ecdh_generic crypto_simd libps2 usbnet cryptd rapl mii wdat_wdt vivaldi_fmap pcspkr snd_compress i2c_piix4 snd_pcm drm_display_helper ccp snd_soc_acpi cdc_acm rfkill wmi ltrf216a 8250_dw i2c_hid_acpi snd_timer snd i2c_hid industrialio soundcore hid_steam pkcs8_key_parser crypto_user loop fuse dm_mod ip_tables x_tables overlay ext4 crc16 mbcache jbd2 usbhid vfat fat btrfs blake2b_generic libcrc32c crc32c_generic xor raid6_pq sdhci_pci cqhci nvme serio_raw sdhci xhci_pci xhci_pci_renesas crc32c_intel mmc_core nvme_core i8042 serio spi_amd [ 11.803448] CPU: 5 PID: 435 Comm: (udev-worker) Tainted: GW E 6.9.0-rc6 #3 dd3fb65c639fd86ff89731c604b1e804996e8989 [ 11.803471] Hardware name: Valve Galileo/Galileo, BIOS F7G0107 12/01/2023 [ 11.803486] RIP: 0010:amdgpu_bo_release_notify+0x20c/0x230 [amdgpu] [ 11.803857] Code: 0b e9 a4 fe ff ff 48 ba ff ff ff ff ff ff ff 7f 31 f6 4c 89 e7 e8 44 e5 33 e6 eb 8d e8 dd dd 33 e6 eb a7 0f 0b e9 4d fe ff ff <0f> 0b eb 9c be 03 00 00 00 e8 f6 04 00 e6 eb 90 e8 8f 64 79 e6 66 [ 11.803890] RSP: 0018:a77bc1537568 EFLAGS: 00010282 [ 11.803904] RAX: ffea RBX: 8e1f0da89048 RCX: [ 11.803919] RDX: RSI: 0027 RDI: [ 11.803934] RBP: 8e1f6ec0ffb0 R08: R09: 0003 [ 11.803948] R10: a77bc15372f0 R11: 8e223ef7ffe8 R12: 8e1f0da89000 [ 11.803963] R13: 8e1f0da89180 R14: 8e1f6ec0ffb0 R15: 8e1f6ec0 [ 11.803977] FS: 7fa2b600b200() GS:8e222ee8() knlGS: [ 11.803994] CS: 0010 DS: ES: CR0: 80050033 [ 11.804007] CR2: 55cac2aa7080 CR3: 00010f818000 CR4: 00350ef0 [ 11.804021] Call Trace: [ 11.804031] [ 11.804042] ? __warn+0x8c/0x180 [ 11.804057] ? amdgpu_bo_release_notify+0x20c/0x230 [amdgpu 03b1dca7e09918e3f45efb1acdfc90682f73e4c1] [ 11.804456] ? report_bug+0x191/0x1c0 [ 11.804473] ? handle_bug+0x3a/0x70 [ 11.804485] ? exc_invalid_op+0x17/0x70 [ 11.804497] ? asm_exc_invalid_op+0x1a/0x20 [ 11.804517] ? amdgpu_bo_release_notify+0x20c/0x230 [amdgpu 03b1dca7e09918e3f45efb1acdfc90682f73e4c1] [ 11.804894] ? amdgpu_bo_release_notify+0x14a/0x230 [amdgpu 03b1dca7e09918e3f45efb1acdfc90682f73e4c1] [ 11.805277] ttm_bo_release+0x115/0x330 [ttm 39c917822ce73b2a754d4183af7a0990991c66be] [ 11.805303] ? srso_return_thunk+0x5/0x5f [ 11.805315] ? __mutex_unlock_slowpath+0x3a/0x290 [ 11.805332] amdgpu_bo_free_kernel+0xd6/0x130 [amdgpu 03b1dca7e09918e3f45efb1acdfc90682f73e4c1] [ 11.805709] dm_helpers_free_gpu_mem+0x41/0x80 [amdgpu 03b1dca7e09918e3f45efb1acdfc90682f73e4c1] [ 11.806172] vg_clk_mgr_construct+0x2c4/0x490 [amdgpu 03b1dca7e09918e3f45efb1acdfc90682f73e4c1] [ 11.
[PATCH v9 5/5] drm/amdgpu: Enable userq fence interrupt support
Add support to handle the userqueue protected fence signal hardware interrupt. Create a xarray which maps the doorbell index to the fence driver address. This would help to retrieve the fence driver information when an userq fence interrupt is triggered. Firmware sends the doorbell offset value and this info is compared with the queue's mqd doorbell offset value. If they are same, we process the userq fence interrupt. Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 5 ++-- .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 6 + drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c| 25 ++- drivers/gpu/drm/amd/amdgpu/mes_v10_1.c| 5 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c| 7 -- 6 files changed, 23 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 4ca14b02668b..2d5ef2e74c71 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1043,6 +1043,8 @@ struct amdgpu_device { struct amdgpu_mqd mqds[AMDGPU_HW_IP_NUM]; const struct amdgpu_userq_funcs *userq_funcs[AMDGPU_HW_IP_NUM]; + struct xarray userq_xa; + /* df */ struct amdgpu_dfdf; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7753a2e64d41..fd919105a181 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3982,6 +3982,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, spin_lock_init(>audio_endpt_idx_lock); spin_lock_init(>mm_stats.lock); + xa_init_flags(>userq_xa, XA_FLAGS_LOCK_IRQ); + INIT_LIST_HEAD(>shadow_list); mutex_init(>shadow_list_lock); @@ -4719,9 +4721,6 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) } adev->in_suspend = false; - if (adev->enable_mes) - amdgpu_mes_self_test(adev); - if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) DRM_WARN("smart shift update failed\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index 6fb75cc1d20c..614953b0fc19 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -70,6 +70,7 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, struct amdgpu_usermode_queue *userq) { struct amdgpu_userq_fence_driver *fence_drv; + unsigned long flags; int r; fence_drv = kzalloc(sizeof(*fence_drv), GFP_KERNEL); @@ -96,6 +97,11 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, fence_drv->context = dma_fence_context_alloc(1); get_task_comm(fence_drv->timeline_name, current); + xa_lock_irqsave(>userq_xa, flags); + __xa_store(>userq_xa, userq->doorbell_index, + fence_drv, GFP_KERNEL); + xa_unlock_irqrestore(>userq_xa, flags); + userq->fence_drv = fence_drv; return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index a786e25432ae..d6cdca0a652f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -49,6 +49,7 @@ #include "gfx_v11_0_3.h" #include "nbio_v4_3.h" #include "mes_v11_0.h" +#include "amdgpu_userq_fence.h" #define GFX11_NUM_GFX_RINGS1 #define GFX11_MEC_HPD_SIZE 2048 @@ -5939,25 +5940,25 @@ static int gfx_v11_0_eop_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) { - int i; + u32 doorbell_offset = entry->src_data[0]; u8 me_id, pipe_id, queue_id; struct amdgpu_ring *ring; - uint32_t mes_queue_id = entry->src_data[0]; + int i; DRM_DEBUG("IH: CP EOP\n"); - if (adev->enable_mes && (mes_queue_id & AMDGPU_FENCE_MES_QUEUE_FLAG)) { - struct amdgpu_mes_queue *queue; + if (adev->enable_mes && doorbell_offset) { + struct amdgpu_userq_fence_driver *fence_drv = NULL; + struct xarray *xa = >userq_xa; + unsigned long index, flags; - mes_queue_id &= AMDGPU_FENCE_MES_QUEUE_ID_MASK; + xa_lock_irqsave(xa, flags); + xa_for_each(xa, index, fence_drv) + if (doorbell_offset == index) + break; + xa_unlock_irqrestore(xa, flags); - spin_lock(>mes.queue_id_lock); -
[PATCH v9 3/5] drm/amdgpu: UAPI headers for userqueue Secure semaphore
Add UAPI header support for userqueue Secure semaphore v2: Worked on review comments from Christian for the following modifications - Add bo handles, bo flags and padding fields. - Include value/va in a combined array. v3: Worked on review comments from Christian - Add num_fences field to obtain the number of objects required to allocate memory for userq_fence_info. - Replace obj_handle name with syncobj_handle. - Replace point name with syncobj_point. - Replace count_handles name with num_syncobj_handles. - Fix structure padding related issues. v4: Worked on review comments from Christian - Modify the bo flags description. Signed-off-by: Alex Deucher Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König --- include/uapi/drm/amdgpu_drm.h | 115 ++ 1 file changed, 115 insertions(+) diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index def6874a588c..1217ae30a8d8 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -55,6 +55,8 @@ extern "C" { #define DRM_AMDGPU_FENCE_TO_HANDLE 0x14 #define DRM_AMDGPU_SCHED 0x15 #define DRM_AMDGPU_USERQ 0x16 +#define DRM_AMDGPU_USERQ_SIGNAL0x17 +#define DRM_AMDGPU_USERQ_WAIT 0x18 #define DRM_IOCTL_AMDGPU_GEM_CREATEDRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create) #define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap) @@ -73,6 +75,8 @@ extern "C" { #define DRM_IOCTL_AMDGPU_FENCE_TO_HANDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_FENCE_TO_HANDLE, union drm_amdgpu_fence_to_handle) #define DRM_IOCTL_AMDGPU_SCHED DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_SCHED, union drm_amdgpu_sched) #define DRM_IOCTL_AMDGPU_USERQ DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ, union drm_amdgpu_userq) +#define DRM_IOCTL_AMDGPU_USERQ_SIGNAL DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal) +#define DRM_IOCTL_AMDGPU_USERQ_WAITDRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait) /** * DOC: memory domains @@ -431,6 +435,117 @@ union drm_amdgpu_userq { struct drm_amdgpu_userq_out out; }; +/* dma_resv usage flag */ +#define AMDGPU_USERQ_BO_WRITE 1 + +/* userq signal/wait ioctl */ +struct drm_amdgpu_userq_signal { + /** +* @queue_id: Queue handle used by the userq fence creation function +* to retrieve the WPTR. +*/ + __u32 queue_id; + /** +* @flags: flags to indicate special function for userq fence creation. +* Unused for now. +*/ + __u32 flags; + /** +* @syncobj_handles_array: An array of syncobj handles used by the userq fence +* creation IOCTL to install the created dma_fence object which can be +* utilized by userspace to explicitly synchronize GPU commands. +*/ + __u64 syncobj_handles_array; + /** +* @num_syncobj_handles: A count that represents the number of syncobj handles in +* @syncobj_handles_array. +*/ + __u64 num_syncobj_handles; + /** +* @syncobj_point: A given point on the timeline to be signaled. +* Unused for now. +*/ + __u64 syncobj_point; + /** +* @bo_handles_array: An array of GEM BO handles used by the userq fence creation +* IOCTL to install the created dma_fence object which can be utilized by +* userspace to synchronize the BO usage between user processes. +*/ + __u64 bo_handles_array; + /** +* @num_bo_handles: A count that represents the number of GEM BO handles in +* @bo_handles_array. +*/ + __u32 num_bo_handles; + /** +* @bo_flags: flags to indicate BOs synchronize for READ or WRITE +*/ + __u32 bo_flags; +}; + +struct drm_amdgpu_userq_fence_info { + /** +* @va: A gpu address allocated for each queue which stores the +* read pointer (RPTR) value. +*/ + __u64 va; + /** +* @value: A 64 bit value represents the write pointer (WPTR) of the +* queue commands which compared with the RPTR value to signal the +* fences. +*/ + __u64 value; +}; + +struct drm_amdgpu_userq_wait { + /** +* @waitq_id: Queue handle used to retrieve the queue information to store +* the fence driver references in the wait user queue structure. +*/ + __u32 waitq_id; + /** +* @flags: flags to specify special function for userq wait information. +* Unused for now. +*/ + __u32 flags; + /** +* @bo_wait_flags: flags to define the BOs for READ or WRITE to store the +* matching
[PATCH v9 4/5] drm/amdgpu: Implement userqueue signal/wait IOCTL
This patch introduces new IOCTL for userqueue secure semaphore. The signal IOCTL called from userspace application creates a drm syncobj and array of bo GEM handles and passed in as parameter to the driver to install the fence into it. The wait IOCTL gets an array of drm syncobjs, finds the fences attached to the drm syncobjs and obtain the array of memory_address/fence_value combintion which are returned to userspace. v2: (Christian) - Install fence into GEM BO object. - Lock all BO's using the dma resv subsystem - Reorder the sequence in signal IOCTL function. - Get write pointer from the shadow wptr - use userq_fence to fetch the va/value in wait IOCTL. v3: (Christian) - Use drm_exec helper for the proper BO drm reserve and avoid BO lock/unlock issues. - fence/fence driver reference count logic for signal/wait IOCTLs. v4: (Christian) - Fixed the drm_exec calling sequence - use dma_resv_for_each_fence_unlock if BO's are not locked - Modified the fence_info array storing logic. v5: (Christian) - Keep fence_drv until wait queue execution. - Add dma_fence_wait for other fences. - Lock BO's using drm_exec as the number of fences in them could change. - Install signaled fences as well into BO/Syncobj. - Move Syncobj fence installation code after the drm_exec_prepare_array. - Directly add dma_resv_usage_rw(args->bo_flags - remove unnecessary dma_fence_put. v6: (Christian) - Add xarray stuff to store the fence_drv - Implement a function to iterate over the xarray and drop the fence_drv references. - Add drm_exec_until_all_locked() wrapper - Add a check that if we haven't exceeded the user allocated num_fences before adding dma_fence to the fences array. v7: (Christian) - Use memdup_user() for kmalloc_array + copy_from_user - Move the fence_drv references from the xarray into the newly created fence and drop the fence_drv references when we signal this fence. - Move this locking of BOs before the "if (!wait_info->num_fences)", this way you need this code block only once. - Merge the error handling code and the cleanup + return 0 code. - Initializing the xa should probably be done in the userq code. - Remove the userq back pointer stored in fence_drv. - Pass xarray as parameter in amdgpu_userq_walk_and_drop_fence_drv() v8: (Christian) - Move fence_drv references must come before adding the fence to the list. - Use xa_lock_irqsave_nested for nested spinlock operations. - userq_mgr should be per fpriv and not one per device. - Restructure the interrupt process code for the early exit of the loop. - The reference acquired in the syncobj fence replace code needs to be kept around. - Modify the dma_fence acquire placement in wait IOCTL. - Move USERQ_BO_WRITE flag to UAPI header file. - drop the fence drv reference after telling the hw to stop accessing it. - Add multi sync object support to userq signal IOCTL. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 + .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 454 +- .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.h | 5 + drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 29 +- .../gpu/drm/amd/include/amdgpu_userqueue.h| 1 + 5 files changed, 484 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 844f7b5f90db..5892a4c1a92e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2918,6 +2918,8 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { DRM_IOCTL_DEF_DRV(AMDGPU_GEM_OP, amdgpu_gem_op_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(AMDGPU_GEM_USERPTR, amdgpu_gem_userptr_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(AMDGPU_USERQ, amdgpu_userq_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, amdgpu_userq_signal_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), }; static const struct drm_driver amdgpu_kms_driver = { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index f7baea2c67ab..6fb75cc1d20c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -25,6 +25,7 @@ #include #include +#include #include #include "amdgpu.h" @@ -102,8 +103,11 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv) { + struct amdgpu_userq_fence_driver *stored_fence_drv; struct amdgpu_userq_fence *userq_fence, *tmp; +
[PATCH v9 1/5] drm/amdgpu: Implement a new userqueue fence driver
Developed a userqueue fence driver for the userqueue process shared BO synchronization. Create a dma fence having write pointer as the seqno and allocate a seq64 memory for each user queue process and feed this memory address into the firmware/hardware, thus the firmware writes the read pointer into the given address when the process completes it execution. Compare wptr and rptr, if rptr >= wptr, signal the fences for the waiting process to consume the buffers. v2: Worked on review comments from Christian for the following modifications - Add wptr as sequence number into the fence - Add a reference count for the fence driver - Add dma_fence_put below the list_del as it might frees the userq fence. - Trim unnecessary code in interrupt handler. - Check dma fence signaled state in dma fence creation function for a potential problem of hardware completing the job processing beforehand. - Add necessary locks. - Create a list and process all the unsignaled fences. - clean up fences in destroy function. - implement .signaled callback function v3: Worked on review comments from Christian - Modify naming convention for reference counted objects - Fix fence driver reference drop issue - Drop amdgpu_userq_fence_driver_process() function return value v4: Worked on review comments from Christian - Moved fence driver allocation into amdgpu_userq_fence_driver_alloc() - Added detail doc mentioning the differences b/w two spinlocks declared. v5: Worked on review comments from Christian - Check before upcast and remove local variable - Add error handling in fence_drv alloc function. - Move rptr read fn outside of the loop and remove WARN_ON in destroy function. v6: - clear the seq64 memory in user fence driver(Christian) - fix for the wptr va bo mapping(Christian) - move the fence_drv xa entry erase code from the interrupt handler into user fence destroy function Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/Makefile | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 + .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 257 ++ .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.h | 69 + drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 12 +- .../gpu/drm/amd/include/amdgpu_userqueue.h| 5 +- 6 files changed, 347 insertions(+), 5 deletions(-) create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index a640bfa468ad..f8dd808bd71c 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -80,7 +80,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \ amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \ amdgpu_fw_attestation.o amdgpu_securedisplay.o \ amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \ - amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o + amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o \ + amdgpu_userq_fence.o amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index acee1c279abb..844f7b5f90db 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -51,6 +51,7 @@ #include "amdgpu_sched.h" #include "amdgpu_xgmi.h" #include "amdgpu_userqueue.h" +#include "amdgpu_userq_fence.h" #include "../amdxcp/amdgpu_xcp_drv.h" /* @@ -3012,6 +3013,10 @@ static int __init amdgpu_init(void) if (r) goto error_fence; + r = amdgpu_userq_fence_slab_init(); + if (r) + goto error_fence; + DRM_INFO("amdgpu kernel modesetting enabled.\n"); amdgpu_register_atpx_handler(); amdgpu_acpi_detect(); @@ -3037,6 +3042,7 @@ static void __exit amdgpu_exit(void) amdgpu_acpi_release(); amdgpu_sync_fini(); amdgpu_fence_slab_fini(); + amdgpu_userq_fence_slab_fini(); mmu_notifier_synchronize(); amdgpu_xcp_drv_release(); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c new file mode 100644 index ..f7baea2c67ab --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -0,0 +1,257 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, includi
[PATCH v9 2/5] drm/amdgpu: Add mqd support for the fence address
- Add a field in struct v11_gfx_mqd for userqueue fence address. - Assign fence gpu VA address to the userqueue mqd fence address fields. v2: Remove the mask and replace with lower_32_bits (Christian) Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König --- drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c | 11 +++ drivers/gpu/drm/amd/include/v11_structs.h| 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c index 6369a26fb07e..d8592cfef534 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c @@ -25,6 +25,7 @@ #include "v11_structs.h" #include "mes_v11_0.h" #include "amdgpu_userqueue.h" +#include "amdgpu_userq_fence.h" #define AMDGPU_USERQ_PROC_CTX_SZ PAGE_SIZE #define AMDGPU_USERQ_GANG_CTX_SZ PAGE_SIZE @@ -204,6 +205,14 @@ static int mes_v11_0_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr, return 0; } +static void mes_v11_0_userq_set_fence_space(struct amdgpu_usermode_queue *queue) +{ + struct v11_gfx_mqd *mqd = queue->mqd.cpu_ptr; + + mqd->fenceaddress_lo = lower_32_bits(queue->fence_drv->gpu_addr); + mqd->fenceaddress_hi = upper_32_bits(queue->fence_drv->gpu_addr); +} + static int mes_v11_0_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr, struct drm_amdgpu_userq_in *args_in, struct amdgpu_usermode_queue *queue) @@ -274,6 +283,8 @@ static int mes_v11_0_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr, goto free_mqd; } + mes_v11_0_userq_set_fence_space(queue); + /* FW expects WPTR BOs to be mapped into GART */ r = mes_v11_0_create_wptr_mapping(uq_mgr, queue, userq_props->wptr_gpu_addr); if (r) { diff --git a/drivers/gpu/drm/amd/include/v11_structs.h b/drivers/gpu/drm/amd/include/v11_structs.h index f8008270f813..797ce6a1e56e 100644 --- a/drivers/gpu/drm/amd/include/v11_structs.h +++ b/drivers/gpu/drm/amd/include/v11_structs.h @@ -535,8 +535,8 @@ struct v11_gfx_mqd { uint32_t reserved_507; // offset: 507 (0x1FB) uint32_t reserved_508; // offset: 508 (0x1FC) uint32_t reserved_509; // offset: 509 (0x1FD) - uint32_t reserved_510; // offset: 510 (0x1FE) - uint32_t reserved_511; // offset: 511 (0x1FF) + uint32_t fenceaddress_lo; // offset: 510 (0x1FE) + uint32_t fenceaddress_hi; // offset: 511 (0x1FF) }; struct v11_sdma_mqd { -- 2.25.1
[PATCH v5] drm/amdgpu: Modify the contiguous flags behaviour
Now we have two flags for contiguous VRAM buffer allocation. If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the buffer's placement function. This patch will change the default behaviour of the two flags. When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS - This means contiguous is not mandatory. - we will try to allocate the contiguous buffer. Say if the allocation fails, we fallback to allocate the individual pages. When we setTTM_PL_FLAG_CONTIGUOUS - This means contiguous allocation is mandatory. - we are setting this in amdgpu_bo_pin_restricted() before bo validation and check this flag in the vram manager file. - if this is set, we should allocate the buffer pages contiguously. the allocation fails, we return -ENOSPC. v2: - keep the mem_flags and bo->flags check as is(Christian) - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the amdgpu_bo_pin_restricted function placement range iteration loop(Christian) - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block (Christian) - Keep the kernel BO allocation as is(Christain) - If BO pin vram allocation failed, we need to return -ENOSPC as RDMA cannot work with scattered VRAM pages(Philip) v3(Christian): - keep contiguous flag handling outside of pages_per_block calculation - remove the hacky implementation in contiguous flag error handling code v4(Christian): - use any variable and return value for non-contiguous fallback v5: rebase to amd-staging-drm-next branch Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König Reviewed-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 8 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 23 +++- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 331b9ed8062c..316a9f897f2b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) else places[c].flags |= TTM_PL_FLAG_TOPDOWN; - if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) + if (abo->tbo.type == ttm_bo_type_kernel && + flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; + c++; } @@ -964,6 +966,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, if (!bo->placements[i].lpfn || (lpfn && lpfn < bo->placements[i].lpfn)) bo->placements[i].lpfn = lpfn; + + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->placements[i].mem_type == TTM_PL_VRAM) + bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; } r = ttm_bo_validate(>tbo, >placement, ); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..f23002ed2b42 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -450,6 +450,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, { struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); struct amdgpu_device *adev = to_amdgpu_device(mgr); + struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo); u64 vis_usage = 0, max_bytes, min_block_size; struct amdgpu_vram_mgr_resource *vres; u64 size, remaining_size, lpfn, fpfn; @@ -468,7 +469,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, if (tbo->type != ttm_bo_type_kernel) max_bytes -= AMDGPU_VM_RESERVED_VRAM; - if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { pages_per_block = ~0ul; } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -477,7 +478,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, /* default to 2MB */ pages_per_block = 2UL << (20UL - PAGE_SHIFT); #endif - pages_per_block = max_t(uint32_t, pages_per_block, + pages_per_block = max_t(u32, pages_per_block, tbo->page_alignment); } @@ -498,7 +499,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, if (place->flags & TTM_PL_FLAG_TOPDOWN) vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION; - if (place->flags & TTM_PL_FLAG_CONTIGUOUS) + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) vres-
Re: [PATCH v3] drm/amdgpu: Modify the contiguous flags behaviour
Hi Christian, On 4/24/2024 2:02 PM, Christian König wrote: Am 24.04.24 um 09:13 schrieb Arunpravin Paneer Selvam: Now we have two flags for contiguous VRAM buffer allocation. If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the buffer's placement function. This patch will change the default behaviour of the two flags. When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS - This means contiguous is not mandatory. - we will try to allocate the contiguous buffer. Say if the allocation fails, we fallback to allocate the individual pages. When we setTTM_PL_FLAG_CONTIGUOUS - This means contiguous allocation is mandatory. - we are setting this in amdgpu_bo_pin_restricted() before bo validation and check this flag in the vram manager file. - if this is set, we should allocate the buffer pages contiguously. the allocation fails, we return -ENOSPC. v2: - keep the mem_flags and bo->flags check as is(Christian) - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the amdgpu_bo_pin_restricted function placement range iteration loop(Christian) - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block (Christian) - Keep the kernel BO allocation as is(Christain) - If BO pin vram allocation failed, we need to return -ENOSPC as RDMA cannot work with scattered VRAM pages(Philip) v3(Christian): - keep contiguous flag handling outside of pages_per_block calculation - remove the hacky implementation in contiguous flag error handling code Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 8 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 83 ++-- 2 files changed, 65 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 492aebc44e51..c594d2a5978e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -154,8 +154,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) else places[c].flags |= TTM_PL_FLAG_TOPDOWN; - if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) + if (abo->tbo.type == ttm_bo_type_kernel && + flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; + c++; } @@ -965,6 +967,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, if (!bo->placements[i].lpfn || (lpfn && lpfn < bo->placements[i].lpfn)) bo->placements[i].lpfn = lpfn; + + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->placements[i].mem_type == TTM_PL_VRAM) + bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; } r = ttm_bo_validate(>tbo, >placement, ); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index e494f5bf136a..17c5d9ce9927 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -88,6 +88,23 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head) return size; } +static inline void amdgpu_vram_mgr_limit_min_block_size(unsigned long pages_per_block, + u64 size, + u64 *min_block_size, + bool contiguous_enabled) +{ + if (contiguous_enabled) + return; + + /* + * if size >= 2MiB, limit the min_block_size to 2MiB + * for better TLB usage. + */ + if ((size >= (u64)pages_per_block << PAGE_SHIFT) && + !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) + *min_block_size = (u64)pages_per_block << PAGE_SHIFT; +} + /** * DOC: mem_info_vram_total * @@ -452,11 +469,12 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, struct amdgpu_device *adev = to_amdgpu_device(mgr); struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo); u64 vis_usage = 0, max_bytes, min_block_size; + struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo); struct amdgpu_vram_mgr_resource *vres; u64 size, remaining_size, lpfn, fpfn; struct drm_buddy *mm = >mm; - struct drm_buddy_block *block; unsigned long pages_per_block; + struct drm_buddy_block *block; int r; lpfn = (u64)place->lpfn << PAGE_SHIFT; @@ -469,18 +487,14 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, if (tbo->type != ttm_bo_type_kernel) max_bytes -= AMDGPU_VM_RESERVED_VRAM; - if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { - pages_per_block = ~0ul; - } else { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if
[PATCH v4] drm/amdgpu: Modify the contiguous flags behaviour
Now we have two flags for contiguous VRAM buffer allocation. If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the buffer's placement function. This patch will change the default behaviour of the two flags. When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS - This means contiguous is not mandatory. - we will try to allocate the contiguous buffer. Say if the allocation fails, we fallback to allocate the individual pages. When we setTTM_PL_FLAG_CONTIGUOUS - This means contiguous allocation is mandatory. - we are setting this in amdgpu_bo_pin_restricted() before bo validation and check this flag in the vram manager file. - if this is set, we should allocate the buffer pages contiguously. the allocation fails, we return -ENOSPC. v2: - keep the mem_flags and bo->flags check as is(Christian) - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the amdgpu_bo_pin_restricted function placement range iteration loop(Christian) - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block (Christian) - Keep the kernel BO allocation as is(Christain) - If BO pin vram allocation failed, we need to return -ENOSPC as RDMA cannot work with scattered VRAM pages(Philip) v3(Christian): - keep contiguous flag handling outside of pages_per_block calculation - remove the hacky implementation in contiguous flag error handling code v4(Christian): - use any variable and return value for non-contiguous fallback Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 8 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 22 ++-- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 492aebc44e51..c594d2a5978e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -154,8 +154,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) else places[c].flags |= TTM_PL_FLAG_TOPDOWN; - if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) + if (abo->tbo.type == ttm_bo_type_kernel && + flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; + c++; } @@ -965,6 +967,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, if (!bo->placements[i].lpfn || (lpfn && lpfn < bo->placements[i].lpfn)) bo->placements[i].lpfn = lpfn; + + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->placements[i].mem_type == TTM_PL_VRAM) + bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; } r = ttm_bo_validate(>tbo, >placement, ); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index e494f5bf136a..6c30eceec896 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -469,7 +469,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, if (tbo->type != ttm_bo_type_kernel) max_bytes -= AMDGPU_VM_RESERVED_VRAM; - if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { pages_per_block = ~0ul; } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -478,7 +478,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, /* default to 2MB */ pages_per_block = 2UL << (20UL - PAGE_SHIFT); #endif - pages_per_block = max_t(uint32_t, pages_per_block, + pages_per_block = max_t(u32, pages_per_block, tbo->page_alignment); } @@ -499,7 +499,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, if (place->flags & TTM_PL_FLAG_TOPDOWN) vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION; - if (place->flags & TTM_PL_FLAG_CONTIGUOUS) + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION; if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED) @@ -518,21 +518,31 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, else min_block_size = mgr->default_page_size; - BUG_ON(min_block_size < mm->chunk_size); - /* Limit maximum size to 2GiB due to SG table limitations */ size = min(remaining_size, 2ULL <
[PATCH v3] drm/amdgpu: Modify the contiguous flags behaviour
Now we have two flags for contiguous VRAM buffer allocation. If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the buffer's placement function. This patch will change the default behaviour of the two flags. When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS - This means contiguous is not mandatory. - we will try to allocate the contiguous buffer. Say if the allocation fails, we fallback to allocate the individual pages. When we setTTM_PL_FLAG_CONTIGUOUS - This means contiguous allocation is mandatory. - we are setting this in amdgpu_bo_pin_restricted() before bo validation and check this flag in the vram manager file. - if this is set, we should allocate the buffer pages contiguously. the allocation fails, we return -ENOSPC. v2: - keep the mem_flags and bo->flags check as is(Christian) - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the amdgpu_bo_pin_restricted function placement range iteration loop(Christian) - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block (Christian) - Keep the kernel BO allocation as is(Christain) - If BO pin vram allocation failed, we need to return -ENOSPC as RDMA cannot work with scattered VRAM pages(Philip) v3(Christian): - keep contiguous flag handling outside of pages_per_block calculation - remove the hacky implementation in contiguous flag error handling code Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 8 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 83 ++-- 2 files changed, 65 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 492aebc44e51..c594d2a5978e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -154,8 +154,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) else places[c].flags |= TTM_PL_FLAG_TOPDOWN; - if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) + if (abo->tbo.type == ttm_bo_type_kernel && + flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; + c++; } @@ -965,6 +967,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, if (!bo->placements[i].lpfn || (lpfn && lpfn < bo->placements[i].lpfn)) bo->placements[i].lpfn = lpfn; + + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->placements[i].mem_type == TTM_PL_VRAM) + bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; } r = ttm_bo_validate(>tbo, >placement, ); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index e494f5bf136a..17c5d9ce9927 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -88,6 +88,23 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head) return size; } +static inline void amdgpu_vram_mgr_limit_min_block_size(unsigned long pages_per_block, + u64 size, + u64 *min_block_size, + bool contiguous_enabled) +{ + if (contiguous_enabled) + return; + + /* +* if size >= 2MiB, limit the min_block_size to 2MiB +* for better TLB usage. +*/ + if ((size >= (u64)pages_per_block << PAGE_SHIFT) && + !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) + *min_block_size = (u64)pages_per_block << PAGE_SHIFT; +} + /** * DOC: mem_info_vram_total * @@ -452,11 +469,12 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, struct amdgpu_device *adev = to_amdgpu_device(mgr); struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo); u64 vis_usage = 0, max_bytes, min_block_size; + struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo); struct amdgpu_vram_mgr_resource *vres; u64 size, remaining_size, lpfn, fpfn; struct drm_buddy *mm = >mm; - struct drm_buddy_block *block; unsigned long pages_per_block; + struct drm_buddy_block *block; int r; lpfn = (u64)place->lpfn << PAGE_SHIFT; @@ -469,18 +487,14 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, if (tbo->type != ttm_bo_type_kernel) max_bytes -= AMDGPU_VM_RESERVED_VRAM; - if (place->fla
Re: [PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour
Hi Philip, On 4/17/2024 8:58 PM, Philip Yang wrote: On 2024-04-17 10:32, Paneer Selvam, Arunpravin wrote: Hi Christian, On 4/17/2024 6:57 PM, Paneer Selvam, Arunpravin wrote: Hi Christian, On 4/17/2024 12:19 PM, Christian König wrote: Am 17.04.24 um 08:21 schrieb Arunpravin Paneer Selvam: Now we have two flags for contiguous VRAM buffer allocation. If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the buffer's placement function. This patch will change the default behaviour of the two flags. When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS - This means contiguous is not mandatory. - we will try to allocate the contiguous buffer. Say if the allocation fails, we fallback to allocate the individual pages. When we setTTM_PL_FLAG_CONTIGUOUS - This means contiguous allocation is mandatory. - we are setting this in amdgpu_bo_pin_restricted() before bo validation and check this flag in the vram manager file. - if this is set, we should allocate the buffer pages contiguously. the allocation fails, we return -ENOSPC. v2: - keep the mem_flags and bo->flags check as is(Christian) - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the amdgpu_bo_pin_restricted function placement range iteration loop(Christian) - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block (Christian) - Keep the kernel BO allocation as is(Christain) - If BO pin vram allocation failed, we need to return -ENOSPC as RDMA cannot work with scattered VRAM pages(Philip) Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 8 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++- 2 files changed, 50 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..caaef7b1df49 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) else places[c].flags |= TTM_PL_FLAG_TOPDOWN; - if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) + if (abo->tbo.type == ttm_bo_type_kernel && + flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; + c++; } @@ -966,6 +968,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, if (!bo->placements[i].lpfn || (lpfn && lpfn < bo->placements[i].lpfn)) bo->placements[i].lpfn = lpfn; + + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->placements[i].mem_type == TTM_PL_VRAM) + bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; } r = ttm_bo_validate(>tbo, >placement, ); Nice work, up till here that looks exactly right as far as I can see. diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..4be8b091099a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -88,6 +88,29 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head) return size; } +static inline unsigned long +amdgpu_vram_mgr_calculate_pages_per_block(struct ttm_buffer_object *tbo, + const struct ttm_place *place, + unsigned long bo_flags) +{ + unsigned long pages_per_block; + + if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { + pages_per_block = ~0ul; If I understand it correctly this here enforces the allocation of a contiguous buffer in the way that it says we should have only one giant page for the whole BO. yes, for contiguous we don't have the 2MB limit, hence we are setting to maximum to avoid restricting the min_block_size to 2MB limitation. + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pages_per_block = HPAGE_PMD_NR; +#else + /* default to 2MB */ + pages_per_block = 2UL << (20UL - PAGE_SHIFT); +#endif + pages_per_block = max_t(uint32_t, pages_per_block, + tbo->page_alignment); + } + + return pages_per_block; +} + /** * DOC: mem_info_vram_total * @@ -451,8 +474,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); struct amdgpu_device *adev = to_amdgpu_device(mgr); u64 vis_usage = 0, max_bytes, min_block_size; + struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo); struct amdgpu_vram_mgr_resource *vres; u64 size, remaining_size, lpfn, fpfn; + unsigned long bo_flags = bo->flags; struct drm_buddy *mm = >
Re: [PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour
Hi Christian, On 4/17/2024 6:57 PM, Paneer Selvam, Arunpravin wrote: Hi Christian, On 4/17/2024 12:19 PM, Christian König wrote: Am 17.04.24 um 08:21 schrieb Arunpravin Paneer Selvam: Now we have two flags for contiguous VRAM buffer allocation. If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the buffer's placement function. This patch will change the default behaviour of the two flags. When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS - This means contiguous is not mandatory. - we will try to allocate the contiguous buffer. Say if the allocation fails, we fallback to allocate the individual pages. When we setTTM_PL_FLAG_CONTIGUOUS - This means contiguous allocation is mandatory. - we are setting this in amdgpu_bo_pin_restricted() before bo validation and check this flag in the vram manager file. - if this is set, we should allocate the buffer pages contiguously. the allocation fails, we return -ENOSPC. v2: - keep the mem_flags and bo->flags check as is(Christian) - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the amdgpu_bo_pin_restricted function placement range iteration loop(Christian) - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block (Christian) - Keep the kernel BO allocation as is(Christain) - If BO pin vram allocation failed, we need to return -ENOSPC as RDMA cannot work with scattered VRAM pages(Philip) Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 8 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++- 2 files changed, 50 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..caaef7b1df49 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) else places[c].flags |= TTM_PL_FLAG_TOPDOWN; - if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) + if (abo->tbo.type == ttm_bo_type_kernel && + flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; + c++; } @@ -966,6 +968,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, if (!bo->placements[i].lpfn || (lpfn && lpfn < bo->placements[i].lpfn)) bo->placements[i].lpfn = lpfn; + + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->placements[i].mem_type == TTM_PL_VRAM) + bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; } r = ttm_bo_validate(>tbo, >placement, ); Nice work, up till here that looks exactly right as far as I can see. diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..4be8b091099a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -88,6 +88,29 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head) return size; } +static inline unsigned long +amdgpu_vram_mgr_calculate_pages_per_block(struct ttm_buffer_object *tbo, + const struct ttm_place *place, + unsigned long bo_flags) +{ + unsigned long pages_per_block; + + if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { + pages_per_block = ~0ul; If I understand it correctly this here enforces the allocation of a contiguous buffer in the way that it says we should have only one giant page for the whole BO. yes, for contiguous we don't have the 2MB limit, hence we are setting to maximum to avoid restricting the min_block_size to 2MB limitation. + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pages_per_block = HPAGE_PMD_NR; +#else + /* default to 2MB */ + pages_per_block = 2UL << (20UL - PAGE_SHIFT); +#endif + pages_per_block = max_t(uint32_t, pages_per_block, + tbo->page_alignment); + } + + return pages_per_block; +} + /** * DOC: mem_info_vram_total * @@ -451,8 +474,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); struct amdgpu_device *adev = to_amdgpu_device(mgr); u64 vis_usage = 0, max_bytes, min_block_size; + struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo); struct amdgpu_vram_mgr_resource *vres; u64 size, remaining_size, lpfn, fpfn; + unsigned long bo_flags = bo->flags; struct drm_buddy *mm = >mm; struct drm_buddy_block *block; unsigned long pages_per_block; @@ -468,18 +493,9 @
Re: [PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour
Hi Christian, On 4/17/2024 12:19 PM, Christian König wrote: Am 17.04.24 um 08:21 schrieb Arunpravin Paneer Selvam: Now we have two flags for contiguous VRAM buffer allocation. If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the buffer's placement function. This patch will change the default behaviour of the two flags. When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS - This means contiguous is not mandatory. - we will try to allocate the contiguous buffer. Say if the allocation fails, we fallback to allocate the individual pages. When we setTTM_PL_FLAG_CONTIGUOUS - This means contiguous allocation is mandatory. - we are setting this in amdgpu_bo_pin_restricted() before bo validation and check this flag in the vram manager file. - if this is set, we should allocate the buffer pages contiguously. the allocation fails, we return -ENOSPC. v2: - keep the mem_flags and bo->flags check as is(Christian) - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the amdgpu_bo_pin_restricted function placement range iteration loop(Christian) - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block (Christian) - Keep the kernel BO allocation as is(Christain) - If BO pin vram allocation failed, we need to return -ENOSPC as RDMA cannot work with scattered VRAM pages(Philip) Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 8 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++- 2 files changed, 50 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..caaef7b1df49 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) else places[c].flags |= TTM_PL_FLAG_TOPDOWN; - if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) + if (abo->tbo.type == ttm_bo_type_kernel && + flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; + c++; } @@ -966,6 +968,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, if (!bo->placements[i].lpfn || (lpfn && lpfn < bo->placements[i].lpfn)) bo->placements[i].lpfn = lpfn; + + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->placements[i].mem_type == TTM_PL_VRAM) + bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; } r = ttm_bo_validate(>tbo, >placement, ); Nice work, up till here that looks exactly right as far as I can see. diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..4be8b091099a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -88,6 +88,29 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head) return size; } +static inline unsigned long +amdgpu_vram_mgr_calculate_pages_per_block(struct ttm_buffer_object *tbo, + const struct ttm_place *place, + unsigned long bo_flags) +{ + unsigned long pages_per_block; + + if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { + pages_per_block = ~0ul; If I understand it correctly this here enforces the allocation of a contiguous buffer in the way that it says we should have only one giant page for the whole BO. yes, for contiguous we don't have the 2MB limit, hence we are setting to maximum to avoid restricting the min_block_size to 2MB limitation. + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pages_per_block = HPAGE_PMD_NR; +#else + /* default to 2MB */ + pages_per_block = 2UL << (20UL - PAGE_SHIFT); +#endif + pages_per_block = max_t(uint32_t, pages_per_block, + tbo->page_alignment); + } + + return pages_per_block; +} + /** * DOC: mem_info_vram_total * @@ -451,8 +474,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); struct amdgpu_device *adev = to_amdgpu_device(mgr); u64 vis_usage = 0, max_bytes, min_block_size; + struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo); struct amdgpu_vram_mgr_resource *vres; u64 size, remaining_size, lpfn, fpfn; + unsigned long bo_flags = bo->flags; struct drm_buddy *mm = >mm; struct drm_buddy_block *block; unsigned long pages_per_block; @@ -468,18 +493,9 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, if (tbo->t
[PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour
Now we have two flags for contiguous VRAM buffer allocation. If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the buffer's placement function. This patch will change the default behaviour of the two flags. When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS - This means contiguous is not mandatory. - we will try to allocate the contiguous buffer. Say if the allocation fails, we fallback to allocate the individual pages. When we setTTM_PL_FLAG_CONTIGUOUS - This means contiguous allocation is mandatory. - we are setting this in amdgpu_bo_pin_restricted() before bo validation and check this flag in the vram manager file. - if this is set, we should allocate the buffer pages contiguously. the allocation fails, we return -ENOSPC. v2: - keep the mem_flags and bo->flags check as is(Christian) - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the amdgpu_bo_pin_restricted function placement range iteration loop(Christian) - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block (Christian) - Keep the kernel BO allocation as is(Christain) - If BO pin vram allocation failed, we need to return -ENOSPC as RDMA cannot work with scattered VRAM pages(Philip) Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 8 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++- 2 files changed, 50 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..caaef7b1df49 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) else places[c].flags |= TTM_PL_FLAG_TOPDOWN; - if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) + if (abo->tbo.type == ttm_bo_type_kernel && + flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; + c++; } @@ -966,6 +968,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, if (!bo->placements[i].lpfn || (lpfn && lpfn < bo->placements[i].lpfn)) bo->placements[i].lpfn = lpfn; + + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->placements[i].mem_type == TTM_PL_VRAM) + bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; } r = ttm_bo_validate(>tbo, >placement, ); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..4be8b091099a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -88,6 +88,29 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head) return size; } +static inline unsigned long +amdgpu_vram_mgr_calculate_pages_per_block(struct ttm_buffer_object *tbo, + const struct ttm_place *place, + unsigned long bo_flags) +{ + unsigned long pages_per_block; + + if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { + pages_per_block = ~0ul; + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pages_per_block = HPAGE_PMD_NR; +#else + /* default to 2MB */ + pages_per_block = 2UL << (20UL - PAGE_SHIFT); +#endif + pages_per_block = max_t(uint32_t, pages_per_block, + tbo->page_alignment); + } + + return pages_per_block; +} + /** * DOC: mem_info_vram_total * @@ -451,8 +474,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); struct amdgpu_device *adev = to_amdgpu_device(mgr); u64 vis_usage = 0, max_bytes, min_block_size; + struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo); struct amdgpu_vram_mgr_resource *vres; u64 size, remaining_size, lpfn, fpfn; + unsigned long bo_flags = bo->flags; struct drm_buddy *mm = >mm; struct drm_buddy_block *block; unsigned long pages_per_block; @@ -468,18 +493,9 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, if (tbo->type != ttm_bo_type_kernel) max_bytes -= AMDGPU_VM_RESERVED_VRAM; - if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { - pages_per_block = ~0ul; - } else { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - pages_per_block = HPAGE_PMD_NR; -#els
Re: [PATCH] drm/amdgpu: clear seq64 memory on free
Hi Christian, On 4/16/2024 6:24 PM, Christian König wrote: Am 16.04.24 um 14:34 schrieb Paneer Selvam, Arunpravin: Hi Christian, On 4/16/2024 5:47 PM, Christian König wrote: Am 16.04.24 um 14:16 schrieb Paneer Selvam, Arunpravin: Hi Christian, On 4/16/2024 2:35 PM, Christian König wrote: Am 15.04.24 um 20:48 schrieb Arunpravin Paneer Selvam: We should clear the memory on free. Otherwise, there is a chance that we will access the previous application data and this would leads to an abnormal behaviour in the current application. Mhm, I would strongly expect that we initialize the seq number after allocation. It could be that we also have situations were the correct start value is 0x or something like that instead. Why does this matter? when the user queue A's u64 address (fence address) is allocated to the newly created user queue B, we see a problem. User queue B calls the signal IOCTL which creates a new fence having the wptr as the seq number, in amdgpu_userq_fence_create function we have a check where we are comparing the rptr and wptr value (rptr >= wptr). since the rptr value is read from the u64 address which has the user queue A's wptr data, this rptr >= wptr condition gets satisfied and we are dropping the reference before the actual command gets processed in the hardware. If we clear this u64 value on free, we dont see this problem. Yeah, but that doesn't belongs into the seq64 handling. Instead the code which allocates the seq64 during userqueue created needs to clear it to 0. sure, got it. Yeah, but fixing that aside. We should probably initialize the seq64 array to something like 0xdeadbeef or a similar pattern to find issues were we forget to initialize the allocated slots. yes, I will prepare a patch and send for the review. Thanks, Arun. Regards, Christian. Thanks, Arun. Regards, Christian. Thanks, Arun. Regards, Christian. Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c index 4b9afc4df031..9613992c9804 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c @@ -189,10 +189,14 @@ int amdgpu_seq64_alloc(struct amdgpu_device *adev, u64 *va, u64 **cpu_addr) void amdgpu_seq64_free(struct amdgpu_device *adev, u64 va) { unsigned long bit_pos; + u64 *cpu_addr; bit_pos = (va - amdgpu_seq64_get_va_base(adev)) / sizeof(u64); - if (bit_pos < adev->seq64.num_sem) + if (bit_pos < adev->seq64.num_sem) { + cpu_addr = bit_pos + adev->seq64.cpu_base_addr; + memset(cpu_addr, 0, sizeof(u64)); __clear_bit(bit_pos, adev->seq64.used); + } } /**
Re: [PATCH] drm/amdgpu: clear seq64 memory on free
Hi Christian, On 4/16/2024 5:47 PM, Christian König wrote: Am 16.04.24 um 14:16 schrieb Paneer Selvam, Arunpravin: Hi Christian, On 4/16/2024 2:35 PM, Christian König wrote: Am 15.04.24 um 20:48 schrieb Arunpravin Paneer Selvam: We should clear the memory on free. Otherwise, there is a chance that we will access the previous application data and this would leads to an abnormal behaviour in the current application. Mhm, I would strongly expect that we initialize the seq number after allocation. It could be that we also have situations were the correct start value is 0x or something like that instead. Why does this matter? when the user queue A's u64 address (fence address) is allocated to the newly created user queue B, we see a problem. User queue B calls the signal IOCTL which creates a new fence having the wptr as the seq number, in amdgpu_userq_fence_create function we have a check where we are comparing the rptr and wptr value (rptr >= wptr). since the rptr value is read from the u64 address which has the user queue A's wptr data, this rptr >= wptr condition gets satisfied and we are dropping the reference before the actual command gets processed in the hardware. If we clear this u64 value on free, we dont see this problem. Yeah, but that doesn't belongs into the seq64 handling. Instead the code which allocates the seq64 during userqueue created needs to clear it to 0. sure, got it. Thanks, Arun. Regards, Christian. Thanks, Arun. Regards, Christian. Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c index 4b9afc4df031..9613992c9804 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c @@ -189,10 +189,14 @@ int amdgpu_seq64_alloc(struct amdgpu_device *adev, u64 *va, u64 **cpu_addr) void amdgpu_seq64_free(struct amdgpu_device *adev, u64 va) { unsigned long bit_pos; + u64 *cpu_addr; bit_pos = (va - amdgpu_seq64_get_va_base(adev)) / sizeof(u64); - if (bit_pos < adev->seq64.num_sem) + if (bit_pos < adev->seq64.num_sem) { + cpu_addr = bit_pos + adev->seq64.cpu_base_addr; + memset(cpu_addr, 0, sizeof(u64)); __clear_bit(bit_pos, adev->seq64.used); + } } /**
Re: [PATCH] drm/amdgpu: clear seq64 memory on free
Hi Christian, On 4/16/2024 2:35 PM, Christian König wrote: Am 15.04.24 um 20:48 schrieb Arunpravin Paneer Selvam: We should clear the memory on free. Otherwise, there is a chance that we will access the previous application data and this would leads to an abnormal behaviour in the current application. Mhm, I would strongly expect that we initialize the seq number after allocation. It could be that we also have situations were the correct start value is 0x or something like that instead. Why does this matter? when the user queue A's u64 address (fence address) is allocated to the newly created user queue B, we see a problem. User queue B calls the signal IOCTL which creates a new fence having the wptr as the seq number, in amdgpu_userq_fence_create function we have a check where we are comparing the rptr and wptr value (rptr >= wptr). since the rptr value is read from the u64 address which has the user queue A's wptr data, this rptr >= wptr condition gets satisfied and we are dropping the reference before the actual command gets processed in the hardware. If we clear this u64 value on free, we dont see this problem. Thanks, Arun. Regards, Christian. Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c index 4b9afc4df031..9613992c9804 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c @@ -189,10 +189,14 @@ int amdgpu_seq64_alloc(struct amdgpu_device *adev, u64 *va, u64 **cpu_addr) void amdgpu_seq64_free(struct amdgpu_device *adev, u64 va) { unsigned long bit_pos; + u64 *cpu_addr; bit_pos = (va - amdgpu_seq64_get_va_base(adev)) / sizeof(u64); - if (bit_pos < adev->seq64.num_sem) + if (bit_pos < adev->seq64.num_sem) { + cpu_addr = bit_pos + adev->seq64.cpu_base_addr; + memset(cpu_addr, 0, sizeof(u64)); __clear_bit(bit_pos, adev->seq64.used); + } } /**
Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour
Hi Christian, On 4/16/2024 1:16 PM, Christian König wrote: Am 16.04.24 um 00:02 schrieb Philip Yang: On 2024-04-14 10:57, Arunpravin Paneer Selvam wrote: Now we have two flags for contiguous VRAM buffer allocation. If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the buffer's placement function. This patch will change the default behaviour of the two flags. This change will simplify the KFD best effort contiguous VRAM allocation, because KFD doesn't need set new GEM_ flag. When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS - This means contiguous is not mandatory. AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS used in couple of places. For page table BO, it is fine as BO size is page size 4K. For 64KB reserved BOs and F/W size related BOs, do all allocation happen at driver initialization before the VRAM is fragmented? Oh, that's a really good point, we need to keep the behavior as is for kernel allocations. Arun can you take care of that? Sure, I will take care kernel allocations. Thanks, Arun. Thanks, Christian. - we will try to allocate the contiguous buffer. Say if the allocation fails, we fallback to allocate the individual pages. When we setTTM_PL_FLAG_CONTIGUOUS - This means contiguous allocation is mandatory. - we are setting this in amdgpu_bo_pin_restricted() before bo validation and check this flag in the vram manager file. - if this is set, we should allocate the buffer pages contiguously. the allocation fails, we return -ENOSPC. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 14 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++- 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..41926d631563 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) else places[c].flags |= TTM_PL_FLAG_TOPDOWN; - if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) - places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; c++; } @@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, { struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); struct ttm_operation_ctx ctx = { false, false }; + struct ttm_place *places = bo->placements; + u32 c = 0; int r, i; if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) @@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, if (bo->tbo.pin_count) { uint32_t mem_type = bo->tbo.resource->mem_type; - uint32_t mem_flags = bo->tbo.resource->placement; if (!(domain & amdgpu_mem_type_to_domain(mem_type))) return -EINVAL; - if ((mem_type == TTM_PL_VRAM) && - (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) && - !(mem_flags & TTM_PL_FLAG_CONTIGUOUS)) - return -EINVAL; - This looks like a bug before, but with this patch, the check makes sense and is needed. ttm_bo_pin(>tbo); if (max_offset != 0) { @@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, bo->placements[i].lpfn = lpfn; } + if (domain & AMDGPU_GEM_DOMAIN_VRAM && + !WARN_ON(places[c].mem_type != TTM_PL_VRAM)) + places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; + If BO pinned is not allocated with AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, should pin and return scattered pages because the RDMA support scattered dmabuf. Christian also pointed this out. If (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && bo->placements[i].mem_type == TTM_PL_VRAM) o->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; r = ttm_bo_validate(>tbo, >placement, ); if (unlikely(r)) { dev_err(adev->dev, "%p pin failed\n", bo); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..ddbf302878f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -88,6 +88,30 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head) return size; } +static inline unsigned long +amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo, + const struct ttm_place *place, + unsigned long bo_flags) +{ + unsigned long pages_per_block; + + if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS || + place->flags & TTM_PL_FLAG_CONTIGUOUS) { + pages_per_blo
Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour
On 4/16/2024 3:32 AM, Philip Yang wrote: On 2024-04-14 10:57, Arunpravin Paneer Selvam wrote: Now we have two flags for contiguous VRAM buffer allocation. If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the buffer's placement function. This patch will change the default behaviour of the two flags. This change will simplify the KFD best effort contiguous VRAM allocation, because KFD doesn't need set new GEM_ flag. When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS - This means contiguous is not mandatory. AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS used in couple of places. For page table BO, it is fine as BO size is page size 4K. For 64KB reserved BOs and F/W size related BOs, do all allocation happen at driver initialization before the VRAM is fragmented? - we will try to allocate the contiguous buffer. Say if the allocation fails, we fallback to allocate the individual pages. When we setTTM_PL_FLAG_CONTIGUOUS - This means contiguous allocation is mandatory. - we are setting this in amdgpu_bo_pin_restricted() before bo validation and check this flag in the vram manager file. - if this is set, we should allocate the buffer pages contiguously. the allocation fails, we return -ENOSPC. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 14 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++- 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..41926d631563 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) else places[c].flags |= TTM_PL_FLAG_TOPDOWN; - if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) - places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; c++; } @@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, { struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); struct ttm_operation_ctx ctx = { false, false }; + struct ttm_place *places = bo->placements; + u32 c = 0; int r, i; if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) @@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, if (bo->tbo.pin_count) { uint32_t mem_type = bo->tbo.resource->mem_type; - uint32_t mem_flags = bo->tbo.resource->placement; if (!(domain & amdgpu_mem_type_to_domain(mem_type))) return -EINVAL; - if ((mem_type == TTM_PL_VRAM) && - (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) && - !(mem_flags & TTM_PL_FLAG_CONTIGUOUS)) - return -EINVAL; - This looks like a bug before, but with this patch, the check makes sense and is needed. ttm_bo_pin(>tbo); if (max_offset != 0) { @@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, bo->placements[i].lpfn = lpfn; } + if (domain & AMDGPU_GEM_DOMAIN_VRAM && + !WARN_ON(places[c].mem_type != TTM_PL_VRAM)) + places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; + If BO pinned is not allocated with AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, should pin and return scattered pages because the RDMA support scattered dmabuf. Christian also pointed this out. If (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && bo->placements[i].mem_type == TTM_PL_VRAM) o->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; r = ttm_bo_validate(>tbo, >placement, ); if (unlikely(r)) { dev_err(adev->dev, "%p pin failed\n", bo); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..ddbf302878f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -88,6 +88,30 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head) return size; } +static inline unsigned long +amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo, +const struct ttm_place *place, +unsigned long bo_flags) +{ + unsigned long pages_per_block; + + if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS || + place->flags & TTM_PL_FLAG_CONTIGUOUS) { + pages_per_block = ~0ul; + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pages_per_block =
[PATCH] drm/amdgpu: clear seq64 memory on free
We should clear the memory on free. Otherwise, there is a chance that we will access the previous application data and this would leads to an abnormal behaviour in the current application. Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c index 4b9afc4df031..9613992c9804 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c @@ -189,10 +189,14 @@ int amdgpu_seq64_alloc(struct amdgpu_device *adev, u64 *va, u64 **cpu_addr) void amdgpu_seq64_free(struct amdgpu_device *adev, u64 va) { unsigned long bit_pos; + u64 *cpu_addr; bit_pos = (va - amdgpu_seq64_get_va_base(adev)) / sizeof(u64); - if (bit_pos < adev->seq64.num_sem) + if (bit_pos < adev->seq64.num_sem) { + cpu_addr = bit_pos + adev->seq64.cpu_base_addr; + memset(cpu_addr, 0, sizeof(u64)); __clear_bit(bit_pos, adev->seq64.used); + } } /** -- 2.25.1
Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour
Hi Christian, On 4/15/2024 7:33 PM, Christian König wrote: Am 14.04.24 um 16:57 schrieb Arunpravin Paneer Selvam: Now we have two flags for contiguous VRAM buffer allocation. If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the buffer's placement function. This patch will change the default behaviour of the two flags. When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS - This means contiguous is not mandatory. - we will try to allocate the contiguous buffer. Say if the allocation fails, we fallback to allocate the individual pages. When we setTTM_PL_FLAG_CONTIGUOUS - This means contiguous allocation is mandatory. - we are setting this in amdgpu_bo_pin_restricted() before bo validation and check this flag in the vram manager file. - if this is set, we should allocate the buffer pages contiguously. the allocation fails, we return -ENOSPC. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 14 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++- 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..41926d631563 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) else places[c].flags |= TTM_PL_FLAG_TOPDOWN; - if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) - places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; c++; } @@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, { struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); struct ttm_operation_ctx ctx = { false, false }; + struct ttm_place *places = bo->placements; + u32 c = 0; int r, i; if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) @@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, if (bo->tbo.pin_count) { uint32_t mem_type = bo->tbo.resource->mem_type; - uint32_t mem_flags = bo->tbo.resource->placement; if (!(domain & amdgpu_mem_type_to_domain(mem_type))) return -EINVAL; - if ((mem_type == TTM_PL_VRAM) && - (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) && - !(mem_flags & TTM_PL_FLAG_CONTIGUOUS)) - return -EINVAL; - I think that check here needs to stay. ttm_bo_pin(>tbo); if (max_offset != 0) { @@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, bo->placements[i].lpfn = lpfn; } + if (domain & AMDGPU_GEM_DOMAIN_VRAM && + !WARN_ON(places[c].mem_type != TTM_PL_VRAM)) + places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; + This needs to go into the loop directly above as something like this here: If (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && bo->placements[i].mem_type == TTM_PL_VRAM) o->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; This essentially replaces the removed code in amdgpu_bo_placement_from_domain() and only applies it during pinning. r = ttm_bo_validate(>tbo, >placement, ); if (unlikely(r)) { dev_err(adev->dev, "%p pin failed\n", bo); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..ddbf302878f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -88,6 +88,30 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head) return size; } +static inline unsigned long +amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo, + const struct ttm_place *place, + unsigned long bo_flags) Well I think we need a better name here. "find" usually means we look up something in a data structure. Maybe amdgpu_vram_mgr_calculate_pages_per_block. +{ + unsigned long pages_per_block; + + if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS || + place->flags & TTM_PL_FLAG_CONTIGUOUS) { + pages_per_block = ~0ul; + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pages_per_block = HPAGE_PMD_NR; +#else + /* default to 2MB */ + pages_per_block = 2UL << (20UL - PAGE_SHIFT); +#endif + pages_per_block = max_t(uint32_t, pages_per_block, + tbo->page_alignment); + } + + return pages_per_block; +} + /** * DOC: mem_info_vram_total * @@ -451,8 +475,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
Re: [PATCH v11 1/3] drm/buddy: Implement tracking clear page feature
Hi Christian, Could you please push these patches into drm branch. Thanks, Arun. On 4/15/2024 2:53 AM, Arunpravin Paneer Selvam wrote: - Add tracking clear page feature. - Driver should enable the DRM_BUDDY_CLEARED flag if it successfully clears the blocks in the free path. On the otherhand, DRM buddy marks each block as cleared. - Track the available cleared pages size - If driver requests cleared memory we prefer cleared memory but fallback to uncleared if we can't find the cleared blocks. when driver requests uncleared memory we try to use uncleared but fallback to cleared memory if necessary. - When a block gets freed we clear it and mark the freed block as cleared, when there are buddies which are cleared as well we can merge them. Otherwise, we prefer to keep the blocks as separated. - Add a function to support defragmentation. v1: - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as cleared. Else, reset the clear flag for each block in the list(Christian) - For merging the 2 cleared blocks compare as below, drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian) - Defragment the memory beginning from min_order till the required memory space is available. v2: (Matthew) - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks operation within drm buddy. - Write a macro block_incompatible() to allocate the required blocks. - Update the xe driver for the drm_buddy_free_list change in arguments. - add a warning if the two blocks are incompatible on defragmentation - call full defragmentation in the fini() function - place a condition to test if min_order is equal to 0 - replace the list with safe_reverse() variant as we might remove the block from the list. v3: - fix Gitlab user reported lockup issue. - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew) - modify to pass the root order instead max_order in fini() function(Matthew) - change bool 1 to true(Matthew) - add check if min_block_size is power of 2(Matthew) - modify the min_block_size datatype to u64(Matthew) v4: - rename the function drm_buddy_defrag with __force_merge. - Include __force_merge directly in drm buddy file and remove the defrag use in amdgpu driver. - Remove list_empty() check(Matthew) - Remove unnecessary space, headers and placement of new variables(Matthew) - Add a unit test case(Matthew) v5: - remove force merge support to actual range allocation and not to bail out when contains && split(Matthew) - add range support to force merge function. v6: - modify the alloc_range() function clear page non merged blocks allocation(Matthew) - correct the list_insert function name(Matthew). Signed-off-by: Arunpravin Paneer Selvam Signed-off-by: Matthew Auld Suggested-by: Christian König Suggested-by: Matthew Auld Reviewed-by: Matthew Auld --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/drm_buddy.c | 427 ++ drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 6 +- drivers/gpu/drm/tests/drm_buddy_test.c| 28 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 4 +- include/drm/drm_buddy.h | 16 +- 6 files changed, 363 insertions(+), 124 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..c0c851409241 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); error_fini: ttm_resource_fini(man, >base); @@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, amdgpu_vram_mgr_do_reserve(man); - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); atomic64_sub(vis_usage, >vis_usage); @@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) kfree(rsv); list_for_each_entry_safe(rsv, temp, >reserved_pages, blocks) { - drm_buddy_free_list(>mm, >allocated); + drm_buddy_free_list(>mm, >allocated, 0); kfree(rsv); } if (!adev->gmc.is_app_apu) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 5ebdd6f8f36e..284ebae71cc4 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm, __list_add(>link, node->link.prev, >link); } +static void clear_reset(struct drm_buddy_block *block) +{ +
[PATCH v11 1/3] drm/buddy: Implement tracking clear page feature
- Add tracking clear page feature. - Driver should enable the DRM_BUDDY_CLEARED flag if it successfully clears the blocks in the free path. On the otherhand, DRM buddy marks each block as cleared. - Track the available cleared pages size - If driver requests cleared memory we prefer cleared memory but fallback to uncleared if we can't find the cleared blocks. when driver requests uncleared memory we try to use uncleared but fallback to cleared memory if necessary. - When a block gets freed we clear it and mark the freed block as cleared, when there are buddies which are cleared as well we can merge them. Otherwise, we prefer to keep the blocks as separated. - Add a function to support defragmentation. v1: - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as cleared. Else, reset the clear flag for each block in the list(Christian) - For merging the 2 cleared blocks compare as below, drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian) - Defragment the memory beginning from min_order till the required memory space is available. v2: (Matthew) - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks operation within drm buddy. - Write a macro block_incompatible() to allocate the required blocks. - Update the xe driver for the drm_buddy_free_list change in arguments. - add a warning if the two blocks are incompatible on defragmentation - call full defragmentation in the fini() function - place a condition to test if min_order is equal to 0 - replace the list with safe_reverse() variant as we might remove the block from the list. v3: - fix Gitlab user reported lockup issue. - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew) - modify to pass the root order instead max_order in fini() function(Matthew) - change bool 1 to true(Matthew) - add check if min_block_size is power of 2(Matthew) - modify the min_block_size datatype to u64(Matthew) v4: - rename the function drm_buddy_defrag with __force_merge. - Include __force_merge directly in drm buddy file and remove the defrag use in amdgpu driver. - Remove list_empty() check(Matthew) - Remove unnecessary space, headers and placement of new variables(Matthew) - Add a unit test case(Matthew) v5: - remove force merge support to actual range allocation and not to bail out when contains && split(Matthew) - add range support to force merge function. v6: - modify the alloc_range() function clear page non merged blocks allocation(Matthew) - correct the list_insert function name(Matthew). Signed-off-by: Arunpravin Paneer Selvam Signed-off-by: Matthew Auld Suggested-by: Christian König Suggested-by: Matthew Auld Reviewed-by: Matthew Auld --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/drm_buddy.c | 427 ++ drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 6 +- drivers/gpu/drm/tests/drm_buddy_test.c| 28 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 4 +- include/drm/drm_buddy.h | 16 +- 6 files changed, 363 insertions(+), 124 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..c0c851409241 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); error_fini: ttm_resource_fini(man, >base); @@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, amdgpu_vram_mgr_do_reserve(man); - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); atomic64_sub(vis_usage, >vis_usage); @@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) kfree(rsv); list_for_each_entry_safe(rsv, temp, >reserved_pages, blocks) { - drm_buddy_free_list(>mm, >allocated); + drm_buddy_free_list(>mm, >allocated, 0); kfree(rsv); } if (!adev->gmc.is_app_apu) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 5ebdd6f8f36e..284ebae71cc4 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm, __list_add(>link, node->link.prev, >link); } +static void clear_reset(struct drm_buddy_block *block) +{ + block->header &= ~DRM_BUDDY_HEADER_CLEAR; +} + +static void mark_cleared(struct drm_buddy_block *block) +{ + block->header |= DRM_BUDDY_HEADER_CLEAR; +} + stati
[PATCH v11 3/3] drm/tests: Add a test case for drm buddy clear allocation
Add a new test case for the drm buddy clear and dirty allocation. v2:(Matthew) - make size as u32 - rename PAGE_SIZE with SZ_4K - dont fragment the address space for all the order allocation iterations. we can do it once and just increment and allocate the size. - create new mm with non power-of-two size to ensure the multi-root force_merge during fini. v3: - add randomness in size calculation(Matthew) Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Matthew Auld Suggested-by: Matthew Auld --- drivers/gpu/drm/tests/drm_buddy_test.c | 143 + 1 file changed, 143 insertions(+) diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c b/drivers/gpu/drm/tests/drm_buddy_test.c index 4621a860cb05..e3b50e240d36 100644 --- a/drivers/gpu/drm/tests/drm_buddy_test.c +++ b/drivers/gpu/drm/tests/drm_buddy_test.c @@ -224,6 +224,148 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) drm_buddy_fini(); } +static void drm_test_buddy_alloc_clear(struct kunit *test) +{ + unsigned long n_pages, total, i = 0; + DRM_RND_STATE(prng, random_seed); + const unsigned long ps = SZ_4K; + struct drm_buddy_block *block; + const int max_order = 12; + LIST_HEAD(allocated); + struct drm_buddy mm; + unsigned int order; + u32 mm_size, size; + LIST_HEAD(dirty); + LIST_HEAD(clean); + + mm_size = SZ_4K << max_order; + KUNIT_EXPECT_FALSE(test, drm_buddy_init(, mm_size, ps)); + + KUNIT_EXPECT_EQ(test, mm.max_order, max_order); + + /* +* Idea is to allocate and free some random portion of the address space, +* returning those pages as non-dirty and randomly alternate between +* requesting dirty and non-dirty pages (not going over the limit +* we freed as non-dirty), putting that into two separate lists. +* Loop over both lists at the end checking that the dirty list +* is indeed all dirty pages and vice versa. Free it all again, +* keeping the dirty/clear status. +*/ + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(, 0, mm_size, + 5 * ps, ps, , + DRM_BUDDY_TOPDOWN_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", 5 * ps); + drm_buddy_free_list(, , DRM_BUDDY_CLEARED); + + n_pages = 10; + do { + unsigned long flags; + struct list_head *list; + int slot = i % 2; + + if (slot == 0) { + list = + flags = 0; + } else { + list = + flags = DRM_BUDDY_CLEAR_ALLOCATION; + } + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(, 0, mm_size, + ps, ps, list, + flags), + "buddy_alloc hit an error size=%lu\n", ps); + } while (++i < n_pages); + + list_for_each_entry(block, , link) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), true); + + list_for_each_entry(block, , link) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); + + drm_buddy_free_list(, , DRM_BUDDY_CLEARED); + + /* +* Trying to go over the clear limit for some allocation. +* The allocation should never fail with reasonable page-size. +*/ + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(, 0, mm_size, + 10 * ps, ps, , + DRM_BUDDY_CLEAR_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", 10 * ps); + + drm_buddy_free_list(, , DRM_BUDDY_CLEARED); + drm_buddy_free_list(, , 0); + drm_buddy_fini(); + + KUNIT_EXPECT_FALSE(test, drm_buddy_init(, mm_size, ps)); + + /* +* Create a new mm. Intentionally fragment the address space by creating +* two alternating lists. Free both lists, one as dirty the other as clean. +* Try to allocate double the previous size with matching min_page_size. The +* allocation should never fail as it calls the force_merge. Also check that +* the page is always dirty after force_merge. Free the page as dirty, then +* repeat the whole thing, increment the order until we hit the max_order. +*/ + + i = 0; + n_pages = mm_size / ps; + do { + struct list_head *list; + int slot = i % 2; + + if (slot == 0) + list = +
[PATCH v11 2/3] drm/amdgpu: Enable clear page functionality
Add clear page support in vram memory region. v1(Christian): - Dont handle clear page as TTM flag since when moving the BO back in from GTT again we don't need that. - Make a specialized version of amdgpu_fill_buffer() which only clears the VRAM areas which are not already cleared - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in amdgpu_object.c v2: - Modify the function name amdgpu_ttm_* (Alex) - Drop the delayed parameter (Christian) - handle amdgpu_res_cleared() just above the size calculation (Christian) - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers in the free path to properly wait for fences etc.. (Christian) v3(Christian): - Remove buffer clear code in VRAM manager instead change the AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set the DRM_BUDDY_CLEARED flag. - Remove ! from amdgpu_res_cleared() check. v4(Christian): - vres flag setting move to vram manager file - use dma_fence_get_stub in amdgpu_ttm_clear_buffer function - make fence a mandatory parameter and drop the if and the get/put dance Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König Acked-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c| 9 +-- .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h| 25 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 70 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 5 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h | 10 +++ 6 files changed, 116 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..e011a326fe86 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -39,6 +39,7 @@ #include "amdgpu.h" #include "amdgpu_trace.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_vram_mgr.h" /** * DOC: amdgpu_object @@ -601,8 +602,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (!amdgpu_bo_support_uswc(bo->flags)) bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; - if (adev->ras_enabled) - bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; + bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; bo->tbo.bdev = >mman.bdev; if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA | @@ -634,7 +634,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev, bo->tbo.resource->mem_type == TTM_PL_VRAM) { struct dma_fence *fence; - r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, , true); + r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, ); if (unlikely(r)) goto fail_unreserve; @@ -1365,8 +1365,9 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo) if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv))) return; - r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, , true); + r = amdgpu_fill_buffer(abo, 0, bo->base.resv, , true); if (!WARN_ON(r)) { + amdgpu_vram_mgr_set_cleared(bo->resource); amdgpu_bo_fence(abo, fence, false); dma_fence_put(fence); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h index 381101d2bf05..50fcd86e1033 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h @@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size) } } +/** + * amdgpu_res_cleared - check if blocks are cleared + * + * @cur: the cursor to extract the block + * + * Check if the @cur block is cleared + */ +static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur) +{ + struct drm_buddy_block *block; + + switch (cur->mem_type) { + case TTM_PL_VRAM: + block = cur->node; + + if (!amdgpu_vram_mgr_is_cleared(block)) + return false; + break; + default: + return false; + } + + return true; +} + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index fc418e670fda..f0b42b567357 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -378,11 +378,12 @@ static int amdgpu_move_blit(struct ttm_buffer_object *bo, (abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) { struct dma_fence *wipe_fence = NULL; - r = amdgpu_fill_buffer(abo, AMDGPU_POISON, NULL, _fence, - false); + r =
Re: [PATCH v10 1/3] drm/buddy: Implement tracking clear page feature
Hi Matthew, On 4/10/2024 6:22 PM, Matthew Auld wrote: On 08/04/2024 16:16, Arunpravin Paneer Selvam wrote: - Add tracking clear page feature. - Driver should enable the DRM_BUDDY_CLEARED flag if it successfully clears the blocks in the free path. On the otherhand, DRM buddy marks each block as cleared. - Track the available cleared pages size - If driver requests cleared memory we prefer cleared memory but fallback to uncleared if we can't find the cleared blocks. when driver requests uncleared memory we try to use uncleared but fallback to cleared memory if necessary. - When a block gets freed we clear it and mark the freed block as cleared, when there are buddies which are cleared as well we can merge them. Otherwise, we prefer to keep the blocks as separated. - Add a function to support defragmentation. v1: - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as cleared. Else, reset the clear flag for each block in the list(Christian) - For merging the 2 cleared blocks compare as below, drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian) - Defragment the memory beginning from min_order till the required memory space is available. v2: (Matthew) - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks operation within drm buddy. - Write a macro block_incompatible() to allocate the required blocks. - Update the xe driver for the drm_buddy_free_list change in arguments. - add a warning if the two blocks are incompatible on defragmentation - call full defragmentation in the fini() function - place a condition to test if min_order is equal to 0 - replace the list with safe_reverse() variant as we might remove the block from the list. v3: - fix Gitlab user reported lockup issue. - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew) - modify to pass the root order instead max_order in fini() function(Matthew) - change bool 1 to true(Matthew) - add check if min_block_size is power of 2(Matthew) - modify the min_block_size datatype to u64(Matthew) v4: - rename the function drm_buddy_defrag with __force_merge. - Include __force_merge directly in drm buddy file and remove the defrag use in amdgpu driver. - Remove list_empty() check(Matthew) - Remove unnecessary space, headers and placement of new variables(Matthew) - Add a unit test case(Matthew) v5: - remove force merge support to actual range allocation and not to bail out when contains && split(Matthew) - add range support to force merge function. Signed-off-by: Arunpravin Paneer Selvam Signed-off-by: Matthew Auld Suggested-by: Christian König Suggested-by: Matthew Auld --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/drm_buddy.c | 430 ++ drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 6 +- drivers/gpu/drm/tests/drm_buddy_test.c | 28 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 4 +- include/drm/drm_buddy.h | 16 +- 6 files changed, 368 insertions(+), 122 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..c0c851409241 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); error_fini: ttm_resource_fini(man, >base); @@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, amdgpu_vram_mgr_do_reserve(man); - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); atomic64_sub(vis_usage, >vis_usage); @@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) kfree(rsv); list_for_each_entry_safe(rsv, temp, >reserved_pages, blocks) { - drm_buddy_free_list(>mm, >allocated); + drm_buddy_free_list(>mm, >allocated, 0); kfree(rsv); } if (!adev->gmc.is_app_apu) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 5ebdd6f8f36e..83dbe252f727 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -38,8 +38,8 @@ static void drm_block_free(struct drm_buddy *mm, kmem_cache_free(slab_blocks, block); } -static void list_insert_sorted(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void list_insert(struct drm_buddy *mm, + struct drm_buddy_block *block) Why the change here? { struct drm_buddy_block *node; struct list_head *head; @@ -57,6 +57,16 @@ s
[PATCH] drm/amdgpu: Modify the contiguous flags behaviour
Now we have two flags for contiguous VRAM buffer allocation. If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the buffer's placement function. This patch will change the default behaviour of the two flags. When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS - This means contiguous is not mandatory. - we will try to allocate the contiguous buffer. Say if the allocation fails, we fallback to allocate the individual pages. When we setTTM_PL_FLAG_CONTIGUOUS - This means contiguous allocation is mandatory. - we are setting this in amdgpu_bo_pin_restricted() before bo validation and check this flag in the vram manager file. - if this is set, we should allocate the buffer pages contiguously. the allocation fails, we return -ENOSPC. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 14 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++- 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..41926d631563 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) else places[c].flags |= TTM_PL_FLAG_TOPDOWN; - if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) - places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; c++; } @@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, { struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); struct ttm_operation_ctx ctx = { false, false }; + struct ttm_place *places = bo->placements; + u32 c = 0; int r, i; if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) @@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, if (bo->tbo.pin_count) { uint32_t mem_type = bo->tbo.resource->mem_type; - uint32_t mem_flags = bo->tbo.resource->placement; if (!(domain & amdgpu_mem_type_to_domain(mem_type))) return -EINVAL; - if ((mem_type == TTM_PL_VRAM) && - (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) && - !(mem_flags & TTM_PL_FLAG_CONTIGUOUS)) - return -EINVAL; - ttm_bo_pin(>tbo); if (max_offset != 0) { @@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, bo->placements[i].lpfn = lpfn; } + if (domain & AMDGPU_GEM_DOMAIN_VRAM && + !WARN_ON(places[c].mem_type != TTM_PL_VRAM)) + places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; + r = ttm_bo_validate(>tbo, >placement, ); if (unlikely(r)) { dev_err(adev->dev, "%p pin failed\n", bo); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..ddbf302878f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -88,6 +88,30 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head) return size; } +static inline unsigned long +amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo, +const struct ttm_place *place, +unsigned long bo_flags) +{ + unsigned long pages_per_block; + + if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS || + place->flags & TTM_PL_FLAG_CONTIGUOUS) { + pages_per_block = ~0ul; + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pages_per_block = HPAGE_PMD_NR; +#else + /* default to 2MB */ + pages_per_block = 2UL << (20UL - PAGE_SHIFT); +#endif + pages_per_block = max_t(uint32_t, pages_per_block, + tbo->page_alignment); + } + + return pages_per_block; +} + /** * DOC: mem_info_vram_total * @@ -451,8 +475,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); struct amdgpu_device *adev = to_amdgpu_device(mgr); u64 vis_usage = 0, max_bytes, min_block_size; + struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo); struct amdgpu_vram_mgr_resource *vres; u64 size, remaining_size, lpfn, fpfn; + unsigned long bo_flags = bo->flags; struct drm_buddy *mm = >mm; struct drm_buddy_block *block; unsigned long pag
Re: [PATCH v10 1/3] drm/buddy: Implement tracking clear page feature
Hi Matthew, Could you please review these changes as few clients are waiting for these patches. Thanks, Arun. On 4/8/2024 8:46 PM, Arunpravin Paneer Selvam wrote: - Add tracking clear page feature. - Driver should enable the DRM_BUDDY_CLEARED flag if it successfully clears the blocks in the free path. On the otherhand, DRM buddy marks each block as cleared. - Track the available cleared pages size - If driver requests cleared memory we prefer cleared memory but fallback to uncleared if we can't find the cleared blocks. when driver requests uncleared memory we try to use uncleared but fallback to cleared memory if necessary. - When a block gets freed we clear it and mark the freed block as cleared, when there are buddies which are cleared as well we can merge them. Otherwise, we prefer to keep the blocks as separated. - Add a function to support defragmentation. v1: - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as cleared. Else, reset the clear flag for each block in the list(Christian) - For merging the 2 cleared blocks compare as below, drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian) - Defragment the memory beginning from min_order till the required memory space is available. v2: (Matthew) - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks operation within drm buddy. - Write a macro block_incompatible() to allocate the required blocks. - Update the xe driver for the drm_buddy_free_list change in arguments. - add a warning if the two blocks are incompatible on defragmentation - call full defragmentation in the fini() function - place a condition to test if min_order is equal to 0 - replace the list with safe_reverse() variant as we might remove the block from the list. v3: - fix Gitlab user reported lockup issue. - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew) - modify to pass the root order instead max_order in fini() function(Matthew) - change bool 1 to true(Matthew) - add check if min_block_size is power of 2(Matthew) - modify the min_block_size datatype to u64(Matthew) v4: - rename the function drm_buddy_defrag with __force_merge. - Include __force_merge directly in drm buddy file and remove the defrag use in amdgpu driver. - Remove list_empty() check(Matthew) - Remove unnecessary space, headers and placement of new variables(Matthew) - Add a unit test case(Matthew) v5: - remove force merge support to actual range allocation and not to bail out when contains && split(Matthew) - add range support to force merge function. Signed-off-by: Arunpravin Paneer Selvam Signed-off-by: Matthew Auld Suggested-by: Christian König Suggested-by: Matthew Auld --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/drm_buddy.c | 430 ++ drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 6 +- drivers/gpu/drm/tests/drm_buddy_test.c| 28 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 4 +- include/drm/drm_buddy.h | 16 +- 6 files changed, 368 insertions(+), 122 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..c0c851409241 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); error_fini: ttm_resource_fini(man, >base); @@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, amdgpu_vram_mgr_do_reserve(man); - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); atomic64_sub(vis_usage, >vis_usage); @@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) kfree(rsv); list_for_each_entry_safe(rsv, temp, >reserved_pages, blocks) { - drm_buddy_free_list(>mm, >allocated); + drm_buddy_free_list(>mm, >allocated, 0); kfree(rsv); } if (!adev->gmc.is_app_apu) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 5ebdd6f8f36e..83dbe252f727 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -38,8 +38,8 @@ static void drm_block_free(struct drm_buddy *mm, kmem_cache_free(slab_blocks, block); } -static void list_insert_sorted(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void list_insert(struct drm_buddy *mm, + struct drm_buddy_block *block) { stru
[PATCH v10 3/3] drm/tests: Add a test case for drm buddy clear allocation
Add a new test case for the drm buddy clear and dirty allocation. v2:(Matthew) - make size as u32 - rename PAGE_SIZE with SZ_4K - dont fragment the address space for all the order allocation iterations. we can do it once and just increment and allocate the size. - create new mm with non power-of-two size to ensure the multi-root force_merge during fini. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Matthew Auld --- drivers/gpu/drm/tests/drm_buddy_test.c | 141 + 1 file changed, 141 insertions(+) diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c b/drivers/gpu/drm/tests/drm_buddy_test.c index 4621a860cb05..b07f132f2835 100644 --- a/drivers/gpu/drm/tests/drm_buddy_test.c +++ b/drivers/gpu/drm/tests/drm_buddy_test.c @@ -224,6 +224,146 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) drm_buddy_fini(); } +static void drm_test_buddy_alloc_clear(struct kunit *test) +{ + unsigned long n_pages, total, i = 0; + const unsigned long ps = SZ_4K; + struct drm_buddy_block *block; + const int max_order = 12; + LIST_HEAD(allocated); + struct drm_buddy mm; + unsigned int order; + u32 mm_size, size; + LIST_HEAD(dirty); + LIST_HEAD(clean); + + mm_size = SZ_4K << max_order; + KUNIT_EXPECT_FALSE(test, drm_buddy_init(, mm_size, ps)); + + KUNIT_EXPECT_EQ(test, mm.max_order, max_order); + + /* +* Idea is to allocate and free some random portion of the address space, +* returning those pages as non-dirty and randomly alternate between +* requesting dirty and non-dirty pages (not going over the limit +* we freed as non-dirty), putting that into two separate lists. +* Loop over both lists at the end checking that the dirty list +* is indeed all dirty pages and vice versa. Free it all again, +* keeping the dirty/clear status. +*/ + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(, 0, mm_size, + 5 * ps, ps, , + DRM_BUDDY_TOPDOWN_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", 5 * ps); + drm_buddy_free_list(, , DRM_BUDDY_CLEARED); + + n_pages = 10; + do { + unsigned long flags; + struct list_head *list; + int slot = i % 2; + + if (slot == 0) { + list = + flags = 0; + } else { + list = + flags = DRM_BUDDY_CLEAR_ALLOCATION; + } + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(, 0, mm_size, + ps, ps, list, + flags), + "buddy_alloc hit an error size=%lu\n", ps); + } while (++i < n_pages); + + list_for_each_entry(block, , link) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), true); + + list_for_each_entry(block, , link) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); + + drm_buddy_free_list(, , DRM_BUDDY_CLEARED); + + /* +* Trying to go over the clear limit for some allocation. +* The allocation should never fail with reasonable page-size. +*/ + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(, 0, mm_size, + 10 * ps, ps, , + DRM_BUDDY_CLEAR_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", 10 * ps); + + drm_buddy_free_list(, , DRM_BUDDY_CLEARED); + drm_buddy_free_list(, , 0); + drm_buddy_fini(); + + KUNIT_EXPECT_FALSE(test, drm_buddy_init(, mm_size, ps)); + + /* +* Create a new mm. Intentionally fragment the address space by creating +* two alternating lists. Free both lists, one as dirty the other as clean. +* Try to allocate double the previous size with matching min_page_size. The +* allocation should never fail as it calls the force_merge. Also check that +* the page is always dirty after force_merge. Free the page as dirty, then +* repeat the whole thing, increment the order until we hit the max_order. +*/ + + i = 0; + n_pages = mm_size / ps; + do { + struct list_head *list; + int slot = i % 2; + + if (slot == 0) + list = + else + list = + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(, 0, mm_size, +
[PATCH v10 1/3] drm/buddy: Implement tracking clear page feature
- Add tracking clear page feature. - Driver should enable the DRM_BUDDY_CLEARED flag if it successfully clears the blocks in the free path. On the otherhand, DRM buddy marks each block as cleared. - Track the available cleared pages size - If driver requests cleared memory we prefer cleared memory but fallback to uncleared if we can't find the cleared blocks. when driver requests uncleared memory we try to use uncleared but fallback to cleared memory if necessary. - When a block gets freed we clear it and mark the freed block as cleared, when there are buddies which are cleared as well we can merge them. Otherwise, we prefer to keep the blocks as separated. - Add a function to support defragmentation. v1: - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as cleared. Else, reset the clear flag for each block in the list(Christian) - For merging the 2 cleared blocks compare as below, drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian) - Defragment the memory beginning from min_order till the required memory space is available. v2: (Matthew) - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks operation within drm buddy. - Write a macro block_incompatible() to allocate the required blocks. - Update the xe driver for the drm_buddy_free_list change in arguments. - add a warning if the two blocks are incompatible on defragmentation - call full defragmentation in the fini() function - place a condition to test if min_order is equal to 0 - replace the list with safe_reverse() variant as we might remove the block from the list. v3: - fix Gitlab user reported lockup issue. - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew) - modify to pass the root order instead max_order in fini() function(Matthew) - change bool 1 to true(Matthew) - add check if min_block_size is power of 2(Matthew) - modify the min_block_size datatype to u64(Matthew) v4: - rename the function drm_buddy_defrag with __force_merge. - Include __force_merge directly in drm buddy file and remove the defrag use in amdgpu driver. - Remove list_empty() check(Matthew) - Remove unnecessary space, headers and placement of new variables(Matthew) - Add a unit test case(Matthew) v5: - remove force merge support to actual range allocation and not to bail out when contains && split(Matthew) - add range support to force merge function. Signed-off-by: Arunpravin Paneer Selvam Signed-off-by: Matthew Auld Suggested-by: Christian König Suggested-by: Matthew Auld --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/drm_buddy.c | 430 ++ drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 6 +- drivers/gpu/drm/tests/drm_buddy_test.c| 28 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 4 +- include/drm/drm_buddy.h | 16 +- 6 files changed, 368 insertions(+), 122 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..c0c851409241 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); error_fini: ttm_resource_fini(man, >base); @@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, amdgpu_vram_mgr_do_reserve(man); - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); atomic64_sub(vis_usage, >vis_usage); @@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) kfree(rsv); list_for_each_entry_safe(rsv, temp, >reserved_pages, blocks) { - drm_buddy_free_list(>mm, >allocated); + drm_buddy_free_list(>mm, >allocated, 0); kfree(rsv); } if (!adev->gmc.is_app_apu) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 5ebdd6f8f36e..83dbe252f727 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -38,8 +38,8 @@ static void drm_block_free(struct drm_buddy *mm, kmem_cache_free(slab_blocks, block); } -static void list_insert_sorted(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void list_insert(struct drm_buddy *mm, + struct drm_buddy_block *block) { struct drm_buddy_block *node; struct list_head *head; @@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm, __list_add(>link, node->link.prev, >link); } +static void
[PATCH v10 2/3] drm/amdgpu: Enable clear page functionality
Add clear page support in vram memory region. v1(Christian): - Dont handle clear page as TTM flag since when moving the BO back in from GTT again we don't need that. - Make a specialized version of amdgpu_fill_buffer() which only clears the VRAM areas which are not already cleared - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in amdgpu_object.c v2: - Modify the function name amdgpu_ttm_* (Alex) - Drop the delayed parameter (Christian) - handle amdgpu_res_cleared() just above the size calculation (Christian) - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers in the free path to properly wait for fences etc.. (Christian) v3(Christian): - Remove buffer clear code in VRAM manager instead change the AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set the DRM_BUDDY_CLEARED flag. - Remove ! from amdgpu_res_cleared() check. v4(Christian): - vres flag setting move to vram manager file - use dma_fence_get_stub in amdgpu_ttm_clear_buffer function - make fence a mandatory parameter and drop the if and the get/put dance Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König Acked-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c| 9 +-- .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h| 25 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 70 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 5 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h | 10 +++ 6 files changed, 116 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..e011a326fe86 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -39,6 +39,7 @@ #include "amdgpu.h" #include "amdgpu_trace.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_vram_mgr.h" /** * DOC: amdgpu_object @@ -601,8 +602,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (!amdgpu_bo_support_uswc(bo->flags)) bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; - if (adev->ras_enabled) - bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; + bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; bo->tbo.bdev = >mman.bdev; if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA | @@ -634,7 +634,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev, bo->tbo.resource->mem_type == TTM_PL_VRAM) { struct dma_fence *fence; - r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, , true); + r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, ); if (unlikely(r)) goto fail_unreserve; @@ -1365,8 +1365,9 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo) if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv))) return; - r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, , true); + r = amdgpu_fill_buffer(abo, 0, bo->base.resv, , true); if (!WARN_ON(r)) { + amdgpu_vram_mgr_set_cleared(bo->resource); amdgpu_bo_fence(abo, fence, false); dma_fence_put(fence); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h index 381101d2bf05..50fcd86e1033 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h @@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size) } } +/** + * amdgpu_res_cleared - check if blocks are cleared + * + * @cur: the cursor to extract the block + * + * Check if the @cur block is cleared + */ +static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur) +{ + struct drm_buddy_block *block; + + switch (cur->mem_type) { + case TTM_PL_VRAM: + block = cur->node; + + if (!amdgpu_vram_mgr_is_cleared(block)) + return false; + break; + default: + return false; + } + + return true; +} + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index fc418e670fda..f0b42b567357 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -378,11 +378,12 @@ static int amdgpu_move_blit(struct ttm_buffer_object *bo, (abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) { struct dma_fence *wipe_fence = NULL; - r = amdgpu_fill_buffer(abo, AMDGPU_POISON, NULL, _fence, - false); + r =
Re: [PATCH v9 1/3] drm/buddy: Implement tracking clear page feature
Hi Matthew, On 3/28/2024 10:18 PM, Matthew Auld wrote: On 28/03/2024 16:07, Paneer Selvam, Arunpravin wrote: Hi Matthew, On 3/26/2024 11:39 PM, Matthew Auld wrote: On 18/03/2024 21:40, Arunpravin Paneer Selvam wrote: - Add tracking clear page feature. - Driver should enable the DRM_BUDDY_CLEARED flag if it successfully clears the blocks in the free path. On the otherhand, DRM buddy marks each block as cleared. - Track the available cleared pages size - If driver requests cleared memory we prefer cleared memory but fallback to uncleared if we can't find the cleared blocks. when driver requests uncleared memory we try to use uncleared but fallback to cleared memory if necessary. - When a block gets freed we clear it and mark the freed block as cleared, when there are buddies which are cleared as well we can merge them. Otherwise, we prefer to keep the blocks as separated. - Add a function to support defragmentation. v1: - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as cleared. Else, reset the clear flag for each block in the list(Christian) - For merging the 2 cleared blocks compare as below, drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian) - Defragment the memory beginning from min_order till the required memory space is available. v2: (Matthew) - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks operation within drm buddy. - Write a macro block_incompatible() to allocate the required blocks. - Update the xe driver for the drm_buddy_free_list change in arguments. - add a warning if the two blocks are incompatible on defragmentation - call full defragmentation in the fini() function - place a condition to test if min_order is equal to 0 - replace the list with safe_reverse() variant as we might remove the block from the list. v3: - fix Gitlab user reported lockup issue. - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew) - modify to pass the root order instead max_order in fini() function(Matthew) - change bool 1 to true(Matthew) - add check if min_block_size is power of 2(Matthew) - modify the min_block_size datatype to u64(Matthew) v4: - rename the function drm_buddy_defrag with __force_merge. - Include __force_merge directly in drm buddy file and remove the defrag use in amdgpu driver. - Remove list_empty() check(Matthew) - Remove unnecessary space, headers and placement of new variables(Matthew) - Add a unit test case(Matthew) Signed-off-by: Arunpravin Paneer Selvam Signed-off-by: Matthew Auld Suggested-by: Christian König Suggested-by: Matthew Auld --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/drm_buddy.c | 427 ++ drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 6 +- drivers/gpu/drm/tests/drm_buddy_test.c | 18 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 4 +- include/drm/drm_buddy.h | 16 +- 6 files changed, 360 insertions(+), 117 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..c0c851409241 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); error_fini: ttm_resource_fini(man, >base); @@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, amdgpu_vram_mgr_do_reserve(man); - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); atomic64_sub(vis_usage, >vis_usage); @@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) kfree(rsv); list_for_each_entry_safe(rsv, temp, >reserved_pages, blocks) { - drm_buddy_free_list(>mm, >allocated); + drm_buddy_free_list(>mm, >allocated, 0); kfree(rsv); } if (!adev->gmc.is_app_apu) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index c4222b886db7..625a30a6b855 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -38,8 +38,8 @@ static void drm_block_free(struct drm_buddy *mm, kmem_cache_free(slab_blocks, block); } -static void list_insert_sorted(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void list_insert(struct drm_buddy *mm, + struct drm_buddy_block *block) { struct drm_buddy_block *node; struct list_head *head; @@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm, __list_add(>link, node->l
Re: [PATCH v9 1/3] drm/buddy: Implement tracking clear page feature
Hi Matthew, On 3/26/2024 11:39 PM, Matthew Auld wrote: On 18/03/2024 21:40, Arunpravin Paneer Selvam wrote: - Add tracking clear page feature. - Driver should enable the DRM_BUDDY_CLEARED flag if it successfully clears the blocks in the free path. On the otherhand, DRM buddy marks each block as cleared. - Track the available cleared pages size - If driver requests cleared memory we prefer cleared memory but fallback to uncleared if we can't find the cleared blocks. when driver requests uncleared memory we try to use uncleared but fallback to cleared memory if necessary. - When a block gets freed we clear it and mark the freed block as cleared, when there are buddies which are cleared as well we can merge them. Otherwise, we prefer to keep the blocks as separated. - Add a function to support defragmentation. v1: - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as cleared. Else, reset the clear flag for each block in the list(Christian) - For merging the 2 cleared blocks compare as below, drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian) - Defragment the memory beginning from min_order till the required memory space is available. v2: (Matthew) - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks operation within drm buddy. - Write a macro block_incompatible() to allocate the required blocks. - Update the xe driver for the drm_buddy_free_list change in arguments. - add a warning if the two blocks are incompatible on defragmentation - call full defragmentation in the fini() function - place a condition to test if min_order is equal to 0 - replace the list with safe_reverse() variant as we might remove the block from the list. v3: - fix Gitlab user reported lockup issue. - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew) - modify to pass the root order instead max_order in fini() function(Matthew) - change bool 1 to true(Matthew) - add check if min_block_size is power of 2(Matthew) - modify the min_block_size datatype to u64(Matthew) v4: - rename the function drm_buddy_defrag with __force_merge. - Include __force_merge directly in drm buddy file and remove the defrag use in amdgpu driver. - Remove list_empty() check(Matthew) - Remove unnecessary space, headers and placement of new variables(Matthew) - Add a unit test case(Matthew) Signed-off-by: Arunpravin Paneer Selvam Signed-off-by: Matthew Auld Suggested-by: Christian König Suggested-by: Matthew Auld --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/drm_buddy.c | 427 ++ drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 6 +- drivers/gpu/drm/tests/drm_buddy_test.c | 18 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 4 +- include/drm/drm_buddy.h | 16 +- 6 files changed, 360 insertions(+), 117 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..c0c851409241 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); error_fini: ttm_resource_fini(man, >base); @@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, amdgpu_vram_mgr_do_reserve(man); - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); atomic64_sub(vis_usage, >vis_usage); @@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) kfree(rsv); list_for_each_entry_safe(rsv, temp, >reserved_pages, blocks) { - drm_buddy_free_list(>mm, >allocated); + drm_buddy_free_list(>mm, >allocated, 0); kfree(rsv); } if (!adev->gmc.is_app_apu) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index c4222b886db7..625a30a6b855 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -38,8 +38,8 @@ static void drm_block_free(struct drm_buddy *mm, kmem_cache_free(slab_blocks, block); } -static void list_insert_sorted(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void list_insert(struct drm_buddy *mm, + struct drm_buddy_block *block) { struct drm_buddy_block *node; struct list_head *head; @@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm, __list_add(>link, node->link.prev, >link); } +static void clear_reset(struct drm_buddy_block *block) +{ + block->header &= ~DRM
Re: [PATCH v9 2/3] drm/amdgpu: Enable clear page functionality
Hi Alex, On 3/26/2024 8:23 PM, Alex Deucher wrote: On Tue, Mar 26, 2024 at 10:01 AM Alex Deucher wrote: On Tue, Mar 26, 2024 at 9:59 AM Paneer Selvam, Arunpravin wrote: Hi Alex, On 3/26/2024 7:08 PM, Alex Deucher wrote: On Mon, Mar 18, 2024 at 5:47 PM Arunpravin Paneer Selvam wrote: Add clear page support in vram memory region. v1(Christian): - Dont handle clear page as TTM flag since when moving the BO back in from GTT again we don't need that. - Make a specialized version of amdgpu_fill_buffer() which only clears the VRAM areas which are not already cleared - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in amdgpu_object.c v2: - Modify the function name amdgpu_ttm_* (Alex) - Drop the delayed parameter (Christian) - handle amdgpu_res_cleared() just above the size calculation (Christian) - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers in the free path to properly wait for fences etc.. (Christian) v3(Christian): - Remove buffer clear code in VRAM manager instead change the AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set the DRM_BUDDY_CLEARED flag. - Remove ! from amdgpu_res_cleared() check. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König Acked-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c| 22 --- .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h| 25 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 61 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 5 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h | 5 ++ 6 files changed, 111 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..c92d92b28a57 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -39,6 +39,7 @@ #include "amdgpu.h" #include "amdgpu_trace.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_vram_mgr.h" /** * DOC: amdgpu_object @@ -601,8 +602,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (!amdgpu_bo_support_uswc(bo->flags)) bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; - if (adev->ras_enabled) - bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; + bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; bo->tbo.bdev = >mman.bdev; if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA | @@ -632,15 +632,17 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED && bo->tbo.resource->mem_type == TTM_PL_VRAM) { - struct dma_fence *fence; + struct dma_fence *fence = NULL; - r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, , true); + r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, ); if (unlikely(r)) goto fail_unreserve; - dma_resv_add_fence(bo->tbo.base.resv, fence, - DMA_RESV_USAGE_KERNEL); - dma_fence_put(fence); + if (fence) { + dma_resv_add_fence(bo->tbo.base.resv, fence, + DMA_RESV_USAGE_KERNEL); + dma_fence_put(fence); + } } if (!bp->resv) amdgpu_bo_unreserve(bo); @@ -1365,8 +1367,12 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo) if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv))) return; - r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, , true); + r = amdgpu_fill_buffer(abo, 0, bo->base.resv, , true); if (!WARN_ON(r)) { + struct amdgpu_vram_mgr_resource *vres; + + vres = to_amdgpu_vram_mgr_resource(bo->resource); + vres->flags |= DRM_BUDDY_CLEARED; amdgpu_bo_fence(abo, fence, false); dma_fence_put(fence); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h index 381101d2bf05..50fcd86e1033 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h @@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size) } } +/** + * amdgpu_res_cleared - check if blocks are cleared + * + * @cur: the cursor to extract the block + * + * Check if the @cur block is cleared + */ +static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur) +{ + struct drm_buddy_block *block; + + switch (cur-
Re: [PATCH v9 2/3] drm/amdgpu: Enable clear page functionality
Hi Alex, On 3/26/2024 7:08 PM, Alex Deucher wrote: On Mon, Mar 18, 2024 at 5:47 PM Arunpravin Paneer Selvam wrote: Add clear page support in vram memory region. v1(Christian): - Dont handle clear page as TTM flag since when moving the BO back in from GTT again we don't need that. - Make a specialized version of amdgpu_fill_buffer() which only clears the VRAM areas which are not already cleared - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in amdgpu_object.c v2: - Modify the function name amdgpu_ttm_* (Alex) - Drop the delayed parameter (Christian) - handle amdgpu_res_cleared() just above the size calculation (Christian) - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers in the free path to properly wait for fences etc.. (Christian) v3(Christian): - Remove buffer clear code in VRAM manager instead change the AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set the DRM_BUDDY_CLEARED flag. - Remove ! from amdgpu_res_cleared() check. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König Acked-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c| 22 --- .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h| 25 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 61 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 5 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h | 5 ++ 6 files changed, 111 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..c92d92b28a57 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -39,6 +39,7 @@ #include "amdgpu.h" #include "amdgpu_trace.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_vram_mgr.h" /** * DOC: amdgpu_object @@ -601,8 +602,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (!amdgpu_bo_support_uswc(bo->flags)) bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; - if (adev->ras_enabled) - bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; + bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; bo->tbo.bdev = >mman.bdev; if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA | @@ -632,15 +632,17 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED && bo->tbo.resource->mem_type == TTM_PL_VRAM) { - struct dma_fence *fence; + struct dma_fence *fence = NULL; - r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, , true); + r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, ); if (unlikely(r)) goto fail_unreserve; - dma_resv_add_fence(bo->tbo.base.resv, fence, - DMA_RESV_USAGE_KERNEL); - dma_fence_put(fence); + if (fence) { + dma_resv_add_fence(bo->tbo.base.resv, fence, + DMA_RESV_USAGE_KERNEL); + dma_fence_put(fence); + } } if (!bp->resv) amdgpu_bo_unreserve(bo); @@ -1365,8 +1367,12 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo) if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv))) return; - r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, , true); + r = amdgpu_fill_buffer(abo, 0, bo->base.resv, , true); if (!WARN_ON(r)) { + struct amdgpu_vram_mgr_resource *vres; + + vres = to_amdgpu_vram_mgr_resource(bo->resource); + vres->flags |= DRM_BUDDY_CLEARED; amdgpu_bo_fence(abo, fence, false); dma_fence_put(fence); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h index 381101d2bf05..50fcd86e1033 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h @@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size) } } +/** + * amdgpu_res_cleared - check if blocks are cleared + * + * @cur: the cursor to extract the block + * + * Check if the @cur block is cleared + */ +static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur) +{ + struct drm_buddy_block *block; + + switch (cur->mem_type) { + case TTM_PL_VRAM: + block = cur->node; + + if (!amdgpu_vram_mgr_is_cleared(block)) + return false; + break; + default: +
Re: [PATCH v9 1/3] drm/buddy: Implement tracking clear page feature
Hi Matthew, ping? Thanks, Arun. On 3/19/2024 3:10 AM, Arunpravin Paneer Selvam wrote: - Add tracking clear page feature. - Driver should enable the DRM_BUDDY_CLEARED flag if it successfully clears the blocks in the free path. On the otherhand, DRM buddy marks each block as cleared. - Track the available cleared pages size - If driver requests cleared memory we prefer cleared memory but fallback to uncleared if we can't find the cleared blocks. when driver requests uncleared memory we try to use uncleared but fallback to cleared memory if necessary. - When a block gets freed we clear it and mark the freed block as cleared, when there are buddies which are cleared as well we can merge them. Otherwise, we prefer to keep the blocks as separated. - Add a function to support defragmentation. v1: - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as cleared. Else, reset the clear flag for each block in the list(Christian) - For merging the 2 cleared blocks compare as below, drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian) - Defragment the memory beginning from min_order till the required memory space is available. v2: (Matthew) - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks operation within drm buddy. - Write a macro block_incompatible() to allocate the required blocks. - Update the xe driver for the drm_buddy_free_list change in arguments. - add a warning if the two blocks are incompatible on defragmentation - call full defragmentation in the fini() function - place a condition to test if min_order is equal to 0 - replace the list with safe_reverse() variant as we might remove the block from the list. v3: - fix Gitlab user reported lockup issue. - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew) - modify to pass the root order instead max_order in fini() function(Matthew) - change bool 1 to true(Matthew) - add check if min_block_size is power of 2(Matthew) - modify the min_block_size datatype to u64(Matthew) v4: - rename the function drm_buddy_defrag with __force_merge. - Include __force_merge directly in drm buddy file and remove the defrag use in amdgpu driver. - Remove list_empty() check(Matthew) - Remove unnecessary space, headers and placement of new variables(Matthew) - Add a unit test case(Matthew) Signed-off-by: Arunpravin Paneer Selvam Signed-off-by: Matthew Auld Suggested-by: Christian König Suggested-by: Matthew Auld --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/drm_buddy.c | 427 ++ drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 6 +- drivers/gpu/drm/tests/drm_buddy_test.c| 18 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 4 +- include/drm/drm_buddy.h | 16 +- 6 files changed, 360 insertions(+), 117 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..c0c851409241 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); error_fini: ttm_resource_fini(man, >base); @@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, amdgpu_vram_mgr_do_reserve(man); - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); atomic64_sub(vis_usage, >vis_usage); @@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) kfree(rsv); list_for_each_entry_safe(rsv, temp, >reserved_pages, blocks) { - drm_buddy_free_list(>mm, >allocated); + drm_buddy_free_list(>mm, >allocated, 0); kfree(rsv); } if (!adev->gmc.is_app_apu) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index c4222b886db7..625a30a6b855 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -38,8 +38,8 @@ static void drm_block_free(struct drm_buddy *mm, kmem_cache_free(slab_blocks, block); } -static void list_insert_sorted(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void list_insert(struct drm_buddy *mm, + struct drm_buddy_block *block) { struct drm_buddy_block *node; struct list_head *head; @@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm, __list_add(>link, node->link.prev, >link); } +static void clear_reset(struct drm_buddy_block *block)
Re: [PATCH v9 2/3] drm/amdgpu: Enable clear page functionality
Hi Christian, On 3/19/2024 7:17 PM, Christian König wrote: Am 19.03.24 um 12:41 schrieb Paneer Selvam, Arunpravin: Hi Christian, On 3/19/2024 3:58 PM, Christian König wrote: Am 18.03.24 um 22:40 schrieb Arunpravin Paneer Selvam: Add clear page support in vram memory region. v1(Christian): - Dont handle clear page as TTM flag since when moving the BO back in from GTT again we don't need that. - Make a specialized version of amdgpu_fill_buffer() which only clears the VRAM areas which are not already cleared - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in amdgpu_object.c v2: - Modify the function name amdgpu_ttm_* (Alex) - Drop the delayed parameter (Christian) - handle amdgpu_res_cleared() just above the size calculation (Christian) - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers in the free path to properly wait for fences etc.. (Christian) v3(Christian): - Remove buffer clear code in VRAM manager instead change the AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set the DRM_BUDDY_CLEARED flag. - Remove ! from amdgpu_res_cleared() check. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König Acked-by: Felix Kuehling Just a few nit picks below, but in general already looks good to me. --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 22 --- .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h | 25 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 61 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 5 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h | 5 ++ 6 files changed, 111 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..c92d92b28a57 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -39,6 +39,7 @@ #include "amdgpu.h" #include "amdgpu_trace.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_vram_mgr.h" /** * DOC: amdgpu_object @@ -601,8 +602,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (!amdgpu_bo_support_uswc(bo->flags)) bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; - if (adev->ras_enabled) - bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; + bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; bo->tbo.bdev = >mman.bdev; if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA | @@ -632,15 +632,17 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED && bo->tbo.resource->mem_type == TTM_PL_VRAM) { - struct dma_fence *fence; + struct dma_fence *fence = NULL; - r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, , true); + r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, ); if (unlikely(r)) goto fail_unreserve; - dma_resv_add_fence(bo->tbo.base.resv, fence, - DMA_RESV_USAGE_KERNEL); - dma_fence_put(fence); + if (fence) { + dma_resv_add_fence(bo->tbo.base.resv, fence, + DMA_RESV_USAGE_KERNEL); + dma_fence_put(fence); + } } if (!bp->resv) amdgpu_bo_unreserve(bo); @@ -1365,8 +1367,12 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo) if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv))) return; - r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, , true); + r = amdgpu_fill_buffer(abo, 0, bo->base.resv, , true); if (!WARN_ON(r)) { + struct amdgpu_vram_mgr_resource *vres; + + vres = to_amdgpu_vram_mgr_resource(bo->resource); + vres->flags |= DRM_BUDDY_CLEARED; Those lines should probably be in the VRAM manager. This flag is set based on the amdgpu_fill_buffer() function return value, can we move the amdgpu_fill_buffer() function call and DRM_BUDDY_CLEARED flag set lines to amdgpu_vram_mgr_del() in VRAM manager and does it require to wipe the VRAM buffers here as well without setting the DRM_BUDDY_CLEARED. No that won't work. The call to amdgpu_fill_buffer() *must* be here because that here is called before the resource is actually freed. Only setting the vres->flags should probably be moved into amdgpu_vram_mgr.c. So instead of calling that manually here just make a function for it to keep the code related to the buddy allocator in one place. I got it, Thanks. Regards, Arun. Regards, Christian. amdgpu_bo_fence(abo, fence, false); dma_fence_put(fence); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h index 381101d2bf05.
Re: [PATCH v9 2/3] drm/amdgpu: Enable clear page functionality
Hi Christian, On 3/19/2024 3:58 PM, Christian König wrote: Am 18.03.24 um 22:40 schrieb Arunpravin Paneer Selvam: Add clear page support in vram memory region. v1(Christian): - Dont handle clear page as TTM flag since when moving the BO back in from GTT again we don't need that. - Make a specialized version of amdgpu_fill_buffer() which only clears the VRAM areas which are not already cleared - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in amdgpu_object.c v2: - Modify the function name amdgpu_ttm_* (Alex) - Drop the delayed parameter (Christian) - handle amdgpu_res_cleared() just above the size calculation (Christian) - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers in the free path to properly wait for fences etc.. (Christian) v3(Christian): - Remove buffer clear code in VRAM manager instead change the AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set the DRM_BUDDY_CLEARED flag. - Remove ! from amdgpu_res_cleared() check. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König Acked-by: Felix Kuehling Just a few nit picks below, but in general already looks good to me. --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 22 --- .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h | 25 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 61 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 5 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h | 5 ++ 6 files changed, 111 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..c92d92b28a57 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -39,6 +39,7 @@ #include "amdgpu.h" #include "amdgpu_trace.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_vram_mgr.h" /** * DOC: amdgpu_object @@ -601,8 +602,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (!amdgpu_bo_support_uswc(bo->flags)) bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; - if (adev->ras_enabled) - bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; + bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; bo->tbo.bdev = >mman.bdev; if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA | @@ -632,15 +632,17 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED && bo->tbo.resource->mem_type == TTM_PL_VRAM) { - struct dma_fence *fence; + struct dma_fence *fence = NULL; - r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, , true); + r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, ); if (unlikely(r)) goto fail_unreserve; - dma_resv_add_fence(bo->tbo.base.resv, fence, - DMA_RESV_USAGE_KERNEL); - dma_fence_put(fence); + if (fence) { + dma_resv_add_fence(bo->tbo.base.resv, fence, + DMA_RESV_USAGE_KERNEL); + dma_fence_put(fence); + } } if (!bp->resv) amdgpu_bo_unreserve(bo); @@ -1365,8 +1367,12 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo) if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv))) return; - r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, , true); + r = amdgpu_fill_buffer(abo, 0, bo->base.resv, , true); if (!WARN_ON(r)) { + struct amdgpu_vram_mgr_resource *vres; + + vres = to_amdgpu_vram_mgr_resource(bo->resource); + vres->flags |= DRM_BUDDY_CLEARED; Those lines should probably be in the VRAM manager. This flag is set based on the amdgpu_fill_buffer() function return value, can we move the amdgpu_fill_buffer() function call and DRM_BUDDY_CLEARED flag set lines to amdgpu_vram_mgr_del() in VRAM manager and does it require to wipe the VRAM buffers here as well without setting the DRM_BUDDY_CLEARED. amdgpu_bo_fence(abo, fence, false); dma_fence_put(fence); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h index 381101d2bf05..50fcd86e1033 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h @@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size) } } +/** + * amdgpu_res_cleared - check if blocks are cleared + * + * @cur: the cursor to extract the block + * + * Check if the @cur block is cleared + */ +static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur) +{ + struct drm_buddy_block *block; + + switch (cur->mem_type) { +
[PATCH v9 3/3] drm/tests: Add a test case for drm buddy clear allocation
Add a new test case for the drm buddy clear and dirty allocation. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Matthew Auld --- drivers/gpu/drm/tests/drm_buddy_test.c | 127 + 1 file changed, 127 insertions(+) diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c b/drivers/gpu/drm/tests/drm_buddy_test.c index 454ad9952f56..d355a6e61893 100644 --- a/drivers/gpu/drm/tests/drm_buddy_test.c +++ b/drivers/gpu/drm/tests/drm_buddy_test.c @@ -19,6 +19,132 @@ static inline u64 get_size(int order, u64 chunk_size) return (1 << order) * chunk_size; } +static void drm_test_buddy_alloc_clear(struct kunit *test) +{ + unsigned long n_pages, total, i = 0; + const unsigned long ps = SZ_4K; + struct drm_buddy_block *block; + const int max_order = 12; + LIST_HEAD(allocated); + struct drm_buddy mm; + unsigned int order; + u64 mm_size, size; + LIST_HEAD(dirty); + LIST_HEAD(clean); + + mm_size = PAGE_SIZE << max_order; + KUNIT_EXPECT_FALSE(test, drm_buddy_init(, mm_size, ps)); + + KUNIT_EXPECT_EQ(test, mm.max_order, max_order); + + /** +* Idea is to allocate and free some random portion of the address space, +* returning those pages as non-dirty and randomly alternate between +* requesting dirty and non-dirty pages (not going over the limit +* we freed as non-dirty), putting that into two separate lists. +* Loop over both lists at the end checking that the dirty list +* is indeed all dirty pages and vice versa. Free it all again, +* keeping the dirty/clear status. +*/ + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(, 0, mm_size, + 5 * ps, ps, , + DRM_BUDDY_TOPDOWN_ALLOCATION), + "buddy_alloc hit an error size=%u\n", 5 * ps); + drm_buddy_free_list(, , DRM_BUDDY_CLEARED); + + n_pages = 10; + do { + unsigned long flags; + struct list_head *list; + int slot = i % 2; + + if (slot == 0) { + list = + flags = 0; + } else if (slot == 1) { + list = + flags = DRM_BUDDY_CLEAR_ALLOCATION; + } + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(, 0, mm_size, + ps, ps, list, + flags), + "buddy_alloc hit an error size=%u\n", ps); + } while (++i < n_pages); + + list_for_each_entry(block, , link) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), true); + + list_for_each_entry(block, , link) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); + + drm_buddy_free_list(, , DRM_BUDDY_CLEARED); + + /** +* Trying to go over the clear limit for some allocation. +* The allocation should never fail with reasonable page-size. +*/ + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(, 0, mm_size, + 10 * ps, ps, , + DRM_BUDDY_CLEAR_ALLOCATION), + "buddy_alloc hit an error size=%u\n", 10 * ps); + + drm_buddy_free_list(, , DRM_BUDDY_CLEARED); + drm_buddy_free_list(, , 0); + drm_buddy_fini(); + + KUNIT_EXPECT_FALSE(test, drm_buddy_init(, mm_size, ps)); + + /** +* Create a new mm. Intentionally fragment the address space by creating +* two alternating lists. Free both lists, one as dirty the other as clean. +* Try to allocate double the previous size with matching min_page_size. The +* allocation should never fail as it calls the force_merge. Also check that +* the page is always dirty after force_merge. Free the page as dirty, then +* repeat the whole thing, increment the order until we hit the max_order. +*/ + + order = 1; + do { + size = PAGE_SIZE << order; + i = 0; + n_pages = mm_size / ps; + do { + struct list_head *list; + int slot = i % 2; + + if (slot == 0) + list = + else if (slot == 1) + list = + + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(, 0, mm_size, +
[PATCH v9 2/3] drm/amdgpu: Enable clear page functionality
Add clear page support in vram memory region. v1(Christian): - Dont handle clear page as TTM flag since when moving the BO back in from GTT again we don't need that. - Make a specialized version of amdgpu_fill_buffer() which only clears the VRAM areas which are not already cleared - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in amdgpu_object.c v2: - Modify the function name amdgpu_ttm_* (Alex) - Drop the delayed parameter (Christian) - handle amdgpu_res_cleared() just above the size calculation (Christian) - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers in the free path to properly wait for fences etc.. (Christian) v3(Christian): - Remove buffer clear code in VRAM manager instead change the AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set the DRM_BUDDY_CLEARED flag. - Remove ! from amdgpu_res_cleared() check. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König Acked-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c| 22 --- .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h| 25 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 61 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 5 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h | 5 ++ 6 files changed, 111 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..c92d92b28a57 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -39,6 +39,7 @@ #include "amdgpu.h" #include "amdgpu_trace.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_vram_mgr.h" /** * DOC: amdgpu_object @@ -601,8 +602,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (!amdgpu_bo_support_uswc(bo->flags)) bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; - if (adev->ras_enabled) - bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; + bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; bo->tbo.bdev = >mman.bdev; if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA | @@ -632,15 +632,17 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED && bo->tbo.resource->mem_type == TTM_PL_VRAM) { - struct dma_fence *fence; + struct dma_fence *fence = NULL; - r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, , true); + r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, ); if (unlikely(r)) goto fail_unreserve; - dma_resv_add_fence(bo->tbo.base.resv, fence, - DMA_RESV_USAGE_KERNEL); - dma_fence_put(fence); + if (fence) { + dma_resv_add_fence(bo->tbo.base.resv, fence, + DMA_RESV_USAGE_KERNEL); + dma_fence_put(fence); + } } if (!bp->resv) amdgpu_bo_unreserve(bo); @@ -1365,8 +1367,12 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo) if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv))) return; - r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, , true); + r = amdgpu_fill_buffer(abo, 0, bo->base.resv, , true); if (!WARN_ON(r)) { + struct amdgpu_vram_mgr_resource *vres; + + vres = to_amdgpu_vram_mgr_resource(bo->resource); + vres->flags |= DRM_BUDDY_CLEARED; amdgpu_bo_fence(abo, fence, false); dma_fence_put(fence); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h index 381101d2bf05..50fcd86e1033 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h @@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size) } } +/** + * amdgpu_res_cleared - check if blocks are cleared + * + * @cur: the cursor to extract the block + * + * Check if the @cur block is cleared + */ +static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur) +{ + struct drm_buddy_block *block; + + switch (cur->mem_type) { + case TTM_PL_VRAM: + block = cur->node; + + if (!amdgpu_vram_mgr_is_cleared(block)) + return false; + break; + default: + return false; + } + + return true; +} + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/am
[PATCH v9 1/3] drm/buddy: Implement tracking clear page feature
- Add tracking clear page feature. - Driver should enable the DRM_BUDDY_CLEARED flag if it successfully clears the blocks in the free path. On the otherhand, DRM buddy marks each block as cleared. - Track the available cleared pages size - If driver requests cleared memory we prefer cleared memory but fallback to uncleared if we can't find the cleared blocks. when driver requests uncleared memory we try to use uncleared but fallback to cleared memory if necessary. - When a block gets freed we clear it and mark the freed block as cleared, when there are buddies which are cleared as well we can merge them. Otherwise, we prefer to keep the blocks as separated. - Add a function to support defragmentation. v1: - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as cleared. Else, reset the clear flag for each block in the list(Christian) - For merging the 2 cleared blocks compare as below, drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian) - Defragment the memory beginning from min_order till the required memory space is available. v2: (Matthew) - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks operation within drm buddy. - Write a macro block_incompatible() to allocate the required blocks. - Update the xe driver for the drm_buddy_free_list change in arguments. - add a warning if the two blocks are incompatible on defragmentation - call full defragmentation in the fini() function - place a condition to test if min_order is equal to 0 - replace the list with safe_reverse() variant as we might remove the block from the list. v3: - fix Gitlab user reported lockup issue. - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew) - modify to pass the root order instead max_order in fini() function(Matthew) - change bool 1 to true(Matthew) - add check if min_block_size is power of 2(Matthew) - modify the min_block_size datatype to u64(Matthew) v4: - rename the function drm_buddy_defrag with __force_merge. - Include __force_merge directly in drm buddy file and remove the defrag use in amdgpu driver. - Remove list_empty() check(Matthew) - Remove unnecessary space, headers and placement of new variables(Matthew) - Add a unit test case(Matthew) Signed-off-by: Arunpravin Paneer Selvam Signed-off-by: Matthew Auld Suggested-by: Christian König Suggested-by: Matthew Auld --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/drm_buddy.c | 427 ++ drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 6 +- drivers/gpu/drm/tests/drm_buddy_test.c| 18 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 4 +- include/drm/drm_buddy.h | 16 +- 6 files changed, 360 insertions(+), 117 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..c0c851409241 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); error_fini: ttm_resource_fini(man, >base); @@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, amdgpu_vram_mgr_do_reserve(man); - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); atomic64_sub(vis_usage, >vis_usage); @@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) kfree(rsv); list_for_each_entry_safe(rsv, temp, >reserved_pages, blocks) { - drm_buddy_free_list(>mm, >allocated); + drm_buddy_free_list(>mm, >allocated, 0); kfree(rsv); } if (!adev->gmc.is_app_apu) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index c4222b886db7..625a30a6b855 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -38,8 +38,8 @@ static void drm_block_free(struct drm_buddy *mm, kmem_cache_free(slab_blocks, block); } -static void list_insert_sorted(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void list_insert(struct drm_buddy *mm, + struct drm_buddy_block *block) { struct drm_buddy_block *node; struct list_head *head; @@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm, __list_add(>link, node->link.prev, >link); } +static void clear_reset(struct drm_buddy_block *block) +{ + block->header &= ~DRM_BUDDY_HEADER_CLEAR; +} + +static void mark_cleared(struct drm_buddy_block *block) +{ +
Re: [PATCH v8 1/3] drm/buddy: Implement tracking clear page feature
Hi Matthew, On 3/6/2024 11:19 PM, Matthew Auld wrote: On 04/03/2024 16:32, Arunpravin Paneer Selvam wrote: - Add tracking clear page feature. - Driver should enable the DRM_BUDDY_CLEARED flag if it successfully clears the blocks in the free path. On the otherhand, DRM buddy marks each block as cleared. - Track the available cleared pages size - If driver requests cleared memory we prefer cleared memory but fallback to uncleared if we can't find the cleared blocks. when driver requests uncleared memory we try to use uncleared but fallback to cleared memory if necessary. - When a block gets freed we clear it and mark the freed block as cleared, when there are buddies which are cleared as well we can merge them. Otherwise, we prefer to keep the blocks as separated. - Add a function to support defragmentation. v1: - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as cleared. Else, reset the clear flag for each block in the list(Christian) - For merging the 2 cleared blocks compare as below, drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian) - Defragment the memory beginning from min_order till the required memory space is available. v2: (Matthew) - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks operation within drm buddy. - Write a macro block_incompatible() to allocate the required blocks. - Update the xe driver for the drm_buddy_free_list change in arguments. - add a warning if the two blocks are incompatible on defragmentation - call full defragmentation in the fini() function - place a condition to test if min_order is equal to 0 - replace the list with safe_reverse() variant as we might remove the block from the list. v3: - fix Gitlab user reported lockup issue. - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew) - modify to pass the root order instead max_order in fini() function(Matthew) - change bool 1 to true(Matthew) - add check if min_block_size is power of 2(Matthew) - modify the min_block_size datatype to u64(Matthew) Signed-off-by: Arunpravin Paneer Selvam Signed-off-by: Matthew Auld Suggested-by: Christian König Suggested-by: Matthew Auld Is there a unit test for this? What about maybe something roughly like: - Pick small random mm_size which is not always power-of-two. - Allocate and free some random portion of the address space, returning those pages as non-dirty. Then do another cycle and randomly alternate between requesting dirty and non-dirty pages (not going over the limit you freed as non-dirty), putting that into two separate lists. Loop over both lists at the end checking that the dirty list is indeed all dirty pages and vice versa. Free it all again, keeping the dirty/clear status. - Also try to go over the clear limit for some allocation. The allocation should never fail with reasonable page-size. - Test the defrag/force_merge interface. Clean the mm or create new one. Intentionally fragment the address space, by creating two alternating lists. Free both lists, one as dirty the other as clean. Try to allocate double the previous size with matching min_page_size. Should fail. Call force_merge. Should now succeed. Also check that the page is always dirty after force_merge. Free the page as dirty, then repeat the whole thing, doubling the page-size until you hit max_order. - Make sure we also call fini() with some part of the address space left as non-dirty. Should not trigger any warnings. I think would be good to consider, but not a blocker or anything. Some comments below, otherwise I think looks good. Yes. It is good to have a unit test for this feature. I will send the patch. --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/drm_buddy.c | 294 +++--- drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 6 +- drivers/gpu/drm/tests/drm_buddy_test.c | 18 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 4 +- include/drm/drm_buddy.h | 22 +- 6 files changed, 290 insertions(+), 60 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..c0c851409241 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); error_fini: ttm_resource_fini(man, >base); @@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, amdgpu_vram_mgr_do_reserve(man); - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); a
Re: [PATCH v8 1/3] drm/buddy: Implement tracking clear page feature
Hi Matthew, Ping? Thanks, Arun. On 3/4/2024 10:02 PM, Arunpravin Paneer Selvam wrote: - Add tracking clear page feature. - Driver should enable the DRM_BUDDY_CLEARED flag if it successfully clears the blocks in the free path. On the otherhand, DRM buddy marks each block as cleared. - Track the available cleared pages size - If driver requests cleared memory we prefer cleared memory but fallback to uncleared if we can't find the cleared blocks. when driver requests uncleared memory we try to use uncleared but fallback to cleared memory if necessary. - When a block gets freed we clear it and mark the freed block as cleared, when there are buddies which are cleared as well we can merge them. Otherwise, we prefer to keep the blocks as separated. - Add a function to support defragmentation. v1: - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as cleared. Else, reset the clear flag for each block in the list(Christian) - For merging the 2 cleared blocks compare as below, drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian) - Defragment the memory beginning from min_order till the required memory space is available. v2: (Matthew) - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks operation within drm buddy. - Write a macro block_incompatible() to allocate the required blocks. - Update the xe driver for the drm_buddy_free_list change in arguments. - add a warning if the two blocks are incompatible on defragmentation - call full defragmentation in the fini() function - place a condition to test if min_order is equal to 0 - replace the list with safe_reverse() variant as we might remove the block from the list. v3: - fix Gitlab user reported lockup issue. - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew) - modify to pass the root order instead max_order in fini() function(Matthew) - change bool 1 to true(Matthew) - add check if min_block_size is power of 2(Matthew) - modify the min_block_size datatype to u64(Matthew) Signed-off-by: Arunpravin Paneer Selvam Signed-off-by: Matthew Auld Suggested-by: Christian König Suggested-by: Matthew Auld --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/drm_buddy.c | 294 +++--- drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 6 +- drivers/gpu/drm/tests/drm_buddy_test.c| 18 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 4 +- include/drm/drm_buddy.h | 22 +- 6 files changed, 290 insertions(+), 60 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..c0c851409241 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); error_fini: ttm_resource_fini(man, >base); @@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, amdgpu_vram_mgr_do_reserve(man); - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); atomic64_sub(vis_usage, >vis_usage); @@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) kfree(rsv); list_for_each_entry_safe(rsv, temp, >reserved_pages, blocks) { - drm_buddy_free_list(>mm, >allocated); + drm_buddy_free_list(>mm, >allocated, 0); kfree(rsv); } if (!adev->gmc.is_app_apu) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index c4222b886db7..40131ed9b0cd 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm, __list_add(>link, node->link.prev, >link); } +static void clear_reset(struct drm_buddy_block *block) +{ + block->header &= ~DRM_BUDDY_HEADER_CLEAR; +} + +static void mark_cleared(struct drm_buddy_block *block) +{ + block->header |= DRM_BUDDY_HEADER_CLEAR; +} + static void mark_allocated(struct drm_buddy_block *block) { block->header &= ~DRM_BUDDY_HEADER_STATE; @@ -186,11 +196,21 @@ EXPORT_SYMBOL(drm_buddy_init); */ void drm_buddy_fini(struct drm_buddy *mm) { + u64 root_size, size; + unsigned int order; int i; + size = mm->size; + for (i = 0; i < mm->n_roots; ++i) { + order = ilog2(size) - ilog2(mm->chunk_size); + root_size = mm->chunk_size << order; + drm_bu
Re: [PATCH v8 3/3] drm/buddy: Add user for defragmentation
Hi Christian, On 3/5/2024 5:41 PM, Christian König wrote: Am 05.03.24 um 12:14 schrieb Paneer Selvam, Arunpravin: On 3/5/2024 4:33 PM, Paneer Selvam, Arunpravin wrote: Hi Christian, On 3/4/2024 10:09 PM, Christian König wrote: Am 04.03.24 um 17:32 schrieb Arunpravin Paneer Selvam: Add amdgpu driver as user for the drm buddy defragmentation. Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 +++-- drivers/gpu/drm/drm_buddy.c | 1 + 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index e494f5bf136a..cff8a526c622 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -533,8 +533,21 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, min_block_size, >blocks, vres->flags); - if (unlikely(r)) - goto error_free_blocks; + if (unlikely(r)) { + if (r == -ENOSPC) { + drm_buddy_defrag(mm, min_block_size); + r = drm_buddy_alloc_blocks(mm, fpfn, + lpfn, + size, + min_block_size, + >blocks, + vres->flags); That doesn't looks like something we should do. We might fallback when contiguous memory is requested, but certainly not on normal allocation failure. yes, defrag here not useful for normal allocations. But worried about the bigger min_block_size normal allocations. In such cases, I think we should move this drm_buddy_defrag() call into buddy allocator file. For example if the required size is 1024KiB and if min_block_size is 256KiB, the allocator first tries to find the 1024KiB block, when there is no single 1024KiB block, the allocator goes one level below in freelist and tries to search for two 512KiB blocks and goes on. At one point of time if we have less space, we might go further levels below to search four 256KiB blocks to satisfy the request. Assuming if the allocator cannot find the first 256KiB block, that time I think we might need to merge the two 128KiB blocks through defragmentation function. And again for the second 256KiB block, we might need to call the defragmentation again to merge two 128KiB blocks or four 64KiB blocks to form minimum alignment size of 256KiB. This goes on for the third and fourth 256KiB blocks to complete the required size allocation of 1024KiB. Please let me know if my understanding is not correct. I don't think we should do that. We essentially have to support two different use cases: 1. Non contiguous allocation with 2MiB min_block_size for everything larger than 2MiB. Using a block size as large as possible is desirable, but not something we enforce. 2. Contiguous allocations for display, firmware etc.. Here we need to enforce a large block size and can live with the additional overhead caused by force merging. Thanks. I will make the changes accordingly. As you have suggested we can also rename this as force merge or some other names. Yeah, but just an suggestion. You are way deeper in the code and handling than I'm, so feel free to name it whatever you think fits best. Sure :) Thanks, Arun. Regards, Christian. Thanks, Arun. Thanks, Arun. Regards, Christian. + if (unlikely(r)) + goto error_free_blocks; + } else { + goto error_free_blocks; + } + } if (size > remaining_size) remaining_size = 0; diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 40131ed9b0cd..19440f8caec0 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -396,6 +396,7 @@ void drm_buddy_defrag(struct drm_buddy *mm, } } } +EXPORT_SYMBOL(drm_buddy_defrag); /** * drm_buddy_free_block - free a block
Re: [PATCH v8 3/3] drm/buddy: Add user for defragmentation
On 3/5/2024 4:33 PM, Paneer Selvam, Arunpravin wrote: Hi Christian, On 3/4/2024 10:09 PM, Christian König wrote: Am 04.03.24 um 17:32 schrieb Arunpravin Paneer Selvam: Add amdgpu driver as user for the drm buddy defragmentation. Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 +++-- drivers/gpu/drm/drm_buddy.c | 1 + 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index e494f5bf136a..cff8a526c622 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -533,8 +533,21 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, min_block_size, >blocks, vres->flags); - if (unlikely(r)) - goto error_free_blocks; + if (unlikely(r)) { + if (r == -ENOSPC) { + drm_buddy_defrag(mm, min_block_size); + r = drm_buddy_alloc_blocks(mm, fpfn, + lpfn, + size, + min_block_size, + >blocks, + vres->flags); That doesn't looks like something we should do. We might fallback when contiguous memory is requested, but certainly not on normal allocation failure. yes, defrag here not useful for normal allocations. But worried about the bigger min_block_size normal allocations. In such cases, I think we should move this drm_buddy_defrag() call into buddy allocator file. For example if the required size is 1024KiB and if min_block_size is 256KiB, the allocator first tries to find the 1024KiB block, when there is no single 1024KiB block, the allocator goes one level below in freelist and tries to search for two 512KiB blocks and goes on. At one point of time if we have less space, we might go further levels below to search four 256KiB blocks to satisfy the request. Assuming if the allocator cannot find the first 256KiB block, that time I think we might need to merge the two 128KiB blocks through defragmentation function. And again for the second 256KiB block, we might need to call the defragmentation again to merge two 128KiB blocks or four 64KiB blocks to form minimum alignment size of 256KiB. This goes on for the third and fourth 256KiB blocks to complete the required size allocation of 1024KiB. Please let me know if my understanding is not correct. As you have suggested we can also rename this as force merge or some other names. Thanks, Arun. Thanks, Arun. Regards, Christian. + if (unlikely(r)) + goto error_free_blocks; + } else { + goto error_free_blocks; + } + } if (size > remaining_size) remaining_size = 0; diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 40131ed9b0cd..19440f8caec0 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -396,6 +396,7 @@ void drm_buddy_defrag(struct drm_buddy *mm, } } } +EXPORT_SYMBOL(drm_buddy_defrag); /** * drm_buddy_free_block - free a block
Re: [PATCH v8 3/3] drm/buddy: Add user for defragmentation
Hi Christian, On 3/4/2024 10:09 PM, Christian König wrote: Am 04.03.24 um 17:32 schrieb Arunpravin Paneer Selvam: Add amdgpu driver as user for the drm buddy defragmentation. Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 +++-- drivers/gpu/drm/drm_buddy.c | 1 + 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index e494f5bf136a..cff8a526c622 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -533,8 +533,21 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, min_block_size, >blocks, vres->flags); - if (unlikely(r)) - goto error_free_blocks; + if (unlikely(r)) { + if (r == -ENOSPC) { + drm_buddy_defrag(mm, min_block_size); + r = drm_buddy_alloc_blocks(mm, fpfn, + lpfn, + size, + min_block_size, + >blocks, + vres->flags); That doesn't looks like something we should do. We might fallback when contiguous memory is requested, but certainly not on normal allocation failure. yes, defrag here not useful for normal allocations. But worried about the bigger min_block_size normal allocations. In such cases, I think we should move this drm_buddy_defrag() call into buddy allocator file. For example if the required size is 1024KiB and if min_block_size is 256KiB, the allocator first tries to find the 1024KiB block, when there is no single 1024KiB block, the allocator goes one level below in freelist and tries to search for two 512KiB blocks and goes on. At one point of time if we have less space, we might go further levels below to search four 256KiB blocks to satisfy the request. Assuming if the allocator cannot find the first 256KiB block, that time I think we might need to merge the two 128KiB blocks through defragmentation function. And again for the second 256KiB block, we might need to call the defragmentation again to merge two 128KiB blocks or four 64KiB blocks to form minimum alignment size of 256KiB. This goes on for the third and fourth 256KiB blocks to complete the required size allocation of 1024KiB. Please let me know if my understanding is not correct. Thanks, Arun. Regards, Christian. + if (unlikely(r)) + goto error_free_blocks; + } else { + goto error_free_blocks; + } + } if (size > remaining_size) remaining_size = 0; diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 40131ed9b0cd..19440f8caec0 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -396,6 +396,7 @@ void drm_buddy_defrag(struct drm_buddy *mm, } } } +EXPORT_SYMBOL(drm_buddy_defrag); /** * drm_buddy_free_block - free a block
[PATCH v8 2/3] drm/amdgpu: Enable clear page functionality
Add clear page support in vram memory region. v1(Christian): - Dont handle clear page as TTM flag since when moving the BO back in from GTT again we don't need that. - Make a specialized version of amdgpu_fill_buffer() which only clears the VRAM areas which are not already cleared - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in amdgpu_object.c v2: - Modify the function name amdgpu_ttm_* (Alex) - Drop the delayed parameter (Christian) - handle amdgpu_res_cleared() just above the size calculation (Christian) - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers in the free path to properly wait for fences etc.. (Christian) v3(Christian): - Remove buffer clear code in VRAM manager instead change the AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set the DRM_BUDDY_CLEARED flag. - Remove ! from amdgpu_res_cleared() check. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König Acked-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c| 22 --- .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h| 25 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 61 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 5 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h | 5 ++ 6 files changed, 111 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..c92d92b28a57 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -39,6 +39,7 @@ #include "amdgpu.h" #include "amdgpu_trace.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_vram_mgr.h" /** * DOC: amdgpu_object @@ -601,8 +602,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (!amdgpu_bo_support_uswc(bo->flags)) bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; - if (adev->ras_enabled) - bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; + bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; bo->tbo.bdev = >mman.bdev; if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA | @@ -632,15 +632,17 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED && bo->tbo.resource->mem_type == TTM_PL_VRAM) { - struct dma_fence *fence; + struct dma_fence *fence = NULL; - r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, , true); + r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, ); if (unlikely(r)) goto fail_unreserve; - dma_resv_add_fence(bo->tbo.base.resv, fence, - DMA_RESV_USAGE_KERNEL); - dma_fence_put(fence); + if (fence) { + dma_resv_add_fence(bo->tbo.base.resv, fence, + DMA_RESV_USAGE_KERNEL); + dma_fence_put(fence); + } } if (!bp->resv) amdgpu_bo_unreserve(bo); @@ -1365,8 +1367,12 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo) if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv))) return; - r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, , true); + r = amdgpu_fill_buffer(abo, 0, bo->base.resv, , true); if (!WARN_ON(r)) { + struct amdgpu_vram_mgr_resource *vres; + + vres = to_amdgpu_vram_mgr_resource(bo->resource); + vres->flags |= DRM_BUDDY_CLEARED; amdgpu_bo_fence(abo, fence, false); dma_fence_put(fence); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h index 381101d2bf05..50fcd86e1033 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h @@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size) } } +/** + * amdgpu_res_cleared - check if blocks are cleared + * + * @cur: the cursor to extract the block + * + * Check if the @cur block is cleared + */ +static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur) +{ + struct drm_buddy_block *block; + + switch (cur->mem_type) { + case TTM_PL_VRAM: + block = cur->node; + + if (!amdgpu_vram_mgr_is_cleared(block)) + return false; + break; + default: + return false; + } + + return true; +} + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/am
[PATCH v8 3/3] drm/buddy: Add user for defragmentation
Add amdgpu driver as user for the drm buddy defragmentation. Signed-off-by: Arunpravin Paneer Selvam --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 +++-- drivers/gpu/drm/drm_buddy.c | 1 + 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index e494f5bf136a..cff8a526c622 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -533,8 +533,21 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, min_block_size, >blocks, vres->flags); - if (unlikely(r)) - goto error_free_blocks; + if (unlikely(r)) { + if (r == -ENOSPC) { + drm_buddy_defrag(mm, min_block_size); + r = drm_buddy_alloc_blocks(mm, fpfn, + lpfn, + size, + min_block_size, + >blocks, + vres->flags); + if (unlikely(r)) + goto error_free_blocks; + } else { + goto error_free_blocks; + } + } if (size > remaining_size) remaining_size = 0; diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 40131ed9b0cd..19440f8caec0 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -396,6 +396,7 @@ void drm_buddy_defrag(struct drm_buddy *mm, } } } +EXPORT_SYMBOL(drm_buddy_defrag); /** * drm_buddy_free_block - free a block -- 2.25.1
[PATCH v8 1/3] drm/buddy: Implement tracking clear page feature
- Add tracking clear page feature. - Driver should enable the DRM_BUDDY_CLEARED flag if it successfully clears the blocks in the free path. On the otherhand, DRM buddy marks each block as cleared. - Track the available cleared pages size - If driver requests cleared memory we prefer cleared memory but fallback to uncleared if we can't find the cleared blocks. when driver requests uncleared memory we try to use uncleared but fallback to cleared memory if necessary. - When a block gets freed we clear it and mark the freed block as cleared, when there are buddies which are cleared as well we can merge them. Otherwise, we prefer to keep the blocks as separated. - Add a function to support defragmentation. v1: - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as cleared. Else, reset the clear flag for each block in the list(Christian) - For merging the 2 cleared blocks compare as below, drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian) - Defragment the memory beginning from min_order till the required memory space is available. v2: (Matthew) - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks operation within drm buddy. - Write a macro block_incompatible() to allocate the required blocks. - Update the xe driver for the drm_buddy_free_list change in arguments. - add a warning if the two blocks are incompatible on defragmentation - call full defragmentation in the fini() function - place a condition to test if min_order is equal to 0 - replace the list with safe_reverse() variant as we might remove the block from the list. v3: - fix Gitlab user reported lockup issue. - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew) - modify to pass the root order instead max_order in fini() function(Matthew) - change bool 1 to true(Matthew) - add check if min_block_size is power of 2(Matthew) - modify the min_block_size datatype to u64(Matthew) Signed-off-by: Arunpravin Paneer Selvam Signed-off-by: Matthew Auld Suggested-by: Christian König Suggested-by: Matthew Auld --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/drm_buddy.c | 294 +++--- drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 6 +- drivers/gpu/drm/tests/drm_buddy_test.c| 18 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 4 +- include/drm/drm_buddy.h | 22 +- 6 files changed, 290 insertions(+), 60 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..c0c851409241 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); error_fini: ttm_resource_fini(man, >base); @@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, amdgpu_vram_mgr_do_reserve(man); - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); atomic64_sub(vis_usage, >vis_usage); @@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) kfree(rsv); list_for_each_entry_safe(rsv, temp, >reserved_pages, blocks) { - drm_buddy_free_list(>mm, >allocated); + drm_buddy_free_list(>mm, >allocated, 0); kfree(rsv); } if (!adev->gmc.is_app_apu) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index c4222b886db7..40131ed9b0cd 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm, __list_add(>link, node->link.prev, >link); } +static void clear_reset(struct drm_buddy_block *block) +{ + block->header &= ~DRM_BUDDY_HEADER_CLEAR; +} + +static void mark_cleared(struct drm_buddy_block *block) +{ + block->header |= DRM_BUDDY_HEADER_CLEAR; +} + static void mark_allocated(struct drm_buddy_block *block) { block->header &= ~DRM_BUDDY_HEADER_STATE; @@ -186,11 +196,21 @@ EXPORT_SYMBOL(drm_buddy_init); */ void drm_buddy_fini(struct drm_buddy *mm) { + u64 root_size, size; + unsigned int order; int i; + size = mm->size; + for (i = 0; i < mm->n_roots; ++i) { + order = ilog2(size) - ilog2(mm->chunk_size); + root_size = mm->chunk_size << order; + drm_buddy_defrag(mm, root_size); + WARN_ON(!drm_buddy_block_is_free(mm->roots[i])); drm_block_free(mm, mm->roots[i]); + +
Re: [PATCH v7 3/3] drm/buddy: Add defragmentation support
Hi Matthew, On 2/22/2024 12:12 AM, Matthew Auld wrote: On 21/02/2024 12:18, Arunpravin Paneer Selvam wrote: Add a function to support defragmentation. v1: - Defragment the memory beginning from min_order till the required memory space is available. v2(Matthew): - add amdgpu user for defragmentation - add a warning if the two blocks are incompatible on defragmentation - call full defragmentation in the fini() function - place a condition to test if min_order is equal to 0 - replace the list with safe_reverse() variant as we might remove the block from the list. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Matthew Auld --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 +++- drivers/gpu/drm/drm_buddy.c | 93 +--- include/drm/drm_buddy.h | 3 + 3 files changed, 97 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index e494f5bf136a..cff8a526c622 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -533,8 +533,21 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, min_block_size, >blocks, vres->flags); - if (unlikely(r)) - goto error_free_blocks; + if (unlikely(r)) { + if (r == -ENOSPC) { + drm_buddy_defrag(mm, min_block_size); + r = drm_buddy_alloc_blocks(mm, fpfn, + lpfn, + size, + min_block_size, + >blocks, + vres->flags); + if (unlikely(r)) + goto error_free_blocks; + } else { + goto error_free_blocks; + } + } if (size > remaining_size) remaining_size = 0; diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 18e004fa39d3..56bd1560fbcd 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -203,6 +203,8 @@ void drm_buddy_fini(struct drm_buddy *mm) drm_block_free(mm, mm->roots[i]); } + drm_buddy_defrag(mm, mm->chunk_size << mm->max_order); I think this needs to be called higher up, otherwise we blow up with the WARN, plus we just freed the root(s). There is also the case with non-power-of-two VRAM size, in which case you get multiple roots and max_order is just the largest root and not entire address space. I guess do this in the loop above and use the root order instead? Also this should be done as part of the first patch and then in this patch it is just a case of exporting it. Every commit should ideally be functional by itself. You mean we move the above change in drm_buddy_fini function and drm_buddy_defrag function as part of first patch. And just we add export function and add amdgpu user in this patch. Is my understanding correct? Thanks, Arun. + WARN_ON(mm->avail != mm->size); kfree(mm->roots); @@ -276,25 +278,39 @@ drm_get_buddy(struct drm_buddy_block *block) } EXPORT_SYMBOL(drm_get_buddy); -static void __drm_buddy_free(struct drm_buddy *mm, - struct drm_buddy_block *block) +static unsigned int __drm_buddy_free(struct drm_buddy *mm, + struct drm_buddy_block *block, + bool defrag) { + unsigned int order, block_order; struct drm_buddy_block *parent; + block_order = drm_buddy_block_order(block); + while ((parent = block->parent)) { - struct drm_buddy_block *buddy; + struct drm_buddy_block *buddy = NULL; buddy = __get_buddy(block); if (!drm_buddy_block_is_free(buddy)) break; - if (drm_buddy_block_is_clear(block) != - drm_buddy_block_is_clear(buddy)) - break; + if (!defrag) { + /* + * Check the block and its buddy clear state and exit + * the loop if they both have the dissimilar state. + */ + if (drm_buddy_block_is_clear(block) != + drm_buddy_block_is_clear(buddy)) + break; - if (drm_buddy_block_is_clear(block)) - mark_cleared(parent); + if (drm_buddy_block_is_clear(block)) + mark_cleared(parent); + } + + WARN_ON(defrag && + (drm_buddy_block_is_clear(block) == + drm_buddy_block_is_clear(buddy))); list_del(>link); @@ -304,8 +320,57 @@ static void __drm_buddy_free(struct drm_buddy *mm, block = parent; } - mark_free(mm, block); + order = drm_buddy_block_order(block); + if (block_order != orde
Re: [PATCH v7 3/3] drm/buddy: Add defragmentation support
Hi Christian, On 2/21/2024 7:58 PM, Christian König wrote: Am 21.02.24 um 13:18 schrieb Arunpravin Paneer Selvam: Add a function to support defragmentation. Thinking more about it maybe you want to call this function differently. Essentially we are force merging pages even if their cleared flag doesn't match, that is something different than defragmentation I think. Maybe rename it for force_merge or something similar. Not mandatory, just an idea how to improve the readability of the code. Apart from that just let me know when I can push it to drm-misc-next. Christian. I am fixing few issues reported by user and addressing Matthew's suggestions on v7 version. I will post the v8 version incorporating all the changes. I hope we can push v8 into drm-misc-next. Thanks, Arun. v1: - Defragment the memory beginning from min_order till the required memory space is available. v2(Matthew): - add amdgpu user for defragmentation - add a warning if the two blocks are incompatible on defragmentation - call full defragmentation in the fini() function - place a condition to test if min_order is equal to 0 - replace the list with safe_reverse() variant as we might remove the block from the list. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Matthew Auld --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 +++- drivers/gpu/drm/drm_buddy.c | 93 +--- include/drm/drm_buddy.h | 3 + 3 files changed, 97 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index e494f5bf136a..cff8a526c622 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -533,8 +533,21 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, min_block_size, >blocks, vres->flags); - if (unlikely(r)) - goto error_free_blocks; + if (unlikely(r)) { + if (r == -ENOSPC) { + drm_buddy_defrag(mm, min_block_size); + r = drm_buddy_alloc_blocks(mm, fpfn, + lpfn, + size, + min_block_size, + >blocks, + vres->flags); + if (unlikely(r)) + goto error_free_blocks; + } else { + goto error_free_blocks; + } + } if (size > remaining_size) remaining_size = 0; diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 18e004fa39d3..56bd1560fbcd 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -203,6 +203,8 @@ void drm_buddy_fini(struct drm_buddy *mm) drm_block_free(mm, mm->roots[i]); } + drm_buddy_defrag(mm, mm->chunk_size << mm->max_order); + WARN_ON(mm->avail != mm->size); kfree(mm->roots); @@ -276,25 +278,39 @@ drm_get_buddy(struct drm_buddy_block *block) } EXPORT_SYMBOL(drm_get_buddy); -static void __drm_buddy_free(struct drm_buddy *mm, - struct drm_buddy_block *block) +static unsigned int __drm_buddy_free(struct drm_buddy *mm, + struct drm_buddy_block *block, + bool defrag) { + unsigned int order, block_order; struct drm_buddy_block *parent; + block_order = drm_buddy_block_order(block); + while ((parent = block->parent)) { - struct drm_buddy_block *buddy; + struct drm_buddy_block *buddy = NULL; buddy = __get_buddy(block); if (!drm_buddy_block_is_free(buddy)) break; - if (drm_buddy_block_is_clear(block) != - drm_buddy_block_is_clear(buddy)) - break; + if (!defrag) { + /* + * Check the block and its buddy clear state and exit + * the loop if they both have the dissimilar state. + */ + if (drm_buddy_block_is_clear(block) != + drm_buddy_block_is_clear(buddy)) + break; - if (drm_buddy_block_is_clear(block)) - mark_cleared(parent); + if (drm_buddy_block_is_clear(block)) + mark_cleared(parent); + } + + WARN_ON(defrag && + (drm_buddy_block_is_clear(block) == + drm_buddy_block_is_clear(buddy))); list_del(>link); @@ -304,8 +320,57 @@ static void __drm_buddy_free(struct drm_buddy *mm, block = parent; } - mark_free(mm, block); + order = drm_buddy_block_order(block); + if (block_order != order) + mark_free(mm, block); + + return order; +} + +/** + * drm_buddy_defrag - Defragmentation routi
Re: [PATCH v6 1/3] drm/buddy: Implement tracking clear page feature
On 2/16/2024 5:33 PM, Matthew Auld wrote: On 08/02/2024 15:49, Arunpravin Paneer Selvam wrote: - Add tracking clear page feature. - Driver should enable the DRM_BUDDY_CLEARED flag if it successfully clears the blocks in the free path. On the otherhand, DRM buddy marks each block as cleared. - Track the available cleared pages size - If driver requests cleared memory we prefer cleared memory but fallback to uncleared if we can't find the cleared blocks. when driver requests uncleared memory we try to use uncleared but fallback to cleared memory if necessary. - When a block gets freed we clear it and mark the freed block as cleared, when there are buddies which are cleared as well we can merge them. Otherwise, we prefer to keep the blocks as separated. v1: (Christian) - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as cleared. Else, reset the clear flag for each block in the list. - For merging the 2 cleared blocks compare as below, drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy) v2: (Matthew) - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks operation within drm buddy. - Write a macro block_incompatible() to allocate the required blocks. - Update the xe driver for the drm_buddy_free_list change in arguments. Signed-off-by: Arunpravin Paneer Selvam Signed-off-by: Matthew Auld Suggested-by: Christian König Probably needs a new unit test. I think we are missing something to forcefully re-merge everything at fini()? In theory we can just call the defrag routine. Otherwise we might trigger various warnings since the root(s) might still be split. Also one nit below. Otherwise I think looks good. --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/drm_buddy.c | 192 ++ drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 6 +- drivers/gpu/drm/tests/drm_buddy_test.c | 10 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 4 +- include/drm/drm_buddy.h | 18 +- 6 files changed, 187 insertions(+), 49 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..c0c851409241 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); error_fini: ttm_resource_fini(man, >base); @@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, amdgpu_vram_mgr_do_reserve(man); - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); atomic64_sub(vis_usage, >vis_usage); @@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) kfree(rsv); list_for_each_entry_safe(rsv, temp, >reserved_pages, blocks) { - drm_buddy_free_list(>mm, >allocated); + drm_buddy_free_list(>mm, >allocated, 0); kfree(rsv); } if (!adev->gmc.is_app_apu) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index f57e6d74fb0e..33ad0cfbd54c 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm, __list_add(>link, node->link.prev, >link); } +static void clear_reset(struct drm_buddy_block *block) +{ + block->header &= ~DRM_BUDDY_HEADER_CLEAR; +} + +static void mark_cleared(struct drm_buddy_block *block) +{ + block->header |= DRM_BUDDY_HEADER_CLEAR; +} + static void mark_allocated(struct drm_buddy_block *block) { block->header &= ~DRM_BUDDY_HEADER_STATE; @@ -223,6 +233,12 @@ static int split_block(struct drm_buddy *mm, mark_free(mm, block->left); mark_free(mm, block->right); + if (drm_buddy_block_is_clear(block)) { + mark_cleared(block->left); + mark_cleared(block->right); + clear_reset(block); + } + mark_split(block); return 0; @@ -273,6 +289,13 @@ static void __drm_buddy_free(struct drm_buddy *mm, if (!drm_buddy_block_is_free(buddy)) break; + if (drm_buddy_block_is_clear(block) != + drm_buddy_block_is_clear(buddy)) + break; + + if (drm_buddy_block_is_clear(block)) + mark_cleared(parent); + list_del(>link); drm_block_free(mm, block); @@ -295,26 +318,61 @@ void drm_buddy_free_block(struct drm_buddy *mm, { BUG_ON(!drm_buddy_block_is_allocated(block)); mm->avail += drm_buddy_block_size(mm, block); + if (drm_buddy_block_is_clear(block))
Re: [PATCH v6 1/3] drm/buddy: Implement tracking clear page feature
On 2/16/2024 5:33 PM, Matthew Auld wrote: On 08/02/2024 15:49, Arunpravin Paneer Selvam wrote: - Add tracking clear page feature. - Driver should enable the DRM_BUDDY_CLEARED flag if it successfully clears the blocks in the free path. On the otherhand, DRM buddy marks each block as cleared. - Track the available cleared pages size - If driver requests cleared memory we prefer cleared memory but fallback to uncleared if we can't find the cleared blocks. when driver requests uncleared memory we try to use uncleared but fallback to cleared memory if necessary. - When a block gets freed we clear it and mark the freed block as cleared, when there are buddies which are cleared as well we can merge them. Otherwise, we prefer to keep the blocks as separated. v1: (Christian) - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as cleared. Else, reset the clear flag for each block in the list. - For merging the 2 cleared blocks compare as below, drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy) v2: (Matthew) - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks operation within drm buddy. - Write a macro block_incompatible() to allocate the required blocks. - Update the xe driver for the drm_buddy_free_list change in arguments. Signed-off-by: Arunpravin Paneer Selvam Signed-off-by: Matthew Auld Suggested-by: Christian König Probably needs a new unit test. Sure, I am working on it. I will send in a separate patch. I think we are missing something to forcefully re-merge everything at fini()? In theory we can just call the defrag routine. Otherwise we might trigger various warnings since the root(s) might still be split. I have added the full defrag in the fini() function. Please review the patch number 3. Thanks, Arun. Also one nit below. Otherwise I think looks good. --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/drm_buddy.c | 192 ++ drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 6 +- drivers/gpu/drm/tests/drm_buddy_test.c | 10 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 4 +- include/drm/drm_buddy.h | 18 +- 6 files changed, 187 insertions(+), 49 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..c0c851409241 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); error_fini: ttm_resource_fini(man, >base); @@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, amdgpu_vram_mgr_do_reserve(man); - drm_buddy_free_list(mm, >blocks); + drm_buddy_free_list(mm, >blocks, 0); mutex_unlock(>lock); atomic64_sub(vis_usage, >vis_usage); @@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) kfree(rsv); list_for_each_entry_safe(rsv, temp, >reserved_pages, blocks) { - drm_buddy_free_list(>mm, >allocated); + drm_buddy_free_list(>mm, >allocated, 0); kfree(rsv); } if (!adev->gmc.is_app_apu) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index f57e6d74fb0e..33ad0cfbd54c 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm, __list_add(>link, node->link.prev, >link); } +static void clear_reset(struct drm_buddy_block *block) +{ + block->header &= ~DRM_BUDDY_HEADER_CLEAR; +} + +static void mark_cleared(struct drm_buddy_block *block) +{ + block->header |= DRM_BUDDY_HEADER_CLEAR; +} + static void mark_allocated(struct drm_buddy_block *block) { block->header &= ~DRM_BUDDY_HEADER_STATE; @@ -223,6 +233,12 @@ static int split_block(struct drm_buddy *mm, mark_free(mm, block->left); mark_free(mm, block->right); + if (drm_buddy_block_is_clear(block)) { + mark_cleared(block->left); + mark_cleared(block->right); + clear_reset(block); + } + mark_split(block); return 0; @@ -273,6 +289,13 @@ static void __drm_buddy_free(struct drm_buddy *mm, if (!drm_buddy_block_is_free(buddy)) break; + if (drm_buddy_block_is_clear(block) != + drm_buddy_block_is_clear(buddy)) + break; + + if (drm_buddy_block_is_clear(block)) + mark_cleared(parent); + list_del(>link); drm_block_free(mm, block); @@ -295,26 +318,61 @@ void drm_buddy_free_block(str
[PATCH v7 3/3] drm/buddy: Add defragmentation support
Add a function to support defragmentation. v1: - Defragment the memory beginning from min_order till the required memory space is available. v2(Matthew): - add amdgpu user for defragmentation - add a warning if the two blocks are incompatible on defragmentation - call full defragmentation in the fini() function - place a condition to test if min_order is equal to 0 - replace the list with safe_reverse() variant as we might remove the block from the list. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Matthew Auld --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 +++- drivers/gpu/drm/drm_buddy.c | 93 +--- include/drm/drm_buddy.h | 3 + 3 files changed, 97 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index e494f5bf136a..cff8a526c622 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -533,8 +533,21 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, min_block_size, >blocks, vres->flags); - if (unlikely(r)) - goto error_free_blocks; + if (unlikely(r)) { + if (r == -ENOSPC) { + drm_buddy_defrag(mm, min_block_size); + r = drm_buddy_alloc_blocks(mm, fpfn, + lpfn, + size, + min_block_size, + >blocks, + vres->flags); + if (unlikely(r)) + goto error_free_blocks; + } else { + goto error_free_blocks; + } + } if (size > remaining_size) remaining_size = 0; diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 18e004fa39d3..56bd1560fbcd 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -203,6 +203,8 @@ void drm_buddy_fini(struct drm_buddy *mm) drm_block_free(mm, mm->roots[i]); } + drm_buddy_defrag(mm, mm->chunk_size << mm->max_order); + WARN_ON(mm->avail != mm->size); kfree(mm->roots); @@ -276,25 +278,39 @@ drm_get_buddy(struct drm_buddy_block *block) } EXPORT_SYMBOL(drm_get_buddy); -static void __drm_buddy_free(struct drm_buddy *mm, -struct drm_buddy_block *block) +static unsigned int __drm_buddy_free(struct drm_buddy *mm, +struct drm_buddy_block *block, +bool defrag) { + unsigned int order, block_order; struct drm_buddy_block *parent; + block_order = drm_buddy_block_order(block); + while ((parent = block->parent)) { - struct drm_buddy_block *buddy; + struct drm_buddy_block *buddy = NULL; buddy = __get_buddy(block); if (!drm_buddy_block_is_free(buddy)) break; - if (drm_buddy_block_is_clear(block) != - drm_buddy_block_is_clear(buddy)) - break; + if (!defrag) { + /* +* Check the block and its buddy clear state and exit +* the loop if they both have the dissimilar state. +*/ + if (drm_buddy_block_is_clear(block) != + drm_buddy_block_is_clear(buddy)) + break; - if (drm_buddy_block_is_clear(block)) - mark_cleared(parent); + if (drm_buddy_block_is_clear(block)) + mark_cleared(parent); + } + + WARN_ON(defrag && + (drm_buddy_block_is_clear(block) == +drm_buddy_block_is_clear(buddy))); list_del(>link); @@ -304,8 +320,57 @@ static void __drm_buddy_free(struct drm_buddy *mm, block = parent; } - mark_free(mm, block); + order = drm_buddy_block_order(block); + if (block_order != order) + mark_free(mm, block); + + return order; +} + +/** + * drm_buddy_defrag - Defragmentation routine + * + * @mm: DRM buddy manager + * @min_block_size: minimum size in bytes to begin + * the defragmenta
[PATCH v7 2/3] drm/amdgpu: Enable clear page functionality
Add clear page support in vram memory region. v1(Christian): - Dont handle clear page as TTM flag since when moving the BO back in from GTT again we don't need that. - Make a specialized version of amdgpu_fill_buffer() which only clears the VRAM areas which are not already cleared - Drop the TTM_PL_FLAG_WIPE_ON_RELEASE check in amdgpu_object.c v2: - Modify the function name amdgpu_ttm_* (Alex) - Drop the delayed parameter (Christian) - handle amdgpu_res_cleared() just above the size calculation (Christian) - Use AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE for clearing the buffers in the free path to properly wait for fences etc.. (Christian) v3(Christian): - Remove buffer clear code in VRAM manager instead change the AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE handling to set the DRM_BUDDY_CLEARED flag. - Remove ! from amdgpu_res_cleared() check. Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König Acked-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c| 22 --- .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h| 25 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 61 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 5 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h | 5 ++ 6 files changed, 111 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 010b0cb7693c..904d71b3393f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -39,6 +39,7 @@ #include "amdgpu.h" #include "amdgpu_trace.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_vram_mgr.h" /** * DOC: amdgpu_object @@ -595,8 +596,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (!amdgpu_bo_support_uswc(bo->flags)) bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; - if (adev->ras_enabled) - bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; + bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; bo->tbo.bdev = >mman.bdev; if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA | @@ -626,15 +626,17 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED && bo->tbo.resource->mem_type == TTM_PL_VRAM) { - struct dma_fence *fence; + struct dma_fence *fence = NULL; - r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, , true); + r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, ); if (unlikely(r)) goto fail_unreserve; - dma_resv_add_fence(bo->tbo.base.resv, fence, - DMA_RESV_USAGE_KERNEL); - dma_fence_put(fence); + if (fence) { + dma_resv_add_fence(bo->tbo.base.resv, fence, + DMA_RESV_USAGE_KERNEL); + dma_fence_put(fence); + } } if (!bp->resv) amdgpu_bo_unreserve(bo); @@ -1359,8 +1361,12 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo) if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv))) return; - r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, , true); + r = amdgpu_fill_buffer(abo, 0, bo->base.resv, , true); if (!WARN_ON(r)) { + struct amdgpu_vram_mgr_resource *vres; + + vres = to_amdgpu_vram_mgr_resource(bo->resource); + vres->flags |= DRM_BUDDY_CLEARED; amdgpu_bo_fence(abo, fence, false); dma_fence_put(fence); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h index 381101d2bf05..50fcd86e1033 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h @@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size) } } +/** + * amdgpu_res_cleared - check if blocks are cleared + * + * @cur: the cursor to extract the block + * + * Check if the @cur block is cleared + */ +static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur) +{ + struct drm_buddy_block *block; + + switch (cur->mem_type) { + case TTM_PL_VRAM: + block = cur->node; + + if (!amdgpu_vram_mgr_is_cleared(block)) + return false; + break; + default: + return false; + } + + return true; +} + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/am