RE: [PATCH] drm/amdkfd: simplify APU VRAM handling

2024-05-25 Thread Yu, Lang
[Public]

Reviewed-by: Lang Yu 

>-Original Message-
>From: amd-gfx  On Behalf Of Alex
>Deucher
>Sent: Friday, May 24, 2024 10:08 PM
>To: amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander 
>Subject: [PATCH] drm/amdkfd: simplify APU VRAM handling
>
>With commit 89773b85599a
>("drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs") big and
>small APU "VRAM" handling in KFD was unified.  Since AMD_IS_APU is set for both
>big and small APUs, we can simplify the checks in the code.
>
>v2: clean up a few more places (Lang)
>
>Signed-off-by: Alex Deucher 
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 
> drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  2 +-
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c |  6 ++
> drivers/gpu/drm/amd/amdkfd/kfd_svm.h |  1 -
> 4 files changed, 11 insertions(+), 14 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>index 336eb51c4839..3af00b57cd8a 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>@@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct
>amdgpu_device *adev,
>   return -EINVAL;
>
>   vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);
>-  if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
>+  if (adev->flags & AMD_IS_APU) {
>   system_mem_needed = size;
>   ttm_mem_needed = size;
>   }
>@@ -233,7 +233,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct
>amdgpu_device *adev,
>   if (adev && xcp_id >= 0) {
>   adev->kfd.vram_used[xcp_id] += vram_needed;
>   adev->kfd.vram_used_aligned[xcp_id] +=
>-  (adev->gmc.is_app_apu || adev->flags &
>AMD_IS_APU) ?
>+  (adev->flags & AMD_IS_APU) ?
>   vram_needed :
>   ALIGN(vram_needed,
>VRAM_AVAILABLITY_ALIGN);
>   }
>@@ -261,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct
>amdgpu_device *adev,
>
>   if (adev) {
>   adev->kfd.vram_used[xcp_id] -= size;
>-  if (adev->gmc.is_app_apu || adev->flags &
>AMD_IS_APU) {
>+  if (adev->flags & AMD_IS_APU) {
>   adev->kfd.vram_used_aligned[xcp_id] -= size;
>   kfd_mem_limit.system_mem_used -= size;
>   kfd_mem_limit.ttm_mem_used -= size; @@ -
>894,7 +894,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, struct
>kgd_mem *mem,
>* if peer device has large BAR. In contrast, access over xGMI is
>* allowed for both small and large BAR configurations of peer device
>*/
>-  if ((adev != bo_adev && !(adev->gmc.is_app_apu || adev->flags &
>AMD_IS_APU)) &&
>+  if ((adev != bo_adev && !(adev->flags & AMD_IS_APU)) &&
>   ((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) ||
>(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) ||
>(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)))
>{ @@ -1682,7 +1682,7 @@ size_t amdgpu_amdkfd_get_available_memory(struct
>amdgpu_device *adev,
>   - atomic64_read(>vram_pin_size)
>   - reserved_for_pt;
>
>-  if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
>+  if (adev->flags & AMD_IS_APU) {
>   system_mem_available = no_system_mem_limit ?
>
>   kfd_mem_limit.max_system_mem_limit :
>   kfd_mem_limit.max_system_mem_limit
>- @@ -1730,7 +1730,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>   if (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
>   domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM;
>
>-  if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
>+  if (adev->flags & AMD_IS_APU) {
>   domain = AMDGPU_GEM_DOMAIN_GTT;
>   alloc_domain = AMDGPU_GEM_DOMAIN_GTT;
>   alloc_flags = 0;
>@@ -1981,7 +1981,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
>   if (size) {
>   if (!is_imported &&
>  (mem->bo->preferred_domains ==
>AMDGPU_GEM_DOMAIN_VRAM ||
>- ((adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) &&
>+ ((adev->flags & AMD_IS_APU) &&
>   mem->bo->preferred_domains ==
>AMDGPU_GEM_DOMAIN_GTT)))
>   *size = bo_size;
>   else
>@@ -2404,7 +2404,7 @@ static int import_obj_create(struct amdgpu_device
>*adev,
>   (*mem)->bo = bo;
>   (*mem)->va = va;
>   (*mem)->domain = (bo->preferred_domains &
>AMDGPU_GEM_DOMAIN_VRAM) &&
>-   !(adev->gmc.is_app_apu || adev->flags &
>AMD_IS_APU) ?
>+   !(adev->flags & AMD_IS_APU) ?
>   

RE: [PATCH] drm/amdkfd: simplify APU VRAM handling

2024-05-23 Thread Yu, Lang
[Public]

Hi Alex,

3 places are missed.

--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -1023,7 +1023,7 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev)
if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 0, 1))
return -EINVAL;

-   if (adev->gmc.is_app_apu)
+   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)
return 0;

pgmap = >pgmap;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 91175b1bd9ac..4885d1b2cc29 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2631,7 +2631,8 @@ svm_range_best_restore_location(struct svm_range *prange,
return -1;
}

-   if (node->adev->gmc.is_app_apu)
+   if (node->adev->gmc.is_app_apu ||
+   node->adev->flags & AMD_IS_APU)
return 0;

if (prange->preferred_loc == gpuid ||
@@ -3349,7 +3350,8 @@ svm_range_best_prefetch_location(struct svm_range *prange)
goto out;
}

-   if (bo_node->adev->gmc.is_app_apu) {
+   if (bo_node->adev->gmc.is_app_apu ||
+   bo_node->adev->flags & AMD_IS_APU) {
best_loc = 0;
goto out;
}

Regards,
Lang

>-Original Message-
>From: amd-gfx  On Behalf Of Alex
>Deucher
>Sent: Friday, May 24, 2024 2:39 AM
>To: amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander 
>Subject: [PATCH] drm/amdkfd: simplify APU VRAM handling
>
>With commit 89773b85599a
>("drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs") big
>and small APU "VRAM" handling in KFD was unified.  Since AMD_IS_APU is
>set for both big and small APUs, we can simplify the checks in the code.
>
>Signed-off-by: Alex Deucher 
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 ---
>-
> drivers/gpu/drm/amd/amdkfd/kfd_svm.h |  1 -
> 2 files changed, 8 insertions(+), 9 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>index 336eb51c4839..3af00b57cd8a 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>@@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct
>amdgpu_device *adev,
>   return -EINVAL;
>
>   vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);
>-  if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
>+  if (adev->flags & AMD_IS_APU) {
>   system_mem_needed = size;
>   ttm_mem_needed = size;
>   }
>@@ -233,7 +233,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct
>amdgpu_device *adev,
>   if (adev && xcp_id >= 0) {
>   adev->kfd.vram_used[xcp_id] += vram_needed;
>   adev->kfd.vram_used_aligned[xcp_id] +=
>-  (adev->gmc.is_app_apu || adev->flags &
>AMD_IS_APU) ?
>+  (adev->flags & AMD_IS_APU) ?
>   vram_needed :
>   ALIGN(vram_needed,
>VRAM_AVAILABLITY_ALIGN);
>   }
>@@ -261,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct
>amdgpu_device *adev,
>
>   if (adev) {
>   adev->kfd.vram_used[xcp_id] -= size;
>-  if (adev->gmc.is_app_apu || adev->flags &
>AMD_IS_APU) {
>+  if (adev->flags & AMD_IS_APU) {
>   adev->kfd.vram_used_aligned[xcp_id] -= size;
>   kfd_mem_limit.system_mem_used -= size;
>   kfd_mem_limit.ttm_mem_used -= size; @@ -
>894,7 +894,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev,
>struct kgd_mem *mem,
>* if peer device has large BAR. In contrast, access over xGMI is
>* allowed for both small and large BAR configurations of peer device
>*/
>-  if ((adev != bo_adev && !(adev->gmc.is_app_apu || adev->flags &
>AMD_IS_APU)) &&
>+  if ((adev != bo_adev && !(adev->flags & AMD_IS_APU)) &&
>   ((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) ||
>(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) ||
>(mem->alloc_flags &
>KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) { @@ -1682,7 +1682,7 @@
>size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev,
>   - atomic64_read(>vram_pin_size)
>   - reserved_for_pt;
>
>-  if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
>+  if (adev->flags & AMD_IS_APU) {
>   system_mem_available = no_system_mem_limit ?
>
>   kfd_mem_limit.max_system_mem_limit :
>
>   kfd_mem_limit.max_system_mem_limit - @@ -1730,7 +1730,7 @@
>int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>   if (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
>   domain = alloc_domain = 

RE: [PATCH 1/4 V2] drm/amdgpu: fix invadate operation for umsch

2024-05-22 Thread Yu, Lang
[Public]

>-Original Message-
>From: Lazar, Lijo 
>Sent: Wednesday, May 22, 2024 12:57 PM
>To: Zhang, Jesse(Jie) ; amd-
>g...@lists.freedesktop.org
>Cc: Deucher, Alexander ; Koenig, Christian
>; Huang, Tim ; Yu, Lang
>
>Subject: Re: [PATCH 1/4 V2] drm/amdgpu: fix invadate operation for umsch
>
>
>
>On 5/22/2024 7:49 AM, Zhang, Jesse(Jie) wrote:
>> [AMD Official Use Only - AMD Internal Distribution Only]
>>
>> Hi Lijo
>>
>> -Original Message-
>> From: Lazar, Lijo 
>> Sent: Tuesday, May 21, 2024 4:20 PM
>> To: Zhang, Jesse(Jie) ;
>> amd-gfx@lists.freedesktop.org
>> Cc: Deucher, Alexander ; Koenig, Christian
>> ; Huang, Tim ; Yu,
>Lang
>> 
>> Subject: Re: [PATCH 1/4 V2] drm/amdgpu: fix invadate operation for
>> umsch
>>
>>
>>
>> On 5/21/2024 12:46 PM, Jesse Zhang wrote:
>>> Since the type of data_size is uint32_t, adev->umsch_mm.data_size - 1
>>>>> 16 >> 16 is 0 regardless of the values of its operands
>>>
>>> So removing the operations upper_32_bits and lower_32_bits.
>>>
>>> Signed-off-by: Jesse Zhang 
>>> Suggested-by: Tim Huang 
>>> ---
>>>  drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c | 5 ++---
>>>  1 file changed, 2 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c
>>> index 2c5e7b0a73f9..ce3bb12e3572 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c
>>> @@ -116,9 +116,8 @@ static int
>umsch_mm_v4_0_load_microcode(struct amdgpu_umsch_mm *umsch)
>>>   upper_32_bits(adev->umsch_mm.data_start_addr));
>>>
>>>   WREG32_SOC15_UMSCH(regVCN_MES_LOCAL_MASK0_LO,
>>> - lower_32_bits(adev->umsch_mm.data_size - 1));
>>> - WREG32_SOC15_UMSCH(regVCN_MES_LOCAL_MASK0_HI,
>>> - upper_32_bits(adev->umsch_mm.data_size - 1));
>>> + adev->umsch_mm.data_size - 1);
>>> + WREG32_SOC15_UMSCH(regVCN_MES_LOCAL_MASK0_HI, 0);
>>
>> cc: Lang
>>
>> The original programming and the new one doesn't look correct.
>>
>> I see the below field definitions as per the header. As per this, both LO/HI
>are 16-bit fields.
>>
>> vcn/vcn_4_0_5_sh_mask.h:#define
>VCN_MES_LOCAL_MASK0_HI__MASK0_HI__SHIFT
>>  0x0 
>> vcn/vcn_4_0_5_sh_mask.h:#define
>VCN_MES_LOCAL_MASK0_HI__MASK0_HI_MASK
>>
>> 0xL
>>
>> vcn/vcn_4_0_5_sh_mask.h:#define
>VCN_MES_LOCAL_MASK0_LO__MASK0_LO__SHIFT
>>  0x10 
>> vcn/vcn_4_0_5_sh_mask.h:#define
>VCN_MES_LOCAL_MASK0_LO__MASK0_LO_MASK
>>
>> 0xL
>>
>> [Zhang, Jesse(Jie)]
>>
>> The code seems to align with the Windows side, which has the same issue. Here
>> is the windows umsch_4_0 write register
>> regVCN_MES_LOCAL_MASK0_LO/regVCN_MES_LOCAL_MASK0_HI
>>
>> enum umsch_mm_status umsch_mm_engine_init_unsecure_4_0(struct
>umsch_mm_context* umsch_mm_ip) {
>> ...
>> temp_data = (uint32_t)umsch_mm_ip-
>>umsch_mm_fw.ucode_info[fw]->data_system_size - 1;
>> data = temp_data;
>> umsch_mm_cgs_write_register(umsch_mm_ip,
>> umsch_mm_reg_offset(hwip_info, regVCN_MES_LOCAL_MASK0_LO,
>> regVCN_MES_LOCAL_MASK0_LO_BASE_IDX), data, HWBLOCK_VCN);
>>
>> data = temp_data >> 32;
>> umsch_mm_cgs_write_register(umsch_mm_ip,
>umsch_mm_reg_offset(hwip_info, regVCN_MES_LOCAL_MASK0_HI,
>regVCN_MES_LOCAL_MASK0_HI_BASE_IDX), data, HWBLOCK_VCN);
>> ...
>> }
>>
>> struct umsch_mm_ucode_consts
>> {
>>  ...
>> uint32_t data_system_size;
>> ...
>> }
>>
>
>Thanks, checked the MES spec. Looks like the mask field definitions are
>wrong. They look like copies of BASE_HI/LO fields which are used for keeping
>a 64k aligned 48-bit address.
>
>Anyway, the mask fields are for indicating size of the local heap/stack, so
>most likely won't require usage of MASK0_HI.

Yes, the programming is aligned with the Windows side.

There is a typo " invadate " in the patch title.

Regards,
Lang

>Thanks,
>Lijo
>
>> Thanks
>> Jesse
>>
>>
>> Thanks,
>> Lijo
>>
>>>
>>>   data = adev->firmware.load_type == AMDGPU_FW_LOAD_PSP ?
>>>  0 : adev->umsch_mm.data_fw_gpu_addr;


RE: [PATCH] drm/amdgpu/vpe: fix vpe dpm clk ratio setup failed

2024-04-30 Thread Yu, Lang
[Public]

Reviewed-by: Lang Yu 

>-Original Message-
>From: Lee, Peyton 
>Sent: Monday, April 29, 2024 2:53 PM
>To: amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Yu, Lang
>; Liu, HaoPing (Alan) ; Lee, Peyton
>
>Subject: [PATCH] drm/amdgpu/vpe: fix vpe dpm clk ratio setup failed
>
>Some version of BIOS does not enable all clock levels, resulting in high level 
>clock
>frequency of 0.
>The number of valid CLKs must be confirmed in advance.
>
>Signed-off-by: Peyton Lee 
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 13 +++--
> 1 file changed, 11 insertions(+), 2 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>index c23d97d34b7e..49881073ff58 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>@@ -128,6 +128,7 @@ int amdgpu_vpe_configure_dpm(struct amdgpu_vpe *vpe)
>   struct dpm_clock *VPEClks;
>   struct dpm_clock *SOCClks;
>   uint32_t idx;
>+  uint32_t vpeclk_enalbled_num = 0;
>   uint32_t pratio_vmax_vnorm = 0, pratio_vnorm_vmid = 0,
>pratio_vmid_vmin = 0;
>   uint16_t pratio_vmin_freq = 0, pratio_vmid_freq = 0,
>pratio_vnorm_freq = 0, pratio_vmax_freq = 0;
>
>@@ -144,6 +145,14 @@ int amdgpu_vpe_configure_dpm(struct amdgpu_vpe
>*vpe)
>   SOCClks = clock_table.SocClocks;
>   VPEClks = clock_table.VPEClocks;
>
>+  /* Comfirm enabled vpe clk num
>+   * Enabled VPE clocks are ordered from low to high in VPEClks
>+   * The highest valid clock index+1 is the number of VPEClks
>+   */
>+  for (idx = PP_SMU_NUM_VPECLK_DPM_LEVELS; idx
>&& !vpeclk_enalbled_num; idx--)
>+  if (VPEClks[idx-1].Freq)
>+  vpeclk_enalbled_num = idx;
>+
>   /* vpe dpm only cares 4 levels. */
>   for (idx = 0; idx < VPE_MAX_DPM_LEVEL; idx++) {
>   uint32_t soc_dpm_level;
>@@ -155,8 +164,8 @@ int amdgpu_vpe_configure_dpm(struct amdgpu_vpe *vpe)
>   soc_dpm_level = (idx * 2) + 1;
>
>   /* clamp the max level */
>-  if (soc_dpm_level >
>PP_SMU_NUM_VPECLK_DPM_LEVELS - 1)
>-  soc_dpm_level =
>PP_SMU_NUM_VPECLK_DPM_LEVELS - 1;
>+  if (soc_dpm_level > vpeclk_enalbled_num - 1)
>+  soc_dpm_level = vpeclk_enalbled_num - 1;
>
>   min_freq = (SOCClks[soc_dpm_level].Freq <
>VPEClks[soc_dpm_level].Freq) ?
>  SOCClks[soc_dpm_level].Freq :
>VPEClks[soc_dpm_level].Freq;
>--
>2.34.1



RE: [PATCH 1/2] drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs

2024-04-29 Thread Yu, Lang
[Public]

>-Original Message-
>From: Kuehling, Felix 
>Sent: Saturday, April 27, 2024 6:52 AM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Yang, Philip ; Koenig, Christian
>; Zhang, Yifan ; Liu,
>Aaron 
>Subject: Re: [PATCH 1/2] drm/amdkfd: Let VRAM allocations go to GTT
>domain on small APUs
>
>
>On 2024-04-26 04:37, Lang Yu wrote:
>> Small APUs(i.e., consumer, embedded products) usually have a small
>> carveout device memory which can't satisfy most compute workloads
>> memory allocation requirements.
>>
>> We can't even run a Basic MNIST Example with a default 512MB carveout.
>> https://github.com/pytorch/examples/tree/main/mnist.
>>
>> Though we can change BIOS settings to enlarge carveout size, which is
>> inflexible and may bring complaint. On the other hand, the memory
>> resource can't be effectively used between host and device.
>>
>> The solution is MI300A approach, i.e., let VRAM allocations go to GTT.
>>
>> Signed-off-by: Lang Yu 
>
>Two nit-picks inline. Other than that, this patch looks reasonable to me.

Thanks. Will update them accordingly.

Regards,
Lang

>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  6 +-
>>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 21 +++-
>---
>>   drivers/gpu/drm/amd/amdkfd/kfd_migrate.c  |  2 +-
>>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c  |  6 --
>>   drivers/gpu/drm/amd/amdkfd/kfd_svm.h  |  3 ++-
>>   5 files changed, 24 insertions(+), 14 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index 7ba05f030dd1..3295838e9a1d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -456,7 +456,9 @@ void amdgpu_amdkfd_get_local_mem_info(struct
>amdgpu_device *adev,
>>  mem_info->local_mem_size_private =
>>  KFD_XCP_MEMORY_SIZE(adev, xcp-
>>id);
>>  } else {
>> -mem_info->local_mem_size_public = adev-
>>gmc.visible_vram_size;
>> +mem_info->local_mem_size_public = adev->flags &
>AMD_IS_APU ?
>> +  (ttm_tt_pages_limit() <<
>PAGE_SHIFT) :
>> +  adev-
>>gmc.visible_vram_size;
>>  mem_info->local_mem_size_private = adev-
>>gmc.real_vram_size -
>>  adev->gmc.visible_vram_size;
>
>On an APU the private size should be reported as 0.
>
>
>>  }
>> @@ -824,6 +826,8 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct
>amdgpu_device *adev, int xcp_id)
>>  }
>>  do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
>>  return ALIGN_DOWN(tmp, PAGE_SIZE);
>> +} else if (adev->flags & AMD_IS_APU) {
>> +return (ttm_tt_pages_limit() << PAGE_SHIFT);
>>  } else {
>>  return adev->gmc.real_vram_size;
>>  }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index c4f9960dafbb..7eb5afcc4895 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct
>amdgpu_device *adev,
>>  return -EINVAL;
>>
>>  vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);
>> -if (adev->gmc.is_app_apu) {
>> +if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
>>  system_mem_needed = size;
>>  ttm_mem_needed = size;
>>  }
>> @@ -232,7 +232,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct
>amdgpu_device *adev,
>>"adev reference can't be null when vram is used");
>>  if (adev && xcp_id >= 0) {
>>  adev->kfd.vram_used[xcp_id] += vram_needed;
>> -adev->kfd.vram_used_aligned[xcp_id] += adev-
>>gmc.is_app_apu ?
>> +adev->kfd.vram_used_aligned[xcp_id] +=
>> +(adev->gmc.is_app_apu || adev->flags &
>AMD_IS_APU) ?
>>  vram_needed :
>>  ALIGN(vram_needed,
>VRAM_AVAILABLITY_ALIGN);
>>  }
>> @@ -260,7 +261,7 @@ void
>amdgpu_amdkfd_unreserve_

RE: [PATCH 2/2] drm/amdkfd: Allow memory oversubscription on small APUs

2024-04-29 Thread Yu, Lang
[Public]

>-Original Message-
>From: Kuehling, Felix 
>Sent: Saturday, April 27, 2024 6:45 AM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Yang, Philip ; Koenig, Christian
>; Zhang, Yifan ; Liu,
>Aaron 
>Subject: Re: [PATCH 2/2] drm/amdkfd: Allow memory oversubscription on
>small APUs
>
>On 2024-04-26 04:37, Lang Yu wrote:
>> The default ttm_tt_pages_limit is 1/2 of system memory.
>> It is prone to out of memory with such a configuration.
>Indiscriminately allowing the violation of all memory limits is not a good
>solution. It will lead to poor performance once you actually reach
>ttm_pages_limit and TTM starts swapping out BOs.

Hi Felix,

I just feel it's like a bug that when 1/2 of system memory is free, the driver tells 
users out of memory.
On the other hand, if memory is available, why not use it.

By the way, can we use USERPTR for VRAM allocations?
Then we don't have ttm_tt_pages_limit limitations. Thanks.

I actually did some tests on Strix (12 CU@2100 MHz, 29412M 128bits 
LPDDR5@937MHz) with
https://github.com/ROCm/pytorch-micro-benchmarking.

Command: python micro_benchmarking_pytorch.py --network resnet50 
--batch-size=64 --iterations=20

1, Run 1 resnet50 (FP32, batch size 64)
Memory usage:
System mem used 6748M out of 29412M
TTM mem used 6658M out of 15719M
Memory oversubscription percentage:  0
Throughput [img/sec] : 49.04

2,  Run 2 resnet50 simultaneously (FP32, batch size 64)
Memory usage:
System mem used 13496M out of 29412M
TTM mem used 13316M out of 15719M
Memory oversubscription percentage:  0
Throughput [img/sec] (respectively) : 25.27 / 26.70

3, Run 3 resnet50 simultaneously (FP32, batch size 64)
Memory usage:
System mem used 20245M out of 29412M
TTM mem used 19974M out of 15719M
Memory oversubscription percentage:  ~27%

Throughput [img/sec](respectively) : 10.62 / 7.47 / 6.90 (In theory: 16 / 16 / 
16)

From my observations,

1, GPU is underutilized a lot, sometimes its loading is less than 50% and even 
0, when running 3 resnet50 simultaneously with ~27% memory oversubscription.
The driver is busy evicting and restoring processes. It takes ~2-5 seconds to 
restore all the BOs for one process (swap in and out BOs, actually allocate and 
copy pages),
even though the process doesn't need all the allocated BOs to be resident.

2, Sometimes, the fairness can't be guaranteed between process when memory is 
oversubscribed.
They can't share the GPU equally when created with default priority.

3, The less GPU underutilization time during evicting and restoring, the less 
performance degradation under memory oversubscription.

Regards,
Lang

>Regards,
>   Felix
>
>
>>
>> Signed-off-by: Lang Yu 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c   |  2 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h   |  4 ++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12
>+---
>>   3 files changed, 12 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index 3295838e9a1d..c01c6f3ab562 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -167,7 +167,7 @@ void amdgpu_amdkfd_device_init(struct
>amdgpu_device *adev)
>>  int i;
>>  int last_valid_bit;
>>
>> -amdgpu_amdkfd_gpuvm_init_mem_limits();
>> +amdgpu_amdkfd_gpuvm_init_mem_limits(adev);
>>
>>  if (adev->kfd.dev) {
>>  struct kgd2kfd_shared_resources gpu_resources = { diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index 1de021ebdd46..13284dbd8c58 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -363,7 +363,7 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct
>> amdgpu_device *adev, int xcp_id);
>>
>>
>>   #if IS_ENABLED(CONFIG_HSA_AMD)
>> -void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
>> +void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device
>*adev);
>>   void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
>>  struct amdgpu_vm *vm);
>>
>> @@ -376,7 +376,7 @@ void amdgpu_amdkfd_release_notify(struct
>amdgpu_bo *bo);
>>   void amdgpu_amdkfd_reserve_system_mem(uint64_t size);
>>   #else
>>   static inline
>> -void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
>> +void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device
>*adev)
>>   {
>>   }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> b/drivers/gpu/drm/amd/amdg

RE: [PATCH] drm/amdgpu/vpe: fix vpe dpm setup failed

2024-04-18 Thread Yu, Lang
[Public]

Reviewed-by: Lang Yu 

>-Original Message-
>From: Lee, Peyton 
>Sent: Thursday, April 18, 2024 1:13 PM
>To: amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Liu, HaoPing (Alan)
>; Yu, Lang ; Lee, Peyton
>
>Subject: [PATCH] drm/amdgpu/vpe: fix vpe dpm setup failed
>
>The vpe dpm settings should be done before firmware is loaded.
>Otherwise, the frequency cannot be successfully raised.
>
>Signed-off-by: Peyton Lee 
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c |  2 +-
> drivers/gpu/drm/amd/amdgpu/vpe_v6_1.c   | 14 +++---
> 2 files changed, 8 insertions(+), 8 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>index 6695481f870f..c23d97d34b7e 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>@@ -205,7 +205,7 @@ int amdgpu_vpe_configure_dpm(struct amdgpu_vpe
>*vpe)
>   dpm_ctl &= 0xfffe; /* Disable DPM */
>   WREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.dpm_enable),
>dpm_ctl);
>   dev_dbg(adev->dev, "%s: disable vpe dpm\n", __func__);
>-  return 0;
>+  return -EINVAL;
> }
>
> int amdgpu_vpe_psp_update_sram(struct amdgpu_device *adev) diff --git
>a/drivers/gpu/drm/amd/amdgpu/vpe_v6_1.c
>b/drivers/gpu/drm/amd/amdgpu/vpe_v6_1.c
>index 769eb8f7bb3c..09315dd5a1ec 100644
>--- a/drivers/gpu/drm/amd/amdgpu/vpe_v6_1.c
>+++ b/drivers/gpu/drm/amd/amdgpu/vpe_v6_1.c
>@@ -144,6 +144,12 @@ static int vpe_v6_1_load_microcode(struct
>amdgpu_vpe *vpe)
>   WREG32(vpe_get_reg_offset(vpe, j, regVPEC_CNTL),
>ret);
>   }
>
>+  /* setup collaborate mode */
>+  vpe_v6_1_set_collaborate_mode(vpe, true);
>+  /* setup DPM */
>+  if (amdgpu_vpe_configure_dpm(vpe))
>+  dev_warn(adev->dev, "VPE failed to enable DPM\n");
>+
>   /*
>* For VPE 6.1.1, still only need to add master's offset, and psp will
>apply it to slave as well.
>* Here use instance 0 as master.
>@@ -159,11 +165,7 @@ static int vpe_v6_1_load_microcode(struct
>amdgpu_vpe *vpe)
>   adev->vpe.cmdbuf_cpu_addr[0] = f32_offset;
>   adev->vpe.cmdbuf_cpu_addr[1] = f32_cntl;
>
>-  amdgpu_vpe_psp_update_sram(adev);
>-  vpe_v6_1_set_collaborate_mode(vpe, true);
>-  amdgpu_vpe_configure_dpm(vpe);
>-
>-  return 0;
>+  return amdgpu_vpe_psp_update_sram(adev);
>   }
>
>   vpe_hdr = (const struct vpe_firmware_header_v1_0 *)adev->vpe.fw-
>>data; @@ -196,8 +198,6 @@ static int vpe_v6_1_load_microcode(struct
>amdgpu_vpe *vpe)
>   }
>
>   vpe_v6_1_halt(vpe, false);
>-  vpe_v6_1_set_collaborate_mode(vpe, true);
>-  amdgpu_vpe_configure_dpm(vpe);
>
>   return 0;
> }
>--
>2.34.1



RE: [PATCH v2] drm/amdkfd: make sure VM is ready for updating operations

2024-04-15 Thread Yu, Lang
[Public]

ping

>-Original Message-
>From: Yu, Lang 
>Sent: Thursday, April 11, 2024 4:11 PM
>To: amd-gfx@lists.freedesktop.org
>Cc: Koenig, Christian ; Kuehling, Felix
>; Yu, Lang 
>Subject: [PATCH v2] drm/amdkfd: make sure VM is ready for updating
>operations
>
>When page table BOs were evicted but not validated before updating page
>tables, VM is still in evicting state, amdgpu_vm_update_range returns -EBUSY
>and restore_process_worker runs into a dead loop.
>
>v2: Split the BO validation and page table update into two separate loops in
>amdgpu_amdkfd_restore_process_bos. (Felix)
>  1.Validate BOs
>  2.Validate VM (and DMABuf attachments)
>  3.Update page tables for the BOs validated above
>
>Fixes: 2fdba514ad5a ("drm/amdgpu: Auto-validate DMABuf imports in
>compute VMs")
>
>Signed-off-by: Lang Yu 
>---
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 34 +++
>
> 1 file changed, 20 insertions(+), 14 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>index 0ae9fd844623..e2c9e6ddb1d1 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>@@ -2900,13 +2900,12 @@ int
>amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence
>__rcu *
>
>   amdgpu_sync_create(_obj);
>
>-  /* Validate BOs and map them to GPUVM (update VM page tables).
>*/
>+  /* Validate BOs managed by KFD */
>   list_for_each_entry(mem, _info->kfd_bo_list,
>   validate_list) {
>
>   struct amdgpu_bo *bo = mem->bo;
>   uint32_t domain = mem->domain;
>-  struct kfd_mem_attachment *attachment;
>   struct dma_resv_iter cursor;
>   struct dma_fence *fence;
>
>@@ -2931,6 +2930,25 @@ int
>amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence
>__rcu *
>   goto validate_map_fail;
>   }
>   }
>+  }
>+
>+  if (failed_size)
>+  pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size);
>+
>+  /* Validate PDs, PTs and evicted DMABuf imports last. Otherwise BO
>+   * validations above would invalidate DMABuf imports again.
>+   */
>+  ret = process_validate_vms(process_info, );
>+  if (ret) {
>+  pr_debug("Validating VMs failed, ret: %d\n", ret);
>+  goto validate_map_fail;
>+  }
>+
>+  /* Update mappings managed by KFD. */
>+  list_for_each_entry(mem, _info->kfd_bo_list,
>+  validate_list) {
>+  struct kfd_mem_attachment *attachment;
>+
>   list_for_each_entry(attachment, >attachments, list) {
>   if (!attachment->is_mapped)
>   continue;
>@@ -2947,18 +2965,6 @@ int
>amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence
>__rcu *
>   }
>   }
>
>-  if (failed_size)
>-  pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size);
>-
>-  /* Validate PDs, PTs and evicted DMABuf imports last. Otherwise BO
>-   * validations above would invalidate DMABuf imports again.
>-   */
>-  ret = process_validate_vms(process_info, );
>-  if (ret) {
>-  pr_debug("Validating VMs failed, ret: %d\n", ret);
>-  goto validate_map_fail;
>-  }
>-
>   /* Update mappings not managed by KFD */
>   list_for_each_entry(peer_vm, _info->vm_list_head,
>   vm_list_node) {
>--
>2.25.1



RE: [PATCH] drm/amdkfd: make sure VM is ready for updating operations

2024-04-09 Thread Yu, Lang
[Public]


> I never saw this problem in my testing, probably because I never got my page 
> tables evicted?
I observed this problem on APUs with default 512MB VRAM when allocating memory 
aggressively from different APPs.

Will try to modify the patch per your suggestions. Thanks!

Regards,
Lang

From: Kuehling, Felix 
Sent: Wednesday, April 10, 2024 8:32 AM
To: Koenig, Christian ; Yu, Lang ; 
amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdkfd: make sure VM is ready for updating operations



On 2024-04-08 3:55, Christian König wrote:
Am 07.04.24 um 06:52 schrieb Lang Yu:

When VM is in evicting state, amdgpu_vm_update_range would return -EBUSY.
Then restore_process_worker runs into a dead loop.

Fixes: 2fdba514ad5a ("drm/amdgpu: Auto-validate DMABuf imports in compute VMs")

Mhm, while it would be good to have this case handled as error it should never 
occur in practice since we should have validated the VM before validating the 
DMA-bufs.

@Felix isn't that something we have taken care of?

The problem I saw when I implemented Auto-validate was, that migration of a BO 
invalidates its DMABuf attachments. So I need to validate the DMABuf 
attachments after validating the BOs they attach to. This auto-validation 
happens in amdgpu_vm_validate. So I needed to do the VM validation after the BO 
validation. The problem now seems to be that the BO validation happens in the 
same loop as the page table update. And the page table update fails if the VM 
is not valid.

I never saw this problem in my testing, probably because I never got my page 
tables evicted?

Anyway, I think the solution is to split the BO validation and page table 
update into two separate loops in amdgpu_amdkfd_restore_process_bos:

  1.  Validate BOs
  2.  Validate VM (and DMABuf attachments)
  3.  Update page tables for the BOs validated above

Regards,
  Felix



Regards,
Christian.




Signed-off-by: Lang Yu <mailto:lang...@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 6 ++
  1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0ae9fd844623..8c71fe07807a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2900,6 +2900,12 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, 
struct dma_fence __rcu *
amdgpu_sync_create(_obj);
  +ret = process_validate_vms(process_info, NULL);
+if (ret) {
+pr_debug("Validating VMs failed, ret: %d\n", ret);
+goto validate_map_fail;
+}
+
  /* Validate BOs and map them to GPUVM (update VM page tables). */
  list_for_each_entry(mem, _info->kfd_bo_list,
  validate_list) {



RE: [PATCH] drm/amdkfd: make sure VM is ready for updating operations

2024-04-08 Thread Yu, Lang
[AMD Official Use Only - General]

>-Original Message-
>From: Koenig, Christian 
>Sent: Monday, April 8, 2024 3:55 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Kuehling, Felix 
>Subject: Re: [PATCH] drm/amdkfd: make sure VM is ready for updating
>operations
>
>Am 07.04.24 um 06:52 schrieb Lang Yu:
>> When VM is in evicting state, amdgpu_vm_update_range would return -
>EBUSY.
>> Then restore_process_worker runs into a dead loop.
>>
>> Fixes: 2fdba514ad5a ("drm/amdgpu: Auto-validate DMABuf imports in
>> compute VMs")
>
>Mhm, while it would be good to have this case handled as error it should
>never occur in practice since we should have validated the VM before
>validating the DMA-bufs.

When page table BOs were evicted but not validated before updating page tables,
VM is still in evicting state, then the issue happened.

Regards,
Lang

>@Felix isn't that something we have taken care of?
>
>Regards,
>Christian.
>
>
>>
>> Signed-off-by: Lang Yu 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 6 ++
>>   1 file changed, 6 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 0ae9fd844623..8c71fe07807a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -2900,6 +2900,12 @@ int
>> amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct
>dma_fence
>> __rcu *
>>
>>  amdgpu_sync_create(_obj);
>>
>> +ret = process_validate_vms(process_info, NULL);
>> +if (ret) {
>> +pr_debug("Validating VMs failed, ret: %d\n", ret);
>> +goto validate_map_fail;
>> +}
>> +
>>  /* Validate BOs and map them to GPUVM (update VM page tables).
>*/
>>  list_for_each_entry(mem, _info->kfd_bo_list,
>>  validate_list) {



RE: [PATCH] drm/amdgpu: add post reset IP callback

2024-04-03 Thread Yu, Lang
[AMD Official Use Only - General]

>-Original Message-
>From: Sharma, Shashank 
>Sent: Wednesday, April 3, 2024 3:19 PM
>To: Yu, Lang ; Christian König
>; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Koenig, Christian
>
>Subject: Re: [PATCH] drm/amdgpu: add post reset IP callback
>
>Hey Lang,
>
>On 03/04/2024 08:51, Yu, Lang wrote:
>> [AMD Official Use Only - General]
>>
>>> -Original Message-
>>> From: Christian König 
>>> Sent: Tuesday, April 2, 2024 9:38 PM
>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander ; Koenig, Christian
>>> ; Sharma, Shashank
>>> 
>>> Subject: Re: [PATCH] drm/amdgpu: add post reset IP callback
>>>
>>> Am 28.03.24 um 05:40 schrieb Lang Yu:
>>>> There are use cases which need full GPU functionality (e.g., VM
>>>> update, TLB invalidate) when doing a GPU reset.
>>>>
>>>> Especially, the mes/umsch self tests which help validate the hw
>>>> state after hw init like ring/ib tests.
>>> I noted that before but just to repeat it once more: We can't do any
>>> MES or UMSCH validation while doing a GPU reset!
>> Yes, we can just easily disable it if it doesn't work well.
>> But it doesn't take too much effort to make it work.
>> It can expose issues as soon as possible and is useful for debugging
>purpose.
>IMO, it's not that useful for debugging either. In case of a problem, it will 
>just
>timeout waiting for MES packet write and we will still have to find out the
>actual problem which caused MES to go into bad state in the last GPU reset.
>>
>>> The ring and IB tests use some pre-allocated memory we put aside for
>>> the task during driver load and so can execute during GPU reset as well.
>> If user space can create a VM and allocate memory during GPU reset, it
>> makes no sense to prevent kernel space from doing that.
>
>I think the objection here is mostly about why to do it at all, when it is not
>helpful. It would be just a maintenance overhead.

If you think such a test is not helpful, why do we run ring/IB tests at all?
I believe this kind of sanity test is helpful.

I only talk UMSCH test(different with MES test) here,
I don't think it has a maintenance overhead.

Regards,
Lang

>- Shashank
>
>>> But for the MES/UMSCH we need a full blown environment with VM and
>>> submission infrastructure and setting that up isn't possible here.
>> At least for UMSCH test, it only uses VM mapping functionality (if we
>> can create a VM with cpu update mode, that's enough), it doesn't use
>> other submission functionality.
>> It is actually a compute context.
>>
>>
>> Regards,
>> Lang
>>
>>> Adding Shashank as well, but I think we should probably just
>>> completely remove those from the kernel.
>>>
>>> Regards,
>>> Christian.
>>>
>>>> Add a post reset IP callback to handle such use cases which will be
>>>> executed after GPU reset succeeds.
>>>>
>>>> Signed-off-by: Lang Yu 
>>>> ---
>>>>drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24
>>> ++
>>>>drivers/gpu/drm/amd/include/amd_shared.h   |  3 +++
>>>>2 files changed, 27 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 12dc71a6b5db..feeab9397aab 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -5556,6 +5556,27 @@ static int
>amdgpu_device_health_check(struct
>>> list_head *device_list_handle)
>>>>   return ret;
>>>>}
>>>>
>>>> +static int amdgpu_device_ip_post_reset(struct amdgpu_device *adev) {
>>>> +uint32_t i;
>>>> +int r;
>>>> +
>>>> +for (i = 0; i < adev->num_ip_blocks; i++) {
>>>> +if (!adev->ip_blocks[i].status.valid ||
>>>> +!adev->ip_blocks[i].version->funcs->post_reset)
>>>> +continue;
>>>> +
>>>> +r = adev->ip_blocks[i].version->funcs->post_reset(adev);
>>>> +if (r) {
>>>> +DRM_ERROR("post reset of IP block <%s>
>>> failed %d\n",
>>>> +  adev->ip_blocks[i].version->funcs->name, r);
>>>> +   

RE: [PATCH] drm/amdgpu: add post reset IP callback

2024-04-03 Thread Yu, Lang
[AMD Official Use Only - General]

>-Original Message-
>From: Christian König 
>Sent: Tuesday, April 2, 2024 9:38 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Koenig, Christian
>; Sharma, Shashank
>
>Subject: Re: [PATCH] drm/amdgpu: add post reset IP callback
>
>Am 28.03.24 um 05:40 schrieb Lang Yu:
>> There are use cases which need full GPU functionality (e.g., VM
>> update, TLB invalidate) when doing a GPU reset.
>>
>> Especially, the mes/umsch self tests which help validate the hw state
>> after hw init like ring/ib tests.
>
>I noted that before but just to repeat it once more: We can't do any MES or
>UMSCH validation while doing a GPU reset!

Yes, we can just easily disable it if it doesn't work well.
But it doesn't take too much effort to make it work.
It can expose issues as soon as possible and is useful for debugging purpose.

>The ring and IB tests use some pre-allocated memory we put aside for the
>task during driver load and so can execute during GPU reset as well.

If user space can create a VM and allocate memory during GPU reset,
it makes no sense to prevent kernel space from doing that.

>But for the MES/UMSCH we need a full blown environment with VM and
>submission infrastructure and setting that up isn't possible here.

At least for UMSCH test, it only uses VM mapping functionality
(if we can create a VM with cpu update mode, that's enough),
it doesn't use other submission functionality.
It is actually a compute context.


Regards,
Lang

>Adding Shashank as well, but I think we should probably just completely
>remove those from the kernel.
>
>Regards,
>Christian.
>
>>
>> Add a post reset IP callback to handle such use cases which will be
>> executed after GPU reset succeeds.
>>
>> Signed-off-by: Lang Yu 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24
>++
>>   drivers/gpu/drm/amd/include/amd_shared.h   |  3 +++
>>   2 files changed, 27 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 12dc71a6b5db..feeab9397aab 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -5556,6 +5556,27 @@ static int amdgpu_device_health_check(struct
>list_head *device_list_handle)
>>  return ret;
>>   }
>>
>> +static int amdgpu_device_ip_post_reset(struct amdgpu_device *adev) {
>> +uint32_t i;
>> +int r;
>> +
>> +for (i = 0; i < adev->num_ip_blocks; i++) {
>> +if (!adev->ip_blocks[i].status.valid ||
>> +!adev->ip_blocks[i].version->funcs->post_reset)
>> +continue;
>> +
>> +r = adev->ip_blocks[i].version->funcs->post_reset(adev);
>> +if (r) {
>> +DRM_ERROR("post reset of IP block <%s>
>failed %d\n",
>> +  adev->ip_blocks[i].version->funcs->name, r);
>> +return r;
>> +}
>> +}
>> +
>> +return r;
>> +}
>> +
>>   /**
>>* amdgpu_device_gpu_recover - reset the asic and recover scheduler
>>*
>> @@ -5805,6 +5826,9 @@ int amdgpu_device_gpu_recover(struct
>amdgpu_device *adev,
>>  amdgpu_put_xgmi_hive(hive);
>>  }
>>
>> +if (!r && !job_signaled)
>> +r = amdgpu_device_ip_post_reset(adev);
>> +
>>  if (r)
>>  dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
>>
>> diff --git a/drivers/gpu/drm/amd/include/amd_shared.h
>> b/drivers/gpu/drm/amd/include/amd_shared.h
>> index b0a6256e89f4..33ce30a8e3ab 100644
>> --- a/drivers/gpu/drm/amd/include/amd_shared.h
>> +++ b/drivers/gpu/drm/amd/include/amd_shared.h
>> @@ -287,6 +287,7 @@ enum amd_dpm_forced_level;
>>* @pre_soft_reset: pre soft reset the IP block
>>* @soft_reset: soft reset the IP block
>>* @post_soft_reset: post soft reset the IP block
>> + * @post_reset: handles IP specific post reset stuff(e.g., self test)
>>* @set_clockgating_state: enable/disable cg for the IP block
>>* @set_powergating_state: enable/disable pg for the IP block
>>* @get_clockgating_state: get current clockgating status @@ -316,11
>> +317,13 @@ struct amd_ip_funcs {
>>  int (*pre_soft_reset)(void *handle);
>>  int (*soft_reset)(void *handle);
>>  int (*post_soft_reset)(void *handle);
>> +int (*post_reset)(void *handle);
>>  int (*set_clockgating_state)(void *handle,
>>   enum amd_clockgating_state state);
>>  int (*set_powergating_state)(void *handle,
>>   enum amd_powergating_state state);
>>  void (*get_clockgating_state)(void *handle, u64 *flags);
>> +
>>   };
>>
>>



RE: [PATCH 1/2] drm/amdgpu: use CPU to update VM during GPU reset

2024-04-02 Thread Yu, Lang
[AMD Official Use Only - General]

>-Original Message-
>From: Koenig, Christian 
>Sent: Friday, March 29, 2024 7:08 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander 
>Subject: Re: [PATCH 1/2] drm/amdgpu: use CPU to update VM during GPU
>reset
>
>Am 25.03.24 um 06:35 schrieb Lang Yu:
>> drm sched is stopped and SDMA mode is not available, while CPU mode
>> worked well in such a case.
>>
>> Use case,
>> amdgpu_do_asic_reset
>> amdgpu_device_ip_late_init
>> umsch_mm_late_init
>> umsch_mm_test
>> amdgpu_vm_init
>
>Well big NAK to that.
>
>The VM updates should just be scheduled and applied as soon as the GPU
>reset is completed.
>
>The problem is rather that a GPU reset should *never* create a VM to do a
>test. During GPU reset no memory allocation whatsoever is allowed.

But user space can still create a VM via open("/dev/dri/card0", ...) during GPU 
reset,
driver doesn't prevent user space from doing that. So is this reasonable? 
Thanks.

Regards,
Lang

>That's why we only do IB and ring tests with a pre-allocated memory pool
>during a GPU reset.
>
>If the umsch_mm_test abuses the VM tests like this then please remove that
>code immediately.
>
>Regards,
>Christian.
>
>>
>> Signed-off-by: Lang Yu 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 ++--
>>   1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 8af3f0fd3073..af53f9cfcc40 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -2404,8 +2404,8 @@ int amdgpu_vm_init(struct amdgpu_device
>*adev,
>> struct amdgpu_vm *vm,
>>
>>  vm->is_compute_context = false;
>>
>> -vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode
>&
>> -AMDGPU_VM_USE_CPU_FOR_GFX);
>> +vm->use_cpu_for_update = !!(amdgpu_in_reset(adev) ||
>> +adev->vm_manager.vm_update_mode &
>AMDGPU_VM_USE_CPU_FOR_GFX);
>>
>>  DRM_DEBUG_DRIVER("VM update mode is %s\n",
>>   vm->use_cpu_for_update ? "CPU" : "SDMA");



RE: [PATCH 1/2] drm/amdgpu: use CPU to update VM during GPU reset

2024-03-27 Thread Yu, Lang
[AMD Official Use Only - General]

Please ignore this one.

If user space calls amdgpu_vm_init between gpu reset start and end(driver 
doesn't prevent user space from doing that),

this will change VM update mode and affect user space. That's not expected.

Will find another way to solve this.

Regards,
Lang

>-Original Message-
>From: Yu, Lang 
>Sent: Monday, March 25, 2024 1:36 PM
>To: amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Koenig, Christian
>; Yu, Lang 
>Subject: [PATCH 1/2] drm/amdgpu: use CPU to update VM during GPU reset
>
>drm sched is stopped and SDMA mode is not available, while CPU mode
>worked well in such a case.
>
>Use case,
>amdgpu_do_asic_reset
>amdgpu_device_ip_late_init
>umsch_mm_late_init
>umsch_mm_test
>amdgpu_vm_init
>
>Signed-off-by: Lang Yu 
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>index 8af3f0fd3073..af53f9cfcc40 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>@@ -2404,8 +2404,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev,
>struct amdgpu_vm *vm,
>
>   vm->is_compute_context = false;
>
>-  vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode
>&
>-  AMDGPU_VM_USE_CPU_FOR_GFX);
>+  vm->use_cpu_for_update = !!(amdgpu_in_reset(adev) ||
>+  adev->vm_manager.vm_update_mode &
>AMDGPU_VM_USE_CPU_FOR_GFX);
>
>   DRM_DEBUG_DRIVER("VM update mode is %s\n",
>vm->use_cpu_for_update ? "CPU" : "SDMA");
>--
>2.25.1



RE: [PATCH] drm/amdgpu/vpe: power on vpe when hw_init

2024-03-18 Thread Yu, Lang
[Public]

Reviewed-by: Lang Yu 

>-Original Message-
>From: Lee, Peyton 
>Sent: Wednesday, March 13, 2024 7:45 PM
>To: amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Zhang, Yifan
>; Ma, Li ; Yu, Lang
>; Lee, Peyton 
>Subject: [PATCH] drm/amdgpu/vpe: power on vpe when hw_init
>
>To fix mode2 reset failure.
>Should power on VPE when hw_init.
>
>Signed-off-by: Peyton Lee 
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 6 ++
> 1 file changed, 6 insertions(+)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>index 70c5cc80ecdc..ecfe0f36e83e 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>@@ -396,6 +396,12 @@ static int vpe_hw_init(void *handle)
>   struct amdgpu_vpe *vpe = >vpe;
>   int ret;
>
>+  /* Power on VPE */
>+  ret = amdgpu_device_ip_set_powergating_state(adev,
>AMD_IP_BLOCK_TYPE_VPE,
>+   AMD_PG_STATE_UNGATE);
>+  if (ret)
>+  return ret;
>+
>   ret = vpe_load_microcode(vpe);
>   if (ret)
>   return ret;
>--
>2.34.1



RE: [PATCH] drm/amdgpu: fix mmhub client id out-of-bounds access

2024-03-06 Thread Yu, Lang
[Public]

>-Original Message-
>From: Zhang, Yifan 
>Sent: Thursday, March 7, 2024 11:11 AM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander 
>Subject: RE: [PATCH] drm/amdgpu: fix mmhub client id out-of-bounds access
>
>[AMD Official Use Only - General]
>
>Can we just add cid 0x140 "UMSCH" to mmhub_client_ids_v3_3 structure ?

Yes, we can. Then mmhub_client_ids_v3_3's capacity will change from 31 to 320,
and most of the space will be unused.

Regards,
Lang

>-Original Message-
>From: Yu, Lang 
>Sent: Thursday, March 7, 2024 10:49 AM
>To: amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Zhang, Yifan
>; Yu, Lang 
>Subject: [PATCH] drm/amdgpu: fix mmhub client id out-of-bounds access
>
>Fixes: aba2be41470a ("drm/amdgpu: add mmhub 3.3.0 support")
>
>Signed-off-by: Lang Yu 
>---
> drivers/gpu/drm/amd/amdgpu/mmhub_v3_3.c | 7 +++
> 1 file changed, 3 insertions(+), 4 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v3_3.c
>b/drivers/gpu/drm/amd/amdgpu/mmhub_v3_3.c
>index b3961968c10c..238ea40c2450 100644
>--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v3_3.c
>+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v3_3.c
>@@ -99,16 +99,15 @@
>mmhub_v3_3_print_l2_protection_fault_status(struct amdgpu_device *adev,
>switch (amdgpu_ip_version(adev, MMHUB_HWIP, 0)) {
>case IP_VERSION(3, 3, 0):
>case IP_VERSION(3, 3, 1):
>-   mmhub_cid = mmhub_client_ids_v3_3[cid][rw];
>+   mmhub_cid = cid < ARRAY_SIZE(mmhub_client_ids_v3_3) ?
>+   mmhub_client_ids_v3_3[cid][rw] :
>+   cid == 0x140 ? "UMSCH" : NULL;
>break;
>default:
>mmhub_cid = NULL;
>break;
>}
>
>-   if (!mmhub_cid && cid == 0x140)
>-   mmhub_cid = "UMSCH";
>-
>dev_err(adev->dev, "\t Faulty UTCL2 client ID: %s (0x%x)\n",
>mmhub_cid ? mmhub_cid : "unknown", cid);
>dev_err(adev->dev, "\t MORE_FAULTS: 0x%lx\n",
>--
>2.25.1
>



RE: [PATCH] drm/amdgpu: remove imu start dependency on amdgpu_dpm.

2024-01-30 Thread Yu, Lang
[Public]

Reviewed-by: Lang Yu 

>-Original Message-
>From: amd-gfx  On Behalf Of Yifan Zhang
>Sent: Saturday, January 20, 2024 4:32 PM
>To: amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Tim
>; Feng, Kenneth ; Ma, Li
>; Zhang, Yifan 
>Subject: [PATCH] drm/amdgpu: remove imu start dependency on amdgpu_dpm.
>
>IMU starts anyway when dpm is disabled in backdoor loading.
>
>Signed-off-by: Yifan Zhang 
>---
> drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>index a2d3cced8f19..c5b1d036c95d 100644
>--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>@@ -4324,7 +4324,7 @@ static int gfx_v11_0_hw_init(void *handle)
>   return r;
>   } else {
>   if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
>-  if (adev->gfx.imu.funcs && (amdgpu_dpm > 0)) {
>+  if (adev->gfx.imu.funcs) {
>   if (adev->gfx.imu.funcs->load_microcode)
>   adev->gfx.imu.funcs-
>>load_microcode(adev);
>   if (adev->gfx.imu.funcs->setup_imu)
>--
>2.37.3



RE: [PATCH v2] drm/amdgpu: drm/amdgpu: remove golden setting for gfx 11.5.0

2024-01-30 Thread Yu, Lang
[Public]

Reviewed-by: Lang Yu 

>-Original Message-
>From: Zhang, Yifan 
>Sent: Tuesday, January 30, 2024 1:20 PM
>To: amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Koenig, Christian
>; Huang, Tim ; Yu, Lang
>; Zhang, Yifan 
>Subject: [PATCH v2] drm/amdgpu: drm/amdgpu: remove golden setting for gfx
>11.5.0
>
>No need to set GC golden settings in driver from gfx 11.5.0 onwards.
>
>Signed-off-by: Yifan Zhang 
>---
> drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 22 --
> 1 file changed, 22 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>index c1e10760..2fb1342d5bd9 100644
>--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>@@ -107,23 +107,6 @@ static const struct soc15_reg_golden
>golden_settings_gc_11_0_1[] =
>   SOC15_REG_GOLDEN_VALUE(GC, 0, regTCP_CNTL2, 0xfcff,
>0x000a)  };
>
>-static const struct soc15_reg_golden golden_settings_gc_11_5_0[] = {
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regDB_DEBUG5, 0x,
>0x0800),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regGB_ADDR_CONFIG, 0x0c1807ff,
>0x0242),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regGCR_GENERAL_CNTL, 0x1ff1,
>0x0500),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2A_ADDR_MATCH_MASK,
>0x, 0xfff3),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2C_ADDR_MATCH_MASK,
>0x, 0xfff3),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2C_CTRL, 0x, 0xf37fff3f),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2C_CTRL3, 0xfffb,
>0x00f40188),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2C_CTRL4, 0xf0ff,
>0x80009007),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regPA_CL_ENHANCE, 0xf1ff,
>0x00880007),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regPC_CONFIG_CNTL_1, 0x,
>0x0001),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regTA_CNTL_AUX, 0xf7f7,
>0x0103),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regTA_CNTL2, 0x007f,
>0x),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regTCP_CNTL2, 0xffcf,
>0x200a),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regUTCL1_CTRL_2, 0x,
>0x048f)
>-};
>-
> #define DEFAULT_SH_MEM_CONFIG \
>   ((SH_MEM_ADDRESS_MODE_64 <<
>SH_MEM_CONFIG__ADDRESS_MODE__SHIFT) | \
>(SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
>SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | \ @@ -304,11 +287,6 @@
>static void gfx_v11_0_init_golden_registers(struct amdgpu_device *adev)
>   golden_settings_gc_11_0_1,
>   (const
>u32)ARRAY_SIZE(golden_settings_gc_11_0_1));
>   break;
>-  case IP_VERSION(11, 5, 0):
>-  soc15_program_register_sequence(adev,
>-  golden_settings_gc_11_5_0,
>-  (const
>u32)ARRAY_SIZE(golden_settings_gc_11_5_0));
>-  break;
>   default:
>   break;
>   }
>--
>2.37.3



RE: [PATCH v2] drm/amdkfd: reserve the BO before validating it

2024-01-29 Thread Yu, Lang
[Public]

>-Original Message-
>From: Kuehling, Felix 
>Sent: Monday, January 29, 2024 10:58 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Francis, David 
>Subject: Re: [PATCH v2] drm/amdkfd: reserve the BO before validating it
>
>On 2024-01-28 21:30, Yu, Lang wrote:
>> [AMD Official Use Only - General]
>>
>>> -Original Message-
>>> From: Kuehling, Felix 
>>> Sent: Saturday, January 27, 2024 3:22 AM
>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>> Cc: Francis, David 
>>> Subject: Re: [PATCH v2] drm/amdkfd: reserve the BO before validating
>>> it
>>>
>>>
>>> On 2024-01-25 20:59, Yu, Lang wrote:
>>>> [AMD Official Use Only - General]
>>>>
>>>>> -Original Message-
>>>>> From: Kuehling, Felix 
>>>>> Sent: Thursday, January 25, 2024 5:41 AM
>>>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>>>> Cc: Francis, David 
>>>>> Subject: Re: [PATCH v2] drm/amdkfd: reserve the BO before
>>>>> validating it
>>>>>
>>>>> On 2024-01-22 4:08, Lang Yu wrote:
>>>>>> Fixes: 410f08516e0f ("drm/amdkfd: Move dma unmapping after TLB
>>>>>> flush")
>>>>>>
>>>>>> v2:
>>>>>> Avoid unmapping attachment twice when ERESTARTSYS.
>>>>>>
>>>>>> [   41.708711] WARNING: CPU: 0 PID: 1463 at
>>>>> drivers/gpu/drm/ttm/ttm_bo.c:846 ttm_bo_validate+0x146/0x1b0 [ttm]
>>>>>> [   41.708989] Call Trace:
>>>>>> [   41.708992]  
>>>>>> [   41.708996]  ? show_regs+0x6c/0x80
>>>>>> [   41.709000]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
>>>>>> [   41.709008]  ? __warn+0x93/0x190
>>>>>> [   41.709014]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
>>>>>> [   41.709024]  ? report_bug+0x1f9/0x210
>>>>>> [   41.709035]  ? handle_bug+0x46/0x80
>>>>>> [   41.709041]  ? exc_invalid_op+0x1d/0x80
>>>>>> [   41.709048]  ? asm_exc_invalid_op+0x1f/0x30
>>>>>> [   41.709057]  ? amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x2c/0x80
>>>>> [amdgpu]
>>>>>> [   41.709185]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
>>>>>> [   41.709197]  ? amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x2c/0x80
>>>>> [amdgpu]
>>>>>> [   41.709337]  ? srso_alias_return_thunk+0x5/0x7f
>>>>>> [   41.709346]  kfd_mem_dmaunmap_attachment+0x9e/0x1e0 [amdgpu]
>>>>>> [   41.709467]  amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x56/0x80
>>>>> [amdgpu]
>>>>>> [   41.709586]  kfd_ioctl_unmap_memory_from_gpu+0x1b7/0x300
>[amdgpu]
>>>>>> [   41.709710]  kfd_ioctl+0x1ec/0x650 [amdgpu]
>>>>>> [   41.709822]  ? __pfx_kfd_ioctl_unmap_memory_from_gpu+0x10/0x10
>>>>> [amdgpu]
>>>>>> [   41.709945]  ? srso_alias_return_thunk+0x5/0x7f
>>>>>> [   41.709949]  ? tomoyo_file_ioctl+0x20/0x30
>>>>>> [   41.709959]  __x64_sys_ioctl+0x9c/0xd0
>>>>>> [   41.709967]  do_syscall_64+0x3f/0x90
>>>>>> [   41.709973]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
>>>>>>
>>>>>> Signed-off-by: Lang Yu 
>>>>>> ---
>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  2 +-
>>>>>> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 28
>>>>> +--
>>>>>> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  4 ++-
>>>>>> 3 files changed, 29 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>>>>> index 584a0cea5572..41854417e487 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>>>>> @@ -311,7 +311,7 @@ int
>>>>> amdgpu_amdkfd_gpuvm_map_memory_to_gpu(struct amdgpu_device
>*adev,
>>>>>>  struct kgd_mem *mem, void
>>>>> *drm_priv);
>>>>>> int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
>>>>>>struct amdgpu_device *adev, struct kgd_mem *mem,
>>>>>> void
>>>>> *drm_priv);
>>>>>> -void amdgpu_amdkf

RE: [PATCH] drm/amdgpu: remove golden setting for gfx 11.5.0

2024-01-29 Thread Yu, Lang
[Public]

>-Original Message-
>From: Zhang, Yifan 
>Sent: Monday, January 29, 2024 5:06 PM
>To: amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Koenig, Christian
>; Huang, Tim ; Yu, Lang
>; Zhang, Yifan 
>Subject: [PATCH] drm/amdgpu: remove golden setting for gfx 11.5.0
>
>No need to set golden settings in driver from gfx 11.5.0 onwards
>
>Signed-off-by: Yifan Zhang 
>---
> drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 32 ++
> 1 file changed, 2 insertions(+), 30 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>index c1e10760..4e99af904e04 100644
>--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>@@ -90,10 +90,6 @@ MODULE_FIRMWARE("amdgpu/gc_11_5_0_me.bin");
> MODULE_FIRMWARE("amdgpu/gc_11_5_0_mec.bin");
> MODULE_FIRMWARE("amdgpu/gc_11_5_0_rlc.bin");
>
>-static const struct soc15_reg_golden golden_settings_gc_11_0[] = {
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regTCP_CNTL, 0x2000,
>0x2000)
>-};
>-
> static const struct soc15_reg_golden golden_settings_gc_11_0_1[] =  {
>   SOC15_REG_GOLDEN_VALUE(GC, 0, regCGTT_GS_NGG_CLK_CTRL,
>0x9fff8fff, 0x0010), @@ -104,24 +100,8 @@ static const struct
>soc15_reg_golden golden_settings_gc_11_0_1[] =
>   SOC15_REG_GOLDEN_VALUE(GC, 0, regPA_SC_ENHANCE_3, 0xfffd,
>0x0008),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, regPA_SC_VRS_SURFACE_CNTL_1,
>0xfff891ff, 0x55480100),
>   SOC15_REG_GOLDEN_VALUE(GC, 0, regTA_CNTL_AUX, 0xf7f7,
>0x0103),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regTCP_CNTL2, 0xfcff,
>0x000a)
>-};
>-
>-static const struct soc15_reg_golden golden_settings_gc_11_5_0[] = {
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regDB_DEBUG5, 0x,
>0x0800),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regGB_ADDR_CONFIG, 0x0c1807ff,
>0x0242),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regGCR_GENERAL_CNTL, 0x1ff1,
>0x0500),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2A_ADDR_MATCH_MASK,
>0x, 0xfff3),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2C_ADDR_MATCH_MASK,
>0x, 0xfff3),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2C_CTRL, 0x, 0xf37fff3f),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2C_CTRL3, 0xfffb,
>0x00f40188),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2C_CTRL4, 0xf0ff,
>0x80009007),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regPA_CL_ENHANCE, 0xf1ff,
>0x00880007),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regPC_CONFIG_CNTL_1, 0x,
>0x0001),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regTA_CNTL_AUX, 0xf7f7,
>0x0103),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regTA_CNTL2, 0x007f,
>0x),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regTCP_CNTL2, 0xffcf,
>0x200a),
>-  SOC15_REG_GOLDEN_VALUE(GC, 0, regUTCL1_CTRL_2, 0x,
>0x048f)
>+  SOC15_REG_GOLDEN_VALUE(GC, 0, regTCP_CNTL2, 0xfcff,
>0x000a),
>+  SOC15_REG_GOLDEN_VALUE(GC, 0, regTCP_CNTL, 0x2000,
>0x2000)
> };
>
> #define DEFAULT_SH_MEM_CONFIG \
>@@ -304,17 +284,9 @@ static void gfx_v11_0_init_golden_registers(struct
>amdgpu_device *adev)
>   golden_settings_gc_11_0_1,
>   (const
>u32)ARRAY_SIZE(golden_settings_gc_11_0_1));
>   break;
>-  case IP_VERSION(11, 5, 0):
>-  soc15_program_register_sequence(adev,
>-  golden_settings_gc_11_5_0,
>-  (const
>u32)ARRAY_SIZE(golden_settings_gc_11_5_0));
>-  break;
>   default:
>   break;
>   }
>-  soc15_program_register_sequence(adev,
>-  golden_settings_gc_11_0,
>-  (const
>u32)ARRAY_SIZE(golden_settings_gc_11_0));
>

Remove golden_settings_gc_11_0 may affect all gfx11 ASICs.

Regards,
Lang

> }
>
>--
>2.37.3



RE: [PATCH] drm/amdkfd: Relocate TBA/TMA to opposite side of VM hole (v2)

2024-01-29 Thread Yu, Lang
[AMD Official Use Only - General]

>-Original Message-
>From: amd-gfx  On Behalf Of Felix
>Kuehling
>Sent: Friday, January 26, 2024 6:28 AM
>To: amd-gfx@lists.freedesktop.org
>Cc: Cornwall, Jay ; Koenig, Christian
>; Paneer Selvam, Arunpravin
>
>Subject: [PATCH] drm/amdkfd: Relocate TBA/TMA to opposite side of VM hole (v2)
>
>The TBA and TMA, along with an unused IB allocation, reside at low addresses in
>the VM address space. A stray VM fault which hits these pages must be serviced
>by making their page table entries invalid.
>The scheduler depends upon these pages being resident and fails, preventing a
>debugger from inspecting the failure state.
>
>By relocating these pages above 47 bits in the VM address space they can only 
>be
>reached when bits [63:48] are set to 1. This makes it much less likely for a
>misbehaving program to generate accesses to them.
>The current placement at VA (PAGE_SIZE*2) is readily hit by a NULL access with 
>a
>small offset.
>
>v2:
>- Move it to the reserved space to avoid concflicts with Mesa
>- Add macros to make reserved space management easier
>
>Cc: Arunpravin Paneer Selvam 
>Cc: Christian Koenig 
>Signed-off-by: Jay Cornwall 
>Signed-off-by: Felix Kuehling 
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c  |  4 +--
> drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c|  7 ++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h   | 12 ++--
> drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 30 +++-
> 4 files changed, 30 insertions(+), 23 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>index 823d31f4a2a3..53d0a458d78e 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>@@ -28,9 +28,9 @@
>
> uint64_t amdgpu_csa_vaddr(struct amdgpu_device *adev)  {
>-  uint64_t addr = adev->vm_manager.max_pfn <<
>AMDGPU_GPU_PAGE_SHIFT;
>+  uint64_t addr = AMDGPU_VA_RESERVED_CSA_START(
>+  adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT);
>
>-  addr -= AMDGPU_VA_RESERVED_CSA_SIZE;
>   addr = amdgpu_gmc_sign_extend(addr);
>
>   return addr;
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
>index 3d0d56087d41..9e769ef50f2e 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
>@@ -45,11 +45,8 @@
>  */
> static inline u64 amdgpu_seq64_get_va_base(struct amdgpu_device *adev)  {
>-  u64 addr = adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT;
>-
>-  addr -= AMDGPU_VA_RESERVED_TOP;
>-
>-  return addr;
>+  return AMDGPU_VA_RESERVED_SEQ64_START(
>+  adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT);
> }
>
> /**
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>index 98a57192..f23b6153d310 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>@@ -135,11 +135,19 @@ struct amdgpu_mem_stats;  #define
>AMDGPU_IS_MMHUB0(x) ((x) >= AMDGPU_MMHUB0_START && (x) <
>AMDGPU_MMHUB1_START)  #define AMDGPU_IS_MMHUB1(x) ((x) >=
>AMDGPU_MMHUB1_START && (x) < AMDGPU_MAX_VMHUBS)
>
>-/* Reserve 2MB at top/bottom of address space for kernel use */
>+/* Reserve space at top/bottom of address space for kernel use */
> #define AMDGPU_VA_RESERVED_CSA_SIZE   (2ULL << 20)
>+#define AMDGPU_VA_RESERVED_CSA_START(top) ((top) \
>+   -
>AMDGPU_VA_RESERVED_CSA_SIZE)
> #define AMDGPU_VA_RESERVED_SEQ64_SIZE (2ULL << 20)
>+#define AMDGPU_VA_RESERVED_SEQ64_START(top)
>   (AMDGPU_VA_RESERVED_CSA_START(top) \
>+   -
>AMDGPU_VA_RESERVED_SEQ64_SIZE)
>+#define AMDGPU_VA_RESERVED_TRAP_SIZE  (2ULL << 12)
>+#define AMDGPU_VA_RESERVED_TRAP_START(top)
>   (AMDGPU_VA_RESERVED_SEQ64_START(top) \
>+   -
>AMDGPU_VA_RESERVED_TRAP_SIZE)
> #define AMDGPU_VA_RESERVED_BOTTOM (2ULL << 20)
>-#define AMDGPU_VA_RESERVED_TOP
>   (AMDGPU_VA_RESERVED_SEQ64_SIZE + \
>+#define AMDGPU_VA_RESERVED_TOP
>   (AMDGPU_VA_RESERVED_TRAP_SIZE + \
>+
>AMDGPU_VA_RESERVED_SEQ64_SIZE + \
>
>AMDGPU_VA_RESERVED_CSA_SIZE)
>
> /* See vm_update_mode */
>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
>b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
>index 6604a3f99c5e..f899cce25b2a 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
>@@ -36,6 +36,7 @@
> #include 
> #include 
> #include 
>+#include "amdgpu_vm.h"
>
> /*
>  * The primary memory I/O features being added for revisions of gfxip @@ -
>326,10 +327,16 @@ static void kfd_init_apertures_vi(struct kfd_process_device
>*pdd, uint8_t id)
>* with small reserved space for kernel.
>* Set them to CANONICAL addresses.
>*/
>-  pdd->gpuvm_base = SVM_USER_BASE;
>+  

RE: [PATCH v2] drm/amdkfd: reserve the BO before validating it

2024-01-28 Thread Yu, Lang
[AMD Official Use Only - General]

>-Original Message-
>From: Kuehling, Felix 
>Sent: Saturday, January 27, 2024 3:22 AM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Francis, David 
>Subject: Re: [PATCH v2] drm/amdkfd: reserve the BO before validating it
>
>
>On 2024-01-25 20:59, Yu, Lang wrote:
>> [AMD Official Use Only - General]
>>
>>> -Original Message-
>>> From: Kuehling, Felix 
>>> Sent: Thursday, January 25, 2024 5:41 AM
>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>> Cc: Francis, David 
>>> Subject: Re: [PATCH v2] drm/amdkfd: reserve the BO before validating
>>> it
>>>
>>> On 2024-01-22 4:08, Lang Yu wrote:
>>>> Fixes: 410f08516e0f ("drm/amdkfd: Move dma unmapping after TLB
>>>> flush")
>>>>
>>>> v2:
>>>> Avoid unmapping attachment twice when ERESTARTSYS.
>>>>
>>>> [   41.708711] WARNING: CPU: 0 PID: 1463 at
>>> drivers/gpu/drm/ttm/ttm_bo.c:846 ttm_bo_validate+0x146/0x1b0 [ttm]
>>>> [   41.708989] Call Trace:
>>>> [   41.708992]  
>>>> [   41.708996]  ? show_regs+0x6c/0x80
>>>> [   41.709000]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
>>>> [   41.709008]  ? __warn+0x93/0x190
>>>> [   41.709014]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
>>>> [   41.709024]  ? report_bug+0x1f9/0x210
>>>> [   41.709035]  ? handle_bug+0x46/0x80
>>>> [   41.709041]  ? exc_invalid_op+0x1d/0x80
>>>> [   41.709048]  ? asm_exc_invalid_op+0x1f/0x30
>>>> [   41.709057]  ? amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x2c/0x80
>>> [amdgpu]
>>>> [   41.709185]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
>>>> [   41.709197]  ? amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x2c/0x80
>>> [amdgpu]
>>>> [   41.709337]  ? srso_alias_return_thunk+0x5/0x7f
>>>> [   41.709346]  kfd_mem_dmaunmap_attachment+0x9e/0x1e0 [amdgpu]
>>>> [   41.709467]  amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x56/0x80
>>> [amdgpu]
>>>> [   41.709586]  kfd_ioctl_unmap_memory_from_gpu+0x1b7/0x300 [amdgpu]
>>>> [   41.709710]  kfd_ioctl+0x1ec/0x650 [amdgpu]
>>>> [   41.709822]  ? __pfx_kfd_ioctl_unmap_memory_from_gpu+0x10/0x10
>>> [amdgpu]
>>>> [   41.709945]  ? srso_alias_return_thunk+0x5/0x7f
>>>> [   41.709949]  ? tomoyo_file_ioctl+0x20/0x30
>>>> [   41.709959]  __x64_sys_ioctl+0x9c/0xd0
>>>> [   41.709967]  do_syscall_64+0x3f/0x90
>>>> [   41.709973]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
>>>>
>>>> Signed-off-by: Lang Yu 
>>>> ---
>>>>drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  2 +-
>>>>.../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 28
>>> +--
>>>>drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  4 ++-
>>>>3 files changed, 29 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>>> index 584a0cea5572..41854417e487 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>>> @@ -311,7 +311,7 @@ int
>>> amdgpu_amdkfd_gpuvm_map_memory_to_gpu(struct amdgpu_device *adev,
>>>> struct kgd_mem *mem, void
>>> *drm_priv);
>>>>int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
>>>>   struct amdgpu_device *adev, struct kgd_mem *mem, void
>>> *drm_priv);
>>>> -void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void
>>>> *drm_priv);
>>>> +int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void
>>>> +*drm_priv);
>>>>int amdgpu_amdkfd_gpuvm_sync_memory(
>>>>   struct amdgpu_device *adev, struct kgd_mem *mem, bool intr);
>>>>int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> index 6f3a4cb2a9ef..7a050d46fa4d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> @@ -2088,21 +2088,43 @@ int
>>> amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
>>>>   return ret;
>>>>}
>>>>
>>>> -void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_m

RE: [PATCH v2] drm/amdkfd: reserve the BO before validating it

2024-01-25 Thread Yu, Lang
[AMD Official Use Only - General]

>-Original Message-
>From: Kuehling, Felix 
>Sent: Thursday, January 25, 2024 5:41 AM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Francis, David 
>Subject: Re: [PATCH v2] drm/amdkfd: reserve the BO before validating it
>
>On 2024-01-22 4:08, Lang Yu wrote:
>> Fixes: 410f08516e0f ("drm/amdkfd: Move dma unmapping after TLB flush")
>>
>> v2:
>> Avoid unmapping attachment twice when ERESTARTSYS.
>>
>> [   41.708711] WARNING: CPU: 0 PID: 1463 at
>drivers/gpu/drm/ttm/ttm_bo.c:846 ttm_bo_validate+0x146/0x1b0 [ttm]
>> [   41.708989] Call Trace:
>> [   41.708992]  
>> [   41.708996]  ? show_regs+0x6c/0x80
>> [   41.709000]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
>> [   41.709008]  ? __warn+0x93/0x190
>> [   41.709014]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
>> [   41.709024]  ? report_bug+0x1f9/0x210
>> [   41.709035]  ? handle_bug+0x46/0x80
>> [   41.709041]  ? exc_invalid_op+0x1d/0x80
>> [   41.709048]  ? asm_exc_invalid_op+0x1f/0x30
>> [   41.709057]  ? amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x2c/0x80
>[amdgpu]
>> [   41.709185]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
>> [   41.709197]  ? amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x2c/0x80
>[amdgpu]
>> [   41.709337]  ? srso_alias_return_thunk+0x5/0x7f
>> [   41.709346]  kfd_mem_dmaunmap_attachment+0x9e/0x1e0 [amdgpu]
>> [   41.709467]  amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x56/0x80
>[amdgpu]
>> [   41.709586]  kfd_ioctl_unmap_memory_from_gpu+0x1b7/0x300 [amdgpu]
>> [   41.709710]  kfd_ioctl+0x1ec/0x650 [amdgpu]
>> [   41.709822]  ? __pfx_kfd_ioctl_unmap_memory_from_gpu+0x10/0x10
>[amdgpu]
>> [   41.709945]  ? srso_alias_return_thunk+0x5/0x7f
>> [   41.709949]  ? tomoyo_file_ioctl+0x20/0x30
>> [   41.709959]  __x64_sys_ioctl+0x9c/0xd0
>> [   41.709967]  do_syscall_64+0x3f/0x90
>> [   41.709973]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
>>
>> Signed-off-by: Lang Yu 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  2 +-
>>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 28
>+--
>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  4 ++-
>>   3 files changed, 29 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index 584a0cea5572..41854417e487 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -311,7 +311,7 @@ int
>amdgpu_amdkfd_gpuvm_map_memory_to_gpu(struct amdgpu_device *adev,
>>struct kgd_mem *mem, void
>*drm_priv);
>>   int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
>>  struct amdgpu_device *adev, struct kgd_mem *mem, void
>*drm_priv);
>> -void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void
>> *drm_priv);
>> +int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void
>> +*drm_priv);
>>   int amdgpu_amdkfd_gpuvm_sync_memory(
>>  struct amdgpu_device *adev, struct kgd_mem *mem, bool intr);
>>   int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 6f3a4cb2a9ef..7a050d46fa4d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -2088,21 +2088,43 @@ int
>amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
>>  return ret;
>>   }
>>
>> -void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void
>> *drm_priv)
>> +int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void
>> +*drm_priv)
>>   {
>>  struct kfd_mem_attachment *entry;
>>  struct amdgpu_vm *vm;
>> +bool reserved = false;
>> +int ret = 0;
>>
>>  vm = drm_priv_to_vm(drm_priv);
>>
>>  mutex_lock(&mem->lock);
>>
>>  list_for_each_entry(entry, >attachments, list) {
>> -if (entry->bo_va->base.vm == vm)
>> -kfd_mem_dmaunmap_attachment(mem, entry);
>> +if (entry->bo_va->base.vm != vm)
>> +continue;
>> +if (entry->type == KFD_MEM_ATT_SHARED ||
>> +entry->type == KFD_MEM_ATT_DMABUF)
>> +continue;
>> +if (!entry->bo_va->base.bo->tbo.ttm->sg)
>> +continue;
>
>You're going to great lengths to avoid the reserva

RE: [PATCH] drm/amdkfd: reserve the BO before validating it

2024-01-11 Thread Yu, Lang
[Public]

>-Original Message-
>From: Kuehling, Felix 
>Sent: Friday, January 12, 2024 12:19 AM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org; Francis, David
>
>Cc: Yang, Philip 
>Subject: Re: [PATCH] drm/amdkfd: reserve the BO before validating it
>
>On 2024-01-11 02:22, Lang Yu wrote:
>> Fixes: 410f08516e0f ("drm/amdkfd: Move dma unmapping after TLB flush")
>>
>> [   41.708711] WARNING: CPU: 0 PID: 1463 at
>drivers/gpu/drm/ttm/ttm_bo.c:846 ttm_bo_validate+0x146/0x1b0 [ttm]
>> [   41.708989] Call Trace:
>> [   41.708992]  
>> [   41.708996]  ? show_regs+0x6c/0x80
>> [   41.709000]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
>> [   41.709008]  ? __warn+0x93/0x190
>> [   41.709014]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
>> [   41.709024]  ? report_bug+0x1f9/0x210
>> [   41.709035]  ? handle_bug+0x46/0x80
>> [   41.709041]  ? exc_invalid_op+0x1d/0x80
>> [   41.709048]  ? asm_exc_invalid_op+0x1f/0x30
>> [   41.709057]  ? amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x2c/0x80
>[amdgpu]
>> [   41.709185]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
>> [   41.709197]  ? amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x2c/0x80
>[amdgpu]
>> [   41.709337]  ? srso_alias_return_thunk+0x5/0x7f
>> [   41.709346]  kfd_mem_dmaunmap_attachment+0x9e/0x1e0 [amdgpu]
>> [   41.709467]  amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x56/0x80
>[amdgpu]
>> [   41.709586]  kfd_ioctl_unmap_memory_from_gpu+0x1b7/0x300 [amdgpu]
>> [   41.709710]  kfd_ioctl+0x1ec/0x650 [amdgpu]
>> [   41.709822]  ? __pfx_kfd_ioctl_unmap_memory_from_gpu+0x10/0x10
>[amdgpu]
>> [   41.709945]  ? srso_alias_return_thunk+0x5/0x7f
>> [   41.709949]  ? tomoyo_file_ioctl+0x20/0x30
>> [   41.709959]  __x64_sys_ioctl+0x9c/0xd0
>> [   41.709967]  do_syscall_64+0x3f/0x90
>> [   41.709973]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
>>
>> Signed-off-by: Lang Yu 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 7 ++-
>>   1 file changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 48697b789342..f5542a4ab8ed 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -2095,8 +2095,13 @@ void
>amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void
>*drm_priv)
>>  mutex_lock(&mem->lock);
>>
>>  list_for_each_entry(entry, >attachments, list) {
>> -if (entry->bo_va->base.vm == vm)
>> +if (entry->bo_va->base.vm != vm)
>> +continue;
>> +
>> +if (!WARN_ON(amdgpu_bo_reserve(entry->bo_va->base.bo,
>true))) {
>>  kfd_mem_dmaunmap_attachment(mem, entry);
>> +amdgpu_bo_unreserve(entry->bo_va->base.bo);
>> +}
>
>I'm pretty sure someone else worked on a fix for this before. This is not a 
>good
>solution. We need to handle failed reservations (due to
>ERESTARTSYS) and make sure that the unmap ioctl can be restarted correctly in
>that case.
>
>See
>https://lore.kernel.org/amd-gfx/530aac57-5561-4d1d-879a-
>93b108e5c...@gmail.com/

Got it. Thanks.

Christian's concern is
"Kernel operations should either completely fail, fully complete or
explicitly return how much they completed (e.g. how many bytes
transferred for example). That we only partially complete and track that
state inside the kernel is usually a no-go."

ERESTART_RESTARTBLOCK is for partially restart and may help.

Regards,
Lang

>David, do you have any update on this work?
>
>Regards,
>   Felix
>
>
>>  }
>>
>>  mutex_unlock(&mem->lock);


RE: [PATCH 2/2] drm/amdgpu/vpe: enable vpe dpm

2023-12-13 Thread Yu, Lang
[Public]

The series is.

Reviewed-by: Lang Yu 

>-Original Message-
>From: Lee, Peyton 
>Sent: Wednesday, December 13, 2023 10:49 AM
>To: amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Zhang, Yifan
>; Ma, Li ; Yu, Lang
>; Lee, Peyton 
>Subject: [PATCH 2/2] drm/amdgpu/vpe: enable vpe dpm
>
>enable vpe dpm
>
>Signed-off-by: Peyton Lee 
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 250
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h |
>12 ++
> drivers/gpu/drm/amd/amdgpu/vpe_v6_1.c   |  15 ++
> 3 files changed, 277 insertions(+)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>index e81579708e96..2020ddb4182a 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>@@ -26,6 +26,7 @@
> #include "amdgpu.h"
> #include "amdgpu_ucode.h"
> #include "amdgpu_vpe.h"
>+#include "amdgpu_smu.h"
> #include "soc15_common.h"
> #include "vpe_v6_1.h"
>
>@@ -33,8 +34,186 @@
> /* VPE CSA resides in the 4th page of CSA */
> #define AMDGPU_CSA_VPE_OFFSET (4096 * 3)
>
>+/* 1 second timeout */
>+#define VPE_IDLE_TIMEOUT  msecs_to_jiffies(1000)
>+
>+#define VPE_MAX_DPM_LEVEL 4
>+#define FIXED1_8_BITS_PER_FRACTIONAL_PART 8
>+#define GET_PRATIO_INTEGER_PART(x)((x) >>
>FIXED1_8_BITS_PER_FRACTIONAL_PART)
>+
> static void vpe_set_ring_funcs(struct amdgpu_device *adev);
>
>+static inline uint16_t div16_u16_rem(uint16_t dividend, uint16_t
>+divisor, uint16_t *remainder) {
>+  *remainder = dividend % divisor;
>+  return dividend / divisor;
>+}
>+
>+static inline uint16_t complete_integer_division_u16(
>+  uint16_t dividend,
>+  uint16_t divisor,
>+  uint16_t *remainder)
>+{
>+  return div16_u16_rem(dividend, divisor, (uint16_t *)remainder); }
>+
>+static uint16_t vpe_u1_8_from_fraction(uint16_t numerator, uint16_t
>+denominator) {
>+  bool arg1_negative = numerator < 0;
>+  bool arg2_negative = denominator < 0;
>+
>+  uint16_t arg1_value = (uint16_t)(arg1_negative ? -numerator :
>numerator);
>+  uint16_t arg2_value = (uint16_t)(arg2_negative ? -denominator :
>+denominator);
>+
>+  uint16_t remainder;
>+
>+  /* determine integer part */
>+  uint16_t res_value = complete_integer_division_u16(
>+  arg1_value, arg2_value, &remainder);
>+
>+  if (res_value > 127 /* CHAR_MAX */)
>+  return 0;
>+
>+  /* determine fractional part */
>+  {
>+  unsigned int i = FIXED1_8_BITS_PER_FRACTIONAL_PART;
>+
>+  do {
>+  remainder <<= 1;
>+
>+  res_value <<= 1;
>+
>+  if (remainder >= arg2_value) {
>+  res_value |= 1;
>+  remainder -= arg2_value;
>+  }
>+  } while (--i != 0);
>+  }
>+
>+  /* round up LSB */
>+  {
>+  uint16_t summand = (remainder << 1) >= arg2_value;
>+
>+  if ((res_value + summand) > 32767 /* SHRT_MAX */)
>+  return 0;
>+
>+  res_value += summand;
>+  }
>+
>+  if (arg1_negative ^ arg2_negative)
>+  res_value = -res_value;
>+
>+  return res_value;
>+}
>+
>+static uint16_t vpe_internal_get_pratio(uint16_t from_frequency,
>+uint16_t to_frequency) {
>+  uint16_t pratio = vpe_u1_8_from_fraction(from_frequency,
>+to_frequency);
>+
>+  if (GET_PRATIO_INTEGER_PART(pratio) > 1)
>+  pratio = 0;
>+
>+  return pratio;
>+}
>+
>+/*
>+ * VPE has 4 DPM levels from level 0 (lowerest) to 3 (highest),
>+ * VPE FW will dynamically decide which level should be used according to
>current loading.
>+ *
>+ * Get VPE and SOC clocks from PM, and select the appropriate four
>+clock values,
>+ * calculate the ratios of adjusting from one clock to another.
>+ * The VPE FW can then request the appropriate frequency from the PMFW.
>+ */
>+int amdgpu_vpe_configure_dpm(struct amdgpu_vpe *vpe) {
>+  struct amdgpu_device *adev = vpe->ring.adev;
>+  uint32_t dpm_ctl;
>+
>+  if (adev->pm.dpm_enabled) {
>+  struct dpm_clocks clock_table = { 0 };
>+  struct dpm_clock *VPEClks;
>+  struct dpm_clock *SOCClks;
>+  uint32_t idx;
>+  uint32_t pratio_vmax_vnorm = 0, pratio_vnorm_vmid = 0,
>pratio_vmid_vmin = 0;
>+  uint16_t pratio_vmin

Re: [PATCH v2 1/6] drm/amdgpu: add lsdma hw ip definition

2023-11-17 Thread Yu, Lang
[Public]

 enum amd_ip_block_type doc is missing for LSDMA.

@@ -85,6 +85,7 @@ enum amd_apu_flags {
 * @AMD_IP_BLOCK_TYPE_JPEG: JPEG Engine
 * @AMD_IP_BLOCK_TYPE_VPE: Video Processing Engine
 * @AMD_IP_BLOCK_TYPE_UMSCH_MM: User Mode Schduler for Multimedia
+* @AMD_IP_BLOCK_TYPE_LSDMA:
 * @AMD_IP_BLOCK_TYPE_NUM: Total number of IP block types
 */
 enum amd_ip_block_type {

With that fixed the series is Reviewed-by: Lang Yu .




From: Zhang, Yifan 
Sent: Friday, November 17, 2023 10:09 AM
To: amd-gfx@lists.freedesktop.org 
Cc: Deucher, Alexander ; Koenig, Christian 
; Huang, Tim ; Yu, Lang 
; Zhang, Yifan 
Subject: [PATCH v2 1/6] drm/amdgpu: add lsdma hw ip definition

This patch is to add lsdma hw ip definition.

Signed-off-by: Yifan Zhang 
Reviewed-by: Tim Huang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c| 15 +++
 drivers/gpu/drm/amd/include/amd_shared.h   |  1 +
 include/uapi/drm/amdgpu_drm.h  |  3 ++-
 5 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b49ad7690a8c..bba9e8da2384 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3418,6 +3418,7 @@ static int amdgpu_device_ip_reinit_late_sriov(struct 
amdgpu_device *adev)
 AMD_IP_BLOCK_TYPE_DCE,
 AMD_IP_BLOCK_TYPE_GFX,
 AMD_IP_BLOCK_TYPE_SDMA,
+   AMD_IP_BLOCK_TYPE_LSDMA,
 AMD_IP_BLOCK_TYPE_MES,
 AMD_IP_BLOCK_TYPE_UVD,
 AMD_IP_BLOCK_TYPE_VCE,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
index b3630ceaff4c..20519ca90fb5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
@@ -52,6 +52,7 @@ static const char *amdgpu_ip_name[AMDGPU_HW_IP_NUM] = {
 [AMDGPU_HW_IP_VCN_ENC]  =   "enc",
 [AMDGPU_HW_IP_VCN_JPEG] =   "jpeg",
 [AMDGPU_HW_IP_VPE]  =   "vpe",
+   [AMDGPU_HW_IP_LSDMA]=   "lsdma",
 };

 void amdgpu_show_fdinfo(struct seq_file *m, struct file *f)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index b5ebafd4a3ad..91ec481377fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -215,6 +215,9 @@ static enum amd_ip_block_type
 case AMDGPU_HW_IP_DMA:
 type = AMD_IP_BLOCK_TYPE_SDMA;
 break;
+   case AMDGPU_HW_IP_LSDMA:
+   type = AMD_IP_BLOCK_TYPE_LSDMA;
+   break;
 case AMDGPU_HW_IP_UVD:
 case AMDGPU_HW_IP_UVD_ENC:
 type = AMD_IP_BLOCK_TYPE_UVD;
@@ -438,6 +441,13 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
 ib_start_alignment = 256;
 ib_size_alignment = 4;
 break;
+   case AMDGPU_HW_IP_LSDMA:
+   type = AMD_IP_BLOCK_TYPE_LSDMA;
+   if (adev->lsdma.ring.sched.ready)
+   ++num_rings;
+   ib_start_alignment = 256;
+   ib_size_alignment = 4;
+   break;
 case AMDGPU_HW_IP_UVD:
 type = AMD_IP_BLOCK_TYPE_UVD;
 for (i = 0; i < adev->uvd.num_uvd_inst; i++) {
@@ -546,6 +556,10 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
 result->ip_discovery_version =
 IP_VERSION_MAJ_MIN_REV(amdgpu_ip_version(adev, 
SDMA0_HWIP, 0));
 break;
+   case AMD_IP_BLOCK_TYPE_LSDMA:
+   result->ip_discovery_version =
+   IP_VERSION_MAJ_MIN_REV(amdgpu_ip_version(adev, 
LSDMA_HWIP, 0));
+   break;
 case AMD_IP_BLOCK_TYPE_UVD:
 case AMD_IP_BLOCK_TYPE_VCN:
 case AMD_IP_BLOCK_TYPE_JPEG:
@@ -679,6 +693,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
 switch (type) {
 case AMD_IP_BLOCK_TYPE_GFX:
 case AMD_IP_BLOCK_TYPE_VCE:
+   case AMD_IP_BLOCK_TYPE_LSDMA:
 count = 1;
 break;
 case AMD_IP_BLOCK_TYPE_SDMA:
diff --git a/drivers/gpu/drm/amd/include/amd_shared.h 
b/drivers/gpu/drm/amd/include/amd_shared.h
index 579977f6ad52..8c03afd4d472 100644
--- a/drivers/gpu/drm/amd/include/amd_shared.h
+++ b/drivers/gpu/drm/amd/include/amd_shared.h
@@ -95,6 +95,7 @@ enum amd_ip_block_type {
 AMD_IP_BLOCK_TYPE_PSP,
 AMD_IP_BLOCK_TYPE_DCE,
 AMD_IP_BLOCK_TYPE_GFX,
+   A

Re: [PATCH] drm/amdgpu: add support to create large TMR BO for APU

2023-08-03 Thread Yu, Lang
[Public]

Good. You can just send that to amd gfx list directly.

Regards,
Lang


发件人: Paneer Selvam, Arunpravin 
发送时间: Friday, August 4, 2023 12:01:08 AM
收件人: Yu, Lang ; amd-gfx@lists.freedesktop.org 

抄送: Koenig, Christian ; Zhang, Yifan 

主题: Re: [PATCH] drm/amdgpu: add support to create large TMR BO for APU

Hi Lang,
This is not the right method to allocate memory if you dont have a specific 
offset for TMR. I will send a patch for your testing in drm buddy
which could fix this problem.

Regards,
Arun.

On 8/3/2023 8:58 PM, Yu, Lang wrote:

[Public]

Ping

发件人: Yu, Lang <mailto:lang...@amd.com>
发送时间: Tuesday, August 1, 2023 3:38:32 PM
收件人: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> 
<mailto:amd-gfx@lists.freedesktop.org>
抄送: Koenig, Christian 
<mailto:christian.koe...@amd.com>; Paneer Selvam, 
Arunpravin 
<mailto:arunpravin.paneersel...@amd.com>; 
Zhang, Yifan <mailto:yifan1.zh...@amd.com>; Yu, Lang 
<mailto:lang...@amd.com>
主题: [PATCH] drm/amdgpu: add support to create large TMR BO for APU

TMR requires physical contiguous memory, amdgpu_bo_create_kernel()
can't satisfy large(>128MB) physical contiguous memory allocation
request with default 512MB VRAM on APU.

When requested TMR size > 128MB, use amdgpu_bo_create_kernel_at()
to create the BO at offset 32MB with a step 1MB in the VRAM range.

Signed-off-by: Lang Yu <mailto:lang...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 38 +++--
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 15217e33b51d..3fadfaa63b2e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -783,6 +783,34 @@ static bool psp_boottime_tmr(struct psp_context *psp)
 }
 }

+static int psp_create_large_tmr_bo_for_apu(struct psp_context *psp,
+  int tmr_size,
+  void **cpu_addr)
+{
+   struct amdgpu_vram_mgr *mgr = &psp->adev->mman.vram_mgr;
+   uint32_t rounded_size = round_up(tmr_size, 0x10);
+   uint32_t start = 0x200;
+   uint32_t step = 0x10;
+   int ret = -ENOMEM;
+
+   for (; start + rounded_size <= mgr->manager.size &&
+start + step <= mgr->manager.size; start += step) {
+
+   ret = amdgpu_bo_create_kernel_at(psp->adev, start, tmr_size,
+   &psp->tmr_bo, cpu_addr);
+   if (ret == -ENOMEM)
+   continue;
+   if (ret)
+   return ret;
+
+   psp->tmr_mc_addr = amdgpu_bo_gpu_offset(psp->tmr_bo);
+
+   break;
+   }
+
+   return ret;
+}
+
 /* Set up Trusted Memory Region */
 static int psp_tmr_init(struct psp_context *psp)
 {
@@ -813,8 +841,13 @@ static int psp_tmr_init(struct psp_context *psp)
 }
 }

-   if (!psp->tmr_bo) {
-   pptr = amdgpu_sriov_vf(psp->adev) ? &tmr_buf : NULL;
+   if (psp->tmr_bo)
+   return 0;
+
+   pptr = amdgpu_sriov_vf(psp->adev) ? &tmr_buf : NULL;
+   if (psp->adev->flags & AMD_IS_APU && tmr_size > 0x800)
+   ret = psp_create_large_tmr_bo_for_apu(psp, tmr_size, pptr);
+   else
 ret = amdgpu_bo_create_kernel(psp->adev, tmr_size,
   PSP_TMR_ALIGNMENT,
   AMDGPU_HAS_VRAM(psp->adev) ?
@@ -822,7 +855,6 @@ static int psp_tmr_init(struct psp_context *psp)
   AMDGPU_GEM_DOMAIN_GTT,
   >tmr_bo, >tmr_mc_addr,
   pptr);
-   }

 return ret;
 }
--
2.25.1




Re: [PATCH] drm/amdgpu: add support to create large TMR BO for APU

2023-08-03 Thread Yu, Lang
[Public]

Ping

发件人: Yu, Lang 
发送时间: Tuesday, August 1, 2023 3:38:32 PM
收件人: amd-gfx@lists.freedesktop.org 
抄送: Koenig, Christian ; Paneer Selvam, Arunpravin 
; Zhang, Yifan ; Yu, 
Lang 
主题: [PATCH] drm/amdgpu: add support to create large TMR BO for APU

TMR requires physical contiguous memory, amdgpu_bo_create_kernel()
can't satisfy large(>128MB) physical contiguous memory allocation
request with default 512MB VRAM on APU.

When requested TMR size > 128MB, use amdgpu_bo_create_kernel_at()
to create the BO at offset 32MB with a step 1MB in the VRAM range.

Signed-off-by: Lang Yu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 38 +++--
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 15217e33b51d..3fadfaa63b2e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -783,6 +783,34 @@ static bool psp_boottime_tmr(struct psp_context *psp)
 }
 }

+static int psp_create_large_tmr_bo_for_apu(struct psp_context *psp,
+  int tmr_size,
+  void **cpu_addr)
+{
+   struct amdgpu_vram_mgr *mgr = &psp->adev->mman.vram_mgr;
+   uint32_t rounded_size = round_up(tmr_size, 0x10);
+   uint32_t start = 0x200;
+   uint32_t step = 0x10;
+   int ret = -ENOMEM;
+
+   for (; start + rounded_size <= mgr->manager.size &&
+start + step <= mgr->manager.size; start += step) {
+
+   ret = amdgpu_bo_create_kernel_at(psp->adev, start, tmr_size,
+   &psp->tmr_bo, cpu_addr);
+   if (ret == -ENOMEM)
+   continue;
+   if (ret)
+   return ret;
+
+   psp->tmr_mc_addr = amdgpu_bo_gpu_offset(psp->tmr_bo);
+
+   break;
+   }
+
+   return ret;
+}
+
 /* Set up Trusted Memory Region */
 static int psp_tmr_init(struct psp_context *psp)
 {
@@ -813,8 +841,13 @@ static int psp_tmr_init(struct psp_context *psp)
 }
 }

-   if (!psp->tmr_bo) {
-   pptr = amdgpu_sriov_vf(psp->adev) ? &tmr_buf : NULL;
+   if (psp->tmr_bo)
+   return 0;
+
+   pptr = amdgpu_sriov_vf(psp->adev) ? &tmr_buf : NULL;
+   if (psp->adev->flags & AMD_IS_APU && tmr_size > 0x800)
+   ret = psp_create_large_tmr_bo_for_apu(psp, tmr_size, pptr);
+   else
 ret = amdgpu_bo_create_kernel(psp->adev, tmr_size,
   PSP_TMR_ALIGNMENT,
   AMDGPU_HAS_VRAM(psp->adev) ?
@@ -822,7 +855,6 @@ static int psp_tmr_init(struct psp_context *psp)
   AMDGPU_GEM_DOMAIN_GTT,
  &psp->tmr_bo, &psp->tmr_mc_addr,
   pptr);
-   }

 return ret;
 }
--
2.25.1



Re: [PATCH v3] drm/amdgpu: refine amdgpu_bo_create_kernel_at()

2023-08-03 Thread Yu, Lang
[Public]

Ping


发件人: Yu, Lang 
发送时间: 星期二, 八月 1, 2023 15:16
收件人: amd-gfx@lists.freedesktop.org 
抄送: Koenig, Christian ; Paneer Selvam, Arunpravin 
; Zhang, Yifan ; Yu, 
Lang 
主题: [PATCH v3] drm/amdgpu: refine amdgpu_bo_create_kernel_at()

Use amdgpu_bo_create_reserved() to create a BO in VRAM
domain would fail if requested VRAM size is large(>128MB)
on APU which usually has a default 512MB VRAM.

That's because VRAM is framgented after several allocations.

The approach is using amdgpu_bo_create_reserved() to
create a BO in CPU domain first, it will always succeed.

v2: Don't overwrite the contents at specific offset.
v3: Don't return GPU addr.

Signed-off-by: Lang Yu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 19 ++-
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index ff73cc11d47e..df5ba9509a41 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -377,27 +377,36 @@ int amdgpu_bo_create_kernel_at(struct amdgpu_device *adev,
 size = ALIGN(size, PAGE_SIZE);

 r = amdgpu_bo_create_reserved(adev, size, PAGE_SIZE,
- AMDGPU_GEM_DOMAIN_VRAM, bo_ptr, NULL,
- cpu_addr);
+ AMDGPU_GEM_DOMAIN_CPU,
+ bo_ptr, NULL, NULL);
 if (r)
 return r;

 if ((*bo_ptr) == NULL)
 return 0;

+   (*bo_ptr)->preferred_domains = AMDGPU_GEM_DOMAIN_VRAM;
+   (*bo_ptr)->allowed_domains = (*bo_ptr)->preferred_domains;
+   (*bo_ptr)->flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
+   (*bo_ptr)->flags |= cpu_addr ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED
+   : AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
+
 /*
  * Remove the original mem node and create a new one at the request
  * position.
  */
-   if (cpu_addr)
-   amdgpu_bo_kunmap(*bo_ptr);
-
 ttm_resource_free(&(*bo_ptr)->tbo, &(*bo_ptr)->tbo.resource);

 for (i = 0; i < (*bo_ptr)->placement.num_placement; ++i) {
 (*bo_ptr)->placements[i].fpfn = offset >> PAGE_SHIFT;
 (*bo_ptr)->placements[i].lpfn = (offset + size) >> PAGE_SHIFT;
+   (*bo_ptr)->placements[i].mem_type = TTM_PL_VRAM;
+   (*bo_ptr)->placements[i].flags = TTM_PL_FLAG_CONTIGUOUS;
+
+   if (!((*bo_ptr)->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED))
+   (*bo_ptr)->placements[i].flags |= TTM_PL_FLAG_TOPDOWN;
 }
+
 r = ttm_bo_mem_space(&(*bo_ptr)->tbo, &(*bo_ptr)->placement,
  &(*bo_ptr)->tbo.resource, );
 if (r)
--
2.25.1



RE: [PATCH] drm/amdgpu: unmap and remove csa_va properly

2023-06-06 Thread Yu, Lang
[Public]

Ping. Hi Christian, what do you think?

Regards,
Lang

>-Original Message-
>From: Yu, Lang 
>Sent: Saturday, May 6, 2023 3:03 PM
>To: amd-gfx@lists.freedesktop.org
>Cc: Liu, Monk ; Koenig, Christian
>; Yu, Lang 
>Subject: [PATCH] drm/amdgpu: unmap and remove csa_va properly
>
>Root PD BO should be reserved before unmap and remove a bo_va from VM
>otherwise lockdep will complain.
>
>[14616.936827] WARNING: CPU: 6 PID: 1711 at
>drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c:1762
>amdgpu_vm_bo_del+0x399/0x3f0 [amdgpu] [14616.937096] Call Trace:
>[14616.937097]  
>[14616.937102]  amdgpu_driver_postclose_kms+0x249/0x2f0 [amdgpu]
>[14616.937187]  drm_file_free+0x1d6/0x300 [drm] [14616.937207]
>drm_close_helper.isra.0+0x62/0x70 [drm] [14616.937220]
>drm_release+0x5e/0x100 [drm] [14616.937234]  __fput+0x9f/0x280
>[14616.937239]  fput+0xe/0x20 [14616.937241]
>task_work_run+0x61/0x90 [14616.937246]
>exit_to_user_mode_prepare+0x215/0x220
>[14616.937251]  syscall_exit_to_user_mode+0x2a/0x60
>[14616.937254]  do_syscall_64+0x48/0x90
>[14616.937257]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
>
>Signed-off-by: Lang Yu 
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c | 38
>+
>drivers/gpu/drm/amd/amdgpu/amdgpu_csa.h |  3 ++
>drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  9 +++---
> 3 files changed, 45 insertions(+), 5 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>index c6d4d41c4393..23d054526e7c 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>@@ -106,3 +106,41 @@ int amdgpu_map_static_csa(struct amdgpu_device
>*adev, struct amdgpu_vm *vm,
>   ttm_eu_backoff_reservation(, );
>   return 0;
> }
>+
>+int amdgpu_unmap_static_csa(struct amdgpu_device *adev, struct
>amdgpu_vm *vm,
>+  struct amdgpu_bo *bo, struct amdgpu_bo_va
>*bo_va,
>+  uint64_t csa_addr)
>+{
>+  struct ww_acquire_ctx ticket;
>+  struct list_head list;
>+  struct amdgpu_bo_list_entry pd;
>+  struct ttm_validate_buffer csa_tv;
>+  int r;
>+
>+  INIT_LIST_HEAD();
>+  INIT_LIST_HEAD(_tv.head);
>+  csa_tv.bo = >tbo;
>+  csa_tv.num_shared = 1;
>+
>+  list_add(_tv.head, );
>+  amdgpu_vm_get_pd_bo(vm, , );
>+
>+  r = ttm_eu_reserve_buffers(, , true, NULL);
>+  if (r) {
>+  DRM_ERROR("failed to reserve CSA,PD BOs: err=%d\n", r);
>+  return r;
>+  }
>+
>+  r = amdgpu_vm_bo_unmap(adev, bo_va, csa_addr);
>+  if (r) {
>+  DRM_ERROR("failed to do bo_unmap on static CSA,
>err=%d\n", r);
>+  ttm_eu_backoff_reservation(, );
>+  return r;
>+  }
>+
>+  amdgpu_vm_bo_del(adev, bo_va);
>+
>+  ttm_eu_backoff_reservation(, );
>+
>+  return 0;
>+}
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.h
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.h
>index 524b4437a021..7dfc1f2012eb 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.h
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.h
>@@ -34,6 +34,9 @@ int amdgpu_allocate_static_csa(struct amdgpu_device
>*adev, struct amdgpu_bo **bo  int amdgpu_map_static_csa(struct
>amdgpu_device *adev, struct amdgpu_vm *vm,
> struct amdgpu_bo *bo, struct amdgpu_bo_va
>**bo_va,
> uint64_t csa_addr, uint32_t size);
>+int amdgpu_unmap_static_csa(struct amdgpu_device *adev, struct
>amdgpu_vm *vm,
>+  struct amdgpu_bo *bo, struct amdgpu_bo_va
>*bo_va,
>+  uint64_t csa_addr);
> void amdgpu_free_static_csa(struct amdgpu_bo **bo);
>
> #endif
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>index 1d3b224b8b28..6b47ac3eb40d 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>@@ -1307,11 +1307,10 @@ void amdgpu_driver_postclose_kms(struct
>drm_device *dev,
>   amdgpu_vce_free_handles(adev, file_priv);
>
>   if (amdgpu_mcbp) {
>-  /* TODO: how to handle reserve failure */
>-  BUG_ON(amdgpu_bo_reserve(adev->virt.csa_obj, true));
>-  amdgpu_vm_bo_del(adev, fpriv->csa_va);
>-  fpriv->csa_va = NULL;
>-  amdgpu_bo_unreserve(adev->virt.csa_obj);
>+  uint64_t csa_addr = amdgpu_csa_vaddr(adev) &
>AMDGPU_GMC_HOLE_MASK;
>+
>+  WARN_ON(amdgpu_unmap_static_csa(adev, >vm,
>adev->virt.csa_obj,
>+  fpriv->csa_va, csa_addr));
>   }
>
>   pasid = fpriv->vm.pasid;
>--
>2.25.1



RE: [PATCH 3/3] drm/amdkfd: remove an unnecessary amdgpu_bo_ref

2022-07-27 Thread Yu, Lang
[Public]

Ping for this single patch. 

>-Original Message-
>From: Yu, Lang 
>Sent: Monday, July 25, 2022 6:32 PM
>To: amd-gfx@lists.freedesktop.org
>Cc: Kuehling, Felix ; Koenig, Christian
>; Deucher, Alexander
>; Huang, Ray ; Yu, Lang
>
>Subject: [PATCH 3/3] drm/amdkfd: remove an unnecessary amdgpu_bo_ref
>
>No need to reference the BO here, dmabuf framework will handle that.
>
>Signed-off-by: Lang Yu 
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 1 -
> 1 file changed, 1 deletion(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>index c1855b72a3f0..802c762108b2 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>@@ -827,7 +827,6 @@ kfd_mem_attach_dmabuf(struct amdgpu_device *adev,
>struct kgd_mem *mem,
>
>   *bo = gem_to_amdgpu_bo(gobj);
>   (*bo)->flags |= AMDGPU_GEM_CREATE_PREEMPTIBLE;
>-  (*bo)->parent = amdgpu_bo_ref(mem->bo);
>
>   return 0;
> }
>--
>2.25.1


RE: [PATCH v3] drm/amdkfd: simplify vm_validate_pt_pd_bos

2022-06-30 Thread Yu, Lang
[AMD Official Use Only - General]

Ping!

Hi Felix, what do you think? Thanks!

Regards,
Lang

>-Original Message-
>From: Koenig, Christian 
>Sent: Tuesday, June 14, 2022 5:08 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Kuehling, Felix ; Deucher, Alexander
>; Huang, Ray 
>Subject: Re: [PATCH v3] drm/amdkfd: simplify vm_validate_pt_pd_bos
>
>Am 14.06.22 um 11:03 schrieb Lang Yu:
>> We don't need to validate and map root PD specially here, it would be
>> validated and mapped by amdgpu_vm_validate_pt_bos if it is evicted.
>>
>> The special case is when turning a GFX VM to a compute VM, if
>> vm_update_mode changed, we should make sure root PD gets mapped. So
>> just map root PD after updating vm->update_funcs in
>> amdgpu_vm_make_compute whether the vm_update_mode changed or
>not.
>>
>> v3:
>>   - Add some comments suggested by Christian.
>>
>> v2:
>>   - Don't rename vm_validate_pt_pd_bos and make it public.
>>
>> Signed-off-by: Lang Yu 
>
>I can't judge the kfd part, but the VM stuff looks good to me now.
>
>Acked-by: Christian König 
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 14 -
>-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   |  8 
>>   2 files changed, 8 insertions(+), 14 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 50bb590c3306..c9ef242177e2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -439,22 +439,8 @@ static int vm_validate_pt_pd_bos(struct
>amdgpu_vm *vm)
>>  return ret;
>>  }
>>
>> -ret = amdgpu_amdkfd_validate_vm_bo(NULL, pd);
>> -if (ret) {
>> -pr_err("failed to validate PD\n");
>> -return ret;
>> -}
>> -
>>  vm->pd_phys_addr = amdgpu_gmc_pd_addr(vm->root.bo);
>>
>> -if (vm->use_cpu_for_update) {
>> -ret = amdgpu_bo_kmap(pd, NULL);
>> -if (ret) {
>> -pr_err("failed to kmap PD, ret=%d\n", ret);
>> -return ret;
>> -}
>> -}
>> -
>>  return 0;
>>   }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 703552f9a6d7..3a6b827e540c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -2225,6 +2225,14 @@ int amdgpu_vm_make_compute(struct
>amdgpu_device *adev, struct amdgpu_vm *vm)
>>  } else {
>>  vm->update_funcs = _vm_sdma_funcs;
>>  }
>> +/*
>> + * Make sure root PD gets mapped. As vm_update_mode could be
>changed
>> + * when turning a GFX VM into a compute VM.
>> + */
>> +r = vm->update_funcs->map_table(to_amdgpu_bo_vm(vm-
>>root.bo));
>> +if (r)
>> +goto unreserve_bo;
>> +
>>  dma_fence_put(vm->last_update);
>>  vm->last_update = NULL;
>>  vm->is_compute_context = true;


RE: [PATCH] drm/amdgpu: add safeguards for accessing mmhub CG registers

2022-01-27 Thread Yu, Lang
[Public]

>  Should we set *flags = 0 before we return?

That will clear other bit masks. Actually, the caller initializes flags to 0, we 
can just return.
Or  just *flags &= ~( AMD_CG_SUPPORT_XXX) before we return.

Regards,
Lang

From: Deucher, Alexander 
Sent: Thursday, January 27, 2022 4:27 AM
To: Yu, Lang ; amd-gfx@lists.freedesktop.org
Cc: Lazar, Lijo ; Huang, Ray 
Subject: Re: [PATCH] drm/amdgpu: add safeguards for accessing mmhub CG registers


[Public]

Should we set *flags = 0 before we return?

Alex


From: Yu, Lang mailto:lang...@amd.com>>
Sent: Wednesday, January 26, 2022 2:53 AM
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> 
mailto:amd-gfx@lists.freedesktop.org>>
Cc: Deucher, Alexander 
mailto:alexander.deuc...@amd.com>>; Lazar, Lijo 
mailto:lijo.la...@amd.com>>; Huang, Ray 
mailto:ray.hu...@amd.com>>; Yu, Lang 
mailto:lang...@amd.com>>
Subject: [PATCH] drm/amdgpu: add safeguards for accessing mmhub CG registers

We observed a gpu hang when querying mmhub CG status(i.e.,
cat amdgpu_pm_info) on cyan skillfish. Actually, cyan
skillfish doesn't support any CG features.

Only allow asics which support CG features accessing related
registers. Will add similar safeguards for other IPs in the
future.

Signed-off-by: Lang Yu mailto:lang...@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c | 3 +++
 5 files changed, 15 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
index 4c9f0c0f3116..1869e2019461 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
@@ -550,6 +550,9 @@ static void mmhub_v1_0_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
 {
 int data, data1;

+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
 if (amdgpu_sriov_vf(adev))
 *flags = 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
index 3b901f941627..f7b9843b36e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
@@ -546,6 +546,9 @@ static void mmhub_v1_7_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
 {
 int data, data1;

+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
 if (amdgpu_sriov_vf(adev))
 *flags = 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
index 3718ff610ab2..3f5f326379b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
@@ -686,6 +686,9 @@ static void mmhub_v2_0_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
 {
 int data, data1;

+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
 if (amdgpu_sriov_vf(adev))
 *flags = 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c
index 9e16da28505a..b23dd9ddfb5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c
@@ -580,6 +580,9 @@ static void mmhub_v2_3_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
 {
 int data, data1, data2, data3;

+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
 if (amdgpu_sriov_vf(adev))
 *flags = 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
index 619106f7d23d..a2d5c8424e2b 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
@@ -651,6 +651,9 @@ static void mmhub_v9_4_get_clockgating(struct amdgpu_device 
*adev, u32 *flags)
 {
 int data, data1;

+   if (!(adev->cg_flags & (AMD_CG_SUPPORT_MC_MGCG | AMD_CG_SUPPORT_MC_LS)))
+   return;
+
 if (amdgpu_sriov_vf(adev))
 *flags = 0;

--
2.25.1


RE: [PATCH] drm/amdgpu: add safeguards for accessing mmhub CG registers

2022-01-26 Thread Yu, Lang
[Public]

Hi Lijo,

For cyan skillfish, both adev->cg_flags and adev->pg_flags are zero. I just 
found "RREG32_SOC15(MMHUB, 0, mmMM_ATC_L2_MISC_CG);" in 
mmhub_v2_0_get_clockgating() caused a gpu hang(cat amdgpu_pm_info). I didn't 
check if it's some sort of PG which causes the issue.

Regards,
Lang

From: Lazar, Lijo 
Sent: Wednesday, January 26, 2022 8:06 PM
To: Yu, Lang ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Huang, Ray 
; Yu, Lang 
Subject: Re: [PATCH] drm/amdgpu: add safeguards for accessing mmhub CG registers


[Public]

Hi Lang,

There are ASICs in which driver doesn't enable CG, and then these flags will be 
false. However, the CG will be enabled by another component like VBIOS. Driver 
still reports the CG status even though driver doesn't enable it. For those, 
this logic doesn't work.

BTW, could you check if it's some sort of PG which causes the issue?

Thanks,
Lijo


RE: [PATCH 2/2] drm/amdgpu: allow APU to send power gate message when dpm is disabled

2021-12-06 Thread Yu, Lang
[Public]



>-Original Message-
>From: Lazar, Lijo 
>Sent: Friday, December 3, 2021 5:52 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>
>Subject: Re: [PATCH 2/2] drm/amdgpu: allow APU to send power gate message
>when dpm is disabled
>
>
>
>On 12/3/2021 12:24 PM, Lang Yu wrote:
>> The general hw fini sequence is SMU-> ... ->SDMA-> ...
>> We need to send power gate message to power off SDMA(in SDMA
>> hw_fini()) after dpm is disabled(in SMU hw_fini()). Allow that for APU.
>
>This message is not right. In APUs there is no message provided by FW to
>enable/disable DPM, it is done in BIOS. Rephrase to something like after smu
>hw_fini is completed.

It is power on/off SDMA message. Not enable/disable DPM.

>>
>> Signed-off-by: Lang Yu 
>> ---
>>   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> index 2d718c30c8eb..285a237f3605 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> @@ -277,7 +277,7 @@ static int smu_dpm_set_power_gate(void *handle,
>>  struct smu_context *smu = handle;
>>  int ret = 0;
>>
>> -if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled) {
>> +if (!smu->pm_enabled || (!smu->is_apu &&
>> +!smu->adev->pm.dpm_enabled)) {
>
>
>This check was there before also, only the WARN is added. That means it was
>skipping sending messages in APUs also and so far this was working fine (until 
>this
>gets noticed because of the warning).
>
>Now this would try to send the message to APU without any check. That doesn't
>look good. Ideal way should be to fix the sequence. Otherwise, suggest to do
>something like below as the last step of smu hw cleanup rather than sending the
>message blindly.
>
>   if (smu->is_apu)
>   smu->pm.dpm_enabled = smu_is_dpm_running(smu);

smu_is_dpm_running(smu) will cause errors in suspend.

Here we just  send some IP power on/off messages. 
Is it necessary to enable DPM to send such messages?

Regards,
Lang

>Thanks,
>Lijo
>
>>  dev_WARN(smu->adev->dev,
>>   "SMU uninitialized but power %s requested for %u!\n",
>>   gate ? "gate" : "ungate", block_type);
>>


RE: [PATCH 2/2] drm/amdgpu: allow APU to send power gate message when dpm is disabled

2021-12-06 Thread Yu, Lang
[Public]



>-Original Message-
>From: Lazar, Lijo 
>Sent: Monday, December 6, 2021 11:41 AM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>
>Subject: Re: [PATCH 2/2] drm/amdgpu: allow APU to send power gate message
>when dpm is disabled
>
>
>
>On 12/6/2021 8:19 AM, Yu, Lang wrote:
>> [Public]
>>
>>
>>
>>> -Original Message-
>>> From: Lazar, Lijo 
>>> Sent: Friday, December 3, 2021 5:52 PM
>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander ; Huang, Ray
>>> 
>>> Subject: Re: [PATCH 2/2] drm/amdgpu: allow APU to send power gate
>>> message when dpm is disabled
>>>
>>>
>>>
>>> On 12/3/2021 12:24 PM, Lang Yu wrote:
>>>> The general hw fini sequence is SMU-> ... ->SDMA-> ...
>>>> We need to send power gate message to power off SDMA(in SDMA
>>>> hw_fini()) after dpm is disabled(in SMU hw_fini()). Allow that for APU.
>>>
>>> This message is not right. In APUs there is no message provided by FW
>>> to enable/disable DPM, it is done in BIOS. Rephrase to something like
>>> after smu hw_fini is completed.
>>
>> It is power on/off SDMA message. Not enable/disable DPM.
>>
>Bad choice of word :) I didn't mean FW message, it was about this line in 
>"commit
>message" - "afer dpm is disabled".

Ok. I got it.

>
>>>>
>>>> Signed-off-by: Lang Yu 
>>>> ---
>>>>drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 2 +-
>>>>1 file changed, 1 insertion(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>>>> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>>>> index 2d718c30c8eb..285a237f3605 100644
>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>>>> @@ -277,7 +277,7 @@ static int smu_dpm_set_power_gate(void *handle,
>>>>struct smu_context *smu = handle;
>>>>int ret = 0;
>>>>
>>>> -  if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled) {
>>>> +  if (!smu->pm_enabled || (!smu->is_apu &&
>>>> +!smu->adev->pm.dpm_enabled)) {
>>>
>>>
>>> This check was there before also, only the WARN is added. That means
>>> it was skipping sending messages in APUs also and so far this was
>>> working fine (until this gets noticed because of the warning).
>>>
>>> Now this would try to send the message to APU without any check. That
>>> doesn't look good. Ideal way should be to fix the sequence.
>>> Otherwise, suggest to do something like below as the last step of smu
>>> hw cleanup rather than sending the message blindly.
>>>
>>> if (smu->is_apu)
>>> smu->pm.dpm_enabled = smu_is_dpm_running(smu);
>>
>> smu_is_dpm_running(smu) will cause errors in suspend.
>>
>That is interesting. What is the error you get?

[drm:amdgpu_dpm_enable_uvd [amdgpu]] *ERROR* Dpm enable uvd failed, ret = -95
That means EOPNOTSUPP.

Actually, in resume process, but adev->in_suspend  is still false.
For Renoir series APU, smu_is_dpm_running is hardcoded as following,

static bool renoir_is_dpm_running(struct smu_context *smu)
{
struct amdgpu_device *adev = smu->adev;

/*
 * Until now, the pmfw hasn't exported the interface of SMU
 * feature mask to APU SKU so just force on all the feature
 * at early initial stage.
 */
if (adev->in_suspend)
return false;
else
return true;

}

So we got such an error.

Regards,
Lang
  
>Thanks,
>Lijo
>
>> Here we just  send some IP power on/off messages.
>> Is it necessary to enable DPM to send such messages?
>>
>> Regards,
>> Lang
>>
>>> Thanks,
>>> Lijo
>>>
>>>>dev_WARN(smu->adev->dev,
>>>> "SMU uninitialized but power %s requested for 
>>>> %u!\n",
>>>> gate ? "gate" : "ungate", block_type);
>>>>


RE: [PATCH 2/2] drm/amdgpu: allow APU to send power gate message when dpm is disabled

2021-12-06 Thread Yu, Lang
[Public]

A typo.

>-Original Message-
>From: Yu, Lang
>Sent: Monday, December 6, 2021 2:47 PM
>To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>
>Subject: RE: [PATCH 2/2] drm/amdgpu: allow APU to send power gate message
>when dpm is disabled
>
>[Public]
>
>
>
>>-Original Message-
>>From: Lazar, Lijo 
>>Sent: Monday, December 6, 2021 11:41 AM
>>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>Cc: Deucher, Alexander ; Huang, Ray
>>
>>Subject: Re: [PATCH 2/2] drm/amdgpu: allow APU to send power gate
>>message when dpm is disabled
>>
>>
>>
>>On 12/6/2021 8:19 AM, Yu, Lang wrote:
>>> [Public]
>>>
>>>
>>>
>>>> -Original Message-
>>>> From: Lazar, Lijo 
>>>> Sent: Friday, December 3, 2021 5:52 PM
>>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>>> Cc: Deucher, Alexander ; Huang, Ray
>>>> 
>>>> Subject: Re: [PATCH 2/2] drm/amdgpu: allow APU to send power gate
>>>> message when dpm is disabled
>>>>
>>>>
>>>>
>>>> On 12/3/2021 12:24 PM, Lang Yu wrote:
>>>>> The general hw fini sequence is SMU-> ... ->SDMA-> ...
>>>>> We need to send power gate message to power off SDMA(in SDMA
>>>>> hw_fini()) after dpm is disabled(in SMU hw_fini()). Allow that for APU.
>>>>
>>>> This message is not right. In APUs there is no message provided by
>>>> FW to enable/disable DPM, it is done in BIOS. Rephrase to something
>>>> like after smu hw_fini is completed.
>>>
>>> It is power on/off SDMA message. Not enable/disable DPM.
>>>
>>Bad choice of word :) I didn't mean FW message, it was about this line
>>in "commit message" - "afer dpm is disabled".
>
>Ok. I got it.
>
>>
>>>>>
>>>>> Signed-off-by: Lang Yu 
>>>>> ---
>>>>>drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 2 +-
>>>>>1 file changed, 1 insertion(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>>>>> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>>>>> index 2d718c30c8eb..285a237f3605 100644
>>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>>>>> @@ -277,7 +277,7 @@ static int smu_dpm_set_power_gate(void *handle,
>>>>>   struct smu_context *smu = handle;
>>>>>   int ret = 0;
>>>>>
>>>>> - if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled) {
>>>>> + if (!smu->pm_enabled || (!smu->is_apu &&
>>>>> +!smu->adev->pm.dpm_enabled)) {
>>>>
>>>>
>>>> This check was there before also, only the WARN is added. That means
>>>> it was skipping sending messages in APUs also and so far this was
>>>> working fine (until this gets noticed because of the warning).
>>>>
>>>> Now this would try to send the message to APU without any check.
>>>> That doesn't look good. Ideal way should be to fix the sequence.
>>>> Otherwise, suggest to do something like below as the last step of
>>>> smu hw cleanup rather than sending the message blindly.
>>>>
>>>>if (smu->is_apu)
>>>>smu->pm.dpm_enabled = smu_is_dpm_running(smu);
>>>
>>> smu_is_dpm_running(smu) will cause errors in suspend.
>>>
>>That is interesting. What is the error you get?
>
>[drm:amdgpu_dpm_enable_uvd [amdgpu]] *ERROR* Dpm enable uvd failed, ret =
>-95 That means EOPNOTSUPP.
>
>Actually, in resume process, but adev->in_suspend  is still true.
>For Renoir series APU, smu_is_dpm_running is hardcoded as following,
>
>static bool renoir_is_dpm_running(struct smu_context *smu) {
>   struct amdgpu_device *adev = smu->adev;
>
>   /*
>* Until now, the pmfw hasn't exported the interface of SMU
>* feature mask to APU SKU so just force on all the feature
>* at early initial stage.
>*/
>   if (adev->in_suspend)
>   return false;
>   else
>   return true;
>
>}
>
>So we got such an error.
>
>Regards,
>Lang
>
>>Thanks,
>>Lijo
>>
>>> Here we just  send some IP power on/off messages.
>>> Is it necessary to enable DPM to send such messages?
>>>
>>> Regards,
>>> Lang
>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>>>   dev_WARN(smu->adev->dev,
>>>>>"SMU uninitialized but power %s requested for 
>>>>> %u!\n",
>>>>>gate ? "gate" : "ungate", block_type);
>>>>>


RE: [PATCH] drm/amdgpu: add support to SMU debug option

2021-12-02 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Grodzovsky, Andrey 
>Sent: Thursday, December 2, 2021 12:01 AM
>To: Christian König ; Yu, Lang
>; Koenig, Christian ; amd-
>g...@lists.freedesktop.org
>Cc: Deucher, Alexander ; Lazar, Lijo
>; Huang, Ray 
>Subject: Re: [PATCH] drm/amdgpu: add support to SMU debug option
>
>
>On 2021-12-01 8:11 a.m., Christian König wrote:
>> Adding Andrey as well.
>>
>> Am 01.12.21 um 12:37 schrieb Yu, Lang:
>>> [SNIP]
>>>>>>>>> + BUG_ON(unlikely(smu->smu_debug_mode) && res);
>>>>>>>> BUG_ON() really crashes the kernel and is only allowed if we
>>>>>>>> prevent further data corruption with that.
>>>>>>>>
>>>>>>>> Most of the time WARN_ON() is more appropriate, but I can't
>>>>>>>> fully judge here since I don't know the SMU code well enough.
>>>>>>> This is what SMU FW guys want. They want "user-visible
>>>>>>> (potentially
>>>>>>> fatal)
>>>>>> errors", then a hang.
>>>>>>> They want to keep system state since the error occurred.
>>>>>> Well that is rather problematic.
>>>>>>
>>>>>> First of all we need to really justify that, crashing the kernel
>>>>>> is not something easily done.
>>>>>>
>>>>>> Then this isn't really effective here. What happens is that you
>>>>>> crash the kernel thread of the currently executing process, but it
>>>>>> is perfectly possible that another thread still tries to send
>>>>>> messages to the SMU. You need to have the BUG_ON() before dropping
>>>>>> the lock to make sure that this really gets the driver stuck in
>>>>>> the current state.
>>>>> Thanks. I got it. I just thought it is a kernel panic.
>>>>> Could we use a panic() here?
>>>> Potentially, but that might reboot the system automatically which is
>>>> probably not what you want either.
>>>>
>>>> How does the SMU firmware team gather the necessary information when
>>>> a problem occurs?
>>> As far as I know, they usually use a HDT to collect information.
>>> And they request a hang when error occurred in ticket.
>>> "Suggested error responses include pop-up windows (by x86 driver, if
>>> this is possible) or simply hanging after logging the error."
>>
>> In that case I suggest to set the "don't_touch_the_hardware_any_more"
>> procedure we also use in case of PCIe hotplug.
>>
>> Andrey has the details but essentially it stops the driver from
>> touching the hardware any more, signals all fences and unblocks
>> everything.
>>
>> It should then be trivial to inspect the hardware state and see what's
>> going on, but the system will keep stable at least for SSH access.
>>
>> Might be a good idea to have that mode for other fault cases like page
>> faults and hardware crashes.
>>
>> Regards,
>> Christian.
>
>
>There is no one specific function that does all of that, what I think can be 
>done is
>to bring the device to kind of halt state where no one touches it - as 
>following -
>
>1) Follow amdpgu_pci_remove -
>
>     drm_dev_unplug to make device inaccessible to user space (IOCTLs
>e.t.c.) and clears MMIO mappings to device and disallows remappings through
>page faults
>
>     No need to call all of amdgpu_driver_unload_kms but, within it call
>amdgpu_irq_disable_all and amdgpu_fence_driver_hw_fini to disable interrupts
>and force signal all HW fences.
>
>     pci_disable_device and pci_wait_for_pending_transaction to flush any in 
> flight
>DMA operations from device
>
>2) set adev->no_hw_access so that most of places we access HW (all subsequent
>registers reads/writes and SMU/PSP message sending is skipped, but some race
>will be with those already in progress so maybe adding some wait)
>
>Andrey

Thanks for Christian's advice and Andrey's clarifications about that.
It seems that we should also handle kfd related stuff.

Regards,
Lang

>
>
>>
>>>
>>> Regards,
>>> Lang
>>>
>>

RE: [PATCH] drm/amdgpu: add support to SMU debug option

2021-12-01 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Quan, Evan 
>Sent: Thursday, December 2, 2021 10:48 AM
>To: Yu, Lang ; Koenig, Christian
>; Christian König
>; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Lazar, Lijo
>; Huang, Ray 
>Subject: RE: [PATCH] drm/amdgpu: add support to SMU debug option
>
>[AMD Official Use Only]
>
>
>
>> -Original Message-
>> From: amd-gfx  On Behalf Of Yu,
>> Lang
>> Sent: Wednesday, December 1, 2021 7:37 PM
>> To: Koenig, Christian ; Christian König
>> ; amd-gfx@lists.freedesktop.org
>> Cc: Deucher, Alexander ; Lazar, Lijo
>> ; Huang, Ray 
>> Subject: RE: [PATCH] drm/amdgpu: add support to SMU debug option
>>
>> [AMD Official Use Only]
>>
>>
>>
>> >-Original Message-
>> >From: Koenig, Christian 
>> >Sent: Wednesday, December 1, 2021 7:29 PM
>> >To: Yu, Lang ; Christian König
>> >; amd-gfx@lists.freedesktop.org
>> >Cc: Deucher, Alexander ; Lazar, Lijo
>> >; Huang, Ray 
>> >Subject: Re: [PATCH] drm/amdgpu: add support to SMU debug option
>> >
>> >Am 01.12.21 um 12:20 schrieb Yu, Lang:
>> >> [AMD Official Use Only]
>> >>
>> >>> -Original Message-
>> >>> From: Christian König 
>> >>> Sent: Wednesday, December 1, 2021 6:49 PM
>> >>> To: Yu, Lang ; Koenig, Christian
>> >>> ; amd-gfx@lists.freedesktop.org
>> >>> Cc: Deucher, Alexander ; Lazar, Lijo
>> >>> ; Huang, Ray 
>> >>> Subject: Re: [PATCH] drm/amdgpu: add support to SMU debug option
>> >>>
>> >>> Am 01.12.21 um 11:44 schrieb Yu, Lang:
>> >>>> [AMD Official Use Only]
>> >>>>
>> >>>>
>> >>>>
>> >>>>> -Original Message-
>> >>>>> From: Koenig, Christian 
>> >>>>> Sent: Wednesday, December 1, 2021 5:30 PM
>> >>>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>> >>>>> Cc: Deucher, Alexander ; Lazar, Lijo
>> >>>>> ; Huang, Ray 
>> >>>>> Subject: Re: [PATCH] drm/amdgpu: add support to SMU debug option
>> >>>>>
>> >>>>> Am 01.12.21 um 10:24 schrieb Lang Yu:
>> >>>>>> To maintain system error state when SMU errors occurred, which
>> >>>>>> will aid in debugging SMU firmware issues, add SMU debug option
>> support.
>> >>>>>>
>> >>>>>> It can be enabled or disabled via amdgpu_smu_debug debugfs file.
>> >>>>>> When enabled, it makes SMU errors fatal.
>> >>>>>> It is disabled by default.
>> >>>>>>
>> >>>>>> == Command Guide ==
>> >>>>>>
>> >>>>>> 1, enable SMU debug option
>> >>>>>>
>> >>>>>> # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>> >>>>>>
>> >>>>>> 2, disable SMU debug option
>> >>>>>>
>> >>>>>> # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>> >>>>>>
>> >>>>>> v3:
>> >>>>>> - Use debugfs_create_bool().(Christian)
>> >>>>>> - Put variable into smu_context struct.
>> >>>>>> - Don't resend command when timeout.
>> >>>>>>
>> >>>>>> v2:
>> >>>>>> - Resend command when timeout.(Lijo)
>> >>>>>> - Use debugfs file instead of module parameter.
>> >>>>>>
>> >>>>>> Signed-off-by: Lang Yu 
>> >>>>> Well the debugfs part looks really nice and clean now, but one
>> >>>>> more comment below.
>> >>>>>
>> >>>>>> ---
>> >>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c| 3 +++
>> >>>>>> drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h| 5 +
>> >>>>>> drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 2 ++
>> >>>>>> drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 8 +++-
>> >>>>>> 4 files changed, 17 insertions(+), 1 deletion(-)
>> >>>>>>
>> >>>>>> diff --git a/drivers/gpu/drm

RE: [PATCH] drm/amdgpu: add support to SMU debug option

2021-12-01 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Koenig, Christian 
>Sent: Wednesday, December 1, 2021 7:29 PM
>To: Yu, Lang ; Christian König
>; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Lazar, Lijo
>; Huang, Ray 
>Subject: Re: [PATCH] drm/amdgpu: add support to SMU debug option
>
>Am 01.12.21 um 12:20 schrieb Yu, Lang:
>> [AMD Official Use Only]
>>
>>> -Original Message-
>>> From: Christian König 
>>> Sent: Wednesday, December 1, 2021 6:49 PM
>>> To: Yu, Lang ; Koenig, Christian
>>> ; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander ; Lazar, Lijo
>>> ; Huang, Ray 
>>> Subject: Re: [PATCH] drm/amdgpu: add support to SMU debug option
>>>
>>> Am 01.12.21 um 11:44 schrieb Yu, Lang:
>>>> [AMD Official Use Only]
>>>>
>>>>
>>>>
>>>>> -Original Message-
>>>>> From: Koenig, Christian 
>>>>> Sent: Wednesday, December 1, 2021 5:30 PM
>>>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>>>> Cc: Deucher, Alexander ; Lazar, Lijo
>>>>> ; Huang, Ray 
>>>>> Subject: Re: [PATCH] drm/amdgpu: add support to SMU debug option
>>>>>
>>>>> Am 01.12.21 um 10:24 schrieb Lang Yu:
>>>>>> To maintain system error state when SMU errors occurred, which
>>>>>> will aid in debugging SMU firmware issues, add SMU debug option support.
>>>>>>
>>>>>> It can be enabled or disabled via amdgpu_smu_debug debugfs file.
>>>>>> When enabled, it makes SMU errors fatal.
>>>>>> It is disabled by default.
>>>>>>
>>>>>> == Command Guide ==
>>>>>>
>>>>>> 1, enable SMU debug option
>>>>>>
>>>>>> # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>>>
>>>>>> 2, disable SMU debug option
>>>>>>
>>>>>> # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>>>
>>>>>> v3:
>>>>>> - Use debugfs_create_bool().(Christian)
>>>>>> - Put variable into smu_context struct.
>>>>>> - Don't resend command when timeout.
>>>>>>
>>>>>> v2:
>>>>>> - Resend command when timeout.(Lijo)
>>>>>> - Use debugfs file instead of module parameter.
>>>>>>
>>>>>> Signed-off-by: Lang Yu 
>>>>> Well the debugfs part looks really nice and clean now, but one more
>>>>> comment below.
>>>>>
>>>>>> ---
>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c| 3 +++
>>>>>> drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h| 5 +
>>>>>> drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 2 ++
>>>>>> drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 8 +++-
>>>>>> 4 files changed, 17 insertions(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> index 164d6a9e9fbb..86cd888c7822 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> @@ -1618,6 +1618,9 @@ int amdgpu_debugfs_init(struct amdgpu_device
>>>>> *adev)
>>>>>>  if (!debugfs_initialized())
>>>>>>  return 0;
>>>>>>
>>>>>> +debugfs_create_bool("amdgpu_smu_debug", 0600, root,
>>>>>> +  >smu.smu_debug_mode);
>>>>>> +
>>>>>>  ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root,
>adev,
>>>>>>_ib_preempt);
>>>>>>  if (IS_ERR(ent)) {
>>>>>> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>>>> b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>>>> index f738f7dc20c9..50dbf5594a9d 100644
>>>>>> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>>>> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>>>> @@ -569,6 +569,11 @@ struct smu_context
>>>>>

RE: [PATCH] drm/amdgpu: add support to SMU debug option

2021-12-01 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Christian König 
>Sent: Wednesday, December 1, 2021 6:49 PM
>To: Yu, Lang ; Koenig, Christian
>; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Lazar, Lijo
>; Huang, Ray 
>Subject: Re: [PATCH] drm/amdgpu: add support to SMU debug option
>
>Am 01.12.21 um 11:44 schrieb Yu, Lang:
>> [AMD Official Use Only]
>>
>>
>>
>>> -Original Message-
>>> From: Koenig, Christian 
>>> Sent: Wednesday, December 1, 2021 5:30 PM
>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander ; Lazar, Lijo
>>> ; Huang, Ray 
>>> Subject: Re: [PATCH] drm/amdgpu: add support to SMU debug option
>>>
>>> Am 01.12.21 um 10:24 schrieb Lang Yu:
>>>> To maintain system error state when SMU errors occurred, which will
>>>> aid in debugging SMU firmware issues, add SMU debug option support.
>>>>
>>>> It can be enabled or disabled via amdgpu_smu_debug debugfs file.
>>>> When enabled, it makes SMU errors fatal.
>>>> It is disabled by default.
>>>>
>>>> == Command Guide ==
>>>>
>>>> 1, enable SMU debug option
>>>>
>>>># echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>
>>>> 2, disable SMU debug option
>>>>
>>>># echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>
>>>> v3:
>>>>- Use debugfs_create_bool().(Christian)
>>>>- Put variable into smu_context struct.
>>>>- Don't resend command when timeout.
>>>>
>>>> v2:
>>>>- Resend command when timeout.(Lijo)
>>>>- Use debugfs file instead of module parameter.
>>>>
>>>> Signed-off-by: Lang Yu 
>>> Well the debugfs part looks really nice and clean now, but one more
>>> comment below.
>>>
>>>> ---
>>>>drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c| 3 +++
>>>>drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h| 5 +
>>>>drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 2 ++
>>>>drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 8 +++-
>>>>4 files changed, 17 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> index 164d6a9e9fbb..86cd888c7822 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> @@ -1618,6 +1618,9 @@ int amdgpu_debugfs_init(struct amdgpu_device
>>> *adev)
>>>>if (!debugfs_initialized())
>>>>return 0;
>>>>
>>>> +  debugfs_create_bool("amdgpu_smu_debug", 0600, root,
>>>> +>smu.smu_debug_mode);
>>>> +
>>>>ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>>>>  _ib_preempt);
>>>>if (IS_ERR(ent)) {
>>>> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>> b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>> index f738f7dc20c9..50dbf5594a9d 100644
>>>> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>> @@ -569,6 +569,11 @@ struct smu_context
>>>>struct smu_user_dpm_profile user_dpm_profile;
>>>>
>>>>struct stb_context stb_context;
>>>> +  /*
>>>> +   * When enabled, it makes SMU errors fatal.
>>>> +   * (0 = disabled (default), 1 = enabled)
>>>> +   */
>>>> +  bool smu_debug_mode;
>>>>};
>>>>
>>>>struct i2c_adapter;
>>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>>> index 6e781cee8bb6..d3797a2d6451 100644
>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>>> @@ -1919,6 +1919,8 @@ static int aldebaran_mode2_reset(struct
>>> smu_context *smu)
>>>>out:
>>>>mutex_unlock(>message_lock);
>>>>
>>>> +  BUG_ON(unlikely(smu->smu_debug_mode) && ret);
>>>> +
>>&g

RE: [PATCH] drm/amdgpu: add support to SMU debug option

2021-12-01 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Lazar, Lijo 
>Sent: Wednesday, December 1, 2021 6:46 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>; Koenig, Christian 
>Subject: Re: [PATCH] drm/amdgpu: add support to SMU debug option
>
>
>
>On 12/1/2021 4:08 PM, Yu, Lang wrote:
>> [AMD Official Use Only]
>>
>>
>>
>>> -Original Message-
>>> From: Lazar, Lijo 
>>> Sent: Wednesday, December 1, 2021 5:47 PM
>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander ; Huang, Ray
>>> ; Koenig, Christian 
>>> Subject: Re: [PATCH] drm/amdgpu: add support to SMU debug option
>>>
>>>
>>>
>>> On 12/1/2021 2:54 PM, Lang Yu wrote:
>>>> To maintain system error state when SMU errors occurred, which will
>>>> aid in debugging SMU firmware issues, add SMU debug option support.
>>>>
>>>> It can be enabled or disabled via amdgpu_smu_debug debugfs file.
>>>> When enabled, it makes SMU errors fatal.
>>>> It is disabled by default.
>>>>
>>>> == Command Guide ==
>>>>
>>>> 1, enable SMU debug option
>>>>
>>>># echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>
>>>> 2, disable SMU debug option
>>>>
>>>># echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>
>>>> v3:
>>>>- Use debugfs_create_bool().(Christian)
>>>>- Put variable into smu_context struct.
>>>>- Don't resend command when timeout.
>>>>
>>>> v2:
>>>>- Resend command when timeout.(Lijo)
>>>>- Use debugfs file instead of module parameter.
>>>>
>>>> Signed-off-by: Lang Yu 
>>>> ---
>>>>drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c| 3 +++
>>>>drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h| 5 +
>>>>drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 2 ++
>>>>drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 8 +++-
>>>>4 files changed, 17 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> index 164d6a9e9fbb..86cd888c7822 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> @@ -1618,6 +1618,9 @@ int amdgpu_debugfs_init(struct amdgpu_device
>>> *adev)
>>>>if (!debugfs_initialized())
>>>>return 0;
>>>>
>>>> +  debugfs_create_bool("amdgpu_smu_debug", 0600, root,
>>>> +>smu.smu_debug_mode);
>>>> +
>>>>ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>>>>  _ib_preempt);
>>>>if (IS_ERR(ent)) {
>>>> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>> b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>> index f738f7dc20c9..50dbf5594a9d 100644
>>>> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>> @@ -569,6 +569,11 @@ struct smu_context
>>>>struct smu_user_dpm_profile user_dpm_profile;
>>>>
>>>>struct stb_context stb_context;
>>>> +  /*
>>>> +   * When enabled, it makes SMU errors fatal.
>>>> +   * (0 = disabled (default), 1 = enabled)
>>>> +   */
>>>> +  bool smu_debug_mode;
>>>>};
>>>>
>>>>struct i2c_adapter;
>>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>>> index 6e781cee8bb6..d3797a2d6451 100644
>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>>> @@ -1919,6 +1919,8 @@ static int aldebaran_mode2_reset(struct
>>> smu_context *smu)
>>>>out:
>>>>mutex_unlock(>message_lock);
>>>>
>>>> +  BUG_ON(unlikely(smu->smu_debug_mode) && ret);
>>>> +
>>> This hunk can be skipped while submitting. If this fails, GPU reset
>>> will fail and am

RE: [PATCH] drm/amdgpu: add support to SMU debug option

2021-12-01 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Koenig, Christian 
>Sent: Wednesday, December 1, 2021 5:30 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Lazar, Lijo
>; Huang, Ray 
>Subject: Re: [PATCH] drm/amdgpu: add support to SMU debug option
>
>Am 01.12.21 um 10:24 schrieb Lang Yu:
>> To maintain system error state when SMU errors occurred, which will
>> aid in debugging SMU firmware issues, add SMU debug option support.
>>
>> It can be enabled or disabled via amdgpu_smu_debug debugfs file. When
>> enabled, it makes SMU errors fatal.
>> It is disabled by default.
>>
>> == Command Guide ==
>>
>> 1, enable SMU debug option
>>
>>   # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>
>> 2, disable SMU debug option
>>
>>   # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>
>> v3:
>>   - Use debugfs_create_bool().(Christian)
>>   - Put variable into smu_context struct.
>>   - Don't resend command when timeout.
>>
>> v2:
>>   - Resend command when timeout.(Lijo)
>>   - Use debugfs file instead of module parameter.
>>
>> Signed-off-by: Lang Yu 
>
>Well the debugfs part looks really nice and clean now, but one more comment
>below.
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c| 3 +++
>>   drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h| 5 +
>>   drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 2 ++
>>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 8 +++-
>>   4 files changed, 17 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> index 164d6a9e9fbb..86cd888c7822 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> @@ -1618,6 +1618,9 @@ int amdgpu_debugfs_init(struct amdgpu_device
>*adev)
>>  if (!debugfs_initialized())
>>  return 0;
>>
>> +debugfs_create_bool("amdgpu_smu_debug", 0600, root,
>> +  >smu.smu_debug_mode);
>> +
>>  ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>>_ib_preempt);
>>  if (IS_ERR(ent)) {
>> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>> b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>> index f738f7dc20c9..50dbf5594a9d 100644
>> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>> @@ -569,6 +569,11 @@ struct smu_context
>>  struct smu_user_dpm_profile user_dpm_profile;
>>
>>  struct stb_context stb_context;
>> +/*
>> + * When enabled, it makes SMU errors fatal.
>> + * (0 = disabled (default), 1 = enabled)
>> + */
>> +bool smu_debug_mode;
>>   };
>>
>>   struct i2c_adapter;
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>> index 6e781cee8bb6..d3797a2d6451 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>> @@ -1919,6 +1919,8 @@ static int aldebaran_mode2_reset(struct
>smu_context *smu)
>>   out:
>>  mutex_unlock(>message_lock);
>>
>> +BUG_ON(unlikely(smu->smu_debug_mode) && ret);
>> +
>>  return ret;
>>   }
>>
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> index 048ca1673863..9be005eb4241 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> @@ -349,15 +349,21 @@ int smu_cmn_send_smc_msg_with_param(struct
>smu_context *smu,
>>  __smu_cmn_reg_print_error(smu, reg, index, param, msg);
>>  goto Out;
>>  }
>> +
>>  __smu_cmn_send_msg(smu, (uint16_t) index, param);
>>  reg = __smu_cmn_poll_stat(smu);
>>  res = __smu_cmn_reg2errno(smu, reg);
>> -if (res != 0)
>> +if (res != 0) {
>>  __smu_cmn_reg_print_error(smu, reg, index, param, msg);
>> +goto Out;
>> +}
>>  if (read_arg)
>>  smu_cmn_read_arg(smu, read_arg);
>>   Out:
>>  mutex_unlock(>message_lock);
>> +
>> +BUG_ON(unlikely(smu->smu_debug_mode) && res);
>
>BUG_ON() really crashes the kernel and is only allowed if we prevent further 
>data
>corruption with that.
>
>Most of the time WARN_ON() is more appropriate, but I can't fully judge here
>since I don't know the SMU code well enough.

This is what SMU FW guys want. They want "user-visible (potentially fatal) 
errors", then a hang.
They want to keep system state since the error occurred.

Regards,
Lang

>Christian.
>
>> +
>>  return res;
>>   }
>>


RE: [PATCH] drm/amdgpu: add support to SMU debug option

2021-12-01 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Lazar, Lijo 
>Sent: Wednesday, December 1, 2021 5:47 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>; Koenig, Christian 
>Subject: Re: [PATCH] drm/amdgpu: add support to SMU debug option
>
>
>
>On 12/1/2021 2:54 PM, Lang Yu wrote:
>> To maintain system error state when SMU errors occurred, which will
>> aid in debugging SMU firmware issues, add SMU debug option support.
>>
>> It can be enabled or disabled via amdgpu_smu_debug debugfs file. When
>> enabled, it makes SMU errors fatal.
>> It is disabled by default.
>>
>> == Command Guide ==
>>
>> 1, enable SMU debug option
>>
>>   # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>
>> 2, disable SMU debug option
>>
>>   # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>
>> v3:
>>   - Use debugfs_create_bool().(Christian)
>>   - Put variable into smu_context struct.
>>   - Don't resend command when timeout.
>>
>> v2:
>>   - Resend command when timeout.(Lijo)
>>   - Use debugfs file instead of module parameter.
>>
>> Signed-off-by: Lang Yu 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c| 3 +++
>>   drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h| 5 +
>>   drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 2 ++
>>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 8 +++-
>>   4 files changed, 17 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> index 164d6a9e9fbb..86cd888c7822 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> @@ -1618,6 +1618,9 @@ int amdgpu_debugfs_init(struct amdgpu_device
>*adev)
>>  if (!debugfs_initialized())
>>  return 0;
>>
>> +debugfs_create_bool("amdgpu_smu_debug", 0600, root,
>> +  >smu.smu_debug_mode);
>> +
>>  ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>>_ib_preempt);
>>  if (IS_ERR(ent)) {
>> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>> b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>> index f738f7dc20c9..50dbf5594a9d 100644
>> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>> @@ -569,6 +569,11 @@ struct smu_context
>>  struct smu_user_dpm_profile user_dpm_profile;
>>
>>  struct stb_context stb_context;
>> +/*
>> + * When enabled, it makes SMU errors fatal.
>> + * (0 = disabled (default), 1 = enabled)
>> + */
>> +bool smu_debug_mode;
>>   };
>>
>>   struct i2c_adapter;
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>> index 6e781cee8bb6..d3797a2d6451 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>> @@ -1919,6 +1919,8 @@ static int aldebaran_mode2_reset(struct
>smu_context *smu)
>>   out:
>>  mutex_unlock(>message_lock);
>>
>> +BUG_ON(unlikely(smu->smu_debug_mode) && ret);
>> +
>This hunk can be skipped while submitting. If this fails, GPU reset will fail 
>and
>amdgpu won't continue.

Ok, we don't handle such cases.

>
>>  return ret;
>>   }
>>
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> index 048ca1673863..9be005eb4241 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> @@ -349,15 +349,21 @@ int smu_cmn_send_smc_msg_with_param(struct
>smu_context *smu,
>>  __smu_cmn_reg_print_error(smu, reg, index, param, msg);
>>  goto Out;
>>  }
>> +
>>  __smu_cmn_send_msg(smu, (uint16_t) index, param);
>>  reg = __smu_cmn_poll_stat(smu);
>>  res = __smu_cmn_reg2errno(smu, reg);
>> -if (res != 0)
>> +if (res != 0) {
>>  __smu_cmn_reg_print_error(smu, reg, index, param, msg);
>> +goto Out;
>
>Next step is reading smu parameter register which is harmless as reading
>response register and it's not clear on read. This goto also may be skipped.

I just think that does some extra work. We don’t want to read response register.
This goto makes error handling more clear.

Regards,
Lang

>Thanks,
>Lijo
>
>> +}
>>  if (read_arg)
>>  smu_cmn_read_arg(smu, read_arg);
>>   Out:
>>  mutex_unlock(>message_lock);
>> +
>> +BUG_ON(unlikely(smu->smu_debug_mode) && res);
>> +
>>  return res;
>>   }
>>
>>


RE: [PATCH] drm/amdgpu: add SMU debug option support

2021-12-01 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: amd-gfx  On Behalf Of Yu, Lang
>Sent: Wednesday, December 1, 2021 3:58 PM
>To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>; Koenig, Christian 
>Subject: RE: [PATCH] drm/amdgpu: add SMU debug option support
>
>[AMD Official Use Only]
>
>
>
>>-Original Message-
>>From: Lazar, Lijo 
>>Sent: Wednesday, December 1, 2021 3:28 PM
>>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>Cc: Deucher, Alexander ; Huang, Ray
>>; Koenig, Christian 
>>Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>
>>
>>
>>On 12/1/2021 12:37 PM, Yu, Lang wrote:
>>> [AMD Official Use Only]
>>>
>>>
>>>
>>>> -Original Message-
>>>> From: Lazar, Lijo 
>>>> Sent: Wednesday, December 1, 2021 2:56 PM
>>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>>> Cc: Deucher, Alexander ; Huang, Ray
>>>> ; Koenig, Christian 
>>>> Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>>>
>>>>
>>>>
>>>> On 12/1/2021 11:57 AM, Yu, Lang wrote:
>>>>> [AMD Official Use Only]
>>>>>
>>>>> Hi Lijo,
>>>>>
>>>>> Thanks for your comments.
>>>>>
>>>>>   From my understanding, that just increases the timeout threshold
>>>>> and could hide some potential issues which should be exposed and solved.
>>>>>
>>>>> If current timeout threshold is not enough for some corner cases,
>>>>> (1) Do we consider to increase the threshold to cover these cases?
>>>>> (2) Or do we just expose them and request SMU FW to optimize them?
>>>>>
>>>>> I think it doesn't make much sense to increase the threshold in debug 
>>>>> mode.
>>>>> How do you think? Thanks!
>>>>
>>>> In normal cases, 2secs would be more than enough. If we hang
>>>> immediately, then check the FW registers later, the response would
>>>> have come. I thought we just need to note those cases and not to
>>>> fail everytime. Just to mark as a red flag in the log to tell us
>>>> that FW is unexpectedly busy processing something else when the message is
>sent.
>>>>
>>>> There are some issues related to S0ix where we see the FW comes back
>>>> with a response with an increased timeout under certain conditions.
>>>
>>> If these issues still exists, could we just blacklist the tests that
>>> triggered them before solve them? Or we just increase the threshold
>>> to cover
>>all the cases?
>>>
>>
>>Actually, the timeout is message specific - like i2c transfer from
>>EEPROM could take longer time.
>>
>>I am not sure if we should have more than 2s as timeout. Whenever this
>>kind of issue happens, FW team check registers (then it will have a
>>proper value) and say they don't see anything abnormal :) Usually,
>>those are just signs of crack and it eventually breaks.
>>
>>Option is just fail immediately (then again not sure useful it will be
>>if the issue is this sort of thing) or wait to see how far it goes with
>>an added timeout before it fails eventually.
>
>Are smu_cmn_wait_for_response()/smu_cmn_send_msg_without_waiting()
>designed for long timeout cases? Is it fine that we don't fail here in the 
>event of
>timeout?

Or we can add a timeout parameter to smu_cmn_send_smc_msg_with_param() 
to specify the timeout you want for a specific message.
I think this may be another story. Thanks!
 
Thanks,
Lang
>
>>
>>Thanks,
>>Lijo
>>
>>> Regards,
>>> Lang
>>>
>>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>>>
>>>>> Regards,
>>>>> Lang
>>>>>
>>>>>> -Original Message-
>>>>>> From: Lazar, Lijo 
>>>>>> Sent: Wednesday, December 1, 2021 1:44 PM
>>>>>> To: Lazar, Lijo ; Yu, Lang ;
>>>>>> amd- g...@lists.freedesktop.org
>>>>>> Cc: Deucher, Alexander ; Huang, Ray
>>>>>> ; Koenig, Christian 
>>>>>> Subject: RE: [PATCH] drm/amdgpu: add SMU debug option support
>>>>>>
>>>>>> Just realized that the patch I pasted won't work. Outer loop exit
>>>>>> needs to be like this.
>>>>>>

RE: [PATCH] drm/amdgpu: add SMU debug option support

2021-11-30 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Lazar, Lijo 
>Sent: Wednesday, December 1, 2021 3:28 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>; Koenig, Christian 
>Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>
>
>
>On 12/1/2021 12:37 PM, Yu, Lang wrote:
>> [AMD Official Use Only]
>>
>>
>>
>>> -Original Message-
>>> From: Lazar, Lijo 
>>> Sent: Wednesday, December 1, 2021 2:56 PM
>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander ; Huang, Ray
>>> ; Koenig, Christian 
>>> Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>>
>>>
>>>
>>> On 12/1/2021 11:57 AM, Yu, Lang wrote:
>>>> [AMD Official Use Only]
>>>>
>>>> Hi Lijo,
>>>>
>>>> Thanks for your comments.
>>>>
>>>>   From my understanding, that just increases the timeout threshold
>>>> and could hide some potential issues which should be exposed and solved.
>>>>
>>>> If current timeout threshold is not enough for some corner cases,
>>>> (1) Do we consider to increase the threshold to cover these cases?
>>>> (2) Or do we just expose them and request SMU FW to optimize them?
>>>>
>>>> I think it doesn't make much sense to increase the threshold in debug mode.
>>>> How do you think? Thanks!
>>>
>>> In normal cases, 2secs would be more than enough. If we hang
>>> immediately, then check the FW registers later, the response would
>>> have come. I thought we just need to note those cases and not to fail
>>> everytime. Just to mark as a red flag in the log to tell us that FW
>>> is unexpectedly busy processing something else when the message is sent.
>>>
>>> There are some issues related to S0ix where we see the FW comes back
>>> with a response with an increased timeout under certain conditions.
>>
>> If these issues still exists, could we just blacklist the tests that
>> triggered them before solve them? Or we just increase the threshold to cover
>all the cases?
>>
>
>Actually, the timeout is message specific - like i2c transfer from EEPROM could
>take longer time.
>
>I am not sure if we should have more than 2s as timeout. Whenever this kind of
>issue happens, FW team check registers (then it will have a proper value) and 
>say
>they don't see anything abnormal :) Usually, those are just signs of crack and 
>it
>eventually breaks.
>
>Option is just fail immediately (then again not sure useful it will be if the 
>issue is
>this sort of thing) or wait to see how far it goes with an added timeout 
>before it
>fails eventually.

Are smu_cmn_wait_for_response()/smu_cmn_send_msg_without_waiting() 
designed for long timeout cases? Is it fine that we don't fail here in the 
event of a timeout?

Thanks,
Lang 

>
>Thanks,
>Lijo
>
>> Regards,
>> Lang
>>
>>>
>>> Thanks,
>>> Lijo
>>>
>>>>
>>>> Regards,
>>>> Lang
>>>>
>>>>> -Original Message-
>>>>> From: Lazar, Lijo 
>>>>> Sent: Wednesday, December 1, 2021 1:44 PM
>>>>> To: Lazar, Lijo ; Yu, Lang ;
>>>>> amd- g...@lists.freedesktop.org
>>>>> Cc: Deucher, Alexander ; Huang, Ray
>>>>> ; Koenig, Christian 
>>>>> Subject: RE: [PATCH] drm/amdgpu: add SMU debug option support
>>>>>
>>>>> Just realized that the patch I pasted won't work. Outer loop exit
>>>>> needs to be like this.
>>>>>   (reg & MP1_C2PMSG_90__CONTENT_MASK) != 0 && extended_wait-- >=
>>>>> 0
>>>>>
>>>>> Anyway, that patch is only there to communicate what I really meant
>>>>> in the earlier comment.
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>>
>>>>> -Original Message-
>>>>> From: amd-gfx  On Behalf Of
>>>>> Lazar, Lijo
>>>>> Sent: Wednesday, December 1, 2021 10:44 AM
>>>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>>>> Cc: Deucher, Alexander ; Huang, Ray
>>>>> ; Koenig, Christian 
>>>>> Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>>>>
>>>>>
>>>>>
>>>>> On 11/30/2021 10:47 AM, Lang Yu wrote:
>>>>>> T

RE: [PATCH] drm/amdgpu: add SMU debug option support

2021-11-30 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Lazar, Lijo 
>Sent: Wednesday, December 1, 2021 2:56 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>; Koenig, Christian 
>Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>
>
>
>On 12/1/2021 11:57 AM, Yu, Lang wrote:
>> [AMD Official Use Only]
>>
>> Hi Lijo,
>>
>> Thanks for your comments.
>>
>>  From my understanding, that just increases the timeout threshold and
>> could hide some potential issues which should be exposed and solved.
>>
>> If current timeout threshold is not enough for some corner cases,
>> (1) Do we consider to increase the threshold to cover these cases?
>> (2) Or do we just expose them and request SMU FW to optimize them?
>>
>> I think it doesn't make much sense to increase the threshold in debug mode.
>> How do you think? Thanks!
>
>In normal cases, 2secs would be more than enough. If we hang immediately, then
>check the FW registers later, the response would have come. I thought we just
>need to note those cases and not to fail everytime. Just to mark as a red flag 
>in
>the log to tell us that FW is unexpectedly busy processing something else when
>the message is sent.
>
>There are some issues related to S0ix where we see the FW comes back with a
>response with an increased timeout under certain conditions.

If these issues still exist, could we just blacklist the tests that triggered 
them 
before solving them? Or should we just increase the threshold to cover all the cases?

Regards,
Lang

>
>Thanks,
>Lijo
>
>>
>> Regards,
>> Lang
>>
>>> -Original Message-
>>> From: Lazar, Lijo 
>>> Sent: Wednesday, December 1, 2021 1:44 PM
>>> To: Lazar, Lijo ; Yu, Lang ;
>>> amd- g...@lists.freedesktop.org
>>> Cc: Deucher, Alexander ; Huang, Ray
>>> ; Koenig, Christian 
>>> Subject: RE: [PATCH] drm/amdgpu: add SMU debug option support
>>>
>>> Just realized that the patch I pasted won't work. Outer loop exit
>>> needs to be like this.
>>> (reg & MP1_C2PMSG_90__CONTENT_MASK) != 0 && extended_wait-- >=
>>> 0
>>>
>>> Anyway, that patch is only there to communicate what I really meant
>>> in the earlier comment.
>>>
>>> Thanks,
>>> Lijo
>>>
>>> -Original Message-
>>> From: amd-gfx  On Behalf Of
>>> Lazar, Lijo
>>> Sent: Wednesday, December 1, 2021 10:44 AM
>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander ; Huang, Ray
>>> ; Koenig, Christian 
>>> Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>>
>>>
>>>
>>> On 11/30/2021 10:47 AM, Lang Yu wrote:
>>>> To maintain system error state when SMU errors occurred, which will
>>>> aid in debugging SMU firmware issues, add SMU debug option support.
>>>>
>>>> It can be enabled or disabled via amdgpu_smu_debug debugfs file.
>>>> When enabled, it makes SMU errors fatal.
>>>> It is disabled by default.
>>>>
>>>> == Command Guide ==
>>>>
>>>> 1, enable SMU debug option
>>>>
>>>># echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>
>>>> 2, disable SMU debug option
>>>>
>>>># echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>
>>>> v2:
>>>>- Resend command when timeout.(Lijo)
>>>>- Use debugfs file instead of module parameter.
>>>>
>>>> Signed-off-by: Lang Yu 
>>>> ---
>>>>drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32
>+
>>>>drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c  | 39
>+++-
>>> -
>>>>2 files changed, 69 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> index 164d6a9e9fbb..f9412de86599 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> @@ -39,6 +39,8 @@
>>>>
>>>>#if defined(CONFIG_DEBUG_FS)
>>>>
>>>> +extern int amdgpu_smu_debug;
>>>> +
>>>>/**
>>>> * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>>&g

RE: [PATCH] drm/amdgpu: add SMU debug option support

2021-11-30 Thread Yu, Lang
[AMD Official Use Only]

Hi Lijo,

Thanks for your comments.
 
From my understanding, that just increases the timeout threshold and
could hide some potential issues which should be exposed and solved.

If current timeout threshold is not enough for some corner cases,
(1) Do we consider to increase the threshold to cover these cases?
(2) Or do we just expose them and request SMU FW to optimize them?

I think it doesn't make much sense to increase the threshold in debug mode.
How do you think? Thanks!

Regards,
Lang

>-Original Message-
>From: Lazar, Lijo 
>Sent: Wednesday, December 1, 2021 1:44 PM
>To: Lazar, Lijo ; Yu, Lang ; amd-
>g...@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>; Koenig, Christian 
>Subject: RE: [PATCH] drm/amdgpu: add SMU debug option support
>
>Just realized that the patch I pasted won't work. Outer loop exit needs to be 
>like
>this.
>   (reg & MP1_C2PMSG_90__CONTENT_MASK) != 0 && extended_wait-- >=
>0
>
>Anyway, that patch is only there to communicate what I really meant in the
>earlier comment.
>
>Thanks,
>Lijo
>
>-Original Message-
>From: amd-gfx  On Behalf Of Lazar,
>Lijo
>Sent: Wednesday, December 1, 2021 10:44 AM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>; Koenig, Christian 
>Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>
>
>
>On 11/30/2021 10:47 AM, Lang Yu wrote:
>> To maintain system error state when SMU errors occurred, which will
>> aid in debugging SMU firmware issues, add SMU debug option support.
>>
>> It can be enabled or disabled via amdgpu_smu_debug debugfs file. When
>> enabled, it makes SMU errors fatal.
>> It is disabled by default.
>>
>> == Command Guide ==
>>
>> 1, enable SMU debug option
>>
>>   # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>
>> 2, disable SMU debug option
>>
>>   # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>
>> v2:
>>   - Resend command when timeout.(Lijo)
>>   - Use debugfs file instead of module parameter.
>>
>> Signed-off-by: Lang Yu 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32 +
>>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c  | 39 +++-
>-
>>   2 files changed, 69 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> index 164d6a9e9fbb..f9412de86599 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> @@ -39,6 +39,8 @@
>>
>>   #if defined(CONFIG_DEBUG_FS)
>>
>> +extern int amdgpu_smu_debug;
>> +
>>   /**
>>* amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>>*
>> @@ -1152,6 +1154,8 @@ static ssize_t amdgpu_debugfs_gfxoff_read(struct
>file *f, char __user *buf,
>>  return result;
>>   }
>>
>> +
>> +
>>   static const struct file_operations amdgpu_debugfs_regs2_fops = {
>>  .owner = THIS_MODULE,
>>  .unlocked_ioctl = amdgpu_debugfs_regs2_ioctl, @@ -1609,6 +1613,26
>> @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>  amdgpu_debugfs_sclk_set, "%llu\n");
>>
>> +static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val) {
>> +*val = amdgpu_smu_debug;
>> +return 0;
>> +}
>> +
>> +static int amdgpu_debugfs_smu_debug_set(void *data, u64 val) {
>> +if (val != 0 && val != 1)
>> +return -EINVAL;
>> +
>> +amdgpu_smu_debug = val;
>> +return 0;
>> +}
>> +
>> +DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
>> + amdgpu_debugfs_smu_debug_get,
>> + amdgpu_debugfs_smu_debug_set,
>> + "%llu\n");
>> +
>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>   {
>>  struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>> @@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device
>*adev)
>>  return PTR_ERR(ent);
>>  }
>>
>> +ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev,
>> +  &fops_smu_debug);
>> +if (IS_ERR(ent)) {
>> +DRM_ERROR("unable to create amdgpu_smu_debug debugfs
>file\n");
>> +return PTR_ERR(ent);
>> +}
>> +
>> +
>>  /

RE: [PATCH] drm/amdgpu/powerplay: fix sysfs_emit/sysfs_emit_at handling

2021-11-03 Thread Yu, Lang
[AMD Official Use Only]

Yes, I missed such conversions in powerplay. Thanks!

Reviewed-by: Lang Yu  

>-Original Message-
>From: Deucher, Alexander 
>Sent: Thursday, November 4, 2021 8:59 AM
>To: amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Yu, Lang
>; Powell, Darren 
>Subject: [PATCH] drm/amdgpu/powerplay: fix sysfs_emit/sysfs_emit_at handling
>
>sysfs_emit and sysfs_emit_at require a page boundary aligned buf address. Make
>them happy!
>
>v2: fix sysfs_emit -> sysfs_emit_at missed conversions
>
>Cc: Lang Yu 
>Cc: Darren Powell 
>Fixes: 6db0c87a0a8e ("amdgpu/pm: Replace hwmgr smu usage of sprintf with
>sysfs_emit")
>Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1774
>Signed-off-by: Alex Deucher 
>---
> .../gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c   |  8 ++--
> .../gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c| 10 +++---
> .../gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c|  2 ++
> .../gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.h| 13 +
> .../gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c  | 12 +---
>  .../gpu/drm/amd/pm/powerplay/hwmgr/vega12_hwmgr.c  |  4
>  .../gpu/drm/amd/pm/powerplay/hwmgr/vega20_hwmgr.c  | 14
>++
> 7 files changed, 51 insertions(+), 12 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c
>b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c
>index 1de3ae77e03e..258c573acc97 100644
>--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c
>+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c
>@@ -1024,6 +1024,8 @@ static int smu10_print_clock_levels(struct pp_hwmgr
>*hwmgr,
>   uint32_t min_freq, max_freq = 0;
>   uint32_t ret = 0;
>
>+  phm_get_sysfs_buf(&buf, &size);
>+
>   switch (type) {
>   case PP_SCLK:
>   smum_send_msg_to_smc(hwmgr,
>PPSMC_MSG_GetGfxclkFrequency, ); @@ -1065,7 +1067,7 @@ static int
>smu10_print_clock_levels(struct pp_hwmgr *hwmgr,
>   if (ret)
>   return ret;
>
>-  size = sysfs_emit(buf, "%s:\n", "OD_SCLK");
>+  size += sysfs_emit_at(buf, size, "%s:\n", "OD_SCLK");
>   size += sysfs_emit_at(buf, size, "0: %10uMhz\n",
>   (data->gfx_actual_soft_min_freq > 0) ? data-
>>gfx_actual_soft_min_freq : min_freq);
>   size += sysfs_emit_at(buf, size, "1: %10uMhz\n", @@ -
>1081,7 +1083,7 @@ static int smu10_print_clock_levels(struct pp_hwmgr
>*hwmgr,
>   if (ret)
>   return ret;
>
>-  size = sysfs_emit(buf, "%s:\n", "OD_RANGE");
>+  size += sysfs_emit_at(buf, size, "%s:\n", "OD_RANGE");
>   size += sysfs_emit_at(buf, size,
>"SCLK: %7uMHz %10uMHz\n",
>   min_freq, max_freq);
>   }
>@@ -1456,6 +1458,8 @@ static int smu10_get_power_profile_mode(struct
>pp_hwmgr *hwmgr, char *buf)
>   if (!buf)
>   return -EINVAL;
>
>+  phm_get_sysfs_buf(&buf, &size);
>+
>   size += sysfs_emit_at(buf, size, "%s %16s %s %s %s %s\n",title[0],
>   title[1], title[2], title[3], title[4], title[5]);
>
>diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
>b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
>index e7803ce8f67a..aceebf584225 100644
>--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
>+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
>@@ -4914,6 +4914,8 @@ static int smu7_print_clock_levels(struct pp_hwmgr
>*hwmgr,
>   int size = 0;
>   uint32_t i, now, clock, pcie_speed;
>
>+  phm_get_sysfs_buf(&buf, &size);
>+
>   switch (type) {
>   case PP_SCLK:
>   smum_send_msg_to_smc(hwmgr,
>PPSMC_MSG_API_GetSclkFrequency, ); @@ -4963,7 +4965,7 @@ static int
>smu7_print_clock_levels(struct pp_hwmgr *hwmgr,
>   break;
>   case OD_SCLK:
>   if (hwmgr->od_enabled) {
>-  size = sysfs_emit(buf, "%s:\n", "OD_SCLK");
>+  size += sysfs_emit_at(buf, size, "%s:\n", "OD_SCLK");
>   for (i = 0; i < odn_sclk_table->num_of_pl; i++)
>   size += sysfs_emit_at(buf, size,
>"%d: %10uMHz %10umV\n",
>   i, odn_sclk_table->entries[i].clock/100,
>@@ -4972,7 +4974,7 @@ static int smu7_print_clock_levels(struct pp_hwmgr
>*hwmgr,
> 

RE: [PATCH 2/2] drm/amdkfd: Remove cu mask from struct queue_properties

2021-10-25 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Kuehling, Felix 
>Sent: Friday, October 22, 2021 1:11 AM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>
>Subject: Re: [PATCH 2/2] drm/amdkfd: Remove cu mask from struct
>queue_properties
>
>Am 2021-10-15 um 4:48 a.m. schrieb Lang Yu:
>> +enum queue_update_flag {
>> +UPDATE_FLAG_PROPERTITY = 0,
>> +UPDATE_FLAG_CU_MASK,
>> +};
>> +
>> +struct queue_update_info {
>> +union {
>> +struct queue_properties properties;
>> +struct {
>> +uint32_t count; /* Must be a multiple of 32 */
>> +uint32_t *ptr;
>> +} cu_mask;
>> +};
>> +
>> +enum queue_update_flag update_flag;
>> +};
>> +
>
>This doesn't make sense to me. As I understand it, queue_update_info is for
>information that is not stored in queue_properties but only in the MQDs.
>Therefore, it should not include the queue_properties.
>
>All the low level functions in the MQD managers get both the queue_properties
>and the queue_update_info. So trying to wrap both in the same union doesn't
>make sense there either.
>
>I think you only need this because you tried to generalize pqm_update_queue to
>handle both updates to queue_properties and CU mask updates with a single
>argument. IMO this does not make the interface any clearer. I think it would be
>more straight-forward to keep a separate pqm_set_cu_mask function that takes
>a queue_update_info parameter. If you're looking for more generic names, I
>suggest the following:
>
>  * Rename pqm_update_queue to pqm_update_queue_properties
>  * Rename struct queue_update_info to struct mqd_update_info
>  * Rename pqm_set_cu_mask to pqm_update_mqd. For now this is only used
>for CU mask (the union has only one struct member for now). It may
>be used for other MQD properties that don't need to be stored in
>queue_properties in the future

Got it. Thanks for your suggestions!

Regards,
Lang

>
>Regards,
>  Felix
>


RE: [PATCH 1/2] drm/amdkfd: Add an optional argument into update queue operation

2021-10-25 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Kuehling, Felix 
>Sent: Friday, October 22, 2021 12:46 AM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>
>Subject: Re: [PATCH 1/2] drm/amdkfd: Add an optional argument into update
>queue operation
>
>
>Am 2021-10-15 um 4:48 a.m. schrieb Lang Yu:
>> Currently, queue is updated with data stored in queue_properties.
>> And all allocated resource in queue_properties will not be freed until
>> the queue is destroyed.
>>
>> But some properties(e.g., cu mask) bring some memory management
>> headaches(e.g., memory leak) and make code complex. Actually they
>> don't have to persist in queue_properties.
>>
>> So add an argument into update queue to pass such properties and
>> remove them from queue_properties.
>>
>> Signed-off-by: Lang Yu 
>> ---
>>  .../drm/amd/amdkfd/kfd_device_queue_manager.c |  4 ++--
>> .../drm/amd/amdkfd/kfd_device_queue_manager.h |  2 +-
>> drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h  |  2 +-
>> .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  | 18 +++
>> .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  8 +++
>>  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  8 +++
>>  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   | 22 +--
>>  .../amd/amdkfd/kfd_process_queue_manager.c|  6 ++---
>>  8 files changed, 35 insertions(+), 35 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index f8fce9d05f50..7f6f4937eedb 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -557,7 +557,7 @@ static int destroy_queue_nocpsch(struct
>device_queue_manager *dqm,
>>  return retval;
>>  }
>>
>> -static int update_queue(struct device_queue_manager *dqm, struct
>> queue *q)
>> +static int update_queue(struct device_queue_manager *dqm, struct
>> +queue *q, void *args)
>
>Please don't use a void * here. If you don't want to declare the struct
>queue_update_info in this patch, you can just declare it as an abstract
>type:
>
>struct queue_update_info;
>
>You can cast NULL to (struct queue_update_info *) without requiring the
>structure definition.

Got it. Thanks!

Regards,
Lang
>
>Regards,
>  Felix
>
>
>>  {
>>  int retval = 0;
>>  struct mqd_manager *mqd_mgr;
>> @@ -605,7 +605,7 @@ static int update_queue(struct device_queue_manager
>*dqm, struct queue *q)
>>  }
>>  }
>>
>> -mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties);
>> +mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties, args);
>>
>>  /*
>>   * check active state vs. the previous state and modify diff --git
>> a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> index c8719682c4da..08cfc2a2fdbb 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> @@ -93,7 +93,7 @@ struct device_queue_manager_ops {
>>  struct queue *q);
>>
>>  int (*update_queue)(struct device_queue_manager *dqm,
>> -struct queue *q);
>> +struct queue *q, void *args);
>>
>>  int (*register_process)(struct device_queue_manager *dqm,
>>  struct qcm_process_device *qpd); diff --
>git
>> a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
>> index 6e6918ccedfd..6ddf93629b8c 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
>> @@ -80,7 +80,7 @@ struct mqd_manager {
>>  struct mm_struct *mms);
>>
>>  void(*update_mqd)(struct mqd_manager *mm, void *mqd,
>> -struct queue_properties *q);
>> +struct queue_properties *q, void *args);
>>
>>  int (*destroy_mqd)(struct mqd_manager *mm, void *mqd,
>>  enum kfd_preempt_type type,
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
>> index 064914e1e8d6..8bb2fd4cba41 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
>> +++ b/dr

RE: FW: [PATCH 1/3] drm/amdgpu: fix a potential memory leak in amdgpu_device_fini_sw()

2021-10-21 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Grodzovsky, Andrey 
>Sent: Thursday, October 21, 2021 11:18 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Subject: Re: FW: [PATCH 1/3] drm/amdgpu: fix a potential memory leak in
>amdgpu_device_fini_sw()
>
>On 2021-10-21 3:19 a.m., Yu, Lang wrote:
>
>> [AMD Official Use Only]
>>
>>
>>
>>> -Original Message-
>>> From: Yu, Lang 
>>> Sent: Thursday, October 21, 2021 3:18 PM
>>> To: Grodzovsky, Andrey 
>>> Cc: Deucher, Alexander ; Koenig, Christian
>>> ; Huang, Ray ; Yu, Lang
>>> 
>>> Subject: [PATCH 1/3] drm/amdgpu: fix a potential memory leak in
>>> amdgpu_device_fini_sw()
>>>
>>> amdgpu_fence_driver_sw_fini() should be executed before
>>> amdgpu_device_ip_fini(), otherwise fence driver resource won't be
>>> properly freed as adev->rings have been torn down.
>
>
>Can you clarify more about where exactly the memleak happens?
>
>Andrey

See amdgpu_fence_driver_sw_fini(), ring->fence_drv.fences will only be freed
when adev->rings[i] is not NULL.

void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev)
{
unsigned int i, j;

for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
struct amdgpu_ring *ring = adev->rings[i];

if (!ring || !ring->fence_drv.initialized)
continue;

if (!ring->no_scheduler)
drm_sched_fini(&ring->sched);

for (j = 0; j <= ring->fence_drv.num_fences_mask; ++j)
dma_fence_put(ring->fence_drv.fences[j]);
kfree(ring->fence_drv.fences);
ring->fence_drv.fences = NULL;
ring->fence_drv.initialized = false;
}
}

If amdgpu_device_ip_fini() is executed before amdgpu_fence_driver_sw_fini(), 
amdgpu_device_ip_fini() will call gfx_vX_0_sw_fini() 
then call amdgpu_ring_fini() and set adev->rings[i] to NULL.
Nothing will be freed in amdgpu_fence_driver_sw_fini().
ring->fence_drv.fences  memory leak happened!

void amdgpu_ring_fini(struct amdgpu_ring *ring)
{
..
ring->adev->rings[ring->idx] = NULL;
}

Regards,
Lang

>
>
>>>
>>> Fixes: 72c8c97b1522 ("drm/amdgpu: Split amdgpu_device_fini into early
>>> and late")
>>>
>>> Signed-off-by: Lang Yu 
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
>>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 41ce86244144..5654c4790773 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3843,8 +3843,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device
>>> *adev)
>>>
>>> void amdgpu_device_fini_sw(struct amdgpu_device *adev)  {
>>> -   amdgpu_device_ip_fini(adev);
>>> amdgpu_fence_driver_sw_fini(adev);
>>> +   amdgpu_device_ip_fini(adev);
>>> release_firmware(adev->firmware.gpu_info_fw);
>>> adev->firmware.gpu_info_fw = NULL;
>>> adev->accel_working = false;
>>> --
>>> 2.25.1


RE: [PATCH 1/3] drm/amdgpu: fix a potential memory leak in amdgpu_device_fini_sw()

2021-10-21 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Koenig, Christian 
>Sent: Thursday, October 21, 2021 3:27 PM
>To: Yu, Lang ; Grodzovsky, Andrey
>
>Cc: Deucher, Alexander ; Huang, Ray
>
>Subject: Re: [PATCH 1/3] drm/amdgpu: fix a potential memory leak in
>amdgpu_device_fini_sw()
>
>Is there any reason you are sending that around only internally and not to the
>public mailing list?

Sorry, I missed that. It’s a mistake.

Regards,
Lang

>Christian.
>
>Am 21.10.21 um 09:17 schrieb Lang Yu:
>> amdgpu_fence_driver_sw_fini() should be executed before
>> amdgpu_device_ip_fini(), otherwise fence driver resource won't be
>> properly freed as adev->rings have been torn down.
>>
>> Fixes: 72c8c97b1522 ("drm/amdgpu: Split amdgpu_device_fini into early
>> and late")
>>
>> Signed-off-by: Lang Yu 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 41ce86244144..5654c4790773 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3843,8 +3843,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device
>> *adev)
>>
>>   void amdgpu_device_fini_sw(struct amdgpu_device *adev)
>>   {
>> -amdgpu_device_ip_fini(adev);
>>  amdgpu_fence_driver_sw_fini(adev);
>> +amdgpu_device_ip_fini(adev);
>>  release_firmware(adev->firmware.gpu_info_fw);
>>  adev->firmware.gpu_info_fw = NULL;
>>  adev->accel_working = false;


FW: [PATCH 2/3] drm/amdgpu: use some wrapper functions in amdgpu_device_fini_sw()

2021-10-21 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Yu, Lang 
>Sent: Thursday, October 21, 2021 3:18 PM
>To: Grodzovsky, Andrey 
>Cc: Deucher, Alexander ; Koenig, Christian
>; Huang, Ray ; Yu, Lang
>
>Subject: [PATCH 2/3] drm/amdgpu: use some wrapper functions in
>amdgpu_device_fini_sw()
>
>Add some wrapper functions to make amdgpu_device_fini_sw() more clear.
>
>Fix an error handling in amdgpu_device_parse_gpu_info_fw().
>
>Signed-off-by: Lang Yu 
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h| 10 +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 32 --
> 2 files changed, 34 insertions(+), 8 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>index d58e37fd01f4..5df194259e15 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>@@ -372,6 +372,11 @@ int amdgpu_device_ip_block_add(struct amdgpu_device
>*adev,
>  */
> bool amdgpu_get_bios(struct amdgpu_device *adev);  bool
>amdgpu_read_bios(struct amdgpu_device *adev);
>+static inline void amdgpu_free_bios(struct amdgpu_device *adev) {
>+  kfree(adev->bios);
>+  adev->bios = NULL;
>+}
>
> /*
>  * Clocks
>@@ -1440,6 +1445,11 @@ void amdgpu_pci_resume(struct pci_dev *pdev);
>
> bool amdgpu_device_cache_pci_state(struct pci_dev *pdev);  bool
>amdgpu_device_load_pci_state(struct pci_dev *pdev);
>+static inline void amdgpu_device_free_pci_state(struct amdgpu_device
>+*adev) {
>+  kfree(adev->pci_state);
>+  adev->pci_state = NULL;
>+}
>
> bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev);
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>index 5654c4790773..be64861ed19a 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>@@ -1871,6 +1871,19 @@ static void
>amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
>   }
> }
>
>+/**
>+ * amdgpu_device_release_gpu_info_fw - release gpu info firmware
>+ *
>+ * @adev: amdgpu_device pointer
>+ *
>+ *  Wrapper to release gpu info firmware  */ static inline void
>+amdgpu_device_release_gpu_info_fw(struct amdgpu_device *adev) {
>+  release_firmware(adev->firmware.gpu_info_fw);
>+  adev->firmware.gpu_info_fw = NULL;
>+}
>+
> /**
>  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
>  *
>@@ -1987,7 +2000,7 @@ static int amdgpu_device_parse_gpu_info_fw(struct
>amdgpu_device *adev)
>   dev_err(adev->dev,
>   "Failed to validate gpu_info firmware \"%s\"\n",
>   fw_name);
>-  goto out;
>+  goto release_fw;
>   }
>
>   hdr = (const struct gpu_info_firmware_header_v1_0 *)adev-
>>firmware.gpu_info_fw->data;
>@@ -2051,8 +2064,12 @@ static int amdgpu_device_parse_gpu_info_fw(struct
>amdgpu_device *adev)
>   dev_err(adev->dev,
>   "Unsupported gpu_info table %d\n", hdr-
>>header.ucode_version);
>   err = -EINVAL;
>-  goto out;
>+  goto release_fw;
>   }
>+
>+  return 0;
>+release_fw:
>+  amdgpu_device_release_gpu_info_fw(adev);
> out:
>   return err;
> }
>@@ -3845,8 +3862,8 @@ void amdgpu_device_fini_sw(struct amdgpu_device
>*adev)  {
>   amdgpu_fence_driver_sw_fini(adev);
>   amdgpu_device_ip_fini(adev);
>-  release_firmware(adev->firmware.gpu_info_fw);
>-  adev->firmware.gpu_info_fw = NULL;
>+  amdgpu_device_release_gpu_info_fw(adev);
>+
>   adev->accel_working = false;
>
>   amdgpu_reset_fini(adev);
>@@ -3858,8 +3875,8 @@ void amdgpu_device_fini_sw(struct amdgpu_device
>*adev)
>   if (amdgpu_emu_mode != 1)
>   amdgpu_atombios_fini(adev);
>
>-  kfree(adev->bios);
>-  adev->bios = NULL;
>+  amdgpu_free_bios(adev);
>+
>   if (amdgpu_device_supports_px(adev_to_drm(adev))) {
>   vga_switcheroo_unregister_client(adev->pdev);
>   vga_switcheroo_fini_domain_pm_ops(adev->dev);
>@@ -3872,8 +3889,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device
>*adev)
>   if (adev->mman.discovery_bin)
>   amdgpu_discovery_fini(adev);
>
>-  kfree(adev->pci_state);
>-
>+  amdgpu_device_free_pci_state(adev);
> }
>
> /**
>--
>2.25.1


FW: [PATCH 3/3] drm/amdgpu: remove unnecessary NULL check in amdgpu_device.c

2021-10-21 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Yu, Lang 
>Sent: Thursday, October 21, 2021 3:18 PM
>To: Grodzovsky, Andrey 
>Cc: Deucher, Alexander ; Koenig, Christian
>; Huang, Ray ; Yu, Lang
>
>Subject: [PATCH 3/3] drm/amdgpu: remove unnecessary NULL check in
>amdgpu_device.c
>
>NULL is safe for these functions.
>
>Signed-off-by: Lang Yu 
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
> 1 file changed, 7 insertions(+), 10 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>index be64861ed19a..dd979db93399 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>@@ -1091,12 +1091,9 @@ static void amdgpu_device_doorbell_fini(struct
>amdgpu_device *adev)
>  */
> static void amdgpu_device_wb_fini(struct amdgpu_device *adev)  {
>-  if (adev->wb.wb_obj) {
>-  amdgpu_bo_free_kernel(&adev->wb.wb_obj,
>-  &adev->wb.gpu_addr,
>-  (void **)&adev->wb.wb);
>-  adev->wb.wb_obj = NULL;
>-  }
>+  amdgpu_bo_free_kernel(&adev->wb.wb_obj,
>+  &adev->wb.gpu_addr,
>+  (void **)&adev->wb.wb);
> }
>
> /**
>@@ -3794,8 +3791,8 @@ static void amdgpu_device_unmap_mmio(struct
>amdgpu_device *adev)
>
>   iounmap(adev->rmmio);
>   adev->rmmio = NULL;
>-  if (adev->mman.aper_base_kaddr)
>-  iounmap(adev->mman.aper_base_kaddr);
>+
>+  iounmap(adev->mman.aper_base_kaddr);
>   adev->mman.aper_base_kaddr = NULL;
>
>   /* Memory manager related */
>@@ -3886,8 +3883,8 @@ void amdgpu_device_fini_sw(struct amdgpu_device
>*adev)
>
>   if (IS_ENABLED(CONFIG_PERF_EVENTS))
>   amdgpu_pmu_fini(adev);
>-  if (adev->mman.discovery_bin)
>-  amdgpu_discovery_fini(adev);
>+
>+  amdgpu_discovery_fini(adev);
>
>   amdgpu_device_free_pci_state(adev);
> }
>--
>2.25.1


FW: [PATCH 1/3] drm/amdgpu: fix a potential memory leak in amdgpu_device_fini_sw()

2021-10-21 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Yu, Lang 
>Sent: Thursday, October 21, 2021 3:18 PM
>To: Grodzovsky, Andrey 
>Cc: Deucher, Alexander ; Koenig, Christian
>; Huang, Ray ; Yu, Lang
>
>Subject: [PATCH 1/3] drm/amdgpu: fix a potential memory leak in
>amdgpu_device_fini_sw()
>
>amdgpu_fence_driver_sw_fini() should be executed before
>amdgpu_device_ip_fini(), otherwise fence driver resource won't be properly 
>freed
>as adev->rings have been torn down.
>
>Fixes: 72c8c97b1522 ("drm/amdgpu: Split amdgpu_device_fini into early and 
>late")
>
>Signed-off-by: Lang Yu 
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>index 41ce86244144..5654c4790773 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>@@ -3843,8 +3843,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device
>*adev)
>
> void amdgpu_device_fini_sw(struct amdgpu_device *adev)  {
>-  amdgpu_device_ip_fini(adev);
>   amdgpu_fence_driver_sw_fini(adev);
>+  amdgpu_device_ip_fini(adev);
>   release_firmware(adev->firmware.gpu_info_fw);
>   adev->firmware.gpu_info_fw = NULL;
>   adev->accel_working = false;
>--
>2.25.1


RE: [PATCH] drm/amdkfd: Separate pinned BOs destruction from general routine

2021-10-14 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Kuehling, Felix 
>Sent: Wednesday, October 13, 2021 11:25 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Koenig, Christian ; Deucher, Alexander
>; Huang, Ray 
>Subject: Re: [PATCH] drm/amdkfd: Separate pinned BOs destruction from
>general routine
>
>Am 2021-10-11 um 4:58 a.m. schrieb Lang Yu:
>> Currently, all kfd BOs use same destruction routine. But pinned BOs
>> are not unpinned properly. Separate them from general routine.
>>
>> Signed-off-by: Lang Yu 
>
>I think the general idea is right. However, we need another safeguard for the
>signal BO, which is allocated by user mode and can be freed by user mode at
>any time. We can solve this in one of two ways:
>
> 1. Add special handling for the signal BO in
>kfd_ioctl_free_memory_of_gpu to kunmap the BO and make sure the
>signal handling code is aware of it
> 2. Fail kfd_ioctl_free_memory_of_gpu for signal BOs and only allow them
>to be destroyed at process termination
>
>I think #2 is easier, and is consistent with what current user mode does.

Will add safeguard to prevent that according to #2.
 
>
>A few more comment inline ...
>
>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   2 +
>>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  10 ++
>>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |   3 +
>>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   3 +
>>  drivers/gpu/drm/amd/amdkfd/kfd_process.c  | 125 ++---
>-
>>  5 files changed, 114 insertions(+), 29 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index 69de31754907..751557af09bb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -279,6 +279,8 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
>>  struct kgd_dev *kgd, struct kgd_mem *mem, bool intr);  int
>> amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
>>  struct kgd_mem *mem, void **kptr, uint64_t *size);
>> +void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct
>kgd_dev
>> +*kgd, struct kgd_mem *mem);
>> +
>>  int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
>>  struct dma_fence **ef);
>>  int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, diff
>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 054c1a224def..6acc78b02bdc 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -1871,6 +1871,16 @@ int
>amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
>>  return ret;
>>  }
>>
>> +void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct
>kgd_dev
>> +*kgd, struct kgd_mem *mem) {
>> +struct amdgpu_bo *bo = mem->bo;
>> +
>> +amdgpu_bo_reserve(bo, true);
>> +amdgpu_bo_kunmap(bo);
>> +amdgpu_bo_unpin(bo);
>> +amdgpu_bo_unreserve(bo);
>> +}
>> +
>>  int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
>>struct kfd_vm_fault_info *mem)
>{ diff --git
>> a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index f1e7edeb4e6b..0db48ac10fde 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -1051,6 +1051,9 @@ static int kfd_ioctl_create_event(struct file *filp,
>struct kfd_process *p,
>>  pr_err("Failed to set event page\n");
>
>Need to kunmap the signal BO here.

Will kunmap it here.

>
>>  return err;
>>  }
>> +
>> +p->signal_handle = args->event_page_offset;
>> +
>>  }
>>
>>  err = kfd_event_create(filp, p, args->event_type, diff --git
>> a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 6d8f9bb2d905..30f08f1606bb 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -608,12 +608,14 @@ struct qcm_process_device {
>>  uint32_t sh_hidden_private_base;
>>
>>  /* CWSR memory */
>> +struct kgd_mem *cwsr_mem;
>>  void *cwsr_kaddr;
>>  uint64_t cwsr_base;
>>  uint64_t tba_addr;
>>  uint64_t tma_addr;
>>
>

RE: [PATCH] drm/amdkfd: Fix a __user pointer dereference in create_signal_event

2021-10-13 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Lazar, Lijo 
>Sent: Wednesday, October 13, 2021 4:07 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org; Kuehling,
>Felix 
>Cc: Deucher, Alexander ; Huang, Ray
>
>Subject: Re: [PATCH] drm/amdkfd: Fix a __user pointer dereference in
>create_signal_event
>
>
>
>On 10/13/2021 1:03 PM, Lang Yu wrote:
>> We should not dereference __user pointers directly.
>> https://yarchive.net/comp/linux/user_pointers.html
>>
>> Fixes: 482f07775cf5
>> ("drm/amdkfd: Simplify event ID and signal slot management")
>>
>> Signed-off-by: Lang Yu 
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_events.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> index 3eea4edee355..74d3bdcfe341 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> @@ -201,7 +201,7 @@ static int create_signal_event(struct file
>> *devkfd,
>>
>>  p->signal_event_count++;
>>
>> -ev->user_signal_address = &p->signal_page->user_address[ev-
>>event_id];
>
>This is interesting. I thought this wouldn't dereference.
>
>See here -
>
>https://en.cppreference.com/w/c/language/operator_member_access
>
>"If the operand is an array index expression, no action is taken other than the
>array-to-pointer conversion and the addition, so [N] is valid for an array 
>of size
>N (obtaining a pointer one past the end is okay, dereferencing it is not, but
>dereference cancels out in this expression)"

Thanks for your clarification about this. I got it.

Regards,
Lang

>Thanks,
>Lijo
>
>
>> +ev->user_signal_address = p->signal_page->user_address +
>> +ev->event_id;
>>  pr_debug("Signal event number %zu created with id %d, address %p\n",
>>  p->signal_event_count, ev->event_id,
>>  ev->user_signal_address);
>>


RE: [PATCH] drm/amdgpu: enable display for cyan skillfish

2021-10-12 Thread Yu, Lang
[Public]



>-Original Message-
>From: Paul Menzel 
>Sent: Tuesday, October 12, 2021 4:51 PM
>To: Yu, Lang 
>Cc: Deucher, Alexander ; Huang, Ray
>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: enable display for cyan skillfish
>
>Dear Lang,
>
>
>Am 12.10.21 um 08:16 schrieb Lang Yu:
>> Display support for cyan skillfish is ready now.
>
>What is the last commit making it “ready”?

This one, 
commit 4ac93fa0ec12a887b40b81d9b8b7fcd1033f48d5
drm/amd/display: add cyan_skillfish display support

Actually, it is fine before switching to amdgpu_discovery_set_ip_blocks.
During developing amdgpu_discovery_set_ip_blocks, it is not ready.
So skip to enable it.
 
>> Enable it!
>
>How did you test the patch?

I compiled the amdgpu driver with this patch and loaded it on Ubuntu 20.04.
The display worked well. Without this patch, the display does not work.

Regards,
Lang

>> Signed-off-by: Lang Yu 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 3 +--
>>   1 file changed, 1 insertion(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
>> index 2bebd2ce6474..4228c7964175 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
>> @@ -736,6 +736,7 @@ static int
>amdgpu_discovery_set_display_ip_blocks(struct amdgpu_device *adev)
>>  case IP_VERSION(1, 0, 1):
>>  case IP_VERSION(2, 0, 2):
>>  case IP_VERSION(2, 0, 0):
>> +case IP_VERSION(2, 0, 3):
>
>The numbers are not ordered. Could you please put a patch before this one, 
>fixing
>the ordering?
>
>>  case IP_VERSION(2, 1, 0):
>>  case IP_VERSION(3, 0, 0):
>>  case IP_VERSION(3, 0, 2):
>> @@ -745,8 +746,6 @@ static int
>amdgpu_discovery_set_display_ip_blocks(struct amdgpu_device *adev)
>>  case IP_VERSION(3, 1, 3):
>>  amdgpu_device_ip_block_add(adev, _ip_block);
>>  break;
>> -case IP_VERSION(2, 0, 3):
>> -break;
>>  default:
>>  return -EINVAL;
>>  }
>>
>
>
>Kind regards,
>
>Paul


RE: [PATCH] drm/amdgpu: query default sclk from smu for cyan_skillfish

2021-10-11 Thread Yu, Lang
[Public]



>-Original Message-
>From: Chen, Guchun 
>Sent: Monday, October 11, 2021 10:27 PM
>To: Lazar, Lijo ; Yu, Lang ; amd-
>g...@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>
>Subject: RE: [PATCH] drm/amdgpu: query default sclk from smu for cyan_skillfish
>
>[Public]
>
>Global variable to carry the sclk value looks a bit over-killed. Is it 
>possible that
>move all into cyan_skillfish_od_edit_dpm_table, like querying sclk first and
>setting it to cyan_skillfish_user_settings.sclk?

1, We need to query default sclk in smu init phase and use it in 
od_edit_dpm_table,
so global variable is needed.
2,  To maintain "set then commit" command rule of pp_od_clk_voltage,
global variable is also needed. 

Regards,
Lang

We need some global variables to store user settings and the queried default sclk.
>
>Regards,
>Guchun
>
>-Original Message-
>From: amd-gfx  On Behalf Of Lazar,
>Lijo
>Sent: Monday, October 11, 2021 4:54 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>
>Subject: Re: [PATCH] drm/amdgpu: query default sclk from smu for cyan_skillfish
>
>
>
>On 10/11/2021 2:01 PM, Lang Yu wrote:
>> Query default sclk instead of hard code.
>>
>> Signed-off-by: Lang Yu 
>> ---
>>   .../gpu/drm/amd/pm/swsmu/smu11/cyan_skillfish_ppt.c  | 12 +---
>>   1 file changed, 9 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/cyan_skillfish_ppt.c
>> b/drivers/gpu/drm/amd/pm/swsmu/smu11/cyan_skillfish_ppt.c
>> index 3d4c65bc29dc..d98fd06a2574 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/cyan_skillfish_ppt.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/cyan_skillfish_ppt.c
>> @@ -47,7 +47,6 @@
>>   /* unit: MHz */
>>   #define CYAN_SKILLFISH_SCLK_MIN1000
>>   #define CYAN_SKILLFISH_SCLK_MAX2000
>> -#define CYAN_SKILLFISH_SCLK_DEFAULT 1800
>>
>>   /* unit: mV */
>>   #define CYAN_SKILLFISH_VDDC_MIN700
>> @@ -59,6 +58,8 @@ static struct gfx_user_settings {
>>  uint32_t vddc;
>>   } cyan_skillfish_user_settings;
>>
>> +static uint32_t cyan_skillfish_sclk_default;
>> +
>>   #define FEATURE_MASK(feature) (1ULL << feature)
>>   #define SMC_DPM_FEATURE ( \
>>  FEATURE_MASK(FEATURE_FCLK_DPM_BIT)  |   \
>> @@ -365,13 +366,18 @@ static bool cyan_skillfish_is_dpm_running(struct
>smu_context *smu)
>>  return false;
>>
>>  ret = smu_cmn_get_enabled_32_bits_mask(smu, feature_mask, 2);
>> -
>>  if (ret)
>>  return false;
>>
>>  feature_enabled = (uint64_t)feature_mask[0] |
>>  ((uint64_t)feature_mask[1] << 32);
>>
>> +/*
>> + * cyan_skillfish specific, query default sclk instead of hard coding.
>> + */
>> +cyan_skillfish_get_smu_metrics_data(smu, METRICS_CURR_GFXCLK,
>> +_skillfish_sclk_default);
>> +
>
>Maybe add if (!cyan_skillfish_sclk_default) so that it's read only once during 
>driver
>load and not on every suspend/resume.
>
>Reviewed-by: Lijo Lazar 
>
>Thanks,
>Lijo
>
>>  return !!(feature_enabled & SMC_DPM_FEATURE);
>>   }
>>
>> @@ -468,7 +474,7 @@ static int cyan_skillfish_od_edit_dpm_table(struct
>smu_context *smu,
>>  return -EINVAL;
>>  }
>>
>> -cyan_skillfish_user_settings.sclk =
>CYAN_SKILLFISH_SCLK_DEFAULT;
>> +cyan_skillfish_user_settings.sclk = cyan_skillfish_sclk_default;
>>  cyan_skillfish_user_settings.vddc =
>CYAN_SKILLFISH_VDDC_MAGIC;
>>
>>  break;
>>

RE: [PATCH] drm/amdgpu: query default sclk from smu for cyan_skillfish

2021-10-11 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Lazar, Lijo 
>Sent: Monday, October 11, 2021 4:54 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>
>Subject: Re: [PATCH] drm/amdgpu: query default sclk from smu for cyan_skillfish
>
>
>
>On 10/11/2021 2:01 PM, Lang Yu wrote:
>> Query default sclk instead of hard code.
>>
>> Signed-off-by: Lang Yu 
>> ---
>>   .../gpu/drm/amd/pm/swsmu/smu11/cyan_skillfish_ppt.c  | 12 +---
>>   1 file changed, 9 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/cyan_skillfish_ppt.c
>> b/drivers/gpu/drm/amd/pm/swsmu/smu11/cyan_skillfish_ppt.c
>> index 3d4c65bc29dc..d98fd06a2574 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/cyan_skillfish_ppt.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/cyan_skillfish_ppt.c
>> @@ -47,7 +47,6 @@
>>   /* unit: MHz */
>>   #define CYAN_SKILLFISH_SCLK_MIN1000
>>   #define CYAN_SKILLFISH_SCLK_MAX2000
>> -#define CYAN_SKILLFISH_SCLK_DEFAULT 1800
>>
>>   /* unit: mV */
>>   #define CYAN_SKILLFISH_VDDC_MIN700
>> @@ -59,6 +58,8 @@ static struct gfx_user_settings {
>>  uint32_t vddc;
>>   } cyan_skillfish_user_settings;
>>
>> +static uint32_t cyan_skillfish_sclk_default;
>> +
>>   #define FEATURE_MASK(feature) (1ULL << feature)
>>   #define SMC_DPM_FEATURE ( \
>>  FEATURE_MASK(FEATURE_FCLK_DPM_BIT)  |   \
>> @@ -365,13 +366,18 @@ static bool cyan_skillfish_is_dpm_running(struct
>smu_context *smu)
>>  return false;
>>
>>  ret = smu_cmn_get_enabled_32_bits_mask(smu, feature_mask, 2);
>> -
>>  if (ret)
>>  return false;
>>
>>  feature_enabled = (uint64_t)feature_mask[0] |
>>  ((uint64_t)feature_mask[1] << 32);
>>
>> +/*
>> + * cyan_skillfish specific, query default sclk instead of hard coding.
>> + */
>> +cyan_skillfish_get_smu_metrics_data(smu, METRICS_CURR_GFXCLK,
>> +_skillfish_sclk_default);
>> +
>
>Maybe add if (!cyan_skillfish_sclk_default) so that it's read only once during 
>driver
>load and not on every suspend/resume.

Good idea! 

Thanks,
Lang

>Reviewed-by: Lijo Lazar 
>
>Thanks,
>Lijo
>
>>  return !!(feature_enabled & SMC_DPM_FEATURE);
>>   }
>>
>> @@ -468,7 +474,7 @@ static int cyan_skillfish_od_edit_dpm_table(struct
>smu_context *smu,
>>  return -EINVAL;
>>  }
>>
>> -cyan_skillfish_user_settings.sclk =
>CYAN_SKILLFISH_SCLK_DEFAULT;
>> +cyan_skillfish_user_settings.sclk = cyan_skillfish_sclk_default;
>>  cyan_skillfish_user_settings.vddc =
>CYAN_SKILLFISH_VDDC_MAGIC;
>>
>>  break;
>>


RE: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak

2021-09-29 Thread Yu, Lang


>-Original Message-
>From: Kuehling, Felix 
>Sent: Thursday, September 30, 2021 11:26 AM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>
>Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>
>On 2021-09-29 10:38 p.m., Yu, Lang wrote:
>>> -Original Message-
>>> From: Kuehling, Felix 
>>> Sent: Thursday, September 30, 2021 10:28 AM
>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander ; Huang, Ray
>>> 
>>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>>>
>>> On 2021-09-29 10:23 p.m., Yu, Lang wrote:
>>>>> -Original Message-
>>>>> From: Kuehling, Felix 
>>>>> Sent: Thursday, September 30, 2021 9:47 AM
>>>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>>>> Cc: Deucher, Alexander ; Huang, Ray
>>>>> 
>>>>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory
>>>>> leak
>>>>>
>>>>> On 2021-09-29 7:32 p.m., Yu, Lang wrote:
>>>>>> [AMD Official Use Only]
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -Original Message-
>>>>>>> From: Kuehling, Felix 
>>>>>>> Sent: Wednesday, September 29, 2021 11:25 PM
>>>>>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>>>>>> Cc: Deucher, Alexander ; Huang,
>Ray
>>>>>>> 
>>>>>>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory
>>>>>>> leak
>>>>>>>
>>>>>>> Am 2021-09-29 um 4:22 a.m. schrieb Lang Yu:
>>>>>>>> If user doesn't explicitly call kfd_ioctl_destroy_queue to
>>>>>>>> destroy all created queues, when the kfd process is destroyed,
>some queues'
>>>>>>>> cu_mask memory are not freed.
>>>>>>>>
>>>>>>>> To avoid forgetting to free them in some places, free them
>>>>>>>> immediately after use.
>>>>>>>>
>>>>>>>> Signed-off-by: Lang Yu 
>>>>>>>> ---
>>>>>>>> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c   |  8 
>>>>>>>> 
>>>>>>>> drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |
>10
>>>>>>>> --
>>>>>>>> 2 files changed, 8 insertions(+), 10 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>>>> index 4de907f3e66a..5c0e6dcf692a 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>>>> @@ -451,8 +451,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>>>>>>> *filp, struct
>>>>>>> kfd_process *p,
>>>>>>>>retval = copy_from_user(properties.cu_mask, cu_mask_ptr,
>>>>>>> cu_mask_size);
>>>>>>>>if (retval) {
>>>>>>>>pr_debug("Could not copy CU mask from userspace");
>>>>>>>> -  kfree(properties.cu_mask);
>>>>>>>> -  return -EFAULT;
>>>>>>>> +  retval = -EFAULT;
>>>>>>>> +  goto out;
>>>>>>>>}
>>>>>>>>
>>>>>>>>mutex_lock(>mutex);
>>>>>>>> @@ -461,8 +461,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>>>>>>> *filp, struct kfd_process *p,
>>>>>>>>
>>>>>>>>mutex_unlock(>mutex);
>>>>>>>>
>>>>>>>> -  if (retval)
>>>>>>>> -  kfree(properties.cu_mask);
>>>>>>>> +out:
>>>>>>>> +  kfree(properties.cu_mask);
>>>>>>>>
>>>>>>>>return retval;
>>>>>>>> }
>>>>>>>> diff --git
>>>>>>>> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_ma

RE: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak

2021-09-29 Thread Yu, Lang


>-Original Message-
>From: Kuehling, Felix 
>Sent: Thursday, September 30, 2021 10:28 AM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>
>Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>
>On 2021-09-29 10:23 p.m., Yu, Lang wrote:
>>> -Original Message-
>>> From: Kuehling, Felix 
>>> Sent: Thursday, September 30, 2021 9:47 AM
>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander ; Huang, Ray
>>> 
>>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>>>
>>> On 2021-09-29 7:32 p.m., Yu, Lang wrote:
>>>> [AMD Official Use Only]
>>>>
>>>>
>>>>
>>>>> -Original Message-
>>>>> From: Kuehling, Felix 
>>>>> Sent: Wednesday, September 29, 2021 11:25 PM
>>>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>>>> Cc: Deucher, Alexander ; Huang, Ray
>>>>> 
>>>>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory
>>>>> leak
>>>>>
>>>>> Am 2021-09-29 um 4:22 a.m. schrieb Lang Yu:
>>>>>> If user doesn't explicitly call kfd_ioctl_destroy_queue to destroy
>>>>>> all created queues, when the kfd process is destroyed, some queues'
>>>>>> cu_mask memory are not freed.
>>>>>>
>>>>>> To avoid forgetting to free them in some places, free them
>>>>>> immediately after use.
>>>>>>
>>>>>> Signed-off-by: Lang Yu 
>>>>>> ---
>>>>>>drivers/gpu/drm/amd/amdkfd/kfd_chardev.c   |  8 
>>>>>>drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10
>>>>>> --
>>>>>>2 files changed, 8 insertions(+), 10 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>> index 4de907f3e66a..5c0e6dcf692a 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>> @@ -451,8 +451,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>>>>> *filp, struct
>>>>> kfd_process *p,
>>>>>>  retval = copy_from_user(properties.cu_mask, cu_mask_ptr,
>>>>> cu_mask_size);
>>>>>>  if (retval) {
>>>>>>  pr_debug("Could not copy CU mask from userspace");
>>>>>> -kfree(properties.cu_mask);
>>>>>> -return -EFAULT;
>>>>>> +retval = -EFAULT;
>>>>>> +goto out;
>>>>>>  }
>>>>>>
>>>>>>  mutex_lock(>mutex);
>>>>>> @@ -461,8 +461,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>>>>> *filp, struct kfd_process *p,
>>>>>>
>>>>>>  mutex_unlock(>mutex);
>>>>>>
>>>>>> -if (retval)
>>>>>> -kfree(properties.cu_mask);
>>>>>> +out:
>>>>>> +kfree(properties.cu_mask);
>>>>>>
>>>>>>  return retval;
>>>>>>}
>>>>>> diff --git
>>>>>> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>>> index 243dd1efcdbf..4c81d690f31a 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>>> @@ -394,8 +394,6 @@ int pqm_destroy_queue(struct
>>>>> process_queue_manager *pqm, unsigned int qid)
>>>>>>  pdd->qpd.num_gws = 0;
>>>>>>  }
>>>>>>
>>>>>> -kfree(pqn->q->properties.cu_mask);
>>>>>> -pqn->q->properties.cu_mask = NULL;
>>>>>>  uninit_queue(pqn->q);
>>>>>>  }
>>>>>>
>>>>>> @@ -448,16 +446,16 @@ int pqm_set_cu_mask(struct
>>>>> process_queue_mana

RE: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak

2021-09-29 Thread Yu, Lang


>-Original Message-
>From: Kuehling, Felix 
>Sent: Thursday, September 30, 2021 9:47 AM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>
>Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>
>On 2021-09-29 7:32 p.m., Yu, Lang wrote:
>> [AMD Official Use Only]
>>
>>
>>
>>> -Original Message-
>>> From: Kuehling, Felix 
>>> Sent: Wednesday, September 29, 2021 11:25 PM
>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander ; Huang, Ray
>>> 
>>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>>>
>>> Am 2021-09-29 um 4:22 a.m. schrieb Lang Yu:
>>>> If user doesn't explicitly call kfd_ioctl_destroy_queue to destroy
>>>> all created queues, when the kfd process is destroyed, some queues'
>>>> cu_mask memory are not freed.
>>>>
>>>> To avoid forgetting to free them in some places, free them
>>>> immediately after use.
>>>>
>>>> Signed-off-by: Lang Yu 
>>>> ---
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c   |  8 
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10
>>>> --
>>>>   2 files changed, 8 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>> index 4de907f3e66a..5c0e6dcf692a 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>> @@ -451,8 +451,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>>> *filp, struct
>>> kfd_process *p,
>>>>retval = copy_from_user(properties.cu_mask, cu_mask_ptr,
>>> cu_mask_size);
>>>>if (retval) {
>>>>pr_debug("Could not copy CU mask from userspace");
>>>> -  kfree(properties.cu_mask);
>>>> -  return -EFAULT;
>>>> +  retval = -EFAULT;
>>>> +  goto out;
>>>>}
>>>>
>>>>mutex_lock(>mutex);
>>>> @@ -461,8 +461,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>>> *filp, struct kfd_process *p,
>>>>
>>>>mutex_unlock(>mutex);
>>>>
>>>> -  if (retval)
>>>> -  kfree(properties.cu_mask);
>>>> +out:
>>>> +  kfree(properties.cu_mask);
>>>>
>>>>return retval;
>>>>   }
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>> index 243dd1efcdbf..4c81d690f31a 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>> @@ -394,8 +394,6 @@ int pqm_destroy_queue(struct
>>> process_queue_manager *pqm, unsigned int qid)
>>>>pdd->qpd.num_gws = 0;
>>>>}
>>>>
>>>> -  kfree(pqn->q->properties.cu_mask);
>>>> -  pqn->q->properties.cu_mask = NULL;
>>>>uninit_queue(pqn->q);
>>>>}
>>>>
>>>> @@ -448,16 +446,16 @@ int pqm_set_cu_mask(struct
>>> process_queue_manager *pqm, unsigned int qid,
>>>>return -EFAULT;
>>>>}
>>>>
>>>> -  /* Free the old CU mask memory if it is already allocated, then
>>>> -   * allocate memory for the new CU mask.
>>>> -   */
>>>> -  kfree(pqn->q->properties.cu_mask);
>>>> +  WARN_ON_ONCE(pqn->q->properties.cu_mask);
>>>>
>>>>pqn->q->properties.cu_mask_count = p->cu_mask_count;
>>>>pqn->q->properties.cu_mask = p->cu_mask;
>>>>
>>>>retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
>>>>pqn->q);
>>>> +
>>>> +  pqn->q->properties.cu_mask = NULL;
>>>> +
>>> This won't work correctly. We need to save the cu_mask for later.
>>> Otherwise the next time dqm->ops.update_queue is called, for example
>>> in pqm_update_queue or pqm_set_gws, it will wipe out the CU mask in the
>MQD.
>> Let's just return when me

RE: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak

2021-09-29 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Kuehling, Felix 
>Sent: Wednesday, September 29, 2021 11:25 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>
>Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>
>Am 2021-09-29 um 4:22 a.m. schrieb Lang Yu:
>> If user doesn't explicitly call kfd_ioctl_destroy_queue to destroy all
>> created queues, when the kfd process is destroyed, some queues'
>> cu_mask memory are not freed.
>>
>> To avoid forgetting to free them in some places, free them immediately
>> after use.
>>
>> Signed-off-by: Lang Yu 
>> ---
>>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c   |  8 
>>  drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10
>> --
>>  2 files changed, 8 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index 4de907f3e66a..5c0e6dcf692a 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -451,8 +451,8 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, 
>> struct
>kfd_process *p,
>>  retval = copy_from_user(properties.cu_mask, cu_mask_ptr,
>cu_mask_size);
>>  if (retval) {
>>  pr_debug("Could not copy CU mask from userspace");
>> -kfree(properties.cu_mask);
>> -return -EFAULT;
>> +retval = -EFAULT;
>> +goto out;
>>  }
>>
>>  mutex_lock(>mutex);
>> @@ -461,8 +461,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>> *filp, struct kfd_process *p,
>>
>>  mutex_unlock(>mutex);
>>
>> -if (retval)
>> -kfree(properties.cu_mask);
>> +out:
>> +kfree(properties.cu_mask);
>>
>>  return retval;
>>  }
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> index 243dd1efcdbf..4c81d690f31a 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> @@ -394,8 +394,6 @@ int pqm_destroy_queue(struct
>process_queue_manager *pqm, unsigned int qid)
>>  pdd->qpd.num_gws = 0;
>>  }
>>
>> -kfree(pqn->q->properties.cu_mask);
>> -pqn->q->properties.cu_mask = NULL;
>>  uninit_queue(pqn->q);
>>  }
>>
>> @@ -448,16 +446,16 @@ int pqm_set_cu_mask(struct
>process_queue_manager *pqm, unsigned int qid,
>>  return -EFAULT;
>>  }
>>
>> -/* Free the old CU mask memory if it is already allocated, then
>> - * allocate memory for the new CU mask.
>> - */
>> -kfree(pqn->q->properties.cu_mask);
>> +WARN_ON_ONCE(pqn->q->properties.cu_mask);
>>
>>  pqn->q->properties.cu_mask_count = p->cu_mask_count;
>>  pqn->q->properties.cu_mask = p->cu_mask;
>>
>>  retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
>>  pqn->q);
>> +
>> +pqn->q->properties.cu_mask = NULL;
>> +
>
>This won't work correctly. We need to save the cu_mask for later.
>Otherwise the next time dqm->ops.update_queue is called, for example in
>pqm_update_queue or pqm_set_gws, it will wipe out the CU mask in the MQD.

Let's just return when meeting a null cu_mask in update_cu_mask() to avoid that.
Like following,

static void update_cu_mask(struct mqd_manager *mm, void *mqd,
   struct queue_properties *q)
{
struct v10_compute_mqd *m;
uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */

if (!q->cu_mask || q->cu_mask_count == 0)
return;
..
}

Is this fine with you? Thanks!

Regards,
Lang
 
>Regards,
>  Felix
>
>
>>  if (retval != 0)
>>  return retval;
>>


RE: [PATCH] drm/kfd: fix ttm_bo_release warning

2021-09-24 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Christian König 
>Sent: Friday, September 24, 2021 2:37 PM
>To: Yu, Lang ; Koenig, Christian
>; amd-gfx@lists.freedesktop.org
>Cc: Kuehling, Felix ; Huang, Ray
>
>Subject: Re: [PATCH] drm/kfd: fix ttm_bo_release warning
>
>Am 24.09.21 um 08:34 schrieb Yu, Lang:
>> [AMD Official Use Only]
>>
>>
>>
>>> -Original Message-
>>> From: Koenig, Christian 
>>> Sent: Friday, September 24, 2021 1:54 PM
>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>> Cc: Kuehling, Felix ; Huang, Ray
>>> 
>>> Subject: Re: [PATCH] drm/kfd: fix ttm_bo_release warning
>>>
>>>
>>> Am 24.09.21 um 07:50 schrieb Yu, Lang:
>>>> [AMD Official Use Only]
>>>>> [SNIP]
>>>>>>>> Hi Christian,
>>>>>>>>
>>>>>>>> Thanks for your explanation and advice. I got your point.
>>>>>>>> Actually, these BOs are allocated and pinned during a kfd process
>lifecycle.
>>>>>>>> I will try to add a flag into struct kgd_mem to indicate which
>>>>>>>> BO is pined and should be unpinned. Which will make
>>>>>>>> amdgpu_bo_pin/amdgpu_bo_unpin calls balanced. Thanks!
>>>>>>> That isn't to much better. The real solution would be to unpin
>>>>>>> them when the kfd process is destroyed.
>>>>>> Yes, will unpin them when the kfd process is destroyed.
>>>>>> But we should indicate which BO is pinned and should be unpinned. Right?
>>>>> Well not with a flag or something like that.
>>>>>
>>>>> The knowledge which BO is pinned and needs to be unpinned should
>>>>> come from the control logic and not be papered over by some general
>handling.
>>>>> That's the background why we have removed the general handling for
>>>>> this from TTM in the first place.
>>>>>
>>>>> In other words when you need to pin a BO because it is kmapped it
>>>>> should be unpinned when it is kunmapped and if you don't kunmap at
>>>>> all then there is something wrong with the code structure from a
>>>>> higher level
>>> point of view.
>>>> Yes, this function "amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel" did a
>>>> kmap, but without a kunmap when the kfd process is destroyed. The
>>>> flag
>>> actually indicates kmap/kunmap.
>>>
>>> Well that's the wrong approach then. I mean you need to have the BO
>>> reference and the mapped pointer somewhere, don't you?
>>>
>>> How do you clean those up?
>> They are respectively cleaned by " kfd_process_device_free_bos " and "
>kfd_process_destroy_pdds".
>> Let me put the code here. Thanks!
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index ec028cf963f5..d65b3bf13fd8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -81,6 +81,7 @@ struct kgd_mem {
>>
>>  bool aql_queue;
>>  bool is_imported;
>> +   bool is_mapped_to_kernel;
>
>Yeah, that is exactly what you absolutely should NOT do.
>
>>   };
>>
>>   /* KFD Memory Eviction */
>> @@ -278,6 +279,8 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
>>  struct kgd_dev *kgd, struct kgd_mem *mem, bool intr);
>>   int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
>>  struct kgd_mem *mem, void **kptr, uint64_t *size);
>
>The real question is who is calling this function here?

Currently  there are 3 places called function 
"amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel" to kmap a BO. 

1, kmap a ptr for pdd->qpd->cwsr_kaddr
Call stack:
amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel
kfd_process_alloc_gpuvm
kfd_process_device_init_cwsr_dgpu
kfd_process_device_init_vm
kfd_ioctl_acquire_vm

2, kmap a ptr for pdd->qpd->ib_kaddr
Call stack:
amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel
kfd_process_alloc_gpuvm
kfd_process_device_reserve_ib_mem
kfd_process_device_init_vm
kfd_ioctl_acquire_vm

3, kmap a ptr for p->signal_page->kernel_address
Call stack:
amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel
kfd_ioctl_create_event

The problem is these kmaped BOs were not kunmaped properly 
when the kfd process is destroyed.

Regards,
Lang

>> +void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_dev
>*kgd,
>> 

RE: [PATCH] drm/kfd: fix ttm_bo_release warning

2021-09-24 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Koenig, Christian 
>Sent: Friday, September 24, 2021 1:54 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Kuehling, Felix ; Huang, Ray
>
>Subject: Re: [PATCH] drm/kfd: fix ttm_bo_release warning
>
>
>Am 24.09.21 um 07:50 schrieb Yu, Lang:
>> [AMD Official Use Only]
>>> [SNIP]
>>>>>> Hi Christian,
>>>>>>
>>>>>> Thanks for your explanation and advice. I got your point.
>>>>>> Actually, these BOs are allocated and pinned during a kfd process 
>>>>>> lifecycle.
>>>>>> I will try to add a flag into struct kgd_mem to indicate which BO
>>>>>> is pined and should be unpinned. Which will make
>>>>>> amdgpu_bo_pin/amdgpu_bo_unpin calls balanced. Thanks!
>>>>> That isn't to much better. The real solution would be to unpin them
>>>>> when the kfd process is destroyed.
>>>> Yes, will unpin them when the kfd process is destroyed.
>>>> But we should indicate which BO is pinned and should be unpinned. Right?
>>> Well not with a flag or something like that.
>>>
>>> The knowledge which BO is pinned and needs to be unpinned should come
>>> from the control logic and not be papered over by some general handling.
>>> That's the background why we have removed the general handling for
>>> this from TTM in the first place.
>>>
>>> In other words when you need to pin a BO because it is kmapped it
>>> should be unpinned when it is kunmapped and if you don't kunmap at
>>> all then there is something wrong with the code structure from a higher 
>>> level
>point of view.
>> Yes, this function "amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel" did a
>> kmap, but without a kunmap when the kfd process is destroyed. The flag
>actually indicates kmap/kunmap.
>
>Well that's the wrong approach then. I mean you need to have the BO reference
>and the mapped pointer somewhere, don't you?
>
>How do you clean those up?

They are respectively cleaned by " kfd_process_device_free_bos " and " 
kfd_process_destroy_pdds".
Let me put the code here. Thanks!

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index ec028cf963f5..d65b3bf13fd8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -81,6 +81,7 @@ struct kgd_mem {

bool aql_queue;
bool is_imported;
+   bool is_mapped_to_kernel;
 };

 /* KFD Memory Eviction */
@@ -278,6 +279,8 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
struct kgd_dev *kgd, struct kgd_mem *mem, bool intr);
 int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
struct kgd_mem *mem, void **kptr, uint64_t *size);
+void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_dev *kgd,
+   struct kgd_mem *mem);
 int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
struct dma_fence **ef);
 int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 2d6b2d77b738..45ccbe9f63ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1857,6 +1857,8 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct 
kgd_dev *kgd,

amdgpu_bo_unreserve(bo);

+   mem->is_mapped_to_kernel = true;
+
mutex_unlock(>process_info->lock);
return 0;

@@ -1870,6 +1872,20 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct 
kgd_dev *kgd,
return ret;
 }

+void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_dev *kgd, struct 
kgd_mem *mem)
+{
+   struct amdgpu_bo *bo = mem->bo;
+
+   if (!mem->is_mapped_to_kernel)
+   return;
+
+   amdgpu_bo_reserve(bo, true);
+   amdgpu_bo_kunmap(bo);
+   amdgpu_bo_unpin(bo);
+   amdgpu_bo_unreserve(bo);
+   mem->is_mapped_to_kernel = false;
+}
+
 int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
  struct kfd_vm_fault_info *mem)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 21ec8a18cad2..f5506b153aed 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -941,6 +941,8 @@ static void kfd_process_device_free_bos(struct 
kfd_process_device *pdd)
peer_pdd->dev->kgd, mem, peer_pdd->drm_priv);
}

+   amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(pdd->dev->kgd, 
mem);
+
amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->kgd, mem,
   pdd->drm_priv, NULL);
kfd_process_device_remove_obj_handle(pdd, id);

Regards,
Lang

>Regards,
>Christian.


RE: [PATCH] drm/kfd: fix ttm_bo_release warning

2021-09-23 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Koenig, Christian 
>Sent: Friday, September 24, 2021 1:42 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Kuehling, Felix ; Huang, Ray
>
>Subject: Re: [PATCH] drm/kfd: fix ttm_bo_release warning
>
>Am 24.09.21 um 07:35 schrieb Yu, Lang:
>> [AMD Official Use Only]
>>> -Original Message-
>>> From: Koenig, Christian 
>>> Sent: Thursday, September 23, 2021 10:52 PM
>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>> Cc: Kuehling, Felix ; Huang, Ray
>>> 
>>> Subject: Re: [PATCH] drm/kfd: fix ttm_bo_release warning
>>>
>>> Am 23.09.21 um 16:24 schrieb Yu, Lang:
>>>> [AMD Official Use Only]
>>>>> -Original Message-
>>>>> From: Koenig, Christian 
>>>>> Sent: Thursday, September 23, 2021 8:24 PM
>>>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>>>> Cc: Kuehling, Felix ; Christian König
>>>>> ; Huang, Ray 
>>>>> Subject: Re: [PATCH] drm/kfd: fix ttm_bo_release warning
>>>>>
>>>>> Am 23.09.21 um 14:09 schrieb Yu, Lang:
>>>>>> [AMD Official Use Only]
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -Original Message-
>>>>>>> From: Koenig, Christian 
>>>>>>> Sent: Thursday, September 23, 2021 7:40 PM
>>>>>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>>>>>> Cc: Kuehling, Felix ; Christian König
>>>>>>> ; Huang, Ray 
>>>>>>> Subject: Re: [PATCH] drm/kfd: fix ttm_bo_release warning
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> Am 23.09.21 um 11:44 schrieb Lang Yu:
>>>>>>>> If a BO is pinned, unpin it before freeing it.
>>>>>>>>
>>>>>>>> Call Trace:
>>>>>>>>ttm_bo_put+0x30/0x50 [ttm]
>>>>>>>>amdgpu_bo_unref+0x1e/0x30 [amdgpu]
>>>>>>>>amdgpu_gem_object_free+0x34/0x50 [amdgpu]
>>>>>>>>drm_gem_object_free+0x1d/0x30 [drm]
>>>>>>>>amdgpu_amdkfd_gpuvm_free_memory_of_gpu+0x31f/0x3a0
>[amdgpu]
>>>>>>>>kfd_process_device_free_bos+0xa3/0xf0 [amdgpu]
>>>>>>>>kfd_process_wq_release+0x224/0x2e0 [amdgpu]
>>>>>>>>process_one_work+0x220/0x3c0
>>>>>>>>worker_thread+0x4d/0x3f0
>>>>>>>>kthread+0x114/0x150
>>>>>>>>process_one_work+0x3c0/0x3c0
>>>>>>>>kthread_park+0x90/0x90
>>>>>>>>ret_from_fork+0x22/0x30
>>>>>>>>
>>>>>>>> Signed-off-by: Lang Yu 
>>>>>>>> ---
>>>>>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 +++
>>>>>>>>  1 file changed, 3 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>>>>>> index 2d6b2d77b738..7e693b064072 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>>>>>> @@ -1567,6 +1567,9 @@ int
>>>>> amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
>>>>>>>>pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va,
>>>>>>>>mem->va + bo_size * (1 + mem->aql_queue));
>>>>>>>>
>>>>>>>> +  if (mem->bo->tbo.pin_count)
>>>>>>>> +  amdgpu_bo_unpin(mem->bo);
>>>>>>>> +
>>>>>>> NAK, using mem->bo->tbo.pin_count like this is illegal.
>>>>>> I didn't  get your point. I referred to function-"void
>>>>>> amdgpu_bo_unpin(struct amdgpu_bo *bo)", which uses it like this.
>>>>> What amdgpu_bo_unpin() does is the following:
>>>>>
>>>>>       ttm_bo_unpin(>tbo);
>>>>>       if (bo->tbo.pin_count)
>>>>>       return;
>>>>> 
>>>>>
>>>>> I

RE: [PATCH] drm/kfd: fix ttm_bo_release warning

2021-09-23 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Koenig, Christian 
>Sent: Thursday, September 23, 2021 10:52 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Kuehling, Felix ; Huang, Ray
>
>Subject: Re: [PATCH] drm/kfd: fix ttm_bo_release warning
>
>Am 23.09.21 um 16:24 schrieb Yu, Lang:
>> [AMD Official Use Only]
>>> -Original Message-
>>> From: Koenig, Christian 
>>> Sent: Thursday, September 23, 2021 8:24 PM
>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>> Cc: Kuehling, Felix ; Christian König
>>> ; Huang, Ray 
>>> Subject: Re: [PATCH] drm/kfd: fix ttm_bo_release warning
>>>
>>> Am 23.09.21 um 14:09 schrieb Yu, Lang:
>>>> [AMD Official Use Only]
>>>>
>>>>
>>>>
>>>>> -Original Message-
>>>>> From: Koenig, Christian 
>>>>> Sent: Thursday, September 23, 2021 7:40 PM
>>>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>>>> Cc: Kuehling, Felix ; Christian König
>>>>> ; Huang, Ray 
>>>>> Subject: Re: [PATCH] drm/kfd: fix ttm_bo_release warning
>>>>>
>>>>>
>>>>>
>>>>> Am 23.09.21 um 11:44 schrieb Lang Yu:
>>>>>> If a BO is pinned, unpin it before freeing it.
>>>>>>
>>>>>> Call Trace:
>>>>>>  ttm_bo_put+0x30/0x50 [ttm]
>>>>>>  amdgpu_bo_unref+0x1e/0x30 [amdgpu]
>>>>>>  amdgpu_gem_object_free+0x34/0x50 [amdgpu]
>>>>>>  drm_gem_object_free+0x1d/0x30 [drm]
>>>>>>  amdgpu_amdkfd_gpuvm_free_memory_of_gpu+0x31f/0x3a0 [amdgpu]
>>>>>>  kfd_process_device_free_bos+0xa3/0xf0 [amdgpu]
>>>>>>  kfd_process_wq_release+0x224/0x2e0 [amdgpu]
>>>>>>  process_one_work+0x220/0x3c0
>>>>>>  worker_thread+0x4d/0x3f0
>>>>>>  kthread+0x114/0x150
>>>>>>  process_one_work+0x3c0/0x3c0
>>>>>>  kthread_park+0x90/0x90
>>>>>>  ret_from_fork+0x22/0x30
>>>>>>
>>>>>> Signed-off-by: Lang Yu 
>>>>>> ---
>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 +++
>>>>>> 1 file changed, 3 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>>>> index 2d6b2d77b738..7e693b064072 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>>>> @@ -1567,6 +1567,9 @@ int
>>> amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
>>>>>>  pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va,
>>>>>>  mem->va + bo_size * (1 + mem->aql_queue));
>>>>>>
>>>>>> +if (mem->bo->tbo.pin_count)
>>>>>> +amdgpu_bo_unpin(mem->bo);
>>>>>> +
>>>>> NAK, using mem->bo->tbo.pin_count like this is illegal.
>>>> I didn't  get your point. I referred to function-"void
>>>> amdgpu_bo_unpin(struct amdgpu_bo *bo)", which uses it like this.
>>> What amdgpu_bo_unpin() does is the following:
>>>
>>>      ttm_bo_unpin(>tbo);
>>>      if (bo->tbo.pin_count)
>>>      return;
>>> 
>>>
>>> In other words we take further actions based on whether the buffer is
>>> still pinned or not after an unpin operation.
>>>
>>> What you try to do here is unpinning the BO when it is pinned
>>> independent if somebody else or our code has pinned it before.
>> Hi Christian,
>>
>> Thanks for your explanation and advice. I got your point.
>> Actually, these BOs are allocated and pinned during a kfd process lifecycle.
>> I will try to add a flag into struct kgd_mem to indicate which BO is
>> pinned and should be unpinned. Which will make
>> amdgpu_bo_pin/amdgpu_bo_unpin calls balanced. Thanks!
>
>That isn't to much better. The real solution would be to unpin them when the 
>kfd
>process is destroyed.

Yes, will unpin them when the kfd process is destroyed.
But we should indicate which BO is pinned and should be unpinned. Right?

Regards,
Lang
 
>Regards,
>Christian.
>
>>
>> Regards,
>> Lang
>>> That can lead to all kind of problems and is clearly illegal.
>>>
>>>>> Where was the BO pinned in the first place?
>>>> I found two places:
>>>>
>>>>ret = kfd_process_alloc_gpuvm(pdd, qpd->ib_base, PAGE_SIZE, flags,
>>>>  );
>>>>
>>>>ret = kfd_process_alloc_gpuvm(pdd, qpd->cwsr_base,
>>>>  KFD_CWSR_TBA_TMA_SIZE, flags, );
>>> Well then you need to figure out where that memory is freed again and
>>> place the unpin appropriately.
>>>
>>> General rule of thumb is that calls to amdgpu_bo_pin/amdgpu_bo_unpin
>>> should be balanced.
>>>
>>> Regards,
>>> Christian.
>>>
>>>> Regards,
>>>> Lang
>>>>
>>>>> Christian.
>>>>>
>>>>>>  ret = unreserve_bo_and_vms(, false, false);
>>>>>>
>>>>>>  /* Remove from VM internal data structures */


RE: [PATCH] drm/kfd: fix ttm_bo_release warning

2021-09-23 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Kuehling, Felix 
>Sent: Friday, September 24, 2021 12:21 AM
>To: Yu, Lang ; Koenig, Christian
>; amd-gfx@lists.freedesktop.org
>Cc: Huang, Ray 
>Subject: Re: [PATCH] drm/kfd: fix ttm_bo_release warning
>
>
>On 2021-09-23 8:09 a.m., Yu, Lang wrote:
>> [AMD Official Use Only]
>>
>>
>>
>>> -Original Message-
>>> From: Koenig, Christian 
>>> Sent: Thursday, September 23, 2021 7:40 PM
>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>> Cc: Kuehling, Felix ; Christian König
>>> ; Huang, Ray 
>>> Subject: Re: [PATCH] drm/kfd: fix ttm_bo_release warning
>>>
>>>
>>>
>>> Am 23.09.21 um 11:44 schrieb Lang Yu:
>>>> If a BO is pinned, unpin it before freeing it.
>>>>
>>>> Call Trace:
>>>>ttm_bo_put+0x30/0x50 [ttm]
>>>>amdgpu_bo_unref+0x1e/0x30 [amdgpu]
>>>>amdgpu_gem_object_free+0x34/0x50 [amdgpu]
>>>>drm_gem_object_free+0x1d/0x30 [drm]
>>>>amdgpu_amdkfd_gpuvm_free_memory_of_gpu+0x31f/0x3a0 [amdgpu]
>>>>kfd_process_device_free_bos+0xa3/0xf0 [amdgpu]
>>>>kfd_process_wq_release+0x224/0x2e0 [amdgpu]
>>>>process_one_work+0x220/0x3c0
>>>>worker_thread+0x4d/0x3f0
>>>>kthread+0x114/0x150
>>>>process_one_work+0x3c0/0x3c0
>>>>kthread_park+0x90/0x90
>>>>ret_from_fork+0x22/0x30
>>>>
>>>> Signed-off-by: Lang Yu 
>>>> ---
>>>>drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 +++
>>>>1 file changed, 3 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> index 2d6b2d77b738..7e693b064072 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> @@ -1567,6 +1567,9 @@ int
>amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
>>>>pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va,
>>>>mem->va + bo_size * (1 + mem->aql_queue));
>>>>
>>>> +  if (mem->bo->tbo.pin_count)
>>>> +  amdgpu_bo_unpin(mem->bo);
>>>> +
>>> NAK, using mem->bo->tbo.pin_count like this is illegal.
>> I didn't  get your point. I referred to function-"void
>> amdgpu_bo_unpin(struct amdgpu_bo *bo)", which uses it like this.
>>
>>> Where was the BO pinned in the first place?
>> I found two places:
>>
>>  ret = kfd_process_alloc_gpuvm(pdd, qpd->ib_base, PAGE_SIZE, flags,
>>);
>>
>>  ret = kfd_process_alloc_gpuvm(pdd, qpd->cwsr_base,
>>KFD_CWSR_TBA_TMA_SIZE, flags, );
>
>These two allocations are created by the kernel mode driver. There is another
>case where a user-allocated BO can get pinned because we need to kmap it (in
>kfd_ioctl_create_event).
>
>Regards,
>   Felix

Yes, these BOs will not be freed until a kfd process is destroyed.
I will make a v2 patch, please help review. Thanks!

Regards,
Lang 
>
>> Regards,
>> Lang
>>
>>> Christian.
>>>
>>>>ret = unreserve_bo_and_vms(, false, false);
>>>>
>>>>/* Remove from VM internal data structures */

RE: [PATCH] drm/kfd: fix ttm_bo_release warning

2021-09-23 Thread Yu, Lang
[AMD Official Use Only]


>-Original Message-
>From: Koenig, Christian 
>Sent: Thursday, September 23, 2021 8:24 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Kuehling, Felix ; Christian König
>; Huang, Ray 
>Subject: Re: [PATCH] drm/kfd: fix ttm_bo_release warning
>
>Am 23.09.21 um 14:09 schrieb Yu, Lang:
>> [AMD Official Use Only]
>>
>>
>>
>>> -Original Message-
>>> From: Koenig, Christian 
>>> Sent: Thursday, September 23, 2021 7:40 PM
>>> To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>>> Cc: Kuehling, Felix ; Christian König
>>> ; Huang, Ray 
>>> Subject: Re: [PATCH] drm/kfd: fix ttm_bo_release warning
>>>
>>>
>>>
>>> Am 23.09.21 um 11:44 schrieb Lang Yu:
>>>> If a BO is pinned, unpin it before freeing it.
>>>>
>>>> Call Trace:
>>>>ttm_bo_put+0x30/0x50 [ttm]
>>>>amdgpu_bo_unref+0x1e/0x30 [amdgpu]
>>>>amdgpu_gem_object_free+0x34/0x50 [amdgpu]
>>>>drm_gem_object_free+0x1d/0x30 [drm]
>>>>amdgpu_amdkfd_gpuvm_free_memory_of_gpu+0x31f/0x3a0 [amdgpu]
>>>>kfd_process_device_free_bos+0xa3/0xf0 [amdgpu]
>>>>kfd_process_wq_release+0x224/0x2e0 [amdgpu]
>>>>process_one_work+0x220/0x3c0
>>>>worker_thread+0x4d/0x3f0
>>>>kthread+0x114/0x150
>>>>process_one_work+0x3c0/0x3c0
>>>>kthread_park+0x90/0x90
>>>>ret_from_fork+0x22/0x30
>>>>
>>>> Signed-off-by: Lang Yu 
>>>> ---
>>>>drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 +++
>>>>1 file changed, 3 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> index 2d6b2d77b738..7e693b064072 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> @@ -1567,6 +1567,9 @@ int
>amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
>>>>pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va,
>>>>mem->va + bo_size * (1 + mem->aql_queue));
>>>>
>>>> +  if (mem->bo->tbo.pin_count)
>>>> +  amdgpu_bo_unpin(mem->bo);
>>>> +
>>> NAK, using mem->bo->tbo.pin_count like this is illegal.
>> I didn't  get your point. I referred to function-"void
>> amdgpu_bo_unpin(struct amdgpu_bo *bo)", which uses it like this.
>
>What amdgpu_bo_unpin() does is the following:
>
>     ttm_bo_unpin(>tbo);
>     if (bo->tbo.pin_count)
>     return;
>
>
>In other words we take further actions based on whether the buffer is still
>pinned or
>not after an unpin operation.
>
>What you try to do here is unpinning the BO when it is pinned independent if
>somebody else or our code has pinned it before.

Hi Christian,

Thanks for your explanation and advice. I got your point.
Actually, these BOs are allocated and pinned during a kfd process lifecycle.
I will try to add a flag into struct kgd_mem to indicate which BO is pinned 
and should be unpinned. Which will make amdgpu_bo_pin/amdgpu_bo_unpin 
calls balanced. Thanks!

Regards,
Lang
>
>That can lead to all kind of problems and is clearly illegal.
>
>>> Where was the BO pinned in the first place?
>> I found two places:
>>
>>  ret = kfd_process_alloc_gpuvm(pdd, qpd->ib_base, PAGE_SIZE, flags,
>>);
>>
>>  ret = kfd_process_alloc_gpuvm(pdd, qpd->cwsr_base,
>>KFD_CWSR_TBA_TMA_SIZE, flags, );
>
>Well then you need to figure out where that memory is freed again and place the
>unpin appropriately.
>
>General rule of thumb is that calls to amdgpu_bo_pin/amdgpu_bo_unpin should
>be balanced.
>
>Regards,
>Christian.
>
>> Regards,
>> Lang
>>
>>> Christian.
>>>
>>>>ret = unreserve_bo_and_vms(, false, false);
>>>>
>>>>/* Remove from VM internal data structures */


RE: [PATCH] drm/kfd: fix ttm_bo_release warning

2021-09-23 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Koenig, Christian 
>Sent: Thursday, September 23, 2021 7:40 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Kuehling, Felix ; Christian König
>; Huang, Ray 
>Subject: Re: [PATCH] drm/kfd: fix ttm_bo_release warning
>
>
>
>Am 23.09.21 um 11:44 schrieb Lang Yu:
>> If a BO is pinned, unpin it before freeing it.
>>
>> Call Trace:
>>  ttm_bo_put+0x30/0x50 [ttm]
>>  amdgpu_bo_unref+0x1e/0x30 [amdgpu]
>>  amdgpu_gem_object_free+0x34/0x50 [amdgpu]
>>  drm_gem_object_free+0x1d/0x30 [drm]
>>  amdgpu_amdkfd_gpuvm_free_memory_of_gpu+0x31f/0x3a0 [amdgpu]
>>  kfd_process_device_free_bos+0xa3/0xf0 [amdgpu]
>>  kfd_process_wq_release+0x224/0x2e0 [amdgpu]
>>  process_one_work+0x220/0x3c0
>>  worker_thread+0x4d/0x3f0
>>  kthread+0x114/0x150
>>  process_one_work+0x3c0/0x3c0
>>  kthread_park+0x90/0x90
>>  ret_from_fork+0x22/0x30
>>
>> Signed-off-by: Lang Yu 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 +++
>>   1 file changed, 3 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 2d6b2d77b738..7e693b064072 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -1567,6 +1567,9 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
>>  pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va,
>>  mem->va + bo_size * (1 + mem->aql_queue));
>>
>> +if (mem->bo->tbo.pin_count)
>> +amdgpu_bo_unpin(mem->bo);
>> +
>
>NAK, using mem->bo->tbo.pin_count like this is illegal.

I didn't  get your point. I referred to function-"void amdgpu_bo_unpin(struct 
amdgpu_bo *bo)",
which uses it like this.

>Where was the BO pinned in the first place?

I found two places:

ret = kfd_process_alloc_gpuvm(pdd, qpd->ib_base, PAGE_SIZE, flags,
  );

ret = kfd_process_alloc_gpuvm(pdd, qpd->cwsr_base,
  KFD_CWSR_TBA_TMA_SIZE, flags, );
Regards,
Lang

>Christian.
>
>>  ret = unreserve_bo_and_vms(, false, false);
>>
>>  /* Remove from VM internal data structures */


RE: [PATCH] drm/amdgpu: fix sysfs_emit/sysfs_emit_at warnings

2021-09-08 Thread Yu, Lang
[AMD Official Use Only]

So the final decision is rollback to scnprintf().
If we can define our own helper functions like sysfs_emit/sysfs_emit_at
but without page boundary aligned limitation to make life easier?

Regards,
Lang

From: Powell, Darren 
Sent: Thursday, September 9, 2021 6:18 AM
To: Christian König ; Lazar, Lijo 
; Yu, Lang ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Huang, Ray 
; Tian Tao 
Subject: Re: [PATCH] drm/amdgpu: fix sysfs_emit/sysfs_emit_at warnings


[AMD Official Use Only]




From: Christian König 
mailto:ckoenig.leichtzumer...@gmail.com>>
Sent: Wednesday, September 8, 2021 8:43 AM
To: Lazar, Lijo mailto:lijo.la...@amd.com>>; Yu, Lang 
mailto:lang...@amd.com>>; 
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> 
mailto:amd-gfx@lists.freedesktop.org>>
Cc: Deucher, Alexander 
mailto:alexander.deuc...@amd.com>>; Huang, Ray 
mailto:ray.hu...@amd.com>>; Tian Tao 
mailto:tiant...@hisilicon.com>>; Powell, Darren 
mailto:darren.pow...@amd.com>>
Subject: Re: [PATCH] drm/amdgpu: fix sysfs_emit/sysfs_emit_at warnings

Am 08.09.21 um 12:22 schrieb Lazar, Lijo:
> On 9/8/2021 3:08 PM, Christian König wrote:
>> Am 08.09.21 um 11:29 schrieb Lazar, Lijo:
>>> On 9/8/2021 2:32 PM, Yu, Lang wrote:
>>>> [AMD Official Use Only]
>>>>> -Original Message-
>>>>> From: Lazar, Lijo mailto:lijo.la...@amd.com>>
>>>>> Sent: Wednesday, September 8, 2021 4:55 PM
>>>>> To: Yu, Lang mailto:lang...@amd.com>>; Christian König
>>>>> mailto:ckoenig.leichtzumer...@gmail.com>>;
>>>>>  amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
>>>>> Cc: Deucher, Alexander 
>>>>> mailto:alexander.deuc...@amd.com>>; Huang, Ray
>>>>> mailto:ray.hu...@amd.com>>; Tian Tao 
>>>>> mailto:tiant...@hisilicon.com>>
>>>>> Subject: Re: [PATCH] drm/amdgpu: fix sysfs_emit/sysfs_emit_at
>>>>> warnings
>>>>>
>>>>>
>>>>>
>>>>> On 9/8/2021 1:14 PM, Yu, Lang wrote:
>>>>>> [AMD Official Use Only]
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -Original Message-
>>>>>>> From: Lazar, Lijo mailto:lijo.la...@amd.com>>
>>>>>>> Sent: Wednesday, September 8, 2021 3:36 PM
>>>>>>> To: Christian König 
>>>>>>> mailto:ckoenig.leichtzumer...@gmail.com>>;
>>>>>>>  Yu, Lang
>>>>>>> mailto:lang...@amd.com>>; 
>>>>>>> amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
>>>>>>> Cc: Deucher, Alexander 
>>>>>>> mailto:alexander.deuc...@amd.com>>; Huang, 
>>>>>>> Ray
>>>>>>> mailto:ray.hu...@amd.com>>; Tian Tao 
>>>>>>> mailto:tiant...@hisilicon.com>>
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: fix sysfs_emit/sysfs_emit_at
>>>>>>> warnings
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 9/8/2021 12:07 PM, Christian König wrote:
>>>>>>>> Am 08.09.21 um 07:56 schrieb Lang Yu:
>>>>>>>>> sysfs_emit and sysfs_emit_at require a page boundary aligned buf
>>>>>>>>> address. Make them happy!
>>>>>>>>>
>>>>>>>>> Warning Log:
>>>>>>>>> [  492.545174] invalid sysfs_emit_at: buf:f19bdfde at:0 [
>>>>>>>>> 492.546416] WARNING: CPU: 7 PID: 1304 at fs/sysfs/file.c:765
>>>>>>>>> sysfs_emit_at+0x4a/0xa0
>>>>>>>>> [  492.654805] Call Trace:
>>>>>>>>> [  492.655353]  ? smu_cmn_get_metrics_table+0x40/0x50 [amdgpu] [
>>>>>>>>> 492.656780]  vangogh_print_clk_levels+0x369/0x410 [amdgpu] [
>>>>>>>>> 492.658245] vangogh_common_print_clk_levels+0x77/0x80 [amdgpu] [
>>>>>>>>> 492.659733]  ? preempt_schedule_common+0x18/0x30 [ 492.660713]
>>>>>>>>> smu_print_ppclk_levels+0x65/0x90 [amdgpu] [ 492.662107]
>>>>>>>>> amdgpu_get_pp_od_clk_voltage+0x13d/0x190 [amdgpu] [ 492.663620]
>>>>>>>>> dev_attr_show+0x1d/0x40
>>>>>>>>
>>>>>>>> Mhm, that at least partially doesn't looks li

RE: [PATCH] drm/amdgpu: fix sysfs_emit/sysfs_emit_at warnings

2021-09-08 Thread Yu, Lang
[AMD Official Use Only]

Or try to send a patch to remove page boundary aligned limitation. Any 
considerations? Thanks.

int sysfs_emit(char *buf, const char *fmt, ...)
 {
va_list args;
-   int len;
+   int len, offset;

-   if (WARN(!buf || offset_in_page(buf),
+   offset = offset_in_page(buf);
+
+   if (WARN(!buf,
 "invalid sysfs_emit: buf:%p\n", buf))
return 0;

va_start(args, fmt);
-   len = vscnprintf(buf, PAGE_SIZE, fmt, args);
+   len = vscnprintf(buf, PAGE_SIZE - offset, fmt, args);
va_end(args);

return len;
@@ -760,14 +762,16 @@ EXPORT_SYMBOL_GPL(sysfs_emit);
 int sysfs_emit_at(char *buf, int at, const char *fmt, ...)
 {
va_list args;
-   int len;
+   int len, offset;
+
+   offset = offset_in_page(buf);

-   if (WARN(!buf || offset_in_page(buf) || at < 0 || at >= PAGE_SIZE,
+   if (WARN(!buf || at < 0 || at + offset >= PAGE_SIZE,
 "invalid sysfs_emit_at: buf:%p at:%d\n", buf, at))
return 0;

va_start(args, fmt);
-   len = vscnprintf(buf + at, PAGE_SIZE - at, fmt, args);
+   len = vscnprintf(buf + at, PAGE_SIZE - at - offset, fmt, args);
va_end(args);

return len;

Regards,
Lang

>-Original Message-
>From: Lazar, Lijo 
>Sent: Wednesday, September 8, 2021 6:22 PM
>To: Christian König ; Yu, Lang
>; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>; Tian Tao ; Powell, Darren
>
>Subject: Re: [PATCH] drm/amdgpu: fix sysfs_emit/sysfs_emit_at warnings
>
>
>
>On 9/8/2021 3:08 PM, Christian König wrote:
>> Am 08.09.21 um 11:29 schrieb Lazar, Lijo:
>>>
>>>
>>> On 9/8/2021 2:32 PM, Yu, Lang wrote:
>>>> [AMD Official Use Only]
>>>>
>>>>
>>>>
>>>>> -Original Message-
>>>>> From: Lazar, Lijo 
>>>>> Sent: Wednesday, September 8, 2021 4:55 PM
>>>>> To: Yu, Lang ; Christian König
>>>>> ; amd-gfx@lists.freedesktop.org
>>>>> Cc: Deucher, Alexander ; Huang, Ray
>>>>> ; Tian Tao 
>>>>> Subject: Re: [PATCH] drm/amdgpu: fix sysfs_emit/sysfs_emit_at
>>>>> warnings
>>>>>
>>>>>
>>>>>
>>>>> On 9/8/2021 1:14 PM, Yu, Lang wrote:
>>>>>> [AMD Official Use Only]
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -Original Message-
>>>>>>> From: Lazar, Lijo 
>>>>>>> Sent: Wednesday, September 8, 2021 3:36 PM
>>>>>>> To: Christian König ; Yu, Lang
>>>>>>> ; amd-gfx@lists.freedesktop.org
>>>>>>> Cc: Deucher, Alexander ; Huang, Ray
>>>>>>> ; Tian Tao 
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: fix sysfs_emit/sysfs_emit_at
>>>>>>> warnings
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 9/8/2021 12:07 PM, Christian König wrote:
>>>>>>>> Am 08.09.21 um 07:56 schrieb Lang Yu:
>>>>>>>>> sysfs_emit and sysfs_emit_at require a page boundary aligned
>>>>>>>>> buf address. Make them happy!
>>>>>>>>>
>>>>>>>>> Warning Log:
>>>>>>>>> [  492.545174] invalid sysfs_emit_at: buf:f19bdfde at:0
>>>>>>>>> [ 492.546416] WARNING: CPU: 7 PID: 1304 at fs/sysfs/file.c:765
>>>>>>>>> sysfs_emit_at+0x4a/0xa0
>>>>>>>>> [  492.654805] Call Trace:
>>>>>>>>> [  492.655353]  ? smu_cmn_get_metrics_table+0x40/0x50 [amdgpu]
>>>>>>>>> [ 492.656780]  vangogh_print_clk_levels+0x369/0x410 [amdgpu] [
>>>>>>>>> 492.658245]  vangogh_common_print_clk_levels+0x77/0x80 [amdgpu]
>>>>>>>>> [ 492.659733]  ? preempt_schedule_common+0x18/0x30 [
>>>>>>>>> 492.660713]
>>>>>>>>> smu_print_ppclk_levels+0x65/0x90 [amdgpu] [ 492.662107]
>>>>>>>>> amdgpu_get_pp_od_clk_voltage+0x13d/0x190 [amdgpu] [ 492.663620]
>>>>>>>>> dev_attr_show+0x1d/0x40
>>>>>>>>
>>>>>>>> Mhm, that at least partially doesn't look like the right
>>>>>>>> approach to me.
>>>>>>>>
>>>>>>>> Why do we have string printing and sysfs code in the hardware
>>

RE: [PATCH] drm/amdgpu: fix sysfs_emit/sysfs_emit_at warnings

2021-09-08 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Lazar, Lijo 
>Sent: Wednesday, September 8, 2021 4:55 PM
>To: Yu, Lang ; Christian König
>; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>; Tian Tao 
>Subject: Re: [PATCH] drm/amdgpu: fix sysfs_emit/sysfs_emit_at warnings
>
>
>
>On 9/8/2021 1:14 PM, Yu, Lang wrote:
>> [AMD Official Use Only]
>>
>>
>>
>>> -Original Message-
>>> From: Lazar, Lijo 
>>> Sent: Wednesday, September 8, 2021 3:36 PM
>>> To: Christian König ; Yu, Lang
>>> ; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander ; Huang, Ray
>>> ; Tian Tao 
>>> Subject: Re: [PATCH] drm/amdgpu: fix sysfs_emit/sysfs_emit_at
>>> warnings
>>>
>>>
>>>
>>> On 9/8/2021 12:07 PM, Christian König wrote:
>>>> Am 08.09.21 um 07:56 schrieb Lang Yu:
>>>>> sysfs_emit and sysfs_emit_at require a page boundary aligned buf
>>>>> address. Make them happy!
>>>>>
>>>>> Warning Log:
>>>>> [  492.545174] invalid sysfs_emit_at: buf:f19bdfde at:0 [
>>>>> 492.546416] WARNING: CPU: 7 PID: 1304 at fs/sysfs/file.c:765
>>>>> sysfs_emit_at+0x4a/0xa0
>>>>> [  492.654805] Call Trace:
>>>>> [  492.655353]  ? smu_cmn_get_metrics_table+0x40/0x50 [amdgpu] [
>>>>> 492.656780]  vangogh_print_clk_levels+0x369/0x410 [amdgpu] [
>>>>> 492.658245]  vangogh_common_print_clk_levels+0x77/0x80 [amdgpu] [
>>>>> 492.659733]  ? preempt_schedule_common+0x18/0x30 [  492.660713]
>>>>> smu_print_ppclk_levels+0x65/0x90 [amdgpu] [  492.662107]
>>>>> amdgpu_get_pp_od_clk_voltage+0x13d/0x190 [amdgpu] [  492.663620]
>>>>> dev_attr_show+0x1d/0x40
>>>>
>>>> Mhm, that at least partially doesn't look like the right approach to me.
>>>>
>>>> Why do we have string printing and sysfs code in the hardware
>>>> version specific backend in the first place?
>>>>
>>>
>>> This is a callback meant for printing ASIC specific information to
>>> sysfs node. The buffer passed in sysfs read is passed as it is to the 
>>> callback API.
>>>
>>>> That stuff needs to be implemented for each hardware generation and
>>>> is now cluttered with sysfs buffer offset calculations.
>>>>
>>>
>>> Looks like the warning happened because of this usage.
>>>
>>>  size = amdgpu_dpm_print_clock_levels(adev, OD_SCLK, buf);
>>>  size += amdgpu_dpm_print_clock_levels(adev, OD_MCLK,
>>> buf+size);
>>>  size += amdgpu_dpm_print_clock_levels(adev,
>>> OD_VDDC_CURVE, buf+size);
>>>  size += amdgpu_dpm_print_clock_levels(adev,
>>> OD_VDDGFX_OFFSET, buf+size);
>>>  size += amdgpu_dpm_print_clock_levels(adev,
>>> OD_RANGE,
>>> buf+size);
>>>  size += amdgpu_dpm_print_clock_levels(adev, OD_CCLK,
>>> buf+size);
>>>
>>>
>> [Yu, Lang]
>> Yes. So it is fine we just fix the caller amdgpu_get_pp_od_clk_voltage like
>following:
>>
>> static ssize_t amdgpu_get_pp_od_clk_voltage(struct device *dev,
>>  struct device_attribute *attr,
>>  char *buf)
>> {
>>  struct drm_device *ddev = dev_get_drvdata(dev);
>>  struct amdgpu_device *adev = drm_to_adev(ddev);
>>  ssize_t size, offset;
>>  int ret, i;
>>  char temp_buf[512];
>>  char clock_type[] = {OD_SCLK, OD_MCLK, OD_VDDC_CURVE,
>>   OD_VDDGFX_OFFSET, OD_RANGE, OD_CCLK};
>>
>>  if (amdgpu_in_reset(adev))
>>  return -EPERM;
>>  if (adev->in_suspend && !adev->in_runpm)
>>  return -EPERM;
>>
>>  ret = pm_runtime_get_sync(ddev->dev);
>>  if (ret < 0) {
>>  pm_runtime_put_autosuspend(ddev->dev);
>>  return ret;
>>  }
>>
>>  offset = 0;
>>
>>  if (adev->powerplay.pp_funcs->print_clock_levels) {
>>  for (i = 0; i < ARRAY_SIZE(clock_type); i++) {
>>  size = amdgpu_dpm_print_clock_levels(adev,
>clock_type[i], buf);
>>  if (offset + size > PAGE_SIZE)
>>  break;
>>  memcpy(temp_buf + offset, buf, size);
>> 

RE: [PATCH] drm/amdgpu: fix sysfs_emit/sysfs_emit_at warnings

2021-09-08 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Lazar, Lijo 
>Sent: Wednesday, September 8, 2021 3:36 PM
>To: Christian König ; Yu, Lang
>; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Huang, Ray
>; Tian Tao 
>Subject: Re: [PATCH] drm/amdgpu: fix sysfs_emit/sysfs_emit_at warnings
>
>
>
>On 9/8/2021 12:07 PM, Christian König wrote:
>> Am 08.09.21 um 07:56 schrieb Lang Yu:
>>> sysfs_emit and sysfs_emit_at require a page boundary aligned buf
>>> address. Make them happy!
>>>
>>> Warning Log:
>>> [  492.545174] invalid sysfs_emit_at: buf:f19bdfde at:0 [
>>> 492.546416] WARNING: CPU: 7 PID: 1304 at fs/sysfs/file.c:765
>>> sysfs_emit_at+0x4a/0xa0
>>> [  492.654805] Call Trace:
>>> [  492.655353]  ? smu_cmn_get_metrics_table+0x40/0x50 [amdgpu] [
>>> 492.656780]  vangogh_print_clk_levels+0x369/0x410 [amdgpu] [
>>> 492.658245]  vangogh_common_print_clk_levels+0x77/0x80 [amdgpu] [
>>> 492.659733]  ? preempt_schedule_common+0x18/0x30 [  492.660713]
>>> smu_print_ppclk_levels+0x65/0x90 [amdgpu] [  492.662107]
>>> amdgpu_get_pp_od_clk_voltage+0x13d/0x190 [amdgpu] [  492.663620]
>>> dev_attr_show+0x1d/0x40
>>
>> Mhm, that at least partially doesn't look like the right approach to me.
>>
>> Why do we have string printing and sysfs code in the hardware version
>> specific backend in the first place?
>>
>
>This is a callback meant for printing ASIC specific information to sysfs node. 
>The
>buffer passed in sysfs read is passed as it is to the callback API.
>
>> That stuff needs to be implemented for each hardware generation and is
>> now cluttered with sysfs buffer offset calculations.
>>
>
>Looks like the warning happened because of this usage.
>
> size = amdgpu_dpm_print_clock_levels(adev, OD_SCLK, buf);
> size += amdgpu_dpm_print_clock_levels(adev, OD_MCLK,
>buf+size);
> size += amdgpu_dpm_print_clock_levels(adev,
>OD_VDDC_CURVE, buf+size);
> size += amdgpu_dpm_print_clock_levels(adev,
>OD_VDDGFX_OFFSET, buf+size);
> size += amdgpu_dpm_print_clock_levels(adev, OD_RANGE,
>buf+size);
> size += amdgpu_dpm_print_clock_levels(adev, OD_CCLK,
>buf+size);
>
>
[Yu, Lang] 
Yes. So it is fine we just fix the caller amdgpu_get_pp_od_clk_voltage like 
following:

static ssize_t amdgpu_get_pp_od_clk_voltage(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(ddev);
ssize_t size, offset;
int ret, i;
char temp_buf[512];
char clock_type[] = {OD_SCLK, OD_MCLK, OD_VDDC_CURVE, 
 OD_VDDGFX_OFFSET, OD_RANGE, OD_CCLK};

if (amdgpu_in_reset(adev))
return -EPERM;
if (adev->in_suspend && !adev->in_runpm)
return -EPERM;

ret = pm_runtime_get_sync(ddev->dev);
if (ret < 0) {
pm_runtime_put_autosuspend(ddev->dev);
return ret;
}

offset = 0;

if (adev->powerplay.pp_funcs->print_clock_levels) { 
for (i = 0; i < ARRAY_SIZE(clock_type); i++) {
size = amdgpu_dpm_print_clock_levels(adev, 
clock_type[i], buf);
if (offset + size > PAGE_SIZE)
break;
memcpy(temp_buf + offset, buf, size);
offset += size;
}
memcpy(buf, temp_buf, offset);
size = offset;
} else {
size = sysfs_emit(buf, "\n");
}
pm_runtime_mark_last_busy(ddev->dev);
pm_runtime_put_autosuspend(ddev->dev);

return size;
}

Regards,
Lang

>
>> Regards,
>> Christian.
>>
>>>
>>> Signed-off-by: Lang Yu 
>>> ---
>>>   drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c |  9 +++--
>>>   drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c   |  5 -
>>>   .../drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   |  5 -
>>>   drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c  | 15
>>> +--
>>>   drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c   |  3 +++
>>>   .../gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 13
>>> +
>>>   .../gpu/drm/amd/pm/swsmu/smu13/yellow_carp_ppt.c  |  7 +--
>>>   7 files changed, 41 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu1

RE: [PATCH v2 2/2] drm/ttm: check with temporary GTT memory in BO validation

2021-05-31 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Koenig, Christian 
>Sent: Monday, May 31, 2021 7:55 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org; dri-
>de...@lists.freedesktop.org
>Cc: Thomas Hellströ ; Olsak, Marek
>; Huang, Ray ; Deucher,
>Alexander 
>Subject: Re: [PATCH v2 2/2] drm/ttm: check with temporary GTT memory in BO
>validation
>
>Am 31.05.21 um 13:30 schrieb Lang Yu:
>> If a BO's backing store is temporary GTT memory, we should move it in
>> BO validation.
>>
>> v2: move the check outside of for loop
>>
>> Signed-off-by: Lang Yu 
>
>In general those patches now have my rb, but let me add some more
>documentation to them to better explain why we do this.
>
[Yu, Lang] 
Thanks for your review and advice. Happy to see that!

Regards,
Lang

>Thanks,
>Christian.
>
>> ---
>>   drivers/gpu/drm/ttm/ttm_bo.c | 3 +++
>>   1 file changed, 3 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c
>> b/drivers/gpu/drm/ttm/ttm_bo.c index c32a37d0a460..1802fc77cfcb 100644
>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>> @@ -938,6 +938,9 @@ static bool ttm_bo_places_compat(const struct
>ttm_place *places,
>>   {
>>  unsigned i;
>>
>> +if (mem->placement & TTM_PL_FLAG_TEMPORARY)
>> +return false;
>> +
>>  for (i = 0; i < num_placement; i++) {
>>  const struct ttm_place *heap = [i];
>>
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH v2 3/3] drm/amdgpu: allow temporary GTT allocation under memory pressure

2021-05-31 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Koenig, Christian 
>Sent: Monday, May 31, 2021 8:49 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org; dri-
>de...@lists.freedesktop.org
>Cc: Thomas Hellströ ; Olsak, Marek
>; Huang, Ray ; Deucher,
>Alexander 
>Subject: Re: [PATCH v2 3/3] drm/amdgpu: allow temporary GTT allocation under
>memory pressure
>
>On which branch are you working? I have problems applying that one to amd-
>staging-drm-next.
>
[Yu, Lang] 
amd-staging-drm-next.

Regards,
Lang

>Christian.
>
>Am 31.05.21 um 10:22 schrieb Lang Yu:
>> Currently, we have a limited GTT memory size and need a bounce buffer
>> when doing buffer migration between VRAM and SYSTEM domain.
>>
>> The problem is under GTT memory pressure we can't do buffer migration
>> between VRAM and SYSTEM domain. But in some cases we really need that.
>> Especially when validating a VRAM backing store BO which resides in
>> SYSTEM domain.
>>
>> To solve the problem, we allow temporary GTT allocation under memory
>> pressure and do the following:
>>
>> 1. Change mgr->available into mgr->used (invert the value).
>> 2. Always account all GTT BOs to the used space.
>> 3. Only when it is not a temporary allocation bail out.
>>
>> v2: still account temporary GTT allocations
>>
>> Signed-off-by: Lang Yu 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 27 ++---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  4 ++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  2 +-
>>   3 files changed, 17 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>> index 8860545344c7..393f55f412b7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>> @@ -111,14 +111,11 @@ static int amdgpu_gtt_mgr_new(struct
>ttm_resource_manager *man,
>>  struct amdgpu_gtt_node *node;
>>  int r;
>>
>> -spin_lock(>lock);
>> -if ((>mem == mem || tbo->mem.mem_type != TTM_PL_TT) &&
>> -atomic64_read(>available) < mem->num_pages) {
>> -spin_unlock(>lock);
>> +if ((atomic64_add_return(mem->num_pages, >used) > man->size)
>&&
>> +!(mem->placement & TTM_PL_FLAG_TEMPORARY)) {
>> +atomic64_sub(mem->num_pages, >used);
>>  return -ENOSPC;
>>  }
>> -atomic64_sub(mem->num_pages, >available);
>> -spin_unlock(>lock);
>>
>>  if (!place->lpfn) {
>>  mem->mm_node = NULL;
>> @@ -152,7 +149,7 @@ static int amdgpu_gtt_mgr_new(struct
>ttm_resource_manager *man,
>>  kfree(node);
>>
>>   err_out:
>> -atomic64_add(mem->num_pages, >available);
>> +atomic64_sub(mem->num_pages, >used);
>>
>>  return r;
>>   }
>> @@ -178,7 +175,7 @@ static void amdgpu_gtt_mgr_del(struct
>ttm_resource_manager *man,
>>  kfree(node);
>>  }
>>
>> -atomic64_add(mem->num_pages, >available);
>> +atomic64_sub(mem->num_pages, >used);
>>   }
>>
>>   /**
>> @@ -191,9 +188,8 @@ static void amdgpu_gtt_mgr_del(struct
>ttm_resource_manager *man,
>>   uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
>>   {
>>  struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
>> -s64 result = man->size - atomic64_read(>available);
>>
>> -return (result > 0 ? result : 0) * PAGE_SIZE;
>> +return atomic64_read(>used) * PAGE_SIZE;
>>   }
>>
>>   /**
>> @@ -234,14 +230,17 @@ static void amdgpu_gtt_mgr_debug(struct
>ttm_resource_manager *man,
>>   struct drm_printer *printer)
>>   {
>>  struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
>> +uint64_t used, used_pages;
>>
>>  spin_lock(>lock);
>>  drm_mm_print(>mm, printer);
>>  spin_unlock(>lock);
>>
>> -drm_printf(printer, "man size:%llu pages, gtt available:%lld pages,
>usage:%lluMB\n",
>> -   man->size, (u64)atomic64_read(>available),
>> -   amdgpu_gtt_mgr_usage(man) >> 20);
>> +used = amdgpu_gtt_mgr_usage(man);
>> +used_pages = used/PAGE_SIZE;
>> +
>> +drm_printf(printer, "man size:%llu pages,  gtt available:%lld pages,
>usage:%lluMB\n",
>> +man->size, us

RE: [PATCH 1/2] drm/ttm: cleanup and add TTM_PL_FLAG_TEMPORARY

2021-05-31 Thread Yu, Lang
[Public]

>Hi,

>On 5/27/21 3:30 AM, Lang Yu wrote:
>> Make TTM_PL_FLAG_* start from zero and add
>> TTM_PL_FLAG_TEMPORARY flag for temporary
>> GTT allocation use.

>GTT is a driver private acronym, right? And it doesn't look like 
>TTM_PL_FLAG_TEMPORARY will be used in core TTM, so should we instead set 
>aside a mask in the PL flag for driver-private use?

Hi Thomas,  

Thanks for your comments and advice. GTT means Graphics Translation Table here, 
which seems
a general acronym. TTM_PL_FLAG_TEMPORARY may also be used by other drivers.
I have made other patches for this. Please help review. 

Regards,
Lang

>Thomas

>-----Original Message-
>From: Yu, Lang 
>Sent: Thursday, May 27, 2021 9:31 AM
>To: amd-gfx@lists.freedesktop.org; dri-de...@lists.freedesktop.org
>Cc: Koenig, Christian ; Huang, Ray
>; Deucher, Alexander ;
>Yu, Lang 
>Subject: [PATCH 1/2] drm/ttm: cleanup and add TTM_PL_FLAG_TEMPORARY
>
>Make TTM_PL_FLAG_* start from zero and add TTM_PL_FLAG_TEMPORARY flag
>for temporary GTT allocation use.
>
>Signed-off-by: Lang Yu 
>---
> include/drm/ttm/ttm_placement.h | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
>diff --git a/include/drm/ttm/ttm_placement.h
>b/include/drm/ttm/ttm_placement.h index aa6ba4d0cf78..9f5cfc7c2d5a 100644
>--- a/include/drm/ttm/ttm_placement.h
>+++ b/include/drm/ttm/ttm_placement.h
>@@ -47,8 +47,9 @@
>  * top of the memory area, instead of the bottom.
>  */
>
>-#define TTM_PL_FLAG_CONTIGUOUS  (1 << 19)
>-#define TTM_PL_FLAG_TOPDOWN (1 << 22)
>+#define TTM_PL_FLAG_CONTIGUOUS  (1 << 0)
>+#define TTM_PL_FLAG_TOPDOWN (1 << 1)
>+#define TTM_PL_FLAG_TEMPORARY   (1 << 2)
>
> /**
>  * struct ttm_place
>--
>2.25.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 2/2] drm/amdgpu: stop bookkeeping of temporary GTT allocation

2021-05-28 Thread Yu, Lang
[AMD Official Use Only]


Inline.

>-Original Message-
>From: Koenig, Christian 
>Sent: Thursday, May 27, 2021 7:51 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org; dri-
>de...@lists.freedesktop.org
>Cc: Huang, Ray ; Deucher, Alexander
>; Olsak, Marek 
>Subject: Re: [PATCH 2/2] drm/amdgpu: stop bookkeeping of temporary GTT
>allocation
>
>Putting Marek on CC.
>
>Am 27.05.21 um 03:30 schrieb Lang Yu:
>> To improve buffer migration performace, stop bookkeeping of temporary
>> GTT allocation, including allocation for BO evicted from VRAM and
>> bounce buffer.
>>
>> Signed-off-by: Lang Yu 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 16 ++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  4 +++-
>>   2 files changed, 13 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>> index 8860545344c7..32fedd495c7f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>> @@ -111,14 +111,15 @@ static int amdgpu_gtt_mgr_new(struct
>ttm_resource_manager *man,
>>  struct amdgpu_gtt_node *node;
>>  int r;
>>
>> -spin_lock(>lock);
>> -if ((>mem == mem || tbo->mem.mem_type != TTM_PL_TT) &&
>> -atomic64_read(>available) < mem->num_pages) {
>> +if (!(mem->placement & TTM_PL_FLAG_TEMPORARY)) {
>> +spin_lock(>lock);
>> +if (atomic64_read(>available) < mem->num_pages) {
>> +spin_unlock(>lock);
>> +return -ENOSPC;
>> +}
>> +atomic64_sub(mem->num_pages, >available);
>
>After sleeping a night over that I think we need to talk about this part here 
>once
>more.
>
>While temporary GTT allocations can temporary exceed the GTT limitation we
>still need to account them in the case the eviction is interrupted for some 
>reason.
>
>In other words what can happen is that we want to move
>VRAM->GTT->SYSTEM, but GTT->SYSTEM never happens because it is
>interrupted in the wait (that's unfortunately rather likely).
>
>To solve this I think we should do the following:
>1. Change mgr->available into mgr->used (e.g. invert the value).
>2. Always account all GTT BOs to the used space.
>3. Only when it is not a temporary allocation bail out.
>
>This way temporary allocations are accounted for, but we still allow
>memory evictions to happen under pressure.
>
>While at it you could also drop taking the spinlock to check the atomic,
>that is pretty much unnecessary.
>
>Regards,
>Christian.
>
[Yu, Lang] Hi Christian,

Yes, it can actually happen that the BO was evicted from VRAM to GTT domain,
but was not moved forward to SYSTEM domain. It resides in GTT domain 
waiting for next time validation or eviction or destruction.

It is reasonable that we count all GTT allocation. 
1, I find if the temporary GTT BO was not counted but used for command 
submission, 
then we can use more GTT memory than GTT limit for command submission. Is that 
your concern? 
2, Or if we don't count temporary GTT allocation, that will mess up gtt manager.

In other words, if we don't count it when it resides in GTT domain, what is the 
consequence? 
Would like to know your concern. Actually it is counted by ttm_pages_allocated. 

If we use "used" instead of "available" in gtt manager, the used size may 
exceed man size.
We should also deal with gtt mgr debug interface.

Rework the logic like this with your idea:

if ((atomic64_add_return(mem->num_pages, >used) > man->size) &&
!(mem->placement & TTM_PL_FLAG_TEMPORARY)) {
atomic64_sub(mem->num_pages, >used);
return -ENOSPC;
}

Regards,
Lang

>>  spin_unlock(>lock);
>> -return -ENOSPC;
>>  }
>> -atomic64_sub(mem->num_pages, >available);
>> -spin_unlock(>lock);
>>
>>  if (!place->lpfn) {
>>  mem->mm_node = NULL;
>> @@ -178,6 +179,9 @@ static void amdgpu_gtt_mgr_del(struct
>ttm_resource_manager *man,
>>  kfree(node);
>>  }
>>
>> +if (mem->placement & TTM_PL_FLAG_TEMPORARY)
>> +return;
>> +
>>  atomic64_add(mem->num_pages, >available);
>>   }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> index c0aef327292a..129d39392859 100644
>>

RE: [PATCH libdrm] Revert "tests/amdgpu: fix bo eviction test issue"

2021-05-08 Thread Yu, Lang
[AMD Official Use Only - Internal Distribution Only]

Hi Alex,

I have opened a MR: 
https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/165.
Thanks.

Regards,
Lang

From: Deucher, Alexander 
Sent: Friday, May 7, 2021 9:28 PM
To: Yu, Lang ; Chen, Guchun ; 
amd-gfx@lists.freedesktop.org; Huang, Ray ; Song, Asher 

Subject: Re: [PATCH libdrm] Revert "tests/amdgpu: fix bo eviction test issue"


[AMD Official Use Only - Internal Distribution Only]

For libdrm tests, please open a gitlab merge request:
https://gitlab.freedesktop.org/mesa/drm/-/merge_requests

Alex


From: amd-gfx 
mailto:amd-gfx-boun...@lists.freedesktop.org>>
 on behalf of Yu, Lang mailto:lang...@amd.com>>
Sent: Friday, May 7, 2021 3:10 AM
To: Chen, Guchun mailto:guchun.c...@amd.com>>; 
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> 
mailto:amd-gfx@lists.freedesktop.org>>; Huang, 
Ray mailto:ray.hu...@amd.com>>; Song, Asher 
mailto:asher.s...@amd.com>>
Subject: RE: [PATCH libdrm] Revert "tests/amdgpu: fix bo eviction test issue"

[AMD Official Use Only - Internal Distribution Only]


Reviewed-by:  Lang Yu mailto:lang...@amd.com>>

Regards,
Lang

-Original Message-
From: Chen, Guchun mailto:guchun.c...@amd.com>>
Sent: Thursday, May 6, 2021 5:55 PM
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>; Yu, 
Lang mailto:lang...@amd.com>>; Huang, Ray 
mailto:ray.hu...@amd.com>>; Song, Asher 
mailto:asher.s...@amd.com>>
Cc: Chen, Guchun mailto:guchun.c...@amd.com>>
Subject: [PATCH libdrm] Revert "tests/amdgpu: fix bo eviction test issue"

This reverts commit a5a400c9581c3b91598623603067556b18084c5d.

bo evict test was disabled by default per below commit. So still keep it as 
disabled.

1f6a85cc test/amdgpu: disable bo eviction test by default

Signed-off-by: Guchun Chen mailto:guchun.c...@amd.com>>
---
 tests/amdgpu/amdgpu_test.c |  3 +++
 tests/amdgpu/basic_tests.c | 13 -
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/tests/amdgpu/amdgpu_test.c b/tests/amdgpu/amdgpu_test.c index 
60f3a508..77bbfbcc 100644
--- a/tests/amdgpu/amdgpu_test.c
+++ b/tests/amdgpu/amdgpu_test.c
@@ -496,6 +496,9 @@ static void amdgpu_disable_suites()
 "gfx ring slow bad draw test (set 
amdgpu.lockup_timeout=50)", CU_FALSE))
 fprintf(stderr, "test deactivation failed - %s\n", 
CU_get_error_msg());

+   if (amdgpu_set_test_active(BASIC_TESTS_STR, "bo eviction Test", 
CU_FALSE))
+   fprintf(stderr, "test deactivation failed - %s\n",
+CU_get_error_msg());
+
 /* This test was ran on GFX8 and GFX9 only */
 if (family_id < AMDGPU_FAMILY_VI || family_id > AMDGPU_FAMILY_RV)
 if (amdgpu_set_test_active(BASIC_TESTS_STR, "Sync dependency 
Test", CU_FALSE)) diff --git a/tests/amdgpu/basic_tests.c 
b/tests/amdgpu/basic_tests.c index 8e7c4916..3a4214f5 100644
--- a/tests/amdgpu/basic_tests.c
+++ b/tests/amdgpu/basic_tests.c
@@ -928,15 +928,6 @@ static void amdgpu_bo_eviction_test(void)
0, _info);
 CU_ASSERT_EQUAL(r, 0);

-   r = amdgpu_query_heap_info(device_handle, AMDGPU_GEM_DOMAIN_GTT,
-  0, _info);
-   CU_ASSERT_EQUAL(r, 0);
-
-   if (vram_info.max_allocation > gtt_info.heap_size/3) {
-   vram_info.max_allocation = gtt_info.heap_size/3;
-   gtt_info.max_allocation = vram_info.max_allocation;
-   }
-
 r = amdgpu_bo_alloc_wrap(device_handle, vram_info.max_allocation, 4096,
  AMDGPU_GEM_DOMAIN_VRAM, 0, _max[0]);
 CU_ASSERT_EQUAL(r, 0);
@@ -944,6 +935,10 @@ static void amdgpu_bo_eviction_test(void)
  AMDGPU_GEM_DOMAIN_VRAM, 0, _max[1]);
 CU_ASSERT_EQUAL(r, 0);

+   r = amdgpu_query_heap_info(device_handle, AMDGPU_GEM_DOMAIN_GTT,
+  0, _info);
+   CU_ASSERT_EQUAL(r, 0);
+
 r = amdgpu_bo_alloc_wrap(device_handle, gtt_info.max_allocation, 4096,
  AMDGPU_GEM_DOMAIN_GTT, 0, _max[0]);
 CU_ASSERT_EQUAL(r, 0);
--
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=04%7C01%7Calexander.deucher%40amd.com%7Cb3ce363db6e94aa1396308d9112727fc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637559682163619573%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000sdata=d9uch8frkIAiVkdtaOHillKHngoVp8brn%2FxJuKatmYU%3Dreserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH libdrm] Revert "tests/amdgpu: fix bo eviction test issue"

2021-05-07 Thread Yu, Lang
[AMD Official Use Only - Internal Distribution Only]


Reviewed-by:  Lang Yu 

Regards,
Lang

-Original Message-
From: Chen, Guchun  
Sent: Thursday, May 6, 2021 5:55 PM
To: amd-gfx@lists.freedesktop.org; Yu, Lang ; Huang, Ray 
; Song, Asher 
Cc: Chen, Guchun 
Subject: [PATCH libdrm] Revert "tests/amdgpu: fix bo eviction test issue"

This reverts commit a5a400c9581c3b91598623603067556b18084c5d.

bo evict test was disabled by default per below commit. So still keep it as 
disabled.

1f6a85cc test/amdgpu: disable bo eviction test by default

Signed-off-by: Guchun Chen 
---
 tests/amdgpu/amdgpu_test.c |  3 +++
 tests/amdgpu/basic_tests.c | 13 -
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/tests/amdgpu/amdgpu_test.c b/tests/amdgpu/amdgpu_test.c index 
60f3a508..77bbfbcc 100644
--- a/tests/amdgpu/amdgpu_test.c
+++ b/tests/amdgpu/amdgpu_test.c
@@ -496,6 +496,9 @@ static void amdgpu_disable_suites()
"gfx ring slow bad draw test (set 
amdgpu.lockup_timeout=50)", CU_FALSE))
fprintf(stderr, "test deactivation failed - %s\n", 
CU_get_error_msg());
 
+   if (amdgpu_set_test_active(BASIC_TESTS_STR, "bo eviction Test", 
CU_FALSE))
+   fprintf(stderr, "test deactivation failed - %s\n", 
+CU_get_error_msg());
+
/* This test was ran on GFX8 and GFX9 only */
if (family_id < AMDGPU_FAMILY_VI || family_id > AMDGPU_FAMILY_RV)
if (amdgpu_set_test_active(BASIC_TESTS_STR, "Sync dependency 
Test", CU_FALSE)) diff --git a/tests/amdgpu/basic_tests.c 
b/tests/amdgpu/basic_tests.c index 8e7c4916..3a4214f5 100644
--- a/tests/amdgpu/basic_tests.c
+++ b/tests/amdgpu/basic_tests.c
@@ -928,15 +928,6 @@ static void amdgpu_bo_eviction_test(void)
   0, _info);
CU_ASSERT_EQUAL(r, 0);
 
-   r = amdgpu_query_heap_info(device_handle, AMDGPU_GEM_DOMAIN_GTT,
-  0, _info);
-   CU_ASSERT_EQUAL(r, 0);
-
-   if (vram_info.max_allocation > gtt_info.heap_size/3) {
-   vram_info.max_allocation = gtt_info.heap_size/3;
-   gtt_info.max_allocation = vram_info.max_allocation;
-   }
-
r = amdgpu_bo_alloc_wrap(device_handle, vram_info.max_allocation, 4096,
 AMDGPU_GEM_DOMAIN_VRAM, 0, _max[0]);
CU_ASSERT_EQUAL(r, 0);
@@ -944,6 +935,10 @@ static void amdgpu_bo_eviction_test(void)
 AMDGPU_GEM_DOMAIN_VRAM, 0, _max[1]);
CU_ASSERT_EQUAL(r, 0);
 
+   r = amdgpu_query_heap_info(device_handle, AMDGPU_GEM_DOMAIN_GTT,
+  0, _info);
+   CU_ASSERT_EQUAL(r, 0);
+
r = amdgpu_bo_alloc_wrap(device_handle, gtt_info.max_allocation, 4096,
 AMDGPU_GEM_DOMAIN_GTT, 0, _max[0]);
CU_ASSERT_EQUAL(r, 0);
--
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amd/dispaly: fix deadlock issue in amdgpu reset

2021-03-22 Thread Yu, Lang
[AMD Official Use Only - Internal Distribution Only]



-Original Message-
From: Grodzovsky, Andrey  
Sent: Monday, March 22, 2021 11:01 PM
To: Yu, Lang ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Huang, Ray 

Subject: Re: [PATCH] drm/amd/dispaly: fix deadlock issue in amdgpu reset



On 2021-03-22 4:11 a.m., Lang Yu wrote:
> In amdgpu reset, while dm.dc_lock is held by dm_suspend, 
> handle_hpd_rx_irq tries to acquire it. Deadlock occurred!
> 
> Deadlock log:
> 
> [  104.528304] amdgpu :03:00.0: amdgpu: GPU reset begin!
> 
> [  104.640084] ==
> [  104.640092] WARNING: possible circular locking dependency detected
> [  104.640099] 5.11.0-custom #1 Tainted: GW   E
> [  104.640107] --
> [  104.640114] cat/1158 is trying to acquire lock:
> [  104.640120] 88810a09ce00 
> ((work_completion)(>work)){+.+.}-{0:0}, at: __flush_work+0x2e3/0x450 [  
> 104.640144]
> but task is already holding lock:
> [  104.640151] 88810a09cc70 (>dm.dc_lock){+.+.}-{3:3}, at: 
> dm_suspend+0xb2/0x1d0 [amdgpu] [  104.640581]
> which lock already depends on the new lock.
> 
> [  104.640590]
> the existing dependency chain (in reverse order) is:
> [  104.640598]
> -> #2 (>dm.dc_lock){+.+.}-{3:3}:
> [  104.640611]lock_acquire+0xca/0x390
> [  104.640623]__mutex_lock+0x9b/0x930
> [  104.640633]mutex_lock_nested+0x1b/0x20
> [  104.640640]handle_hpd_rx_irq+0x9b/0x1c0 [amdgpu]
> [  104.640959]dm_irq_work_func+0x4e/0x60 [amdgpu]
> [  104.641264]process_one_work+0x2a7/0x5b0
> [  104.641275]worker_thread+0x4a/0x3d0
> [  104.641283]kthread+0x125/0x160
> [  104.641290]ret_from_fork+0x22/0x30
> [  104.641300]
> -> #1 (>hpd_lock){+.+.}-{3:3}:
> [  104.641312]lock_acquire+0xca/0x390
> [  104.641321]__mutex_lock+0x9b/0x930
> [  104.641328]mutex_lock_nested+0x1b/0x20
> [  104.641336]handle_hpd_rx_irq+0x67/0x1c0 [amdgpu]
> [  104.641635]dm_irq_work_func+0x4e/0x60 [amdgpu]
> [  104.641931]process_one_work+0x2a7/0x5b0
> [  104.641940]worker_thread+0x4a/0x3d0
> [  104.641948]kthread+0x125/0x160
> [  104.641954]ret_from_fork+0x22/0x30
> [  104.641963]
> -> #0 ((work_completion)(>work)){+.+.}-{0:0}:
> [  104.641975]check_prev_add+0x94/0xbf0
> [  104.641983]__lock_acquire+0x130d/0x1ce0
> [  104.641992]lock_acquire+0xca/0x390
> [  104.642000]__flush_work+0x303/0x450
> [  104.642008]flush_work+0x10/0x20
> [  104.642016]amdgpu_dm_irq_suspend+0x93/0x100 [amdgpu]
> [  104.642312]dm_suspend+0x181/0x1d0 [amdgpu]
> [  104.642605]amdgpu_device_ip_suspend_phase1+0x8a/0x100 [amdgpu]
> [  104.642835]amdgpu_device_ip_suspend+0x21/0x70 [amdgpu]
> [  104.643066]amdgpu_device_pre_asic_reset+0x1bd/0x1d2 [amdgpu]
> [  104.643403]amdgpu_device_gpu_recover.cold+0x5df/0xa9d [amdgpu]
> [  104.643715]gpu_recover_get+0x2e/0x60 [amdgpu]
> [  104.643951]simple_attr_read+0x6d/0x110
> [  104.643960]debugfs_attr_read+0x49/0x70
> [  104.643970]full_proxy_read+0x5f/0x90
> [  104.643979]vfs_read+0xa3/0x190
> [  104.643986]ksys_read+0x70/0xf0
> [  104.643992]__x64_sys_read+0x1a/0x20
> [  104.643999]do_syscall_64+0x38/0x90
> [  104.644007]entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [  104.644017]
> other info that might help us debug this:
> 
> [  104.644026] Chain exists of:
>   (work_completion)(>work) --> 
> >hpd_lock --> >dm.dc_lock
> 
> [  104.644043]  Possible unsafe locking scenario:
> 
> [  104.644049]CPU0CPU1
> [  104.644055]
> [  104.644060]   lock(>dm.dc_lock);
> [  104.644066]lock(>hpd_lock);
> [  104.644075]lock(>dm.dc_lock);
> [  104.644083]   lock((work_completion)(>work));
> [  104.644090]
>  *** DEADLOCK ***
> 
> [  104.644096] 3 locks held by cat/1158:
> [  104.644103]  #0: 88810d0e4eb8 (>mutex){+.+.}-{3:3}, at: 
> simple_attr_read+0x4e/0x110 [  104.644119]  #1: 88810a0a1600 
> (>reset_sem){}-{3:3}, at: amdgpu_device_lock_adev+0x42/0x94 
> [amdgpu] [  104.644489]  #2: 88810a09cc70 
> (>dm.dc_lock){+.+.}-{3:3}, at: dm_suspend+0xb2/0x1d0 [amdgpu]
> 
> Signed-off-by: Lang Yu 
> ---
>   drivers/gpu/drm/amd/display/a

RE: [PATCH] drm/amd/amdkfd: adjust dummy functions' placement

2021-01-27 Thread Yu, Lang
[AMD Public Use]

Thanks for Felix's review, I will update soon.

Regards,
Yu Lang

-Original Message-
From: Kuehling, Felix  
Sent: Thursday, January 28, 2021 8:22 AM
To: Yu, Lang ; amd-gfx@lists.freedesktop.org
Cc: Huang, Ray ; Deucher, Alexander 

Subject: Re: [PATCH] drm/amd/amdkfd: adjust dummy functions ' placement

Am 2021-01-27 um 5:14 a.m. schrieb Lang Yu:
> Move all the dummy functions in amdgpu_amdkfd.c to amdgpu_amdkfd.h as 
> inline functions.
>
> Signed-off-by: Lang Yu 
> Suggested-by: Felix Kuehling 

Just a some nit-picking inline, other than that the patch is

Reviewed-by: Felix Kuehling 


> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  87   
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 151 ++---
>  2 files changed, 130 insertions(+), 108 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index db96d69eb45e..c5343a5eecbe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -47,12 +47,8 @@ int amdgpu_amdkfd_init(void)
>   amdgpu_amdkfd_total_mem_size = si.totalram - si.totalhigh;
>   amdgpu_amdkfd_total_mem_size *= si.mem_unit;
>  
> -#ifdef CONFIG_HSA_AMD
>   ret = kgd2kfd_init();
>   amdgpu_amdkfd_gpuvm_init_mem_limits();
> -#else
> - ret = -ENOENT;
> -#endif
>   kfd_initialized = !ret;
>  
>   return ret;
> @@ -696,86 +692,3 @@ bool amdgpu_amdkfd_have_atomics_support(struct 
> kgd_dev *kgd)
>  
>   return adev->have_atomics_support;
>  }
> -
> -#ifndef CONFIG_HSA_AMD
> -bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm) 
> -{
> - return false;
> -}
> -
> -void amdgpu_amdkfd_unreserve_memory_limit(struct amdgpu_bo *bo) -{ -}
> -
> -int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo) -{
> - return 0;
> -}
> -
> -void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
> - struct amdgpu_vm *vm)
> -{
> -}
> -
> -struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence 
> *f) -{
> - return NULL;
> -}
> -
> -int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct 
> *mm) -{
> - return 0;
> -}
> -
> -struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev,
> -   unsigned int asic_type, bool vf)
> -{
> - return NULL;
> -}
> -
> -bool kgd2kfd_device_init(struct kfd_dev *kfd,
> -  struct drm_device *ddev,
> -  const struct kgd2kfd_shared_resources *gpu_resources)
> -{
> - return false;
> -}
> -
> -void kgd2kfd_device_exit(struct kfd_dev *kfd) -{ -}
> -
> -void kgd2kfd_exit(void)
> -{
> -}
> -
> -void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) -{ -}
> -
> -int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) -{
> - return 0;
> -}
> -
> -int kgd2kfd_pre_reset(struct kfd_dev *kfd) -{
> - return 0;
> -}
> -
> -int kgd2kfd_post_reset(struct kfd_dev *kfd) -{
> - return 0;
> -}
> -
> -void kgd2kfd_interrupt(struct kfd_dev *kfd, const void 
> *ih_ring_entry) -{ -}
> -
> -void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd) -{ -}
> -
> -void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t 
> throttle_bitmask) -{ -} -#endif diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index bc9f0e42e0a2..c3a51c0d54e9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -94,11 +94,6 @@ enum kgd_engine_type {
>   KGD_ENGINE_MAX
>  };
>  
> -struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
> -struct mm_struct *mm);
> -bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct 
> *mm); -struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct 
> dma_fence *f); -int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct 
> amdgpu_bo *bo);
>  
>  struct amdkfd_process_info {
>   /* List head of all VMs that belong to a KFD process */ @@ -132,8 
> +127,6 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,  
> void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);  void 
> amdgpu_amdkfd_device_init(struct amdgpu_device *adev);  void 
> amdgpu_amdkfd_device_fini(struct amdgpu_device *adev);
> -
> -int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct 
> *mm);  int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type 
> engine,
>   

RE: [PATCH] drm/amd/display: 64-bit division on 32-bit arch issue

2021-01-22 Thread Yu, Lang
[AMD Public Use]


The header  has been included by dm_services.h. The following 
is the sequence,

dm_services.h -> dm_services_types.h ->  os_types.h  ->  drm/drm_print.h -> 
linux/device.h -> linux/pm.h -> linux/timer.h ->  linux/time.h -> 
linux/jiffies.h -> linux/math64.h


Regards,
Lang 

-Original Message-
From: Chen, Guchun  
Sent: Friday, January 22, 2021 5:32 PM
To: Huang, Ray ; Yu, Lang 
Cc: Deucher, Alexander ; Lakha, Bhawanpreet 
; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH] drm/amd/display: 64-bit division on 32-bit arch issue

[AMD Public Use]

Maybe it's good to modify subject to " drm/amd/display: fix 64-bit division 
issue on 32-bit OS"

And if header  should be included?

Regards,
Guchun

-Original Message-
From: amd-gfx  On Behalf Of Huang Rui
Sent: Friday, January 22, 2021 5:04 PM
To: Yu, Lang 
Cc: Deucher, Alexander ; Lakha, Bhawanpreet 
; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amd/display: 64-bit division on 32-bit arch issue

On Fri, Jan 22, 2021 at 05:00:59PM +0800, Yu, Lang wrote:
> Replace "/" with div_u64 for 32-bit arch. On 32-bit arch, the use of 
> "/" for 64-bit division will cause build error, i.e.
> "__udivdi3/__divdi3 undefined!".
> 
> Fixes: 27755cdf83f1
> drm/amd/display: Update dcn30_apply_idle_power_optimizations() code
> 
> Signed-off-by: Lang Yu 

Acked-by: Huang Rui 

> ---
>  drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c | 12 ++--
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c
> b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c
> index dff83c6a142a..9620fb8a27dc 100644
> --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c
> +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c
> @@ -772,8 +772,8 @@ bool dcn30_apply_idle_power_optimizations(struct dc *dc, 
> bool enable)
>   cursor_cache_enable ? 
> _attr : NULL)) {
>   unsigned int v_total = 
> stream->adjust.v_total_max ?
>   stream->adjust.v_total_max : 
> stream->timing.v_total;
> - unsigned int refresh_hz = (unsigned long long) 
> stream->timing.pix_clk_100hz *
> - 100LL / (v_total * 
> stream->timing.h_total);
> + unsigned int refresh_hz = div_u64((unsigned 
> long long) stream->timing.pix_clk_100hz *
> + 100LL, (v_total * 
> stream->timing.h_total));
>  
>   /*
>* one frame time in microsec:
> @@ -800,8 +800,8 @@ bool dcn30_apply_idle_power_optimizations(struct dc *dc, 
> bool enable)
>   unsigned int denom = refresh_hz * 6528;
>   unsigned int stutter_period =
> dc->current_state->perf_params.stutter_period_us;
>  
> - tmr_delay = (((100LL + 2 * stutter_period * 
> refresh_hz) *
> - (100LL + 
> dc->debug.mall_additional_timer_percent) + denom - 1) /
> + tmr_delay = div_u64(((100LL + 2 * 
> stutter_period * refresh_hz) *
> + (100LL + 
> dc->debug.mall_additional_timer_percent) + denom - 1),
>   denom) - 64LL;
>  
>   /* scale should be increased until it fits into 
> 6 bits */ @@
> -815,8 +815,8 @@ bool dcn30_apply_idle_power_optimizations(struct dc *dc, 
> bool enable)
>   }
>  
>   denom *= 2;
> - tmr_delay = (((100LL + 2 * 
> stutter_period * refresh_hz) *
> - (100LL + 
> dc->debug.mall_additional_timer_percent) + denom - 1) /
> + tmr_delay = div_u64(((100LL + 2 * 
> stutter_period * refresh_hz) *
> + (100LL + 
> dc->debug.mall_additional_timer_percent) + denom - 
> +1),
>   denom) - 64LL;
>   }
>  
> --
> 2.25.1
> 
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=04%7C01%7Cguchun.chen%40amd.com%7Cd61b9d6686b64c78d73b08d8beb4ac34%7C3dd8961fe4884e608e11a8