Re: [PATCH] drm/amdgpu/pm: Fix the error of pwm1_enable setting

2024-03-03 Thread Ma, Jun



On 3/1/2024 5:41 PM, Lazar, Lijo wrote:
> 
> 
> On 3/1/2024 1:15 PM, Ma Jun wrote:
>> Fix the pwm_mode value error which used for
>> pwm1_enable setting
>>
>> Signed-off-by: Ma Jun 
>> ---
>>  drivers/gpu/drm/amd/pm/amdgpu_pm.c | 12 +++-
>>  1 file changed, 11 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
>> b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
>> index 9e70c41ad98f..7cc5cd7616b1 100644
>> --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
>> +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
>> @@ -2582,6 +2582,7 @@ static ssize_t amdgpu_hwmon_set_pwm1_enable(struct 
>> device *dev,
>>  struct amdgpu_device *adev = dev_get_drvdata(dev);
>>  int err, ret;
>>  int value;
>> +u32 pwm_mode;
>>  
> 
> You may move this declaration up to follow reverse Christmas tree order.

Thanks, will fix it when submit.

Regards,
Ma Jun
> 
> Reviewed-by: Lijo Lazar 
> 
> Thanks,
> Lijo
>>  if (amdgpu_in_reset(adev))
>>  return -EPERM;
>> @@ -2592,13 +2593,22 @@ static ssize_t amdgpu_hwmon_set_pwm1_enable(struct 
>> device *dev,
>>  if (err)
>>  return err;
>>  
>> +if (value == 0)
>> +pwm_mode = AMD_FAN_CTRL_NONE;
>> +else if (value == 1)
>> +pwm_mode = AMD_FAN_CTRL_MANUAL;
>> +else if (value == 2)
>> +pwm_mode = AMD_FAN_CTRL_AUTO;
>> +else
>> +return -EINVAL;
>> +
>>  ret = pm_runtime_get_sync(adev_to_drm(adev)->dev);
>>  if (ret < 0) {
>>  pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
>>  return ret;
>>  }
>>  
>> -ret = amdgpu_dpm_set_fan_control_mode(adev, value);
>> +ret = amdgpu_dpm_set_fan_control_mode(adev, pwm_mode);
>>  
>>  pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
>>  pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);


[PATCH V2] drm/amdkfd: fix shift out of bounds about gpu debug

2024-03-03 Thread jesse.zhang
From: Jesse Zhang 

[ 3810.410040] UBSAN: shift-out-of-bounds in 
drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_int_process_v10.c:345:5
[ 3810.410044] shift exponent 4294967295 is too large for 64-bit type 'long 
long unsigned int'
[ 3810.410047] CPU: 6 PID: 331 Comm: kworker/6:1H Not tainted 6.5.0+ #508
[ 3810.410050] Hardware name: AMD Splinter/Splinter-GNR, BIOS WS54117N_140 
01/16/2024
[ 3810.410052] Workqueue: KFD IH interrupt_wq [amdgpu]
[ 3810.410273] Call Trace:
[ 3810.410274]  
[ 3810.410277]  dump_stack_lvl+0x4c/0x70
[ 3810.410283]  dump_stack+0x14/0x20
[ 3810.410285]  ubsan_epilogue+0x9/0x40
[ 3810.410290]  __ubsan_handle_shift_out_of_bounds+0x113/0x170
[ 3810.410292]  ? 
ZSTD_decompressSequencesSplitLitBuffer_default.isra.0+0x1389/0x1b50
[ 3810.410296]  event_interrupt_wq_v10.cold+0x16/0x1e [amdgpu]
[ 3810.410523]  ? raw_spin_rq_unlock+0x14/0x40
[ 3810.410526]  ? finish_task_switch+0x85/0x2b0
[ 3810.410528]  interrupt_wq+0xb2/0x120 [amdgpu]
[ 3810.410692]  ? interrupt_wq+0xb2/0x120 [amdgpu]
[ 3810.410806]  process_one_work+0x229/0x430
[ 3810.410810]  worker_thread+0x4e/0x3c0
[ 3810.410811]  ? __pfx_worker_thread+0x10/0x10
[ 3810.410813]  kthread+0xfb/0x130
[ 3810.410815]  ? __pfx_kthread+0x10/0x10
[ 3810.410816]  ret_from_fork+0x3d/0x60
[ 3810.410819]  ? __pfx_kthread+0x10/0x10
[ 3810.410820]  ret_from_fork_asm+0x1b/0x30
[ 3810.410823]  

 -v2: define a macro. KFD process interrupts v9, v10, v11 can use that check 
prior to mask conversion
  and user space may find it useful as well.(Jon)

Signed-off-by: Jesse Zhang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 3 +++
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c | 6 +-
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  | 3 +++
 include/uapi/linux/kfd_ioctl.h   | 6 ++
 4 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index 9a06c6fb6605..110ec5f71056 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -340,6 +340,9 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
}
kfd_signal_event_interrupt(pasid, context_id0 & 
0x7f, 23);
} else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) {
+   /* filter out the invalidate context_id0 */
+   if (KFD_DBG_EC_RANGE_CHECK(context_id0))
+   return;
kfd_set_dbg_ev_from_interrupt(dev, pasid,
KFD_DEBUG_DOORBELL_ID(context_id0),

KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)),
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index 7e2859736a55..c28cafa4b902 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -328,11 +328,15 @@ static void event_interrupt_wq_v11(struct kfd_node *dev,
/* CP */
if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
kfd_signal_event_interrupt(pasid, context_id0, 32);
-   else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE)
+   else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) {
+   /* filter out the invalidate context_id0 */
+   if (KFD_DBG_EC_RANGE_CHECK(context_id0))
+   return;
kfd_set_dbg_ev_from_interrupt(dev, pasid,
KFD_CTXID0_DOORBELL_ID(context_id0),

KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)),
NULL, 0);
+   }
 
/* SDMA */
else if (source_id == SOC21_INTSRC_SDMA_TRAP)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 91dd5e045b51..89dbefbd3081 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -389,6 +389,9 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
}
kfd_signal_event_interrupt(pasid, sq_int_data, 24);
} else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) {
+/* filter out the invalidate context_id0 */
+   if (KFD_DBG_EC_RANGE_CHECK(context_id0))
+   return;
kfd_set_dbg_ev_from_interrupt(dev, pasid,
KFD_DEBUG_DOORBELL_ID(context_id0),

KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)),
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 9ce46edc62a5..9cd3aa83aac3 100644
--- a/include/uapi/linux/kfd_