[Public]

> -----Original Message-----
> From: Zhu, James <[email protected]>
> Sent: Monday, January 19, 2026 12:15 PM
> To: [email protected]
> Cc: Six, Lancelot <[email protected]>; Kim, Jonathan
> <[email protected]>; Zhu, James <[email protected]>
> Subject: [PATCH] drm/amdkfd: add mask for debug trap flag setting
>
> to specify which bits are valid setting on flags.

Can you elaborate?

>
> Signed-off-by: James Zhu <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  2 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 48 ++++++++++++------------
>  drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  3 +-
>  include/uapi/linux/kfd_ioctl.h           |  3 +-
>  4 files changed, 30 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 79586abad7fd..fd43ef7c9076 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -3377,7 +3377,7 @@ static int kfd_ioctl_set_debug_trap(struct file *filep,
> struct kfd_process *p, v
>                               args->clear_node_address_watch.id);
>               break;
>       case KFD_IOC_DBG_TRAP_SET_FLAGS:
> -             r = kfd_dbg_trap_set_flags(target, &args->set_flags.flags);
> +             r = kfd_dbg_trap_set_flags(target, &args->set_flags);
>               break;
>       case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
>               r = kfd_dbg_ev_query_debug_event(target,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index a04875236647..279160ca71a9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -512,38 +512,42 @@ static void
> kfd_dbg_clear_process_address_watch(struct kfd_process *target)
>                       kfd_dbg_trap_clear_dev_address_watch(target-
> >pdds[i], j);
>  }
>
> -int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
> +int kfd_dbg_trap_set_flags(struct kfd_process *target,
> +     struct kfd_ioctl_dbg_trap_set_flags_args *set_flags)
>  {
>       uint32_t prev_flags = target->dbg_flags;
>       int i, r = 0, rewind_count = 0;
>
> +     if (!((set_flags->flags ^ prev_flags) & set_flags->mask))

Does this block old debuggers from setting debug mode if pad is 0?

> +             return 0;
> +
>       for (i = 0; i < target->n_pdds; i++) {
>               struct kfd_topology_device *topo_dev =
>                               kfd_topology_device_by_id(target->pdds[i]-
> >dev->id);
>               uint32_t caps = topo_dev->node_props.capability;
>               uint32_t caps2 = topo_dev->node_props.capability2;
> +             struct amdgpu_device *adev = target->pdds[i]->dev->adev;
>
> -             if (!(caps &
> HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&
> -                     (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
> -                     *flags = prev_flags;
> -                     return -EACCES;
> -             }
> -
> -             if (!(caps &
> HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) &&
> -                 (*flags & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP)) {
> -                     *flags = prev_flags;
> -                     return -EACCES;
> -             }
> -
> -             if (!(caps2 &
> HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED) &&
> -                 (*flags &
> KFD_DBG_TRAP_FLAG_LDS_OUT_OF_ADDR_RANGE)) {
> -                     *flags = prev_flags;
> +             if (set_flags->mask == 0xFFFFFFFF && !set_flags->flags)
> +                     break;

Seems like this is only for a deactivation system call.
Probably don't want to let users abuse this with a magic number then.
Maybe breakout into new static call in deactivation or just device loop 
deactivate directly there since this logic is starting to get complicated.

> +             if ((!(caps &
> HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&
> +                     (set_flags->mask &
> KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) ||
> +                     (!(caps &
> HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) &&
> +                 (set_flags->mask & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP))
> ||
> +                     (!(caps2 &
> HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED) &&
> +                 (set_flags->mask &
> KFD_DBG_TRAP_FLAG_LDS_OUT_OF_ADDR_RANGE))) {

Please vertically indent align "caps" and "set_flags" underneath each other for 
legibility.

> +                     set_flags->flags = prev_flags;
> +                     dev_dbg(adev->dev, "Debug Trap set mask 0x%x caps
> 0x%x caps2 0x%x",
> +                             set_flags->mask, caps, caps2);
>                       return -EACCES;
>               }
>       }
>
> -     target->dbg_flags = *flags;
> -     *flags = prev_flags;
> +     target->dbg_flags ^= (target->dbg_flags ^ set_flags->flags) & set_flags-
> >mask;

I assume we can only set/unset 1 flag at a time, which is why this works.
Otherwise, maybe I don't have enough background on this, but please do 
elaborate.

> +     pr_debug("Debug Trap previous flags 0x%x set flags 0x%x set mask 0x%x
> target flags 0x%x",
> +             prev_flags, set_flags->flags, set_flags->mask, target-
> >dbg_flags);
> +
> +     set_flags->flags = prev_flags;
>       for (i = 0; i < target->n_pdds; i++) {
>               struct kfd_process_device *pdd = target->pdds[i];
>
> @@ -555,10 +559,8 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target,
> uint32_t *flags)
>               else
>                       r = kfd_dbg_set_mes_debug_mode(pdd, true);
>
> -             if (r) {
> -                     target->dbg_flags = prev_flags;

Doesn't rewind require the previous value for something to rewind to?

> +             if (r)
>                       break;
> -             }
>
>               rewind_count++;
>       }
> @@ -596,7 +598,7 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target,
> bool unwind, int unwind
>       int i;
>
>       if (!unwind) {
> -             uint32_t flags = 0;
> +             struct kfd_ioctl_dbg_trap_set_flags_args set_flags = {0,
> 0xFFFFFFFF};

Similar to 3 comments above.  Just call a new static or loop directly to reset 
debug mode (i.e. all flags clear).
i.e. you don't have to rewind on preemption failure here because deactivation 
itself is a rewind we're pretty much out of luck at this point if we fail.

Jon

>               int resume_count = resume_queues(target, 0, NULL);
>
>               if (resume_count)
> @@ -606,7 +608,7 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target,
> bool unwind, int unwind
>               kfd_dbg_clear_process_address_watch(target);
>               kfd_dbg_trap_set_wave_launch_mode(target, 0);
>
> -             kfd_dbg_trap_set_flags(target, &flags);
> +             kfd_dbg_trap_set_flags(target, &set_flags);
>       }
>
>       for (i = 0; i < target->n_pdds; i++) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index 894753818cba..34d27eb500a5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -62,7 +62,8 @@ int kfd_dbg_trap_set_dev_address_watch(struct
> kfd_process_device *pdd,
>                                       uint32_t watch_address_mask,
>                                       uint32_t *watch_id,
>                                       uint32_t watch_mode);
> -int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags);
> +int kfd_dbg_trap_set_flags(struct kfd_process *target,
> +             struct kfd_ioctl_dbg_trap_set_flags_args *set_flags);
>  int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
>               uint32_t source_id,
>               uint32_t exception_code,
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index e9b756ca228c..0522fe7344e4 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -1515,6 +1515,7 @@ struct
> kfd_ioctl_dbg_trap_clear_node_address_watch_args {
>   *     Sets flags for wave behaviour.
>   *
>   *     @flags (IN/OUT) - IN = flags to enable, OUT = flags previously enabled
> + *     @mask  (IN)     - IN = specified debug trap operation bits on flag
>   *
>   *     Generic errors apply (see kfd_dbg_trap_operations).
>   *     Return - 0 on SUCCESS.
> @@ -1522,7 +1523,7 @@ struct
> kfd_ioctl_dbg_trap_clear_node_address_watch_args {
>   */
>  struct kfd_ioctl_dbg_trap_set_flags_args {
>       __u32 flags;
> -     __u32 pad;
> +     __u32 mask;
>  };
>
>  /**
> --
> 2.34.1

Reply via email to