AMD General
Yes, the muxing of the per-device delay times happens in kfd_get_sigbus_delay_ms().
But we still need some per-device storage to hold each device's delay time.
>> +/*
>> + * Resolve the per-process SIGBUS opt-in setting by scanning all of the
>> + * process' KFD pdds (each backed by an amdgpu render fd). Returns the
>> + * "most lenient" value across all fds, in this priority:
>> + * DISABLED (no SIGBUS) > any non-zero timeout > 0 (immediate)
>> + *
>> + * Rationale: if the app has explicitly opted in on any GPU it uses, it
>> + * wants the chance to handle the error in userspace.
>> + */
>> +static u32 kfd_get_sigbus_delay_ms(struct kfd_process *p) {
>> + u32 result = 0;
>> + int i;
>> +
>> + for (i = 0; i < p->n_pdds; i++) {
>> + struct kfd_process_device *pdd = p->pdds[i];
>> + struct amdgpu_fpriv *drv_priv;
>> + u32 v;
>> +
>> + if (!pdd || !pdd->drm_file)
>> + continue;
>> + if (amdgpu_file_to_fpriv(pdd->drm_file, &drv_priv))
>> + continue;
>> +
>> + v = atomic_read(&drv_priv->kfd_sigbus_delay_ms);
>> + if (v == AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED)
>> + return v;
>> + if (v > result)
>> + result = v;
>> + }
>> +
>> + return result;
>> +}
Best Regards,
Yifan
-----Original Message-----
From: Lazar, Lijo <[email protected]>
Sent: Thursday, May 21, 2026 1:31 PM
To: Zhang, Yifan <[email protected]>; [email protected]
Cc: Deucher, Alexander <[email protected]>; Koenig, Christian
<[email protected]>; Kuehling, Felix <[email protected]>; Yat Sin,
David <[email protected]>; Russell, Kent <[email protected]>; Yuan, Perry
<[email protected]>
Subject: Re: [PATCH v2] drm/amdgpu: add ioctl to handle RAS poison error
On 21-May-26 10:53 AM, Zhang, Yifan wrote:
> AMD General
>
> The signal itself is indeed process-scoped, but the policy is naturally
> per-device, because the poison event has a dev associated with it. ROCr
> already holds one amdgpu render fd per device, so attaching the option to
> amdgpu_fpriv lets the app configure each device independently with the handle
> it already has.
>
> E.g.
>
> Device A is driven by a worker that has its own RAS handler: set DISABLED on
> fd(A).
> Device B has no special handling: leave default / use a 10s safety timeout on
> fd(B).
>
> Storing a single process-wide value would force the app to pick one policy
> for all GPUs it has open, which doesn't match how ROCr layers per-device
> handlers.
>
From a process's perspective, isn't what it wants simply to delay the signal
if any of the devices it uses runs into a poison error, and thereby buy some
time to clean up?
Thanks,
Lijo
>
> Best Regards,
> Yifan
>
> -----Original Message-----
> From: Lazar, Lijo <[email protected]>
> Sent: Thursday, May 21, 2026 12:35 PM
> To: Zhang, Yifan <[email protected]>; [email protected]
> Cc: Deucher, Alexander <[email protected]>; Koenig, Christian
> <[email protected]>; Kuehling, Felix <[email protected]>;
> Yat Sin, David <[email protected]>; Russell, Kent
> <[email protected]>; Yuan, Perry <[email protected]>
> Subject: Re: [PATCH v2] drm/amdgpu: add ioctl to handle RAS poison
> error
>
>
>
> On 15-May-26 6:57 PM, Yifan Zhang wrote:
>> Add a new DRM_IOCTL_AMDGPU_USER_OPTIONS ioctl with the
>> AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY option, allowing userspace
>> (ROCr) to control per-process SIGBUS delivery.
>>
>> Userspace for this can be found at:
>> https://github.com/ROCm/rocm-systems/pull/6148
>>
>> Signed-off-by: Yifan Zhang <[email protected]>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 12 +++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
>> drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 29 ++++++
>> drivers/gpu/drm/amd/amdkfd/kfd_events.c | 114 +++++++++++++++++++++++-
>> include/uapi/drm/amdgpu_drm.h | 25 ++++++
>> 5 files changed, 179 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 5d7bfa59424a..6a5459b59af2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -455,6 +455,16 @@ struct amdgpu_fpriv {
>>
>> /** GPU partition selection */
>> uint32_t xcp_id;
>> +
>> + /**
>> + * @kfd_sigbus_delay_ms: Per-fd KFD SIGBUS delivery option (set via
>> + * DRM_IOCTL_AMDGPU_USER_OPTIONS /
>> AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY).
>> + *
>> + * 0 - send SIGBUS immediately (default)
>> + * 0xFFFFFFFF - suppress SIGBUS delivery
>> + * other - delay SIGBUS delivery by this many milliseconds
>> + */
>> + atomic_t kfd_sigbus_delay_ms;
>> };
>>
>> int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv
>> **fpriv); @@ -1467,6 +1477,8 @@ int amdgpu_enable_vblank_kms(struct drm_crtc
>> *crtc);
>> void amdgpu_disable_vblank_kms(struct drm_crtc *crtc);
>> int amdgpu_info_ioctl(struct drm_device *dev, void *data,
>> struct drm_file *filp);
>> +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data,
>> + struct drm_file *filp);
>>
>> /*
>> * functions used by amdgpu_encoder.c diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 99688391e70b..cad18bd6f8b3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -3078,6 +3078,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = {
>> DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, amdgpu_userq_signal_ioctl,
>> DRM_AUTH|DRM_RENDER_ALLOW),
>> DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl,
>> DRM_AUTH|DRM_RENDER_ALLOW),
>> DRM_IOCTL_DEF_DRV(AMDGPU_GEM_LIST_HANDLES,
>> amdgpu_gem_list_handles_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
>> + DRM_IOCTL_DEF_DRV(AMDGPU_USER_OPTIONS,
>> +amdgpu_user_options_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
>> };
>>
>> static const struct drm_driver amdgpu_kms_driver = { diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> index 24526e92f9b8..7903587b8bbb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> @@ -1423,6 +1423,35 @@ int amdgpu_info_ioctl(struct drm_device *dev, void
>> *data, struct drm_file *filp)
>> return 0;
>> }
>>
>> +/**
>> + * amdgpu_user_options_ioctl - set per-fd user options
>> + *
>> + * @dev: drm dev pointer
>> + * @data: pointer to struct drm_amdgpu_user_options
>> + * @filp: drm file
>> + *
>> + * Sets options stored on the per-file amdgpu_fpriv. Currently the
>> +only
>> + * supported option is %AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY
>> +which
>> + * controls how KFD delivers SIGBUS for poison/RAS events to the
>> +calling
>> + * process (immediate, suppressed, or delayed by N milliseconds).
>> + */
>> +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data,
>> + struct drm_file *filp) {
>> + struct amdgpu_fpriv *fpriv = filp->driver_priv;
>> + struct drm_amdgpu_user_options *args = data;
>> +
>> + switch (args->op) {
>> + case AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY:
>> + atomic_set(&fpriv->kfd_sigbus_delay_ms,
>> + args->kfd_sigbus_delay.value);
>
> Why is this stored at the device level? A signal is process-specific. I think the
> delay should be associated with the process regardless of the multi-dev scenario.
>
> Thanks,
> Lijo
>
>> + return 0;
>> + default:
>> + DRM_DEBUG_KMS("Invalid user option op %u\n", args->op);
>> + return -EINVAL;
>> + }
>> +}
>> +
>> /**
>> * amdgpu_driver_open_kms - drm callback for open
>> *
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> index e9be798c0a2b..2ff6348105b7 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> @@ -29,10 +29,12 @@
>> #include <linux/uaccess.h>
>> #include <linux/mman.h>
>> #include <linux/memory.h>
>> +#include <linux/workqueue.h>
>> #include "kfd_priv.h"
>> #include "kfd_events.h"
>> #include "kfd_device_queue_manager.h"
>> #include <linux/device.h>
>> +#include <uapi/drm/amdgpu_drm.h>
>>
>> /*
>> * Wrapper around wait_queue_entry_t @@ -1337,6 +1339,115 @@ void
>> kfd_signal_reset_event(struct kfd_node *dev)
>> srcu_read_unlock(&kfd_processes_srcu, idx);
>> }
>>
>> +/*
>> + * Per-process opt-in for poison-consumption SIGBUS handling.
>> + *
>> + * Default: kernel sends SIGBUS to the process immediately when
>> +poison is
>> + * consumed, in addition to delivering the KFD HW/MEMORY exception events.
>> + *
>> + * Userspace (ROCr) can opt-in per-process via the
>> + * DRM_IOCTL_AMDGPU_USER_OPTIONS /
>> +AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY
>> + * option. This lets the app's registered system-event callback
>> +handle the
>> + * RAS error first, instead of being killed by SIGBUS.
>> + *
>> + * Encoded value (set on any of the process' amdgpu render fds):
>> + * 0 - default: SIGBUS immediately (no opt-in)
>> + * 0xFFFFFFFF - opt-in, never escalate to SIGBUS
>> + * N (other) - opt-in, escalate to SIGBUS after N ms if app does not
>> + * handle the error in time (safety timeout)
>> + *
>> + * Per-process scope: the option is honored if ANY of the process'
>> +amdgpu
>> + * fds has been configured. This matches the slide deck's
>> +"Per-process,
>> + * App set at init" semantics, while keeping the UAPI on amdgpu
>> +where ROCr
>> + * sets it.
>> + */
>> +struct kfd_sigbus_delayed_work {
>> + struct delayed_work work;
>> + struct kfd_process *p;
>> +};
>> +
>> +static void kfd_signal_sigbus_delayed_fn(struct work_struct *work) {
>> + struct kfd_sigbus_delayed_work *dw =
>> container_of(to_delayed_work(work),
>> + struct kfd_sigbus_delayed_work, work);
>> + struct kfd_process *p = dw->p;
>> +
>> + if (p->lead_thread)
>> + send_sig(SIGBUS, p->lead_thread, 0);
>> +
>> + kfd_unref_process(p);
>> + kfree(dw);
>> +}
>> +
>> +/*
>> + * Resolve the per-process SIGBUS opt-in setting by scanning all of
>> +the
>> + * process' KFD pdds (each backed by an amdgpu render fd). Returns
>> +the
>> + * "most lenient" value across all fds, in this priority:
>> + * DISABLED (no SIGBUS) > any non-zero timeout > 0 (immediate)
>> + *
>> + * Rationale: if the app has explicitly opted in on any GPU it uses,
>> +it
>> + * wants the chance to handle the error in userspace.
>> + */
>> +static u32 kfd_get_sigbus_delay_ms(struct kfd_process *p) {
>> + u32 result = 0;
>> + int i;
>> +
>> + for (i = 0; i < p->n_pdds; i++) {
>> + struct kfd_process_device *pdd = p->pdds[i];
>> + struct amdgpu_fpriv *drv_priv;
>> + u32 v;
>> +
>> + if (!pdd || !pdd->drm_file)
>> + continue;
>> + if (amdgpu_file_to_fpriv(pdd->drm_file, &drv_priv))
>> + continue;
>> +
>> + v = atomic_read(&drv_priv->kfd_sigbus_delay_ms);
>> + if (v == AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED)
>> + return v;
>> + if (v > result)
>> + result = v;
>> + }
>> +
>> + return result;
>> +}
>> +
>> +static void kfd_signal_sigbus_with_delay(struct kfd_node *dev,
>> + struct kfd_process *p) {
>> + u32 delay_ms = kfd_get_sigbus_delay_ms(p);
>> + struct kfd_sigbus_delayed_work *dw;
>> +
>> + if (delay_ms == AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED) {
>> + dev_info(dev->adev->dev,
>> + "SIGBUS suppressed for process %s(pid:%d): app opted
>> in to handle RAS error\n",
>> + p->lead_thread->comm, p->lead_thread->pid);
>> + return;
>> + }
>> +
>> + if (delay_ms == 0)
>> + goto send_now;
>> +
>> + dw = kzalloc(sizeof(*dw), GFP_ATOMIC);
>> + if (!dw)
>> + goto send_now;
>> +
>> + /* Take an extra reference for the delayed worker. */
>> + kref_get(&p->ref);
>> + dw->p = p;
>> + INIT_DELAYED_WORK(&dw->work, kfd_signal_sigbus_delayed_fn);
>> +
>> + dev_info(dev->adev->dev,
>> + "Deferring SIGBUS to process %s(pid:%d) by %u ms (RAS error
>> opt-in safety timeout)\n",
>> + p->lead_thread->comm, p->lead_thread->pid, delay_ms);
>> + schedule_delayed_work(&dw->work, msecs_to_jiffies(delay_ms));
>> + return;
>> +
>> +send_now:
>> + send_sig(SIGBUS, p->lead_thread, 0); }
>> +
>> void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
>> {
>> struct kfd_process *p = kfd_lookup_process_by_pasid(pasid,
>> NULL); @@ -1345,7 +1456,6 @@ void kfd_signal_poison_consumed_event(struct
>> kfd_node *dev, u32 pasid)
>> struct kfd_event *ev;
>> uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
>> int user_gpu_id;
>> -
>> if (!p) {
>> dev_warn(dev->adev->dev, "Not find process with pasid:%d\n",
>> pasid);
>> return; /* Presumably process exited. */ @@ -1391,7
>> +1501,7 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32
>> pasid)
>> rcu_read_unlock();
>>
>> /* user application will handle SIGBUS signal */
>> - send_sig(SIGBUS, p->lead_thread, 0);
>> + kfd_signal_sigbus_with_delay(dev, p);
>>
>> kfd_unref_process(p);
>> }
>> diff --git a/include/uapi/drm/amdgpu_drm.h
>> b/include/uapi/drm/amdgpu_drm.h index 9f3090db2f16..dfc91d25c80d
>> 100644
>> --- a/include/uapi/drm/amdgpu_drm.h
>> +++ b/include/uapi/drm/amdgpu_drm.h
>> @@ -58,6 +58,7 @@ extern "C" {
>> #define DRM_AMDGPU_USERQ_SIGNAL 0x17
>> #define DRM_AMDGPU_USERQ_WAIT 0x18
>> #define DRM_AMDGPU_GEM_LIST_HANDLES 0x19
>> +#define DRM_AMDGPU_USER_OPTIONS 0x1A
>>
>> #define DRM_IOCTL_AMDGPU_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE +
>> DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create)
>> #define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE +
>> DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap)
>> @@ -79,6 +80,7 @@ extern "C" {
>> #define DRM_IOCTL_AMDGPU_USERQ_SIGNAL DRM_IOWR(DRM_COMMAND_BASE +
>> DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal)
>> #define DRM_IOCTL_AMDGPU_USERQ_WAIT DRM_IOWR(DRM_COMMAND_BASE +
>> DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait)
>> #define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES
>> DRM_IOWR(DRM_COMMAND_BASE
>> + DRM_AMDGPU_GEM_LIST_HANDLES, struct drm_amdgpu_gem_list_handles)
>> +#define DRM_IOCTL_AMDGPU_USER_OPTIONS DRM_IOWR(DRM_COMMAND_BASE +
>> DRM_AMDGPU_USER_OPTIONS, struct drm_amdgpu_user_options)
>>
>> /**
>> * DOC: memory domains
>> @@ -1673,6 +1675,29 @@ struct drm_amdgpu_info_uq_metadata {
>> #define AMDGPU_FAMILY_GC_11_5_4 154 /* GC 11.5.4 */
>> #define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */
>>
>> +/*
>> + * Definition of user options
>> + *
>> + * option: AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY
>> + * 0: Disable sigbus delay - SIGBUS will be raised immediately
>> + * 0xFFFFFFFF: SIGBUS will not be raised
>> + * other: Set the sigbus delay in milliseconds
>> + */
>> +#define AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY 0
>> +
>> +#define AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED 0xFFFFFFFFu
>> +
>> +struct drm_amdgpu_user_options {
>> + __u32 op;
>> + union {
>> + struct {
>> + __u16 value;
>> + __u16 _pad;
>> + } kfd_sigbus_delay;
>> + __u32 _pad;
>> + };
>> +};
>> +
>> #if defined(__cplusplus)
>> }
>> #endif
>