On Sun, May 17, 2026 at 1:44 AM Yifan Zhang <[email protected]> wrote:
>
> Add a new DRM_IOCTL_AMDGPU_USER_OPTIONS ioctl with the
> AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY option, allowing userspace (ROCr)
> to control per-process SIGBUS delivery.
>
> Userspace for this can be found at:
> https://github.com/ROCm/rocm-systems/pull/6190
>
> Signed-off-by: Yifan Zhang <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 12 +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 29 ++++++
> drivers/gpu/drm/amd/amdkfd/kfd_events.c | 118 +++++++++++++++++++++++-
> include/uapi/drm/amdgpu_drm.h | 24 +++++
> 5 files changed, 182 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 5d7bfa59424a..0408476f1070 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -455,6 +455,16 @@ struct amdgpu_fpriv {
>
> /** GPU partition selection */
> uint32_t xcp_id;
> +
> + /**
> + * @kfd_sigbus_delay_ms: Per-fd KFD SIGBUS delivery option (set via
> + * DRM_IOCTL_AMDGPU_USER_OPTIONS /
> AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY).
> + *
> + * 0 - send SIGBUS immediately (default)
> + * 0xFFFF - suppress SIGBUS delivery
> + * other - delay SIGBUS delivery by this many milliseconds
> + */
> + atomic_t kfd_sigbus_delay_ms;
> };
>
> int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv);
> @@ -1467,6 +1477,8 @@ int amdgpu_enable_vblank_kms(struct drm_crtc *crtc);
> void amdgpu_disable_vblank_kms(struct drm_crtc *crtc);
> int amdgpu_info_ioctl(struct drm_device *dev, void *data,
> struct drm_file *filp);
> +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data,
> + struct drm_file *filp);
>
> /*
> * functions used by amdgpu_encoder.c
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 99688391e70b..cad18bd6f8b3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -3078,6 +3078,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = {
> DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, amdgpu_userq_signal_ioctl,
> DRM_AUTH|DRM_RENDER_ALLOW),
> DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl,
> DRM_AUTH|DRM_RENDER_ALLOW),
> DRM_IOCTL_DEF_DRV(AMDGPU_GEM_LIST_HANDLES,
> amdgpu_gem_list_handles_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
> + DRM_IOCTL_DEF_DRV(AMDGPU_USER_OPTIONS, amdgpu_user_options_ioctl,
> DRM_AUTH|DRM_RENDER_ALLOW),
> };
>
> static const struct drm_driver amdgpu_kms_driver = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index 24526e92f9b8..7903587b8bbb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -1423,6 +1423,35 @@ int amdgpu_info_ioctl(struct drm_device *dev, void
> *data, struct drm_file *filp)
> return 0;
> }
>
> +/**
> + * amdgpu_user_options_ioctl - set per-fd user options
> + *
> + * @dev: drm dev pointer
> + * @data: pointer to struct drm_amdgpu_user_options
> + * @filp: drm file
> + *
> + * Sets options stored on the per-file amdgpu_fpriv. Currently the only
> + * supported option is %AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY which
> + * controls how KFD delivers SIGBUS for poison/RAS events to the calling
> + * process (immediate, suppressed, or delayed by N milliseconds).
> + */
> +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data,
> + struct drm_file *filp)
> +{
> + struct amdgpu_fpriv *fpriv = filp->driver_priv;
> + struct drm_amdgpu_user_options *args = data;
> +
> + switch (args->op) {
> + case AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY:
> + atomic_set(&fpriv->kfd_sigbus_delay_ms,
> + args->kfd_sigbus_delay.value);
> + return 0;
> + default:
> + DRM_DEBUG_KMS("Invalid user option op %u\n", args->op);
> + return -EINVAL;
> + }
> +}
> +
> /**
> * amdgpu_driver_open_kms - drm callback for open
> *
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> index e9be798c0a2b..200570401f51 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> @@ -29,10 +29,12 @@
> #include <linux/uaccess.h>
> #include <linux/mman.h>
> #include <linux/memory.h>
> +#include <linux/workqueue.h>
> #include "kfd_priv.h"
> #include "kfd_events.h"
> #include "kfd_device_queue_manager.h"
> #include <linux/device.h>
> +#include <uapi/drm/amdgpu_drm.h>
>
> /*
> * Wrapper around wait_queue_entry_t
> @@ -1337,6 +1339,119 @@ void kfd_signal_reset_event(struct kfd_node *dev)
> srcu_read_unlock(&kfd_processes_srcu, idx);
> }
>
> +/*
> + * Per-process opt-in for poison-consumption SIGBUS handling.
> + *
> + * Default: kernel sends SIGBUS to the process immediately when poison is
> + * consumed, in addition to delivering the KFD HW/MEMORY exception events.
> + *
> + * Userspace (ROCr) can opt-in per-process via the
> + * DRM_IOCTL_AMDGPU_USER_OPTIONS / AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY
> + * option. This lets the app's registered system-event callback handle the
> + * RAS error first, instead of being killed by SIGBUS.
> + *
> + * Encoded value (set on any of the process' amdgpu render fds):
> + * 0 - default: SIGBUS immediately (no opt-in)
> + * 0xFFFF - opt-in, never escalate to SIGBUS
> + * N (other) - opt-in, escalate to SIGBUS after N ms if app does not
> + * handle the error in time (safety timeout)
> + *
> + * Per-process scope: the option is honored if ANY of the process' amdgpu
> + * fds has been configured. This matches the slide deck's "Per-process,
> + * App set at init" semantics, while keeping the UAPI on amdgpu where ROCr
> + * sets it.
> + */
> +struct kfd_sigbus_delayed_work {
> + struct delayed_work work;
> + struct kfd_process *p;
> +};
> +
> +static void kfd_signal_sigbus_delayed_fn(struct work_struct *work)
> +{
> + struct kfd_sigbus_delayed_work *dw =
> container_of(to_delayed_work(work),
> + struct kfd_sigbus_delayed_work, work);
> + struct kfd_process *p = dw->p;
> +
> + if (p->lead_thread)
> + send_sig(SIGBUS, p->lead_thread, 0);
> +
> + kfd_unref_process(p);
> + kfree(dw);
> +}
> +
> +/*
> + * Resolve the per-process SIGBUS opt-in setting by scanning all of the
> + * process' KFD pdds (each backed by an amdgpu render fd). Returns the
> + * "most lenient" value across all fds, in this priority:
> + * DISABLED (no SIGBUS) > any non-zero timeout > 0 (immediate)
> + *
> + * Rationale: if the app has explicitly opted in on any GPU it uses, it
> + * wants the chance to handle the error in userspace.
> + */
> +static u16 kfd_get_sigbus_delay_ms(struct kfd_process *p)
> +{
> + u16 result = 0;
> + int i;
> +
> + mutex_lock(&p->mutex);
> + for (i = 0; i < p->n_pdds; i++) {
> + struct kfd_process_device *pdd = p->pdds[i];
> + struct amdgpu_fpriv *drv_priv;
> + u16 v;
> +
> + if (!pdd || !pdd->drm_file)
> + continue;
> + if (amdgpu_file_to_fpriv(pdd->drm_file, &drv_priv))
> + continue;
> +
> + v = atomic_read(&drv_priv->kfd_sigbus_delay_ms);
> + if (v == AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED) {
> + result = v;
> + break;
> + }
> + if (v > result)
> + result = v;
> + }
> + mutex_unlock(&p->mutex);
> +
> + return result;
> +}
> +
> +static void kfd_signal_sigbus_with_delay(struct kfd_node *dev,
> + struct kfd_process *p)
> +{
> + u16 delay_ms = kfd_get_sigbus_delay_ms(p);
> + struct kfd_sigbus_delayed_work *dw;
> +
> + if (delay_ms == AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED) {
> + dev_info(dev->adev->dev,
> + "SIGBUS suppressed for process %s(pid:%d): app opted
> in to handle RAS error\n",
> + p->lead_thread->comm, p->lead_thread->pid);
> + return;
> + }
> +
> + if (delay_ms == 0)
> + goto send_now;
> +
> + dw = kzalloc(sizeof(*dw), GFP_ATOMIC);
> + if (!dw)
> + goto send_now;
> +
> + /* Take an extra reference for the delayed worker. */
> + kref_get(&p->ref);
> + dw->p = p;
> + INIT_DELAYED_WORK(&dw->work, kfd_signal_sigbus_delayed_fn);
> +
> + dev_info(dev->adev->dev,
> + "Deferring SIGBUS to process %s(pid:%d) by %u ms (RAS error
> opt-in safety timeout)\n",
> + p->lead_thread->comm, p->lead_thread->pid, delay_ms);
> + schedule_delayed_work(&dw->work, msecs_to_jiffies(delay_ms));
> + return;
> +
> +send_now:
> + send_sig(SIGBUS, p->lead_thread, 0);
It is probably worth adding a comment here noting that this feature is
not supported with confidential compute. Other than that, this looks
good to me.

Alex
> +}
> +
> void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
> {
> struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
> @@ -1345,7 +1460,6 @@ void kfd_signal_poison_consumed_event(struct kfd_node
> *dev, u32 pasid)
> struct kfd_event *ev;
> uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
> int user_gpu_id;
> -
> if (!p) {
> dev_warn(dev->adev->dev, "Not find process with pasid:%d\n",
> pasid);
> return; /* Presumably process exited. */
> @@ -1391,7 +1505,7 @@ void kfd_signal_poison_consumed_event(struct kfd_node
> *dev, u32 pasid)
> rcu_read_unlock();
>
> /* user application will handle SIGBUS signal */
> - send_sig(SIGBUS, p->lead_thread, 0);
> + kfd_signal_sigbus_with_delay(dev, p);
>
> kfd_unref_process(p);
> }
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index 9f3090db2f16..e88d7cf53858 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -58,6 +58,7 @@ extern "C" {
> #define DRM_AMDGPU_USERQ_SIGNAL 0x17
> #define DRM_AMDGPU_USERQ_WAIT 0x18
> #define DRM_AMDGPU_GEM_LIST_HANDLES 0x19
> +#define DRM_AMDGPU_USER_OPTIONS 0x1A
>
> #define DRM_IOCTL_AMDGPU_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE +
> DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create)
> #define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE +
> DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap)
> @@ -79,6 +80,7 @@ extern "C" {
> #define DRM_IOCTL_AMDGPU_USERQ_SIGNAL DRM_IOWR(DRM_COMMAND_BASE +
> DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal)
> #define DRM_IOCTL_AMDGPU_USERQ_WAIT DRM_IOWR(DRM_COMMAND_BASE +
> DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait)
> #define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES DRM_IOWR(DRM_COMMAND_BASE +
> DRM_AMDGPU_GEM_LIST_HANDLES, struct drm_amdgpu_gem_list_handles)
> +#define DRM_IOCTL_AMDGPU_USER_OPTIONS DRM_IOWR(DRM_COMMAND_BASE +
> DRM_AMDGPU_USER_OPTIONS, struct drm_amdgpu_user_options)
>
> /**
> * DOC: memory domains
> @@ -1673,6 +1675,28 @@ struct drm_amdgpu_info_uq_metadata {
> #define AMDGPU_FAMILY_GC_11_5_4 154 /* GC 11.5.4 */
> #define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */
>
> +/*
> + * Definition of user options
> + *
> + * option: AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY
> + * 0: Disable sigbus delay - SIGBUS will be raised immediately
> + * 0xFFFF: SIGBUS will not be raised
> + * other: Set the sigbus delay in milliseconds
> + */
> +#define AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY 0
> +
> +#define AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED 0xFFFFu
> +
> +struct drm_amdgpu_user_options {
> + __u32 op;
> + union {
> + struct {
> + __u16 value;
> + __u16 _pad;
> + } kfd_sigbus_delay;
> + };
> +};
> +
> #if defined(__cplusplus)
> }
> #endif
> --
> 2.43.0
>