On Tue, May 12, 2026 at 9:19 AM Yifan Zhang <[email protected]> wrote:
>
> Add a new DRM_IOCTL_AMDGPU_USER_OPTIONS ioctl with the
> AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY option, allowing userspace (ROCr)
> to control per-process SIGBUS delivery.

Please provide a link to the proposed userspace patches which use this new interface.

Alex

>
> Signed-off-by: Yifan Zhang <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 12 +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 29 ++++++
> drivers/gpu/drm/amd/amdkfd/kfd_events.c | 114 +++++++++++++++++++++++-
> include/uapi/drm/amdgpu_drm.h | 23 +++++
> 5 files changed, 177 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 80b18bbd7f3a..653a2a516e18 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -455,6 +455,16 @@ struct amdgpu_fpriv {
>
> /** GPU partition selection */
> uint32_t xcp_id;
> +
> + /**
> + * @kfd_sigbus_delay_ms: Per-fd KFD SIGBUS delivery option (set via
> + * DRM_IOCTL_AMDGPU_USER_OPTIONS /
> AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY).
> + *
> + * 0 - send SIGBUS immediately (default)
> + * 0xFFFFFFFF - suppress SIGBUS delivery
> + * other - delay SIGBUS delivery by this many milliseconds
> + */
> + atomic_t kfd_sigbus_delay_ms;
> };
>
> int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv);
> @@ -1467,6 +1477,8 @@ int amdgpu_enable_vblank_kms(struct drm_crtc *crtc);
> void amdgpu_disable_vblank_kms(struct drm_crtc *crtc);
> int amdgpu_info_ioctl(struct drm_device *dev, void *data,
> struct drm_file *filp);
> +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data,
> + struct drm_file *filp);
>
> /*
> * functions used by amdgpu_encoder.c
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 99688391e70b..cad18bd6f8b3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -3078,6 +3078,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = {
> DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, amdgpu_userq_signal_ioctl,
> 
DRM_AUTH|DRM_RENDER_ALLOW), > DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl, > DRM_AUTH|DRM_RENDER_ALLOW), > DRM_IOCTL_DEF_DRV(AMDGPU_GEM_LIST_HANDLES, > amdgpu_gem_list_handles_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), > + DRM_IOCTL_DEF_DRV(AMDGPU_USER_OPTIONS, amdgpu_user_options_ioctl, > DRM_AUTH|DRM_RENDER_ALLOW), > }; > > static const struct drm_driver amdgpu_kms_driver = { > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > index 24526e92f9b8..7903587b8bbb 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > @@ -1423,6 +1423,35 @@ int amdgpu_info_ioctl(struct drm_device *dev, void > *data, struct drm_file *filp) > return 0; > } > > +/** > + * amdgpu_user_options_ioctl - set per-fd user options > + * > + * @dev: drm dev pointer > + * @data: pointer to struct drm_amdgpu_user_options > + * @filp: drm file > + * > + * Sets options stored on the per-file amdgpu_fpriv. Currently the only > + * supported option is %AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY which > + * controls how KFD delivers SIGBUS for poison/RAS events to the calling > + * process (immediate, suppressed, or delayed by N milliseconds). 
> + */ > +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data, > + struct drm_file *filp) > +{ > + struct amdgpu_fpriv *fpriv = filp->driver_priv; > + struct drm_amdgpu_user_options *args = data; > + > + switch (args->op) { > + case AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY: > + atomic_set(&fpriv->kfd_sigbus_delay_ms, > + args->kfd_sigbus_delay.value); > + return 0; > + default: > + DRM_DEBUG_KMS("Invalid user option op %u\n", args->op); > + return -EINVAL; > + } > +} > + > /** > * amdgpu_driver_open_kms - drm callback for open > * > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c > b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > index e9be798c0a2b..2ff6348105b7 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > @@ -29,10 +29,12 @@ > #include <linux/uaccess.h> > #include <linux/mman.h> > #include <linux/memory.h> > +#include <linux/workqueue.h> > #include "kfd_priv.h" > #include "kfd_events.h" > #include "kfd_device_queue_manager.h" > #include <linux/device.h> > +#include <uapi/drm/amdgpu_drm.h> > > /* > * Wrapper around wait_queue_entry_t > @@ -1337,6 +1339,115 @@ void kfd_signal_reset_event(struct kfd_node *dev) > srcu_read_unlock(&kfd_processes_srcu, idx); > } > > +/* > + * Per-process opt-in for poison-consumption SIGBUS handling. > + * > + * Default: kernel sends SIGBUS to the process immediately when poison is > + * consumed, in addition to delivering the KFD HW/MEMORY exception events. > + * > + * Userspace (ROCr) can opt-in per-process via the > + * DRM_IOCTL_AMDGPU_USER_OPTIONS / AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY > + * option. This lets the app's registered system-event callback handle the > + * RAS error first, instead of being killed by SIGBUS. 
> + * > + * Encoded value (set on any of the process' amdgpu render fds): > + * 0 - default: SIGBUS immediately (no opt-in) > + * 0xFFFFFFFF - opt-in, never escalate to SIGBUS > + * N (other) - opt-in, escalate to SIGBUS after N ms if app does not > + * handle the error in time (safety timeout) > + * > + * Per-process scope: the option is honored if ANY of the process' amdgpu > + * fds has been configured. This matches the slide deck's "Per-process, > + * App set at init" semantics, while keeping the UAPI on amdgpu where ROCr > + * sets it. > + */ > +struct kfd_sigbus_delayed_work { > + struct delayed_work work; > + struct kfd_process *p; > +}; > + > +static void kfd_signal_sigbus_delayed_fn(struct work_struct *work) > +{ > + struct kfd_sigbus_delayed_work *dw = > container_of(to_delayed_work(work), > + struct kfd_sigbus_delayed_work, work); > + struct kfd_process *p = dw->p; > + > + if (p->lead_thread) > + send_sig(SIGBUS, p->lead_thread, 0); > + > + kfd_unref_process(p); > + kfree(dw); > +} > + > +/* > + * Resolve the per-process SIGBUS opt-in setting by scanning all of the > + * process' KFD pdds (each backed by an amdgpu render fd). Returns the > + * "most lenient" value across all fds, in this priority: > + * DISABLED (no SIGBUS) > any non-zero timeout > 0 (immediate) > + * > + * Rationale: if the app has explicitly opted in on any GPU it uses, it > + * wants the chance to handle the error in userspace. 
> + */ > +static u32 kfd_get_sigbus_delay_ms(struct kfd_process *p) > +{ > + u32 result = 0; > + int i; > + > + for (i = 0; i < p->n_pdds; i++) { > + struct kfd_process_device *pdd = p->pdds[i]; > + struct amdgpu_fpriv *drv_priv; > + u32 v; > + > + if (!pdd || !pdd->drm_file) > + continue; > + if (amdgpu_file_to_fpriv(pdd->drm_file, &drv_priv)) > + continue; > + > + v = atomic_read(&drv_priv->kfd_sigbus_delay_ms); > + if (v == AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED) > + return v; > + if (v > result) > + result = v; > + } > + > + return result; > +} > + > +static void kfd_signal_sigbus_with_delay(struct kfd_node *dev, > + struct kfd_process *p) > +{ > + u32 delay_ms = kfd_get_sigbus_delay_ms(p); > + struct kfd_sigbus_delayed_work *dw; > + > + if (delay_ms == AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED) { > + dev_info(dev->adev->dev, > + "SIGBUS suppressed for process %s(pid:%d): app opted > in to handle RAS error\n", > + p->lead_thread->comm, p->lead_thread->pid); > + return; > + } > + > + if (delay_ms == 0) > + goto send_now; > + > + dw = kzalloc(sizeof(*dw), GFP_ATOMIC); > + if (!dw) > + goto send_now; > + > + /* Take an extra reference for the delayed worker. 
*/ > + kref_get(&p->ref); > + dw->p = p; > + INIT_DELAYED_WORK(&dw->work, kfd_signal_sigbus_delayed_fn); > + > + dev_info(dev->adev->dev, > + "Deferring SIGBUS to process %s(pid:%d) by %u ms (RAS error > opt-in safety timeout)\n", > + p->lead_thread->comm, p->lead_thread->pid, delay_ms); > + schedule_delayed_work(&dw->work, msecs_to_jiffies(delay_ms)); > + return; > + > +send_now: > + send_sig(SIGBUS, p->lead_thread, 0); > +} > + > void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) > { > struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); > @@ -1345,7 +1456,6 @@ void kfd_signal_poison_consumed_event(struct kfd_node > *dev, u32 pasid) > struct kfd_event *ev; > uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID; > int user_gpu_id; > - > if (!p) { > dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", > pasid); > return; /* Presumably process exited. */ > @@ -1391,7 +1501,7 @@ void kfd_signal_poison_consumed_event(struct kfd_node > *dev, u32 pasid) > rcu_read_unlock(); > > /* user application will handle SIGBUS signal */ > - send_sig(SIGBUS, p->lead_thread, 0); > + kfd_signal_sigbus_with_delay(dev, p); > > kfd_unref_process(p); > } > diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h > index 9f3090db2f16..e0a382673b90 100644 > --- a/include/uapi/drm/amdgpu_drm.h > +++ b/include/uapi/drm/amdgpu_drm.h > @@ -58,6 +58,7 @@ extern "C" { > #define DRM_AMDGPU_USERQ_SIGNAL 0x17 > #define DRM_AMDGPU_USERQ_WAIT 0x18 > #define DRM_AMDGPU_GEM_LIST_HANDLES 0x19 > +#define DRM_AMDGPU_USER_OPTIONS 0x1A > > #define DRM_IOCTL_AMDGPU_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + > DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create) > #define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + > DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap) > @@ -79,6 +80,7 @@ extern "C" { > #define DRM_IOCTL_AMDGPU_USERQ_SIGNAL DRM_IOWR(DRM_COMMAND_BASE + > DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal) > #define 
DRM_IOCTL_AMDGPU_USERQ_WAIT DRM_IOWR(DRM_COMMAND_BASE + > DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait) > #define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES DRM_IOWR(DRM_COMMAND_BASE + > DRM_AMDGPU_GEM_LIST_HANDLES, struct drm_amdgpu_gem_list_handles) > +#define DRM_IOCTL_AMDGPU_USER_OPTIONS DRM_IOWR(DRM_COMMAND_BASE + > DRM_AMDGPU_USER_OPTIONS, struct drm_amdgpu_user_options) > > /** > * DOC: memory domains > @@ -1673,6 +1675,27 @@ struct drm_amdgpu_info_uq_metadata { > #define AMDGPU_FAMILY_GC_11_5_4 154 /* GC 11.5.4 */ > #define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */ > > +/* > + * Definition of user options > + * > + * option: AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY > + * 0: Disable sigbus delay - SIGBUS will be raised immediately > + * 0xFFFFFFFF: SIGBUS will not be raised > + * other: Set the sigbus delay in milliseconds > + */ > +#define AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY 0 > + > +#define AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED 0xFFFFFFFFu > + > +struct drm_amdgpu_user_options { > + __u32 op; > + union { > + struct { > + __u32 value; > + } kfd_sigbus_delay; > + }; > +}; > + > #if defined(__cplusplus) > } > #endif > -- > 2.43.0 >
