On Tue, May 19, 2026 at 11:16 AM Yat Sin, David <[email protected]> wrote:
>
> AMD General
>
> I think the kfd_sigbus_delay should be uint32_t.
>
> Since the delay is in milliseconds, a uint16_t tops out at roughly 65
> seconds (~1 minute). Once userspace gets the first exception event, it may
> start generating a coredump file, and coredump generation can take more
> than 40 minutes.
>

Ah, ok.  That makes sense.

Alex

> ~David
>
>
> > -----Original Message-----
> > From: Alex Deucher <[email protected]>
> > Sent: Tuesday, May 19, 2026 9:55 AM
> > To: Zhang, Yifan <[email protected]>
> > Cc: [email protected]; Deucher, Alexander
> > <[email protected]>; Koenig, Christian <[email protected]>;
> > Kuehling, Felix <[email protected]>; Yat Sin, David
> > <[email protected]>; Russell, Kent <[email protected]>; Yuan, Perry
> > <[email protected]>
> > Subject: Re: [PATCH v3] drm/amdgpu: add ioctl to handle RAS poison error
> >
> > On Sun, May 17, 2026 at 1:44 AM Yifan Zhang <[email protected]> wrote:
> > >
> > > > Add a new DRM_IOCTL_AMDGPU_USER_OPTIONS ioctl with the
> > > > AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY option, allowing userspace
> > > > (ROCr) to control per-process SIGBUS delivery.
> > >
> > > Userspace for this can be found at:
> > > https://github.com/ROCm/rocm-systems/pull/6190
> > >
> > > Signed-off-by: Yifan Zhang <[email protected]>
> > > ---
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  12 +++
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |   1 +
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  29 ++++++
> > > drivers/gpu/drm/amd/amdkfd/kfd_events.c | 118 +++++++++++++++++++++++-
> > >  include/uapi/drm/amdgpu_drm.h           |  24 +++++
> > >  5 files changed, 182 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > index 5d7bfa59424a..0408476f1070 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > @@ -455,6 +455,16 @@ struct amdgpu_fpriv {
> > >
> > >         /** GPU partition selection */
> > >         uint32_t                xcp_id;
> > > +
> > > +       /**
> > > +        * @kfd_sigbus_delay_ms: Per-fd KFD SIGBUS delivery option (set 
> > > via
> > > +        * DRM_IOCTL_AMDGPU_USER_OPTIONS /
> > AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY).
> > > +        *
> > > +        *   0          - send SIGBUS immediately (default)
> > > +        *   0xFFFF - suppress SIGBUS delivery
> > > +        *   other      - delay SIGBUS delivery by this many milliseconds
> > > +        */
> > > +       atomic_t                kfd_sigbus_delay_ms;
> > >  };
> > >
> > >  int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv
> > > **fpriv); @@ -1467,6 +1477,8 @@ int amdgpu_enable_vblank_kms(struct
> > > drm_crtc *crtc);  void amdgpu_disable_vblank_kms(struct drm_crtc
> > > *crtc);  int amdgpu_info_ioctl(struct drm_device *dev, void *data,
> > >                       struct drm_file *filp);
> > > +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data,
> > > +                             struct drm_file *filp);
> > >
> > >  /*
> > >   * functions used by amdgpu_encoder.c diff --git
> > > a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > index 99688391e70b..cad18bd6f8b3 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > @@ -3078,6 +3078,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = {
> > >         DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL,
> > amdgpu_userq_signal_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
> > >         DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT,
> > amdgpu_userq_wait_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
> > >         DRM_IOCTL_DEF_DRV(AMDGPU_GEM_LIST_HANDLES,
> > > amdgpu_gem_list_handles_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
> > > +       DRM_IOCTL_DEF_DRV(AMDGPU_USER_OPTIONS,
> > > + amdgpu_user_options_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
> > >  };
> > >
> > >  static const struct drm_driver amdgpu_kms_driver = { diff --git
> > > a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > > index 24526e92f9b8..7903587b8bbb 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > > @@ -1423,6 +1423,35 @@ int amdgpu_info_ioctl(struct drm_device *dev, void
> > *data, struct drm_file *filp)
> > >         return 0;
> > >  }
> > >
> > > +/**
> > > + * amdgpu_user_options_ioctl - set per-fd user options
> > > + *
> > > + * @dev: drm dev pointer
> > > + * @data: pointer to struct drm_amdgpu_user_options
> > > + * @filp: drm file
> > > + *
> > > > + * Sets options stored on the per-file amdgpu_fpriv. Currently the only
> > > > + * supported option is %AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY which
> > > > + * controls how KFD delivers SIGBUS for poison/RAS events to the calling
> > > > + * process (immediate, suppressed, or delayed by N milliseconds).
> > > + */
> > > +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data,
> > > +                             struct drm_file *filp) {
> > > +       struct amdgpu_fpriv *fpriv = filp->driver_priv;
> > > +       struct drm_amdgpu_user_options *args = data;
> > > +
> > > +       switch (args->op) {
> > > +       case AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY:
> > > +               atomic_set(&fpriv->kfd_sigbus_delay_ms,
> > > +                          args->kfd_sigbus_delay.value);
> > > +               return 0;
> > > +       default:
> > > +               DRM_DEBUG_KMS("Invalid user option op %u\n", args->op);
> > > +               return -EINVAL;
> > > +       }
> > > +}
> > > +
> > >  /**
> > >   * amdgpu_driver_open_kms - drm callback for open
> > >   *
> > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> > > b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> > > index e9be798c0a2b..200570401f51 100644
> > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> > > @@ -29,10 +29,12 @@
> > >  #include <linux/uaccess.h>
> > >  #include <linux/mman.h>
> > >  #include <linux/memory.h>
> > > +#include <linux/workqueue.h>
> > >  #include "kfd_priv.h"
> > >  #include "kfd_events.h"
> > >  #include "kfd_device_queue_manager.h"
> > >  #include <linux/device.h>
> > > +#include <uapi/drm/amdgpu_drm.h>
> > >
> > >  /*
> > >   * Wrapper around wait_queue_entry_t
> > > @@ -1337,6 +1339,119 @@ void kfd_signal_reset_event(struct kfd_node *dev)
> > >         srcu_read_unlock(&kfd_processes_srcu, idx);  }
> > >
> > > +/*
> > > + * Per-process opt-in for poison-consumption SIGBUS handling.
> > > + *
> > > + * Default: kernel sends SIGBUS to the process immediately when
> > > +poison is
> > > + * consumed, in addition to delivering the KFD HW/MEMORY exception 
> > > events.
> > > + *
> > > + * Userspace (ROCr) can opt-in per-process via the
> > > + * DRM_IOCTL_AMDGPU_USER_OPTIONS /
> > > +AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY
> > > + * option. This lets the app's registered system-event callback
> > > +handle the
> > > + * RAS error first, instead of being killed by SIGBUS.
> > > + *
> > > + * Encoded value (set on any of the process' amdgpu render fds):
> > > + *   0          - default: SIGBUS immediately (no opt-in)
> > > + *   0xFFFF - opt-in, never escalate to SIGBUS
> > > + *   N (other)  - opt-in, escalate to SIGBUS after N ms if app does not
> > > + *                handle the error in time (safety timeout)
> > > + *
> > > + * Per-process scope: the option is honored if ANY of the process'
> > > +amdgpu
> > > + * fds has been configured. This matches the slide deck's
> > > +"Per-process,
> > > + * App set at init" semantics, while keeping the UAPI on amdgpu where
> > > +ROCr
> > > + * sets it.
> > > + */
> > > +struct kfd_sigbus_delayed_work {
> > > +       struct delayed_work work;
> > > +       struct kfd_process *p;
> > > +};
> > > +
> > > +static void kfd_signal_sigbus_delayed_fn(struct work_struct *work) {
> > > +       struct kfd_sigbus_delayed_work *dw = 
> > > container_of(to_delayed_work(work),
> > > +                               struct kfd_sigbus_delayed_work, work);
> > > +       struct kfd_process *p = dw->p;
> > > +
> > > +       if (p->lead_thread)
> > > +               send_sig(SIGBUS, p->lead_thread, 0);
> > > +
> > > +       kfd_unref_process(p);
> > > +       kfree(dw);
> > > +}
> > > +
> > > +/*
> > > + * Resolve the per-process SIGBUS opt-in setting by scanning all of
> > > +the
> > > + * process' KFD pdds (each backed by an amdgpu render fd). Returns
> > > +the
> > > + * "most lenient" value across all fds, in this priority:
> > > + *   DISABLED (no SIGBUS)  >  any non-zero timeout  >  0 (immediate)
> > > + *
> > > + * Rationale: if the app has explicitly opted in on any GPU it uses,
> > > +it
> > > + * wants the chance to handle the error in userspace.
> > > + */
> > > +static u16 kfd_get_sigbus_delay_ms(struct kfd_process *p) {
> > > +       u16 result = 0;
> > > +       int i;
> > > +
> > > +       mutex_lock(&p->mutex);
> > > +       for (i = 0; i < p->n_pdds; i++) {
> > > +               struct kfd_process_device *pdd = p->pdds[i];
> > > +               struct amdgpu_fpriv *drv_priv;
> > > +               u16 v;
> > > +
> > > +               if (!pdd || !pdd->drm_file)
> > > +                       continue;
> > > +               if (amdgpu_file_to_fpriv(pdd->drm_file, &drv_priv))
> > > +                       continue;
> > > +
> > > +               v = atomic_read(&drv_priv->kfd_sigbus_delay_ms);
> > > +               if (v ==
> > AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED) {
> > > +                       result = v;
> > > +                       break;
> > > +               }
> > > +               if (v > result)
> > > +                       result = v;
> > > +       }
> > > +       mutex_unlock(&p->mutex);
> > > +
> > > +       return result;
> > > +}
> > > +
> > > +static void kfd_signal_sigbus_with_delay(struct kfd_node *dev,
> > > +                                        struct kfd_process *p) {
> > > +       u16 delay_ms = kfd_get_sigbus_delay_ms(p);
> > > +       struct kfd_sigbus_delayed_work *dw;
> > > +
> > > +       if (delay_ms ==
> > AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED) {
> > > +               dev_info(dev->adev->dev,
> > > +                        "SIGBUS suppressed for process %s(pid:%d): app 
> > > opted in to
> > handle RAS error\n",
> > > +                        p->lead_thread->comm, p->lead_thread->pid);
> > > +               return;
> > > +       }
> > > +
> > > +       if (delay_ms == 0)
> > > +               goto send_now;
> > > +
> > > +       dw = kzalloc(sizeof(*dw), GFP_ATOMIC);
> > > +       if (!dw)
> > > +               goto send_now;
> > > +
> > > +       /* Take an extra reference for the delayed worker. */
> > > +       kref_get(&p->ref);
> > > +       dw->p = p;
> > > +       INIT_DELAYED_WORK(&dw->work, kfd_signal_sigbus_delayed_fn);
> > > +
> > > +       dev_info(dev->adev->dev,
> > > +                "Deferring SIGBUS to process %s(pid:%d) by %u ms (RAS 
> > > error opt-
> > in safety timeout)\n",
> > > +                p->lead_thread->comm, p->lead_thread->pid, delay_ms);
> > > +       schedule_delayed_work(&dw->work, msecs_to_jiffies(delay_ms));
> > > +       return;
> > > +
> > > +send_now:
> > > +       send_sig(SIGBUS, p->lead_thread, 0);
> >
> >
> > Probably worth adding a comment here that this feature is not supported with
> > confidential compute.  Other than that, looks good to me.
> >
> > Alex
> >
> > > +}
> > > +
> > >  void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32
> > > pasid)  {
> > >         struct kfd_process *p = kfd_lookup_process_by_pasid(pasid,
> > > NULL); @@ -1345,7 +1460,6 @@ void
> > kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
> > >         struct kfd_event *ev;
> > >         uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
> > >         int user_gpu_id;
> > > -
> > >         if (!p) {
> > >                 dev_warn(dev->adev->dev, "Not find process with 
> > > pasid:%d\n", pasid);
> > >                 return; /* Presumably process exited. */ @@ -1391,7
> > > +1505,7 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32
> > pasid)
> > >         rcu_read_unlock();
> > >
> > >         /* user application will handle SIGBUS signal */
> > > -       send_sig(SIGBUS, p->lead_thread, 0);
> > > +       kfd_signal_sigbus_with_delay(dev, p);
> > >
> > >         kfd_unref_process(p);
> > >  }
> > > diff --git a/include/uapi/drm/amdgpu_drm.h
> > > b/include/uapi/drm/amdgpu_drm.h index 9f3090db2f16..e88d7cf53858
> > > 100644
> > > --- a/include/uapi/drm/amdgpu_drm.h
> > > +++ b/include/uapi/drm/amdgpu_drm.h
> > > @@ -58,6 +58,7 @@ extern "C" {
> > >  #define DRM_AMDGPU_USERQ_SIGNAL                0x17
> > >  #define DRM_AMDGPU_USERQ_WAIT          0x18
> > >  #define DRM_AMDGPU_GEM_LIST_HANDLES    0x19
> > > +#define DRM_AMDGPU_USER_OPTIONS                0x1A
> > >
> > >  #define DRM_IOCTL_AMDGPU_GEM_CREATE
> > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union
> > drm_amdgpu_gem_create)
> > >  #define DRM_IOCTL_AMDGPU_GEM_MMAP
> > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union
> > drm_amdgpu_gem_mmap)
> > > @@ -79,6 +80,7 @@ extern "C" {
> > >  #define DRM_IOCTL_AMDGPU_USERQ_SIGNAL
> > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_SIGNAL, struct
> > drm_amdgpu_userq_signal)
> > >  #define DRM_IOCTL_AMDGPU_USERQ_WAIT
> > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct
> > drm_amdgpu_userq_wait)
> > >  #define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES
> > DRM_IOWR(DRM_COMMAND_BASE +
> > > DRM_AMDGPU_GEM_LIST_HANDLES, struct drm_amdgpu_gem_list_handles)
> > > +#define DRM_IOCTL_AMDGPU_USER_OPTIONS
> > DRM_IOWR(DRM_COMMAND_BASE +
> > > +DRM_AMDGPU_USER_OPTIONS, struct drm_amdgpu_user_options)
> > >
> > >  /**
> > >   * DOC: memory domains
> > > @@ -1673,6 +1675,28 @@ struct drm_amdgpu_info_uq_metadata {
> > >  #define AMDGPU_FAMILY_GC_11_5_4                        154 /* GC 11.5.4 
> > > */
> > >  #define AMDGPU_FAMILY_GC_12_0_0                        152 /* GC 12.0.0 
> > > */
> > >
> > > +/*
> > > + * Definition of user options
> > > + *
> > > + * option: AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY
> > > + *    0:          Disable sigbus delay - SIGBUS will be raised 
> > > immediately
> > > + *    0xFFFF: SIGBUS will not be raised
> > > + *    other:      Set the sigbus delay in milliseconds
> > > + */
> > > +#define AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY                0
> > > +
> > > +#define AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED
> > 0xFFFFu
> > > +
> > > +struct drm_amdgpu_user_options {
> > > +       __u32 op;
> > > +       union {
> > > +               struct {
> > > +                       __u16 value;
> > > +                       __u16 _pad;
> > > +               } kfd_sigbus_delay;
> > > +       };
> > > +};
> > > +
> > >  #if defined(__cplusplus)
> > >  }
> > >  #endif
> > > --
> > > 2.43.0
> > >

Reply via email to