On Thu, Sep 18, 2025 at 8:55 PM Chai, Thomas <[email protected]> wrote:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> -----Original Message-----
> From: amd-gfx <[email protected]> On Behalf Of Alex 
> Deucher
> Sent: Friday, September 19, 2025 4:05 AM
> To: Chai, Thomas <[email protected]>
> Cc: [email protected]; Zhang, Hawking <[email protected]>; 
> Zhou1, Tao <[email protected]>; Li, Candice <[email protected]>; Yang, 
> Stanley <[email protected]>
> Subject: Re: [PATCH 06/10] drm/amd/ras: Add amdgpu ras system functions
>
> On Wed, Sep 17, 2025 at 9:37 PM YiPeng Chai <[email protected]> wrote:
> >
> > Add amdgpu ras system functions.
> >
> > Signed-off-by: YiPeng Chai <[email protected]>
> > Reviewed-by: Tao Zhou <[email protected]>
> > ---
> >  .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c  | 268 ++++++++++++++++++
> >  drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h     | 109 +++++++
> >  2 files changed, 377 insertions(+)
> >  create mode 100644 drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
> >  create mode 100644 drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
> >
> > diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
> > b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
> > new file mode 100644
> > index 000000000000..40071b876333
> > --- /dev/null
> > +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
> > @@ -0,0 +1,268 @@
> > +// SPDX-License-Identifier: MIT
> > +/*
> > + * Copyright 2025 Advanced Micro Devices, Inc.
> > + *
> > + * Permission is hereby granted, free of charge, to any person
> > +obtaining a
> > + * copy of this software and associated documentation files (the
> > +"Software"),
> > + * to deal in the Software without restriction, including without
> > +limitation
> > + * the rights to use, copy, modify, merge, publish, distribute,
> > +sublicense,
> > + * and/or sell copies of the Software, and to permit persons to whom
> > +the
> > + * Software is furnished to do so, subject to the following conditions:
> > + *
> > + * The above copyright notice and this permission notice shall be
> > +included in
> > + * all copies or substantial portions of the Software.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > +EXPRESS OR
> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > +MERCHANTABILITY,
> > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
> > +SHALL
> > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> > +DAMAGES OR
> > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> > +OTHERWISE,
> > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
> > +OR
> > + * OTHER DEALINGS IN THE SOFTWARE.
> > + *
> > + */
> > +#include "ras_sys.h"
> > +#include "amdgpu_ras_mgr.h"
> > +#include "amdgpu_ras.h"
> > +#include "amdgpu_reset.h"
> > +
> > +static int amdgpu_ras_sys_detect_fatal_event(struct ras_core_context
> > +*ras_core, void *data) {
> > +       struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
> > +       uint64_t seq_no;
> > +
> > +       seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, 
> > RAS_SEQNO_TYPE_UE);
> > +       RAS_DEV_INFO(adev,
> > +               "{%llu} Uncorrectable hardware 
> > error(ERREVENT_ATHUB_INTERRUPT) detected!\n",
> > +               seq_no);
> > +
> > +       return amdgpu_ras_process_handle_unexpected_interrupt(adev,
> > +data); }
> > +
> > +static int amdgpu_ras_sys_poison_consumption_event(struct ras_core_context 
> > *ras_core,
> > +                               void *data) {
> > +       struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
> > +       struct ras_event_req *req = (struct ras_event_req *)data;
> > +       pasid_notify pasid_fn;
> > +
> > +       if (!req)
> > +               return -EINVAL;
> > +
> > +       if (req->pasid_fn) {
> > +               pasid_fn = (pasid_notify)req->pasid_fn;
> > +               pasid_fn(adev, req->pasid, req->data);
> > +       }
> > +
> > +       return 0;
> > +}
> > +
> > +static int amdgpu_ras_sys_gen_seqno(struct ras_core_context *ras_core,
> > +                       enum ras_seqno_type seqno_type, uint64_t
> > +*seqno) {
> > +       struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
> > +       struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
> > +       struct ras_event_manager *event_mgr;
> > +       struct ras_event_state *event_state;
> > +       struct amdgpu_hive_info *hive;
> > +       enum ras_event_type event_type;
> > +       uint64_t seq_no;
> > +
> > +       if (!ras_mgr || !seqno ||
> > +               (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX))
> > +               return -EINVAL;
> > +
> > +       switch (seqno_type) {
> > +       case RAS_SEQNO_TYPE_UE:
> > +               event_type = RAS_EVENT_TYPE_FATAL;
> > +               break;
> > +       case RAS_SEQNO_TYPE_CE:
> > +       case RAS_SEQNO_TYPE_DE:
> > +               event_type = RAS_EVENT_TYPE_POISON_CREATION;
> > +               break;
> > +       case RAS_SEQNO_TYPE_POISON_CONSUMPTION:
> > +               event_type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
> > +               break;
> > +       default:
> > +               event_type = RAS_EVENT_TYPE_INVALID;
> > +               break;
> > +       }
> > +
> > +       hive = amdgpu_get_xgmi_hive(adev);
> > +       event_mgr = hive ? &hive->event_mgr : &ras_mgr->ras_event_mgr;
> > +       event_state = &event_mgr->event_state[event_type];
> > +       if ((event_type == RAS_EVENT_TYPE_FATAL) && 
> > amdgpu_ras_in_recovery(adev)) {
> > +               seq_no = event_state->last_seqno;
> > +       } else {
> > +               seq_no = atomic64_inc_return(&event_mgr->seqno);
> > +               event_state->last_seqno = seq_no;
> > +               atomic64_inc(&event_state->count);
> > +       }
> > +       amdgpu_put_xgmi_hive(hive);
> > +
> > +       *seqno = seq_no;
> > +       return 0;
> > +
> > +}
> > +
> > +static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core,
> > +                                  enum ras_notify_event event_id,
> > +void *data) {
> > +       struct amdgpu_ras_mgr *ras_mgr = 
> > amdgpu_ras_mgr_get_context(ras_core->dev);
> > +       int ret = 0;
> > +
> > +       switch (event_id) {
> > +       case RAS_EVENT_ID__BAD_PAGE_DETECTED:
> > +               schedule_delayed_work(&ras_mgr->retire_page_dwork, 0);
> > +               break;
> > +       case RAS_EVENT_ID__POISON_CONSUMPTION:
> > +               amdgpu_ras_sys_poison_consumption_event(ras_core, data);
> > +               break;
> > +       case RAS_EVENT_ID__RESERVE_BAD_PAGE:
> > +               ret = amdgpu_ras_reserve_page(ras_core->dev, *(uint64_t 
> > *)data);
> > +               break;
> > +       case RAS_EVENT_ID__FATAL_ERROR_DETECTED:
> > +               ret = amdgpu_ras_sys_detect_fatal_event(ras_core, data);
> > +               break;
> > +       case RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM:
> > +               ret = amdgpu_dpm_send_hbm_bad_pages_num(ras_core->dev, 
> > *(uint32_t *)data);
> > +               break;
> > +       case RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP:
> > +               ret = amdgpu_dpm_send_hbm_bad_channel_flag(ras_core->dev, 
> > *(uint32_t *)data);
> > +               break;
> > +       case RAS_EVENT_ID__DEVICE_RMA:
> > +               ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_RMA, 
> > NULL, NULL);
> > +               ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
> > +               break;
> > +       case RAS_EVENT_ID__RESET_GPU:
> > +               ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t 
> > *)data);
> > +               break;
> > +       default:
> > +               RAS_DEV_WARN(ras_core->dev, "Invalid ras notify 
> > event:%d\n", event_id);
> > +               break;
> > +       }
> > +
> > +       return ret;
> > +}
> > +
> > +static u64 amdgpu_ras_sys_get_utc_second_timestamp(struct
> > +ras_core_context *ras_core) {
> > +       return ktime_get_real_seconds(); }
> > +
> > +static int amdgpu_ras_sys_check_gpu_status(struct ras_core_context 
> > *ras_core,
> > +                               uint32_t *status) {
> > +       struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
> > +       uint32_t gpu_status = 0;
> > +
> > +       if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev))
> > +               gpu_status |= RAS_GPU_STATUS__IN_RESET;
> > +
> > +       if (amdgpu_sriov_vf(adev))
> > +               gpu_status |= RAS_GPU_STATUS__IS_VF;
> > +
> > +       *status = gpu_status;
> > +
> > +       return 0;
> > +}
> > +
> > +static int amdgpu_ras_sys_get_device_system_info(struct ras_core_context 
> > *ras_core,
> > +                       struct device_system_info *dev_info) {
> > +       struct amdgpu_device *adev = (struct amdgpu_device
> > +*)ras_core->dev;
> > +
> > +       dev_info->device_id = adev->pdev->device;
> > +       dev_info->vendor_id = adev->pdev->vendor;
> > +       dev_info->socket_id = adev->smuio.funcs->get_socket_id(adev);
> > +
> > +       return 0;
> > +}
> > +
> > +static int amdgpu_ras_sys_gpu_reset_lock(struct ras_core_context *ras_core,
> > +                       bool down, bool try) {
> > +       struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
> > +       int ret = 0;
> > +
> > +       if (down && try)
> > +               ret = down_read_trylock(&adev->reset_domain->sem);
> > +       else if (down)
> > +               down_read(&adev->reset_domain->sem);
> > +       else
> > +               up_read(&adev->reset_domain->sem);
> > +
> > +       return ret;
> > +}
> > +
> > +static bool amdgpu_ras_sys_detect_ras_interrupt(struct
> > +ras_core_context *ras_core) {
> > +       return !!atomic_read(&amdgpu_ras_in_intr);
> > +}
> > +
> > +static int amdgpu_ras_sys_get_gpu_mem(struct ras_core_context *ras_core,
> > +       enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem) {
> > +       struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
> > +       struct psp_context *psp = &adev->psp;
> > +       struct psp_ring *psp_ring;
> > +       struct ta_mem_context *mem_ctx;
> > +
> > +       if (mem_type == GPU_MEM_TYPE_RAS_PSP_RING) {
> > +               psp_ring = &psp->km_ring;
> > +               gpu_mem->mem_bo = adev->firmware.rbuf;
> > +               gpu_mem->mem_size = psp_ring->ring_size;
> > +               gpu_mem->mem_mc_addr = psp_ring->ring_mem_mc_addr;
> > +               gpu_mem->mem_cpu_addr = psp_ring->ring_mem;
> > +       } else if (mem_type == GPU_MEM_TYPE_RAS_PSP_CMD) {
> > +               gpu_mem->mem_bo = psp->cmd_buf_bo;
> > +               gpu_mem->mem_size = PSP_CMD_BUFFER_SIZE;
> > +               gpu_mem->mem_mc_addr = psp->cmd_buf_mc_addr;
> > +               gpu_mem->mem_cpu_addr = psp->cmd_buf_mem;
> > +       } else if (mem_type == GPU_MEM_TYPE_RAS_PSP_FENCE) {
> > +               gpu_mem->mem_bo = psp->fence_buf_bo;
> > +               gpu_mem->mem_size = PSP_FENCE_BUFFER_SIZE;
> > +               gpu_mem->mem_mc_addr = psp->fence_buf_mc_addr;
> > +               gpu_mem->mem_cpu_addr = psp->fence_buf;
> > +       } else if (mem_type == GPU_MEM_TYPE_RAS_TA_FW) {
> > +               gpu_mem->mem_bo = psp->fw_pri_bo;
> > +               gpu_mem->mem_size = PSP_1_MEG;
> > +               gpu_mem->mem_mc_addr = psp->fw_pri_mc_addr;
> > +               gpu_mem->mem_cpu_addr = psp->fw_pri_buf;
> > +       } else if (mem_type == GPU_MEM_TYPE_RAS_TA_CMD) {
> > +               mem_ctx = &psp->ras_context.context.mem_context;
> > +               gpu_mem->mem_bo = mem_ctx->shared_bo;
> > +               gpu_mem->mem_size = mem_ctx->shared_mem_size;
> > +               gpu_mem->mem_mc_addr = mem_ctx->shared_mc_addr;
> > +               gpu_mem->mem_cpu_addr = mem_ctx->shared_buf;
> > +       } else {
> > +               return -EINVAL;
> > +       }
> > +
> > +       if (!gpu_mem->mem_bo || !gpu_mem->mem_size ||
> > +               !gpu_mem->mem_mc_addr || !gpu_mem->mem_cpu_addr) {
> > +               RAS_DEV_ERR(ras_core->dev, "The ras psp gpu memory is 
> > invalid!\n");
> > +               return -ENOMEM;
> > +       }
> > +
> > +       return 0;
> > +}
> > +
> > +static int amdgpu_ras_sys_put_gpu_mem(struct ras_core_context *ras_core,
> > +       enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem) {
> > +
> > +       return 0;
> > +}
> > +
> > +const struct ras_sys_func amdgpu_ras_sys_fn = {
> > +       .ras_notifier = amdgpu_ras_sys_event_notifier,
> > +       .get_utc_second_timestamp = amdgpu_ras_sys_get_utc_second_timestamp,
> > +       .gen_seqno = amdgpu_ras_sys_gen_seqno,
> > +       .check_gpu_status = amdgpu_ras_sys_check_gpu_status,
> > +       .get_device_system_info = amdgpu_ras_sys_get_device_system_info,
> > +       .gpu_reset_lock = amdgpu_ras_sys_gpu_reset_lock,
> > +       .detect_ras_interrupt = amdgpu_ras_sys_detect_ras_interrupt,
> > +       .get_gpu_mem = amdgpu_ras_sys_get_gpu_mem,
> > +       .put_gpu_mem = amdgpu_ras_sys_put_gpu_mem, };
> > diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
> > b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
> > new file mode 100644
> > index 000000000000..c48ff26525d6
> > --- /dev/null
> > +++ b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
> > @@ -0,0 +1,109 @@
> > +/* SPDX-License-Identifier: MIT */
> > +/*
> > + * Copyright 2025 Advanced Micro Devices, Inc.
> > + *
> > + * Permission is hereby granted, free of charge, to any person
> > +obtaining a
> > + * copy of this software and associated documentation files (the
> > +"Software"),
> > + * to deal in the Software without restriction, including without
> > +limitation
> > + * the rights to use, copy, modify, merge, publish, distribute,
> > +sublicense,
> > + * and/or sell copies of the Software, and to permit persons to whom
> > +the
> > + * Software is furnished to do so, subject to the following conditions:
> > + *
> > + * The above copyright notice and this permission notice shall be
> > +included in
> > + * all copies or substantial portions of the Software.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > +EXPRESS OR
> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > +MERCHANTABILITY,
> > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
> > +SHALL
> > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> > +DAMAGES OR
> > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> > +OTHERWISE,
> > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
> > +OR
> > + * OTHER DEALINGS IN THE SOFTWARE.
> > + *
> > + */
> > +
> > +#ifndef __RAS_SYS_H__
> > +#define __RAS_SYS_H__
> > +#include <linux/stdarg.h>
> > +#include <linux/printk.h>
> > +#include <linux/dev_printk.h>
> > +#include "amdgpu.h"
> > +
> > +#define RAS_DEV_ERR(device, fmt, ...)                                      
> >          \
> > +       do {                                                                
> >       \
> > +               if (device)                                                 
> >             \
> > +                       dev_err(((struct amdgpu_device *)device)->dev, fmt, 
> > ##__VA_ARGS__); \
> > +               else                                                        
> >           \
> > +                       printk(KERN_ERR fmt, ##__VA_ARGS__);                
> >               \
> > +       } while (0)
> > +
> > +#define RAS_DEV_WARN(device, fmt, ...)                                     
> >           \
> > +       do {                                                                
> >        \
> > +               if (device)                                                 
> >              \
> > +                       dev_warn(((struct amdgpu_device *)device)->dev, 
> > fmt, ##__VA_ARGS__); \
> > +               else                                                        
> >            \
> > +                       printk(KERN_WARNING fmt, ##__VA_ARGS__);            
> >                \
> > +       } while (0)
> > +
> > +#define RAS_DEV_INFO(device, fmt, ...)                                     
> >             \
> > +       do {                                                                
> >          \
> > +               if (device)                                                 
> >                \
> > +                       dev_info(((struct amdgpu_device *)device)->dev, 
> > fmt, ##__VA_ARGS__);   \
> > +               else                                                        
> >              \
> > +                       printk(KERN_INFO fmt, ##__VA_ARGS__);               
> >                  \
> > +       } while (0)
> > +
> > +#define RAS_DEV_DBG(device, fmt, ...)                                      
> >             \
> > +       do {                                                                
> >          \
> > +               if (device)                                                 
> >                \
> > +                       dev_dbg(((struct amdgpu_device *)device)->dev, fmt, 
> > ##__VA_ARGS__);    \
> > +               else                                                        
> >              \
> > +                       printk(KERN_DEBUG fmt, ##__VA_ARGS__);              
> >                  \
> > +       } while (0)
> > +
> > +#define RAS_INFO(fmt, ...)  printk(KERN_INFO fmt, ##__VA_ARGS__)
>
> > Why do we need these wrappers?  Is there ever a case where we don't have a 
> > device?
>
> > Alex
>
> [Thomas] These wrappers are only called in the ras module. During the
> initialization of the ras module, before the amdgpu device pointer is
> attached to the ras device, these wrappers are called and the device will
> be NULL.
>

Thanks for clarifying.  It would probably be good to add some
kerneldoc which explains the design of the ras core and the rasmgr
components.  It would be helpful for code review and would provide
documentation for developers and customers.

Alex

> > +
> > +#define RAS_DEV_RREG32_SOC15(dev, ip, inst, reg) \ ({ \
> > +       struct amdgpu_device *adev = (struct amdgpu_device *)dev; \
> > +       
> > __RREG32_SOC15_RLC__(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + 
> > reg, \
> > +                        0, ip##_HWIP, inst); \
> > +})
> > +
> > +#define RAS_DEV_WREG32_SOC15(dev, ip, inst, reg, value) \ ({ \
> > +       struct amdgpu_device *adev = (struct amdgpu_device *)dev; \
> > +       
> > __WREG32_SOC15_RLC__((adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + 
> > reg), \
> > +                         value, 0, ip##_HWIP, inst); \
> > +})
> > +
> > +/* GET_INST returns the physical instance corresponding to a logical
> > +instance */ #define RAS_GET_INST(dev, ip, inst) \ ({ \
> > +       struct amdgpu_device *adev = (struct amdgpu_device *)dev; \
> > +       adev->ip_map.logical_to_dev_inst ? \
> > +               adev->ip_map.logical_to_dev_inst(adev, ip##_HWIP,
> > +inst) : inst; \
> > +})
> > +
> > +#define RAS_GET_MASK(dev, ip, mask) \ ({ \
> > +       struct amdgpu_device *adev = (struct amdgpu_device *)dev; \
> > +       (adev->ip_map.logical_to_dev_mask ? \
> > +               adev->ip_map.logical_to_dev_mask(adev, ip##_HWIP,
> > +mask) : mask); \
> > +})
> > +
> > +static inline void *ras_radix_tree_delete_iter(struct radix_tree_root
> > +*root, void *iter) {
> > +       return radix_tree_delete(root, ((struct radix_tree_iter
> > +*)iter)->index); }
> > +
> > +static inline long ras_wait_event_interruptible_timeout(void *wq_head,
> > +                       int (*condition)(void *param), void *param,
> > +unsigned int timeout) {
> > +       return wait_event_interruptible_timeout(*(wait_queue_head_t 
> > *)wq_head,
> > +                               condition(param), timeout); }
> > +
> > +extern const struct ras_sys_func amdgpu_ras_sys_fn;
> > +
> > +#endif
> > --
> > 2.34.1
> >

Reply via email to