[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-and-tested-by: Jinzhou Su <[email protected]>
Regards Joe > -----Original Message----- > From: Chai, Thomas <[email protected]> > Sent: Thursday, February 12, 2026 2:07 PM > To: [email protected] > Cc: Zhang, Hawking <[email protected]>; Zhou1, Tao > <[email protected]>; Li, Candice <[email protected]>; Yang, Stanley > <[email protected]>; Su, Joe <[email protected]>; Chai, Thomas > <[email protected]> > Subject: [PATCH V2] drm/amd/ras: use dedicated memory as vf ras command > buffer > > Use dedicated memory as vf ras command buffer. > > V2: > Add lock to ensure serialization of sending vf ras commands. > > Signed-off-by: YiPeng Chai <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h | 24 +++- > .../drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c | 120 +++++++++++++----- > .../drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h | 10 +- > 3 files changed, 114 insertions(+), 40 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h > b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h > index c20d10263492..aa8871515212 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h > @@ -498,14 +498,22 @@ struct amd_sriov_ras_chk_criti { > uint32_t hit; > }; > > +union amd_sriov_ras_host_push { > + struct amd_sriov_ras_telemetry_error_count error_count; > + struct amd_sriov_ras_cper_dump cper_dump; > + struct amd_sriov_ras_chk_criti chk_criti; }; > + > +#define AMD_SRIOV_UNIRAS_CMD_MAX_SIZE (PAGE_SIZE * 13) struct > +amd_sriov_uniras_shared_mem { > + uint8_t blocks_ecc_buf[PAGE_SIZE]; > + uint8_t cmd_buf[AMD_SRIOV_UNIRAS_CMD_MAX_SIZE]; > +}; > + > struct amdsriov_ras_telemetry { > struct amd_sriov_ras_telemetry_header header; > - > - union { > - struct amd_sriov_ras_telemetry_error_count error_count; > - struct amd_sriov_ras_cper_dump cper_dump; > - struct amd_sriov_ras_chk_criti chk_criti; > - } body; > + union amd_sriov_ras_host_push body; > + struct amd_sriov_uniras_shared_mem uniras_shared_mem; > }; > > /* version data stored in 
MAILBOX_MSGBUF_RCV_DW1 for future expansion > */ @@ -538,6 +546,10 @@ > _Static_assert(AMD_SRIOV_MSG_RESERVE_UCODE % 4 == 0, > _Static_assert(AMD_SRIOV_MSG_RESERVE_UCODE > > AMD_SRIOV_UCODE_ID__MAX, > "AMD_SRIOV_MSG_RESERVE_UCODE must be bigger than > AMD_SRIOV_UCODE_ID__MAX"); > > +_Static_assert( > + sizeof(struct amdsriov_ras_telemetry) <= > +AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10, > "amdsriov_ras_telemetry > +must be " stringification(AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1) " > +KB"); > + > #undef _stringification > #undef stringification > #endif > diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c > b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c > index a75479593864..c101743b6299 100644 > --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c > +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c > @@ -30,28 +30,83 @@ > #include "amdgpu_virt_ras_cmd.h" > #include "amdgpu_ras_mgr.h" > > +static int amdgpu_virt_ras_get_cmd_shared_mem(struct ras_core_context > *ras_core, > + uint32_t cmd, uint32_t mem_size, struct > amdgpu_virt_shared_mem > +*shared_mem) { > + struct amdgpu_device *adev = ras_core->dev; > + struct amdsriov_ras_telemetry *ras_telemetry_cpu; > + struct amdsriov_ras_telemetry *ras_telemetry_gpu; > + uint64_t fw_vram_usage_start_offset = 0; > + uint64_t ras_telemetry_offset = 0; > + > + if (!adev->virt.fw_reserve.ras_telemetry) > + return -EINVAL; > + > + if (adev->mman.fw_vram_usage_va && > + adev->mman.fw_vram_usage_va <= adev- > >virt.fw_reserve.ras_telemetry) { > + fw_vram_usage_start_offset = adev- > >mman.fw_vram_usage_start_offset; > + ras_telemetry_offset = (uintptr_t)adev- > >virt.fw_reserve.ras_telemetry - > + (uintptr_t)adev->mman.fw_vram_usage_va; > + } else if (adev->mman.drv_vram_usage_va && > + adev->mman.drv_vram_usage_va <= adev- > >virt.fw_reserve.ras_telemetry) { > + fw_vram_usage_start_offset = adev- > >mman.drv_vram_usage_start_offset; > + ras_telemetry_offset = (uintptr_t)adev- > 
>virt.fw_reserve.ras_telemetry - > + (uintptr_t)adev->mman.drv_vram_usage_va; > + } else { > + return -EINVAL; > + } > + > + ras_telemetry_cpu = > + (struct amdsriov_ras_telemetry *)adev- > >virt.fw_reserve.ras_telemetry; > + ras_telemetry_gpu = > + (struct amdsriov_ras_telemetry > *)(fw_vram_usage_start_offset + > + ras_telemetry_offset); > + > + if (cmd == RAS_CMD__GET_ALL_BLOCK_ECC_STATUS) { > + if (mem_size > PAGE_SIZE) > + return -ENOMEM; > + > + shared_mem->cpu_addr = ras_telemetry_cpu- > >uniras_shared_mem.blocks_ecc_buf; > + shared_mem->gpa = > + (uintptr_t)ras_telemetry_gpu- > >uniras_shared_mem.blocks_ecc_buf - > + adev->gmc.vram_start; > + shared_mem->size = mem_size; > + } else { > + if (mem_size > AMD_SRIOV_UNIRAS_CMD_MAX_SIZE) > + return -ENOMEM; > + > + shared_mem->cpu_addr = ras_telemetry_cpu- > >uniras_shared_mem.cmd_buf; > + shared_mem->gpa = > + (uintptr_t)ras_telemetry_gpu- > >uniras_shared_mem.cmd_buf - > + adev->gmc.vram_start; > + shared_mem->size = mem_size; > + } > + > + return 0; > +} > + > static int amdgpu_virt_ras_remote_ioctl_cmd(struct ras_core_context > *ras_core, > struct ras_cmd_ctx *cmd, void *output_data, uint32_t > output_size) { > - struct amdgpu_device *adev = (struct amdgpu_device *)ras_core- > >dev; > + struct amdgpu_ras_mgr *ras_mgr = > amdgpu_ras_mgr_get_context(ras_core->dev); > + struct amdgpu_virt_ras_cmd *virt_ras = ras_mgr->virt_ras_cmd; > uint32_t mem_len = ALIGN(sizeof(*cmd) + output_size, > AMDGPU_GPU_PAGE_SIZE); > struct ras_cmd_ctx *rcmd; > - struct amdgpu_bo *rcmd_bo = NULL; > - uint64_t mc_addr = 0; > - void *cpu_addr = NULL; > + struct amdgpu_virt_shared_mem shared_mem = {0}; > int ret = 0; > > - ret = amdgpu_bo_create_kernel(adev, mem_len, PAGE_SIZE, > - AMDGPU_GEM_DOMAIN_VRAM, &rcmd_bo, > &mc_addr, (void **)&cpu_addr); > + mutex_lock(&virt_ras->remote_access_lock); > + > + ret = amdgpu_virt_ras_get_cmd_shared_mem(ras_core, cmd- > >cmd_id, > +mem_len, &shared_mem); > if (ret) > - return ret; > + goto out; > > 
- rcmd = (struct ras_cmd_ctx *)cpu_addr; > + rcmd = (struct ras_cmd_ctx *)shared_mem.cpu_addr; > memset(rcmd, 0, mem_len); > memcpy(rcmd, cmd, sizeof(*cmd)); > > ret = amdgpu_virt_send_remote_ras_cmd(ras_core->dev, > - mc_addr - adev->gmc.vram_start, mem_len); > + shared_mem.gpa, mem_len); > if (!ret) { > if (rcmd->cmd_res) { > ret = rcmd->cmd_res; > @@ -65,8 +120,7 @@ static int amdgpu_virt_ras_remote_ioctl_cmd(struct > ras_core_context *ras_core, > } > > out: > - amdgpu_bo_free_kernel(&rcmd_bo, &mc_addr, &cpu_addr); > - > + mutex_unlock(&virt_ras->remote_access_lock); > return ret; > } > > @@ -77,6 +131,9 @@ static int amdgpu_virt_ras_send_remote_cmd(struct > ras_core_context *ras_core, > struct ras_cmd_ctx rcmd = {0}; > int ret; > > + if (input_size > RAS_CMD_MAX_IN_SIZE) > + return RAS_CMD__ERROR_INVALID_INPUT_SIZE; > + > rcmd.cmd_id = cmd_id; > rcmd.input_size = input_size; > memcpy(rcmd.input_buff_raw, input_data, input_size); @@ -146,7 > +203,7 @@ static int amdgpu_virt_ras_get_batch_records(struct > ras_core_context *ras_core, > struct ras_cmd_batch_trace_record_rsp *rsp = rsp_cache; > struct batch_ras_trace_info *batch; > int ret = 0; > - uint8_t i; > + uint32_t i; > > if (!rsp->real_batch_num || (batch_id < rsp->start_batch_id) || > (batch_id >= (rsp->start_batch_id + rsp->real_batch_num))) > { @@ -249,14 +306,14 @@ static int __fill_get_blocks_ecc_cmd(struct > amdgpu_device *adev, { > struct ras_cmd_ctx *rcmd; > > - if (!blks_ecc || !blks_ecc->bo || !blks_ecc->cpu_addr) > + if (!blks_ecc || !blks_ecc->shared_mem.cpu_addr) > return -EINVAL; > > - rcmd = (struct ras_cmd_ctx *)blks_ecc->cpu_addr; > + rcmd = (struct ras_cmd_ctx *)blks_ecc->shared_mem.cpu_addr; > > rcmd->cmd_id = RAS_CMD__GET_ALL_BLOCK_ECC_STATUS; > rcmd->input_size = sizeof(struct ras_cmd_blocks_ecc_req); > - rcmd->output_buf_size = blks_ecc->size - sizeof(*rcmd); > + rcmd->output_buf_size = blks_ecc->shared_mem.size - sizeof(*rcmd); > > return 0; > } > @@ -305,15 +362,15 @@ static int 
amdgpu_virt_ras_get_block_ecc(struct > ras_core_context *ras_core, > > if (!virt_ras->blocks_ecc.auto_update_actived) { > ret = __set_cmd_auto_update(adev, > RAS_CMD__GET_ALL_BLOCK_ECC_STATUS, > - blks_ecc->mc_addr - adev->gmc.vram_start, > - blks_ecc->size, true); > + blks_ecc->shared_mem.gpa, > + blks_ecc->shared_mem.size, true); > if (ret) > return ret; > > blks_ecc->auto_update_actived = true; > } > > - blks_ecc_cmd_ctx = blks_ecc->cpu_addr; > + blks_ecc_cmd_ctx = blks_ecc->shared_mem.cpu_addr; > blks_ecc_rsp = (struct ras_cmd_blocks_ecc_rsp *)blks_ecc_cmd_ctx- > >output_buff_raw; > > output_data->ce_count = blks_ecc_rsp->blocks[input_data- > >block_id].ce_count; > @@ -364,18 +421,24 @@ int amdgpu_virt_ras_handle_cmd(struct > ras_core_context *ras_core, int amdgpu_virt_ras_sw_init(struct > amdgpu_device *adev) { > struct amdgpu_ras_mgr *ras_mgr = > amdgpu_ras_mgr_get_context(adev); > + struct amdgpu_virt_ras_cmd *virt_ras_cmd; > > ras_mgr->virt_ras_cmd = kzalloc(sizeof(struct amdgpu_virt_ras_cmd), > GFP_KERNEL); > if (!ras_mgr->virt_ras_cmd) > return -ENOMEM; > > + virt_ras_cmd = ras_mgr->virt_ras_cmd; > + mutex_init(&virt_ras_cmd->remote_access_lock); > + > return 0; > } > > int amdgpu_virt_ras_sw_fini(struct amdgpu_device *adev) { > struct amdgpu_ras_mgr *ras_mgr = > amdgpu_ras_mgr_get_context(adev); > + struct amdgpu_virt_ras_cmd *virt_ras_cmd = ras_mgr->virt_ras_cmd; > > + mutex_destroy(&virt_ras_cmd->remote_access_lock); > kfree(ras_mgr->virt_ras_cmd); > ras_mgr->virt_ras_cmd = NULL; > > @@ -392,11 +455,9 @@ int amdgpu_virt_ras_hw_init(struct amdgpu_device > *adev) > amdgpu_virt_get_ras_capability(adev); > > memset(blks_ecc, 0, sizeof(*blks_ecc)); > - blks_ecc->size = PAGE_SIZE; > - if (amdgpu_bo_create_kernel(adev, blks_ecc->size, > - PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, > - &blks_ecc->bo, &blks_ecc->mc_addr, > - (void **)&blks_ecc->cpu_addr)) > + if (amdgpu_virt_ras_get_cmd_shared_mem(ras_mgr->ras_core, > + RAS_CMD__GET_ALL_BLOCK_ECC_STATUS, > + PAGE_SIZE, 
&blks_ecc->shared_mem)) > return -ENOMEM; > > return 0; > @@ -409,18 +470,15 @@ int amdgpu_virt_ras_hw_fini(struct amdgpu_device > *adev) > (struct amdgpu_virt_ras_cmd *)ras_mgr- > >virt_ras_cmd; > struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc; > > - if (blks_ecc->bo) { > + if (blks_ecc->shared_mem.cpu_addr) { > __set_cmd_auto_update(adev, > RAS_CMD__GET_ALL_BLOCK_ECC_STATUS, > - blks_ecc->mc_addr - adev->gmc.vram_start, > - blks_ecc->size, false); > + blks_ecc->shared_mem.gpa, > + blks_ecc->shared_mem.size, false); > > - memset(blks_ecc->cpu_addr, 0, blks_ecc->size); > - amdgpu_bo_free_kernel(&blks_ecc->bo, > - &blks_ecc->mc_addr, &blks_ecc->cpu_addr); > - > - memset(blks_ecc, 0, sizeof(*blks_ecc)); > + memset(blks_ecc->shared_mem.cpu_addr, 0, blks_ecc- > >shared_mem.size); > } > + memset(blks_ecc, 0, sizeof(*blks_ecc)); > > return 0; > } > diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h > b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h > index 53b0f3f60103..482a5fda72ab 100644 > --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h > +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h > @@ -30,11 +30,14 @@ struct remote_batch_trace_mgr { > struct ras_cmd_batch_trace_record_rsp batch_trace; }; > > -struct vram_blocks_ecc { > - struct amdgpu_bo *bo; > - uint64_t mc_addr; > +struct amdgpu_virt_shared_mem { > + uint64_t gpa; > void *cpu_addr; > uint32_t size; > +}; > + > +struct vram_blocks_ecc { > + struct amdgpu_virt_shared_mem shared_mem; > bool auto_update_actived; > }; > > @@ -42,6 +45,7 @@ struct amdgpu_virt_ras_cmd { > bool remote_uniras_supported; > struct remote_batch_trace_mgr batch_mgr; > struct vram_blocks_ecc blocks_ecc; > + struct mutex remote_access_lock; > }; > > int amdgpu_virt_ras_sw_init(struct amdgpu_device *adev); > -- > 2.43.0
