Use dedicated memory carved out of the reserved RAS telemetry region as
the VF RAS command buffer, instead of allocating a temporary VRAM BO for
each command.

V2: Add a lock to serialize the sending of VF RAS commands.
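The resulting command path looks roughly like this (a simplified sketch
for illustration only, not the exact driver code below):

	/* The shared command buffer is a single fixed region carved out
	 * of the reserved RAS telemetry block, so only one VF RAS command
	 * may be in flight at a time; the new mutex enforces that.
	 */
	mutex_lock(&virt_ras->remote_access_lock);

	/* Resolve a slice of the reserved region instead of allocating
	 * a temporary VRAM BO per command.
	 */
	ret = amdgpu_virt_ras_get_cmd_shared_mem(ras_core, cmd->cmd_id,
						 mem_len, &shared_mem);
	if (!ret) {
		/* Stage the command in the shared buffer, then pass its
		 * guest physical address to the host.
		 */
		memcpy(shared_mem.cpu_addr, cmd, sizeof(*cmd));
		ret = amdgpu_virt_send_remote_ras_cmd(ras_core->dev,
						      shared_mem.gpa, mem_len);
	}

	mutex_unlock(&virt_ras->remote_access_lock);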
Signed-off-by: YiPeng Chai <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h   |  24 +++-
 .../drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c | 120 +++++++++++++-----
 .../drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h |  10 +-
 3 files changed, 114 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
index c20d10263492..aa8871515212 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
@@ -498,14 +498,22 @@ struct amd_sriov_ras_chk_criti {
 	uint32_t hit;
 };
 
+union amd_sriov_ras_host_push {
+	struct amd_sriov_ras_telemetry_error_count error_count;
+	struct amd_sriov_ras_cper_dump cper_dump;
+	struct amd_sriov_ras_chk_criti chk_criti;
+};
+
+#define AMD_SRIOV_UNIRAS_CMD_MAX_SIZE (PAGE_SIZE * 13)
+struct amd_sriov_uniras_shared_mem {
+	uint8_t blocks_ecc_buf[PAGE_SIZE];
+	uint8_t cmd_buf[AMD_SRIOV_UNIRAS_CMD_MAX_SIZE];
+};
+
 struct amdsriov_ras_telemetry {
 	struct amd_sriov_ras_telemetry_header header;
-
-	union {
-		struct amd_sriov_ras_telemetry_error_count error_count;
-		struct amd_sriov_ras_cper_dump cper_dump;
-		struct amd_sriov_ras_chk_criti chk_criti;
-	} body;
+	union amd_sriov_ras_host_push body;
+	struct amd_sriov_uniras_shared_mem uniras_shared_mem;
 };
 
 /* version data stored in MAILBOX_MSGBUF_RCV_DW1 for future expansion */
@@ -538,6 +546,10 @@ _Static_assert(AMD_SRIOV_MSG_RESERVE_UCODE % 4 == 0,
 _Static_assert(AMD_SRIOV_MSG_RESERVE_UCODE > AMD_SRIOV_UCODE_ID__MAX,
 	"AMD_SRIOV_MSG_RESERVE_UCODE must be bigger than AMD_SRIOV_UCODE_ID__MAX");
 
+_Static_assert(
+	sizeof(struct amdsriov_ras_telemetry) <= AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10,
+"amdsriov_ras_telemetry must be " stringification(AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1) " KB");
+
 #undef _stringification
 #undef stringification
 #endif
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
index a75479593864..c101743b6299 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
@@ -30,28 +30,83 @@
 #include "amdgpu_virt_ras_cmd.h"
 #include "amdgpu_ras_mgr.h"
 
+static int amdgpu_virt_ras_get_cmd_shared_mem(struct ras_core_context *ras_core,
+	uint32_t cmd, uint32_t mem_size, struct amdgpu_virt_shared_mem *shared_mem)
+{
+	struct amdgpu_device *adev = ras_core->dev;
+	struct amdsriov_ras_telemetry *ras_telemetry_cpu;
+	struct amdsriov_ras_telemetry *ras_telemetry_gpu;
+	uint64_t fw_vram_usage_start_offset = 0;
+	uint64_t ras_telemetry_offset = 0;
+
+	if (!adev->virt.fw_reserve.ras_telemetry)
+		return -EINVAL;
+
+	if (adev->mman.fw_vram_usage_va &&
+	    adev->mman.fw_vram_usage_va <= adev->virt.fw_reserve.ras_telemetry) {
+		fw_vram_usage_start_offset = adev->mman.fw_vram_usage_start_offset;
+		ras_telemetry_offset = (uintptr_t)adev->virt.fw_reserve.ras_telemetry -
+				       (uintptr_t)adev->mman.fw_vram_usage_va;
+	} else if (adev->mman.drv_vram_usage_va &&
+		   adev->mman.drv_vram_usage_va <= adev->virt.fw_reserve.ras_telemetry) {
+		fw_vram_usage_start_offset = adev->mman.drv_vram_usage_start_offset;
+		ras_telemetry_offset = (uintptr_t)adev->virt.fw_reserve.ras_telemetry -
+				       (uintptr_t)adev->mman.drv_vram_usage_va;
+	} else {
+		return -EINVAL;
+	}
+
+	ras_telemetry_cpu =
+		(struct amdsriov_ras_telemetry *)adev->virt.fw_reserve.ras_telemetry;
+	ras_telemetry_gpu =
+		(struct amdsriov_ras_telemetry *)(fw_vram_usage_start_offset +
+						  ras_telemetry_offset);
+
+	if (cmd == RAS_CMD__GET_ALL_BLOCK_ECC_STATUS) {
+		if (mem_size > PAGE_SIZE)
+			return -ENOMEM;
+
+		shared_mem->cpu_addr = ras_telemetry_cpu->uniras_shared_mem.blocks_ecc_buf;
+		shared_mem->gpa =
+			(uintptr_t)ras_telemetry_gpu->uniras_shared_mem.blocks_ecc_buf -
+			adev->gmc.vram_start;
+		shared_mem->size = mem_size;
+	} else {
+		if (mem_size > AMD_SRIOV_UNIRAS_CMD_MAX_SIZE)
+			return -ENOMEM;
+
+		shared_mem->cpu_addr = ras_telemetry_cpu->uniras_shared_mem.cmd_buf;
+		shared_mem->gpa =
+			(uintptr_t)ras_telemetry_gpu->uniras_shared_mem.cmd_buf -
+			adev->gmc.vram_start;
+		shared_mem->size = mem_size;
+	}
+
+	return 0;
+}
+
 static int amdgpu_virt_ras_remote_ioctl_cmd(struct ras_core_context *ras_core,
 	struct ras_cmd_ctx *cmd, void *output_data, uint32_t output_size)
 {
-	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
+	struct amdgpu_virt_ras_cmd *virt_ras = ras_mgr->virt_ras_cmd;
 	uint32_t mem_len = ALIGN(sizeof(*cmd) + output_size, AMDGPU_GPU_PAGE_SIZE);
 	struct ras_cmd_ctx *rcmd;
-	struct amdgpu_bo *rcmd_bo = NULL;
-	uint64_t mc_addr = 0;
-	void *cpu_addr = NULL;
+	struct amdgpu_virt_shared_mem shared_mem = {0};
 	int ret = 0;
 
-	ret = amdgpu_bo_create_kernel(adev, mem_len, PAGE_SIZE,
-			AMDGPU_GEM_DOMAIN_VRAM, &rcmd_bo, &mc_addr, (void **)&cpu_addr);
+	mutex_lock(&virt_ras->remote_access_lock);
+
+	ret = amdgpu_virt_ras_get_cmd_shared_mem(ras_core, cmd->cmd_id, mem_len, &shared_mem);
 	if (ret)
-		return ret;
+		goto out;
 
-	rcmd = (struct ras_cmd_ctx *)cpu_addr;
+	rcmd = (struct ras_cmd_ctx *)shared_mem.cpu_addr;
 	memset(rcmd, 0, mem_len);
 	memcpy(rcmd, cmd, sizeof(*cmd));
 
 	ret = amdgpu_virt_send_remote_ras_cmd(ras_core->dev,
-			mc_addr - adev->gmc.vram_start, mem_len);
+			shared_mem.gpa, mem_len);
 	if (!ret) {
 		if (rcmd->cmd_res) {
 			ret = rcmd->cmd_res;
@@ -65,8 +120,7 @@ static int amdgpu_virt_ras_remote_ioctl_cmd(struct ras_core_context *ras_core,
 	}
 
 out:
-	amdgpu_bo_free_kernel(&rcmd_bo, &mc_addr, &cpu_addr);
-
+	mutex_unlock(&virt_ras->remote_access_lock);
 	return ret;
 }
 
@@ -77,6 +131,9 @@ static int amdgpu_virt_ras_send_remote_cmd(struct ras_core_context *ras_core,
 	struct ras_cmd_ctx rcmd = {0};
 	int ret;
 
+	if (input_size > RAS_CMD_MAX_IN_SIZE)
+		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
+
 	rcmd.cmd_id = cmd_id;
 	rcmd.input_size = input_size;
 	memcpy(rcmd.input_buff_raw, input_data, input_size);
@@ -146,7 +203,7 @@ static int amdgpu_virt_ras_get_batch_records(struct ras_core_context *ras_core,
 	struct ras_cmd_batch_trace_record_rsp *rsp = rsp_cache;
 	struct batch_ras_trace_info *batch;
 	int ret = 0;
-	uint8_t i;
+	uint32_t i;
 
 	if (!rsp->real_batch_num || (batch_id < rsp->start_batch_id) ||
 	    (batch_id >= (rsp->start_batch_id + rsp->real_batch_num))) {
@@ -249,14 +306,14 @@ static int __fill_get_blocks_ecc_cmd(struct amdgpu_device *adev,
 {
 	struct ras_cmd_ctx *rcmd;
 
-	if (!blks_ecc || !blks_ecc->bo || !blks_ecc->cpu_addr)
+	if (!blks_ecc || !blks_ecc->shared_mem.cpu_addr)
 		return -EINVAL;
 
-	rcmd = (struct ras_cmd_ctx *)blks_ecc->cpu_addr;
+	rcmd = (struct ras_cmd_ctx *)blks_ecc->shared_mem.cpu_addr;
 
 	rcmd->cmd_id = RAS_CMD__GET_ALL_BLOCK_ECC_STATUS;
 	rcmd->input_size = sizeof(struct ras_cmd_blocks_ecc_req);
-	rcmd->output_buf_size = blks_ecc->size - sizeof(*rcmd);
+	rcmd->output_buf_size = blks_ecc->shared_mem.size - sizeof(*rcmd);
 
 	return 0;
 }
@@ -305,15 +362,15 @@ static int amdgpu_virt_ras_get_block_ecc(struct ras_core_context *ras_core,
 
 	if (!virt_ras->blocks_ecc.auto_update_actived) {
 		ret = __set_cmd_auto_update(adev, RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
-				blks_ecc->mc_addr - adev->gmc.vram_start,
-				blks_ecc->size, true);
+				blks_ecc->shared_mem.gpa,
+				blks_ecc->shared_mem.size, true);
 		if (ret)
 			return ret;
 
 		blks_ecc->auto_update_actived = true;
 	}
 
-	blks_ecc_cmd_ctx = blks_ecc->cpu_addr;
+	blks_ecc_cmd_ctx = blks_ecc->shared_mem.cpu_addr;
 	blks_ecc_rsp = (struct ras_cmd_blocks_ecc_rsp *)blks_ecc_cmd_ctx->output_buff_raw;
 
 	output_data->ce_count = blks_ecc_rsp->blocks[input_data->block_id].ce_count;
@@ -364,18 +421,24 @@ int amdgpu_virt_ras_handle_cmd(struct ras_core_context *ras_core,
 int amdgpu_virt_ras_sw_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+	struct amdgpu_virt_ras_cmd *virt_ras_cmd;
 
 	ras_mgr->virt_ras_cmd =
 		kzalloc(sizeof(struct amdgpu_virt_ras_cmd), GFP_KERNEL);
 	if (!ras_mgr->virt_ras_cmd)
 		return -ENOMEM;
 
+	virt_ras_cmd = ras_mgr->virt_ras_cmd;
+	mutex_init(&virt_ras_cmd->remote_access_lock);
+
 	return 0;
 }
 
 int amdgpu_virt_ras_sw_fini(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+	struct amdgpu_virt_ras_cmd *virt_ras_cmd = ras_mgr->virt_ras_cmd;
 
+	mutex_destroy(&virt_ras_cmd->remote_access_lock);
 	kfree(ras_mgr->virt_ras_cmd);
 	ras_mgr->virt_ras_cmd = NULL;
@@ -392,11 +455,9 @@ int amdgpu_virt_ras_hw_init(struct amdgpu_device *adev)
 	amdgpu_virt_get_ras_capability(adev);
 
 	memset(blks_ecc, 0, sizeof(*blks_ecc));
-	blks_ecc->size = PAGE_SIZE;
-	if (amdgpu_bo_create_kernel(adev, blks_ecc->size,
-			PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
-			&blks_ecc->bo, &blks_ecc->mc_addr,
-			(void **)&blks_ecc->cpu_addr))
+	if (amdgpu_virt_ras_get_cmd_shared_mem(ras_mgr->ras_core,
+			RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
+			PAGE_SIZE, &blks_ecc->shared_mem))
 		return -ENOMEM;
 
 	return 0;
@@ -409,18 +470,15 @@ int amdgpu_virt_ras_hw_fini(struct amdgpu_device *adev)
 		(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
 	struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
 
-	if (blks_ecc->bo) {
+	if (blks_ecc->shared_mem.cpu_addr) {
 		__set_cmd_auto_update(adev, RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
-				blks_ecc->mc_addr - adev->gmc.vram_start,
-				blks_ecc->size, false);
+				blks_ecc->shared_mem.gpa,
+				blks_ecc->shared_mem.size, false);
 
-		memset(blks_ecc->cpu_addr, 0, blks_ecc->size);
-		amdgpu_bo_free_kernel(&blks_ecc->bo,
-				&blks_ecc->mc_addr, &blks_ecc->cpu_addr);
-
-		memset(blks_ecc, 0, sizeof(*blks_ecc));
+		memset(blks_ecc->shared_mem.cpu_addr, 0, blks_ecc->shared_mem.size);
 	}
 
+	memset(blks_ecc, 0, sizeof(*blks_ecc));
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h
index 53b0f3f60103..482a5fda72ab 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h
@@ -30,11 +30,14 @@ struct remote_batch_trace_mgr {
 	struct ras_cmd_batch_trace_record_rsp batch_trace;
 };
 
-struct vram_blocks_ecc {
-	struct amdgpu_bo *bo;
-	uint64_t mc_addr;
+struct amdgpu_virt_shared_mem {
+	uint64_t gpa;
 	void *cpu_addr;
 	uint32_t size;
+};
+
+struct vram_blocks_ecc {
+	struct amdgpu_virt_shared_mem shared_mem;
 	bool auto_update_actived;
 };
 
@@ -42,6 +45,7 @@ struct amdgpu_virt_ras_cmd {
 	bool remote_uniras_supported;
 	struct remote_batch_trace_mgr batch_mgr;
 	struct vram_blocks_ecc blocks_ecc;
+	struct mutex remote_access_lock;
 };
 
 int amdgpu_virt_ras_sw_init(struct amdgpu_device *adev);
-- 
2.43.0
