In SRIOV, the VF does not perform page retirement itself when handling RAS poison; instead it sends an IDH_RAS_POISON request to the PF, and the PF will do page retirement and reset the VF. Signed-off-by: Tao Zhou <tao.zh...@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 18 +++++++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 + drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 6 ++++++ drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h | 1 + 4 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index aad3c8b4c810..b603ab3bd138 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -123,12 +123,20 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, }; struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); - ret = - amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset); + if (!amdgpu_sriov_vf(adev)) { + ret = amdgpu_umc_do_page_retirement(adev, + ras_error_status, NULL, reset); - if (ret == AMDGPU_RAS_SUCCESS && obj) { - obj->err_data.ue_count += err_data->ue_count; - obj->err_data.ce_count += err_data->ce_count; + if (ret == AMDGPU_RAS_SUCCESS && obj) { + obj->err_data.ue_count += err_data->ue_count; + obj->err_data.ce_count += err_data->ce_count; + } + } else { + if (adev->virt.ops && adev->virt.ops->ras_poison_handler) + adev->virt.ops->ras_poison_handler(adev); + else + dev_warn(adev->dev, + "No ras_poison_handler interface in SRIOV!\n"); } return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index 4534e6f70a4b..41d03ef417d0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -86,6 +86,7 @@ struct amdgpu_virt_ops { void (*trans_msg)(struct amdgpu_device *adev, u32 req, u32 data1, u32 data2, u32 data3); void (*ras_trigger_error)(struct amdgpu_device *adev, struct ta_ras_trigger_error_input *info); + void (*ras_poison_handler)(struct amdgpu_device *adev); }; /* diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 3b4c5162a237..7545d7c0f524 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -428,6 +428,11 @@ void amdgpu_virt_ras_trigger_error(struct amdgpu_device *adev, addr_lo, addr_hi); } +void amdgpu_virt_ras_poison_handler(struct amdgpu_device *adev) +{ + xgpu_ai_send_access_requests(adev, 
IDH_RAS_POISON); +} + const struct amdgpu_virt_ops xgpu_ai_virt_ops = { .req_full_gpu = xgpu_ai_request_full_gpu_access, .rel_full_gpu = xgpu_ai_release_full_gpu_access, @@ -436,4 +441,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = { .trans_msg = xgpu_ai_mailbox_trans_msg, .req_init_data = xgpu_ai_request_init_data, .ras_trigger_error = amdgpu_virt_ras_trigger_error, + .ras_poison_handler = amdgpu_virt_ras_poison_handler, }; diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h index 0841d6632328..869dff6daddd 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h @@ -48,6 +48,7 @@ enum idh_request { IDH_LOG_VF_ERROR = 200, IDH_READY_TO_RESET = 201, IDH_RAS_ERROR_INJECT = 202, + IDH_RAS_POISON = 203, }; enum idh_event { -- 2.35.1