Register RAS fatal error interrupt and add handler. v2: only register NBIO RAS for dGPU platform. change nbio_v7_9_set_ras_controller_irq_state and nbio_v7_9_set_ras_err_event_athub_irq_state to dummy functions.
Signed-off-by: Tao Zhou <tao.zh...@amd.com> Reviewed-by: Hawking Zhang <hawking.zh...@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 + drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 187 ++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h | 1 + 3 files changed, 193 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index bb29cb57add5..00658c2816dc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -35,6 +35,7 @@ #include "amdgpu_xgmi.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" #include "nbio_v4_3.h" +#include "nbio_v7_9.h" #include "atom.h" #include "amdgpu_reset.h" @@ -2644,6 +2645,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev) * check DF RAS */ adev->nbio.ras = &nbio_v4_3_ras; break; + case IP_VERSION(7, 9, 0): + if (!adev->gmc.is_app_apu) + adev->nbio.ras = &nbio_v7_9_ras; + break; default: /* nbio ras is not available */ break; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c index 1d1ab188ef15..781f98655567 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c @@ -471,3 +471,190 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = { .init_registers = nbio_v7_9_init_registers, .get_pcie_replay_count = nbio_v7_9_get_pcie_replay_count, }; + +static void nbio_v7_9_query_ras_error_count(struct amdgpu_device *adev, + void *ras_error_status) +{ + return; +} + +static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device *adev) +{ + uint32_t bif_doorbell_intr_cntl; + struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if); + struct ras_err_data err_data = {0, 0, 0, NULL}; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); + + if (REG_GET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_CNTLR_INTERRUPT_STATUS)) { + /* driver has to clear the interrupt status when bif ring is disabled */ + bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, + RAS_CNTLR_INTERRUPT_CLEAR, 1); + WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); + + if (!ras->disable_ras_err_cnt_harvest) { + /* + * clear error status after ras_controller_intr + * according to hw team and count ue number + * for query + */ + nbio_v7_9_query_ras_error_count(adev, &err_data); + + /* logging on error cnt and printing for awareness */ + obj->err_data.ue_count += err_data.ue_count; + obj->err_data.ce_count += err_data.ce_count; + + if (err_data.ce_count) + dev_info(adev->dev, "%ld correctable hardware " + "errors detected in %s block, " + "no user action is needed.\n", + obj->err_data.ce_count, + get_ras_block_str(adev->nbio.ras_if)); + + if (err_data.ue_count) + dev_info(adev->dev, "%ld uncorrectable hardware " + "errors detected in %s block\n", + obj->err_data.ue_count, + get_ras_block_str(adev->nbio.ras_if)); + } + + dev_info(adev->dev, "RAS controller interrupt triggered " + "by NBIF error\n"); + + /* ras_controller_int is dedicated for nbif ras error, + * not the global interrupt for sync flood + */ + amdgpu_ras_reset_gpu(adev); + } +} + +static void nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev) +{ + uint32_t bif_doorbell_intr_cntl; + + bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); + + if (REG_GET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) { + /* driver has to clear the interrupt status when bif ring is disabled */ + bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, + RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1); + + WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); + + amdgpu_ras_global_ras_isr(adev); + } +} + +static int nbio_v7_9_set_ras_controller_irq_state(struct amdgpu_device *adev, + struct amdgpu_irq_src *src, + unsigned type, + enum amdgpu_interrupt_state state) +{ + /* Dummy function, there is no initialization operation in driver */ + + return 0; +} + +static int nbio_v7_9_process_ras_controller_irq(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + struct amdgpu_iv_entry *entry) +{ + /* By design, the ih cookie for ras_controller_irq should be written + * to BIFring instead of general iv ring. However, due to known bif ring + * hw bug, it has to be disabled. There is no chance the process function + * will be involked. Just left it as a dummy one. + */ + return 0; +} + +static int nbio_v7_9_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev, + struct amdgpu_irq_src *src, + unsigned type, + enum amdgpu_interrupt_state state) +{ + /* Dummy function, there is no initialization operation in driver */ + + return 0; +} + +static int nbio_v7_9_process_err_event_athub_irq(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + struct amdgpu_iv_entry *entry) +{ + /* By design, the ih cookie for err_event_athub_irq should be written + * to BIFring instead of general iv ring. However, due to known bif ring + * hw bug, it has to be disabled. There is no chance the process function + * will be involked. Just left it as a dummy one. + */ + return 0; +} + +static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_controller_irq_funcs = { + .set = nbio_v7_9_set_ras_controller_irq_state, + .process = nbio_v7_9_process_ras_controller_irq, +}; + +static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_err_event_athub_irq_funcs = { + .set = nbio_v7_9_set_ras_err_event_athub_irq_state, + .process = nbio_v7_9_process_err_event_athub_irq, +}; + +static int nbio_v7_9_init_ras_controller_interrupt (struct amdgpu_device *adev) +{ + int r; + + /* init the irq funcs */ + adev->nbio.ras_controller_irq.funcs = + &nbio_v7_9_ras_controller_irq_funcs; + adev->nbio.ras_controller_irq.num_types = 1; + + /* register ras controller interrupt */ + r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF, + NBIF_7_4__SRCID__RAS_CONTROLLER_INTERRUPT, + &adev->nbio.ras_controller_irq); + + return r; +} + +static int nbio_v7_9_init_ras_err_event_athub_interrupt (struct amdgpu_device *adev) +{ + + int r; + + /* init the irq funcs */ + adev->nbio.ras_err_event_athub_irq.funcs = + &nbio_v7_9_ras_err_event_athub_irq_funcs; + adev->nbio.ras_err_event_athub_irq.num_types = 1; + + /* register ras err event athub interrupt */ + r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF, + NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT, + &adev->nbio.ras_err_event_athub_irq); + + return r; +} + +const struct amdgpu_ras_block_hw_ops nbio_v7_9_ras_hw_ops = { + .query_ras_error_count = nbio_v7_9_query_ras_error_count, +}; + +struct amdgpu_nbio_ras nbio_v7_9_ras = { + .ras_block = { + .ras_comm = { + .name = "pcie_bif", + .block = AMDGPU_RAS_BLOCK__PCIE_BIF, + .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, + }, + .hw_ops = &nbio_v7_9_ras_hw_ops, + .ras_late_init = amdgpu_nbio_ras_late_init, + }, + .handle_ras_controller_intr_no_bifring = nbio_v7_9_handle_ras_controller_intr_no_bifring, + .handle_ras_err_event_athub_intr_no_bifring = nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring, + .init_ras_controller_interrupt = nbio_v7_9_init_ras_controller_interrupt, + .init_ras_err_event_athub_interrupt = nbio_v7_9_init_ras_err_event_athub_interrupt, +}; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h index 8e04eb484328..73709771950d 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h @@ -28,5 +28,6 @@ extern const struct nbio_hdp_flush_reg nbio_v7_9_hdp_flush_reg; extern const struct amdgpu_nbio_funcs nbio_v7_9_funcs; +extern struct amdgpu_nbio_ras nbio_v7_9_ras; #endif -- 2.35.1