[AMD Official Use Only - General] Reviewed-by: Tao Zhou <tao.zh...@amd.com>
> -----Original Message----- > From: Zhang, Hawking <hawking.zh...@amd.com> > Sent: Thursday, March 23, 2023 10:24 AM > To: amd-gfx@lists.freedesktop.org; Zhou1, Tao <tao.zh...@amd.com>; Yang, > Stanley <stanley.y...@amd.com>; Li, Candice <candice...@amd.com>; Chai, > Thomas <yipeng.c...@amd.com> > Cc: Zhang, Hawking <hawking.zh...@amd.com> > Subject: [PATCH] drm/amdgpu: Add fatal error handling in nbio v4_3 > > GPU will stop working once fatal error is detected. > it will inform driver to do reset to recover from the fatal error. > > Signed-off-by: Hawking Zhang <hawking.zh...@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 ++++ > drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c | 79 +++++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h | 1 + > drivers/gpu/drm/amd/amdgpu/soc21.c | 15 ++++- > 4 files changed, 105 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index c6dc3cd2a9de..5b1779021881 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -34,6 +34,7 @@ > #include "amdgpu_atomfirmware.h" > #include "amdgpu_xgmi.h" > #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" > +#include "nbio_v4_3.h" > #include "atom.h" > #include "amdgpu_reset.h" > > @@ -2562,6 +2563,16 @@ int amdgpu_ras_init(struct amdgpu_device *adev) > if (!adev->gmc.xgmi.connected_to_cpu) > adev->nbio.ras = &nbio_v7_4_ras; > break; > + case IP_VERSION(4, 3, 0): > + if (adev->ras_hw_enabled | AMDGPU_RAS_BLOCK__DF) > + /* unlike other generation of nbio ras, > + * nbio v4_3 only support fatal error interrupt > + * to inform software that DF is freezed due to > + * system fatal error event. driver should not > + * enable nbio ras in such case. 
Instead, > + * check DF RAS */ > + adev->nbio.ras = &nbio_v4_3_ras; > + break; > default: > /* nbio ras is not available */ > break; > diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c > b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c > index 09fdcd20cb91..d5ed9e0e1a5f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c > +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c > @@ -26,6 +26,7 @@ > > #include "nbio/nbio_4_3_0_offset.h" > #include "nbio/nbio_4_3_0_sh_mask.h" > +#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" > #include <uapi/linux/kfd_ioctl.h> > > static void nbio_v4_3_remap_hdp_registers(struct amdgpu_device *adev) @@ - > 538,3 +539,81 @@ const struct amdgpu_nbio_funcs nbio_v4_3_sriov_funcs = { > .remap_hdp_registers = nbio_v4_3_remap_hdp_registers, > .get_rom_offset = nbio_v4_3_get_rom_offset, }; > + > +static int nbio_v4_3_set_ras_err_event_athub_irq_state(struct amdgpu_device > *adev, > + struct amdgpu_irq_src > *src, > + unsigned type, > + enum > amdgpu_interrupt_state state) { > + /* The ras_controller_irq enablement should be done in psp bl when it > + * tries to enable ras feature. Driver only need to set the correct > interrupt > + * vector for bare-metal and sriov use case respectively > + */ > + uint32_t bif_doorbell_int_cntl; > + > + bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, > regBIF_BX0_BIF_DOORBELL_INT_CNTL); > + bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl, > + BIF_BX0_BIF_DOORBELL_INT_CNTL, > + > RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE, > + (state == > AMDGPU_IRQ_STATE_ENABLE) ? 0 : 1); > + WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, > +bif_doorbell_int_cntl); > + > + return 0; > +} > + > +static int nbio_v4_3_process_err_event_athub_irq(struct amdgpu_device > *adev, > + struct amdgpu_irq_src > *source, > + struct amdgpu_iv_entry *entry) > +{ > + /* By design, the ih cookie for err_event_athub_irq should be written > + * to bif ring. since bif ring is not enabled, just leave process > callback > + * as a dummy one. 
> + */ > + return 0; > +} > + > +static const struct amdgpu_irq_src_funcs > nbio_v4_3_ras_err_event_athub_irq_funcs = { > + .set = nbio_v4_3_set_ras_err_event_athub_irq_state, > + .process = nbio_v4_3_process_err_event_athub_irq, > +}; > + > +static void nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring(struct > +amdgpu_device *adev) { > + uint32_t bif_doorbell_int_cntl; > + > + bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, > regBIF_BX0_BIF_DOORBELL_INT_CNTL); > + if (REG_GET_FIELD(bif_doorbell_int_cntl, > + BIF_DOORBELL_INT_CNTL, > + RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) { > + /* driver has to clear the interrupt status when bif ring is > disabled */ > + bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl, > + BIF_DOORBELL_INT_CNTL, > + > RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1); > + WREG32_SOC15(NBIO, 0, > regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_int_cntl); > + amdgpu_ras_global_ras_isr(adev); > + } > +} > + > +static int nbio_v4_3_init_ras_err_event_athub_interrupt(struct > +amdgpu_device *adev) { > + > + int r; > + > + /* init the irq funcs */ > + adev->nbio.ras_err_event_athub_irq.funcs = > + &nbio_v4_3_ras_err_event_athub_irq_funcs; > + adev->nbio.ras_err_event_athub_irq.num_types = 1; > + > + /* register ras err event athub interrupt > + * nbio v4_3 uses the same irq source as nbio v7_4 */ > + r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_BIF, > + NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT, > + &adev->nbio.ras_err_event_athub_irq); > + > + return r; > +} > + > +struct amdgpu_nbio_ras nbio_v4_3_ras = { > + .handle_ras_err_event_athub_intr_no_bifring = > nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring, > + .init_ras_err_event_athub_interrupt = > +nbio_v4_3_init_ras_err_event_athub_interrupt, > +}; > diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h > b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h > index 711999ceedf4..399037cdf4fb 100644 > --- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h > +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h > @@ -29,5 
+29,6 @@ > extern const struct nbio_hdp_flush_reg nbio_v4_3_hdp_flush_reg; extern const > struct amdgpu_nbio_funcs nbio_v4_3_funcs; extern const struct > amdgpu_nbio_funcs nbio_v4_3_sriov_funcs; > +extern struct amdgpu_nbio_ras nbio_v4_3_ras; > > #endif > diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c > b/drivers/gpu/drm/amd/amdgpu/soc21.c > index 67580761b44d..514bfc705d5a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/soc21.c > +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c > @@ -754,6 +754,14 @@ static int soc21_common_late_init(void *handle) > > sriov_vcn_4_0_0_video_codecs_decode_array_vcn0, > > ARRAY_SIZE(sriov_vcn_4_0_0_video_codecs_decode_array_vcn0)); > } > + } else { > + if (adev->nbio.ras && > + adev->nbio.ras_err_event_athub_irq.funcs) > + /* don't need to fail gpu late init > + * if enabling athub_err_event interrupt failed > + * nbio v4_3 only supports fatal error handling > + * just enable the interrupt directly */ > + amdgpu_irq_get(adev, &adev- > >nbio.ras_err_event_athub_irq, 0); > } > > return 0; > @@ -801,8 +809,13 @@ static int soc21_common_hw_fini(void *handle) > /* disable the doorbell aperture */ > soc21_enable_doorbell_aperture(adev, false); > > - if (amdgpu_sriov_vf(adev)) > + if (amdgpu_sriov_vf(adev)) { > xgpu_nv_mailbox_put_irq(adev); > + } else { > + if (adev->nbio.ras && > + adev->nbio.ras_err_event_athub_irq.funcs) > + amdgpu_irq_put(adev, &adev- > >nbio.ras_err_event_athub_irq, 0); > + } > > return 0; > } > -- > 2.17.1