On Thu, Jun 10, 2021 at 02:44:13PM -0700, Rob Clark wrote:
> From: Rob Clark
>
> Wire up support to stall the SMMU on iova fault, and collect a devcore-
> dump snapshot for easier debugging of faults.
>
> Currently this is a6xx-only, but mostly only because so far it is the
> only one using adreno-smmu-priv.
Acked-by: Jordan Crouse
> Signed-off-by: Rob Clark
> ---
> drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 19 +++-
> drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 38 +++-
> drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 42 ++
> drivers/gpu/drm/msm/adreno/adreno_gpu.c | 15 +++
> drivers/gpu/drm/msm/msm_gem.h | 1 +
> drivers/gpu/drm/msm/msm_gem_submit.c| 1 +
> drivers/gpu/drm/msm/msm_gpu.c | 48 +
> drivers/gpu/drm/msm/msm_gpu.h | 17
> drivers/gpu/drm/msm/msm_gpummu.c| 5 +++
> drivers/gpu/drm/msm/msm_iommu.c | 11 +
> drivers/gpu/drm/msm/msm_mmu.h | 1 +
> 11 files changed, 186 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> index eb030b00bff4..7a271de9a212 100644
> --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> @@ -1200,6 +1200,15 @@ static void a5xx_fault_detect_irq(struct msm_gpu *gpu)
> struct drm_device *dev = gpu->dev;
> struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu);
>
> + /*
> + * If stalled on SMMU fault, we could trip the GPU's hang detection,
> + * but the fault handler will trigger the devcore dump, and we want
> + * to otherwise resume normally rather than killing the submit, so
> + * just bail.
> + */
> + if (gpu_read(gpu, REG_A5XX_RBBM_STATUS3) & BIT(24))
> + return;
> +
> DRM_DEV_ERROR(dev->dev, "gpu fault ring %d fence %x status %8.8X rb
> %4.4x/%4.4x ib1 %16.16llX/%4.4x ib2 %16.16llX/%4.4x\n",
> ring ? ring->id : -1, ring ? ring->seqno : 0,
> gpu_read(gpu, REG_A5XX_RBBM_STATUS),
> @@ -1523,6 +1532,7 @@ static struct msm_gpu_state *a5xx_gpu_state_get(struct
> msm_gpu *gpu)
> {
> struct a5xx_gpu_state *a5xx_state = kzalloc(sizeof(*a5xx_state),
> GFP_KERNEL);
> + bool stalled = !!(gpu_read(gpu, REG_A5XX_RBBM_STATUS3) & BIT(24));
>
> if (!a5xx_state)
> return ERR_PTR(-ENOMEM);
> @@ -1535,8 +1545,13 @@ static struct msm_gpu_state *a5xx_gpu_state_get(struct
> msm_gpu *gpu)
>
> a5xx_state->base.rbbm_status = gpu_read(gpu, REG_A5XX_RBBM_STATUS);
>
> - /* Get the HLSQ regs with the help of the crashdumper */
> - a5xx_gpu_state_get_hlsq_regs(gpu, a5xx_state);
> + /*
> + * Get the HLSQ regs with the help of the crashdumper, but only if
> + * we are not stalled in an iommu fault (in which case the crashdumper
> + * would not have access to memory)
> + */
> + if (!stalled)
> + a5xx_gpu_state_get_hlsq_regs(gpu, a5xx_state);
>
> a5xx_set_hwcg(gpu, true);
>
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> index fc19db10bff1..c3699408bd1f 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> @@ -1081,6 +1081,16 @@ static int a6xx_fault_handler(void *arg, unsigned long
> iova, int flags, void *da
> struct msm_gpu *gpu = arg;
> struct adreno_smmu_fault_info *info = data;
> const char *type = "UNKNOWN";
> + const char *block;
> + bool do_devcoredump = info && !READ_ONCE(gpu->crashstate);
> +
> + /*
> + * If we aren't going to be resuming later from fault_worker, then do
> + * it now.
> + */
> + if (!do_devcoredump) {
> + gpu->aspace->mmu->funcs->resume_translation(gpu->aspace->mmu);
> + }
>
> /*
>* Print a default message if we couldn't get the data from the
> @@ -1104,15 +1114,30 @@ static int a6xx_fault_handler(void *arg, unsigned
> long iova, int flags, void *da
> else if (info->fsr & ARM_SMMU_FSR_EF)
> type = "EXTERNAL";
>
> + block = a6xx_fault_block(gpu, info->fsynr1 & 0xff);
> +
> pr_warn_ratelimited("*** gpu fault: ttbr0=%.16llx iova=%.16lx dir=%s
> type=%s source=%s (%u,%u,%u,%u)\n",
> info->ttbr0, iova,
> - flags & IOMMU_FAULT_WRITE ? "WRITE" : "READ", type,
> - a6xx_fault_block(gpu, info->fsynr1 & 0xff),
> + flags & IOMMU_FAULT_WRITE ? "WRITE" : "READ",
> + type, block,
> gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(4)),
> gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(5)),
> gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(6)),
> gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(7)));
>
> + if (do_devcor