Re: [PATCH v5 5/5] drm/msm: devcoredump iommu fault support

2021-06-11 Thread Jordan Crouse
On Thu, Jun 10, 2021 at 02:44:13PM -0700, Rob Clark wrote:
> From: Rob Clark 
> 
> Wire up support to stall the SMMU on iova fault, and collect a devcore-
> dump snapshot for easier debugging of faults.
> 
> Currently this is a6xx-only, but mostly only because so far it is the
> only one using adreno-smmu-priv.

Acked-by: Jordan Crouse 
 
> Signed-off-by: Rob Clark 
> ---
>  drivers/gpu/drm/msm/adreno/a5xx_gpu.c   | 19 +++-
>  drivers/gpu/drm/msm/adreno/a6xx_gpu.c   | 38 +++-
>  drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 42 ++
>  drivers/gpu/drm/msm/adreno/adreno_gpu.c | 15 +++
>  drivers/gpu/drm/msm/msm_gem.h   |  1 +
>  drivers/gpu/drm/msm/msm_gem_submit.c|  1 +
>  drivers/gpu/drm/msm/msm_gpu.c   | 48 +
>  drivers/gpu/drm/msm/msm_gpu.h   | 17 
>  drivers/gpu/drm/msm/msm_gpummu.c|  5 +++
>  drivers/gpu/drm/msm/msm_iommu.c | 11 +
>  drivers/gpu/drm/msm/msm_mmu.h   |  1 +
>  11 files changed, 186 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c 
> b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> index eb030b00bff4..7a271de9a212 100644
> --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> @@ -1200,6 +1200,15 @@ static void a5xx_fault_detect_irq(struct msm_gpu *gpu)
>   struct drm_device *dev = gpu->dev;
>   struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu);
>  
> + /*
> +  * If stalled on SMMU fault, we could trip the GPU's hang detection,
> +  * but the fault handler will trigger the devcore dump, and we want
> +  * to otherwise resume normally rather than killing the submit, so
> +  * just bail.
> +  */
> + if (gpu_read(gpu, REG_A5XX_RBBM_STATUS3) & BIT(24))
> + return;
> +
>   DRM_DEV_ERROR(dev->dev, "gpu fault ring %d fence %x status %8.8X rb 
> %4.4x/%4.4x ib1 %16.16llX/%4.4x ib2 %16.16llX/%4.4x\n",
>   ring ? ring->id : -1, ring ? ring->seqno : 0,
>   gpu_read(gpu, REG_A5XX_RBBM_STATUS),
> @@ -1523,6 +1532,7 @@ static struct msm_gpu_state *a5xx_gpu_state_get(struct 
> msm_gpu *gpu)
>  {
>   struct a5xx_gpu_state *a5xx_state = kzalloc(sizeof(*a5xx_state),
>   GFP_KERNEL);
> + bool stalled = !!(gpu_read(gpu, REG_A5XX_RBBM_STATUS3) & BIT(24));
>  
>   if (!a5xx_state)
>   return ERR_PTR(-ENOMEM);
> @@ -1535,8 +1545,13 @@ static struct msm_gpu_state *a5xx_gpu_state_get(struct 
> msm_gpu *gpu)
>  
>   a5xx_state->base.rbbm_status = gpu_read(gpu, REG_A5XX_RBBM_STATUS);
>  
> - /* Get the HLSQ regs with the help of the crashdumper */
> - a5xx_gpu_state_get_hlsq_regs(gpu, a5xx_state);
> + /*
> +  * Get the HLSQ regs with the help of the crashdumper, but only if
> +  * we are not stalled in an iommu fault (in which case the crashdumper
> +  * would not have access to memory)
> +  */
> + if (!stalled)
> + a5xx_gpu_state_get_hlsq_regs(gpu, a5xx_state);
>  
>   a5xx_set_hwcg(gpu, true);
>  
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
> b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> index fc19db10bff1..c3699408bd1f 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> @@ -1081,6 +1081,16 @@ static int a6xx_fault_handler(void *arg, unsigned long 
> iova, int flags, void *da
>   struct msm_gpu *gpu = arg;
>   struct adreno_smmu_fault_info *info = data;
>   const char *type = "UNKNOWN";
> + const char *block;
> + bool do_devcoredump = info && !READ_ONCE(gpu->crashstate);
> +
> + /*
> +  * If we aren't going to be resuming later from fault_worker, then do
> +  * it now.
> +  */
> + if (!do_devcoredump) {
> + gpu->aspace->mmu->funcs->resume_translation(gpu->aspace->mmu);
> + }
>  
>   /*
>* Print a default message if we couldn't get the data from the
> @@ -1104,15 +1114,30 @@ static int a6xx_fault_handler(void *arg, unsigned 
> long iova, int flags, void *da
>   else if (info->fsr & ARM_SMMU_FSR_EF)
>   type = "EXTERNAL";
>  
> + block = a6xx_fault_block(gpu, info->fsynr1 & 0xff);
> +
>   pr_warn_ratelimited("*** gpu fault: ttbr0=%.16llx iova=%.16lx dir=%s 
> type=%s source=%s (%u,%u,%u,%u)\n",
>   info->ttbr0, iova,
> - flags & IOMMU_FAULT_WRITE ? "WRITE" : "READ", type,
> - a6xx_fault_block(gpu, info->fsynr1 & 0xff),
> + flags & IOMMU_FAULT_WRITE ? "WRITE" : "READ",
> + type, block,
>   gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(4)),
>   gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(5)),
>   gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(6)),
>   gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(7)));
>  
> + if (do_devcor

[PATCH v5 5/5] drm/msm: devcoredump iommu fault support

2021-06-10 Thread Rob Clark
From: Rob Clark 

Wire up support to stall the SMMU on iova fault, and collect a devcore-
dump snapshot for easier debugging of faults.

Currently this is a6xx-only, but mostly only because so far it is the
only one using adreno-smmu-priv.

Signed-off-by: Rob Clark 
---
 drivers/gpu/drm/msm/adreno/a5xx_gpu.c   | 19 +++-
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c   | 38 +++-
 drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 42 ++
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 15 +++
 drivers/gpu/drm/msm/msm_gem.h   |  1 +
 drivers/gpu/drm/msm/msm_gem_submit.c|  1 +
 drivers/gpu/drm/msm/msm_gpu.c   | 48 +
 drivers/gpu/drm/msm/msm_gpu.h   | 17 
 drivers/gpu/drm/msm/msm_gpummu.c|  5 +++
 drivers/gpu/drm/msm/msm_iommu.c | 11 +
 drivers/gpu/drm/msm/msm_mmu.h   |  1 +
 11 files changed, 186 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
index eb030b00bff4..7a271de9a212 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
@@ -1200,6 +1200,15 @@ static void a5xx_fault_detect_irq(struct msm_gpu *gpu)
struct drm_device *dev = gpu->dev;
struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu);
 
+   /*
+* If stalled on SMMU fault, we could trip the GPU's hang detection,
+* but the fault handler will trigger the devcore dump, and we want
+* to otherwise resume normally rather than killing the submit, so
+* just bail.
+*/
+   if (gpu_read(gpu, REG_A5XX_RBBM_STATUS3) & BIT(24))
+   return;
+
DRM_DEV_ERROR(dev->dev, "gpu fault ring %d fence %x status %8.8X rb 
%4.4x/%4.4x ib1 %16.16llX/%4.4x ib2 %16.16llX/%4.4x\n",
ring ? ring->id : -1, ring ? ring->seqno : 0,
gpu_read(gpu, REG_A5XX_RBBM_STATUS),
@@ -1523,6 +1532,7 @@ static struct msm_gpu_state *a5xx_gpu_state_get(struct 
msm_gpu *gpu)
 {
struct a5xx_gpu_state *a5xx_state = kzalloc(sizeof(*a5xx_state),
GFP_KERNEL);
+   bool stalled = !!(gpu_read(gpu, REG_A5XX_RBBM_STATUS3) & BIT(24));
 
if (!a5xx_state)
return ERR_PTR(-ENOMEM);
@@ -1535,8 +1545,13 @@ static struct msm_gpu_state *a5xx_gpu_state_get(struct 
msm_gpu *gpu)
 
a5xx_state->base.rbbm_status = gpu_read(gpu, REG_A5XX_RBBM_STATUS);
 
-   /* Get the HLSQ regs with the help of the crashdumper */
-   a5xx_gpu_state_get_hlsq_regs(gpu, a5xx_state);
+   /*
+* Get the HLSQ regs with the help of the crashdumper, but only if
+* we are not stalled in an iommu fault (in which case the crashdumper
+* would not have access to memory)
+*/
+   if (!stalled)
+   a5xx_gpu_state_get_hlsq_regs(gpu, a5xx_state);
 
a5xx_set_hwcg(gpu, true);
 
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index fc19db10bff1..c3699408bd1f 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1081,6 +1081,16 @@ static int a6xx_fault_handler(void *arg, unsigned long 
iova, int flags, void *da
struct msm_gpu *gpu = arg;
struct adreno_smmu_fault_info *info = data;
const char *type = "UNKNOWN";
+   const char *block;
+   bool do_devcoredump = info && !READ_ONCE(gpu->crashstate);
+
+   /*
+* If we aren't going to be resuming later from fault_worker, then do
+* it now.
+*/
+   if (!do_devcoredump) {
+   gpu->aspace->mmu->funcs->resume_translation(gpu->aspace->mmu);
+   }
 
/*
 * Print a default message if we couldn't get the data from the
@@ -1104,15 +1114,30 @@ static int a6xx_fault_handler(void *arg, unsigned long 
iova, int flags, void *da
else if (info->fsr & ARM_SMMU_FSR_EF)
type = "EXTERNAL";
 
+   block = a6xx_fault_block(gpu, info->fsynr1 & 0xff);
+
pr_warn_ratelimited("*** gpu fault: ttbr0=%.16llx iova=%.16lx dir=%s 
type=%s source=%s (%u,%u,%u,%u)\n",
info->ttbr0, iova,
-   flags & IOMMU_FAULT_WRITE ? "WRITE" : "READ", type,
-   a6xx_fault_block(gpu, info->fsynr1 & 0xff),
+   flags & IOMMU_FAULT_WRITE ? "WRITE" : "READ",
+   type, block,
gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(4)),
gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(5)),
gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(6)),
gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(7)));
 
+   if (do_devcoredump) {
+   /* Turn off the hangcheck timer to keep it from bothering us */
+   del_timer(&gpu->hangcheck_timer);
+
+   gpu->fault_info.ttbr0 = info->ttbr0;
+