Re: [PATCH v5 3/5] drm/msm: Improve the a6xx page fault handler

2021-07-06 Thread Yassine Oudjana
In-Reply-To: <20210610214431.539029-4-robdcl...@gmail.com>

On Thu 10 Jun 16:44 CDT 2021, Rob Clark wrote:

> From: Jordan Crouse 
>
> Use the new adreno-smmu-priv fault info function to get more SMMU
> debug registers and print the current TTBR0 to debug per-instance
> pagetables and figure out which GPU block generated the request.
>
> Signed-off-by: Jordan Crouse 
> Signed-off-by: Rob Clark 
> ---
>  drivers/gpu/drm/msm/adreno/a5xx_gpu.c |  4 +-
>  drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 76 +--
>  drivers/gpu/drm/msm/msm_iommu.c   | 11 +++-
>  drivers/gpu/drm/msm/msm_mmu.h |  4 +-
>  4 files changed, 87 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c 
> b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> index f46562c12022..eb030b00bff4 100644
> --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> @@ -1075,7 +1075,7 @@ bool a5xx_idle(struct msm_gpu *gpu, struct 
> msm_ringbuffer *ring)
>   return true;
>  }
>
> -static int a5xx_fault_handler(void *arg, unsigned long iova, int flags)
> +static int a5xx_fault_handler(void *arg, unsigned long iova, int flags, void 
> *data)
>  {
>   struct msm_gpu *gpu = arg;
>   pr_warn_ratelimited("*** gpu fault: iova=%08lx, flags=%d 
> (%u,%u,%u,%u)\n",
> @@ -1085,7 +1085,7 @@ static int a5xx_fault_handler(void *arg, unsigned long 
> iova, int flags)
>   gpu_read(gpu, REG_A5XX_CP_SCRATCH_REG(6)),
>   gpu_read(gpu, REG_A5XX_CP_SCRATCH_REG(7)));
>
> - return -EFAULT;
> + return 0;
>  }
>
>  static void a5xx_cp_err_irq(struct msm_gpu *gpu)
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
> b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> index c7f0ddb12d8f..fc19db10bff1 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> @@ -1032,18 +1032,88 @@ static void a6xx_recover(struct msm_gpu *gpu)
>   msm_gpu_hw_init(gpu);
>  }
>
> -static int a6xx_fault_handler(void *arg, unsigned long iova, int flags)
> +static const char *a6xx_uche_fault_block(struct msm_gpu *gpu, u32 mid)
> +{
> + static const char *uche_clients[7] = {
> + "VFD", "SP", "VSC", "VPC", "HLSQ", "PC", "LRZ",
> + };
> + u32 val;
> +
> + if (mid < 1 || mid > 3)
> + return "UNKNOWN";
> +
> + /*
> +  * The source of the data depends on the mid ID read from FSYNR1.
> +  * and the client ID read from the UCHE block
> +  */
> + val = gpu_read(gpu, REG_A6XX_UCHE_CLIENT_PF);
> +
> + /* mid = 3 is most precise and refers to only one block per client */
> + if (mid == 3)
> + return uche_clients[val & 7];
> +
> + /* For mid=2 the source is TP or VFD except when the client id is 0 */
> + if (mid == 2)
> + return ((val & 7) == 0) ? "TP" : "TP|VFD";
> +
> + /* For mid=1 just return "UCHE" as a catchall for everything else */
> + return "UCHE";
> +}
> +
> +static const char *a6xx_fault_block(struct msm_gpu *gpu, u32 id)
> +{
> + if (id == 0)
> + return "CP";
> + else if (id == 4)
> + return "CCU";
> + else if (id == 6)
> + return "CDP Prefetch";
> +
> + return a6xx_uche_fault_block(gpu, id);
> +}
> +
> +#define ARM_SMMU_FSR_TF BIT(1)
> +#define ARM_SMMU_FSR_PF  BIT(3)
> +#define ARM_SMMU_FSR_EF  BIT(4)
> +
> +static int a6xx_fault_handler(void *arg, unsigned long iova, int flags, void 
> *data)
>  {
>   struct msm_gpu *gpu = arg;
> + struct adreno_smmu_fault_info *info = data;
> + const char *type = "UNKNOWN";
>
> - pr_warn_ratelimited("*** gpu fault: iova=%08lx, flags=%d 
> (%u,%u,%u,%u)\n",
> + /*
> +  * Print a default message if we couldn't get the data from the
> +  * adreno-smmu-priv
> +  */
> + if (!info) {
> + pr_warn_ratelimited("*** gpu fault: iova=%.16lx flags=%d 
> (%u,%u,%u,%u)\n",
>   iova, flags,
>   gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(4)),
>   gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(5)),
>   gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(6)),
>   gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(7)));
>
> - return -EFAULT;
> + return 0;
> + }
> +
> + if (info->fsr & ARM_SMMU_FSR_TF)
> + type = "TRANSLATION";
> + else if (info->fsr & ARM_SMMU_FSR_PF)
> + type = "PERMISSION";
> + else if (info->fsr & ARM_SMMU_FSR_EF)
> + type = "EXTERNAL";
> +
> + pr_warn_ratelimited("*** gpu fault: ttbr0=%.16llx iova=%.16lx dir=%s 
> type=%s source=%s (%u,%u,%u,%u)\n",
> + info->ttbr0, iova,
> + flags & IOMMU_FAULT_WRITE ? "WRITE" : "READ", type,
> + a6xx_fault_block(gpu, info->fsynr1 & 0xff),
> + gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(4))

Re: [PATCH v5 3/5] drm/msm: Improve the a6xx page fault handler

2021-06-25 Thread Rob Clark
On Thu, Jun 24, 2021 at 8:39 PM Bjorn Andersson
 wrote:
>
> On Thu 10 Jun 16:44 CDT 2021, Rob Clark wrote:
> [..]
> > diff --git a/drivers/gpu/drm/msm/msm_iommu.c 
> > b/drivers/gpu/drm/msm/msm_iommu.c
> > index 50d881794758..6975b95c3c29 100644
> > --- a/drivers/gpu/drm/msm/msm_iommu.c
> > +++ b/drivers/gpu/drm/msm/msm_iommu.c
> > @@ -211,8 +211,17 @@ static int msm_fault_handler(struct iommu_domain 
> > *domain, struct device *dev,
> >   unsigned long iova, int flags, void *arg)
> >  {
> >   struct msm_iommu *iommu = arg;
> > + struct adreno_smmu_priv *adreno_smmu = 
> > dev_get_drvdata(iommu->base.dev);
> > + struct adreno_smmu_fault_info info, *ptr = NULL;
> > +
> > + if (adreno_smmu->get_fault_info) {
>
> This seemed reasonable when I read it last time, but I didn't realize
> that the msm_fault_handler() is installed for all msm_iommu instances.
>
> So while we're trying to recover from the boot splash and setup the new
> framebuffer we end up here with iommu->base.dev being the mdss device.
> Naturally drvdata of mdss is not a struct adreno_smmu_priv.
>
> > + adreno_smmu->get_fault_info(adreno_smmu->cookie, &info);
>
> So here we just jump straight out into hyperspace, never to return.
>
> Not sure how to wire this up to avoid the problem, but right now I don't
> think we can boot any device with a boot splash.
>

I think we could do:


diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c
index eed2a762e9dd..30ee8866154e 100644
--- a/drivers/gpu/drm/msm/msm_iommu.c
+++ b/drivers/gpu/drm/msm/msm_iommu.c
@@ -29,6 +29,9 @@ static struct msm_iommu_pagetable
*to_pagetable(struct msm_mmu *mmu)
  return container_of(mmu, struct msm_iommu_pagetable, base);
 }

+static int msm_fault_handler(struct iommu_domain *domain, struct device *dev,
+ unsigned long iova, int flags, void *arg);
+
 static int msm_iommu_pagetable_unmap(struct msm_mmu *mmu, u64 iova,
  size_t size)
 {
@@ -151,6 +154,8 @@ struct msm_mmu *msm_iommu_pagetable_create(struct
msm_mmu *parent)
  struct io_pgtable_cfg ttbr0_cfg;
  int ret;

+ iommu_set_fault_handler(iommu->domain, msm_fault_handler, iommu);
+
  /* Get the pagetable configuration from the domain */
  if (adreno_smmu->cookie)
  ttbr1_cfg = adreno_smmu->get_ttbr1_cfg(adreno_smmu->cookie);
@@ -300,7 +305,6 @@ struct msm_mmu *msm_iommu_new(struct device *dev,
struct iommu_domain *domain)

  iommu->domain = domain;
  msm_mmu_init(&iommu->base, dev, &funcs, MSM_MMU_IOMMU);
- iommu_set_fault_handler(domain, msm_fault_handler, iommu);

  atomic_set(&iommu->pagetables, 0);



That would have the result of setting the same fault handler multiple
times, but that looks harmless.  Mostly the fault handling stuff is to
make it easier to debug userspace issues, the fallback dmesg spam from
arm-smmu should be sufficient for any kernel side issues.

BR,
-R


Re: [PATCH v5 3/5] drm/msm: Improve the a6xx page fault handler

2021-06-24 Thread Bjorn Andersson
On Thu 10 Jun 16:44 CDT 2021, Rob Clark wrote:
[..]
> diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c
> index 50d881794758..6975b95c3c29 100644
> --- a/drivers/gpu/drm/msm/msm_iommu.c
> +++ b/drivers/gpu/drm/msm/msm_iommu.c
> @@ -211,8 +211,17 @@ static int msm_fault_handler(struct iommu_domain 
> *domain, struct device *dev,
>   unsigned long iova, int flags, void *arg)
>  {
>   struct msm_iommu *iommu = arg;
> + struct adreno_smmu_priv *adreno_smmu = dev_get_drvdata(iommu->base.dev);
> + struct adreno_smmu_fault_info info, *ptr = NULL;
> +
> + if (adreno_smmu->get_fault_info) {

This seemed reasonable when I read it last time, but I didn't realize
that the msm_fault_handler() is installed for all msm_iommu instances.

So while we're trying to recover from the boot splash and setup the new
framebuffer we end up here with iommu->base.dev being the mdss device.
Naturally drvdata of mdss is not a struct adreno_smmu_priv.

> + adreno_smmu->get_fault_info(adreno_smmu->cookie, &info);

So here we just jump straight out into hyperspace, never to return.

Not sure how to wire this up to avoid the problem, but right now I don't
think we can boot any device with a boot splash.

Regards,
Bjorn

> + ptr = &info;
> + }
> +
>   if (iommu->base.handler)
> - return iommu->base.handler(iommu->base.arg, iova, flags);
> + return iommu->base.handler(iommu->base.arg, iova, flags, ptr);
> +
>   pr_warn_ratelimited("*** fault: iova=%16lx, flags=%d\n", iova, flags);
>   return 0;
>  }


Re: [PATCH v5 3/5] drm/msm: Improve the a6xx page fault handler

2021-06-15 Thread Akhil P Oommen

On 6/11/2021 3:14 AM, Rob Clark wrote:

From: Jordan Crouse 

Use the new adreno-smmu-priv fault info function to get more SMMU
debug registers and print the current TTBR0 to debug per-instance
pagetables and figure out which GPU block generated the request.

Signed-off-by: Jordan Crouse 
Signed-off-by: Rob Clark 
---
  drivers/gpu/drm/msm/adreno/a5xx_gpu.c |  4 +-
  drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 76 +--
  drivers/gpu/drm/msm/msm_iommu.c   | 11 +++-
  drivers/gpu/drm/msm/msm_mmu.h |  4 +-
  4 files changed, 87 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
index f46562c12022..eb030b00bff4 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
@@ -1075,7 +1075,7 @@ bool a5xx_idle(struct msm_gpu *gpu, struct msm_ringbuffer 
*ring)
return true;
  }
  
-static int a5xx_fault_handler(void *arg, unsigned long iova, int flags)

+static int a5xx_fault_handler(void *arg, unsigned long iova, int flags, void 
*data)
  {
struct msm_gpu *gpu = arg;
pr_warn_ratelimited("*** gpu fault: iova=%08lx, flags=%d 
(%u,%u,%u,%u)\n",
@@ -1085,7 +1085,7 @@ static int a5xx_fault_handler(void *arg, unsigned long 
iova, int flags)
gpu_read(gpu, REG_A5XX_CP_SCRATCH_REG(6)),
gpu_read(gpu, REG_A5XX_CP_SCRATCH_REG(7)));
  
-	return -EFAULT;

+   return 0;
Shall we return -ENOSYS here to let the smmu handler print the fault 
debugging info? Due to the change here: iommu/arm-smmu: Add support for 
driver IOMMU fault handlers - arm_smmu_context_fault()



  }
  
  static void a5xx_cp_err_irq(struct msm_gpu *gpu)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index c7f0ddb12d8f..fc19db10bff1 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1032,18 +1032,88 @@ static void a6xx_recover(struct msm_gpu *gpu)
msm_gpu_hw_init(gpu);
  }
  
-static int a6xx_fault_handler(void *arg, unsigned long iova, int flags)

+static const char *a6xx_uche_fault_block(struct msm_gpu *gpu, u32 mid)
+{
+   static const char *uche_clients[7] = {
+   "VFD", "SP", "VSC", "VPC", "HLSQ", "PC", "LRZ",
+   };
+   u32 val;
+
+   if (mid < 1 || mid > 3)
+   return "UNKNOWN";
+
+   /*
+* The source of the data depends on the mid ID read from FSYNR1.
+* and the client ID read from the UCHE block
+*/
+   val = gpu_read(gpu, REG_A6XX_UCHE_CLIENT_PF);

Are you sure gx rail is up at this point?


+
+   /* mid = 3 is most precise and refers to only one block per client */
+   if (mid == 3)
+   return uche_clients[val & 7];
+
+   /* For mid=2 the source is TP or VFD except when the client id is 0 */
+   if (mid == 2)
+   return ((val & 7) == 0) ? "TP" : "TP|VFD";
+
+   /* For mid=1 just return "UCHE" as a catchall for everything else */
+   return "UCHE";
+}
UCHE decoding scheme is different for a660. I am not sure who should 
handle that. Jonathan?



+
+static const char *a6xx_fault_block(struct msm_gpu *gpu, u32 id)
+{
+   if (id == 0)
+   return "CP";
+   else if (id == 4)
+   return "CCU";
+   else if (id == 6)
+   return "CDP Prefetch";

I guess id == 7 is not possible here.


+
+   return a6xx_uche_fault_block(gpu, id);
+}
+
+#define ARM_SMMU_FSR_TF BIT(1)
+#define ARM_SMMU_FSR_PFBIT(3)
+#define ARM_SMMU_FSR_EFBIT(4)
+
+static int a6xx_fault_handler(void *arg, unsigned long iova, int flags, void 
*data)
  {
struct msm_gpu *gpu = arg;
+   struct adreno_smmu_fault_info *info = data;
+   const char *type = "UNKNOWN";
  
-	pr_warn_ratelimited("*** gpu fault: iova=%08lx, flags=%d (%u,%u,%u,%u)\n",

+   /*
+* Print a default message if we couldn't get the data from the
+* adreno-smmu-priv
+*/
+   if (!info) {
+   pr_warn_ratelimited("*** gpu fault: iova=%.16lx flags=%d 
(%u,%u,%u,%u)\n",
iova, flags,
gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(4)),
gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(5)),
gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(6)),
gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(7)));
  
-	return -EFAULT;

+   return 0;

return -ENOSYS here so that the smmu driver prints some extra debug data?

+   }
+
+   if (info->fsr & ARM_SMMU_FSR_TF)
+   type = "TRANSLATION";
+   else if (info->fsr & ARM_SMMU_FSR_PF)
+   type = "PERMISSION";
+   else if (info->fsr & ARM_SMMU_FSR_EF)
+   type = "EXTERNAL";
+
+   pr_warn_ratelimited("*** gpu fault: ttbr0=%.16llx iova=%.16lx dir=%s type=%s 
source=%s

Re: [PATCH v5 3/5] drm/msm: Improve the a6xx page fault handler

2021-06-14 Thread Bjorn Andersson
On Thu 10 Jun 16:44 CDT 2021, Rob Clark wrote:

> From: Jordan Crouse 
> 
> Use the new adreno-smmu-priv fault info function to get more SMMU
> debug registers and print the current TTBR0 to debug per-instance
> pagetables and figure out which GPU block generated the request.
> 

Acked-by: Bjorn Andersson 

Regards,
Bjorn

> Signed-off-by: Jordan Crouse 
> Signed-off-by: Rob Clark 
> ---
>  drivers/gpu/drm/msm/adreno/a5xx_gpu.c |  4 +-
>  drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 76 +--
>  drivers/gpu/drm/msm/msm_iommu.c   | 11 +++-
>  drivers/gpu/drm/msm/msm_mmu.h |  4 +-
>  4 files changed, 87 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c 
> b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> index f46562c12022..eb030b00bff4 100644
> --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> @@ -1075,7 +1075,7 @@ bool a5xx_idle(struct msm_gpu *gpu, struct 
> msm_ringbuffer *ring)
>   return true;
>  }
>  
> -static int a5xx_fault_handler(void *arg, unsigned long iova, int flags)
> +static int a5xx_fault_handler(void *arg, unsigned long iova, int flags, void 
> *data)
>  {
>   struct msm_gpu *gpu = arg;
>   pr_warn_ratelimited("*** gpu fault: iova=%08lx, flags=%d 
> (%u,%u,%u,%u)\n",
> @@ -1085,7 +1085,7 @@ static int a5xx_fault_handler(void *arg, unsigned long 
> iova, int flags)
>   gpu_read(gpu, REG_A5XX_CP_SCRATCH_REG(6)),
>   gpu_read(gpu, REG_A5XX_CP_SCRATCH_REG(7)));
>  
> - return -EFAULT;
> + return 0;
>  }
>  
>  static void a5xx_cp_err_irq(struct msm_gpu *gpu)
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
> b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> index c7f0ddb12d8f..fc19db10bff1 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> @@ -1032,18 +1032,88 @@ static void a6xx_recover(struct msm_gpu *gpu)
>   msm_gpu_hw_init(gpu);
>  }
>  
> -static int a6xx_fault_handler(void *arg, unsigned long iova, int flags)
> +static const char *a6xx_uche_fault_block(struct msm_gpu *gpu, u32 mid)
> +{
> + static const char *uche_clients[7] = {
> + "VFD", "SP", "VSC", "VPC", "HLSQ", "PC", "LRZ",
> + };
> + u32 val;
> +
> + if (mid < 1 || mid > 3)
> + return "UNKNOWN";
> +
> + /*
> +  * The source of the data depends on the mid ID read from FSYNR1.
> +  * and the client ID read from the UCHE block
> +  */
> + val = gpu_read(gpu, REG_A6XX_UCHE_CLIENT_PF);
> +
> + /* mid = 3 is most precise and refers to only one block per client */
> + if (mid == 3)
> + return uche_clients[val & 7];
> +
> + /* For mid=2 the source is TP or VFD except when the client id is 0 */
> + if (mid == 2)
> + return ((val & 7) == 0) ? "TP" : "TP|VFD";
> +
> + /* For mid=1 just return "UCHE" as a catchall for everything else */
> + return "UCHE";
> +}
> +
> +static const char *a6xx_fault_block(struct msm_gpu *gpu, u32 id)
> +{
> + if (id == 0)
> + return "CP";
> + else if (id == 4)
> + return "CCU";
> + else if (id == 6)
> + return "CDP Prefetch";
> +
> + return a6xx_uche_fault_block(gpu, id);
> +}
> +
> +#define ARM_SMMU_FSR_TF BIT(1)
> +#define ARM_SMMU_FSR_PF  BIT(3)
> +#define ARM_SMMU_FSR_EF  BIT(4)
> +
> +static int a6xx_fault_handler(void *arg, unsigned long iova, int flags, void 
> *data)
>  {
>   struct msm_gpu *gpu = arg;
> + struct adreno_smmu_fault_info *info = data;
> + const char *type = "UNKNOWN";
>  
> - pr_warn_ratelimited("*** gpu fault: iova=%08lx, flags=%d 
> (%u,%u,%u,%u)\n",
> + /*
> +  * Print a default message if we couldn't get the data from the
> +  * adreno-smmu-priv
> +  */
> + if (!info) {
> + pr_warn_ratelimited("*** gpu fault: iova=%.16lx flags=%d 
> (%u,%u,%u,%u)\n",
>   iova, flags,
>   gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(4)),
>   gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(5)),
>   gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(6)),
>   gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(7)));
>  
> - return -EFAULT;
> + return 0;
> + }
> +
> + if (info->fsr & ARM_SMMU_FSR_TF)
> + type = "TRANSLATION";
> + else if (info->fsr & ARM_SMMU_FSR_PF)
> + type = "PERMISSION";
> + else if (info->fsr & ARM_SMMU_FSR_EF)
> + type = "EXTERNAL";
> +
> + pr_warn_ratelimited("*** gpu fault: ttbr0=%.16llx iova=%.16lx dir=%s 
> type=%s source=%s (%u,%u,%u,%u)\n",
> + info->ttbr0, iova,
> + flags & IOMMU_FAULT_WRITE ? "WRITE" : "READ", type,
> + a6xx_fault_block(gpu, info->fsynr1 & 0xff),
> + gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(4))