On Fri, Oct 05, 2018 at 06:38:35PM +0530, Sharat Masetty wrote:
> The last level system cache can be partitioned into 32 different slices,
> of which the GPU has two slices preallocated. One slice is used for caching
> GPU buffers and the other is used for caching the GPU SMMU pagetables.
> This patch talks to the core system cache driver to acquire the slice
> handles, configures the SCIDs for those slices, and activates and
> deactivates the slices upon GPU power collapse and restore.
> 
> Some support from the IOMMU driver is also needed to make use of the
> system cache. IOMMU_SYS_CACHE is a buffer protection flag which enables
> caching GPU data buffers in the system cache with memory attributes such
> as outer cacheable, read-allocate and write-allocate. The GPU can then
> override a few cacheability parameters, which it uses to change
> write-allocate to write-no-allocate, as the GPU hardware does not benefit
> much from write allocation.
> 
> Similarly, DOMAIN_ATTR_USE_SYS_CACHE is a domain level attribute used by
> the IOMMU driver to set the right attributes for caching the hardware
> pagetables in the system cache.
> 
> Signed-off-by: Sharat Masetty <smase...@codeaurora.org>
> ---
>  drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 159 +++++++++++++++++++++++++++++++++-
>  drivers/gpu/drm/msm/adreno/a6xx_gpu.h |   9 ++
>  drivers/gpu/drm/msm/msm_iommu.c       |  13 +++
>  drivers/gpu/drm/msm/msm_mmu.h         |   3 +
>  4 files changed, 183 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> index 177dbfc..1790dde 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> @@ -8,6 +8,7 @@
>  #include "a6xx_gmu.xml.h"
>  
>  #include <linux/devfreq.h>
> +#include <linux/soc/qcom/llcc-qcom.h>
>  
>  static inline bool _a6xx_check_idle(struct msm_gpu *gpu)
>  {
> @@ -674,6 +675,151 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
>       ~0
>  };
>  
> +#define A6XX_LLC_NUM_GPU_SCIDS               5
> +#define A6XX_GPU_LLC_SCID_NUM_BITS   5
> +
> +#define A6XX_GPU_LLC_SCID_MASK \
> +     ((1 << (A6XX_LLC_NUM_GPU_SCIDS * A6XX_GPU_LLC_SCID_NUM_BITS)) - 1)
> +
> +#define A6XX_GPUHTW_LLC_SCID_SHIFT   25
> +#define A6XX_GPUHTW_LLC_SCID_MASK \
> +     (((1 << A6XX_GPU_LLC_SCID_NUM_BITS) - 1) << A6XX_GPUHTW_LLC_SCID_SHIFT)
> +
> +static inline void a6xx_gpu_cx_rmw(struct a6xx_llc *llc,
> +     u32 reg, u32 mask, u32 or)
> +{
> +     msm_rmw(llc->mmio + (reg << 2), mask, or);
> +}
> +
> +static void a6xx_llc_deactivate(struct msm_gpu *gpu)
> +{
> +     struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
> +     struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
> +     struct a6xx_llc *llc = &a6xx_gpu->llc;
> +
> +     llcc_slice_deactivate(llc->gpu_llc_slice);
> +     llcc_slice_deactivate(llc->gpuhtw_llc_slice);
> +}
> +
> +static void a6xx_llc_activate(struct msm_gpu *gpu)
> +{
> +     struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
> +     struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
> +     struct a6xx_llc *llc = &a6xx_gpu->llc;
> +
> +     if (!llc->mmio)
> +             return;
> +
> +     /*
> +      * If the LLCC_GPU slice is activated, program the sub-cache ID for
> +      * all GPU blocks
> +      */
> +     if (!llcc_slice_activate(llc->gpu_llc_slice))
> +             a6xx_gpu_cx_rmw(llc,
> +                             REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_1,
> +                             A6XX_GPU_LLC_SCID_MASK,
> +                             (llc->cntl1_regval &
> +                              A6XX_GPU_LLC_SCID_MASK));
> +
> +     /*
> +      * If the LLCC_GPUHTW slice is activated, program the sub-cache ID
> +      * for the GPU pagetables
> +      */
> +     if (!llcc_slice_activate(llc->gpuhtw_llc_slice))
> +             a6xx_gpu_cx_rmw(llc,
> +                             REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_1,
> +                             A6XX_GPUHTW_LLC_SCID_MASK,
> +                             (llc->cntl1_regval &
> +                              A6XX_GPUHTW_LLC_SCID_MASK));
> +
> +     /* Program cacheability overrides */
> +     a6xx_gpu_cx_rmw(llc, REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF,
> +             llc->cntl0_regval);
> +}
> +
> +void a6xx_llc_slices_destroy(struct a6xx_llc *llc)
> +{
> +     if (llc->mmio) {
> +             iounmap(llc->mmio);
> +             llc->mmio = NULL;
> +     }
> +
> +     llcc_slice_putd(llc->gpu_llc_slice);
> +     llc->gpu_llc_slice = NULL;

I don't think these need to be put back to NULL - we shouldn't touch them again
after this point.
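
Something like this perhaps (untested sketch, just dropping the NULL writes,
otherwise identical to the patch):

	void a6xx_llc_slices_destroy(struct a6xx_llc *llc)
	{
		if (llc->mmio)
			iounmap(llc->mmio);

		llcc_slice_putd(llc->gpu_llc_slice);
		llcc_slice_putd(llc->gpuhtw_llc_slice);
	}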

> +
> +     llcc_slice_putd(llc->gpuhtw_llc_slice);
> +     llc->gpuhtw_llc_slice = NULL;
> +}
> +
> +static int a6xx_llc_slices_init(struct platform_device *pdev,
> +             struct a6xx_llc *llc)
> +{
> +     int i;
> +
> +     /* Map registers */
> +     llc->mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
> +     if (IS_ERR(llc->mmio)) {
> +             llc->mmio = NULL;
> +             return -1;

Return a valid error code here even if we don't care what it is.  -ENODEV maybe.
And in fact, if we don't care what it is (LLCC is very optional) then just don't
return anything at all.
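
For the void flavor the error path would just be, e.g. (untested):

	llc->mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
	if (IS_ERR(llc->mmio)) {
		llc->mmio = NULL;
		return;	/* LLCC is optional, carry on without it */
	}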

> +     }
> +
> +     /* Get the system cache slice descriptor for GPU and GPUHTWs */
> +     llc->gpu_llc_slice = llcc_slice_getd(LLCC_GPU);
> +     llc->gpuhtw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
> +     if (IS_ERR(llc->gpu_llc_slice) && IS_ERR(llc->gpuhtw_llc_slice))
> +             return -1;
> +
> +     /*
> +      * Set up the GPU system cache CNTL0 and CNTL1 register values.
> +      * These values will be programmed every time the GPU comes out
> +      * of power collapse as these are non-retention registers.
> +      */
> +
> +     /*
> +      * CNTL0 provides options to override the settings for the
> +      * read and write allocation policies for the LLC. These
> +      * overrides are global for all memory transactions from
> +      * the GPU.
> +      *
> +      * 0x3: read-no-alloc-overridden = 0
> +      *      read-no-alloc = 0 - Allocate lines on read miss
> +      *      write-no-alloc-overridden = 1
> +      *      write-no-alloc = 1 - Do not allocate lines on write miss
> +      */
> +     llc->cntl0_regval = 0x03;
> +
> +     /*
> +      * CNTL1 is used to specify SCID for (CP, TP, VFD, CCU and UBWC
> +      * FLAG cache) GPU blocks. This value will be passed along with
> +      * the address for any memory transaction from GPU to identify
> +      * the sub-cache for that transaction.
> +      *
> +      * Currently there is only one SCID allocated for all GPU blocks.
> +      * Hence set the same SCID for all the blocks.

This last sentence is not needed

> +      */
> +
> +     if (!IS_ERR(llc->gpu_llc_slice)) {
> +             u32 gpu_scid = llcc_get_slice_id(llc->gpu_llc_slice);
> +
> +             for (i = 0; i < A6XX_LLC_NUM_GPU_SCIDS; i++)
> +                     llc->cntl1_regval |=
> +                             gpu_scid << (A6XX_GPU_LLC_SCID_NUM_BITS * i);
> +     }
> +
> +     /*
> +      * Set SCID for GPU IOMMU. This will be used to access
> +      * page tables that are cached in LLC.
> +      */
> +     if (!IS_ERR(llc->gpuhtw_llc_slice)) {
> +             u32 gpuhtw_scid = llcc_get_slice_id(llc->gpuhtw_llc_slice);
> +
> +             llc->cntl1_regval |=
> +                     gpuhtw_scid << A6XX_GPUHTW_LLC_SCID_SHIFT;
> +     }
> +
> +     return 0;
> +}
> +
>  static int a6xx_pm_resume(struct msm_gpu *gpu)
>  {
>       struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
> @@ -686,6 +832,9 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)
>  
>       msm_gpu_resume_devfreq(gpu);
>  
> +     /* Activate LLC slices */
> +     a6xx_llc_activate(gpu);
> +
>       return ret;
>  }
>  
> @@ -694,6 +843,9 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu)
>       struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>       struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
>  
> +     /* Deactivate LLC slices */
> +     a6xx_llc_deactivate(gpu);
> +
>       devfreq_suspend_device(gpu->devfreq.devfreq);
>  
>       /*
> @@ -753,6 +905,8 @@ static void a6xx_destroy(struct msm_gpu *gpu)
>               drm_gem_object_unreference_unlocked(a6xx_gpu->sqe_bo);
>       }
>  
> +     a6xx_llc_slices_destroy(&a6xx_gpu->llc);
> +
>       a6xx_gmu_remove(a6xx_gpu);
>  
>       adreno_gpu_cleanup(adreno_gpu);
> @@ -819,7 +973,10 @@ struct msm_gpu *a6xx_gpu_init(struct drm_device *dev)
>       adreno_gpu->registers = a6xx_registers;
>       adreno_gpu->reg_offsets = a6xx_register_offsets;
>  
> -     ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1, 0);
> +     ret = a6xx_llc_slices_init(pdev, &a6xx_gpu->llc);

Yep - there is no reason to take a ret and not deal with it.
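
If the init goes void per the earlier comment, this could key off the mmio
pointer (or a dedicated flag) instead, e.g. (untested):

	a6xx_llc_slices_init(pdev, &a6xx_gpu->llc);

	ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1,
			a6xx_gpu->llc.mmio ? MMU_FEATURE_USE_SYSTEM_CACHE : 0);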

> +
> +     ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1,
> +                     ret ? 0 : MMU_FEATURE_USE_SYSTEM_CACHE);
>       if (ret) {
>               a6xx_destroy(&(a6xx_gpu->base.base));
>               return ERR_PTR(ret);
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> index 4127dce..86353e8 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> @@ -12,6 +12,14 @@
>  
>  extern bool hang_debug;
>  
> +struct a6xx_llc {
> +     void __iomem *mmio;
> +     void *gpu_llc_slice;
> +     void *gpuhtw_llc_slice;
> +     u32 cntl0_regval;
> +     u32 cntl1_regval;
> +};
> +
>  struct a6xx_gpu {
>       struct adreno_gpu base;
>  
> @@ -21,6 +29,7 @@ struct a6xx_gpu {
>       struct msm_ringbuffer *cur_ring;
>  
>       struct a6xx_gmu gmu;
> +     struct a6xx_llc llc;
>  };
>  
>  #define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base)
> diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c
> index e80c79b..66612c4 100644
> --- a/drivers/gpu/drm/msm/msm_iommu.c
> +++ b/drivers/gpu/drm/msm/msm_iommu.c
> @@ -38,6 +38,16 @@ static int msm_iommu_attach(struct msm_mmu *mmu, const 
> char * const *names,
>                           int cnt)
>  {
>       struct msm_iommu *iommu = to_msm_iommu(mmu);
> +     int gpu_htw_llc = 1;
> +
> +     /*
> +      * This allows the GPU to set the bus attributes required
> +      * to use the system cache on behalf of the IOMMU page table
> +      * walker.
> +      */
> +     if (msm_mmu_has_feature(mmu, MMU_FEATURE_USE_SYSTEM_CACHE))
> +             iommu_domain_set_attr(iommu->domain,
> +                             DOMAIN_ATTR_USE_SYS_CACHE, &gpu_htw_llc);
>  
>       return iommu_attach_device(iommu->domain, mmu->dev);
>  }
> @@ -56,6 +66,9 @@ static int msm_iommu_map(struct msm_mmu *mmu, uint64_t iova,
>       struct msm_iommu *iommu = to_msm_iommu(mmu);
>       size_t ret;
>  
> +     if (msm_mmu_has_feature(mmu, MMU_FEATURE_USE_SYSTEM_CACHE))
> +             prot |= IOMMU_SYS_CACHE;
> +
>       ret = iommu_map_sg(iommu->domain, iova, sgt->sgl, sgt->nents, prot);
>       WARN_ON(ret < 0);
>  
> diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h
> index 9b9f43f..524790b 100644
> --- a/drivers/gpu/drm/msm/msm_mmu.h
> +++ b/drivers/gpu/drm/msm/msm_mmu.h
> @@ -49,6 +49,9 @@ struct msm_mmu_funcs {
>       bool (*is_domain_secure)(struct msm_mmu *mmu);
>  };
>  
> +/* MMU features */
> +#define MMU_FEATURE_USE_SYSTEM_CACHE (1 << 0)
> +
>  struct msm_mmu {
>       const struct msm_mmu_funcs *funcs;
>       struct device *dev;
> -- 
> 1.9.1
> 

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project