[Intel-gfx] [PATCH 3/3] drm/i915/gtt: Setup phys pages for 3lvl pdps

2019-07-04 Thread Mika Kuoppala
If we set up backing phys pages for 3lvl pdps, even though they
are not used, we lose 5 pages per ppgtt.

Trading this memory on bsw, we gain more common code paths for all
gen8+ directory manipulation, and those paths are now free of checks
for the page directory type, making the hot paths faster.

v2: don't shortcut vm (Chris)

Signed-off-by: Mika Kuoppala 
---
 drivers/gpu/drm/i915/i915_gem_gtt.c | 77 +++--
 1 file changed, 50 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 84e119d7a5fc..b9422d592e8c 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -758,22 +758,14 @@ static struct i915_page_directory *alloc_pd(struct i915_address_space *vm)
return pd;
 }
 
-static inline bool pd_has_phys_page(const struct i915_page_directory * const pd)
-{
-   return pd->base.page;
-}
-
 static void free_pd(struct i915_address_space *vm,
struct i915_page_directory *pd)
 {
-   if (likely(pd_has_phys_page(pd)))
-   cleanup_page_dma(vm, &pd->base);
-
+   cleanup_page_dma(vm, &pd->base);
kfree(pd);
 }
 
 #define init_pd(vm, pd, to) {  \
-   GEM_DEBUG_BUG_ON(!pd_has_phys_page(pd));\
fill_px((vm), (pd), gen8_pde_encode(px_dma(to), I915_CACHE_LLC)); \
memset_p((pd)->entry, (to), 512);   \
 }
@@ -1595,6 +1587,50 @@ static void ppgtt_init(struct i915_ppgtt *ppgtt, struct intel_gt *gt)
ppgtt->vm.vma_ops.clear_pages = clear_pages;
 }
 
+static void init_pd_n(struct i915_address_space *vm,
+ struct i915_page_directory *pd,
+ struct i915_page_directory *to,
+ const unsigned int entries)
+{
+   const u64 daddr = gen8_pde_encode(px_dma(to), I915_CACHE_LLC);
+   u64 * const vaddr = kmap_atomic(pd->base.page);
+
+   memset64(vaddr, daddr, entries);
+   kunmap_atomic(vaddr);
+
+   memset_p(pd->entry, to, entries);
+}
+
+static struct i915_page_directory *
+gen8_alloc_top_pd(struct i915_address_space *vm)
+{
+   struct i915_page_directory *pd;
+
+   if (i915_vm_is_4lvl(vm)) {
+   pd = alloc_pd(vm);
+   if (!IS_ERR(pd))
+   init_pd(vm, pd, vm->scratch_pdp);
+
+   return pd;
+   }
+
+   /* 3lvl */
+   pd = __alloc_pd();
+   if (!pd)
+   return ERR_PTR(-ENOMEM);
+
+   pd->entry[GEN8_3LVL_PDPES] = NULL;
+
+   if (unlikely(setup_page_dma(vm, &pd->base))) {
+   kfree(pd);
+   return ERR_PTR(-ENOMEM);
+   }
+
+   init_pd_n(vm, pd, vm->scratch_pd, GEN8_3LVL_PDPES);
+
+   return pd;
+}
+
 /*
  * GEN8 legacy ppgtt programming is accomplished through a max 4 PDP registers
  * with a net effect resembling a 2-level page table in normal x86 terms. Each
@@ -1631,34 +1667,21 @@ static struct i915_ppgtt *gen8_ppgtt_create(struct drm_i915_private *i915)
if (err)
goto err_free;
 
-   ppgtt->pd = __alloc_pd();
-   if (!ppgtt->pd) {
-   err = -ENOMEM;
+   ppgtt->pd = gen8_alloc_top_pd(&ppgtt->vm);
+   if (IS_ERR(ppgtt->pd)) {
+   err = PTR_ERR(ppgtt->pd);
goto err_free_scratch;
}
 
if (i915_vm_is_4lvl(&ppgtt->vm)) {
-   err = setup_page_dma(&ppgtt->vm, &ppgtt->pd->base);
-   if (err)
-   goto err_free_pdp;
-
-   init_pd(&ppgtt->vm, ppgtt->pd, ppgtt->vm.scratch_pdp);
-
ppgtt->vm.allocate_va_range = gen8_ppgtt_alloc_4lvl;
ppgtt->vm.insert_entries = gen8_ppgtt_insert_4lvl;
ppgtt->vm.clear_range = gen8_ppgtt_clear_4lvl;
} else {
-   /*
-* We don't need to setup dma for top level pdp, only
-* for entries. So point entries to scratch.
-*/
-   memset_p(ppgtt->pd->entry, ppgtt->vm.scratch_pd,
-GEN8_3LVL_PDPES);
-
if (intel_vgpu_active(i915)) {
err = gen8_preallocate_top_level_pdp(ppgtt);
if (err)
-   goto err_free_pdp;
+   goto err_free_pd;
}
 
ppgtt->vm.allocate_va_range = gen8_ppgtt_alloc_3lvl;
@@ -1673,7 +1696,7 @@ static struct i915_ppgtt *gen8_ppgtt_create(struct drm_i915_private *i915)
 
return ppgtt;
 
-err_free_pdp:
+err_free_pd:
free_pd(&ppgtt->vm, ppgtt->pd);
 err_free_scratch:
gen8_free_scratch(&ppgtt->vm);
-- 
2.17.1

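[Aside: the simplification this buys can be restated in miniature.
The following is an illustrative sketch only -- mock_pd,
mock_init_pd_n(), mock_alloc_top_pd() and mock_free_pd() are invented
stand-ins, not i915 API -- but the logic mirrors gen8_alloc_top_pd()
and the now-unconditional free_pd() above: always give the top-level
directory a backing page, even on 3lvl where the hardware never reads
it, and point every entry at scratch.

#include <stdint.h>
#include <stdlib.h>

#define MOCK_3LVL_PDPES 4 /* stand-in for GEN8_3LVL_PDPES */

struct mock_pd {
	uint64_t *page;                   /* stand-in for pd->base.page */
	void *entry[MOCK_3LVL_PDPES + 1]; /* CPU-side shadow pointers */
};

/* Mirrors init_pd_n(): fill the backing page and the shadow array. */
static void mock_init_pd_n(struct mock_pd *pd, void *scratch,
			   uint64_t scratch_pde, unsigned int entries)
{
	unsigned int i;

	for (i = 0; i < entries; i++) {
		pd->page[i] = scratch_pde; /* memset64() in the patch */
		pd->entry[i] = scratch;    /* memset_p() in the patch */
	}
}

static struct mock_pd *mock_alloc_top_pd(void *scratch, uint64_t scratch_pde)
{
	struct mock_pd *pd = calloc(1, sizeof(*pd));

	if (!pd)
		return NULL;

	pd->entry[MOCK_3LVL_PDPES] = NULL;

	/* The key change: the 3lvl top level gets a real page too. */
	pd->page = calloc(512, sizeof(uint64_t));
	if (!pd->page) {
		free(pd);
		return NULL;
	}

	mock_init_pd_n(pd, scratch, scratch_pde, MOCK_3LVL_PDPES);
	return pd;
}

/* Teardown is unconditional, as in the new free_pd(). */
static void mock_free_pd(struct mock_pd *pd)
{
	free(pd->page); /* cleanup_page_dma() in the patch */
	free(pd);
}

int main(void)
{
	int scratch; /* stands in for vm->scratch_pd */
	struct mock_pd *pd = mock_alloc_top_pd(&scratch, 0x1ULL);

	if (!pd)
		return 1;
	mock_free_pd(pd); /* no pd_has_phys_page() check needed */
	return 0;
}

The trade is the one the commit message describes: a few extra
backing pages on bsw in exchange for uniform alloc/free paths.]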

Re: [Intel-gfx] [PATCH 3/3] drm/i915/gtt: Setup phys pages for 3lvl pdps

2019-06-18 Thread Chris Wilson
Quoting Mika Kuoppala (2019-06-18 17:17:31)
> If we setup backing phys page for 3lvl pdps, even they
   even though they
> are not used, we lose 5 pages per ppgtt.
> 
> Trading this memory on bsw, we gain more common code paths for all
> gen8+ directory manipulation. And those paths are now void of checks
> for page directory type, making the hot paths faster.
> 
> Signed-off-by: Mika Kuoppala 
> ---
>  drivers/gpu/drm/i915/i915_gem_gtt.c | 106 +---
>  1 file changed, 66 insertions(+), 40 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index b521b1ddd19b..ea78302c6348 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -715,22 +715,14 @@ static struct i915_page_directory *alloc_pd(struct i915_address_space *vm)
> return pd;
>  }
>  
> -static inline bool pd_has_phys_page(const struct i915_page_directory * const pd)
> -{
> -   return pd->base.page;
> -}
> -
>  static void free_pd(struct i915_address_space *vm,
> struct i915_page_directory *pd)
>  {
> -   if (likely(pd_has_phys_page(pd)))
> -   cleanup_page_dma(vm, &pd->base);
> -
> +   cleanup_page_dma(vm, &pd->base);
> kfree(pd);
>  }
>  
>  #define init_pd(vm, pd, to) {  \
> -   GEM_DEBUG_BUG_ON(!pd_has_phys_page(pd));\
> fill_px((vm), (pd), gen8_pde_encode(px_dma(to), I915_CACHE_LLC)); \
> memset_p((pd)->entry, (to), 512);   \
>  }
> @@ -1539,6 +1531,50 @@ static void ppgtt_init(struct drm_i915_private *i915,
> ppgtt->vm.vma_ops.clear_pages = clear_pages;
>  }
>  
> +static void init_pd_n(struct i915_address_space *vm,
> + struct i915_page_directory *pd,
> + struct i915_page_directory *to,
> + const unsigned int entries)
> +{
> +   const u64 daddr = gen8_pde_encode(px_dma(to), I915_CACHE_LLC);
> +   u64 * const vaddr = kmap_atomic(pd->base.page);
> +
> +   memset64(vaddr, daddr, entries);
> +   kunmap_atomic(vaddr);
> +
> +   memset_p(pd->entry, to, entries);
> +}
> +
> +static struct i915_page_directory *
> +gen8_alloc_top_pd(struct i915_address_space *vm)
> +{
> +   struct i915_page_directory *pd;
> +
> +   if (i915_vm_is_4lvl(vm)) {
> +   pd = alloc_pd(vm);
> +   if (!IS_ERR(pd))
> +   init_pd(vm, pd, vm->scratch_pdp);
> +
> +   return pd;
> +   }
> +
> +   /* 3lvl */
> +   pd = __alloc_pd();
> +   if (!pd)
> +   return ERR_PTR(-ENOMEM);
> +
> +   pd->entry[GEN8_3LVL_PDPES] = NULL;
> +
> +   if (unlikely(setup_page_dma(vm, &pd->base))) {
> +   kfree(pd);
> +   return ERR_PTR(-ENOMEM);
> +   }
> +
> +   init_pd_n(vm, pd, vm->scratch_pd, GEN8_3LVL_PDPES);
> +
> +   return pd;
> +}
> +
>  /*
>   * GEN8 legacy ppgtt programming is accomplished through a max 4 PDP registers
>   * with a net effect resembling a 2-level page table in normal x86 terms. Each
> @@ -1548,6 +1584,7 @@ static void ppgtt_init(struct drm_i915_private *i915,
>   */
>  static struct i915_ppgtt *gen8_ppgtt_create(struct drm_i915_private *i915)
>  {
> +   struct i915_address_space *vm;
> struct i915_ppgtt *ppgtt;
> int err;
>  
> @@ -1557,70 +1594,59 @@ static struct i915_ppgtt *gen8_ppgtt_create(struct drm_i915_private *i915)
>  
> ppgtt_init(i915, ppgtt);
>  
> +   vm = &ppgtt->vm;

Been having this debate with Tursulin, whether or not it is more
confusing to have a local alias here. I think on reading it, it is much
clearer that we are setting up one object if we use ppgtt->vm.foo than
it is to alternate between ppgtt->foo and vm->bar.

I'd suggest leaving it as ppgtt->vm.foo in this patch.
-Chris
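[Aside: the style question in miniature. mock_vm and mock_ppgtt below
are invented types, not the real i915 definitions; the two functions
show the alternatives being debated, with has_read_only standing in
for the vm->has_read_only = INTEL_GEN(i915) != 11 assignment from the
patch.

struct mock_vm {
	int has_read_only;
	int pt_kmap_wc;
};

struct mock_ppgtt {
	struct mock_vm vm;
};

static void setup_with_alias(struct mock_ppgtt *ppgtt, int gen)
{
	struct mock_vm *vm = &ppgtt->vm; /* local alias, as in v1 */

	vm->has_read_only = gen != 11;
	vm->pt_kmap_wc = 1;
}

static void setup_without_alias(struct mock_ppgtt *ppgtt, int gen)
{
	/* Spelled out each time: longer lines, but it stays obvious
	 * that one embedded object is being initialised. */
	ppgtt->vm.has_read_only = gen != 11;
	ppgtt->vm.pt_kmap_wc = 1;
}

int main(void)
{
	struct mock_ppgtt a = {{ 0, 0 }}, b = {{ 0, 0 }};

	setup_with_alias(&a, 11);
	setup_without_alias(&b, 11);
	return a.vm.has_read_only != b.vm.has_read_only;
}

The alias saves repetition once many ppgtt->vm.* assignments
accumulate; spelling it out keeps it obvious that a single embedded
object is being set up, which is the concern raised above. v2 of the
patch follows the spelled-out form.]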

[Intel-gfx] [PATCH 3/3] drm/i915/gtt: Setup phys pages for 3lvl pdps

2019-06-18 Thread Mika Kuoppala
If we setup backing phys page for 3lvl pdps, even they
are not used, we lose 5 pages per ppgtt.

Trading this memory on bsw, we gain more common code paths for all
gen8+ directory manipulation. And those paths are now void of checks
for page directory type, making the hot paths faster.

Signed-off-by: Mika Kuoppala 
---
 drivers/gpu/drm/i915/i915_gem_gtt.c | 106 +---
 1 file changed, 66 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index b521b1ddd19b..ea78302c6348 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -715,22 +715,14 @@ static struct i915_page_directory *alloc_pd(struct i915_address_space *vm)
return pd;
 }
 
-static inline bool pd_has_phys_page(const struct i915_page_directory * const pd)
-{
-   return pd->base.page;
-}
-
 static void free_pd(struct i915_address_space *vm,
struct i915_page_directory *pd)
 {
-   if (likely(pd_has_phys_page(pd)))
-   cleanup_page_dma(vm, &pd->base);
-
+   cleanup_page_dma(vm, &pd->base);
kfree(pd);
 }
 
 #define init_pd(vm, pd, to) {  \
-   GEM_DEBUG_BUG_ON(!pd_has_phys_page(pd));\
fill_px((vm), (pd), gen8_pde_encode(px_dma(to), I915_CACHE_LLC)); \
memset_p((pd)->entry, (to), 512);   \
 }
@@ -1539,6 +1531,50 @@ static void ppgtt_init(struct drm_i915_private *i915,
ppgtt->vm.vma_ops.clear_pages = clear_pages;
 }
 
+static void init_pd_n(struct i915_address_space *vm,
+ struct i915_page_directory *pd,
+ struct i915_page_directory *to,
+ const unsigned int entries)
+{
+   const u64 daddr = gen8_pde_encode(px_dma(to), I915_CACHE_LLC);
+   u64 * const vaddr = kmap_atomic(pd->base.page);
+
+   memset64(vaddr, daddr, entries);
+   kunmap_atomic(vaddr);
+
+   memset_p(pd->entry, to, entries);
+}
+
+static struct i915_page_directory *
+gen8_alloc_top_pd(struct i915_address_space *vm)
+{
+   struct i915_page_directory *pd;
+
+   if (i915_vm_is_4lvl(vm)) {
+   pd = alloc_pd(vm);
+   if (!IS_ERR(pd))
+   init_pd(vm, pd, vm->scratch_pdp);
+
+   return pd;
+   }
+
+   /* 3lvl */
+   pd = __alloc_pd();
+   if (!pd)
+   return ERR_PTR(-ENOMEM);
+
+   pd->entry[GEN8_3LVL_PDPES] = NULL;
+
+   if (unlikely(setup_page_dma(vm, &pd->base))) {
+   kfree(pd);
+   return ERR_PTR(-ENOMEM);
+   }
+
+   init_pd_n(vm, pd, vm->scratch_pd, GEN8_3LVL_PDPES);
+
+   return pd;
+}
+
 /*
  * GEN8 legacy ppgtt programming is accomplished through a max 4 PDP registers
  * with a net effect resembling a 2-level page table in normal x86 terms. Each
@@ -1548,6 +1584,7 @@ static void ppgtt_init(struct drm_i915_private *i915,
  */
 static struct i915_ppgtt *gen8_ppgtt_create(struct drm_i915_private *i915)
 {
+   struct i915_address_space *vm;
struct i915_ppgtt *ppgtt;
int err;
 
@@ -1557,70 +1594,59 @@ static struct i915_ppgtt *gen8_ppgtt_create(struct drm_i915_private *i915)
 
ppgtt_init(i915, ppgtt);
 
+   vm = &ppgtt->vm;
+
/*
 * From bdw, there is hw support for read-only pages in the PPGTT.
 *
 * Gen11 has HSDES#:1807136187 unresolved. Disable ro support
 * for now.
 */
-   ppgtt->vm.has_read_only = INTEL_GEN(i915) != 11;
+   vm->has_read_only = INTEL_GEN(i915) != 11;
 
/* There are only few exceptions for gen >=6. chv and bxt.
 * And we are not sure about the latter so play safe for now.
 */
if (IS_CHERRYVIEW(i915) || IS_BROXTON(i915))
-   ppgtt->vm.pt_kmap_wc = true;
+   vm->pt_kmap_wc = true;
 
-   err = gen8_init_scratch(&ppgtt->vm);
+   err = gen8_init_scratch(vm);
if (err)
goto err_free;
 
-   ppgtt->pd = __alloc_pd();
-   if (!ppgtt->pd) {
-   err = -ENOMEM;
+   ppgtt->pd = gen8_alloc_top_pd(vm);
+   if (IS_ERR(ppgtt->pd)) {
+   err = PTR_ERR(ppgtt->pd);
goto err_free_scratch;
}
 
-   if (i915_vm_is_4lvl(&ppgtt->vm)) {
-   err = setup_page_dma(&ppgtt->vm, &ppgtt->pd->base);
-   if (err)
-   goto err_free_pdp;
-
-   init_pd(&ppgtt->vm, ppgtt->pd, ppgtt->vm.scratch_pdp);
-
-   ppgtt->vm.allocate_va_range = gen8_ppgtt_alloc_4lvl;
-   ppgtt->vm.insert_entries = gen8_ppgtt_insert_4lvl;
-   ppgtt->vm.clear_range = gen8_ppgtt_clear_4lvl;
+   if (i915_vm_is_4lvl(vm)) {
+   vm->allocate_va_range = gen8_ppgtt_alloc_4lvl;
+   vm->insert_entries = gen8_ppgtt_insert_4lvl;
+   vm->clear_range = gen8_ppgtt_clear_4lvl;