[Intel-gfx] [PATCH 3/3] drm/i915/gtt: Setup phys pages for 3lvl pdps
If we setup backing phys page for 3lvl pdps, even though they are not
used, we lose 5 pages per ppgtt.

Trading this memory on bsw, we gain more common code paths for all
gen8+ directory manipulation. And those paths are now void of checks
for page directory type, making the hot paths faster.

v2: don't shortcut vm (Chris)

Signed-off-by: Mika Kuoppala
---
 drivers/gpu/drm/i915/i915_gem_gtt.c | 77 +++--
 1 file changed, 50 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 84e119d7a5fc..b9422d592e8c 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -758,22 +758,14 @@ static struct i915_page_directory *alloc_pd(struct i915_address_space *vm)
 	return pd;
 }
 
-static inline bool pd_has_phys_page(const struct i915_page_directory * const pd)
-{
-	return pd->base.page;
-}
-
 static void free_pd(struct i915_address_space *vm,
 		    struct i915_page_directory *pd)
 {
-	if (likely(pd_has_phys_page(pd)))
-		cleanup_page_dma(vm, &pd->base);
-
+	cleanup_page_dma(vm, &pd->base);
 	kfree(pd);
 }
 
 #define init_pd(vm, pd, to) { \
-	GEM_DEBUG_BUG_ON(!pd_has_phys_page(pd));\
 	fill_px((vm), (pd), gen8_pde_encode(px_dma(to), I915_CACHE_LLC)); \
 	memset_p((pd)->entry, (to), 512); \
 }
@@ -1595,6 +1587,50 @@ static void ppgtt_init(struct i915_ppgtt *ppgtt, struct intel_gt *gt)
 	ppgtt->vm.vma_ops.clear_pages = clear_pages;
 }
 
+static void init_pd_n(struct i915_address_space *vm,
+		      struct i915_page_directory *pd,
+		      struct i915_page_directory *to,
+		      const unsigned int entries)
+{
+	const u64 daddr = gen8_pde_encode(px_dma(to), I915_CACHE_LLC);
+	u64 * const vaddr = kmap_atomic(pd->base.page);
+
+	memset64(vaddr, daddr, entries);
+	kunmap_atomic(vaddr);
+
+	memset_p(pd->entry, to, entries);
+}
+
+static struct i915_page_directory *
+gen8_alloc_top_pd(struct i915_address_space *vm)
+{
+	struct i915_page_directory *pd;
+
+	if (i915_vm_is_4lvl(vm)) {
+		pd = alloc_pd(vm);
+		if (!IS_ERR(pd))
+			init_pd(vm, pd, vm->scratch_pdp);
+
+		return pd;
+	}
+
+	/* 3lvl */
+	pd = __alloc_pd();
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
+
+	pd->entry[GEN8_3LVL_PDPES] = NULL;
+
+	if (unlikely(setup_page_dma(vm, &pd->base))) {
+		kfree(pd);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	init_pd_n(vm, pd, vm->scratch_pd, GEN8_3LVL_PDPES);
+
+	return pd;
+}
+
 /*
  * GEN8 legacy ppgtt programming is accomplished through a max 4 PDP registers
  * with a net effect resembling a 2-level page table in normal x86 terms. Each
@@ -1631,34 +1667,21 @@ static struct i915_ppgtt *gen8_ppgtt_create(struct drm_i915_private *i915)
 	if (err)
 		goto err_free;
 
-	ppgtt->pd = __alloc_pd();
-	if (!ppgtt->pd) {
-		err = -ENOMEM;
+	ppgtt->pd = gen8_alloc_top_pd(&ppgtt->vm);
+	if (IS_ERR(ppgtt->pd)) {
+		err = PTR_ERR(ppgtt->pd);
 		goto err_free_scratch;
 	}
 
 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
-		err = setup_page_dma(&ppgtt->vm, &ppgtt->pd->base);
-		if (err)
-			goto err_free_pdp;
-
-		init_pd(&ppgtt->vm, ppgtt->pd, ppgtt->vm.scratch_pdp);
-
 		ppgtt->vm.allocate_va_range = gen8_ppgtt_alloc_4lvl;
 		ppgtt->vm.insert_entries = gen8_ppgtt_insert_4lvl;
 		ppgtt->vm.clear_range = gen8_ppgtt_clear_4lvl;
 	} else {
-		/*
-		 * We don't need to setup dma for top level pdp, only
-		 * for entries. So point entries to scratch.
-		 */
-		memset_p(ppgtt->pd->entry, ppgtt->vm.scratch_pd,
-			 GEN8_3LVL_PDPES);
-
 		if (intel_vgpu_active(i915)) {
 			err = gen8_preallocate_top_level_pdp(ppgtt);
 			if (err)
-				goto err_free_pdp;
+				goto err_free_pd;
 		}
 
 		ppgtt->vm.allocate_va_range = gen8_ppgtt_alloc_3lvl;
@@ -1673,7 +1696,7 @@ static struct i915_ppgtt *gen8_ppgtt_create(struct drm_i915_private *i915)
 
 	return ppgtt;
 
-err_free_pdp:
+err_free_pd:
 	free_pd(&ppgtt->vm, ppgtt->pd);
 err_free_scratch:
 	gen8_free_scratch(&ppgtt->vm);
-- 
2.17.1
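[Editorial note] The fill pattern that init_pd_n() implements above is easy to miss in diff form. Below is a minimal userspace sketch of the same idea, using invented stand-in types (struct page_directory, struct scratch_obj, encode()) rather than the driver's real API: the first `entries` slots are pointed at a scratch object both in the hardware-visible array of 64-bit descriptors and in the CPU-side shadow pointer array, and the remaining slots are left untouched.

/* Sketch only: invented types, not the i915 structures. */
#include <stdint.h>

#define DIRECTORY_ENTRIES 512

struct scratch_obj {
	uint64_t dma_addr;			/* bus address of the scratch page */
};

struct page_directory {
	uint64_t hw_desc[DIRECTORY_ENTRIES];	/* what the GPU would walk */
	void *entry[DIRECTORY_ENTRIES];		/* CPU-side shadow pointers */
};

/* Placeholder for gen8_pde_encode(): address plus illustrative flag bits. */
static uint64_t encode(uint64_t dma_addr)
{
	return dma_addr | 0x3;
}

/* Fill only the first @entries slots, like init_pd_n() in the patch. */
static void fill_first_n(struct page_directory *pd,
			 struct scratch_obj *scratch,
			 unsigned int entries)
{
	const uint64_t desc = encode(scratch->dma_addr);
	unsigned int i;

	for (i = 0; i < entries; i++) {
		pd->hw_desc[i] = desc;	/* memset64(vaddr, daddr, entries) */
		pd->entry[i] = scratch;	/* memset_p(pd->entry, to, entries) */
	}
}

For the 3lvl case the patch calls its equivalent with entries == GEN8_3LVL_PDPES and explicitly sets the slot after the last valid one to NULL, as the hunk above shows.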
Re: [Intel-gfx] [PATCH 3/3] drm/i915/gtt: Setup phys pages for 3lvl pdps
Quoting Mika Kuoppala (2019-06-18 17:17:31)
> If we setup backing phys page for 3lvl pdps, even though they
> are not used, we lose 5 pages per ppgtt.
>
> Trading this memory on bsw, we gain more common code paths for all
> gen8+ directory manipulation. And those paths are now void of checks
> for page directory type, making the hot paths faster.
>
> Signed-off-by: Mika Kuoppala
> ---
>  drivers/gpu/drm/i915/i915_gem_gtt.c | 106 +---
>  1 file changed, 66 insertions(+), 40 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index b521b1ddd19b..ea78302c6348 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -715,22 +715,14 @@ static struct i915_page_directory *alloc_pd(struct i915_address_space *vm)
>  	return pd;
>  }
>
> -static inline bool pd_has_phys_page(const struct i915_page_directory * const pd)
> -{
> -	return pd->base.page;
> -}
> -
>  static void free_pd(struct i915_address_space *vm,
>  		    struct i915_page_directory *pd)
>  {
> -	if (likely(pd_has_phys_page(pd)))
> -		cleanup_page_dma(vm, &pd->base);
> -
> +	cleanup_page_dma(vm, &pd->base);
>  	kfree(pd);
>  }
>
>  #define init_pd(vm, pd, to) { \
> -	GEM_DEBUG_BUG_ON(!pd_has_phys_page(pd));\
>  	fill_px((vm), (pd), gen8_pde_encode(px_dma(to), I915_CACHE_LLC)); \
>  	memset_p((pd)->entry, (to), 512); \
>  }
> @@ -1539,6 +1531,50 @@ static void ppgtt_init(struct drm_i915_private *i915,
>  	ppgtt->vm.vma_ops.clear_pages = clear_pages;
>  }
>
> +static void init_pd_n(struct i915_address_space *vm,
> +		      struct i915_page_directory *pd,
> +		      struct i915_page_directory *to,
> +		      const unsigned int entries)
> +{
> +	const u64 daddr = gen8_pde_encode(px_dma(to), I915_CACHE_LLC);
> +	u64 * const vaddr = kmap_atomic(pd->base.page);
> +
> +	memset64(vaddr, daddr, entries);
> +	kunmap_atomic(vaddr);
> +
> +	memset_p(pd->entry, to, entries);
> +}
> +
> +static struct i915_page_directory *
> +gen8_alloc_top_pd(struct i915_address_space *vm)
> +{
> +	struct i915_page_directory *pd;
> +
> +	if (i915_vm_is_4lvl(vm)) {
> +		pd = alloc_pd(vm);
> +		if (!IS_ERR(pd))
> +			init_pd(vm, pd, vm->scratch_pdp);
> +
> +		return pd;
> +	}
> +
> +	/* 3lvl */
> +	pd = __alloc_pd();
> +	if (!pd)
> +		return ERR_PTR(-ENOMEM);
> +
> +	pd->entry[GEN8_3LVL_PDPES] = NULL;
> +
> +	if (unlikely(setup_page_dma(vm, &pd->base))) {
> +		kfree(pd);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	init_pd_n(vm, pd, vm->scratch_pd, GEN8_3LVL_PDPES);
> +
> +	return pd;
> +}
> +
>  /*
>   * GEN8 legacy ppgtt programming is accomplished through a max 4 PDP registers
>   * with a net effect resembling a 2-level page table in normal x86 terms. Each
> @@ -1548,6 +1584,7 @@ static void ppgtt_init(struct drm_i915_private *i915,
>   */
>  static struct i915_ppgtt *gen8_ppgtt_create(struct drm_i915_private *i915)
>  {
> +	struct i915_address_space *vm;
>  	struct i915_ppgtt *ppgtt;
>  	int err;
>
> @@ -1557,70 +1594,59 @@ static struct i915_ppgtt *gen8_ppgtt_create(struct drm_i915_private *i915)
>
>  	ppgtt_init(i915, ppgtt);
>
> +	vm = &ppgtt->vm;

I've been having this debate with Tursulin about whether or not it is
more confusing to have a local alias here. I think, on reading it, it is
much clearer that we are setting up one object if we use ppgtt->vm.foo
than it is to alternate between ppgtt->foo and vm->bar.

I'd suggest leaving it as ppgtt->vm.foo in this patch.
-Chris
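[Editorial note] To make the style question concrete, here is a tiny self-contained sketch of the two spellings being weighed in the review. The struct names are stand-ins for illustration, not the driver's types.

#include <stdbool.h>

/* Stand-in types, not the i915 structures. */
struct address_space { bool has_read_only; bool pt_kmap_wc; };
struct ppgtt { struct address_space vm; };

/* Style preferred in the review: always spell out ppgtt->vm.foo,
 * so every assignment visibly targets the same embedded object.
 */
static void setup_direct(struct ppgtt *ppgtt)
{
	ppgtt->vm.has_read_only = true;
	ppgtt->vm.pt_kmap_wc = true;
}

/* Style in the v1 patch: a local alias, which can end up mixing
 * ppgtt->foo and vm->bar references in the same function.
 */
static void setup_alias(struct ppgtt *ppgtt)
{
	struct address_space *vm = &ppgtt->vm;

	vm->has_read_only = true;
	vm->pt_kmap_wc = true;
}

Per its changelog ("v2: don't shortcut vm"), the v2 patch at the top of this thread drops the alias accordingly.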
[Intel-gfx] [PATCH 3/3] drm/i915/gtt: Setup phys pages for 3lvl pdps
If we setup backing phys page for 3lvl pdps, even though they are not
used, we lose 5 pages per ppgtt.

Trading this memory on bsw, we gain more common code paths for all
gen8+ directory manipulation. And those paths are now void of checks
for page directory type, making the hot paths faster.

Signed-off-by: Mika Kuoppala
---
 drivers/gpu/drm/i915/i915_gem_gtt.c | 106 +---
 1 file changed, 66 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index b521b1ddd19b..ea78302c6348 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -715,22 +715,14 @@ static struct i915_page_directory *alloc_pd(struct i915_address_space *vm)
 	return pd;
 }
 
-static inline bool pd_has_phys_page(const struct i915_page_directory * const pd)
-{
-	return pd->base.page;
-}
-
 static void free_pd(struct i915_address_space *vm,
 		    struct i915_page_directory *pd)
 {
-	if (likely(pd_has_phys_page(pd)))
-		cleanup_page_dma(vm, &pd->base);
-
+	cleanup_page_dma(vm, &pd->base);
 	kfree(pd);
 }
 
 #define init_pd(vm, pd, to) { \
-	GEM_DEBUG_BUG_ON(!pd_has_phys_page(pd));\
 	fill_px((vm), (pd), gen8_pde_encode(px_dma(to), I915_CACHE_LLC)); \
 	memset_p((pd)->entry, (to), 512); \
 }
@@ -1539,6 +1531,50 @@ static void ppgtt_init(struct drm_i915_private *i915,
 	ppgtt->vm.vma_ops.clear_pages = clear_pages;
 }
 
+static void init_pd_n(struct i915_address_space *vm,
+		      struct i915_page_directory *pd,
+		      struct i915_page_directory *to,
+		      const unsigned int entries)
+{
+	const u64 daddr = gen8_pde_encode(px_dma(to), I915_CACHE_LLC);
+	u64 * const vaddr = kmap_atomic(pd->base.page);
+
+	memset64(vaddr, daddr, entries);
+	kunmap_atomic(vaddr);
+
+	memset_p(pd->entry, to, entries);
+}
+
+static struct i915_page_directory *
+gen8_alloc_top_pd(struct i915_address_space *vm)
+{
+	struct i915_page_directory *pd;
+
+	if (i915_vm_is_4lvl(vm)) {
+		pd = alloc_pd(vm);
+		if (!IS_ERR(pd))
+			init_pd(vm, pd, vm->scratch_pdp);
+
+		return pd;
+	}
+
+	/* 3lvl */
+	pd = __alloc_pd();
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
+
+	pd->entry[GEN8_3LVL_PDPES] = NULL;
+
+	if (unlikely(setup_page_dma(vm, &pd->base))) {
+		kfree(pd);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	init_pd_n(vm, pd, vm->scratch_pd, GEN8_3LVL_PDPES);
+
+	return pd;
+}
+
 /*
  * GEN8 legacy ppgtt programming is accomplished through a max 4 PDP registers
  * with a net effect resembling a 2-level page table in normal x86 terms. Each
@@ -1548,6 +1584,7 @@ static void ppgtt_init(struct drm_i915_private *i915,
  */
 static struct i915_ppgtt *gen8_ppgtt_create(struct drm_i915_private *i915)
 {
+	struct i915_address_space *vm;
 	struct i915_ppgtt *ppgtt;
 	int err;
 
@@ -1557,70 +1594,59 @@ static struct i915_ppgtt *gen8_ppgtt_create(struct drm_i915_private *i915)
 
 	ppgtt_init(i915, ppgtt);
 
+	vm = &ppgtt->vm;
+
 	/*
 	 * From bdw, there is hw support for read-only pages in the PPGTT.
 	 *
 	 * Gen11 has HSDES#:1807136187 unresolved. Disable ro support
 	 * for now.
 	 */
-	ppgtt->vm.has_read_only = INTEL_GEN(i915) != 11;
+	vm->has_read_only = INTEL_GEN(i915) != 11;
 
 	/* There are only few exceptions for gen >=6. chv and bxt.
 	 * And we are not sure about the latter so play safe for now.
 	 */
 	if (IS_CHERRYVIEW(i915) || IS_BROXTON(i915))
-		ppgtt->vm.pt_kmap_wc = true;
+		vm->pt_kmap_wc = true;
 
-	err = gen8_init_scratch(&ppgtt->vm);
+	err = gen8_init_scratch(vm);
 	if (err)
 		goto err_free;
 
-	ppgtt->pd = __alloc_pd();
-	if (!ppgtt->pd) {
-		err = -ENOMEM;
+	ppgtt->pd = gen8_alloc_top_pd(vm);
+	if (IS_ERR(ppgtt->pd)) {
+		err = PTR_ERR(ppgtt->pd);
 		goto err_free_scratch;
 	}
 
-	if (i915_vm_is_4lvl(&ppgtt->vm)) {
-		err = setup_page_dma(&ppgtt->vm, &ppgtt->pd->base);
-		if (err)
-			goto err_free_pdp;
-
-		init_pd(&ppgtt->vm, ppgtt->pd, ppgtt->vm.scratch_pdp);
-
-		ppgtt->vm.allocate_va_range = gen8_ppgtt_alloc_4lvl;
-		ppgtt->vm.insert_entries = gen8_ppgtt_insert_4lvl;
-		ppgtt->vm.clear_range = gen8_ppgtt_clear_4lvl;
+	if (i915_vm_is_4lvl(vm)) {
+		vm->allocate_va_range = gen8_ppgtt_alloc_4lvl;
+		vm->insert_entries = gen8_ppgtt_insert_4lvl;
+		vm->clear_range = gen8_ppgtt_clear_4lvl;