[Intel-gfx] [PATCH v2 10/24] drm/i915: Track GEN6 page table usage

2014-12-23 Thread Michel Thierry
From: Ben Widawsky 

Instead of implementing the full tracking + dynamic allocation, this
patch does a bit less than half of the work, by tracking and warning on
unexpected conditions. The tracking itself follows which PTEs within a
page table are currently being used for objects. The next patch will
modify this to actually allocate the page tables only when necessary.

With the current patch there isn't much in the way of making a gen
agnostic range allocation function. However, in the next patch we'll add
more specificity which makes having separate functions a bit easier to
manage.

One important change introduced here is that DMA mappings are
created/destroyed at the same page directories/tables are
allocated/deallocated.

Notice that aliasing PPGTT is not managed here. The patch which actually
begins dynamic allocation/teardown explains the reasoning for this.

v2: s/pdp.pagedir/pdp.pagedirs
Make a scratch page allocation helper

v3: Rebase and expand commit message.

v4: Allocate required pagetables only when it is needed, _bind_to_vm
instead of bind_vma (Daniel).

Cc: Daniel Vetter 
Signed-off-by: Ben Widawsky 
Signed-off-by: Michel Thierry  (v3+)
---
 drivers/gpu/drm/i915/i915_gem.c |   9 ++
 drivers/gpu/drm/i915/i915_gem_gtt.c | 277 ++--
 drivers/gpu/drm/i915/i915_gem_gtt.h | 149 ++-
 3 files changed, 322 insertions(+), 113 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 2b6ecfd..5d52990 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3597,6 +3597,15 @@ search_free:
if (ret)
goto err_remove_node;
 
+   /*  allocate before insert / bind */
+   if (vma->vm->allocate_va_range) {
+   ret = vma->vm->allocate_va_range(vma->vm,
+   vma->node.start,
+   vma->node.size);
+   if (ret)
+   goto err_remove_node;
+   }
+
trace_i915_vma_bind(vma, flags);
ret = i915_vma_bind(vma, obj->cache_level,
flags & PIN_GLOBAL ? GLOBAL_BIND : 0);
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c 
b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 52bdde7..313432e 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -138,10 +138,9 @@ static int sanitize_enable_ppgtt(struct drm_device *dev, 
int enable_ppgtt)
return has_aliasing_ppgtt ? 1 : 0;
 }
 
-
 static void ppgtt_bind_vma(struct i915_vma *vma,
-  enum i915_cache_level cache_level,
-  u32 flags);
+ enum i915_cache_level cache_level,
+ u32 flags);
 static void ppgtt_unbind_vma(struct i915_vma *vma);
 
 static inline gen8_gtt_pte_t gen8_pte_encode(dma_addr_t addr,
@@ -275,27 +274,99 @@ static gen6_gtt_pte_t iris_pte_encode(dma_addr_t addr,
return pte;
 }
 
-static void free_pt_single(struct i915_pagetab *pt)
-{
+#define i915_dma_unmap_single(px, dev) do { \
+   pci_unmap_page((dev)->pdev, (px)->daddr, 4096, PCI_DMA_BIDIRECTIONAL); \
+} while (0);
+
+/**
+ * i915_dma_map_px_single() - Create a dma mapping for a page table/dir/etc.
+ * @px:Page table/dir/etc to get a DMA map for
+ * @dev:   drm device
+ *
+ * Page table allocations are unified across all gens. They always require a
+ * single 4k allocation, as well as a DMA mapping. If we keep the structs
+ * symmetric here, the simple macro covers us for every page table type.
+ *
+ * Return: 0 if success.
+ */
+#define i915_dma_map_px_single(px, dev) \
+   pci_dma_mapping_error((dev)->pdev, \
+ (px)->daddr = pci_map_page((dev)->pdev, \
+(px)->page, 0, 4096, \
+PCI_DMA_BIDIRECTIONAL))
+
+static void __free_pt_single(struct i915_pagetab *pt, struct drm_device *dev,
+int scratch)
+{
+   if (WARN(scratch ^ pt->scratch,
+"Tried to free scratch = %d. Is scratch = %d\n",
+scratch, pt->scratch))
+   return;
+
if (WARN_ON(!pt->page))
return;
+
+   if (!scratch) {
+   const size_t count = INTEL_INFO(dev)->gen >= 8 ?
+   GEN8_PTES_PER_PAGE : I915_PPGTT_PT_ENTRIES;
+   WARN(!bitmap_empty(pt->used_ptes, count),
+"Free page table with %d used pages\n",
+bitmap_weight(pt->used_ptes, count));
+   }
+
+   i915_dma_unmap_single(pt, dev);
__free_page(pt->page);
+   kfree(pt->used_ptes);
kfree(pt);
 }
 
-static struct i915_pagetab *alloc_pt_single(void)
+#define free_pt_single(pt, dev) \
+   __free_pt_single(pt, dev, false)
+#define free_pt_scratch(pt, dev) \
+   

Re: [Intel-gfx] [PATCH v2 10/24] drm/i915: Track GEN6 page table usage

2015-01-05 Thread Daniel Vetter
On Tue, Dec 23, 2014 at 05:16:13PM +, Michel Thierry wrote:
> From: Ben Widawsky 
> 
> Instead of implementing the full tracking + dynamic allocation, this
> patch does a bit less than half of the work, by tracking and warning on
> unexpected conditions. The tracking itself follows which PTEs within a
> page table are currently being used for objects. The next patch will
> modify this to actually allocate the page tables only when necessary.
> 
> With the current patch there isn't much in the way of making a gen
> agnostic range allocation function. However, in the next patch we'll add
> more specificity which makes having separate functions a bit easier to
> manage.
> 
> One important change introduced here is that DMA mappings are
> created/destroyed at the same page directories/tables are
> allocated/deallocated.
> 
> Notice that aliasing PPGTT is not managed here. The patch which actually
> begins dynamic allocation/teardown explains the reasoning for this.
> 
> v2: s/pdp.pagedir/pdp.pagedirs
> Make a scratch page allocation helper
> 
> v3: Rebase and expand commit message.
> 
> v4: Allocate required pagetables only when it is needed, _bind_to_vm
> instead of bind_vma (Daniel).
> 
> Cc: Daniel Vetter 
> Signed-off-by: Ben Widawsky 
> Signed-off-by: Michel Thierry  (v3+)

Imo still a bit too much rebase fluff in this patch. I think it would help
the patch clarity a lot of we'd split the changes to move around the
dma_map/unmap calls from the other parts of the patch to dynamically
allocate pagetables.

Bunch more comments below.
-Daniel

> ---
>  drivers/gpu/drm/i915/i915_gem.c |   9 ++
>  drivers/gpu/drm/i915/i915_gem_gtt.c | 277 
> ++--
>  drivers/gpu/drm/i915/i915_gem_gtt.h | 149 ++-
>  3 files changed, 322 insertions(+), 113 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 2b6ecfd..5d52990 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3597,6 +3597,15 @@ search_free:
>   if (ret)
>   goto err_remove_node;
>  
> + /*  allocate before insert / bind */
> + if (vma->vm->allocate_va_range) {
> + ret = vma->vm->allocate_va_range(vma->vm,
> + vma->node.start,
> + vma->node.size);
> + if (ret)
> + goto err_remove_node;
> + }

Is this really the right patch for this hunk? The commit message sounds
like dynamic pagetable alloc is only partially implemented here ...

> +
>   trace_i915_vma_bind(vma, flags);
>   ret = i915_vma_bind(vma, obj->cache_level,
>   flags & PIN_GLOBAL ? GLOBAL_BIND : 0);
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c 
> b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 52bdde7..313432e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -138,10 +138,9 @@ static int sanitize_enable_ppgtt(struct drm_device *dev, 
> int enable_ppgtt)
>   return has_aliasing_ppgtt ? 1 : 0;
>  }
>  
> -
>  static void ppgtt_bind_vma(struct i915_vma *vma,
> -enum i915_cache_level cache_level,
> -u32 flags);
> +   enum i915_cache_level cache_level,
> +   u32 flags);
>  static void ppgtt_unbind_vma(struct i915_vma *vma);
>  
>  static inline gen8_gtt_pte_t gen8_pte_encode(dma_addr_t addr,
> @@ -275,27 +274,99 @@ static gen6_gtt_pte_t iris_pte_encode(dma_addr_t addr,
>   return pte;
>  }
>  
> -static void free_pt_single(struct i915_pagetab *pt)
> -{
> +#define i915_dma_unmap_single(px, dev) do { \
> + pci_unmap_page((dev)->pdev, (px)->daddr, 4096, PCI_DMA_BIDIRECTIONAL); \
> +} while (0);
> +
> +/**
> + * i915_dma_map_px_single() - Create a dma mapping for a page table/dir/etc.
> + * @px:  Page table/dir/etc to get a DMA map for
> + * @dev: drm device
> + *
> + * Page table allocations are unified across all gens. They always require a
> + * single 4k allocation, as well as a DMA mapping. If we keep the structs
> + * symmetric here, the simple macro covers us for every page table type.
> + *
> + * Return: 0 if success.
> + */
> +#define i915_dma_map_px_single(px, dev) \
> + pci_dma_mapping_error((dev)->pdev, \
> +   (px)->daddr = pci_map_page((dev)->pdev, \
> +  (px)->page, 0, 4096, \
> +  PCI_DMA_BIDIRECTIONAL))

Linux coding style discourages macro abuse like this, please make this a
static inline instead. Otoh I don't really see the value in hiding the
pci_map_page call, imo open-codeing this is totally ok.

But while you touch the code please switch away from the pci_map wrappers
and use the dma_map functions directly.

> +
> +static void __free_pt_single(struct i9