Re: [Intel-gfx] [PATCH 12/13] drm/i915: Async GPU relocation processing

2017-04-03 Thread Joonas Lahtinen
On ke, 2017-03-29 at 16:56 +0100, Chris Wilson wrote:
> If the user requires patching of their batch or auxiliary buffers, we
> currently make the alterations on the CPU. If the buffers are active on
> the GPU at the time, we wait under the struct_mutex for them to finish
> executing before we rewrite the contents. This happens when shared
> relocation trees are used between different contexts with separate
> address spaces (the buffers then have different addresses in each), as
> the 3D state must be adjusted between execution on each context.
> However, we don't need to use the CPU to do the relocation patching: we
> can queue commands to the GPU to perform it and use fences to serialise
> the operation with current and future activity, so the operation on the
> GPU appears just as atomic as performing it immediately. Performing the
> relocation rewrites on the GPU is not free; in terms of pure throughput,
> the number of relocations/s is about halved - but, more importantly, so
> is the time spent under the struct_mutex.
> 
> v2: Break out the request/batch allocation for clearer error flow.
> 
> Signed-off-by: Chris Wilson 



>  static void reloc_cache_reset(struct reloc_cache *cache)
>  {
>   void *vaddr;
>  
> + if (cache->rq)
> + reloc_gpu_flush(cache);

An odd place to do the flush; I was expecting GEM_BUG_ON(cache->rq);

I've gone through the instruction generation in one spot in the code,
and have no intention of going over it more times.

Reviewed-by: Joonas Lahtinen 

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
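
For readers following the commit message quoted above, here is a minimal
sketch of the idea: the relocation write is queued as GPU commands in a
request, ordered behind existing users of the target and ahead of future
ones. This is illustration only - sketch_queue_reloc_write() is a
hypothetical helper, the gen8+ MI_STORE_DWORD_IMM encoding is assumed, and
it emits into the request's ring rather than the separate pool batch the
patch itself uses.

/* Illustrative sketch (not code from the patch): emit one relocation
 * write on the GPU and let the request/fence machinery serialise it. */
static int sketch_queue_reloc_write(struct drm_i915_gem_request *rq,
				    struct i915_vma *target,
				    u64 offset, u32 value)
{
	u32 *cs;
	int err;

	/* Order this request after whatever is already using the target. */
	err = i915_gem_request_await_object(rq, target->obj, true);
	if (err)
		return err;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_STORE_DWORD_IMM_GEN4;	/* gen8+ form assumed */
	*cs++ = lower_32_bits(target->node.start + offset);
	*cs++ = upper_32_bits(target->node.start + offset);
	*cs++ = value;				/* the relocated address */
	intel_ring_advance(rq, cs);

	/* Future users of the target now wait for this write to land. */
	i915_vma_move_to_active(target, rq, EXEC_OBJECT_WRITE);
	return 0;
}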


[Intel-gfx] [PATCH 12/13] drm/i915: Async GPU relocation processing

2017-03-29 Thread Chris Wilson
If the user requires patching of their batch or auxiliary buffers, we
currently make the alterations on the CPU. If the buffers are active on
the GPU at the time, we wait under the struct_mutex for them to finish
executing before we rewrite the contents. This happens when shared
relocation trees are used between different contexts with separate
address spaces (the buffers then have different addresses in each), as
the 3D state must be adjusted between execution on each context.
However, we don't need to use the CPU to do the relocation patching: we
can queue commands to the GPU to perform it and use fences to serialise
the operation with current and future activity, so the operation on the
GPU appears just as atomic as performing it immediately. Performing the
relocation rewrites on the GPU is not free; in terms of pure throughput,
the number of relocations/s is about halved - but, more importantly, so
is the time spent under the struct_mutex.

v2: Break out the request/batch allocation for clearer error flow.

Signed-off-by: Chris Wilson 
---
 drivers/gpu/drm/i915/i915_gem.c|   1 -
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 223 -
 2 files changed, 217 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index f800114279d3..10f2d26cb2a9 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4263,7 +4263,6 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
GEM_BUG_ON(i915_gem_object_is_active(obj));
list_for_each_entry_safe(vma, vn,
 &obj->vma_list, obj_link) {
-   GEM_BUG_ON(!i915_vma_is_ggtt(vma));
GEM_BUG_ON(i915_vma_is_active(vma));
vma->flags &= ~I915_VMA_PIN_MASK;
i915_vma_close(vma);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 4d703e331c90..017e27b7c300 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -40,7 +40,12 @@
 #include "intel_drv.h"
 #include "intel_frontbuffer.h"
 
-#define DBG_USE_CPU_RELOC 0 /* -1 force GTT relocs; 1 force CPU relocs */
+enum {
+   FORCE_CPU_RELOC = 1,
+   FORCE_GTT_RELOC,
+   FORCE_GPU_RELOC,
+#define DBG_FORCE_RELOC 0 /* choose one of the above! */
+};
 
 #define  __EXEC_OBJECT_HAS_PIN BIT(31)
 #define  __EXEC_OBJECT_HAS_FENCE   BIT(30)
@@ -187,10 +192,15 @@ struct i915_execbuffer {
struct drm_mm_node node;
unsigned long vaddr;
unsigned int page;
+   unsigned int gen;
bool use_64bit_reloc : 1;
bool has_llc : 1;
bool has_fence : 1;
bool needs_unfenced : 1;
+
+   struct drm_i915_gem_request *rq;
+   u32 *rq_cmd;
+   unsigned int rq_size;
} reloc_cache;
u64 invalid_flags;
u32 context_flags;
@@ -441,8 +451,11 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache,
if (!i915_gem_object_has_struct_page(obj))
return false;
 
-   if (DBG_USE_CPU_RELOC)
-   return DBG_USE_CPU_RELOC > 0;
+   if (DBG_FORCE_RELOC == FORCE_CPU_RELOC)
+   return true;
+
+   if (DBG_FORCE_RELOC == FORCE_GTT_RELOC)
+   return false;
 
return (cache->has_llc ||
obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
@@ -833,11 +846,14 @@ static void reloc_cache_init(struct reloc_cache *cache,
cache->page = -1;
cache->vaddr = 0;
/* Must be a variable in the struct to allow GCC to unroll. */
+   cache->gen = INTEL_GEN(i915);
cache->has_llc = HAS_LLC(i915);
-   cache->has_fence = INTEL_GEN(i915) < 4;
-   cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
cache->use_64bit_reloc = HAS_64BIT_RELOC(i915);
+   cache->has_fence = cache->gen < 4;
+   cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
cache->node.allocated = false;
+   cache->rq = NULL;
+   cache->rq_size = 0;
 }
 
 static inline void *unmask_page(unsigned long p)
@@ -859,10 +875,24 @@ static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
return &i915->ggtt;
 }
 
+static void reloc_gpu_flush(struct reloc_cache *cache)
+{
+   GEM_BUG_ON(cache->rq_size >= cache->rq->batch->obj->base.size / sizeof(u32));
+   cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END;
+   i915_gem_object_unpin_map(cache->rq->batch->obj);
+   i915_gem_chipset_flush(cache->rq->i915);
+
+   __i915_add_request(cache->rq, true);
+   cache->rq = NULL;
+}
+
 static void reloc_cache_reset(struct reloc_cache *cache)
 {
void *vaddr;
 
+   if (cache->rq)
+           reloc_gpu_flush(cache);
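
The diff is truncated at this point in the archive. To connect the hunks
above with Joonas's comment about flushing from reloc_cache_reset(), below
is a rough sketch of how a caller might reserve space in the cached command
buffer that reloc_gpu_flush() later terminates. sketch_reloc_gpu(), its
error codes and its allocation path are assumptions for illustration, not
lines from the patch.

/* Hypothetical caller-side sketch: command space is handed out per
 * relocation, and the batch is only terminated with MI_BATCH_BUFFER_END
 * and submitted when the cache is flushed (e.g. from reloc_cache_reset()
 * or when the buffer fills up). */
static u32 *sketch_reloc_gpu(struct reloc_cache *cache, unsigned int len)
{
	u32 *cmd;

	/* Allocation of cache->rq, cache->rq_cmd and the pool batch is
	 * elided; assume a previous call has set them up. */
	if (!cache->rq)
		return ERR_PTR(-ENODEV);

	/* Leave room for the MI_BATCH_BUFFER_END that reloc_gpu_flush()
	 * appends; otherwise submit what we have and let the caller retry
	 * with a fresh batch. */
	if (cache->rq_size + len + 1 >
	    cache->rq->batch->obj->base.size / sizeof(u32)) {
		reloc_gpu_flush(cache);
		return ERR_PTR(-EAGAIN);
	}

	cmd = cache->rq_cmd + cache->rq_size;
	cache->rq_size += len;

	return cmd;
}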