Re: [Intel-gfx] [PATCH v2 2/4] drm/i915: Add provision to extend Golden context batch

2015-07-20 Thread Siluvery, Arun

On 17/07/2015 21:03, Chris Wilson wrote:

On Fri, Jul 17, 2015 at 07:13:32PM +0100, Arun Siluvery wrote:

The Golden batch carries 3D state at the beginning so that HW starts with
a known state. It is carried as a binary blob which is auto-generated from
source. The idea was it would be easier to maintain and keep the complexity
out of the kernel which makes sense as we don't really touch it. However if
you really need to update it then you need to update generator source and
keep the binary blob in sync with it.

There is a need to patch this in bxt to send one additional command to enable
a feature. A solution was to patch the binary data with some additional
data structures (included as part of auto-generator source) but it was
unnecessarily complicated.

Chris suggested the idea of having a secondary batch and execute two batch
buffers. It has clear advantages as we needn't touch the base golden batch,
can customize secondary/auxiliary batch depending on Gen and can be carried
in the driver with no dependencies.

This patch adds support for this auxiliary batch which is inserted at the
end of golden batch and is completely independent from it. Thanks to Mika
for the preliminary review.

v2: Strictly conform to the batch size requirements to cover Gen2 and
add comments to clarify overflow check in macro (Chris, Mika).

Cc: Mika Kuoppala mika.kuopp...@intel.com
Cc: Chris Wilson ch...@chris-wilson.co.uk
Cc: Armin Reese armin.c.re...@intel.com
Signed-off-by: Arun Siluvery arun.siluv...@linux.intel.com
---
  drivers/gpu/drm/i915/i915_gem_render_state.c | 45 
  drivers/gpu/drm/i915/i915_gem_render_state.h |  2 ++
  drivers/gpu/drm/i915/intel_lrc.c |  6 
  3 files changed, 53 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.c 
b/drivers/gpu/drm/i915/i915_gem_render_state.c
index b6492fe..5026a62 100644
--- a/drivers/gpu/drm/i915/i915_gem_render_state.c
+++ b/drivers/gpu/drm/i915/i915_gem_render_state.c
@@ -73,6 +73,24 @@ free_gem:
return ret;
  }

+/*
+ * Macro to add commands to auxiliary batch.
+ * This macro only checks for page overflow before inserting the commands,
+ * this is sufficient as the null state generator makes the final batch
+ * with two passes to build command and state separately. At this point
+ * the size of both are known and it compacts them by relocating the state
+ * right after the commands taking care of aligment so we should sufficient
+ * space below them for adding new commands.
+ */
+#define OUT_BATCH(batch, i, val)   \
+   do {\
+   if (WARN_ON((i) = PAGE_SIZE / sizeof(u32))) {   \
+   ret = -ENOSPC;  \
+   goto err_out;   \
+   }   \
+   (batch)[(i)++] = (val); \
+   } while(0)
+
  static int render_state_setup(struct render_state *so)
  {
const struct intel_renderstate_rodata *rodata = so-rodata;
@@ -110,6 +128,21 @@ static int render_state_setup(struct render_state *so)

d[i++] = s;
}
+
+   while (i % CACHELINE_DWORDS)
+   OUT_BATCH(d, i, MI_NOOP);
+
+   so-aux_batch_offset = i * sizeof(u32);
+
+   OUT_BATCH(d, i, MI_BATCH_BUFFER_END);
+   so-aux_batch_size = (i * sizeof(u32)) - so-aux_batch_offset;
+
+   /*
+* Since we are sending length, we need to strictly conform to
+* all requirements. For Gen2 this must be a multiple of 8.
+*/
+   so-aux_batch_size = ALIGN(so-aux_batch_size, 8);
+
kunmap(page);

ret = i915_gem_object_set_to_gtt_domain(so-obj, false);
@@ -128,6 +161,8 @@ err_out:
return ret;
  }

+#undef OUT_BATCH
+
  void i915_gem_render_state_fini(struct render_state *so)
  {
i915_gem_object_ggtt_unpin(so-obj);
@@ -176,6 +211,16 @@ int i915_gem_render_state_init(struct drm_i915_gem_request 
*req)
if (ret)
goto out;

+   if (so.aux_batch_size  8) {
+   ret = req-ring-dispatch_execbuffer(req,
+(so.ggtt_offset +
+ so.aux_batch_offset),
+so.aux_batch_size,
+I915_DISPATCH_SECURE);
+   if (ret)
+   goto out;
+   }
+
i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), req);

  out:
diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.h 
b/drivers/gpu/drm/i915/i915_gem_render_state.h
index 7aa7372..79de101 100644
--- a/drivers/gpu/drm/i915/i915_gem_render_state.h
+++ b/drivers/gpu/drm/i915/i915_gem_render_state.h
@@ -37,6 +37,8 @@ struct render_state {
struct drm_i915_gem_object *obj;
u64 

[Intel-gfx] [PATCH v2 2/4] drm/i915: Add provision to extend Golden context batch

2015-07-17 Thread Arun Siluvery
The Golden batch carries 3D state at the beginning so that HW starts with
a known state. It is carried as a binary blob which is auto-generated from
source. The idea was it would be easier to maintain and keep the complexity
out of the kernel which makes sense as we don't really touch it. However if
you really need to update it then you need to update generator source and
keep the binary blob in sync with it.

There is a need to patch this in bxt to send one additional command to enable
a feature. A solution was to patch the binary data with some additional
data structures (included as part of auto-generator source) but it was
unnecessarily complicated.

Chris suggested the idea of having a secondary batch and execute two batch
buffers. It has clear advantages as we needn't touch the base golden batch,
can customize secondary/auxiliary batch depending on Gen and can be carried
in the driver with no dependencies.

This patch adds support for this auxiliary batch which is inserted at the
end of golden batch and is completely independent from it. Thanks to Mika
for the preliminary review.

v2: Strictly conform to the batch size requirements to cover Gen2 and
add comments to clarify overflow check in macro (Chris, Mika).

Cc: Mika Kuoppala mika.kuopp...@intel.com
Cc: Chris Wilson ch...@chris-wilson.co.uk
Cc: Armin Reese armin.c.re...@intel.com
Signed-off-by: Arun Siluvery arun.siluv...@linux.intel.com
---
 drivers/gpu/drm/i915/i915_gem_render_state.c | 45 
 drivers/gpu/drm/i915/i915_gem_render_state.h |  2 ++
 drivers/gpu/drm/i915/intel_lrc.c |  6 
 3 files changed, 53 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.c 
b/drivers/gpu/drm/i915/i915_gem_render_state.c
index b6492fe..5026a62 100644
--- a/drivers/gpu/drm/i915/i915_gem_render_state.c
+++ b/drivers/gpu/drm/i915/i915_gem_render_state.c
@@ -73,6 +73,24 @@ free_gem:
return ret;
 }
 
+/*
+ * Macro to add commands to auxiliary batch.
+ * This macro only checks for page overflow before inserting the commands,
+ * this is sufficient as the null state generator makes the final batch
+ * with two passes to build command and state separately. At this point
+ * the size of both are known and it compacts them by relocating the state
+ * right after the commands taking care of aligment so we should sufficient
+ * space below them for adding new commands.
+ */
+#define OUT_BATCH(batch, i, val)   \
+   do {\
+   if (WARN_ON((i) = PAGE_SIZE / sizeof(u32))) {  \
+   ret = -ENOSPC;  \
+   goto err_out;   \
+   }   \
+   (batch)[(i)++] = (val); \
+   } while(0)
+
 static int render_state_setup(struct render_state *so)
 {
const struct intel_renderstate_rodata *rodata = so-rodata;
@@ -110,6 +128,21 @@ static int render_state_setup(struct render_state *so)
 
d[i++] = s;
}
+
+   while (i % CACHELINE_DWORDS)
+   OUT_BATCH(d, i, MI_NOOP);
+
+   so-aux_batch_offset = i * sizeof(u32);
+
+   OUT_BATCH(d, i, MI_BATCH_BUFFER_END);
+   so-aux_batch_size = (i * sizeof(u32)) - so-aux_batch_offset;
+
+   /*
+* Since we are sending length, we need to strictly conform to
+* all requirements. For Gen2 this must be a multiple of 8.
+*/
+   so-aux_batch_size = ALIGN(so-aux_batch_size, 8);
+
kunmap(page);
 
ret = i915_gem_object_set_to_gtt_domain(so-obj, false);
@@ -128,6 +161,8 @@ err_out:
return ret;
 }
 
+#undef OUT_BATCH
+
 void i915_gem_render_state_fini(struct render_state *so)
 {
i915_gem_object_ggtt_unpin(so-obj);
@@ -176,6 +211,16 @@ int i915_gem_render_state_init(struct drm_i915_gem_request 
*req)
if (ret)
goto out;
 
+   if (so.aux_batch_size  8) {
+   ret = req-ring-dispatch_execbuffer(req,
+(so.ggtt_offset +
+ so.aux_batch_offset),
+so.aux_batch_size,
+I915_DISPATCH_SECURE);
+   if (ret)
+   goto out;
+   }
+
i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), req);
 
 out:
diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.h 
b/drivers/gpu/drm/i915/i915_gem_render_state.h
index 7aa7372..79de101 100644
--- a/drivers/gpu/drm/i915/i915_gem_render_state.h
+++ b/drivers/gpu/drm/i915/i915_gem_render_state.h
@@ -37,6 +37,8 @@ struct render_state {
struct drm_i915_gem_object *obj;
u64 ggtt_offset;
int gen;
+   u32 aux_batch_size;
+   u64 aux_batch_offset;
 };
 
 int 

Re: [Intel-gfx] [PATCH v2 2/4] drm/i915: Add provision to extend Golden context batch

2015-07-17 Thread Chris Wilson
On Fri, Jul 17, 2015 at 07:13:32PM +0100, Arun Siluvery wrote:
 The Golden batch carries 3D state at the beginning so that HW starts with
 a known state. It is carried as a binary blob which is auto-generated from
 source. The idea was it would be easier to maintain and keep the complexity
 out of the kernel which makes sense as we don't really touch it. However if
 you really need to update it then you need to update generator source and
 keep the binary blob in sync with it.
 
 There is a need to patch this in bxt to send one additional command to enable
 a feature. A solution was to patch the binary data with some additional
 data structures (included as part of auto-generator source) but it was
 unnecessarily complicated.
 
 Chris suggested the idea of having a secondary batch and execute two batch
 buffers. It has clear advantages as we needn't touch the base golden batch,
 can customize secondary/auxiliary batch depending on Gen and can be carried
 in the driver with no dependencies.
 
 This patch adds support for this auxiliary batch which is inserted at the
 end of golden batch and is completely independent from it. Thanks to Mika
 for the preliminary review.
 
 v2: Strictly conform to the batch size requirements to cover Gen2 and
 add comments to clarify overflow check in macro (Chris, Mika).
 
 Cc: Mika Kuoppala mika.kuopp...@intel.com
 Cc: Chris Wilson ch...@chris-wilson.co.uk
 Cc: Armin Reese armin.c.re...@intel.com
 Signed-off-by: Arun Siluvery arun.siluv...@linux.intel.com
 ---
  drivers/gpu/drm/i915/i915_gem_render_state.c | 45 
 
  drivers/gpu/drm/i915/i915_gem_render_state.h |  2 ++
  drivers/gpu/drm/i915/intel_lrc.c |  6 
  3 files changed, 53 insertions(+)
 
 diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.c 
 b/drivers/gpu/drm/i915/i915_gem_render_state.c
 index b6492fe..5026a62 100644
 --- a/drivers/gpu/drm/i915/i915_gem_render_state.c
 +++ b/drivers/gpu/drm/i915/i915_gem_render_state.c
 @@ -73,6 +73,24 @@ free_gem:
   return ret;
  }
  
 +/*
 + * Macro to add commands to auxiliary batch.
 + * This macro only checks for page overflow before inserting the commands,
 + * this is sufficient as the null state generator makes the final batch
 + * with two passes to build command and state separately. At this point
 + * the size of both are known and it compacts them by relocating the state
 + * right after the commands taking care of aligment so we should sufficient
 + * space below them for adding new commands.
 + */
 +#define OUT_BATCH(batch, i, val) \
 + do {\
 + if (WARN_ON((i) = PAGE_SIZE / sizeof(u32))) {  \
 + ret = -ENOSPC;  \
 + goto err_out;   \
 + }   \
 + (batch)[(i)++] = (val); \
 + } while(0)
 +
  static int render_state_setup(struct render_state *so)
  {
   const struct intel_renderstate_rodata *rodata = so-rodata;
 @@ -110,6 +128,21 @@ static int render_state_setup(struct render_state *so)
  
   d[i++] = s;
   }
 +
 + while (i % CACHELINE_DWORDS)
 + OUT_BATCH(d, i, MI_NOOP);
 +
 + so-aux_batch_offset = i * sizeof(u32);
 +
 + OUT_BATCH(d, i, MI_BATCH_BUFFER_END);
 + so-aux_batch_size = (i * sizeof(u32)) - so-aux_batch_offset;
 +
 + /*
 +  * Since we are sending length, we need to strictly conform to
 +  * all requirements. For Gen2 this must be a multiple of 8.
 +  */
 + so-aux_batch_size = ALIGN(so-aux_batch_size, 8);
 +
   kunmap(page);
  
   ret = i915_gem_object_set_to_gtt_domain(so-obj, false);
 @@ -128,6 +161,8 @@ err_out:
   return ret;
  }
  
 +#undef OUT_BATCH
 +
  void i915_gem_render_state_fini(struct render_state *so)
  {
   i915_gem_object_ggtt_unpin(so-obj);
 @@ -176,6 +211,16 @@ int i915_gem_render_state_init(struct 
 drm_i915_gem_request *req)
   if (ret)
   goto out;
  
 + if (so.aux_batch_size  8) {
 + ret = req-ring-dispatch_execbuffer(req,
 +  (so.ggtt_offset +
 +   so.aux_batch_offset),
 +  so.aux_batch_size,
 +  I915_DISPATCH_SECURE);
 + if (ret)
 + goto out;
 + }
 +
   i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), req);
  
  out:
 diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.h 
 b/drivers/gpu/drm/i915/i915_gem_render_state.h
 index 7aa7372..79de101 100644
 --- a/drivers/gpu/drm/i915/i915_gem_render_state.h
 +++ b/drivers/gpu/drm/i915/i915_gem_render_state.h
 @@ -37,6 +37,8 @@ struct render_state {
   struct drm_i915_gem_object *obj;
   u64 ggtt_offset;