Re: [Mesa-dev] [PATCH 1/7] i965: Introduce the BROADCAST pseudo-opcode.

2015-04-29 Thread Matt Turner
On Fri, Feb 20, 2015 at 11:48 AM, Francisco Jerez curroje...@riseup.net wrote:
 The BROADCAST instruction picks the channel from its first source
 given by an index passed in as second source.  This will be used in
 situations where all channels from the same SIMD thread have to agree
 on the value of something, e.g. a surface binding table index.
 ---
  src/mesa/drivers/dri/i965/brw_defines.h  |  6 ++
  src/mesa/drivers/dri/i965/brw_eu.h   |  6 ++
  src/mesa/drivers/dri/i965/brw_eu_emit.c  | 77 
 
  src/mesa/drivers/dri/i965/brw_fs_generator.cpp   |  4 ++
  src/mesa/drivers/dri/i965/brw_shader.cpp |  3 +
  src/mesa/drivers/dri/i965/brw_vec4_generator.cpp |  4 ++
  6 files changed, 100 insertions(+)

 diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
 b/src/mesa/drivers/dri/i965/brw_defines.h
 index 17c27dd..d4930e3 100644
 --- a/src/mesa/drivers/dri/i965/brw_defines.h
 +++ b/src/mesa/drivers/dri/i965/brw_defines.h
 @@ -911,6 +911,12 @@ enum opcode {

 SHADER_OPCODE_URB_WRITE_SIMD8,

 +   /**
 +* Pick the channel from its first source register given by the index
 +* specified as second source.  Useful for variable indexing of surfaces.
 +*/
 +   SHADER_OPCODE_BROADCAST,
 +
 VEC4_OPCODE_MOV_BYTES,
 VEC4_OPCODE_PACK_BYTES,
 VEC4_OPCODE_UNPACK_UNIFORM,
 diff --git a/src/mesa/drivers/dri/i965/brw_eu.h 
 b/src/mesa/drivers/dri/i965/brw_eu.h
 index a94ea42..2505480 100644
 --- a/src/mesa/drivers/dri/i965/brw_eu.h
 +++ b/src/mesa/drivers/dri/i965/brw_eu.h
 @@ -413,6 +413,12 @@ brw_pixel_interpolator_query(struct brw_compile *p,
   unsigned msg_length,
   unsigned response_length);

 +void
 +brw_broadcast(struct brw_compile *p,
 +  struct brw_reg dst,
 +  struct brw_reg src,
 +  struct brw_reg idx);
 +
  /***
   * brw_eu_util.c:
   */
 diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c 
 b/src/mesa/drivers/dri/i965/brw_eu_emit.c
 index 1d6fd67..d7e3995 100644
 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
 +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
 @@ -2854,6 +2854,83 @@ brw_pixel_interpolator_query(struct brw_compile *p,
 brw_inst_set_pi_message_data(brw, insn, data);
  }

 +void
 +brw_broadcast(struct brw_compile *p,
 +  struct brw_reg dst,
 +  struct brw_reg src,
 +  struct brw_reg idx)
 +{
 +   const struct brw_context *brw = p-brw;
 +   const bool align1 = (brw_inst_access_mode(brw, p-current) == 
 BRW_ALIGN_1);

Unnecessary parentheses.

 +   brw_inst *inst;
 +
 +   assert(src.file == BRW_GENERAL_REGISTER_FILE 
 +  src.address_mode == BRW_ADDRESS_DIRECT);
 +
 +   if ((src.vstride == 0  (src.hstride == 0 || !align1)) ||
 +   idx.file == BRW_IMMEDIATE_VALUE) {
 +  /* Trivial, the source is already uniform or the index is a constant.
 +   * We will typically not get here if the optimizer is doing its job, 
 but
 +   * asserting would be mean.
 +   */
 +  const unsigned i = (idx.file == BRW_IMMEDIATE_VALUE ? idx.dw1.ud : 0);

Unnecessary parentheses.

 +  brw_MOV(p, dst,
 +  (align1 ? stride(suboffset(src, i), 0, 1, 0) :
 +   stride(suboffset(src, 4 * i), 0, 4, 1)));
 +

Extra new line.

 +   } else {
 +  if (align1) {
 + const struct brw_reg addr =
 +retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
 + const unsigned offset = src.nr * REG_SIZE + src.subnr;
 + /* Limit in bytes of the signed indirect addressing immediate. */
 + const unsigned limit = 512;
 +
 + brw_push_insn_state(p);
 + brw_set_default_mask_control(p, BRW_MASK_DISABLE);
 + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 +
 + /* Take into account the component size and horizontal stride. */
 + assert(src.vstride == src.hstride + src.width);
 + brw_SHL(p, addr, vec1(idx),
 + brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
 +src.hstride - 1));
 +
 + /* We can only address up to limit bytes using the indirect
 +  * addressing immediate, account for the difference if the source
 +  * register is above this limit.
 +  */
 + if (offset = limit)
 +brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
 +
 + brw_pop_insn_state(p);
 +
 + /* Use indirect addressing to fetch the specified component. */
 + brw_MOV(p, dst,
 + retype(brw_vec1_indirect(addr.subnr, offset % limit),
 +src.type));
 +

Extra new line.


Putting some of Ian's explanation for why this is needed into the
commit message might be good. I had to go read the piglit tests before
I really understood.
___
mesa-dev mailing list

Re: [Mesa-dev] [PATCH 1/7] i965: Introduce the BROADCAST pseudo-opcode.

2015-04-29 Thread Matt Turner
I replied with a bunch of style nits, and perhaps a suggestion to
combine the pass added in 4/7 with opt_algebraic, pending Ken's
thoughts.

In the case we want to combine 4/7 with opt_algebraic, I don't think I
need to see the final result -- I trust you can manage. :)

The series is

Reviewed-by: Matt Turner matts...@gmail.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/7] i965: Introduce the BROADCAST pseudo-opcode.

2015-02-20 Thread Ian Romanick
I want to be sure I understand what this series is doing and why it's
necessary.

Sampler arrays and UBO arrays can, with some restrictions, be indexed
dynamically.  The primary restriction is that any active SIMD channels
that access the array must access the same index.  When you get to a
particular access, not all SIMD channels may be active.  This is the
essence of what the GLSL spec calls dynamically uniform.

The implication is that the compiler is allowed to pick the index from
any active SIMD channel.  If they're not all the same, the shader author
should have followed the rules.  Currently i965 picks the index from a
SIMD channel that may or may not be active (due to other, non-uniform
flow control).  Correct?

This patch series attempts to fix that by first introducing some opcodes
to get a value from one of the active SIMD channels, then using those
opcodes to pick an array index from one of the active SIMD channels.
Correct?

On 02/20/2015 11:48 AM, Francisco Jerez wrote:
 The BROADCAST instruction picks the channel from its first source
 given by an index passed in as second source.  This will be used in
 situations where all channels from the same SIMD thread have to agree
 on the value of something, e.g. a surface binding table index.
 ---
  src/mesa/drivers/dri/i965/brw_defines.h  |  6 ++
  src/mesa/drivers/dri/i965/brw_eu.h   |  6 ++
  src/mesa/drivers/dri/i965/brw_eu_emit.c  | 77 
 
  src/mesa/drivers/dri/i965/brw_fs_generator.cpp   |  4 ++
  src/mesa/drivers/dri/i965/brw_shader.cpp |  3 +
  src/mesa/drivers/dri/i965/brw_vec4_generator.cpp |  4 ++
  6 files changed, 100 insertions(+)
 
 diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
 b/src/mesa/drivers/dri/i965/brw_defines.h
 index 17c27dd..d4930e3 100644
 --- a/src/mesa/drivers/dri/i965/brw_defines.h
 +++ b/src/mesa/drivers/dri/i965/brw_defines.h
 @@ -911,6 +911,12 @@ enum opcode {
  
 SHADER_OPCODE_URB_WRITE_SIMD8,
  
 +   /**
 +* Pick the channel from its first source register given by the index
 +* specified as second source.  Useful for variable indexing of surfaces.
 +*/
 +   SHADER_OPCODE_BROADCAST,
 +
 VEC4_OPCODE_MOV_BYTES,
 VEC4_OPCODE_PACK_BYTES,
 VEC4_OPCODE_UNPACK_UNIFORM,
 diff --git a/src/mesa/drivers/dri/i965/brw_eu.h 
 b/src/mesa/drivers/dri/i965/brw_eu.h
 index a94ea42..2505480 100644
 --- a/src/mesa/drivers/dri/i965/brw_eu.h
 +++ b/src/mesa/drivers/dri/i965/brw_eu.h
 @@ -413,6 +413,12 @@ brw_pixel_interpolator_query(struct brw_compile *p,
   unsigned msg_length,
   unsigned response_length);
  
 +void
 +brw_broadcast(struct brw_compile *p,
 +  struct brw_reg dst,
 +  struct brw_reg src,
 +  struct brw_reg idx);
 +
  /***
   * brw_eu_util.c:
   */
 diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c 
 b/src/mesa/drivers/dri/i965/brw_eu_emit.c
 index 1d6fd67..d7e3995 100644
 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
 +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
 @@ -2854,6 +2854,83 @@ brw_pixel_interpolator_query(struct brw_compile *p,
 brw_inst_set_pi_message_data(brw, insn, data);
  }
  
 +void
 +brw_broadcast(struct brw_compile *p,
 +  struct brw_reg dst,
 +  struct brw_reg src,
 +  struct brw_reg idx)
 +{
 +   const struct brw_context *brw = p-brw;
 +   const bool align1 = (brw_inst_access_mode(brw, p-current) == 
 BRW_ALIGN_1);
 +   brw_inst *inst;
 +
 +   assert(src.file == BRW_GENERAL_REGISTER_FILE 
 +  src.address_mode == BRW_ADDRESS_DIRECT);
 +
 +   if ((src.vstride == 0  (src.hstride == 0 || !align1)) ||
 +   idx.file == BRW_IMMEDIATE_VALUE) {
 +  /* Trivial, the source is already uniform or the index is a constant.
 +   * We will typically not get here if the optimizer is doing its job, 
 but
 +   * asserting would be mean.
 +   */
 +  const unsigned i = (idx.file == BRW_IMMEDIATE_VALUE ? idx.dw1.ud : 0);
 +  brw_MOV(p, dst,
 +  (align1 ? stride(suboffset(src, i), 0, 1, 0) :
 +   stride(suboffset(src, 4 * i), 0, 4, 1)));
 +
 +   } else {
 +  if (align1) {
 + const struct brw_reg addr =
 +retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
 + const unsigned offset = src.nr * REG_SIZE + src.subnr;
 + /* Limit in bytes of the signed indirect addressing immediate. */
 + const unsigned limit = 512;
 +
 + brw_push_insn_state(p);
 + brw_set_default_mask_control(p, BRW_MASK_DISABLE);
 + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 +
 + /* Take into account the component size and horizontal stride. */
 + assert(src.vstride == src.hstride + src.width);
 + brw_SHL(p, addr, vec1(idx),
 + brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
 +   

Re: [Mesa-dev] [PATCH 1/7] i965: Introduce the BROADCAST pseudo-opcode.

2015-02-20 Thread Francisco Jerez
Ian Romanick i...@freedesktop.org writes:

 I want to be sure I understand what this series is doing and why it's
 necessary.

 Sampler arrays and UBO arrays can, with some restrictions, be indexed
 dynamically.  The primary restriction is that any active SIMD channels
 that access the array must access the same index.  When you get to a
 particular access, not all SIMD channels may be active.  This is the
 essence of what the GLSL spec calls dynamically uniform.

 The implication is that the compiler is allowed to pick the index from
 any active SIMD channel.  If they're not all the same, the shader author
 should have followed the rules.  Currently i965 picks the index from a
 SIMD channel that may or may not be active (due to other, non-uniform
 flow control).  Correct?

Yeah, the problem is that right now we incorrectly assume that the first
channel of the indexing expression is always live, if it's not we end up
using garbage for the binding table index.

 This patch series attempts to fix that by first introducing some opcodes
 to get a value from one of the active SIMD channels, then using those
 opcodes to pick an array index from one of the active SIMD channels.
 Correct?

Yeah, that's right.  FIND_LIVE_CHANNEL returns the index of an arbitrary
live channel and BROADCAST uses indirect register addressing to fetch
the value from that channel and write it to the destination.  The same
opcodes will be used for dynamically uniform indexing of image arrays
too.

 On 02/20/2015 11:48 AM, Francisco Jerez wrote:
 The BROADCAST instruction picks the channel from its first source
 given by an index passed in as second source.  This will be used in
 situations where all channels from the same SIMD thread have to agree
 on the value of something, e.g. a surface binding table index.
 ---
  src/mesa/drivers/dri/i965/brw_defines.h  |  6 ++
  src/mesa/drivers/dri/i965/brw_eu.h   |  6 ++
  src/mesa/drivers/dri/i965/brw_eu_emit.c  | 77 
 
  src/mesa/drivers/dri/i965/brw_fs_generator.cpp   |  4 ++
  src/mesa/drivers/dri/i965/brw_shader.cpp |  3 +
  src/mesa/drivers/dri/i965/brw_vec4_generator.cpp |  4 ++
  6 files changed, 100 insertions(+)
 
 diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
 b/src/mesa/drivers/dri/i965/brw_defines.h
 index 17c27dd..d4930e3 100644
 --- a/src/mesa/drivers/dri/i965/brw_defines.h
 +++ b/src/mesa/drivers/dri/i965/brw_defines.h
 @@ -911,6 +911,12 @@ enum opcode {
  
 SHADER_OPCODE_URB_WRITE_SIMD8,
  
 +   /**
 +* Pick the channel from its first source register given by the index
 +* specified as second source.  Useful for variable indexing of surfaces.
 +*/
 +   SHADER_OPCODE_BROADCAST,
 +
 VEC4_OPCODE_MOV_BYTES,
 VEC4_OPCODE_PACK_BYTES,
 VEC4_OPCODE_UNPACK_UNIFORM,
 diff --git a/src/mesa/drivers/dri/i965/brw_eu.h 
 b/src/mesa/drivers/dri/i965/brw_eu.h
 index a94ea42..2505480 100644
 --- a/src/mesa/drivers/dri/i965/brw_eu.h
 +++ b/src/mesa/drivers/dri/i965/brw_eu.h
 @@ -413,6 +413,12 @@ brw_pixel_interpolator_query(struct brw_compile *p,
   unsigned msg_length,
   unsigned response_length);
  
 +void
 +brw_broadcast(struct brw_compile *p,
 +  struct brw_reg dst,
 +  struct brw_reg src,
 +  struct brw_reg idx);
 +
  /***
   * brw_eu_util.c:
   */
 diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c 
 b/src/mesa/drivers/dri/i965/brw_eu_emit.c
 index 1d6fd67..d7e3995 100644
 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
 +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
 @@ -2854,6 +2854,83 @@ brw_pixel_interpolator_query(struct brw_compile *p,
 brw_inst_set_pi_message_data(brw, insn, data);
  }
  
 +void
 +brw_broadcast(struct brw_compile *p,
 +  struct brw_reg dst,
 +  struct brw_reg src,
 +  struct brw_reg idx)
 +{
 +   const struct brw_context *brw = p-brw;
 +   const bool align1 = (brw_inst_access_mode(brw, p-current) == 
 BRW_ALIGN_1);
 +   brw_inst *inst;
 +
 +   assert(src.file == BRW_GENERAL_REGISTER_FILE 
 +  src.address_mode == BRW_ADDRESS_DIRECT);
 +
 +   if ((src.vstride == 0  (src.hstride == 0 || !align1)) ||
 +   idx.file == BRW_IMMEDIATE_VALUE) {
 +  /* Trivial, the source is already uniform or the index is a constant.
 +   * We will typically not get here if the optimizer is doing its job, 
 but
 +   * asserting would be mean.
 +   */
 +  const unsigned i = (idx.file == BRW_IMMEDIATE_VALUE ? idx.dw1.ud : 0);
 +  brw_MOV(p, dst,
 +  (align1 ? stride(suboffset(src, i), 0, 1, 0) :
 +   stride(suboffset(src, 4 * i), 0, 4, 1)));
 +
 +   } else {
 +  if (align1) {
 + const struct brw_reg addr =
 +retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
 + const unsigned offset = src.nr * REG_SIZE + src.subnr;
 

[Mesa-dev] [PATCH 1/7] i965: Introduce the BROADCAST pseudo-opcode.

2015-02-20 Thread Francisco Jerez
The BROADCAST instruction picks the channel from its first source
given by an index passed in as second source.  This will be used in
situations where all channels from the same SIMD thread have to agree
on the value of something, e.g. a surface binding table index.
---
 src/mesa/drivers/dri/i965/brw_defines.h  |  6 ++
 src/mesa/drivers/dri/i965/brw_eu.h   |  6 ++
 src/mesa/drivers/dri/i965/brw_eu_emit.c  | 77 
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp   |  4 ++
 src/mesa/drivers/dri/i965/brw_shader.cpp |  3 +
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp |  4 ++
 6 files changed, 100 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index 17c27dd..d4930e3 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -911,6 +911,12 @@ enum opcode {
 
SHADER_OPCODE_URB_WRITE_SIMD8,
 
+   /**
+* Pick the channel from its first source register given by the index
+* specified as second source.  Useful for variable indexing of surfaces.
+*/
+   SHADER_OPCODE_BROADCAST,
+
VEC4_OPCODE_MOV_BYTES,
VEC4_OPCODE_PACK_BYTES,
VEC4_OPCODE_UNPACK_UNIFORM,
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h 
b/src/mesa/drivers/dri/i965/brw_eu.h
index a94ea42..2505480 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -413,6 +413,12 @@ brw_pixel_interpolator_query(struct brw_compile *p,
  unsigned msg_length,
  unsigned response_length);
 
+void
+brw_broadcast(struct brw_compile *p,
+  struct brw_reg dst,
+  struct brw_reg src,
+  struct brw_reg idx);
+
 /***
  * brw_eu_util.c:
  */
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c 
b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 1d6fd67..d7e3995 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -2854,6 +2854,83 @@ brw_pixel_interpolator_query(struct brw_compile *p,
brw_inst_set_pi_message_data(brw, insn, data);
 }
 
+void
+brw_broadcast(struct brw_compile *p,
+  struct brw_reg dst,
+  struct brw_reg src,
+  struct brw_reg idx)
+{
+   const struct brw_context *brw = p-brw;
+   const bool align1 = (brw_inst_access_mode(brw, p-current) == BRW_ALIGN_1);
+   brw_inst *inst;
+
+   assert(src.file == BRW_GENERAL_REGISTER_FILE 
+  src.address_mode == BRW_ADDRESS_DIRECT);
+
+   if ((src.vstride == 0  (src.hstride == 0 || !align1)) ||
+   idx.file == BRW_IMMEDIATE_VALUE) {
+  /* Trivial, the source is already uniform or the index is a constant.
+   * We will typically not get here if the optimizer is doing its job, but
+   * asserting would be mean.
+   */
+  const unsigned i = (idx.file == BRW_IMMEDIATE_VALUE ? idx.dw1.ud : 0);
+  brw_MOV(p, dst,
+  (align1 ? stride(suboffset(src, i), 0, 1, 0) :
+   stride(suboffset(src, 4 * i), 0, 4, 1)));
+
+   } else {
+  if (align1) {
+ const struct brw_reg addr =
+retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
+ const unsigned offset = src.nr * REG_SIZE + src.subnr;
+ /* Limit in bytes of the signed indirect addressing immediate. */
+ const unsigned limit = 512;
+
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+
+ /* Take into account the component size and horizontal stride. */
+ assert(src.vstride == src.hstride + src.width);
+ brw_SHL(p, addr, vec1(idx),
+ brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
+src.hstride - 1));
+
+ /* We can only address up to limit bytes using the indirect
+  * addressing immediate, account for the difference if the source
+  * register is above this limit.
+  */
+ if (offset = limit)
+brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
+
+ brw_pop_insn_state(p);
+
+ /* Use indirect addressing to fetch the specified component. */
+ brw_MOV(p, dst,
+ retype(brw_vec1_indirect(addr.subnr, offset % limit),
+src.type));
+
+  } else {
+ /* In SIMD4x2 mode the index can be either zero or one, replicate it
+  * to all bits of a flag register,
+  */
+ inst = brw_MOV(p,
+brw_null_reg(),
+stride(brw_swizzle1(idx, 0), 0, 4, 1));
+ brw_inst_set_pred_control(brw, inst, BRW_PREDICATE_NONE);
+ brw_inst_set_cond_modifier(brw, inst, BRW_CONDITIONAL_NZ);
+ brw_inst_set_flag_reg_nr(brw, inst, 1);
+
+ /* and use predicated