Re: [Mesa-dev] [RFC 4/5] intel/compiler: Implement unordered comparisons

2018-11-27 Thread Jason Ekstrand
On Tue, Nov 27, 2018 at 12:55 PM Ian Romanick  wrote:

> On 11/22/2018 10:46 AM, Jason Ekstrand wrote:
> > The vec4 path has only been compile-tested as there's no easy way to
> > generate a vec4 shader with an unordered equality.
> > ---
> >  src/intel/compiler/brw_compiler.c   |  3 +++
> >  src/intel/compiler/brw_fs_nir.cpp   | 20 +---
> >  src/intel/compiler/brw_vec4_nir.cpp | 21 +
> >  3 files changed, 37 insertions(+), 7 deletions(-)
> >
> > diff --git a/src/intel/compiler/brw_compiler.c
> b/src/intel/compiler/brw_compiler.c
> > index fe632c5badc..f9e8fa09a34 100644
> > --- a/src/intel/compiler/brw_compiler.c
> > +++ b/src/intel/compiler/brw_compiler.c
> > @@ -42,6 +42,9 @@
> > .lower_fdiv = true,
>   \
> > .lower_flrp64 = true,
>   \
> > .lower_ldexp = true,
>\
> > +   .lower_fltu = true,
>   \
> > +   .lower_fgeu = true,
>   \
>
> Can't we use cmpn.l and cmpn.ge for these?  On some platforms it has to
> be split into two SIMD8 instructions, but that seems better than the
> lowering... or maybe just use those on BDW+?
>

I didn't know about CMPN!  I guess that would do the trick though I suppose
we have a pile of back-end work to be able to optimize it.  I think the
back-end usually gets rid of the inot by flipping the cmod but maybe not.


> > +   .lower_fne_to_fequ = true,
>\
> > .lower_cs_local_id_from_index = true,
>   \
> > .lower_device_index_to_zero = true,
>   \
> > .native_integers = true,
>\
> > diff --git a/src/intel/compiler/brw_fs_nir.cpp
> b/src/intel/compiler/brw_fs_nir.cpp
> > index a62d521bb5d..eba3611e447 100644
> > --- a/src/intel/compiler/brw_fs_nir.cpp
> > +++ b/src/intel/compiler/brw_fs_nir.cpp
> > @@ -1049,6 +1049,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld,
> nir_alu_instr *instr)
> > case nir_op_flt:
> > case nir_op_fge:
> > case nir_op_feq:
> > +   case nir_op_fne:
> > +   case nir_op_fequ:
> > case nir_op_fneu: {
> >fs_reg dest = result;
> >
> > @@ -1056,26 +1058,30 @@ fs_visitor::nir_emit_alu(const fs_builder &bld,
> nir_alu_instr *instr)
> >if (bit_size != 32)
> >   dest = bld.vgrf(op[0].type, 1);
> >
> > -  brw_conditional_mod cond;
> >switch (instr->op) {
> >case nir_op_flt:
> > - cond = BRW_CONDITIONAL_L;
> > + bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_L);
> >   break;
> >case nir_op_fge:
> > - cond = BRW_CONDITIONAL_GE;
> > + bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_GE);
> >   break;
> >case nir_op_feq:
> > - cond = BRW_CONDITIONAL_Z;
> > + bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z);
> > + break;
> > +  case nir_op_fequ:
> > + bld.CMP(dest, op[0], op[0], BRW_CONDITIONAL_NZ);
> > + set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
> > +   bld.CMP(dest, op[1], op[1],
> BRW_CONDITIONAL_NZ));
> > + set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
> > +   bld.CMP(dest, op[0], op[1],
> BRW_CONDITIONAL_Z));
>
> OpFUnordEqual is (isnan(a) || isnan(b) || a == b), and that's literally
> what this code does.  It seems like this could be implemented (perhaps
> as a lowering?) as (OpFUnordGreaterThanEqual(a,b) &&
> OpFUnordGreaterThanEqual(b, a)) via cmpn.ge.  Using this same technique,
> that would be 2 instructions (on BDW+) instead of 3.  HSW has to do cmpn
> as two SIMD8 instructions, so this may be better there.  Dunno.
>

That's a good idea.  Unfortunately, CMPN doesn't modify Z or NZ.  We could
also lower fequ(x, y) to inot(fne(x, y)) and implement fne(x, y) as (x < y)
|| (x > y) which wouldn't require CMPN.

--Jason


> >   break;
> >case nir_op_fneu:
> > - cond = BRW_CONDITIONAL_NZ;
> > + bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_NZ);
> >   break;
> >default:
> >   unreachable("bad opcode");
> >}
> >
> > -  bld.CMP(dest, op[0], op[1], cond);
> > -
> >if (bit_size > 32) {
> >   bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
> >} else if(bit_size < 32) {
> > diff --git a/src/intel/compiler/brw_vec4_nir.cpp
> b/src/intel/compiler/brw_vec4_nir.cpp
> > index f7f46f5034c..32559e1aade 100644
> > --- a/src/intel/compiler/brw_vec4_nir.cpp
> > +++ b/src/intel/compiler/brw_vec4_nir.cpp
> > @@ -1366,6 +1366,27 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
> >break;
> > }
> >
> > +   case nir_op_fequ: {
> > +  dst_reg cmp_res = dst;
> > +  if (nir_src_bit_size(instr->src[0].src) == 64)
> > + cmp_res = dst_reg(this, glsl_type::dvec4_type);
> > +
> > +  vec4_instruction *inst;
> > +  inst = emit(CMP(cmp_res, op[0], op[0], BRW_CONDITIONAL_NZ));
> > +  inst = emit(CMP(cmp_res, op[1], op[1], BRW_CONDITIONAL_NZ));
> > +  inst->predicate = BRW_PREDICATE_NORMAL;
> > +  inst

Re: [Mesa-dev] [RFC 4/5] intel/compiler: Implement unordered comparisons

2018-11-27 Thread Ian Romanick
On 11/22/2018 10:46 AM, Jason Ekstrand wrote:
> The vec4 path has only been compile-tested as there's no easy way to
> generate a vec4 shader with an unordered equality.
> ---
>  src/intel/compiler/brw_compiler.c   |  3 +++
>  src/intel/compiler/brw_fs_nir.cpp   | 20 +---
>  src/intel/compiler/brw_vec4_nir.cpp | 21 +
>  3 files changed, 37 insertions(+), 7 deletions(-)
> 
> diff --git a/src/intel/compiler/brw_compiler.c 
> b/src/intel/compiler/brw_compiler.c
> index fe632c5badc..f9e8fa09a34 100644
> --- a/src/intel/compiler/brw_compiler.c
> +++ b/src/intel/compiler/brw_compiler.c
> @@ -42,6 +42,9 @@
> .lower_fdiv = true,   
>  \
> .lower_flrp64 = true, 
>  \
> .lower_ldexp = true,  
>  \
> +   .lower_fltu = true,   
>  \
> +   .lower_fgeu = true,   
>  \

Can't we use cmpn.l and cmpn.ge for these?  On some platforms it has to
be split into two SIMD8 instructions, but that seems better than the
lowering... or maybe just use those on BDW+?

> +   .lower_fne_to_fequ = true,
>  \
> .lower_cs_local_id_from_index = true, 
>  \
> .lower_device_index_to_zero = true,   
>  \
> .native_integers = true,  
>  \
> diff --git a/src/intel/compiler/brw_fs_nir.cpp 
> b/src/intel/compiler/brw_fs_nir.cpp
> index a62d521bb5d..eba3611e447 100644
> --- a/src/intel/compiler/brw_fs_nir.cpp
> +++ b/src/intel/compiler/brw_fs_nir.cpp
> @@ -1049,6 +1049,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
> nir_alu_instr *instr)
> case nir_op_flt:
> case nir_op_fge:
> case nir_op_feq:
> +   case nir_op_fne:
> +   case nir_op_fequ:
> case nir_op_fneu: {
>fs_reg dest = result;
>  
> @@ -1056,26 +1058,30 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
> nir_alu_instr *instr)
>if (bit_size != 32)
>   dest = bld.vgrf(op[0].type, 1);
>  
> -  brw_conditional_mod cond;
>switch (instr->op) {
>case nir_op_flt:
> - cond = BRW_CONDITIONAL_L;
> + bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_L);
>   break;
>case nir_op_fge:
> - cond = BRW_CONDITIONAL_GE;
> + bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_GE);
>   break;
>case nir_op_feq:
> - cond = BRW_CONDITIONAL_Z;
> + bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z);
> + break;
> +  case nir_op_fequ:
> + bld.CMP(dest, op[0], op[0], BRW_CONDITIONAL_NZ);
> + set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
> +   bld.CMP(dest, op[1], op[1], BRW_CONDITIONAL_NZ));
> + set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
> +   bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z));

OpFUnordEqual is (isnan(a) || isnan(b) || a == b), and that's literally
what this code does.  It seems like this could be implemented (perhaps
as a lowering?) as (OpFUnordGreaterThanEqual(a,b) &&
OpFUnordGreaterThanEqual(b, a)) via cmpn.ge.  Using this same technique,
that would be 2 instructions (on BDW+) instead of 3.  HSW has to do cmpn
as two SIMD8 instructions, so this may be better there.  Dunno.

>   break;
>case nir_op_fneu:
> - cond = BRW_CONDITIONAL_NZ;
> + bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_NZ);
>   break;
>default:
>   unreachable("bad opcode");
>}
>  
> -  bld.CMP(dest, op[0], op[1], cond);
> -
>if (bit_size > 32) {
>   bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
>} else if(bit_size < 32) {
> diff --git a/src/intel/compiler/brw_vec4_nir.cpp 
> b/src/intel/compiler/brw_vec4_nir.cpp
> index f7f46f5034c..32559e1aade 100644
> --- a/src/intel/compiler/brw_vec4_nir.cpp
> +++ b/src/intel/compiler/brw_vec4_nir.cpp
> @@ -1366,6 +1366,27 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
>break;
> }
>  
> +   case nir_op_fequ: {
> +  dst_reg cmp_res = dst;
> +  if (nir_src_bit_size(instr->src[0].src) == 64)
> + cmp_res = dst_reg(this, glsl_type::dvec4_type);
> +
> +  vec4_instruction *inst;
> +  inst = emit(CMP(cmp_res, op[0], op[0], BRW_CONDITIONAL_NZ));
> +  inst = emit(CMP(cmp_res, op[1], op[1], BRW_CONDITIONAL_NZ));
> +  inst->predicate = BRW_PREDICATE_NORMAL;
> +  inst->predicate_inverse = true;
> +  inst = emit(CMP(cmp_res, op[0], op[1], BRW_CONDITIONAL_Z));
> +  inst->predicate = BRW_PREDICATE_NORMAL;
> +  inst->predicate_inverse = true;
> +
> +  if (nir_src_bit_size(instr->src[0].src) == 64) {
> + dst_reg cmp_res32 =

[Mesa-dev] [RFC 4/5] intel/compiler: Implement unordered comparisons

2018-11-22 Thread Jason Ekstrand
The vec4 path has only been compile-tested as there's no easy way to
generate a vec4 shader with an unordered equality.
---
 src/intel/compiler/brw_compiler.c   |  3 +++
 src/intel/compiler/brw_fs_nir.cpp   | 20 +---
 src/intel/compiler/brw_vec4_nir.cpp | 21 +
 3 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/src/intel/compiler/brw_compiler.c 
b/src/intel/compiler/brw_compiler.c
index fe632c5badc..f9e8fa09a34 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -42,6 +42,9 @@
.lower_fdiv = true,\
.lower_flrp64 = true,  \
.lower_ldexp = true,   \
+   .lower_fltu = true,\
+   .lower_fgeu = true,\
+   .lower_fne_to_fequ = true, \
.lower_cs_local_id_from_index = true,  \
.lower_device_index_to_zero = true,\
.native_integers = true,   \
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index a62d521bb5d..eba3611e447 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -1049,6 +1049,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
case nir_op_flt:
case nir_op_fge:
case nir_op_feq:
+   case nir_op_fne:
+   case nir_op_fequ:
case nir_op_fneu: {
   fs_reg dest = result;
 
@@ -1056,26 +1058,30 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
   if (bit_size != 32)
  dest = bld.vgrf(op[0].type, 1);
 
-  brw_conditional_mod cond;
   switch (instr->op) {
   case nir_op_flt:
- cond = BRW_CONDITIONAL_L;
+ bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_L);
  break;
   case nir_op_fge:
- cond = BRW_CONDITIONAL_GE;
+ bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_GE);
  break;
   case nir_op_feq:
- cond = BRW_CONDITIONAL_Z;
+ bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z);
+ break;
+  case nir_op_fequ:
+ bld.CMP(dest, op[0], op[0], BRW_CONDITIONAL_NZ);
+ set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
+   bld.CMP(dest, op[1], op[1], BRW_CONDITIONAL_NZ));
+ set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
+   bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z));
  break;
   case nir_op_fneu:
- cond = BRW_CONDITIONAL_NZ;
+ bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_NZ);
  break;
   default:
  unreachable("bad opcode");
   }
 
-  bld.CMP(dest, op[0], op[1], cond);
-
   if (bit_size > 32) {
  bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
   } else if(bit_size < 32) {
diff --git a/src/intel/compiler/brw_vec4_nir.cpp 
b/src/intel/compiler/brw_vec4_nir.cpp
index f7f46f5034c..32559e1aade 100644
--- a/src/intel/compiler/brw_vec4_nir.cpp
+++ b/src/intel/compiler/brw_vec4_nir.cpp
@@ -1366,6 +1366,27 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
   break;
}
 
+   case nir_op_fequ: {
+  dst_reg cmp_res = dst;
+  if (nir_src_bit_size(instr->src[0].src) == 64)
+ cmp_res = dst_reg(this, glsl_type::dvec4_type);
+
+  vec4_instruction *inst;
+  inst = emit(CMP(cmp_res, op[0], op[0], BRW_CONDITIONAL_NZ));
+  inst = emit(CMP(cmp_res, op[1], op[1], BRW_CONDITIONAL_NZ));
+  inst->predicate = BRW_PREDICATE_NORMAL;
+  inst->predicate_inverse = true;
+  inst = emit(CMP(cmp_res, op[0], op[1], BRW_CONDITIONAL_Z));
+  inst->predicate = BRW_PREDICATE_NORMAL;
+  inst->predicate_inverse = true;
+
+  if (nir_src_bit_size(instr->src[0].src) == 64) {
+ dst_reg cmp_res32 = dst_reg(this, glsl_type::bvec4_type);
+ emit(VEC4_OPCODE_PICK_LOW_32BIT, cmp_res32, src_reg(cmp_res));
+ emit(MOV(dst, src_reg(cmp_res32)));
+  }
+   }
+
case nir_op_ball_iequal2:
case nir_op_ball_iequal3:
case nir_op_ball_iequal4:
-- 
2.19.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev