Re: [Mesa-dev] [RFC 4/5] intel/compiler: Implement unordered comparisons
On Tue, Nov 27, 2018 at 12:55 PM Ian Romanick wrote:
> On 11/22/2018 10:46 AM, Jason Ekstrand wrote:
> > The vec4 path has only been compile-tested as there's no easy way to
> > generate a vec4 shader with an unordered equality.
> > ---
> >  src/intel/compiler/brw_compiler.c   |  3 +++
> >  src/intel/compiler/brw_fs_nir.cpp   | 20 +++++++++++++-------
> >  src/intel/compiler/brw_vec4_nir.cpp | 21 +++++++++++++++++++++
> >  3 files changed, 37 insertions(+), 7 deletions(-)
> >
> > diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
> > index fe632c5badc..f9e8fa09a34 100644
> > --- a/src/intel/compiler/brw_compiler.c
> > +++ b/src/intel/compiler/brw_compiler.c
> > @@ -42,6 +42,9 @@
> >     .lower_fdiv = true,                                  \
> >     .lower_flrp64 = true,                                \
> >     .lower_ldexp = true,                                 \
> > +   .lower_fltu = true,                                  \
> > +   .lower_fgeu = true,                                  \
>
> Can't we use cmpn.l and cmpn.ge for these? On some platforms it has to
> be split into two SIMD8 instructions, but that seems better than the
> lowering... or maybe just use those on BDW+?

I didn't know about CMPN! I guess that would do the trick, though I
suppose we have a pile of back-end work to do before we can optimize it.
I think the back-end usually gets rid of the inot by flipping the cmod,
but maybe not.

> > +   .lower_fne_to_fequ = true,                           \
> >     .lower_cs_local_id_from_index = true,                \
> >     .lower_device_index_to_zero = true,                  \
> >     .native_integers = true,                             \
> > diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
> > index a62d521bb5d..eba3611e447 100644
> > --- a/src/intel/compiler/brw_fs_nir.cpp
> > +++ b/src/intel/compiler/brw_fs_nir.cpp
> > @@ -1049,6 +1049,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
> >     case nir_op_flt:
> >     case nir_op_fge:
> >     case nir_op_feq:
> > +   case nir_op_fne:
> > +   case nir_op_fequ:
> >     case nir_op_fneu: {
> >        fs_reg dest = result;
> >
> > @@ -1056,26 +1058,30 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
> >        if (bit_size != 32)
> >           dest = bld.vgrf(op[0].type, 1);
> >
> > -      brw_conditional_mod cond;
> >        switch (instr->op) {
> >        case nir_op_flt:
> > -         cond = BRW_CONDITIONAL_L;
> > +         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_L);
> >           break;
> >        case nir_op_fge:
> > -         cond = BRW_CONDITIONAL_GE;
> > +         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_GE);
> >           break;
> >        case nir_op_feq:
> > -         cond = BRW_CONDITIONAL_Z;
> > +         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z);
> > +         break;
> > +      case nir_op_fequ:
> > +         bld.CMP(dest, op[0], op[0], BRW_CONDITIONAL_NZ);
> > +         set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
> > +                           bld.CMP(dest, op[1], op[1], BRW_CONDITIONAL_NZ));
> > +         set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
> > +                           bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z));
>
> OpFUnordEqual is (isnan(a) || isnan(b) || a == b), and that's literally
> what this code does. It seems like this could be implemented (perhaps
> as a lowering?) as (OpFUnordGreaterThanEqual(a, b) &&
> OpFUnordGreaterThanEqual(b, a)) via cmpn.ge. Using this same technique,
> that would be 2 instructions (on BDW+) instead of 3. HSW has to do cmpn
> as two SIMD8 instructions, so this may be better there. Dunno.

That's a good idea. Unfortunately, CMPN doesn't modify Z or NZ. We
could also lower fequ(x, y) to inot(fne(x, y)) and implement fne(x, y)
as (x < y) || (x > y), which wouldn't require CMPN.

--Jason
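To make the alternative Jason sketches concrete, here is a minimal C reference for that lowering. The helpers ref_fne and ref_fequ are invented for illustration and follow his formulation (fne as the ordered not-equal, fequ as its inverse); they are not NIR or backend code.

   #include <assert.h>
   #include <math.h>
   #include <stdbool.h>

   static bool ref_fne(float x, float y)
   {
      /* Ordered not-equal: both '<' and '>' are false when either operand
       * is NaN, so this returns false for NaN inputs. */
      return (x < y) || (x > y);
   }

   static bool ref_fequ(float x, float y)
   {
      /* Unordered equal: the inverse, i.e. x == y or either operand is NaN. */
      return !ref_fne(x, y);
   }

   int main(void)
   {
      const float nan = nanf("");
      assert(ref_fequ(1.0f, 1.0f));    /* ordinary equality still holds   */
      assert(!ref_fequ(1.0f, 2.0f));   /* ordinary inequality still holds */
      assert(ref_fequ(nan, 1.0f));     /* NaN makes the pair "unordered"  */
      assert(!ref_fne(nan, nan));      /* ordered fne is false for NaN    */
      return 0;
   }

Because '<' and '>' are both false when an operand is NaN, the ordered fne comes out false for NaN inputs, so its inverse, the unordered fequ, comes out true, matching the (isnan(a) || isnan(b) || a == b) definition of OpFUnordEqual.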
Re: [Mesa-dev] [RFC 4/5] intel/compiler: Implement unordered comparisons
On 11/22/2018 10:46 AM, Jason Ekstrand wrote:
> The vec4 path has only been compile-tested as there's no easy way to
> generate a vec4 shader with an unordered equality.
> ---
>  src/intel/compiler/brw_compiler.c   |  3 +++
>  src/intel/compiler/brw_fs_nir.cpp   | 20 +++++++++++++-------
>  src/intel/compiler/brw_vec4_nir.cpp | 21 +++++++++++++++++++++
>  3 files changed, 37 insertions(+), 7 deletions(-)
>
> diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
> index fe632c5badc..f9e8fa09a34 100644
> --- a/src/intel/compiler/brw_compiler.c
> +++ b/src/intel/compiler/brw_compiler.c
> @@ -42,6 +42,9 @@
>     .lower_fdiv = true,                                  \
>     .lower_flrp64 = true,                                \
>     .lower_ldexp = true,                                 \
> +   .lower_fltu = true,                                  \
> +   .lower_fgeu = true,                                  \

Can't we use cmpn.l and cmpn.ge for these? On some platforms it has to
be split into two SIMD8 instructions, but that seems better than the
lowering... or maybe just use those on BDW+?

> +   .lower_fne_to_fequ = true,                           \
>     .lower_cs_local_id_from_index = true,                \
>     .lower_device_index_to_zero = true,                  \
>     .native_integers = true,                             \
> diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
> index a62d521bb5d..eba3611e447 100644
> --- a/src/intel/compiler/brw_fs_nir.cpp
> +++ b/src/intel/compiler/brw_fs_nir.cpp
> @@ -1049,6 +1049,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
>     case nir_op_flt:
>     case nir_op_fge:
>     case nir_op_feq:
> +   case nir_op_fne:
> +   case nir_op_fequ:
>     case nir_op_fneu: {
>        fs_reg dest = result;
>
> @@ -1056,26 +1058,30 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
>        if (bit_size != 32)
>           dest = bld.vgrf(op[0].type, 1);
>
> -      brw_conditional_mod cond;
>        switch (instr->op) {
>        case nir_op_flt:
> -         cond = BRW_CONDITIONAL_L;
> +         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_L);
>           break;
>        case nir_op_fge:
> -         cond = BRW_CONDITIONAL_GE;
> +         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_GE);
>           break;
>        case nir_op_feq:
> -         cond = BRW_CONDITIONAL_Z;
> +         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z);
> +         break;
> +      case nir_op_fequ:
> +         bld.CMP(dest, op[0], op[0], BRW_CONDITIONAL_NZ);
> +         set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
> +                           bld.CMP(dest, op[1], op[1], BRW_CONDITIONAL_NZ));
> +         set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
> +                           bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z));

OpFUnordEqual is (isnan(a) || isnan(b) || a == b), and that's literally
what this code does. It seems like this could be implemented (perhaps
as a lowering?) as (OpFUnordGreaterThanEqual(a, b) &&
OpFUnordGreaterThanEqual(b, a)) via cmpn.ge. Using this same technique,
that would be 2 instructions (on BDW+) instead of 3. HSW has to do cmpn
as two SIMD8 instructions, so this may be better there. Dunno.
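For reference, a small C model of the identity Ian is relying on. cmpn_ge models cmpn.ge the way his comment uses it, that is, as the unordered greater-than-or-equal that is true when either source is NaN or the ordered comparison holds; both helper names are made up for this sketch and nothing below is actual EU or Mesa code.

   #include <assert.h>
   #include <math.h>
   #include <stdbool.h>

   static bool cmpn_ge(float a, float b)
   {
      /* Model of cmpn.ge as used above: true when either source is NaN
       * or when the ordered comparison holds. */
      return isnan(a) || isnan(b) || a >= b;
   }

   static bool unord_equal_via_cmpn(float a, float b)
   {
      /* Ian's identity: OpFUnordEqual(a, b) ==
       * OpFUnordGreaterThanEqual(a, b) && OpFUnordGreaterThanEqual(b, a). */
      return cmpn_ge(a, b) && cmpn_ge(b, a);
   }

   int main(void)
   {
      const float nan = nanf("");
      assert(unord_equal_via_cmpn(3.0f, 3.0f));   /* equal operands         */
      assert(!unord_equal_via_cmpn(3.0f, 4.0f));  /* unequal operands       */
      assert(unord_equal_via_cmpn(nan, 4.0f));    /* any NaN => "unordered" */
      assert(unord_equal_via_cmpn(nan, nan));
      return 0;
   }

If that model holds, ANDing the two directions leaves exactly the unordered-equal result: any NaN operand forces both cmpn.ge results true, and otherwise a >= b together with b >= a collapses to a == b.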
[Mesa-dev] [RFC 4/5] intel/compiler: Implement unordered comparisons
The vec4 path has only been compile-tested as there's no easy way to
generate a vec4 shader with an unordered equality.
---
 src/intel/compiler/brw_compiler.c   |  3 +++
 src/intel/compiler/brw_fs_nir.cpp   | 20 +++++++++++++-------
 src/intel/compiler/brw_vec4_nir.cpp | 21 +++++++++++++++++++++
 3 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
index fe632c5badc..f9e8fa09a34 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -42,6 +42,9 @@
    .lower_fdiv = true,                                  \
    .lower_flrp64 = true,                                \
    .lower_ldexp = true,                                 \
+   .lower_fltu = true,                                  \
+   .lower_fgeu = true,                                  \
+   .lower_fne_to_fequ = true,                           \
    .lower_cs_local_id_from_index = true,                \
    .lower_device_index_to_zero = true,                  \
    .native_integers = true,                             \
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index a62d521bb5d..eba3611e447 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -1049,6 +1049,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    case nir_op_flt:
    case nir_op_fge:
    case nir_op_feq:
+   case nir_op_fne:
+   case nir_op_fequ:
    case nir_op_fneu: {
       fs_reg dest = result;

@@ -1056,26 +1058,30 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       if (bit_size != 32)
          dest = bld.vgrf(op[0].type, 1);

-      brw_conditional_mod cond;
       switch (instr->op) {
       case nir_op_flt:
-         cond = BRW_CONDITIONAL_L;
+         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_L);
          break;
       case nir_op_fge:
-         cond = BRW_CONDITIONAL_GE;
+         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_GE);
          break;
       case nir_op_feq:
-         cond = BRW_CONDITIONAL_Z;
+         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z);
+         break;
+      case nir_op_fequ:
+         bld.CMP(dest, op[0], op[0], BRW_CONDITIONAL_NZ);
+         set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
+                           bld.CMP(dest, op[1], op[1], BRW_CONDITIONAL_NZ));
+         set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
+                           bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z));
          break;
       case nir_op_fneu:
-         cond = BRW_CONDITIONAL_NZ;
+         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_NZ);
          break;
       default:
          unreachable("bad opcode");
       }

-      bld.CMP(dest, op[0], op[1], cond);
-
       if (bit_size > 32) {
          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
       } else if(bit_size < 32) {
diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp
index f7f46f5034c..32559e1aade 100644
--- a/src/intel/compiler/brw_vec4_nir.cpp
+++ b/src/intel/compiler/brw_vec4_nir.cpp
@@ -1366,6 +1366,27 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
    }

+   case nir_op_fequ: {
+      dst_reg cmp_res = dst;
+      if (nir_src_bit_size(instr->src[0].src) == 64)
+         cmp_res = dst_reg(this, glsl_type::dvec4_type);
+
+      vec4_instruction *inst;
+      inst = emit(CMP(cmp_res, op[0], op[0], BRW_CONDITIONAL_NZ));
+      inst = emit(CMP(cmp_res, op[1], op[1], BRW_CONDITIONAL_NZ));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->predicate_inverse = true;
+      inst = emit(CMP(cmp_res, op[0], op[1], BRW_CONDITIONAL_Z));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->predicate_inverse = true;
+
+      if (nir_src_bit_size(instr->src[0].src) == 64) {
+         dst_reg cmp_res32 = dst_reg(this, glsl_type::bvec4_type);
+         emit(VEC4_OPCODE_PICK_LOW_32BIT, cmp_res32, src_reg(cmp_res));
+         emit(MOV(dst, src_reg(cmp_res32)));
+      }
+   }
+
    case nir_op_ball_iequal2:
    case nir_op_ball_iequal3:
    case nir_op_ball_iequal4:
--
2.19.1
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
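As a reader's aid for the fs_nir hunk above: a simplified per-channel C model of what the three CMPs emitted for nir_op_fequ compute, ignoring the flag register, SIMD execution, and the 64-bit handling. model_fequ_sequence is a name invented for this sketch, not part of the patch.

   #include <assert.h>
   #include <math.h>
   #include <stdbool.h>

   static bool model_fequ_sequence(float a, float b)
   {
      bool dest = (a != a);   /* CMP a, a with .nz: isnan(a), also sets the flag  */
      if (!dest)              /* next CMP is predicated on the inverted flag,     */
         dest = (b != b);     /* so it only overwrites channels that are "false"  */
      if (!dest)
         dest = (a == b);     /* final CMP with .z, again on the inverted flag    */
      return dest;            /* overall: isnan(a) || isnan(b) || (a == b)        */
   }

   int main(void)
   {
      assert(model_fequ_sequence(2.0f, 2.0f));
      assert(!model_fequ_sequence(2.0f, 5.0f));
      assert(model_fequ_sequence(nanf(""), 5.0f));
      assert(model_fequ_sequence(nanf(""), nanf("")));
      return 0;
   }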