Re: [Mesa-dev] [PATCH 4/4] i965: Port INTEL_PRECISE_TRIG=1 to NIR.

2016-04-08 Thread Jason Ekstrand
On Fri, Apr 8, 2016 at 9:36 AM, Eduardo Lima Mitev  wrote:

> On 04/08/2016 01:35 AM, Kenneth Graunke wrote:
>
>> This makes the extra multiply visible to NIR's algebraic optimizations
>> (for constant reassociation) as well as constant folding.  This means
>> that when the result of sin/cos are multiplied by an constant, we can
>> eliminate the extra multiply altogether, reducing the cost of the
>> workaround.
>>
>> It also means we only have to implement it one place, rather than in
>> both backends.
>>
>> This makes INTEL_PRECISE_TRIG=1 cost nothing on GPUTest/Volplosion,
>> which has a ton of sin() calls, but always multiplies them by an
>> immediate constant.  The extra multiply gets folded away.
>>
>>
> Cleaner, indeed. Patch (and series) is:
>
> Reviewed-by: Eduardo Lima Mitev 
>
> Thanks!
>
> Eduardo
>
>
> Signed-off-by: Kenneth Graunke 
>> ---
>>   src/mesa/drivers/dri/i965/Makefile.am  |  5 +++
>>   src/mesa/drivers/dri/i965/Makefile.sources |  1 +
>>   src/mesa/drivers/dri/i965/brw_fs_nir.cpp   | 16 +---
>>   src/mesa/drivers/dri/i965/brw_nir.c|  3 ++
>>   src/mesa/drivers/dri/i965/brw_nir.h|  2 +
>>   .../drivers/dri/i965/brw_nir_trig_workarounds.py   | 43
>> ++
>>   src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 16 +---
>>   7 files changed, 58 insertions(+), 28 deletions(-)
>>   create mode 100755 src/mesa/drivers/dri/i965/brw_nir_trig_workarounds.py
>>
>> diff --git a/src/mesa/drivers/dri/i965/Makefile.am
>> b/src/mesa/drivers/dri/i965/Makefile.am
>> index 0db5a51..a41c830 100644
>> --- a/src/mesa/drivers/dri/i965/Makefile.am
>> +++ b/src/mesa/drivers/dri/i965/Makefile.am
>> @@ -33,6 +33,7 @@ AM_CFLAGS = \
>> -I$(top_srcdir)/src/mesa/drivers/dri/common \
>> -I$(top_srcdir)/src/mesa/drivers/dri/intel/server \
>> -I$(top_srcdir)/src/gtest/include \
>> +   -I$(top_srcdir)/src/compiler/nir \
>> -I$(top_builddir)/src/compiler/nir \
>> -I$(top_builddir)/src/mesa/drivers/dri/common \
>> $(DEFINES) \
>> @@ -41,6 +42,10 @@ AM_CFLAGS = \
>>
>>   AM_CXXFLAGS = $(AM_CFLAGS)
>>
>> +brw_nir_trig_workarounds.c: brw_nir_trig_workarounds.py
>> $(top_srcdir)/src/compiler/nir/nir_algebraic.py
>> +   $(MKDIR_GEN)
>> +   $(AM_V_GEN) PYTHONPATH=$(top_srcdir)/src/compiler/nir $(PYTHON2)
>> $(PYTHON_FLAGS) $(srcdir)/brw_nir_trig_workarounds.py > $@ || ($(RM) $@;
>> false)
>> +
>>   noinst_LTLIBRARIES = libi965_dri.la libi965_compiler.la
>>   libi965_dri_la_SOURCES = $(i965_FILES)
>>   libi965_dri_la_LIBADD = libi965_compiler.la $(INTEL_LIBS)
>> diff --git a/src/mesa/drivers/dri/i965/Makefile.sources
>> b/src/mesa/drivers/dri/i965/Makefile.sources
>> index 4689588..2619e43 100644
>> --- a/src/mesa/drivers/dri/i965/Makefile.sources
>> +++ b/src/mesa/drivers/dri/i965/Makefile.sources
>> @@ -44,6 +44,7 @@ i965_compiler_FILES = \
>> brw_nir.c \
>> brw_nir_analyze_boolean_resolves.c \
>> brw_nir_attribute_workarounds.c \
>> +   brw_nir_trig_workarounds.c \
>> brw_nir_opt_peephole_ffma.c \
>> brw_nir_uniforms.cpp \
>> brw_packed_float.c \
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
>> b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
>> index 90b8789..bd6314a 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
>> @@ -775,24 +775,12 @@ fs_visitor::nir_emit_alu(const fs_builder ,
>> nir_alu_instr *instr)
>> break;
>>
>>  case nir_op_fsin:
>> -  if (!compiler->precise_trig) {
>> - inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
>> -  } else {
>> - fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
>> - inst = bld.emit(SHADER_OPCODE_SIN, tmp, op[0]);
>> - inst = bld.MUL(result, tmp, brw_imm_f(0.7));
>> -  }
>> +  inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
>> inst->saturate = instr->dest.saturate;
>> break;
>>
>>  case nir_op_fcos:
>> -  if (!compiler->precise_trig) {
>> - inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
>> -  } else {
>> - fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
>> - inst = bld.emit(SHADER_OPCODE_COS, tmp, op[0]);
>> - inst = bld.MUL(result, tmp, brw_imm_f(0.7));
>> -  }
>> +  inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
>> inst->saturate = instr->dest.saturate;
>> break;
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_nir.c
>> b/src/mesa/drivers/dri/i965/brw_nir.c
>> index 1821c0d..932979a 100644
>> --- a/src/mesa/drivers/dri/i965/brw_nir.c
>> +++ b/src/mesa/drivers/dri/i965/brw_nir.c
>> @@ -447,6 +447,9 @@ brw_preprocess_nir(const struct brw_compiler
>> *compiler, nir_shader *nir)
>>  if (nir->stage == MESA_SHADER_GEOMETRY)
>> OPT(nir_lower_gs_intrinsics);
>>
>> +   if (compiler->precise_trig)
>> +  

Re: [Mesa-dev] [PATCH 4/4] i965: Port INTEL_PRECISE_TRIG=1 to NIR.

2016-04-08 Thread Eduardo Lima Mitev

On 04/08/2016 01:35 AM, Kenneth Graunke wrote:

This makes the extra multiply visible to NIR's algebraic optimizations
(for constant reassociation) as well as constant folding.  This means
that when the result of sin/cos are multiplied by an constant, we can
eliminate the extra multiply altogether, reducing the cost of the
workaround.

It also means we only have to implement it one place, rather than in
both backends.

This makes INTEL_PRECISE_TRIG=1 cost nothing on GPUTest/Volplosion,
which has a ton of sin() calls, but always multiplies them by an
immediate constant.  The extra multiply gets folded away.



Cleaner, indeed. Patch (and series) is:

Reviewed-by: Eduardo Lima Mitev 

Thanks!

Eduardo


Signed-off-by: Kenneth Graunke 
---
  src/mesa/drivers/dri/i965/Makefile.am  |  5 +++
  src/mesa/drivers/dri/i965/Makefile.sources |  1 +
  src/mesa/drivers/dri/i965/brw_fs_nir.cpp   | 16 +---
  src/mesa/drivers/dri/i965/brw_nir.c|  3 ++
  src/mesa/drivers/dri/i965/brw_nir.h|  2 +
  .../drivers/dri/i965/brw_nir_trig_workarounds.py   | 43 ++
  src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 16 +---
  7 files changed, 58 insertions(+), 28 deletions(-)
  create mode 100755 src/mesa/drivers/dri/i965/brw_nir_trig_workarounds.py

diff --git a/src/mesa/drivers/dri/i965/Makefile.am 
b/src/mesa/drivers/dri/i965/Makefile.am
index 0db5a51..a41c830 100644
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -33,6 +33,7 @@ AM_CFLAGS = \
-I$(top_srcdir)/src/mesa/drivers/dri/common \
-I$(top_srcdir)/src/mesa/drivers/dri/intel/server \
-I$(top_srcdir)/src/gtest/include \
+   -I$(top_srcdir)/src/compiler/nir \
-I$(top_builddir)/src/compiler/nir \
-I$(top_builddir)/src/mesa/drivers/dri/common \
$(DEFINES) \
@@ -41,6 +42,10 @@ AM_CFLAGS = \

  AM_CXXFLAGS = $(AM_CFLAGS)

+brw_nir_trig_workarounds.c: brw_nir_trig_workarounds.py 
$(top_srcdir)/src/compiler/nir/nir_algebraic.py
+   $(MKDIR_GEN)
+   $(AM_V_GEN) PYTHONPATH=$(top_srcdir)/src/compiler/nir $(PYTHON2) 
$(PYTHON_FLAGS) $(srcdir)/brw_nir_trig_workarounds.py > $@ || ($(RM) $@; false)
+
  noinst_LTLIBRARIES = libi965_dri.la libi965_compiler.la
  libi965_dri_la_SOURCES = $(i965_FILES)
  libi965_dri_la_LIBADD = libi965_compiler.la $(INTEL_LIBS)
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources 
b/src/mesa/drivers/dri/i965/Makefile.sources
index 4689588..2619e43 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -44,6 +44,7 @@ i965_compiler_FILES = \
brw_nir.c \
brw_nir_analyze_boolean_resolves.c \
brw_nir_attribute_workarounds.c \
+   brw_nir_trig_workarounds.c \
brw_nir_opt_peephole_ffma.c \
brw_nir_uniforms.cpp \
brw_packed_float.c \
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 90b8789..bd6314a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -775,24 +775,12 @@ fs_visitor::nir_emit_alu(const fs_builder , 
nir_alu_instr *instr)
break;

 case nir_op_fsin:
-  if (!compiler->precise_trig) {
- inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
-  } else {
- fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
- inst = bld.emit(SHADER_OPCODE_SIN, tmp, op[0]);
- inst = bld.MUL(result, tmp, brw_imm_f(0.7));
-  }
+  inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
inst->saturate = instr->dest.saturate;
break;

 case nir_op_fcos:
-  if (!compiler->precise_trig) {
- inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
-  } else {
- fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
- inst = bld.emit(SHADER_OPCODE_COS, tmp, op[0]);
- inst = bld.MUL(result, tmp, brw_imm_f(0.7));
-  }
+  inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
inst->saturate = instr->dest.saturate;
break;

diff --git a/src/mesa/drivers/dri/i965/brw_nir.c 
b/src/mesa/drivers/dri/i965/brw_nir.c
index 1821c0d..932979a 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -447,6 +447,9 @@ brw_preprocess_nir(const struct brw_compiler *compiler, 
nir_shader *nir)
 if (nir->stage == MESA_SHADER_GEOMETRY)
OPT(nir_lower_gs_intrinsics);

+   if (compiler->precise_trig)
+  OPT(brw_nir_apply_trig_workarounds);
+
 static const nir_lower_tex_options tex_options = {
.lower_txp = ~0,
 };
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h 
b/src/mesa/drivers/dri/i965/brw_nir.h
index b10c083..2711606 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -106,6 +106,8 @@ bool brw_nir_apply_attribute_workarounds(nir_shader *nir,
  

[Mesa-dev] [PATCH 4/4] i965: Port INTEL_PRECISE_TRIG=1 to NIR.

2016-04-07 Thread Kenneth Graunke
This makes the extra multiply visible to NIR's algebraic optimizations
(for constant reassociation) as well as constant folding.  This means
that when the result of sin/cos are multiplied by an constant, we can
eliminate the extra multiply altogether, reducing the cost of the
workaround.

It also means we only have to implement it one place, rather than in
both backends.

This makes INTEL_PRECISE_TRIG=1 cost nothing on GPUTest/Volplosion,
which has a ton of sin() calls, but always multiplies them by an
immediate constant.  The extra multiply gets folded away.

Signed-off-by: Kenneth Graunke 
---
 src/mesa/drivers/dri/i965/Makefile.am  |  5 +++
 src/mesa/drivers/dri/i965/Makefile.sources |  1 +
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp   | 16 +---
 src/mesa/drivers/dri/i965/brw_nir.c|  3 ++
 src/mesa/drivers/dri/i965/brw_nir.h|  2 +
 .../drivers/dri/i965/brw_nir_trig_workarounds.py   | 43 ++
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 16 +---
 7 files changed, 58 insertions(+), 28 deletions(-)
 create mode 100755 src/mesa/drivers/dri/i965/brw_nir_trig_workarounds.py

diff --git a/src/mesa/drivers/dri/i965/Makefile.am 
b/src/mesa/drivers/dri/i965/Makefile.am
index 0db5a51..a41c830 100644
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -33,6 +33,7 @@ AM_CFLAGS = \
-I$(top_srcdir)/src/mesa/drivers/dri/common \
-I$(top_srcdir)/src/mesa/drivers/dri/intel/server \
-I$(top_srcdir)/src/gtest/include \
+   -I$(top_srcdir)/src/compiler/nir \
-I$(top_builddir)/src/compiler/nir \
-I$(top_builddir)/src/mesa/drivers/dri/common \
$(DEFINES) \
@@ -41,6 +42,10 @@ AM_CFLAGS = \
 
 AM_CXXFLAGS = $(AM_CFLAGS)
 
+brw_nir_trig_workarounds.c: brw_nir_trig_workarounds.py 
$(top_srcdir)/src/compiler/nir/nir_algebraic.py
+   $(MKDIR_GEN)
+   $(AM_V_GEN) PYTHONPATH=$(top_srcdir)/src/compiler/nir $(PYTHON2) 
$(PYTHON_FLAGS) $(srcdir)/brw_nir_trig_workarounds.py > $@ || ($(RM) $@; false)
+
 noinst_LTLIBRARIES = libi965_dri.la libi965_compiler.la
 libi965_dri_la_SOURCES = $(i965_FILES)
 libi965_dri_la_LIBADD = libi965_compiler.la $(INTEL_LIBS)
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources 
b/src/mesa/drivers/dri/i965/Makefile.sources
index 4689588..2619e43 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -44,6 +44,7 @@ i965_compiler_FILES = \
brw_nir.c \
brw_nir_analyze_boolean_resolves.c \
brw_nir_attribute_workarounds.c \
+   brw_nir_trig_workarounds.c \
brw_nir_opt_peephole_ffma.c \
brw_nir_uniforms.cpp \
brw_packed_float.c \
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 90b8789..bd6314a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -775,24 +775,12 @@ fs_visitor::nir_emit_alu(const fs_builder , 
nir_alu_instr *instr)
   break;
 
case nir_op_fsin:
-  if (!compiler->precise_trig) {
- inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
-  } else {
- fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
- inst = bld.emit(SHADER_OPCODE_SIN, tmp, op[0]);
- inst = bld.MUL(result, tmp, brw_imm_f(0.7));
-  }
+  inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
   inst->saturate = instr->dest.saturate;
   break;
 
case nir_op_fcos:
-  if (!compiler->precise_trig) {
- inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
-  } else {
- fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
- inst = bld.emit(SHADER_OPCODE_COS, tmp, op[0]);
- inst = bld.MUL(result, tmp, brw_imm_f(0.7));
-  }
+  inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
   inst->saturate = instr->dest.saturate;
   break;
 
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c 
b/src/mesa/drivers/dri/i965/brw_nir.c
index 1821c0d..932979a 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -447,6 +447,9 @@ brw_preprocess_nir(const struct brw_compiler *compiler, 
nir_shader *nir)
if (nir->stage == MESA_SHADER_GEOMETRY)
   OPT(nir_lower_gs_intrinsics);
 
+   if (compiler->precise_trig)
+  OPT(brw_nir_apply_trig_workarounds);
+
static const nir_lower_tex_options tex_options = {
   .lower_txp = ~0,
};
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h 
b/src/mesa/drivers/dri/i965/brw_nir.h
index b10c083..2711606 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -106,6 +106,8 @@ bool brw_nir_apply_attribute_workarounds(nir_shader *nir,
  bool use_legacy_snorm_formula,
  const uint8_t *attrib_wa_flags);
 
+bool