[Mesa-dev] [PATCH v4] anv: fix alphaToCoverage when there is no color attachment

2019-05-06 Thread Iago Toral Quiroga
From: Samuel Iglesias Gonsálvez 

There are tests in CTS for alpha to coverage without a color attachment
that are failing. This happens because we remove the shader color
outputs when we don't have a valid color attachment for them, but when
alpha to coverage is enabled we still want to preserve the output
at location 0 since we need the alpha component. In that case we will
also need to create a null render target for RT 0.

v2:
  - We already create a null rt when we don't have any, so reuse that
for this case (Jason)
  - Simplify the code a bit (Iago)

v3:
  - Take alpha to coverage from the key and don't tie this to depth-only
rendering only, we want the same behavior if we have multiple render
targets but the one at location 0 is not used. (Jason).
  - Rewrite commit message (Iago)

v4:
  - Make sure we take into account the array length of the shader outputs,
which we were not handling correctly either, and make sure we also
create null render targets for any invalid array entries too. (Jason)

Fixes the following CTS tests:
dEQP-VK.pipeline.multisample.alpha_to_coverage_no_color_attachment.*

Signed-off-by: Samuel Iglesias Gonsálvez 
Signed-off-by: Iago Toral Quiroga 
---
 src/intel/vulkan/anv_pipeline.c | 56 -
 1 file changed, 42 insertions(+), 14 deletions(-)

diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 20eab548fb2..f15f0896266 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -823,14 +823,24 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler,
  continue;
 
   const unsigned rt = var->data.location - FRAG_RESULT_DATA0;
-  /* Unused or out-of-bounds */
-  if (rt >= MAX_RTS || !(stage->key.wm.color_outputs_valid & (1 << rt)))
+  /* Out-of-bounds */
+  if (rt >= MAX_RTS)
  continue;
 
   const unsigned array_len =
  glsl_type_is_array(var->type) ? glsl_get_length(var->type) : 1;
   assert(rt + array_len <= max_rt);
 
+  /* Unused */
+  if (!(stage->key.wm.color_outputs_valid & BITFIELD_RANGE(rt, 
array_len))) {
+ /* If this is the RT at location 0 and we have alpha to coverage
+  * enabled we will have to create a null RT for it, so mark it as
+  * used.
+  */
+ if (rt > 0 || !stage->key.wm.alpha_to_coverage)
+continue;
+  }
+
   for (unsigned i = 0; i < array_len; i++)
  rt_used[rt + i] = true;
}
@@ -841,11 +851,22 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler,
  continue;
 
   rt_to_bindings[i] = num_rts;
-  rt_bindings[rt_to_bindings[i]] = (struct anv_pipeline_binding) {
- .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS,
- .binding = 0,
- .index = i,
-  };
+
+  if (stage->key.wm.color_outputs_valid & (1 << i)) {
+ rt_bindings[rt_to_bindings[i]] = (struct anv_pipeline_binding) {
+.set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS,
+.binding = 0,
+.index = i,
+ };
+  } else {
+ /* Setup a null render target */
+ rt_bindings[rt_to_bindings[i]] = (struct anv_pipeline_binding) {
+.set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS,
+.binding = 0,
+.index = UINT32_MAX,
+ };
+  }
+
   num_rts++;
}
 
@@ -855,14 +876,21 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler,
  continue;
 
   const unsigned rt = var->data.location - FRAG_RESULT_DATA0;
+  const unsigned array_len =
+ glsl_type_is_array(var->type) ? glsl_get_length(var->type) : 1;
+
   if (rt >= MAX_RTS ||
-  !(stage->key.wm.color_outputs_valid & (1 << rt))) {
- /* Unused or out-of-bounds, throw it away */
- deleted_output = true;
- var->data.mode = nir_var_function_temp;
- exec_node_remove(&var->node);
- exec_list_push_tail(&impl->locals, &var->node);
- continue;
+  !(stage->key.wm.color_outputs_valid & BITFIELD_RANGE(rt, 
array_len))) {
+ /* Unused or out-of-bounds, throw it away, unless it is the first
+  * RT and we have alpha to coverage enabled.
+  */
+ if (rt != 0 || !stage->key.wm.alpha_to_coverage) {
+deleted_output = true;
+var->data.mode = nir_var_function_temp;
+exec_node_remove(&var->node);
+exec_list_push_tail(&impl->locals, &var->node);
+continue;
+ }
   }
 
   /* Give it the new location */
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v3] anv: fix alphaToCoverage when there is no color attachment

2019-05-03 Thread Iago Toral Quiroga
From: Samuel Iglesias Gonsálvez 

There are tests in CTS for alpha to coverage without a color attachment
that are failing. This happens because we remove the shader color
outputs when we don't have a valid color attachment for them, but when
alpha to coverage is enabled we still want to preserve the output
at location 0 since we need its alpha component for alpha to coverage.
In that case we will also need to create a null render target for RT 0.

v2:
  - We already create a null rt when we don't have any, so reuse that
for this case (Jason)
  - Simplify the code a bit (Iago)

v3:
  - Take alpha to coverage from the key and don't tie this to depth-only
rendering only, we want the same behavior if we have multiple render
targets but the one at location 0 is not used. (Jason).
  - Rewrite commit message (Iago)

Fixes the following CTS tests:
dEQP-VK.pipeline.multisample.alpha_to_coverage_no_color_attachment.*

Signed-off-by: Samuel Iglesias Gonsálvez 
Signed-off-by: Iago Toral Quiroga 
---
 src/intel/vulkan/anv_pipeline.c | 48 +
 1 file changed, 37 insertions(+), 11 deletions(-)

diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 20eab548fb2..f379dd2752e 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -818,15 +818,28 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler,
memset(rt_used, 0, sizeof(rt_used));
 
/* Flag used render targets */
+   bool needs_null_rt_for_alpha_to_coverage = false;
nir_foreach_variable_safe(var, &stage->nir->outputs) {
   if (var->data.location < FRAG_RESULT_DATA0)
  continue;
 
   const unsigned rt = var->data.location - FRAG_RESULT_DATA0;
-  /* Unused or out-of-bounds */
-  if (rt >= MAX_RTS || !(stage->key.wm.color_outputs_valid & (1 << rt)))
+  /* Out-of-bounds */
+  if (rt >= MAX_RTS)
  continue;
 
+  /* Unused */
+  if (!(stage->key.wm.color_outputs_valid & (1 << rt))) {
+ /* If this is the RT at location 0 and we have alpha to coverage
+  * enabled, we'll have to create a null render target and it must
+  * be at index 0.
+  */
+ if (rt == 0 && stage->key.wm.alpha_to_coverage)
+needs_null_rt_for_alpha_to_coverage = true;
+
+ continue;
+  }
+
   const unsigned array_len =
  glsl_type_is_array(var->type) ? glsl_get_length(var->type) : 1;
   assert(rt + array_len <= max_rt);
@@ -835,7 +848,12 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler,
  rt_used[rt + i] = true;
}
 
-   /* Set new, compacted, location */
+   /* Make sure we leave the first RT slot available for alpha to coverage
+* if we don't have a valid RT 0.
+*/
+   if (needs_null_rt_for_alpha_to_coverage)
+  num_rts = 1;
+
for (unsigned i = 0; i < max_rt; i++) {
   if (!rt_used[i])
  continue;
@@ -857,11 +875,15 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler,
   const unsigned rt = var->data.location - FRAG_RESULT_DATA0;
   if (rt >= MAX_RTS ||
   !(stage->key.wm.color_outputs_valid & (1 << rt))) {
- /* Unused or out-of-bounds, throw it away */
- deleted_output = true;
- var->data.mode = nir_var_function_temp;
- exec_node_remove(&var->node);
- exec_list_push_tail(&impl->locals, &var->node);
+ /* Unused or out-of-bounds, throw it away, unless it is the first
+  * RT and we have alpha to coverage.
+  */
+ if (rt != 0 || !stage->key.wm.alpha_to_coverage) {
+deleted_output = true;
+var->data.mode = nir_var_function_temp;
+exec_node_remove(&var->node);
+exec_list_push_tail(&impl->locals, &var->node);
+ }
  continue;
   }
 
@@ -873,14 +895,18 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler,
if (deleted_output)
   nir_fixup_deref_modes(stage->nir);
 
-   if (num_rts == 0) {
-  /* If we have no render targets, we need a null render target */
+   /* If we have no render targets or we need to create one for alpha to
+* coverage, we need a null render target.
+*/
+   if (num_rts == 0 || needs_null_rt_for_alpha_to_coverage) {
   rt_bindings[0] = (struct anv_pipeline_binding) {
  .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS,
  .binding = 0,
  .index = UINT32_MAX,
   };
-  num_rts = 1;
+
+  if (num_rts == 0)
+ num_rts = 1;
}
 
/* Now that we've determined the actual number of render targets, adjust
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2] anv: fix alphaToCoverage when there is no color attachment

2019-05-02 Thread Iago Toral Quiroga
From: Samuel Iglesias Gonsálvez 

There are tests in CTS for alpha to coverage without a color attachment.
First the test draws a primitive with alpha 0 and a subpass with only
a depth buffer. No writes to a depth buffer are expected. Then a
second draw with a color buffer and the same depth buffer is done to
verify the depth buffer still has the original clear values.

This behavior is not explicitly forbidden by the Vulkan spec, so
it seems it is allowed.

When there is no color attachment for a given output, we discard it
so at the end we have an FS assembly like:

Native code for unnamed fragment shader (null)
SIMD16 shader: 1 instructions. 0 loops. 4 cycles. 0:0 spills:fills.
Promoted 0 constants. Compacted 16 to 16 bytes (0%)
  START B0 (4 cycles)
sendc(16)   null<1>UW   g120<0,1,0>F0x90031000

render MsgDesc: RT write SIMD16 LastRT Surface = 0 mlen 8 rlen 0 {
align1 1H EOT };

As g120 is not initialized, we see random writes to the depth buffer
due to the alphaToCoverage enablement. This patch fixes that by
keeping the output in that case.

v2:
 - No need to create a null render target, the driver is already
   doing that (Jason)
 - Simplified code a bit (Iago)

Fixes the following CTS tests:
dEQP-VK.pipeline.multisample.alpha_to_coverage_no_color_attachment.*

Signed-off-by: Samuel Iglesias Gonsálvez 
Signed-off-by: Iago Toral Quiroga 
---
 src/intel/vulkan/anv_pipeline.c | 25 ++---
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index b9c9bfd7598..07f1a939e43 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -808,7 +808,9 @@ anv_pipeline_compile_gs(const struct brw_compiler *compiler,
 
 static void
 anv_pipeline_link_fs(const struct brw_compiler *compiler,
- struct anv_pipeline_stage *stage)
+ struct anv_pipeline_stage *stage,
+ bool has_depth_stencil_att,
+ bool has_alpha_to_coverage)
 {
unsigned num_rts = 0;
const int max_rt = FRAG_RESULT_DATA7 - FRAG_RESULT_DATA0 + 1;
@@ -859,11 +861,17 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler,
   const unsigned rt = var->data.location - FRAG_RESULT_DATA0;
   if (rt >= MAX_RTS ||
   !(stage->key.wm.color_outputs_valid & (1 << rt))) {
- /* Unused or out-of-bounds, throw it away */
- deleted_output = true;
- var->data.mode = nir_var_function_temp;
- exec_node_remove(&var->node);
- exec_list_push_tail(&impl->locals, &var->node);
+ /* Unused or out-of-bounds, throw it away. The exception is depth-only
+  * rendering with alphaToCoverage, as in this case we need to keep the
+  * fragment output in location 0, which we will bind later to a null
+  * render target.
+  */
+ if (rt != 0 || !has_alpha_to_coverage || !has_depth_stencil_att) {
+deleted_output = true;
+var->data.mode = nir_var_function_temp;
+exec_node_remove(&var->node);
+exec_list_push_tail(&impl->locals, &var->node);
+ }
  continue;
   }
 
@@ -1120,7 +1128,10 @@ anv_pipeline_compile_graphics(struct anv_pipeline 
*pipeline,
  anv_pipeline_link_gs(compiler, &stages[s], next_stage);
  break;
   case MESA_SHADER_FRAGMENT:
- anv_pipeline_link_fs(compiler, &stages[s]);
+ anv_pipeline_link_fs(compiler, &stages[s],
+  pipeline->subpass->depth_stencil_attachment,
+  info->pMultisampleState &&
+  info->pMultisampleState->alphaToCoverageEnable);
  break;
   default:
  unreachable("Invalid graphics shader stage");
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] anv/query: ensure visibility of reset before copying query results

2019-04-30 Thread Iago Toral Quiroga
Specifically, vkCmdCopyQueryPoolResults is required to see the effect
of a previous vkCmdResetQueryPool. This may not work currently when
query execution is still ongoing, as some of the queries may become
available asynchronously after the reset.

Fixes new CTS tests:
dEQP-VK.query_pool.statistics_query.reset_before_copy.*
---

Jason, do you have any better ideas?

 src/intel/vulkan/genX_query.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c
index 146435c3f8f..08b013f6351 100644
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -383,6 +383,19 @@ void genX(CmdResetQueryPool)(
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
 
+   /* From the Vulkan spec:
+*
+*"vkCmdCopyQueryPoolResults is guaranteed to see the effect of
+* previous uses of vkCmdResetQueryPool in the same queue, without
+* any additional synchronization. Thus, the results will always
+* reflect the most recent use of the query."
+*
+* So we need to make sure that any on-going queries are finished by
+* the time we emit the reset.
+*/
+   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
for (uint32_t i = 0; i < queryCount; i++) {
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
  sdm.Address = anv_query_address(pool, firstQuery + i);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 3/3] intel/compiler: implement more algebraic optimizations

2019-03-04 Thread Iago Toral Quiroga
Now that we propagate constants to the first source of 2src instructions we
see more opportunities for constant folding in the backend.

v2:
 - The hardware only uses 5 bits (or 6 bits for Q/UQ) from the shift
   count parameter in SHL/SHR instructions, so do the same in constant
   propagation (Ian)
---
 src/intel/compiler/brw_fs.cpp | 203 --
 1 file changed, 195 insertions(+), 8 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index bd588b55bde..f09320b4127 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2583,9 +2583,55 @@ fs_visitor::opt_algebraic()
  break;
 
   case BRW_OPCODE_MUL:
- if (inst->src[1].file != IMM)
+ if (inst->src[0].file != IMM && inst->src[1].file != IMM)
 continue;
 
+ /* Constant folding */
+ if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
+assert(inst->src[0].type == inst->src[1].type);
+bool local_progress = true;
+switch (inst->src[0].type) {
+case BRW_REGISTER_TYPE_HF: {
+   float v1 = _mesa_half_to_float(inst->src[0].ud & 0xu);
+   float v2 = _mesa_half_to_float(inst->src[1].ud & 0xu);
+   inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 * v2));
+   break;
+}
+case BRW_REGISTER_TYPE_W: {
+   int16_t v1 = inst->src[0].ud & 0xu;
+   int16_t v2 = inst->src[1].ud & 0xu;
+   inst->src[0] = brw_imm_w(v1 * v2);
+   break;
+}
+case BRW_REGISTER_TYPE_UW: {
+   uint16_t v1 = inst->src[0].ud & 0xu;
+   uint16_t v2 = inst->src[1].ud & 0xu;
+   inst->src[0] = brw_imm_uw(v1 * v2);
+   break;
+}
+case BRW_REGISTER_TYPE_F:
+   inst->src[0].f *= inst->src[1].f;
+   break;
+case BRW_REGISTER_TYPE_D:
+   inst->src[0].d *= inst->src[1].d;
+   break;
+case BRW_REGISTER_TYPE_UD:
+   inst->src[0].ud *= inst->src[1].ud;
+   break;
+default:
+   local_progress = false;
+   break;
+};
+
+if (local_progress) {
+   inst->opcode = BRW_OPCODE_MOV;
+   inst->src[1] = reg_undef;
+   progress = true;
+   break;
+}
+ }
+
+
  /* a * 1.0 = a */
  if (inst->src[1].is_one()) {
 inst->opcode = BRW_OPCODE_MOV;
@@ -2594,6 +2640,14 @@ fs_visitor::opt_algebraic()
 break;
  }
 
+ if (inst->src[0].is_one()) {
+inst->opcode = BRW_OPCODE_MOV;
+inst->src[0] = inst->src[1];
+inst->src[1] = reg_undef;
+progress = true;
+break;
+ }
+
  /* a * -1.0 = -a */
  if (inst->src[1].is_negative_one()) {
 inst->opcode = BRW_OPCODE_MOV;
@@ -2603,27 +2657,160 @@ fs_visitor::opt_algebraic()
 break;
  }
 
- if (inst->src[0].file == IMM &&
- inst->src[0].type == BRW_REGISTER_TYPE_F) {
+ if (inst->src[0].is_negative_one()) {
+inst->opcode = BRW_OPCODE_MOV;
+inst->src[0] = inst->src[1];
+inst->src[0].negate = !inst->src[1].negate;
+inst->src[1] = reg_undef;
+progress = true;
+break;
+ }
+
+ /* a * 0 = 0 (this is not exact for floating point) */
+ if (inst->src[1].is_zero() &&
+ brw_reg_type_is_integer(inst->src[1].type)) {
+inst->opcode = BRW_OPCODE_MOV;
+inst->src[0] = inst->src[1];
+inst->src[1] = reg_undef;
+progress = true;
+break;
+ }
+
+ if (inst->src[0].is_zero() &&
+ brw_reg_type_is_integer(inst->src[0].type)) {
 inst->opcode = BRW_OPCODE_MOV;
-inst->src[0].f *= inst->src[1].f;
 inst->src[1] = reg_undef;
 progress = true;
 break;
  }
  break;
   case BRW_OPCODE_ADD:
- if (inst->src[1].file != IMM)
+ if (inst->src[0].file != IMM && inst->src[1].file != IMM)
 continue;
 
- if (inst->src[0].file == IMM &&
- inst->src[0].type == BRW_REGISTER_TYPE_F) {
+ /* Constant folding */
+ if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
+assert(inst->src[0].type == inst->src[1].type);
+bool local_progress = true;
+switch (inst->src[0].type) {
+case BRW_REGISTER_TYPE_HF: {
+   float v1 = _mesa_half_to_float(inst->src[0].ud & 0xu);
+   float v2 = _mesa_half_to_float(inst->src[1].ud & 0xu);
+   inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 + v2));
+ 

[Mesa-dev] [PATCH v2 1/3] intel/compiler: allow constant propagation for int quotient and remainder

2019-03-04 Thread Iago Toral Quiroga
And let combine constants promote the constants if needed.
---
 src/intel/compiler/brw_fs_combine_constants.cpp | 2 ++
 src/intel/compiler/brw_fs_copy_propagation.cpp  | 4 
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp 
b/src/intel/compiler/brw_fs_combine_constants.cpp
index db7b14a8312..a26eb67a00a 100644
--- a/src/intel/compiler/brw_fs_combine_constants.cpp
+++ b/src/intel/compiler/brw_fs_combine_constants.cpp
@@ -69,6 +69,8 @@ static bool
 must_promote_imm(const struct gen_device_info *devinfo, const fs_inst *inst)
 {
switch (inst->opcode) {
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
case SHADER_OPCODE_POW:
   return devinfo->gen < 8;
case BRW_OPCODE_MAD:
diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp 
b/src/intel/compiler/brw_fs_copy_propagation.cpp
index 8b904ab356b..c11b05b128a 100644
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -611,10 +611,6 @@ fs_visitor::try_constant_propagate(fs_inst *inst, 
acp_entry *entry)
 
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
- /* FINISHME: Promote non-float constants and remove this. */
- if (devinfo->gen < 8)
-break;
- /* fallthrough */
   case SHADER_OPCODE_POW:
  /* Allow constant propagation into src1 (except on Gen 6 which
   * doesn't support scalar source math), and let constant combining
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 2/3] intel/compiler: allow constant propagation to first source of 2src instructions

2019-03-04 Thread Iago Toral Quiroga
Even if it is not supported by the hardware, we will fix it up
in the combine constants pass.

v2:
 - This will enable new constant folding opportunities in the algebraic pass
   for MUL or ADD with types other than F, so do not assert on that type.
   For now we just skip anything that is not float and a later patch will
   expand the algebraic pass to support more constant folding scenarios.
---
 src/intel/compiler/brw_fs.cpp |  8 +--
 .../compiler/brw_fs_combine_constants.cpp | 37 ++---
 .../compiler/brw_fs_copy_propagation.cpp  | 55 +--
 3 files changed, 60 insertions(+), 40 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 2358acbeb59..bd588b55bde 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2603,8 +2603,8 @@ fs_visitor::opt_algebraic()
 break;
  }
 
- if (inst->src[0].file == IMM) {
-assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
+ if (inst->src[0].file == IMM &&
+ inst->src[0].type == BRW_REGISTER_TYPE_F) {
 inst->opcode = BRW_OPCODE_MOV;
 inst->src[0].f *= inst->src[1].f;
 inst->src[1] = reg_undef;
@@ -2616,8 +2616,8 @@ fs_visitor::opt_algebraic()
  if (inst->src[1].file != IMM)
 continue;
 
- if (inst->src[0].file == IMM) {
-assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
+ if (inst->src[0].file == IMM &&
+ inst->src[0].type == BRW_REGISTER_TYPE_F) {
 inst->opcode = BRW_OPCODE_MOV;
 inst->src[0].f += inst->src[1].f;
 inst->src[1] = reg_undef;
diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp 
b/src/intel/compiler/brw_fs_combine_constants.cpp
index a26eb67a00a..bb155caeccf 100644
--- a/src/intel/compiler/brw_fs_combine_constants.cpp
+++ b/src/intel/compiler/brw_fs_combine_constants.cpp
@@ -66,13 +66,31 @@ could_coissue(const struct gen_device_info *devinfo, const 
fs_inst *inst)
  * Returns true for instructions that don't support immediate sources.
  */
 static bool
-must_promote_imm(const struct gen_device_info *devinfo, const fs_inst *inst)
+must_promote_imm(const struct gen_device_info *devinfo,
+ const fs_inst *inst, const int src_idx)
 {
switch (inst->opcode) {
case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
case SHADER_OPCODE_POW:
-  return devinfo->gen < 8;
+  return src_idx != 1 || devinfo->gen < 8;
+   case BRW_OPCODE_BFI1:
+   case BRW_OPCODE_ASR:
+   case BRW_OPCODE_SHL:
+   case BRW_OPCODE_SHR:
+   case BRW_OPCODE_SUBB:
+   case BRW_OPCODE_MACH:
+   case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_OR:
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_XOR:
+   case BRW_OPCODE_ADDC:
+   case BRW_OPCODE_CMP:
+   case BRW_OPCODE_IF:
+   case BRW_OPCODE_SEL:
+  return src_idx != 1;
case BRW_OPCODE_MAD:
case BRW_OPCODE_LRP:
   return true;
@@ -335,13 +353,18 @@ fs_visitor::opt_combine_constants()
foreach_block_and_inst(block, fs_inst, inst, cfg) {
   ip++;
 
-  if (!could_coissue(devinfo, inst) && !must_promote_imm(devinfo, inst))
+  const bool is_coissue_candidate = could_coissue(devinfo, inst);
+  if (!is_coissue_candidate && !must_promote_imm(devinfo, inst, -1))
  continue;
 
   for (int i = 0; i < inst->sources; i++) {
  if (inst->src[i].file != IMM)
 continue;
 
+ const bool must_promote = must_promote_imm(devinfo, inst, i);
+ if (!is_coissue_candidate && !must_promote)
+continue;
+
  char data[8];
  brw_reg_type type;
  if (!get_constant_value(devinfo, inst, i, data, &type))
@@ -357,8 +380,8 @@ fs_visitor::opt_combine_constants()
imm->inst = NULL;
 imm->block = intersection;
 imm->uses->push_tail(link(const_ctx, &inst->src[i]));
-imm->uses_by_coissue += could_coissue(devinfo, inst);
-imm->must_promote = imm->must_promote || must_promote_imm(devinfo, 
inst);
+imm->uses_by_coissue += is_coissue_candidate;
+imm->must_promote = imm->must_promote || must_promote;
 imm->last_use_ip = ip;
 if (type == BRW_REGISTER_TYPE_HF)
imm->is_half_float = true;
@@ -371,8 +394,8 @@ fs_visitor::opt_combine_constants()
 memcpy(imm->bytes, data, size);
 imm->size = size;
 imm->is_half_float = type == BRW_REGISTER_TYPE_HF;
-imm->uses_by_coissue = could_coissue(devinfo, inst);
-imm->must_promote = must_promote_imm(devinfo, inst);
+imm->uses_by_coissue = is_coissue_candidate;
+imm->must_promote = must_promote;
 imm->first_use_ip = ip;
 imm->last_use_ip = ip;
  }
diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp 
b

[Mesa-dev] [PATCH v2 0/3] intel: propagate constants to first source of 2-src instructions

2019-03-04 Thread Iago Toral Quiroga
This v2 addresses comments on v1; specifically, constant folding of SHR/SHL
instructions now only takes up to 5 bits from the shift count parameter, just
like the hardware does.

Also, patch 2 has been fixed to avoid hitting assertions that would then be
fixed by patch 3. I sent the series up to patch 2 to Jenkins and verified that
it came back green.

Iago Toral Quiroga (3):
  intel/compiler: allow constant propagation for int quotient and
    remainder
  intel/compiler: allow constant propagation to first source of 2src
instructions
  intel/compiler: implement more algebraic optimizations

 src/intel/compiler/brw_fs.cpp | 203 +-
 .../compiler/brw_fs_combine_constants.cpp |  39 +++-
 .../compiler/brw_fs_copy_propagation.cpp  |  59 +++--
 3 files changed, 253 insertions(+), 48 deletions(-)

-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 2/3] intel/compiler: allow constant propagation to first source of 2-src instructions

2019-02-27 Thread Iago Toral Quiroga
Even if it is not supported by the hardware, we will fix it up
in the combine constants pass.
---
 .../compiler/brw_fs_combine_constants.cpp | 37 ++---
 .../compiler/brw_fs_copy_propagation.cpp  | 55 +--
 2 files changed, 56 insertions(+), 36 deletions(-)

diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp 
b/src/intel/compiler/brw_fs_combine_constants.cpp
index a26eb67a00a..bb155caeccf 100644
--- a/src/intel/compiler/brw_fs_combine_constants.cpp
+++ b/src/intel/compiler/brw_fs_combine_constants.cpp
@@ -66,13 +66,31 @@ could_coissue(const struct gen_device_info *devinfo, const 
fs_inst *inst)
  * Returns true for instructions that don't support immediate sources.
  */
 static bool
-must_promote_imm(const struct gen_device_info *devinfo, const fs_inst *inst)
+must_promote_imm(const struct gen_device_info *devinfo,
+ const fs_inst *inst, const int src_idx)
 {
switch (inst->opcode) {
case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
case SHADER_OPCODE_POW:
-  return devinfo->gen < 8;
+  return src_idx != 1 || devinfo->gen < 8;
+   case BRW_OPCODE_BFI1:
+   case BRW_OPCODE_ASR:
+   case BRW_OPCODE_SHL:
+   case BRW_OPCODE_SHR:
+   case BRW_OPCODE_SUBB:
+   case BRW_OPCODE_MACH:
+   case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_OR:
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_XOR:
+   case BRW_OPCODE_ADDC:
+   case BRW_OPCODE_CMP:
+   case BRW_OPCODE_IF:
+   case BRW_OPCODE_SEL:
+  return src_idx != 1;
case BRW_OPCODE_MAD:
case BRW_OPCODE_LRP:
   return true;
@@ -335,13 +353,18 @@ fs_visitor::opt_combine_constants()
foreach_block_and_inst(block, fs_inst, inst, cfg) {
   ip++;
 
-  if (!could_coissue(devinfo, inst) && !must_promote_imm(devinfo, inst))
+  const bool is_coissue_candidate = could_coissue(devinfo, inst);
+  if (!is_coissue_candidate && !must_promote_imm(devinfo, inst, -1))
  continue;
 
   for (int i = 0; i < inst->sources; i++) {
  if (inst->src[i].file != IMM)
 continue;
 
+ const bool must_promote = must_promote_imm(devinfo, inst, i);
+ if (!is_coissue_candidate && !must_promote)
+continue;
+
  char data[8];
  brw_reg_type type;
  if (!get_constant_value(devinfo, inst, i, data, &type))
@@ -357,8 +380,8 @@ fs_visitor::opt_combine_constants()
imm->inst = NULL;
 imm->block = intersection;
 imm->uses->push_tail(link(const_ctx, &inst->src[i]));
-imm->uses_by_coissue += could_coissue(devinfo, inst);
-imm->must_promote = imm->must_promote || must_promote_imm(devinfo, 
inst);
+imm->uses_by_coissue += is_coissue_candidate;
+imm->must_promote = imm->must_promote || must_promote;
 imm->last_use_ip = ip;
 if (type == BRW_REGISTER_TYPE_HF)
imm->is_half_float = true;
@@ -371,8 +394,8 @@ fs_visitor::opt_combine_constants()
 memcpy(imm->bytes, data, size);
 imm->size = size;
 imm->is_half_float = type == BRW_REGISTER_TYPE_HF;
-imm->uses_by_coissue = could_coissue(devinfo, inst);
-imm->must_promote = must_promote_imm(devinfo, inst);
+imm->uses_by_coissue = is_coissue_candidate;
+imm->must_promote = must_promote;
 imm->first_use_ip = ip;
 imm->last_use_ip = ip;
  }
diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp 
b/src/intel/compiler/brw_fs_copy_propagation.cpp
index c11b05b128a..33454e50861 100644
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -624,10 +624,8 @@ fs_visitor::try_constant_propagate(fs_inst *inst, 
acp_entry *entry)
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SUBB:
- if (i == 1) {
-inst->src[i] = val;
-progress = true;
- }
+ inst->src[i] = val;
+ progress = true;
  break;
 
   case BRW_OPCODE_MACH:
@@ -638,10 +636,7 @@ fs_visitor::try_constant_propagate(fs_inst *inst, 
acp_entry *entry)
   case BRW_OPCODE_AND:
   case BRW_OPCODE_XOR:
   case BRW_OPCODE_ADDC:
- if (i == 1) {
-inst->src[i] = val;
-progress = true;
- } else if (i == 0 && inst->src[1].file != IMM) {
+ if (i == 0 && inst->src[1].file != IMM) {
 /* Fit this constant in by commuting the operands.
  * Exception: we can't do this for 32-bit integer MUL/MACH
  * because it's asymmetric.
@@ -653,24 +648,25 @@ fs_visitor::try_constant_propagate(fs_inst *inst, 
acp_entry *entry)
  * Integer MUL with a non-accumulator destination will be lowered
  * by lower_integer_multiplication(), so don't restrict it.
  */
-if (((inst->opcode 

[Mesa-dev] [PATCH 3/3] intel/compiler: implement more algebraic optimizations

2019-02-27 Thread Iago Toral Quiroga
Now that we propagate constants to the first source of 2src instructions we
see more opportunities for constant folding in the backend.

Shader-db results on KBL:

total instructions in shared programs: 14965607 -> 14855983 (-0.73%)
instructions in affected programs: 3988102 -> 3878478 (-2.75%)
helped: 14292
HURT: 59

total cycles in shared programs: 344324295 -> 340656008 (-1.07%)
cycles in affected programs: 247527740 -> 243859453 (-1.48%)
helped: 14056
HURT: 3314

total loops in shared programs: 4283 -> 4283 (0.00%)
loops in affected programs: 0 -> 0
helped: 0
HURT: 0

total spills in shared programs: 27812 -> 24350 (-12.45%)
spills in affected programs: 24921 -> 21459 (-13.89%)
helped: 345
HURT: 19

total fills in shared programs: 24173 -> 22032 (-8.86%)
fills in affected programs: 21124 -> 18983 (-10.14%)
helped: 355
HURT: 25

LOST:   0
GAINED: 5
---
 src/intel/compiler/brw_fs.cpp | 203 --
 1 file changed, 195 insertions(+), 8 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 2358acbeb59..b2b60237c82 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2583,9 +2583,55 @@ fs_visitor::opt_algebraic()
  break;
 
   case BRW_OPCODE_MUL:
- if (inst->src[1].file != IMM)
+ if (inst->src[0].file != IMM && inst->src[1].file != IMM)
 continue;
 
+ /* Constant folding */
+ if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
+assert(inst->src[0].type == inst->src[1].type);
+bool local_progress = true;
+switch (inst->src[0].type) {
+case BRW_REGISTER_TYPE_HF: {
+   float v1 = _mesa_half_to_float(inst->src[0].ud & 0xu);
+   float v2 = _mesa_half_to_float(inst->src[1].ud & 0xu);
+   inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 * v2));
+   break;
+}
+case BRW_REGISTER_TYPE_W: {
+   int16_t v1 = inst->src[0].ud & 0xu;
+   int16_t v2 = inst->src[1].ud & 0xu;
+   inst->src[0] = brw_imm_w(v1 * v2);
+   break;
+}
+case BRW_REGISTER_TYPE_UW: {
+   uint16_t v1 = inst->src[0].ud & 0xu;
+   uint16_t v2 = inst->src[1].ud & 0xu;
+   inst->src[0] = brw_imm_uw(v1 * v2);
+   break;
+}
+case BRW_REGISTER_TYPE_F:
+   inst->src[0].f *= inst->src[1].f;
+   break;
+case BRW_REGISTER_TYPE_D:
+   inst->src[0].d *= inst->src[1].d;
+   break;
+case BRW_REGISTER_TYPE_UD:
+   inst->src[0].ud *= inst->src[1].ud;
+   break;
+default:
+   local_progress = false;
+   break;
+};
+
+if (local_progress) {
+   inst->opcode = BRW_OPCODE_MOV;
+   inst->src[1] = reg_undef;
+   progress = true;
+   break;
+}
+ }
+
+
  /* a * 1.0 = a */
  if (inst->src[1].is_one()) {
 inst->opcode = BRW_OPCODE_MOV;
@@ -2594,6 +2640,14 @@ fs_visitor::opt_algebraic()
 break;
  }
 
+ if (inst->src[0].is_one()) {
+inst->opcode = BRW_OPCODE_MOV;
+inst->src[0] = inst->src[1];
+inst->src[1] = reg_undef;
+progress = true;
+break;
+ }
+
  /* a * -1.0 = -a */
  if (inst->src[1].is_negative_one()) {
 inst->opcode = BRW_OPCODE_MOV;
@@ -2603,27 +2657,160 @@ fs_visitor::opt_algebraic()
 break;
  }
 
- if (inst->src[0].file == IMM) {
-assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
+ if (inst->src[0].is_negative_one()) {
+inst->opcode = BRW_OPCODE_MOV;
+inst->src[0] = inst->src[1];
+inst->src[0].negate = !inst->src[1].negate;
+inst->src[1] = reg_undef;
+progress = true;
+break;
+ }
+
+ /* a * 0 = 0 (this is not exact for floating point) */
+ if (inst->src[1].is_zero() &&
+ brw_reg_type_is_integer(inst->src[1].type)) {
+inst->opcode = BRW_OPCODE_MOV;
+inst->src[0] = inst->src[1];
+inst->src[1] = reg_undef;
+progress = true;
+break;
+ }
+
+ if (inst->src[0].is_zero() &&
+ brw_reg_type_is_integer(inst->src[0].type)) {
 inst->opcode = BRW_OPCODE_MOV;
-inst->src[0].f *= inst->src[1].f;
 inst->src[1] = reg_undef;
 progress = true;
 break;
  }
  break;
   case BRW_OPCODE_ADD:
- if (inst->src[1].file != IMM)
+ if (inst->src[0].file != IMM && inst->src[1].file != IMM)
 continue;
 
- if (inst->src[0].file == IMM) {
-   

[Mesa-dev] [PATCH 1/3] intel/compiler: allow constant propagation for int quotient and remainder

2019-02-27 Thread Iago Toral Quiroga
And let combine constants promote the constants if needed.
---
 src/intel/compiler/brw_fs_combine_constants.cpp | 2 ++
 src/intel/compiler/brw_fs_copy_propagation.cpp  | 4 
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp 
b/src/intel/compiler/brw_fs_combine_constants.cpp
index db7b14a8312..a26eb67a00a 100644
--- a/src/intel/compiler/brw_fs_combine_constants.cpp
+++ b/src/intel/compiler/brw_fs_combine_constants.cpp
@@ -69,6 +69,8 @@ static bool
 must_promote_imm(const struct gen_device_info *devinfo, const fs_inst *inst)
 {
switch (inst->opcode) {
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
case SHADER_OPCODE_POW:
   return devinfo->gen < 8;
case BRW_OPCODE_MAD:
diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp 
b/src/intel/compiler/brw_fs_copy_propagation.cpp
index 8b904ab356b..c11b05b128a 100644
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -611,10 +611,6 @@ fs_visitor::try_constant_propagate(fs_inst *inst, 
acp_entry *entry)
 
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
- /* FINISHME: Promote non-float constants and remove this. */
- if (devinfo->gen < 8)
-break;
- /* fallthrough */
   case SHADER_OPCODE_POW:
  /* Allow constant propagation into src1 (except on Gen 6 which
   * doesn't support scalar source math), and let constant combining
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 0/3] intel: propagate constants to first source of 2-src instructions

2019-02-27 Thread Iago Toral Quiroga
This little series lives on top of my VK_KHR_shader_float16_int8 branch, since
it depends on having a more flexible combine constants pass, which is included
with that work.

A branch containing that work with this series on top is available here:
https://github.com/Igalia/mesa/tree/itoral/VK_KHR_shader_float16_int8_combine_constants

Shader-db results on KBL:

total instructions in shared programs: 14965607 -> 14855983 (-0.73%)
instructions in affected programs: 3988102 -> 3878478 (-2.75%)
helped: 14292
HURT: 59

total cycles in shared programs: 344324295 -> 340656008 (-1.07%)
cycles in affected programs: 247527740 -> 243859453 (-1.48%)
helped: 14056
HURT: 3314

total loops in shared programs: 4283 -> 4283 (0.00%)
loops in affected programs: 0 -> 0
helped: 0
HURT: 0

total spills in shared programs: 27812 -> 24350 (-12.45%)
spills in affected programs: 24921 -> 21459 (-13.89%)
helped: 345
HURT: 19

total fills in shared programs: 24173 -> 22032 (-8.86%)
fills in affected programs: 21124 -> 18983 (-10.14%)
helped: 355
HURT: 25

LOST:   0
GAINED: 5

Initially the series included propagation to BFE, BFI2 and BFREV, but that
actually led to significantly worse shader-db results, so that part has been
removed.

Iago

Iago Toral Quiroga (3):
  intel/compiler: allow constant propagation for int quotient and
remainder
  intel/compiler: allow constant propagation to first source of 2-src
instructions
  intel/compiler: implement more algebraic optimizations

 src/intel/compiler/brw_fs.cpp | 203 +-
 .../compiler/brw_fs_combine_constants.cpp |  39 +++-
 .../compiler/brw_fs_copy_propagation.cpp  |  59 +++--
 3 files changed, 253 insertions(+), 48 deletions(-)

-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v5 33/40] intel/compiler: also set F execution type for mixed float mode in BDW

2019-02-27 Thread Iago Toral Quiroga
The section 'Execution Data Types' of 3D Media GPGPU volume, which
describes execution types, is exactly the same in BDW and SKL+.

Also, this section states that there is a single execution type, so it
makes sense that this is the wider of the two floating point types
involved in mixed float mode, which is what we do for SKL+ and CHV.

v2:
 - Make sure we also account for the destination type in mixed mode (Curro).
---
 src/intel/compiler/brw_eu_validate.c | 39 +---
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/src/intel/compiler/brw_eu_validate.c 
b/src/intel/compiler/brw_eu_validate.c
index 358a0347a93..e0010f0fb07 100644
--- a/src/intel/compiler/brw_eu_validate.c
+++ b/src/intel/compiler/brw_eu_validate.c
@@ -348,6 +348,17 @@ is_unsupported_inst(const struct gen_device_info *devinfo,
return brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst)) == NULL;
 }
 
+/**
+ * Returns whether a combination of two types would qualify as mixed float
+ * operation mode
+ */
+static inline bool
+types_are_mixed_float(enum brw_reg_type t0, enum brw_reg_type t1)
+{
+   return (t0 == BRW_REGISTER_TYPE_F && t1 == BRW_REGISTER_TYPE_HF) ||
+  (t1 == BRW_REGISTER_TYPE_F && t0 == BRW_REGISTER_TYPE_HF);
+}
+
 static enum brw_reg_type
 execution_type_for_type(enum brw_reg_type type)
 {
@@ -390,20 +401,24 @@ execution_type(const struct gen_device_info *devinfo, 
const brw_inst *inst)
enum brw_reg_type src0_exec_type, src1_exec_type;
 
/* Execution data type is independent of destination data type, except in
-* mixed F/HF instructions on CHV and SKL+.
+* mixed F/HF instructions.
 */
enum brw_reg_type dst_exec_type = brw_inst_dst_type(devinfo, inst);
 
src0_exec_type = execution_type_for_type(brw_inst_src0_type(devinfo, inst));
if (num_sources == 1) {
-  if ((devinfo->gen >= 9 || devinfo->is_cherryview) &&
-  src0_exec_type == BRW_REGISTER_TYPE_HF) {
+  if (src0_exec_type == BRW_REGISTER_TYPE_HF)
  return dst_exec_type;
-  }
   return src0_exec_type;
}
 
src1_exec_type = execution_type_for_type(brw_inst_src1_type(devinfo, inst));
+   if (types_are_mixed_float(src0_exec_type, src1_exec_type) ||
+   types_are_mixed_float(src0_exec_type, dst_exec_type) ||
+   types_are_mixed_float(src1_exec_type, dst_exec_type)) {
+  return BRW_REGISTER_TYPE_F;
+   }
+
if (src0_exec_type == src1_exec_type)
   return src0_exec_type;
 
@@ -431,18 +446,12 @@ execution_type(const struct gen_device_info *devinfo, 
const brw_inst *inst)
src1_exec_type == BRW_REGISTER_TYPE_DF)
   return BRW_REGISTER_TYPE_DF;
 
-   if (devinfo->gen >= 9 || devinfo->is_cherryview) {
-  if (dst_exec_type == BRW_REGISTER_TYPE_F ||
-  src0_exec_type == BRW_REGISTER_TYPE_F ||
-  src1_exec_type == BRW_REGISTER_TYPE_F) {
- return BRW_REGISTER_TYPE_F;
-  } else {
- return BRW_REGISTER_TYPE_HF;
-  }
-   }
+   if (src0_exec_type == BRW_REGISTER_TYPE_F ||
+   src1_exec_type == BRW_REGISTER_TYPE_F)
+  return BRW_REGISTER_TYPE_F;
 
-   assert(src0_exec_type == BRW_REGISTER_TYPE_F);
-   return BRW_REGISTER_TYPE_F;
+   assert(src0_exec_type == BRW_REGISTER_TYPE_HF);
+   return BRW_REGISTER_TYPE_HF;
 }
 
 /**
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v5 02/40] intel/compiler: add a NIR pass to lower conversions

2019-02-15 Thread Iago Toral Quiroga
Some conversions are not directly supported in hardware and need to be
split in two conversion instructions going through an intermediary type.
Doing this at the NIR level reduces the complexity in the backend a bit.

v2:
 - Consider fp16 rounding conversion opcodes
 - Properly handle swizzles on conversion sources.

v3:
 - Run the pass earlier, right after nir_opt_algebraic_late (Jason)
 - NIR alu output types already have the bit-size (Jason)
 - Use 'is_conversion' to identify conversion operations (Jason)

v4:
 - Be careful about the intermediate types we use so we don't lose
   range and avoid incorrect rounding semantics (Jason)

Reviewed-by: Topi Pohjolainen  (v1)
---
 src/intel/Makefile.sources|   1 +
 src/intel/compiler/brw_nir.c  |   2 +
 src/intel/compiler/brw_nir.h  |   2 +
 .../compiler/brw_nir_lower_conversions.c  | 169 ++
 src/intel/compiler/meson.build|   1 +
 5 files changed, 175 insertions(+)
 create mode 100644 src/intel/compiler/brw_nir_lower_conversions.c

diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
index 94a28d370e8..9975daa3ad1 100644
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -83,6 +83,7 @@ COMPILER_FILES = \
compiler/brw_nir_analyze_boolean_resolves.c \
compiler/brw_nir_analyze_ubo_ranges.c \
compiler/brw_nir_attribute_workarounds.c \
+   compiler/brw_nir_lower_conversions.c \
compiler/brw_nir_lower_cs_intrinsics.c \
compiler/brw_nir_lower_image_load_store.c \
compiler/brw_nir_lower_mem_access_bit_sizes.c \
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 9dbf06004a4..7e3dbc9e447 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -876,6 +876,8 @@ brw_postprocess_nir(nir_shader *nir, const struct 
brw_compiler *compiler,
 
OPT(nir_opt_algebraic_late);
 
+   OPT(brw_nir_lower_conversions);
+
OPT(nir_lower_to_source_mods, nir_lower_all_source_mods);
OPT(nir_copy_prop);
OPT(nir_opt_dce);
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index bc81950d47e..662b2627e95 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -114,6 +114,8 @@ void brw_nir_lower_tcs_outputs(nir_shader *nir, const 
struct brw_vue_map *vue,
GLenum tes_primitive_mode);
 void brw_nir_lower_fs_outputs(nir_shader *nir);
 
+bool brw_nir_lower_conversions(nir_shader *nir);
+
 bool brw_nir_lower_image_load_store(nir_shader *nir,
 const struct gen_device_info *devinfo);
 void brw_nir_rewrite_image_intrinsic(nir_intrinsic_instr *intrin,
diff --git a/src/intel/compiler/brw_nir_lower_conversions.c 
b/src/intel/compiler/brw_nir_lower_conversions.c
new file mode 100644
index 00000000000..9aff30b568b
--- /dev/null
+++ b/src/intel/compiler/brw_nir_lower_conversions.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+static nir_op
+get_conversion_op(nir_alu_type src_type,
+  unsigned src_bit_size,
+  nir_alu_type dst_type,
+  unsigned dst_bit_size,
+  nir_rounding_mode rounding_mode)
+{
+   nir_alu_type src_full_type = (nir_alu_type) (src_type | src_bit_size);
+   nir_alu_type dst_full_type = (nir_alu_type) (dst_type | dst_bit_size);
+
+   return nir_type_conversion_op(src_full_type, dst_full_type, rounding_mode);
+}
+
+static nir_rounding_mode
+get_opcode_rounding_mode(nir_op op)
+{
+   switch (op) {
+   case nir_op_f2f16_rtz:
+  return nir_rounding_mode_rtz;
+   case nir_op_f2f16_rtne:
+  return nir_rounding_mode_rtne;
+   default:
+  return nir_rounding_mode_undef;
+   }
+}
+
+static v

[Mesa-dev] [PATCH v4 34/40] intel/compiler: validate region restrictions for half-float conversions

2019-02-12 Thread Iago Toral Quiroga
---
 src/intel/compiler/brw_eu_validate.c|  64 -
 src/intel/compiler/test_eu_validate.cpp | 122 
 2 files changed, 185 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_eu_validate.c 
b/src/intel/compiler/brw_eu_validate.c
index 000a05cb6ac..203641fecb9 100644
--- a/src/intel/compiler/brw_eu_validate.c
+++ b/src/intel/compiler/brw_eu_validate.c
@@ -531,7 +531,69 @@ general_restrictions_based_on_operand_types(const struct 
gen_device_info *devinf
exec_type_size == 8 && dst_type_size == 4)
   dst_type_size = 8;
 
-   if (exec_type_size > dst_type_size) {
+   /* From the BDW+ PRM:
+*
+*"There is no direct conversion from HF to DF or DF to HF.
+* There is no direct conversion from HF to Q/UQ or Q/UQ to HF."
+*/
+   enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
+   ERROR_IF(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV &&
+((dst_type == BRW_REGISTER_TYPE_HF && type_sz(src0_type) == 8) ||
+ (dst_type_size == 8 && src0_type == BRW_REGISTER_TYPE_HF)),
+"There are no direct conversion between 64-bit types and HF");
+
+   /* From the BDW+ PRM:
+*
+*   "Conversion between Integer and HF (Half Float) must be
+*DWord-aligned and strided by a DWord on the destination."
+*
+* But this seems to be expanded on CHV and SKL+ by:
+*
+*   "There is a relaxed alignment rule for word destinations. When
+*the destination type is word (UW, W, HF), destination data types
+*can be aligned to either the lowest word or the second lowest
+*word of the execution channel. This means the destination data
+*words can be either all in the even word locations or all in the
+*odd word locations."
+*
+* We do not implement the second rule as is though, since empirical testing
+* shows inconsistencies:
+*   - It suggests that packed 16-bit is not allowed, which is not true.
+*   - It suggests that conversions from Q/DF to W (which need to be 64-bit
+* aligned on the destination) are not possible, which is not true.
+*   - It suggests that conversions from 16-bit execution types to W need
+* to be 32-bit aligned, which doesn't seem to be necessary.
+*
+* So from this rule we only validate the implication that conversion from
+* F to HF needs to be DWord aligned too (in BDW this is limited to
+* conversions from integer types).
+*/
+   bool is_half_float_conversion =
+   brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV &&
+   dst_type != src0_type &&
+   (dst_type == BRW_REGISTER_TYPE_HF || src0_type == BRW_REGISTER_TYPE_HF);
+
+   if (is_half_float_conversion) {
+  assert(devinfo->gen >= 8);
+
+  if ((dst_type == BRW_REGISTER_TYPE_HF && 
brw_reg_type_is_integer(src0_type)) ||
+  (brw_reg_type_is_integer(dst_type) && src0_type == 
BRW_REGISTER_TYPE_HF)) {
+ ERROR_IF(dst_stride * dst_type_size != 4,
+  "Conversions between integer and half-float must be strided "
+  "by a DWord on the destination");
+
+ unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
+ ERROR_IF(subreg % 4 != 0,
+  "Conversions between integer and half-float must be aligned "
+  "to a DWord on the destination");
+  } else if ((devinfo->is_cherryview || devinfo->gen >= 9) &&
+ dst_type == BRW_REGISTER_TYPE_HF) {
+ ERROR_IF(dst_stride != 2,
+  "Conversions to HF must have either all words in even word "
+  "locations or all words in odd word locations");
+  }
+
+   } else if (exec_type_size > dst_type_size) {
   if (!(dst_type_is_byte && inst_is_raw_move(devinfo, inst))) {
  ERROR_IF(dst_stride * dst_type_size != exec_type_size,
   "Destination stride must be equal to the ratio of the sizes "
diff --git a/src/intel/compiler/test_eu_validate.cpp 
b/src/intel/compiler/test_eu_validate.cpp
index 73300b23122..1557b6d2452 100644
--- a/src/intel/compiler/test_eu_validate.cpp
+++ b/src/intel/compiler/test_eu_validate.cpp
@@ -848,6 +848,128 @@ TEST_P(validation_test, 
byte_destination_relaxed_alignment)
}
 }
 
+TEST_P(validation_test, half_float_conversion)
+{
+   static const struct {
+  enum brw_reg_type dst_type;
+  enum brw_reg_type src_type;
+  unsigned dst_stride;
+  unsigned dst_subnr;
+  bool expected_result;
+   } inst[] = {
+#define INST(dst_type, src_type, dst_stride, dst_subnr, expected_result)  \
+  {   \
+ BRW_REGISTER_TYPE_##dst_type,\
+ BRW_REGISTER_TYPE_##src_type,\
+ BRW_HORIZONTAL_STRIDE_##dst_stride,  \
+ dst_subnr,   

[Mesa-dev] [PATCH v4 03/40] intel/compiler: split float to 64-bit opcodes from int to 64-bit

2019-02-12 Thread Iago Toral Quiroga
Going forward having these split is a bit more convenient since these two
groups have different restrictions.

v2:
 - Rebased on top of new regioning lowering pass.

Reviewed-by: Topi Pohjolainen  (v1)
Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs_nir.cpp | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 1041296b903..bb7591422d4 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -801,10 +801,17 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
case nir_op_f2f64:
case nir_op_f2i64:
case nir_op_f2u64:
+  assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */
+  inst = bld.MOV(result, op[0]);
+  inst->saturate = instr->dest.saturate;
+  break;
+
case nir_op_i2f64:
case nir_op_i2i64:
case nir_op_u2f64:
case nir_op_u2u64:
+  assert(type_sz(op[0].type) > 1); /* brw_nir_lower_conversions */
+  /* fallthrough */
case nir_op_f2f32:
case nir_op_f2i32:
case nir_op_f2u32:
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 24/40] intel/compiler: implement isign for int8

2019-02-12 Thread Iago Toral Quiroga
Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs_nir.cpp | 25 +
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 3a6e4a2eb60..40c0481ac53 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -906,11 +906,28 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
*  Predicated OR sets 1 if val is positive.
*/
   uint32_t bit_size = nir_dest_bit_size(instr->dest.dest);
-  assert(bit_size == 32 || bit_size == 16);
 
-  fs_reg zero = bit_size == 32 ? brw_imm_d(0) : brw_imm_w(0);
-  fs_reg one = bit_size == 32 ? brw_imm_d(1) : brw_imm_w(1);
-  fs_reg shift = bit_size == 32 ? brw_imm_d(31) : brw_imm_w(15);
+  fs_reg zero, one, shift;
+  switch (bit_size) {
+  case 32:
+ zero = brw_imm_d(0);
+ one = brw_imm_d(1);
+ shift = brw_imm_d(31);
+ break;
+  case 16:
+ zero = brw_imm_w(0);
+ one = brw_imm_w(1);
+ shift = brw_imm_w(15);
+ break;
+  case 8: {
+ zero = setup_imm_b(bld, 0);
+ one = setup_imm_b(bld, 1);
+ shift = setup_imm_b(bld, 7);
+ break;
+  }
+  default:
+ unreachable("unsupported bit-size");
+  };
 
   bld.CMP(bld.null_reg_d(), op[0], zero, BRW_CONDITIONAL_G);
   bld.ASR(result, op[0], shift);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 15/40] intel/compiler: don't compact 3-src instructions with Src1Type or Src2Type bits

2019-02-12 Thread Iago Toral Quiroga
We are now using these bits, so don't assert that they are not set. In gen8,
if these bits are set compaction is not possible. On gen9 and CHV platforms
set_3src_control_index() checks these bits (and others) against a table to
validate if the particular bit combination is eligible for compaction or not.

v2:
 - Add more detail in the commit message explaining the situation for SKL+
   and CHV (Jason)

Reviewed-by: Topi Pohjolainen 
Reviewed-by: Jason Ekstrand 
Reviewed-by: Matt Turner 
---
 src/intel/compiler/brw_eu_compact.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_eu_compact.c 
b/src/intel/compiler/brw_eu_compact.c
index ae14ef10ec0..20fed254331 100644
--- a/src/intel/compiler/brw_eu_compact.c
+++ b/src/intel/compiler/brw_eu_compact.c
@@ -928,8 +928,11 @@ has_3src_unmapped_bits(const struct gen_device_info 
*devinfo,
   assert(!brw_inst_bits(src, 127, 126) &&
  !brw_inst_bits(src, 105, 105) &&
  !brw_inst_bits(src, 84, 84) &&
- !brw_inst_bits(src, 36, 35) &&
  !brw_inst_bits(src, 7,  7));
+
+  /* Src1Type and Src2Type, used for mixed-precision floating point */
+  if (brw_inst_bits(src, 36, 35))
+ return true;
}
 
return false;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 17/40] intel/compiler: set correct precision fields for 3-source float instructions

2019-02-12 Thread Iago Toral Quiroga
Source0 and Destination extract the floating-point precision automatically
from the SrcType and DstType instruction fields respectively when they are
set to types :F or :HF. For Source1 and Source2 operands, we use the new
1-bit fields Src1Type and Src2Type, where 0 means normal precision and 1
means half-precision. Since we always use the type of the destination for
all operands when we emit 3-source instructions, we only need to set Src1Type
and Src2Type to 1 when we are emitting a half-precision instruction.

v2:
 - Set the bit separately for each source based on its type so we can
   do mixed floating-point mode in the future (Topi).

v3:
 - Use regular citation style for the comment referencing the PRM (Matt).
 - Decided not to add asserts in the emission code to check that only
   mixed HF/F types are used since such checks would break negative tests
   for brw_eu_validate.c (Matt)

Reviewed-by: Topi Pohjolainen 
Reviewed-by: Jason Ekstrand 
Reviewed-by: Matt Turner 
---
 src/intel/compiler/brw_eu_emit.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 30037e71b00..195c26ab760 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -843,6 +843,22 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct 
brw_reg dest,
   */
  brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
  brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
+
+ /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
+  *
+  *"Three source instructions can use operands with mixed-mode
+  * precision. When SrcType field is set to :f or :hf it defines
+  * precision for source 0 only, and fields Src1Type and Src2Type
+  * define precision for other source operands:
+  *
+  * 0b = :f. Single precision Float (32-bit).
+  * 1b = :hf. Half precision Float (16-bit)."
+  */
+ if (src1.type == BRW_REGISTER_TYPE_HF)
+brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
+
+ if (src2.type == BRW_REGISTER_TYPE_HF)
+brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
   }
}
 
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 08/40] intel/compiler: implement 16-bit fsign

2019-02-12 Thread Iago Toral Quiroga
v2:
 - make 16-bit be its own separate case (Jason)

v3:
 - Drop the result_int temporary (Jason)

Reviewed-by: Topi Pohjolainen  (v1)
Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs_nir.cpp | 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 4c7a839390c..64e24f86b5a 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -844,7 +844,21 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
 : bld.MOV(result, brw_imm_f(1.0f));
 
  set_predicate(BRW_PREDICATE_NORMAL, inst);
-  } else if (type_sz(op[0].type) < 8) {
+  } else if (type_sz(op[0].type) == 2) {
+ /* AND(val, 0x8000) gives the sign bit.
+  *
+  * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not 
zero.
+  */
+ fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
+ bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);
+
+ op[0].type = BRW_REGISTER_TYPE_UW;
+ result.type = BRW_REGISTER_TYPE_UW;
+ bld.AND(result, op[0], brw_imm_uw(0x8000u));
+
+ inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+  } else if (type_sz(op[0].type) == 4) {
  /* AND(val, 0x80000000) gives the sign bit.
   *
   * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
@@ -866,6 +880,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
   * - The sign is encoded in the high 32-bit of each DF
   * - We need to produce a DF result.
   */
+ assert(type_sz(op[0].type) == 8);
 
  fs_reg zero = vgrf(glsl_type::double_type);
  bld.MOV(zero, setup_imm_df(bld, 0.0));
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 13/40] intel/compiler: add instruction setters for Src1Type and Src2Type.

2019-02-12 Thread Iago Toral Quiroga
The original SrcType is a 3-bit field that takes a subset of the types
supported for the hardware for 3-source instructions. Since gen8,
when the half-float type was added, 3-source floating point operations
can use mixed precision mode, where not all the operands have the
same floating-point precision. While the precision for the first operand
is taken from the type in SrcType, the bits in Src1Type (bit 36) and
Src2Type (bit 35) define the precision for the other operands
(0: normal precision, 1: half precision).

Reviewed-by: Topi Pohjolainen 
Reviewed-by: Matt Turner 
---
 src/intel/compiler/brw_inst.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/intel/compiler/brw_inst.h b/src/intel/compiler/brw_inst.h
index 71316f12215..1f55d45125d 100644
--- a/src/intel/compiler/brw_inst.h
+++ b/src/intel/compiler/brw_inst.h
@@ -222,6 +222,8 @@ F8(3src_src1_negate,39, 39, 40, 40)
 F8(3src_src1_abs,   38, 38, 39, 39)
 F8(3src_src0_negate,37, 37, 38, 38)
 F8(3src_src0_abs,   36, 36, 37, 37)
+F8(3src_a16_src1_type,  -1, -1, 36, 36)
+F8(3src_a16_src2_type,  -1, -1, 35, 35)
 F8(3src_a16_flag_reg_nr,34, 34, 33, 33)
 F8(3src_a16_flag_subreg_nr, 33, 33, 32, 32)
 FF(3src_a16_dst_reg_file,
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 01/40] compiler/nir: add an is_conversion field to nir_op_info

2019-02-12 Thread Iago Toral Quiroga
This is set to True only for numeric conversion opcodes.
---
 src/compiler/nir/nir.h|  3 ++
 src/compiler/nir/nir_opcodes.py   | 73 +--
 src/compiler/nir/nir_opcodes_c.py |  1 +
 3 files changed, 44 insertions(+), 33 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index ff2c41faf27..2793662b1d9 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -926,6 +926,9 @@ typedef struct {
nir_alu_type input_types[NIR_MAX_VEC_COMPONENTS];
 
nir_op_algebraic_property algebraic_properties;
+
+   /* Whether this represents a numeric conversion opcode */
+   bool is_conversion;
 } nir_op_info;
 
 extern const nir_op_info nir_op_infos[nir_num_opcodes];
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index d32005846a6..dc4cd9ac63d 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -33,12 +33,13 @@ class Opcode(object):
NOTE: this must be kept in sync with nir_op_info
"""
def __init__(self, name, output_size, output_type, input_sizes,
-input_types, algebraic_properties, const_expr):
+input_types, is_conversion, algebraic_properties, const_expr):
   """Parameters:
 
   - name is the name of the opcode (prepend nir_op_ for the enum name)
   - all types are strings that get nir_type_ prepended to them
   - input_types is a list of types
+  - is_conversion is true if this opcode represents a type conversion
   - algebraic_properties is a space-seperated string, where nir_op_is_ is
 prepended before each entry
   - const_expr is an expression or series of statements that computes the
@@ -70,6 +71,7 @@ class Opcode(object):
   assert isinstance(input_sizes[0], int)
   assert isinstance(input_types, list)
   assert isinstance(input_types[0], str)
+  assert isinstance(is_conversion, bool)
   assert isinstance(algebraic_properties, str)
   assert isinstance(const_expr, str)
   assert len(input_sizes) == len(input_types)
@@ -84,6 +86,7 @@ class Opcode(object):
   self.output_type = output_type
   self.input_sizes = input_sizes
   self.input_types = input_types
+  self.is_conversion = is_conversion
   self.algebraic_properties = algebraic_properties
   self.const_expr = const_expr
 
@@ -138,21 +141,22 @@ associative = "associative "
 opcodes = {}
 
 def opcode(name, output_size, output_type, input_sizes, input_types,
-   algebraic_properties, const_expr):
+   is_conversion, algebraic_properties, const_expr):
assert name not in opcodes
opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
-  input_types, algebraic_properties, const_expr)
+  input_types, is_conversion, algebraic_properties,
+  const_expr)
 
 def unop_convert(name, out_type, in_type, const_expr):
-   opcode(name, 0, out_type, [0], [in_type], "", const_expr)
+   opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
 
 def unop(name, ty, const_expr):
-   opcode(name, 0, ty, [0], [ty], "", const_expr)
+   opcode(name, 0, ty, [0], [ty], False, "", const_expr)
 
 def unop_horiz(name, output_size, output_type, input_size, input_type,
const_expr):
-   opcode(name, output_size, output_type, [input_size], [input_type], "",
-  const_expr)
+   opcode(name, output_size, output_type, [input_size], [input_type],
+  False, "", const_expr)
 
 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 reduce_expr, final_expr):
@@ -173,6 +177,8 @@ def unop_reduce(name, output_size, output_type, input_type, 
prereduce_expr,
unop_horiz(name + "4", output_size, output_type, 4, input_type,
   final(reduce_(reduce_(src0, src1), reduce_(src2, src3
 
+def unop_numeric_convert(name, out_type, in_type, const_expr):
+   opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
 
 # These two move instructions differ in what modifiers they support and what
 # the negate modifier means. Otherwise, they are identical.
@@ -215,13 +221,13 @@ for src_t in [tint, tuint, tfloat, tbool]:
   if bit_size == 16 and dst_t == tfloat and src_t == tfloat:
   rnd_modes = ['_rtne', '_rtz', '']
   for rnd_mode in rnd_modes:
-  unop_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0],
-   bit_size, rnd_mode),
-   dst_t + str(bit_size), src_t, "src0")
+  unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0], 
dst_t[0],
+  bit_size, 
rnd_mode),
+   dst_t + str(bit_size), src_t, "src0")
   else:
   conv_expr = "src0 != 0" if dst_t == tbool else "src0"
-  unop_conve

[Mesa-dev] [PATCH v4 37/40] intel/compiler: validate region restrictions for mixed float mode

2019-02-12 Thread Iago Toral Quiroga
---
 src/intel/compiler/brw_eu_validate.c| 256 ++
 src/intel/compiler/test_eu_validate.cpp | 618 
 2 files changed, 874 insertions(+)

diff --git a/src/intel/compiler/brw_eu_validate.c 
b/src/intel/compiler/brw_eu_validate.c
index ed9c8fe59dd..a61d4c46e81 100644
--- a/src/intel/compiler/brw_eu_validate.c
+++ b/src/intel/compiler/brw_eu_validate.c
@@ -170,6 +170,13 @@ src1_is_null(const struct gen_device_info *devinfo, const 
brw_inst *inst)
   brw_inst_src1_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
 }
 
+static bool
+src0_is_acc(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   return brw_inst_src0_reg_file(devinfo, inst) == 
BRW_ARCHITECTURE_REGISTER_FILE &&
+  brw_inst_src0_da_reg_nr(devinfo, inst) == BRW_ARF_ACCUMULATOR;
+}
+
 static bool
 src0_is_grf(const struct gen_device_info *devinfo, const brw_inst *inst)
 {
@@ -847,6 +854,254 @@ general_restrictions_on_region_parameters(const struct 
gen_device_info *devinfo,
return error_msg;
 }
 
+static struct string
+special_restrictions_for_mixed_float_mode(const struct gen_device_info 
*devinfo,
+  const brw_inst *inst)
+{
+   struct string error_msg = { .str = NULL, .len = 0 };
+
+   unsigned opcode = brw_inst_opcode(devinfo, inst);
+   unsigned num_sources = num_sources_from_inst(devinfo, inst);
+   if (num_sources >= 3)
+  return error_msg;
+
+   if (!is_mixed_float(devinfo, inst))
+  return error_msg;
+
+   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
+   bool is_align16 = brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16;
+
+   enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
+   enum brw_reg_type src1_type = brw_inst_src1_type(devinfo, inst);
+   enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst);
+
+   unsigned dst_stride = STRIDE(brw_inst_dst_hstride(devinfo, inst));
+   bool dst_is_packed = is_packed(exec_size * dst_stride, exec_size, 
dst_stride);
+
+   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+* Float Operations:
+*
+*"Indirect addressing on source is not supported when source and
+* destination data types are mixed float."
+*
+* Indirect addressing is only supported on the first source, so we only
+* check that.
+*/
+   ERROR_IF(types_are_mixed_float(dst_type, src0_type) &&
+brw_inst_src0_address_mode(devinfo, inst) != BRW_ADDRESS_DIRECT,
+"Indirect addressing on source is not supported when source and "
+"destination data types are mixed float");
+
+   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+* Float Operations:
+*
+*"No SIMD16 in mixed mode when destination is f32. Instruction
+* execution size must be no more than 8."
+*/
+   ERROR_IF(exec_size > 8 && dst_type == BRW_REGISTER_TYPE_F,
+"Mixed float mode with 32-bit float destination is limited "
+"to SIMD8");
+
+   if (is_align16) {
+  /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+   * Float Operations:
+   *
+   *   "In Align16 mode, when half float and float data types are mixed
+   *between source operands OR between source and destination operands,
+   *the register content are assumed to be packed."
+   *
+   * Since Align16 doesn't have a concept of horizontal stride (or width),
+   * it means that vertical stride must always be 4, since 0 and 2 would
+   * lead to replicated data, and any other value is disallowed in Align16.
+   * However, the PRM also says:
+   *
+   *   "In Align16, vertical stride can never be zero for f16"
+   *
+   * Which is oddly redundant and specific considering the more general
+   * assumption that all operands are assumed to be packed, so we
+   * understand that this might be hinting that there may be an exception
+   * for f32 operands with a vstride of 0, so we don't validate this for
+   * them while we don't have empirical evidence that it is forbidden.
+   */
+  ERROR_IF(brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4 &&
+   (src0_type != BRW_REGISTER_TYPE_F ||
+brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0),
+   "Align16 mixed float mode assumes packed data (vstride must "
+   "be 4 -or 0 for f32 operands-)");
+
+  ERROR_IF(num_sources >= 2 &&
+   brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4 &&
+   (src1_type != BRW_REGISTER_TYPE_F ||
+brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0),
+   "Align16 mixed float mode assumes packed data (vstride must "
+   "be 4 -or 0 for f32 operands-)");
+
+  /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+   * Float Operations:
+   *
+   *   "For Align16 

[Mesa-dev] [PATCH v4 21/40] intel/compiler: split is_partial_write() into two variants

2019-02-12 Thread Iago Toral Quiroga
This function is used in two different scenarios that for 32-bit
instructions are the same, but for 16-bit instructions are not.

One scenario is that in which we are working at a SIMD8 register
level and we need to know if a register is fully defined or written.
This is useful, for example, in the context of liveness analysis or
register allocation, where we work with units of registers.

The other scenario is that in which we want to know if an instruction
is writing a full scalar component or just some subset of it. This is
useful, for example, in the context of some optimization passes
like copy propagation.

For 32-bit instructions (or larger), a SIMD8 dispatch will always write
at least a full SIMD8 register (32B) if the write is not partial. The
function is_partial_write() checks this to determine if we have a partial
write. However, when we deal with 16-bit instructions, that logic disables
some optimizations that should be safe. For example, a SIMD8 16-bit MOV will
only update half of a SIMD register, but it is still a complete write of the
variable for a SIMD8 dispatch, so we should not prevent copy propagation in
this scenario because we don't write all 32 bytes in the SIMD register
or because the write starts at offset 16B (where we pack components Y or
W of 16-bit vectors).

This is a problem for SIMD8 executions (VS, TCS, TES, GS) of 16-bit
instructions, which lose a number of optimizations because of this, the most
important of which is copy propagation.

This patch splits is_partial_write() into is_partial_reg_write(), which
represents the current is_partial_write(), useful for things like
liveness analysis, and is_partial_var_write(), which considers
the dispatch size to check if we are writing a full variable (rather
than a full register) to decide if the write is partial or not, which
is what we really want in many optimization passes.

Then the patch goes on and rewrites all uses of is_partial_write() to use
one or the other version. Specifically, we use is_partial_var_write()
in the following places: copy propagation, cmod propagation, common
subexpression elimination, saturate propagation and sel peephole.

Notice that the semantics of is_partial_var_write() exactly match the
current implementation of is_partial_write() for anything that is
32-bit or larger, so no changes are expected for 32-bit instructions.

Tested against ~5000 tests involving 16-bit instructions in CTS produced
the following changes in instruction counts:

Patched  | Master|%|

SIMD8  |621,900  |706,721| -12.00% |

SIMD16 | 93,252  | 93,252|   0.00% |


As expected, the change only affects SIMD8 dispatches.

Reviewed-by: Topi Pohjolainen 
---
 src/intel/compiler/brw_fs.cpp | 31 +++
 .../compiler/brw_fs_cmod_propagation.cpp  | 20 ++--
 .../compiler/brw_fs_copy_propagation.cpp  |  8 ++---
 src/intel/compiler/brw_fs_cse.cpp |  3 +-
 .../compiler/brw_fs_dead_code_eliminate.cpp   |  2 +-
 src/intel/compiler/brw_fs_live_variables.cpp  |  2 +-
 src/intel/compiler/brw_fs_reg_allocate.cpp|  2 +-
 .../compiler/brw_fs_register_coalesce.cpp |  2 +-
 .../compiler/brw_fs_saturate_propagation.cpp  |  7 +++--
 src/intel/compiler/brw_fs_sel_peephole.cpp|  4 +--
 src/intel/compiler/brw_ir_fs.h|  3 +-
 11 files changed, 54 insertions(+), 30 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index d172c2de4d7..0f04e577de3 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -696,14 +696,33 @@ fs_visitor::limit_dispatch_width(unsigned n, const char 
*msg)
  * it.
  */
 bool
-fs_inst::is_partial_write() const
+fs_inst::is_partial_reg_write() const
 {
return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
-   (this->exec_size * type_sz(this->dst.type)) < 32 ||
!this->dst.is_contiguous() ||
+   (this->exec_size * type_sz(this->dst.type)) < REG_SIZE ||
this->dst.offset % REG_SIZE != 0);
 }
 
+/**
+ * Returns true if the instruction has a flag that means it won't
+ * update an entire variable for the given dispatch width.
+ *
+ * This is only different from is_partial_reg_write() for SIMD8
+ * dispatches of 16-bit (or smaller) instructions.
+ */
+bool
+fs_inst::is_partial_var_write(uint32_t dispatch_width) const
+{
+   const uint32_t type_size = type_sz(this->dst.type);
+   uint32_t var_size = MIN2(REG_SIZE, dispatch_width * type_size);
+
+   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
+   !this->dst.is_contiguous() ||
+   (this->exec_size * type_sz(this->dst.type)) < var_size ||
+   this->dst.offset % var_size != 0);
+}
+
 unsigned
 fs_inst::components_read(unsigned i) const
 {
@@ -2923,7 +2942,7 @@ fs_v

[Mesa-dev] [PATCH v4 04/40] intel/compiler: handle b2i/b2f with other integer conversion opcodes

2019-02-12 Thread Iago Toral Quiroga
Since we handle booleans as integers this makes more sense.

v2:
 - rebased to incorporate new boolean conversion opcodes

v3:
 - rebased on top of the regioning lowering pass

Reviewed-by: Jason Ekstrand  (v1)
Reviewed-by: Topi Pohjolainen  (v2)
---
 src/intel/compiler/brw_fs_nir.cpp | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index bb7591422d4..b705b3f5bc2 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -788,6 +788,14 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
   inst->saturate = instr->dest.saturate;
   break;
 
+   case nir_op_f2f64:
+   case nir_op_f2i64:
+   case nir_op_f2u64:
+  assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */
+  inst = bld.MOV(result, op[0]);
+  inst->saturate = instr->dest.saturate;
+  break;
+
case nir_op_b2i8:
case nir_op_b2i16:
case nir_op_b2i32:
@@ -798,14 +806,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
   op[0].type = BRW_REGISTER_TYPE_D;
   op[0].negate = !op[0].negate;
   /* fallthrough */
-   case nir_op_f2f64:
-   case nir_op_f2i64:
-   case nir_op_f2u64:
-  assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */
-  inst = bld.MOV(result, op[0]);
-  inst->saturate = instr->dest.saturate;
-  break;
-
case nir_op_i2f64:
case nir_op_i2i64:
case nir_op_u2f64:
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 33/40] intel/compiler: also set F execution type for mixed float mode in BDW

2019-02-12 Thread Iago Toral Quiroga
The section 'Execution Data Types' of 3D Media GPGPU volume, which
describes execution types, is exactly the same in BDW and SKL+.

Also, this section states that there is a single execution type, so it
makes sense that this is the wider of the two floating point types
involved in mixed float mode, which is what we do for SKL+ and CHV.
---
 src/intel/compiler/brw_eu_validate.c | 18 +++---
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/src/intel/compiler/brw_eu_validate.c 
b/src/intel/compiler/brw_eu_validate.c
index 358a0347a93..000a05cb6ac 100644
--- a/src/intel/compiler/brw_eu_validate.c
+++ b/src/intel/compiler/brw_eu_validate.c
@@ -431,18 +431,14 @@ execution_type(const struct gen_device_info *devinfo, 
const brw_inst *inst)
src1_exec_type == BRW_REGISTER_TYPE_DF)
   return BRW_REGISTER_TYPE_DF;
 
-   if (devinfo->gen >= 9 || devinfo->is_cherryview) {
-  if (dst_exec_type == BRW_REGISTER_TYPE_F ||
-  src0_exec_type == BRW_REGISTER_TYPE_F ||
-  src1_exec_type == BRW_REGISTER_TYPE_F) {
- return BRW_REGISTER_TYPE_F;
-  } else {
- return BRW_REGISTER_TYPE_HF;
-  }
+   if (dst_exec_type == BRW_REGISTER_TYPE_F ||
+   src0_exec_type == BRW_REGISTER_TYPE_F ||
+   src1_exec_type == BRW_REGISTER_TYPE_F) {
+  return BRW_REGISTER_TYPE_F;
+   } else {
+  assert(devinfo->gen >= 8 && src0_exec_type == BRW_REGISTER_TYPE_HF);
+  return BRW_REGISTER_TYPE_HF;
}
-
-   assert(src0_exec_type == BRW_REGISTER_TYPE_F);
-   return BRW_REGISTER_TYPE_F;
 }
 
 /**
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 36/40] intel/compiler: skip validating restrictions on operand types for mixed float

2019-02-12 Thread Iago Toral Quiroga
Mixed float instructions are those that use both F and HF operands as their
sources or destination, except for regular conversions.

There are specific rules for mixed float operation mode with its own set
of restrictions, which involve rules that are incompatible with general
restrictions. For example:

   "In Align1, destination stride can be smaller than execution type"

Instead, we will implement validation for mixed float mode instructions
separately in a follow-up patch.
---
 src/intel/compiler/brw_eu_validate.c | 50 
 1 file changed, 50 insertions(+)

diff --git a/src/intel/compiler/brw_eu_validate.c 
b/src/intel/compiler/brw_eu_validate.c
index b1fdd1ce941..ed9c8fe59dd 100644
--- a/src/intel/compiler/brw_eu_validate.c
+++ b/src/intel/compiler/brw_eu_validate.c
@@ -461,6 +461,53 @@ is_packed(unsigned vstride, unsigned width, unsigned 
hstride)
return false;
 }
 
+/**
+ * Returns whether a combination of two types would qualify as mixed float
+ * operation mode
+ */
+static inline bool
+types_are_mixed_float(enum brw_reg_type t0, enum brw_reg_type t1)
+{
+   return (t0 == BRW_REGISTER_TYPE_F && t1 == BRW_REGISTER_TYPE_HF) ||
+  (t1 == BRW_REGISTER_TYPE_F && t0 == BRW_REGISTER_TYPE_HF);
+}
+
+/**
+ * Returns whether an instruction is using mixed float operation mode
+ */
+static bool
+is_mixed_float(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   if (devinfo->gen < 8)
+  return false;
+
+   if (inst_is_send(devinfo, inst))
+  return false;
+
+   unsigned opcode = brw_inst_opcode(devinfo, inst);
+   const struct opcode_desc *desc = brw_opcode_desc(devinfo, opcode);
+   if (desc->ndst == 0)
+  return false;
+
+   /* FIXME: support 3-src instructions */
+   unsigned num_sources = num_sources_from_inst(devinfo, inst);
+   assert(num_sources < 3);
+
+   enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst);
+   enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
+
+   if (num_sources == 1) {
+  return opcode == BRW_OPCODE_MATH &&
+ types_are_mixed_float(src0_type, dst_type);
+   }
+
+   enum brw_reg_type src1_type = brw_inst_src1_type(devinfo, inst);
+
+   return types_are_mixed_float(src0_type, src1_type) ||
+  types_are_mixed_float(src0_type, dst_type) ||
+  types_are_mixed_float(src1_type, dst_type);
+}
+
 /**
  * Checks restrictions listed in "General Restrictions Based on Operand Types"
  * in the "Register Region Restrictions" section.
@@ -487,6 +534,9 @@ general_restrictions_based_on_operand_types(const struct 
gen_device_info *devinf
if (desc->ndst == 0)
   return (struct string){};
 
+   if (is_mixed_float(devinfo, inst))
+  return (struct string){};
+
/* The PRMs say:
 *
 *Where n is the largest element size in bytes for any source or
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 12/40] compiler/nir: add lowering for 16-bit ldexp

2019-02-12 Thread Iago Toral Quiroga
v2 (Topi):
 - Make bit-size handling order be 16-bit, 32-bit, 64-bit
 - Clamp lower exponent range at -28 instead of -30.

Reviewed-by: Topi Pohjolainen 
Reviewed-by: Jason Ekstrand 
---
 src/compiler/nir/nir_opt_algebraic.py | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_opt_algebraic.py 
b/src/compiler/nir/nir_opt_algebraic.py
index 40eb3de02c3..71c626e1b3f 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -790,7 +790,9 @@ for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 
'i']):
 
 def fexp2i(exp, bits):
# We assume that exp is already in the right range.
-   if bits == 32:
+   if bits == 16:
+  return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
+   elif bits == 32:
   return ('ishl', ('iadd', exp, 127), 23)
elif bits == 64:
   return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
@@ -808,7 +810,9 @@ def ldexp(f, exp, bits):
# handles a range on exp of [-252, 254] which allows you to create any
# value (including denorms if the hardware supports it) and to adjust the
# exponent of any normal value to anything you want.
-   if bits == 32:
+   if bits == 16:
+  exp = ('imin', ('imax', exp, -28), 30)
+   elif bits == 32:
   exp = ('imin', ('imax', exp, -252), 254)
elif bits == 64:
   exp = ('imin', ('imax', exp, -2044), 2046)
@@ -828,6 +832,7 @@ def ldexp(f, exp, bits):
return ('fmul', ('fmul', f, pow2_1), pow2_2)
 
 optimizations += [
+   (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'),
(('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'),
(('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'),
 ]
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 00/40] intel: VK_KHR_shader_float16_int8 implementation

2019-02-12 Thread Iago Toral Quiroga
The changes in this version address review feedback to v3. The most significant
changes include:

1. A more generic constant combining pass that can handle more
constant types (not just F and HF) requested by Jason.

2. The addition of assembly validation for half-float restrictions, and also
for mixed float mode, requested by Curro. It should be noted that this
implementation of VK_KHR_shader_float16_int8 does not emit any mixed mode float
instructions at this moment so I have not empirically validated the restrictions
implemented here.

As always, a branch with these patches is available for testing in the
itoral/VK_KHR_shader_float16_int8 branch of the Igalia Mesa repository at
https://github.com/Igalia/mesa.

Iago Toral Quiroga (40):
  compiler/nir: add an is_conversion field to nir_op_info
  intel/compiler: add a NIR pass to lower conversions
  intel/compiler: split float to 64-bit opcodes from int to 64-bit
  intel/compiler: handle b2i/b2f with other integer conversion opcodes
  intel/compiler: assert restrictions on conversions to half-float
  intel/compiler: lower some 16-bit float operations to 32-bit
  intel/compiler: handle extended math restrictions for half-float
  intel/compiler: implement 16-bit fsign
  intel/compiler: drop unnecessary temporary from 32-bit fsign
implementation
  compiler/nir: add lowering option for 16-bit fmod
  compiler/nir: add lowering for 16-bit flrp
  compiler/nir: add lowering for 16-bit ldexp
  intel/compiler: add instruction setters for Src1Type and Src2Type.
  intel/compiler: add new half-float register type for 3-src
instructions
  intel/compiler: don't compact 3-src instructions with Src1Type or
Src2Type bits
  intel/compiler: allow half-float on 3-source instructions since gen8
  intel/compiler: set correct precision fields for 3-source float
instructions
  intel/compiler: fix ddx and ddy for 16-bit float
  intel/compiler: fix ddy for half-float in Broadwell
  intel/compiler: workaround for SIMD8 half-float MAD in gen8
  intel/compiler: split is_partial_write() into two variants
  intel/compiler: activate 16-bit bit-size lowerings also for 8-bit
  intel/compiler: rework conversion opcodes
  intel/compiler: implement isign for int8
  intel/compiler: ask for an integer type if requesting an 8-bit type
  intel/eu: force stride of 2 on NULL register for Byte instructions
  intel/compiler: generalize the combine constants pass
  intel/compiler: implement is_zero, is_one, is_negative_one for
8-bit/16-bit
  intel/compiler: add a brw_reg_type_is_integer helper
  intel/compiler: fix cmod propagation for non 32-bit types
  intel/compiler: remove inexact algebraic optimizations from the
backend
  intel/compiler: skip MAD algebraic optimization for half-float or
mixed mode
  intel/compiler: also set F execution type for mixed float mode in BDW
  intel/compiler: validate region restrictions for half-float
conversions
  intel/compiler: validate conversions between 64-bit and 8-bit types
  intel/compiler: skip validating restrictions on operand types for
mixed float
  intel/compiler: validate region restrictions for mixed float mode
  compiler/spirv: move the check for Int8 capability
  anv/pipeline: support Float16 and Int8 SPIR-V capabilities in gen8+
  anv/device: expose VK_KHR_shader_float16_int8 in gen8+

 src/compiler/nir/nir.h|   5 +
 src/compiler/nir/nir_opcodes.py   |  73 +-
 src/compiler/nir/nir_opcodes_c.py |   1 +
 src/compiler/nir/nir_opt_algebraic.py |  11 +-
 src/compiler/shader_info.h|   1 +
 src/compiler/spirv/spirv_to_nir.c |  11 +-
 src/intel/Makefile.sources|   1 +
 src/intel/compiler/brw_compiler.c |   2 +
 src/intel/compiler/brw_eu_compact.c   |   5 +-
 src/intel/compiler/brw_eu_emit.c  |  36 +-
 src/intel/compiler/brw_eu_validate.c  | 396 -
 src/intel/compiler/brw_fs.cpp | 101 ++-
 .../compiler/brw_fs_cmod_propagation.cpp  |  34 +-
 .../compiler/brw_fs_combine_constants.cpp | 202 -
 .../compiler/brw_fs_copy_propagation.cpp  |   8 +-
 src/intel/compiler/brw_fs_cse.cpp |   3 +-
 .../compiler/brw_fs_dead_code_eliminate.cpp   |   2 +-
 src/intel/compiler/brw_fs_generator.cpp   |  54 +-
 src/intel/compiler/brw_fs_live_variables.cpp  |   2 +-
 src/intel/compiler/brw_fs_lower_regioning.cpp |  39 +-
 src/intel/compiler/brw_fs_nir.cpp |  87 +-
 src/intel/compiler/brw_fs_reg_allocate.cpp|   2 +-
 .../compiler/brw_fs_register_coalesce.cpp |   2 +-
 .../compiler/brw_fs_saturate_propagation.cpp  |   7 +-
 src/intel/compiler/brw_fs_sel_peephole.cpp|   4 +-
 src/intel/compiler/brw_inst.h |   2 +
 src/intel/compiler/brw_ir_fs.h|   3 +-
 src/intel/compiler/brw_nir.c  |  22 +-
 src/intel/compiler/brw_nir.h  |   2 +
 .../com

[Mesa-dev] [PATCH v4 14/40] intel/compiler: add new half-float register type for 3-src instructions

2019-02-12 Thread Iago Toral Quiroga
This is available since gen8.

v2: restore previously existing assertion.

v3: don't use separate tables for gen7 and gen8, just assert that we
don't use half-float before gen8 (Matt)

Reviewed-by: Topi Pohjolainen  (v1)
---
 src/intel/compiler/brw_reg_type.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/src/intel/compiler/brw_reg_type.c 
b/src/intel/compiler/brw_reg_type.c
index 60240ba1513..feabee2f53b 100644
--- a/src/intel/compiler/brw_reg_type.c
+++ b/src/intel/compiler/brw_reg_type.c
@@ -138,6 +138,7 @@ enum hw_3src_reg_type {
GEN7_3SRC_TYPE_D  = 1,
GEN7_3SRC_TYPE_UD = 2,
GEN7_3SRC_TYPE_DF = 3,
+   GEN8_3SRC_TYPE_HF = 4,
 
/** When ExecutionDatatype is 1: @{ */
GEN10_ALIGN1_3SRC_REG_TYPE_HF = 0b000,
@@ -166,6 +167,7 @@ static const struct hw_3src_type {
[BRW_REGISTER_TYPE_D]  = { GEN7_3SRC_TYPE_D  },
[BRW_REGISTER_TYPE_UD] = { GEN7_3SRC_TYPE_UD },
[BRW_REGISTER_TYPE_DF] = { GEN7_3SRC_TYPE_DF },
+   [BRW_REGISTER_TYPE_HF] = { GEN8_3SRC_TYPE_HF },
 }, gen10_hw_3src_align1_type[] = {
 #define E(x) BRW_ALIGN1_3SRC_EXEC_TYPE_##x
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
@@ -258,6 +260,7 @@ brw_reg_type_to_a16_hw_3src_type(const struct 
gen_device_info *devinfo,
  enum brw_reg_type type)
 {
assert(type < ARRAY_SIZE(gen7_hw_3src_type));
+   assert(devinfo->gen >= 8 || type != BRW_REGISTER_TYPE_HF);
assert(gen7_hw_3src_type[type].reg_type != (enum hw_3src_reg_type)INVALID);
return gen7_hw_3src_type[type].reg_type;
 }
@@ -283,6 +286,7 @@ enum brw_reg_type
 brw_a16_hw_3src_type_to_reg_type(const struct gen_device_info *devinfo,
  unsigned hw_type)
 {
+   assert(devinfo->gen >= 8 || hw_type != GEN8_3SRC_TYPE_HF);
for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) {
   if (gen7_hw_3src_type[i].reg_type == hw_type) {
  return i;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 10/40] compiler/nir: add lowering option for 16-bit fmod

2019-02-12 Thread Iago Toral Quiroga
And enable it on Intel.

v2:
 - Squash the change to enable this lowering on Intel (Jason)

Reviewed-by: Jason Ekstrand 
---
 src/compiler/nir/nir.h| 1 +
 src/compiler/nir/nir_opt_algebraic.py | 1 +
 src/intel/compiler/brw_compiler.c | 1 +
 3 files changed, 3 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 2793662b1d9..6547a468648 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2119,6 +2119,7 @@ typedef struct nir_shader_compiler_options {
bool lower_fpow;
bool lower_fsat;
bool lower_fsqrt;
+   bool lower_fmod16;
bool lower_fmod32;
bool lower_fmod64;
/** Lowers ibitfield_extract/ubitfield_extract to ibfe/ubfe. */
diff --git a/src/compiler/nir/nir_opt_algebraic.py 
b/src/compiler/nir/nir_opt_algebraic.py
index 75a3d2ad238..cd969de1f88 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -636,6 +636,7 @@ optimizations = [
(('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
 
# Misc. lowering
+   (('fmod@16', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b, 
'options->lower_fmod16'),
(('fmod@32', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b, 
'options->lower_fmod32'),
(('fmod@64', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b, 
'options->lower_fmod64'),
(('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b, 
'options->lower_fmod32'),
diff --git a/src/intel/compiler/brw_compiler.c 
b/src/intel/compiler/brw_compiler.c
index fe632c5badc..f885e79c3e6 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -33,6 +33,7 @@
.lower_sub = true, \
.lower_fdiv = true,\
.lower_scmp = true,\
+   .lower_fmod16 = true,  \
.lower_fmod32 = true,  \
.lower_fmod64 = false, \
.lower_bitfield_extract = true,\
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 23/40] intel/compiler: rework conversion opcodes

2019-02-12 Thread Iago Toral Quiroga
Now that we have the regioning lowering pass we can just put all of these
opcodes together in a single block and we can just assert on the few cases
of conversion instructions that are not supported in hardware and that should
be lowered in brw_nir_lower_conversions.

The only cases that we still handle separately are the conversions from float
to half-float since the rounding variants would need to fallthrough and we
are already doing this for boolean opcodes (since they need to negate), plus
there is also a large comment about these opcodes that we probably want to
keep so it is just easier to keep these separate.

Suggested-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs_nir.cpp | 41 +--
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index f59e9ad4e2b..3a6e4a2eb60 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -772,7 +772,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
   bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
brw_imm_d(brw_rnd_mode_from_nir_op(instr->op)));
   /* fallthrough */
-
+   case nir_op_f2f16:
   /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending
* on the HW gen, it is a special hw opcode or just a MOV, and
* brw_F32TO16 (at brw_eu_emit) would do the work to chose.
@@ -782,23 +782,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
* only for gen8+, it will be better to use directly the MOV, and use
* BRW_OPCODE_F32TO16 when/if we work for HF support on gen7.
*/
-
-   case nir_op_f2f16:
-   case nir_op_i2f16:
-   case nir_op_u2f16:
   assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
   inst = bld.MOV(result, op[0]);
   inst->saturate = instr->dest.saturate;
   break;
 
-   case nir_op_f2f64:
-   case nir_op_f2i64:
-   case nir_op_f2u64:
-  assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */
-  inst = bld.MOV(result, op[0]);
-  inst->saturate = instr->dest.saturate;
-  break;
-
case nir_op_b2i8:
case nir_op_b2i16:
case nir_op_b2i32:
@@ -813,19 +801,34 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
case nir_op_i2i64:
case nir_op_u2f64:
case nir_op_u2u64:
-  assert(type_sz(op[0].type) > 1); /* brw_nir_lower_conversions */
-  /* fallthrough */
+   case nir_op_f2f64:
+   case nir_op_f2i64:
+   case nir_op_f2u64:
+   case nir_op_i2i32:
+   case nir_op_u2u32:
case nir_op_f2f32:
case nir_op_f2i32:
case nir_op_f2u32:
-   case nir_op_f2i16:
-   case nir_op_f2u16:
-   case nir_op_i2i32:
-   case nir_op_u2u32:
+   case nir_op_i2f16:
case nir_op_i2i16:
+   case nir_op_u2f16:
case nir_op_u2u16:
+   case nir_op_f2i16:
+   case nir_op_f2u16:
case nir_op_i2i8:
case nir_op_u2u8:
+   case nir_op_f2i8:
+   case nir_op_f2u8:
+  if (result.type == BRW_REGISTER_TYPE_B ||
+  result.type == BRW_REGISTER_TYPE_UB ||
+  result.type == BRW_REGISTER_TYPE_HF)
+ assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
+
+  if (op[0].type == BRW_REGISTER_TYPE_B ||
+  op[0].type == BRW_REGISTER_TYPE_UB ||
+  op[0].type == BRW_REGISTER_TYPE_HF)
+ assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */
+
   inst = bld.MOV(result, op[0]);
   inst->saturate = instr->dest.saturate;
   break;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 39/40] anv/pipeline: support Float16 and Int8 SPIR-V capabilities in gen8+

2019-02-12 Thread Iago Toral Quiroga
v2:
  - Merge Float16 and Int8 capabilities into a single patch (Jason)
  - Merged patch that enabled SPIR-V front-end checks for these caps
(except for Int8, which was already merged)

Reviewed-by: Jason Ekstrand  (v1)
---
 src/compiler/shader_info.h| 1 +
 src/compiler/spirv/spirv_to_nir.c | 4 +++-
 src/intel/vulkan/anv_pipeline.c   | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/compiler/shader_info.h b/src/compiler/shader_info.h
index 3d871938751..4726c185243 100644
--- a/src/compiler/shader_info.h
+++ b/src/compiler/shader_info.h
@@ -38,6 +38,7 @@ struct spirv_supported_capabilities {
bool descriptor_array_dynamic_indexing;
bool device_group;
bool draw_parameters;
+   bool float16;
bool float64;
bool geometry_streams;
bool gcn_shader;
diff --git a/src/compiler/spirv/spirv_to_nir.c 
b/src/compiler/spirv/spirv_to_nir.c
index 7e07de2bfc0..309ed6c59b0 100644
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -3556,7 +3556,6 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, 
SpvOp opcode,
   case SpvCapabilityLinkage:
   case SpvCapabilityVector16:
   case SpvCapabilityFloat16Buffer:
-  case SpvCapabilityFloat16:
   case SpvCapabilitySparseResidency:
  vtn_warn("Unsupported SPIR-V capability: %s",
   spirv_capability_to_string(cap));
@@ -3573,6 +3572,9 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, 
SpvOp opcode,
   case SpvCapabilityFloat64:
  spv_check_supported(float64, cap);
  break;
+  case SpvCapabilityFloat16:
+ spv_check_supported(float16, cap);
+ break;
   case SpvCapabilityInt64:
  spv_check_supported(int64, cap);
  break;
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index e2024212bd9..0e8c4245df6 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -139,8 +139,10 @@ anv_shader_compile_to_nir(struct anv_device *device,
  .device_group = true,
  .draw_parameters = true,
  .float64 = pdevice->info.gen >= 8,
+ .float16 = pdevice->info.gen >= 8,
  .geometry_streams = true,
  .image_write_without_format = true,
+ .int8 = pdevice->info.gen >= 8,
  .int16 = pdevice->info.gen >= 8,
  .int64 = pdevice->info.gen >= 8,
  .min_lod = true,
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 28/40] intel/compiler: implement is_zero, is_one, is_negative_one for 8-bit/16-bit

2019-02-12 Thread Iago Toral Quiroga
There are no 8-bit immediates, so assert in that case.
16-bit immediates are replicated in each word of a 32-bit immediate, so
we only need to check the lower 16-bits.

v2:
 - Fix is_zero with half-float to consider -0 as well (Jason).
 - Fix is_negative_one for word type.

Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_shader.cpp | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/src/intel/compiler/brw_shader.cpp 
b/src/intel/compiler/brw_shader.cpp
index 569e68e02af..bf09c416e9e 100644
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -719,11 +719,20 @@ backend_reg::is_zero() const
if (file != IMM)
   return false;
 
+   assert(type_sz(type) > 1);
+
switch (type) {
+   case BRW_REGISTER_TYPE_HF:
+  assert((d & 0xffff) == ((d >> 16) & 0xffff));
+  return (d & 0xffff) == 0 || (d & 0xffff) == 0x8000;
case BRW_REGISTER_TYPE_F:
   return f == 0;
case BRW_REGISTER_TYPE_DF:
   return df == 0;
+   case BRW_REGISTER_TYPE_W:
+   case BRW_REGISTER_TYPE_UW:
+  assert((d & 0xffff) == ((d >> 16) & 0xffff));
+  return (d & 0xffff) == 0;
case BRW_REGISTER_TYPE_D:
case BRW_REGISTER_TYPE_UD:
   return d == 0;
@@ -741,11 +750,20 @@ backend_reg::is_one() const
if (file != IMM)
   return false;
 
+   assert(type_sz(type) > 1);
+
switch (type) {
+   case BRW_REGISTER_TYPE_HF:
+  assert((d & 0xffff) == ((d >> 16) & 0xffff));
+  return (d & 0xffff) == 0x3c00;
case BRW_REGISTER_TYPE_F:
   return f == 1.0f;
case BRW_REGISTER_TYPE_DF:
   return df == 1.0;
+   case BRW_REGISTER_TYPE_W:
+   case BRW_REGISTER_TYPE_UW:
+  assert((d & 0xffff) == ((d >> 16) & 0xffff));
+  return (d & 0xffff) == 1;
case BRW_REGISTER_TYPE_D:
case BRW_REGISTER_TYPE_UD:
   return d == 1;
@@ -763,11 +781,19 @@ backend_reg::is_negative_one() const
if (file != IMM)
   return false;
 
+   assert(type_sz(type) > 1);
+
switch (type) {
+   case BRW_REGISTER_TYPE_HF:
+  assert((d & 0xffff) == ((d >> 16) & 0xffff));
+  return (d & 0xffff) == 0xbc00;
case BRW_REGISTER_TYPE_F:
   return f == -1.0;
case BRW_REGISTER_TYPE_DF:
   return df == -1.0;
+   case BRW_REGISTER_TYPE_W:
+  assert((d & 0xffff) == ((d >> 16) & 0xffff));
+  return (d & 0xffff) == 0xffff;
case BRW_REGISTER_TYPE_D:
   return d == -1;
case BRW_REGISTER_TYPE_Q:
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 29/40] intel/compiler: add a brw_reg_type_is_integer helper

2019-02-12 Thread Iago Toral Quiroga
v2:
 - Fixed typo: meant BRW_REGISTER_TYPE_UB instead of BRW_REGISTER_TYPE_UV

Reviewed-by: Jason Ekstrand  (v1)
---
 src/intel/compiler/brw_reg_type.h | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/src/intel/compiler/brw_reg_type.h 
b/src/intel/compiler/brw_reg_type.h
index ffbec90d3fe..086770d2e03 100644
--- a/src/intel/compiler/brw_reg_type.h
+++ b/src/intel/compiler/brw_reg_type.h
@@ -82,6 +82,24 @@ brw_reg_type_is_floating_point(enum brw_reg_type type)
}
 }
 
+static inline bool
+brw_reg_type_is_integer(enum brw_reg_type type)
+{
+   switch (type) {
+   case BRW_REGISTER_TYPE_Q:
+   case BRW_REGISTER_TYPE_UQ:
+   case BRW_REGISTER_TYPE_D:
+   case BRW_REGISTER_TYPE_UD:
+   case BRW_REGISTER_TYPE_W:
+   case BRW_REGISTER_TYPE_UW:
+   case BRW_REGISTER_TYPE_B:
+   case BRW_REGISTER_TYPE_UB:
+  return true;
+   default:
+  return false;
+   }
+}
+
 unsigned
 brw_reg_type_to_hw_type(const struct gen_device_info *devinfo,
 enum brw_reg_file file, enum brw_reg_type type);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 19/40] intel/compiler: fix ddy for half-float in Broadwell

2019-02-12 Thread Iago Toral Quiroga
Broadwell has restrictions that apply to Align16 half-float that
make the Align16 implementation of this invalid for this platform.
Use the gen11 path for this instead, which uses Align1 mode.

The restriction is not present in cherryview, gen9 or gen10, where
the Align16 implementation seems to work just fine.

v2:
 - Rework the comment in the code, move the PRM citation from the
   commit message to the comment in the code (Matt)
 - Cherryview isn't affected, only Broadwell (Matt)

Reviewed-by: Jason Ekstrand  (v1)
Reviewed-by: Matt Turner 
---
 src/intel/compiler/brw_fs_generator.cpp | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs_generator.cpp 
b/src/intel/compiler/brw_fs_generator.cpp
index 996eafd4af1..37129ce4c67 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1272,8 +1272,21 @@ fs_generator::generate_ddy(const fs_inst *inst,
const uint32_t type_size = type_sz(src.type);
 
if (inst->opcode == FS_OPCODE_DDY_FINE) {
-  /* produce accurate derivatives */
-  if (devinfo->gen >= 11) {
+  /* produce accurate derivatives.
+   *
+   * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU)
+   * "Register Region Restrictions", Section "1. Special Restrictions":
+   *
+   *"In Align16 mode, the channel selects and channel enables apply to
+   * a pair of half-floats, because these parameters are defined for
+   * DWord elements ONLY. This is applicable when both source and
+   * destination are half-floats."
+   *
+   * So for half-float operations we use the Gen11+ Align1 path. CHV
+   * inherits its FP16 hardware from SKL, so it is not affected.
+   */
+  if (devinfo->gen >= 11 ||
+  (devinfo->is_broadwell && src.type == BRW_REGISTER_TYPE_HF)) {
  src = stride(src, 0, 2, 1);
  struct brw_reg src_0  = byte_offset(src,  0 * type_size);
  struct brw_reg src_2  = byte_offset(src,  2 * type_size);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 27/40] intel/compiler: generalize the combine constants pass

2019-02-12 Thread Iago Toral Quiroga
At the very least we need it to handle HF too, since we are doing
constant propagation for MAD and LRP, which relies on this pass
to promote the immediates to GRF in the end, but ideally
we want it to support even more types so we can take advantage
of it to improve register pressure in some scenarios.
---
 .../compiler/brw_fs_combine_constants.cpp | 202 --
 1 file changed, 180 insertions(+), 22 deletions(-)

diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp 
b/src/intel/compiler/brw_fs_combine_constants.cpp
index 7343f77bb45..5d79f1a0826 100644
--- a/src/intel/compiler/brw_fs_combine_constants.cpp
+++ b/src/intel/compiler/brw_fs_combine_constants.cpp
@@ -36,6 +36,7 @@
 
 #include "brw_fs.h"
 #include "brw_cfg.h"
+#include "util/half_float.h"
 
 using namespace brw;
 
@@ -114,8 +115,17 @@ struct imm {
 */
exec_list *uses;
 
-   /** The immediate value.  We currently only handle floats. */
-   float val;
+   /** The immediate value */
+   union {
+  char bytes[8];
+  float f;
+  int32_t d;
+  int16_t w;
+   };
+   uint8_t size;
+
+   /** When promoting half-float we need to account for certain restrictions */
+   bool is_half_float;
 
/**
 * The GRF register and subregister number where we've decided to store the
@@ -145,10 +155,11 @@ struct table {
 };
 
 static struct imm *
-find_imm(struct table *table, float val)
+find_imm(struct table *table, void *data, uint8_t size)
 {
for (int i = 0; i < table->len; i++) {
-  if (table->imm[i].val == val) {
+  if (table->imm[i].size == size &&
+  !memcmp(table->imm[i].bytes, data, size)) {
  return &table->imm[i];
   }
}
@@ -190,6 +201,96 @@ compare(const void *_a, const void *_b)
return a->first_use_ip - b->first_use_ip;
 }
 
+static bool
+get_constant_value(const struct gen_device_info *devinfo,
+   const fs_inst *inst, uint32_t src_idx,
+   void *out, brw_reg_type *out_type)
+{
+   const bool can_do_source_mods = inst->can_do_source_mods(devinfo);
+   const fs_reg *src = &inst->src[src_idx];
+
+   *out_type = src->type;
+
+   switch (*out_type) {
+   case BRW_REGISTER_TYPE_F: {
+  float val = !can_do_source_mods ? src->f : fabsf(src->f);
+  memcpy(out, &val, 4);
+  break;
+   }
+   case BRW_REGISTER_TYPE_HF: {
+  uint16_t val = src->d & 0xffffu;
+  if (can_do_source_mods)
+ val = _mesa_float_to_half(fabsf(_mesa_half_to_float(val)));
+  memcpy(out, &val, 2);
+  break;
+   }
+   case BRW_REGISTER_TYPE_D: {
+  int32_t val = !can_do_source_mods ? src->d : abs(src->d);
+  memcpy(out, &val, 4);
+  break;
+   }
+   case BRW_REGISTER_TYPE_UD:
+  memcpy(out, &src->ud, 4);
+  break;
+   case BRW_REGISTER_TYPE_W: {
+  int16_t val = src->d & 0xffffu;
+  if (can_do_source_mods)
+ val = abs(val);
+  memcpy(out, &val, 2);
+  break;
+   }
+   case BRW_REGISTER_TYPE_UW:
+  memcpy(out, &src->ud, 2);
+  break;
+   default:
+  return false;
+   };
+
+   return true;
+}
+
+static struct brw_reg
+build_imm_reg_for_copy(struct imm *imm)
+{
+   switch (imm->size) {
+   case 4:
+  return brw_imm_d(imm->d);
+   case 2:
+  return brw_imm_w(imm->w);
+   default:
+  unreachable("not implemented");
+   }
+}
+
+static inline uint32_t
+get_alignment_for_imm(const struct imm *imm)
+{
+   if (imm->is_half_float)
+  return 4; /* At least MAD seems to require this */
+   else
+  return imm->size;
+}
+
+static bool
+needs_negate(const struct fs_reg *reg, const struct imm *imm)
+{
+   switch (reg->type) {
+   case BRW_REGISTER_TYPE_F:
+  return signbit(reg->f) != signbit(imm->f);
+   case BRW_REGISTER_TYPE_D:
+  return (reg->d < 0) != (imm->d < 0);
+   case BRW_REGISTER_TYPE_HF:
+  return (reg->d & 0x8000u) != (imm->w & 0x8000u);
+   case BRW_REGISTER_TYPE_W:
+  return ((reg->d & 0xffffu) < 0) != (imm->w < 0);
+   case BRW_REGISTER_TYPE_UD:
+   case BRW_REGISTER_TYPE_UW:
+  return false;
+   default:
+  unreachable("not implemented");
+   };
+}
+
 bool
 fs_visitor::opt_combine_constants()
 {
@@ -214,13 +315,17 @@ fs_visitor::opt_combine_constants()
  continue;
 
   for (int i = 0; i < inst->sources; i++) {
- if (inst->src[i].file != IMM ||
- inst->src[i].type != BRW_REGISTER_TYPE_F)
+ if (inst->src[i].file != IMM)
 continue;
 
- float val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f :
- fabs(inst->src[i].f);
- struct imm *imm = find_imm(&table, val);
+ char data[8];
+ brw_reg_type type;
+ if (!get_constant_value(devinfo, inst, i, data, &type))
+continue;
+
+ uint8_t size = type_sz(type);
+
+ struct imm *imm = find_imm(&table, data, size);
 
  if (imm) {
 bblock_t *intersection = cfg_t::intersect(block, imm->block);
@@ -237,7 +342,9 @@ fs_visitor::opt_combine_constants(

[Mesa-dev] [PATCH v4 26/40] intel/eu: force stride of 2 on NULL register for Byte instructions

2019-02-12 Thread Iago Toral Quiroga
The hardware only allows a stride of 1 on a Byte destination for raw
byte MOV instructions. This is required even when the destination
is the NULL register.

Rather than making sure that we emit a proper NULL:B destination
every time we need one, just fix it at emission time.

Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_eu_emit.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 195c26ab760..0c79a77c9c7 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -94,6 +94,17 @@ brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct 
brw_reg dest)
else if (dest.file == BRW_GENERAL_REGISTER_FILE)
   assert(dest.nr < 128);
 
+   /* The hardware has a restriction where if the destination is Byte,
+* the instruction needs to have a stride of 2 (except for packed byte
+* MOV). This seems to be required even if the destination is the NULL
+* register.
+*/
+   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+   dest.nr == BRW_ARF_NULL &&
+   type_sz(dest.type) == 1) {
+  dest.hstride = BRW_HORIZONTAL_STRIDE_2;
+   }
+
gen7_convert_mrf_to_grf(p, &dest);
 
if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 25/40] intel/compiler: ask for an integer type if requesting an 8-bit type

2019-02-12 Thread Iago Toral Quiroga
v2:
  - Assign BRW_REGISTER_TYPE_B directly for 8-bit (Jason)

Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs_nir.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 40c0481ac53..761d4d3c4ed 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -345,7 +345,7 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl)
   unsigned array_elems =
  reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
   unsigned size = array_elems * reg->num_components;
-  const brw_reg_type reg_type =
+  const brw_reg_type reg_type = reg->bit_size == 8 ? BRW_REGISTER_TYPE_B :
  brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
   nir_locals[reg->index] = bld.vgrf(reg_type, size);
}
@@ -4376,7 +4376,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, 
nir_intrinsic_instr *instr
   fs_reg value = get_nir_src(instr->src[0]);
   if (instr->intrinsic == nir_intrinsic_vote_feq) {
  const unsigned bit_size = nir_src_bit_size(instr->src[0]);
- value.type = brw_reg_type_from_bit_size(bit_size, 
BRW_REGISTER_TYPE_F);
+ value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B :
+brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
   }
 
   fs_reg uniformized = bld.emit_uniformize(value);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 30/40] intel/compiler: fix cmod propagation for non 32-bit types

2019-02-12 Thread Iago Toral Quiroga
v2:
 - Do not propagate if the bit-size changes

Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs_cmod_propagation.cpp | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp 
b/src/intel/compiler/brw_fs_cmod_propagation.cpp
index 7bb5c9afbc9..57d4e645c05 100644
--- a/src/intel/compiler/brw_fs_cmod_propagation.cpp
+++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp
@@ -244,8 +244,7 @@ opt_cmod_propagation_local(const gen_device_info *devinfo,
 /* CMP's result is the same regardless of dest type. */
 if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
 scan_inst->opcode == BRW_OPCODE_CMP &&
-(inst->dst.type == BRW_REGISTER_TYPE_D ||
- inst->dst.type == BRW_REGISTER_TYPE_UD)) {
+brw_reg_type_is_integer(inst->dst.type)) {
inst->remove(block);
progress = true;
break;
@@ -258,9 +257,14 @@ opt_cmod_propagation_local(const gen_device_info *devinfo,
break;
 
 /* Comparisons operate differently for ints and floats */
-if (scan_inst->dst.type != inst->dst.type &&
-(scan_inst->dst.type == BRW_REGISTER_TYPE_F ||
- inst->dst.type == BRW_REGISTER_TYPE_F))
+if (brw_reg_type_is_floating_point(scan_inst->dst.type) !=
+brw_reg_type_is_floating_point(inst->dst.type))
+   break;
+
+/* Comparison result may be altered if the bit-size changes
+ * since that affects range, denorms, etc
+ */
+if (type_sz(scan_inst->dst.type) != type_sz(inst->dst.type))
break;
 
 /* If the instruction generating inst's source also wrote the
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 35/40] intel/compiler: validate conversions between 64-bit and 8-bit types

2019-02-12 Thread Iago Toral Quiroga
---
 src/intel/compiler/brw_eu_validate.c| 10 +-
 src/intel/compiler/test_eu_validate.cpp | 46 +
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_eu_validate.c 
b/src/intel/compiler/brw_eu_validate.c
index 203641fecb9..b1fdd1ce941 100644
--- a/src/intel/compiler/brw_eu_validate.c
+++ b/src/intel/compiler/brw_eu_validate.c
@@ -533,10 +533,18 @@ general_restrictions_based_on_operand_types(const struct 
gen_device_info *devinf
 
/* From the BDW+ PRM:
 *
-*"There is no direct conversion from HF to DF or DF to HF.
+*"There is no direct conversion from B/UB to DF or DF to B/UB.
+* There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
+* There is no direct conversion from HF to DF or DF to HF.
 * There is no direct conversion from HF to Q/UQ or Q/UQ to HF."
 */
enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
+
+   ERROR_IF(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV &&
+((dst_type_size == 1 && type_sz(src0_type) == 8) ||
+ (dst_type_size == 8 && type_sz(src0_type) == 1)),
+"There are no direct conversion between 64-bit types and B/UB");
+
ERROR_IF(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV &&
 ((dst_type == BRW_REGISTER_TYPE_HF && type_sz(src0_type) == 8) ||
  (dst_type_size == 8 && src0_type == BRW_REGISTER_TYPE_HF)),
diff --git a/src/intel/compiler/test_eu_validate.cpp 
b/src/intel/compiler/test_eu_validate.cpp
index 1557b6d2452..06beb53eb5d 100644
--- a/src/intel/compiler/test_eu_validate.cpp
+++ b/src/intel/compiler/test_eu_validate.cpp
@@ -848,6 +848,52 @@ TEST_P(validation_test, byte_destination_relaxed_alignment)
}
 }
 
+TEST_P(validation_test, byte_64bit_conversion)
+{
+   static const struct {
+  enum brw_reg_type dst_type;
+  enum brw_reg_type src_type;
+  unsigned dst_stride;
+  bool expected_result;
+   } inst[] = {
+#define INST(dst_type, src_type, dst_stride, expected_result) \
+  {   \
+ BRW_REGISTER_TYPE_##dst_type,\
+ BRW_REGISTER_TYPE_##src_type,\
+ BRW_HORIZONTAL_STRIDE_##dst_stride,  \
+ expected_result, \
+  }
+
+  INST(B,  Q, 1, false),
+  INST(B, UQ, 1, false),
+  INST(B, DF, 1, false),
+
+  INST(B,  Q, 2, false),
+  INST(B, UQ, 2, false),
+  INST(B, DF, 2, false),
+
+  INST(B,  Q, 4, false),
+  INST(B, UQ, 4, false),
+  INST(B, DF, 4, false),
+
+#undef INST
+   };
+
+   if (devinfo.gen < 8)
+  return;
+
+   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
+  if (!devinfo.has_64bit_types && type_sz(inst[i].src_type) == 8)
+ continue;
+
+  brw_MOV(p, retype(g0, inst[i].dst_type), retype(g0, inst[i].src_type));
+  brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
+  EXPECT_EQ(inst[i].expected_result, validate(p));
+
+  clear_instructions(p);
+   }
+}
+
 TEST_P(validation_test, half_float_conversion)
 {
static const struct {
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 18/40] intel/compiler: fix ddx and ddy for 16-bit float

2019-02-12 Thread Iago Toral Quiroga
We were assuming 32-bit elements. Also, in SIMD8 we pack 2 vector components
in a single SIMD register, so for example, component Y of a 16-bit vec2
starts at byte offset 16B. This means that when we compute the offset of
the elements to be differentiated we should not stomp on whatever base offset
we have, but instead add to it.

v2
 - Use byte_offset() helper (Jason)
 - Merge the fix for SIMD8: using byte_offset() fixes that too.

Reviewed-by: Jason Ekstrand  (v1)
Reviewed-by: Matt Turner 
---
 src/intel/compiler/brw_fs_generator.cpp | 37 -
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/src/intel/compiler/brw_fs_generator.cpp 
b/src/intel/compiler/brw_fs_generator.cpp
index e3b68fa3165..996eafd4af1 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1248,10 +1248,9 @@ fs_generator::generate_ddx(const fs_inst *inst,
   width = BRW_WIDTH_4;
}
 
-   struct brw_reg src0 = src;
+   struct brw_reg src0 = byte_offset(src, type_sz(src.type));;
struct brw_reg src1 = src;
 
-   src0.subnr   = sizeof(float);
src0.vstride = vstride;
src0.width   = width;
src0.hstride = BRW_HORIZONTAL_STRIDE_0;
@@ -1270,23 +1269,25 @@ void
 fs_generator::generate_ddy(const fs_inst *inst,
struct brw_reg dst, struct brw_reg src)
 {
+   const uint32_t type_size = type_sz(src.type);
+
if (inst->opcode == FS_OPCODE_DDY_FINE) {
   /* produce accurate derivatives */
   if (devinfo->gen >= 11) {
  src = stride(src, 0, 2, 1);
- struct brw_reg src_0  = byte_offset(src,  0 * sizeof(float));
- struct brw_reg src_2  = byte_offset(src,  2 * sizeof(float));
- struct brw_reg src_4  = byte_offset(src,  4 * sizeof(float));
- struct brw_reg src_6  = byte_offset(src,  6 * sizeof(float));
- struct brw_reg src_8  = byte_offset(src,  8 * sizeof(float));
- struct brw_reg src_10 = byte_offset(src, 10 * sizeof(float));
- struct brw_reg src_12 = byte_offset(src, 12 * sizeof(float));
- struct brw_reg src_14 = byte_offset(src, 14 * sizeof(float));
-
- struct brw_reg dst_0  = byte_offset(dst,  0 * sizeof(float));
- struct brw_reg dst_4  = byte_offset(dst,  4 * sizeof(float));
- struct brw_reg dst_8  = byte_offset(dst,  8 * sizeof(float));
- struct brw_reg dst_12 = byte_offset(dst, 12 * sizeof(float));
+ struct brw_reg src_0  = byte_offset(src,  0 * type_size);
+ struct brw_reg src_2  = byte_offset(src,  2 * type_size);
+ struct brw_reg src_4  = byte_offset(src,  4 * type_size);
+ struct brw_reg src_6  = byte_offset(src,  6 * type_size);
+ struct brw_reg src_8  = byte_offset(src,  8 * type_size);
+ struct brw_reg src_10 = byte_offset(src, 10 * type_size);
+ struct brw_reg src_12 = byte_offset(src, 12 * type_size);
+ struct brw_reg src_14 = byte_offset(src, 14 * type_size);
+
+ struct brw_reg dst_0  = byte_offset(dst,  0 * type_size);
+ struct brw_reg dst_4  = byte_offset(dst,  4 * type_size);
+ struct brw_reg dst_8  = byte_offset(dst,  8 * type_size);
+ struct brw_reg dst_12 = byte_offset(dst, 12 * type_size);
 
  brw_push_insn_state(p);
  brw_set_default_exec_size(p, BRW_EXECUTE_4);
@@ -1313,10 +1314,8 @@ fs_generator::generate_ddy(const fs_inst *inst,
   }
} else {
   /* replicate the derivative at the top-left pixel to other pixels */
-  struct brw_reg src0 = stride(src, 4, 4, 0);
-  struct brw_reg src1 = stride(src, 4, 4, 0);
-  src0.subnr = 0 * sizeof(float);
-  src1.subnr = 2 * sizeof(float);
+  struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size);
+  struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size);
 
   brw_ADD(p, dst, negate(src0), src1);
}
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 38/40] compiler/spirv: move the check for Int8 capability

2019-02-12 Thread Iago Toral Quiroga
So it is right after the checks for the other various Int* capabilities.
---
 src/compiler/spirv/spirv_to_nir.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/compiler/spirv/spirv_to_nir.c 
b/src/compiler/spirv/spirv_to_nir.c
index 1cbc926c818..7e07de2bfc0 100644
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -3579,6 +3579,9 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, 
SpvOp opcode,
   case SpvCapabilityInt16:
  spv_check_supported(int16, cap);
  break;
+  case SpvCapabilityInt8:
+ spv_check_supported(int8, cap);
+ break;
 
   case SpvCapabilityTransformFeedback:
  spv_check_supported(transform_feedback, cap);
@@ -3591,10 +3594,6 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, 
SpvOp opcode,
   case SpvCapabilityInt64Atomics:
  spv_check_supported(int64_atomics, cap);
 
-  case SpvCapabilityInt8:
- spv_check_supported(int8, cap);
- break;
-
   case SpvCapabilityStorageImageMultisample:
  spv_check_supported(storage_image_ms, cap);
  break;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 11/40] compiler/nir: add lowering for 16-bit flrp

2019-02-12 Thread Iago Toral Quiroga
And enable it on Intel.

v2:
 - Squash the change to enable it on Intel (Jason)

Reviewed-by: Jason Ekstrand 
---
 src/compiler/nir/nir.h| 1 +
 src/compiler/nir/nir_opt_algebraic.py | 1 +
 src/intel/compiler/brw_compiler.c | 1 +
 3 files changed, 3 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 6547a468648..740c64d2a94 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2113,6 +2113,7 @@ typedef struct nir_shader_compiler_options {
bool lower_fdiv;
bool lower_ffma;
bool fuse_ffma;
+   bool lower_flrp16;
bool lower_flrp32;
/** Lowers flrp when it does not support doubles */
bool lower_flrp64;
diff --git a/src/compiler/nir/nir_opt_algebraic.py 
b/src/compiler/nir/nir_opt_algebraic.py
index cd969de1f88..40eb3de02c3 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -124,6 +124,7 @@ optimizations = [
(('~flrp', 0.0, a, b), ('fmul', a, b)),
(('~flrp', a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 
'options->lower_flrp32'),
(('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
+   (('flrp@16', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 
'options->lower_flrp16'),
(('flrp@32', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 
'options->lower_flrp32'),
(('flrp@64', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 
'options->lower_flrp64'),
(('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
diff --git a/src/intel/compiler/brw_compiler.c 
b/src/intel/compiler/brw_compiler.c
index f885e79c3e6..04a1a7cac4e 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -33,6 +33,7 @@
.lower_sub = true, \
.lower_fdiv = true,\
.lower_scmp = true,\
+   .lower_flrp16 = true,  \
.lower_fmod16 = true,  \
.lower_fmod32 = true,  \
.lower_fmod64 = false, \
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 02/40] intel/compiler: add a NIR pass to lower conversions

2019-02-12 Thread Iago Toral Quiroga
Some conversions are not directly supported in hardware and need to be
split into two conversion instructions going through an intermediary type.
Doing this at the NIR level reduces the complexity in the backend a bit.

v2:
 - Consider fp16 rounding conversion opcodes
 - Properly handle swizzles on conversion sources.

v3
 - Run the pass earlier, right after nir_opt_algebraic_late (Jason)
 - NIR alu output types already have the bit-size (Jason)
 - Use the new is_conversion field in nir_op_info to select conversion
   instructions (Jason)
 - Use 'is_conversion' to identify conversion operations (Jason)

Reviewed-by: Topi Pohjolainen  (v1)
---
 src/intel/Makefile.sources|   1 +
 src/intel/compiler/brw_nir.c  |   2 +
 src/intel/compiler/brw_nir.h  |   2 +
 .../compiler/brw_nir_lower_conversions.c  | 158 ++
 src/intel/compiler/meson.build|   1 +
 5 files changed, 164 insertions(+)
 create mode 100644 src/intel/compiler/brw_nir_lower_conversions.c

diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
index 94a28d370e8..9975daa3ad1 100644
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -83,6 +83,7 @@ COMPILER_FILES = \
compiler/brw_nir_analyze_boolean_resolves.c \
compiler/brw_nir_analyze_ubo_ranges.c \
compiler/brw_nir_attribute_workarounds.c \
+   compiler/brw_nir_lower_conversions.c \
compiler/brw_nir_lower_cs_intrinsics.c \
compiler/brw_nir_lower_image_load_store.c \
compiler/brw_nir_lower_mem_access_bit_sizes.c \
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 9dbf06004a4..7e3dbc9e447 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -876,6 +876,8 @@ brw_postprocess_nir(nir_shader *nir, const struct 
brw_compiler *compiler,
 
OPT(nir_opt_algebraic_late);
 
+   OPT(brw_nir_lower_conversions);
+
OPT(nir_lower_to_source_mods, nir_lower_all_source_mods);
OPT(nir_copy_prop);
OPT(nir_opt_dce);
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index bc81950d47e..662b2627e95 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -114,6 +114,8 @@ void brw_nir_lower_tcs_outputs(nir_shader *nir, const 
struct brw_vue_map *vue,
GLenum tes_primitive_mode);
 void brw_nir_lower_fs_outputs(nir_shader *nir);
 
+bool brw_nir_lower_conversions(nir_shader *nir);
+
 bool brw_nir_lower_image_load_store(nir_shader *nir,
 const struct gen_device_info *devinfo);
 void brw_nir_rewrite_image_intrinsic(nir_intrinsic_instr *intrin,
diff --git a/src/intel/compiler/brw_nir_lower_conversions.c 
b/src/intel/compiler/brw_nir_lower_conversions.c
new file mode 100644
index 00000000000..bb1312ad428
--- /dev/null
+++ b/src/intel/compiler/brw_nir_lower_conversions.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+static nir_op
+get_conversion_op(nir_alu_type src_type,
+  unsigned src_bit_size,
+  nir_alu_type dst_type,
+  unsigned dst_bit_size,
+  nir_rounding_mode rounding_mode)
+{
+   nir_alu_type src_full_type = (nir_alu_type) (src_type | src_bit_size);
+   nir_alu_type dst_full_type = (nir_alu_type) (dst_type | dst_bit_size);
+
+   return nir_type_conversion_op(src_full_type, dst_full_type, rounding_mode);
+}
+
+static nir_rounding_mode
+get_opcode_rounding_mode(nir_op op)
+{
+   switch (op) {
+   case nir_op_f2f16_rtz:
+  return nir_rounding_mode_rtz;
+   case nir_op_f2f16_rtne:
+  return nir_rounding_mode_rtne;
+   default:
+  return nir_rounding_mode_undef;
+   }
+}
+
+static void
+split_conversion(nir_builder

[Mesa-dev] [PATCH v4 40/40] anv/device: expose VK_KHR_shader_float16_int8 in gen8+

2019-02-12 Thread Iago Toral Quiroga
v2 (Jason):
 - Merge shaderFloat16 and shaderInt8 enablement into a single patch.
 - Merge extension enable.

Reviewed-by: Jason Ekstrand  (v1)
---
 src/intel/vulkan/anv_device.c  | 9 +
 src/intel/vulkan/anv_extensions.py | 1 +
 2 files changed, 10 insertions(+)

diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 77120937c51..0fc6d59162d 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -1001,6 +1001,15 @@ void anv_GetPhysicalDeviceFeatures2(
  break;
   }
 
+  case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR: {
+ VkPhysicalDeviceFloat16Int8FeaturesKHR *features = (void *)ext;
+ ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+
+ features->shaderFloat16 = pdevice->info.gen >= 8;
+ features->shaderInt8 = pdevice->info.gen >= 8;
+ break;
+  }
+
   default:
  anv_debug_ignored_stype(ext->sType);
  break;
diff --git a/src/intel/vulkan/anv_extensions.py 
b/src/intel/vulkan/anv_extensions.py
index e502b5d5685..db28898d663 100644
--- a/src/intel/vulkan/anv_extensions.py
+++ b/src/intel/vulkan/anv_extensions.py
@@ -109,6 +109,7 @@ EXTENSIONS = [
 Extension('VK_KHR_sampler_mirror_clamp_to_edge',  1, True),
 Extension('VK_KHR_sampler_ycbcr_conversion',  1, True),
 Extension('VK_KHR_shader_draw_parameters',1, True),
+Extension('VK_KHR_shader_float16_int8',   1, 'device->info.gen 
>= 8'),
 Extension('VK_KHR_storage_buffer_storage_class',  1, True),
 Extension('VK_KHR_surface',  25, 
'ANV_HAS_SURFACE'),
 Extension('VK_KHR_swapchain',68, 
'ANV_HAS_SURFACE'),
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 09/40] intel/compiler: drop unnecessary temporary from 32-bit fsign implementation

2019-02-12 Thread Iago Toral Quiroga
Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs_nir.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 64e24f86b5a..f59e9ad4e2b 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -866,12 +866,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
   */
  bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
 
- fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
  op[0].type = BRW_REGISTER_TYPE_UD;
  result.type = BRW_REGISTER_TYPE_UD;
- bld.AND(result_int, op[0], brw_imm_ud(0x8000u));
+ bld.AND(result, op[0], brw_imm_ud(0x8000u));
 
- inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f80u));
+ inst = bld.OR(result, result, brw_imm_ud(0x3f80u));
  inst->predicate = BRW_PREDICATE_NORMAL;
   } else {
  /* For doubles we do the same but we need to consider:
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 07/40] intel/compiler: handle extended math restrictions for half-float

2019-02-12 Thread Iago Toral Quiroga
Extended math with half-float operands is only supported since gen9,
but it is limited to SIMD8. In gen8 we lower it to 32-bit.

v2: quashed together the following patches (Jason):
  - intel/compiler: allow extended math functions with HF operands
  - intel/compiler: lower 16-bit extended math to 32-bit prior to gen9
  - intel/compiler: extended Math is limited to SIMD8 on half-float

Reviewed-by: Jason Ekstrand 
Reviewed-by: Topi Pohjolainen 
  (allow extended math functions with HF operands,
   extended Math is limited to SIMD8 on half-float)
---
 src/intel/compiler/brw_eu_emit.c |  6 --
 src/intel/compiler/brw_fs.cpp| 27 ++-
 src/intel/compiler/brw_nir.c | 13 -
 3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 9be82d1b87c..2f31d9591fc 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -1916,8 +1916,10 @@ void gen6_math(struct brw_codegen *p,
   assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
  (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
} else {
-  assert(src0.type == BRW_REGISTER_TYPE_F);
-  assert(src1.type == BRW_REGISTER_TYPE_F);
+  assert(src0.type == BRW_REGISTER_TYPE_F ||
+ (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
+  assert(src1.type == BRW_REGISTER_TYPE_F ||
+ (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
}
 
/* Source modifiers are ignored for extended math instructions on Gen6. */
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 5a18ba86a96..d172c2de4d7 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5898,18 +5898,27 @@ get_lowered_simd_width(const struct gen_device_info 
*devinfo,
case SHADER_OPCODE_EXP2:
case SHADER_OPCODE_LOG2:
case SHADER_OPCODE_SIN:
-   case SHADER_OPCODE_COS:
+   case SHADER_OPCODE_COS: {
   /* Unary extended math instructions are limited to SIMD8 on Gen4 and
-   * Gen6.
+   * Gen6. Extended Math Function is limited to SIMD8 with half-float.
*/
-  return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
-  devinfo->gen == 5 || devinfo->is_g4x ? MIN2(16, inst->exec_size) 
:
-  MIN2(8, inst->exec_size));
+  if (devinfo->gen == 6 || (devinfo->gen == 4 && !devinfo->is_g4x))
+ return MIN2(8, inst->exec_size);
+  if (inst->dst.type == BRW_REGISTER_TYPE_HF)
+ return MIN2(8, inst->exec_size);
+  return MIN2(16, inst->exec_size);
+   }
 
-   case SHADER_OPCODE_POW:
-  /* SIMD16 is only allowed on Gen7+. */
-  return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
-  MIN2(8, inst->exec_size));
+   case SHADER_OPCODE_POW: {
+  /* SIMD16 is only allowed on Gen7+. Extended Math Function is limited
+   * to SIMD8 with half-float
+   */
+  if (devinfo->gen < 7)
+ return MIN2(8, inst->exec_size);
+  if (inst->dst.type == BRW_REGISTER_TYPE_HF)
+ return MIN2(8, inst->exec_size);
+  return MIN2(16, inst->exec_size);
+   }
 
case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 75513e5113c..9b26a6c3d6f 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -631,6 +631,8 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED 
void *data)
if (alu->dest.dest.ssa.bit_size != 16)
   return 0;
 
+   const struct brw_compiler *compiler = (const struct brw_compiler *) data;
+
switch (alu->op) {
case nir_op_idiv:
case nir_op_imod:
@@ -643,6 +645,15 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED 
void *data)
case nir_op_fround_even:
case nir_op_ftrunc:
   return 32;
+   case nir_op_frcp:
+   case nir_op_frsq:
+   case nir_op_fsqrt:
+   case nir_op_fpow:
+   case nir_op_fexp2:
+   case nir_op_flog2:
+   case nir_op_fsin:
+   case nir_op_fcos:
+  return compiler->devinfo->gen < 9 ? 32 : 0;
default:
   return 0;
}
@@ -770,7 +781,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, 
nir_shader *nir)
   OPT(nir_opt_large_constants, NULL, 32);
}
 
-   OPT(nir_lower_bit_size, lower_bit_size_callback, NULL);
+   OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)compiler);
 
if (is_scalar) {
   OPT(nir_lower_load_const_to_scalar);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 22/40] intel/compiler: activate 16-bit bit-size lowerings also for 8-bit

2019-02-12 Thread Iago Toral Quiroga
Particularly, we need the same lowerings we use for 16-bit
integers.

Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_nir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 9b26a6c3d6f..1d62f2adde8 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -628,7 +628,7 @@ static unsigned
 lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data)
 {
assert(alu->dest.dest.is_ssa);
-   if (alu->dest.dest.ssa.bit_size != 16)
+   if (alu->dest.dest.ssa.bit_size >= 32)
   return 0;
 
const struct brw_compiler *compiler = (const struct brw_compiler *) data;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 31/40] intel/compiler: remove inexact algebraic optimizations from the backend

2019-02-12 Thread Iago Toral Quiroga
NIR already has these and correctly considers exact/inexact qualification,
whereas the backend doesn't and can apply the optimizations where it
shouldn't. This happened to be the case in a handful of Tomb Raider shaders,
where NIR would skip the optimizations because of a precise qualification
but the backend would then (incorrectly) apply them anyway.

Besides this, considering that we are not emitting much math in the backend
these days it is unlikely that these optimizations are useful in general. A
shader-db run confirms that MAD and LRP optimizations, for example, were only
being triggered in cases where NIR would skip them due to precise
requirements, so in the near future we might want to remove more of these,
but for now we just remove the ones that are not completely correct.

Suggested-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs.cpp | 39 +--
 1 file changed, 1 insertion(+), 38 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 0f04e577de3..873a1dd8196 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2547,15 +2547,6 @@ fs_visitor::opt_algebraic()
 break;
  }
 
- /* a * 0.0 = 0.0 */
- if (inst->src[1].is_zero()) {
-inst->opcode = BRW_OPCODE_MOV;
-inst->src[0] = inst->src[1];
-inst->src[1] = reg_undef;
-progress = true;
-break;
- }
-
  if (inst->src[0].file == IMM) {
 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
 inst->opcode = BRW_OPCODE_MOV;
@@ -2569,14 +2560,6 @@ fs_visitor::opt_algebraic()
  if (inst->src[1].file != IMM)
 continue;
 
- /* a + 0.0 = a */
- if (inst->src[1].is_zero()) {
-inst->opcode = BRW_OPCODE_MOV;
-inst->src[1] = reg_undef;
-progress = true;
-break;
- }
-
  if (inst->src[0].file == IMM) {
 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
 inst->opcode = BRW_OPCODE_MOV;
@@ -2595,16 +2578,6 @@ fs_visitor::opt_algebraic()
 break;
  }
  break;
-  case BRW_OPCODE_LRP:
- if (inst->src[1].equals(inst->src[2])) {
-inst->opcode = BRW_OPCODE_MOV;
-inst->src[0] = inst->src[1];
-inst->src[1] = reg_undef;
-inst->src[2] = reg_undef;
-progress = true;
-break;
- }
- break;
   case BRW_OPCODE_CMP:
  if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
   inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
@@ -2682,17 +2655,7 @@ fs_visitor::opt_algebraic()
  }
  break;
   case BRW_OPCODE_MAD:
- if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
-inst->opcode = BRW_OPCODE_MOV;
-inst->src[1] = reg_undef;
-inst->src[2] = reg_undef;
-progress = true;
- } else if (inst->src[0].is_zero()) {
-inst->opcode = BRW_OPCODE_MUL;
-inst->src[0] = inst->src[2];
-inst->src[2] = reg_undef;
-progress = true;
- } else if (inst->src[1].is_one()) {
+ if (inst->src[1].is_one()) {
 inst->opcode = BRW_OPCODE_ADD;
 inst->src[1] = inst->src[2];
 inst->src[2] = reg_undef;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 16/40] intel/compiler: allow half-float on 3-source instructions since gen8

2019-02-12 Thread Iago Toral Quiroga
Reviewed-by: Topi Pohjolainen 
Reviewed-by: Jason Ekstrand 
Reviewed-by: Matt Turner 
---
 src/intel/compiler/brw_eu_emit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 2f31d9591fc..30037e71b00 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -797,7 +797,8 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct 
brw_reg dest,
   assert(dest.type == BRW_REGISTER_TYPE_F  ||
  dest.type == BRW_REGISTER_TYPE_DF ||
  dest.type == BRW_REGISTER_TYPE_D  ||
- dest.type == BRW_REGISTER_TYPE_UD);
+ dest.type == BRW_REGISTER_TYPE_UD ||
+ (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8));
   if (devinfo->gen == 6) {
  brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
 dest.file == 
BRW_MESSAGE_REGISTER_FILE);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 32/40] intel/compiler: skip MAD algebraic optimization for half-float or mixed mode

2019-02-12 Thread Iago Toral Quiroga
It is very likely that this optimization is never useful and we'll probably
just end up removing it, so let's not bother adding more cases to it for
now.
---
 src/intel/compiler/brw_fs.cpp | 4 
 1 file changed, 4 insertions(+)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 873a1dd8196..aeabaefd6df 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2655,6 +2655,10 @@ fs_visitor::opt_algebraic()
  }
  break;
   case BRW_OPCODE_MAD:
+ if (inst->src[0].type != BRW_REGISTER_TYPE_F ||
+ inst->src[1].type != BRW_REGISTER_TYPE_F ||
+ inst->src[2].type != BRW_REGISTER_TYPE_F)
+break;
  if (inst->src[1].is_one()) {
 inst->opcode = BRW_OPCODE_ADD;
 inst->src[1] = inst->src[2];
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 20/40] intel/compiler: workaround for SIMD8 half-float MAD in gen8

2019-02-12 Thread Iago Toral Quiroga
Empirical testing shows that gen8 has a bug where MAD instructions with
a half-float source starting at a non-zero offset fail to execute
properly.

This scenario usually happened in SIMD8 executions, where we used to
pack vector components Y and W in the second half of SIMD registers
(therefore, with a 16B offset). It looks like we are not currently doing
this any more but this would handle the situation properly if we ever
happen to produce code like this again.

v2 (Jason):
 - Move this workaround to the lower_regioning pass as an additional case
   to has_invalid_src_region()
 - Do not apply the workaround if the stride of the source operand is 0,
   testing suggests the problem doesn't exist in that case.

Reviewed-by: Topi Pohjolainen  (v1)
---
 src/intel/compiler/brw_fs_lower_regioning.cpp | 39 +--
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp 
b/src/intel/compiler/brw_fs_lower_regioning.cpp
index df50993dee6..7c70cfab535 100644
--- a/src/intel/compiler/brw_fs_lower_regioning.cpp
+++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
@@ -109,20 +109,37 @@ namespace {
has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst,
   unsigned i)
{
-  if (is_unordered(inst)) {
+  if (is_unordered(inst))
  return false;
-  } else {
- const unsigned dst_byte_stride = inst->dst.stride * 
type_sz(inst->dst.type);
- const unsigned src_byte_stride = inst->src[i].stride *
-type_sz(inst->src[i].type);
- const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
- const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;
 
- return has_dst_aligned_region_restriction(devinfo, inst) &&
-!is_uniform(inst->src[i]) &&
-(src_byte_stride != dst_byte_stride ||
- src_byte_offset != dst_byte_offset);
+  /* Empirical testing shows that Broadwell has a bug affecting half-float
+   * MAD instructions when any of its sources has a non-zero offset, such
+   * as:
+   *
+   * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 
1Q };
+   *
+   * We used to generate code like this for SIMD8 executions where we
+   * used to pack components Y and W of a vector at offset 16B of a SIMD
+   * register. The problem doesn't occur if the stride of the source is 0.
+   */
+  if (devinfo->gen == 8 &&
+  inst->opcode == BRW_OPCODE_MAD &&
+  inst->src[i].type == BRW_REGISTER_TYPE_HF &&
+  inst->src[i].offset > 0 &&
+  inst->src[i].stride != 0) {
+ return true;
   }
+
+  const unsigned dst_byte_stride = inst->dst.stride * 
type_sz(inst->dst.type);
+  const unsigned src_byte_stride = inst->src[i].stride *
+ type_sz(inst->src[i].type);
+  const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
+  const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;
+
+  return has_dst_aligned_region_restriction(devinfo, inst) &&
+ !is_uniform(inst->src[i]) &&
+ (src_byte_stride != dst_byte_stride ||
+  src_byte_offset != dst_byte_offset);
}
 
/*
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 06/40] intel/compiler: lower some 16-bit float operations to 32-bit

2019-02-12 Thread Iago Toral Quiroga
The hardware doesn't support half-float for these.

Reviewed-by: Topi Pohjolainen 
Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_nir.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 7e3dbc9e447..75513e5113c 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -637,6 +637,11 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED 
void *data)
case nir_op_irem:
case nir_op_udiv:
case nir_op_umod:
+   case nir_op_fceil:
+   case nir_op_ffloor:
+   case nir_op_ffract:
+   case nir_op_fround_even:
+   case nir_op_ftrunc:
   return 32;
default:
   return 0;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 05/40] intel/compiler: assert restrictions on conversions to half-float

2019-02-12 Thread Iago Toral Quiroga
There are some hardware restrictions that brw_nir_lower_conversions should
have taken care of before we get here.

v2:
 - rebased on top of regioning lowering pass

Reviewed-by: Topi Pohjolainen  (v1)
Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs_nir.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index b705b3f5bc2..4c7a839390c 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -784,6 +784,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
*/
 
case nir_op_f2f16:
+   case nir_op_i2f16:
+   case nir_op_u2f16:
+  assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
   inst = bld.MOV(result, op[0]);
   inst->saturate = instr->dest.saturate;
   break;
@@ -821,8 +824,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
case nir_op_u2u32:
case nir_op_i2i16:
case nir_op_u2u16:
-   case nir_op_i2f16:
-   case nir_op_u2f16:
case nir_op_i2i8:
case nir_op_u2u8:
   inst = bld.MOV(result, op[0]);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] intel/compiler: update validator to account for half-float exec type promotion

2019-01-23 Thread Iago Toral Quiroga
Commit c84ec70b3a72 implemented execution type promotion to 32-bit for
conversions involving half-float registers, which empirical testing suggested
was required, but it did not incorporate this change into the assembly validator
logic. This commit adds that, preventing validation errors like this:

mov(16)  g9<4>B   g3<16,8,2>HF { align1 1H };
ERROR: Destination stride must be equal to the ratio of the sizes of the
   execution data type to the destination type

Fixes: c84ec70b3a72 "intel/fs: Promote execution type to 32-bit when any 
half-float conversion is needed."
---
 src/intel/compiler/brw_eu_validate.c | 27 ++-
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/intel/compiler/brw_eu_validate.c 
b/src/intel/compiler/brw_eu_validate.c
index a25010b225c..3bb37677672 100644
--- a/src/intel/compiler/brw_eu_validate.c
+++ b/src/intel/compiler/brw_eu_validate.c
@@ -325,17 +325,20 @@ execution_type(const struct gen_device_info *devinfo, 
const brw_inst *inst)
unsigned num_sources = num_sources_from_inst(devinfo, inst);
enum brw_reg_type src0_exec_type, src1_exec_type;
 
-   /* Execution data type is independent of destination data type, except in
-* mixed F/HF instructions on CHV and SKL+.
+   /* Empirical testing suggests that type conversions involving half-float
+* promote execution type to 32-bit. See get_exec_type() in brw_ir_fs.h.
 */
enum brw_reg_type dst_exec_type = brw_inst_dst_type(devinfo, inst);
 
src0_exec_type = execution_type_for_type(brw_inst_src0_type(devinfo, inst));
if (num_sources == 1) {
-  if ((devinfo->gen >= 9 || devinfo->is_cherryview) &&
-  src0_exec_type == BRW_REGISTER_TYPE_HF) {
- return dst_exec_type;
+  if (type_sz(src0_exec_type) == 2 && dst_exec_type != src0_exec_type) {
+ if (src0_exec_type == BRW_REGISTER_TYPE_HF)
+return BRW_REGISTER_TYPE_F;
+ else if (dst_exec_type == BRW_REGISTER_TYPE_HF)
+return BRW_REGISTER_TYPE_D;
   }
+
   return src0_exec_type;
}
 
@@ -367,14 +370,12 @@ execution_type(const struct gen_device_info *devinfo, 
const brw_inst *inst)
src1_exec_type == BRW_REGISTER_TYPE_DF)
   return BRW_REGISTER_TYPE_DF;
 
-   if (devinfo->gen >= 9 || devinfo->is_cherryview) {
-  if (dst_exec_type == BRW_REGISTER_TYPE_F ||
-  src0_exec_type == BRW_REGISTER_TYPE_F ||
-  src1_exec_type == BRW_REGISTER_TYPE_F) {
- return BRW_REGISTER_TYPE_F;
-  } else {
- return BRW_REGISTER_TYPE_HF;
-  }
+   if (dst_exec_type == BRW_REGISTER_TYPE_F ||
+   src0_exec_type == BRW_REGISTER_TYPE_F ||
+   src1_exec_type == BRW_REGISTER_TYPE_F) {
+  return BRW_REGISTER_TYPE_F;
+   } else {
+  return BRW_REGISTER_TYPE_HF;
}
 
assert(src0_exec_type == BRW_REGISTER_TYPE_F);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 03/42] intel/compiler: split float to 64-bit opcodes from int to 64-bit

2019-01-15 Thread Iago Toral Quiroga
Going forward having these split is a bit more convenient since these two
groups have different restrictions.

v2:
 - Rebased on top of new regioning lowering pass.

Reviewed-by: Topi Pohjolainen  (v1)
---
 src/intel/compiler/brw_fs_nir.cpp | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index bdc883e5364..a59debf2b78 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -801,10 +801,17 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
case nir_op_f2f64:
case nir_op_f2i64:
case nir_op_f2u64:
+  assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */
+  inst = bld.MOV(result, op[0]);
+  inst->saturate = instr->dest.saturate;
+  break;
+
case nir_op_i2f64:
case nir_op_i2i64:
case nir_op_u2f64:
case nir_op_u2u64:
+  assert(type_sz(op[0].type) > 1); /* brw_nir_lower_conversions */
+  /* fallthrough */
case nir_op_f2f32:
case nir_op_f2i32:
case nir_op_f2u32:
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 27/42] intel/compiler: activate 16-bit bit-size lowerings also for 8-bit

2019-01-15 Thread Iago Toral Quiroga
Particularly, we need the same lowerings we use for 16-bit
integers.

Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_nir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 3b2909da33e..2dfbf8824dc 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -628,7 +628,7 @@ static unsigned
 lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data)
 {
assert(alu->dest.dest.is_ssa);
-   if (alu->dest.dest.ssa.bit_size != 16)
+   if (alu->dest.dest.ssa.bit_size >= 32)
   return 0;
 
const struct brw_compiler *compiler = (const struct brw_compiler *) data;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 32/42] intel/eu: force stride of 2 on NULL register for Byte instructions

2019-01-15 Thread Iago Toral Quiroga
The hardware only allows a stride of 1 on a Byte destination for raw
byte MOV instructions. This is required even when the destination
is the NULL register.

Rather than making sure that we emit a proper NULL:B destination
every time we need one, just fix it at emission time.

Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_eu_emit.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 2fa89f8a2a3..4e1672408ea 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -94,6 +94,17 @@ brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct 
brw_reg dest)
else if (dest.file == BRW_GENERAL_REGISTER_FILE)
   assert(dest.nr < 128);
 
+   /* The hardware has a restriction where if the destination is Byte,
+* the instruction needs to have a stride of 2 (except for packed byte
+* MOV). This seems to be required even if the destination is the NULL
+* register.
+*/
+   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+   dest.nr == BRW_ARF_NULL &&
+   type_sz(dest.type) == 1) {
+  dest.hstride = BRW_HORIZONTAL_STRIDE_2;
+   }
+
gen7_convert_mrf_to_grf(p, &dest);
 
brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 39/42] intel/compiler: remove MAD/LRP algebraic optimizations from the backend

2019-01-15 Thread Iago Toral Quiroga
NIR already has these so they are redundant. A run of shader-db confirms
that the only cases where these backend optimizations are activated
are some Tomb Raider shaders where the affected variables are qualified
as "precise", which is why NIR won't apply them and why the backend
shouldn't either (so it is actually a bug).

Suggested-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs.cpp | 37 ---
 1 file changed, 37 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 77c955ac435..e7f5a8822a3 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2568,16 +2568,6 @@ fs_visitor::opt_algebraic()
 break;
  }
  break;
-  case BRW_OPCODE_LRP:
- if (inst->src[1].equals(inst->src[2])) {
-inst->opcode = BRW_OPCODE_MOV;
-inst->src[0] = inst->src[1];
-inst->src[1] = reg_undef;
-inst->src[2] = reg_undef;
-progress = true;
-break;
- }
- break;
   case BRW_OPCODE_CMP:
  if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
   inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
@@ -2654,33 +2644,6 @@ fs_visitor::opt_algebraic()
 }
  }
  break;
-  case BRW_OPCODE_MAD:
- if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
-inst->opcode = BRW_OPCODE_MOV;
-inst->src[1] = reg_undef;
-inst->src[2] = reg_undef;
-progress = true;
- } else if (inst->src[0].is_zero()) {
-inst->opcode = BRW_OPCODE_MUL;
-inst->src[0] = inst->src[2];
-inst->src[2] = reg_undef;
-progress = true;
- } else if (inst->src[1].is_one()) {
-inst->opcode = BRW_OPCODE_ADD;
-inst->src[1] = inst->src[2];
-inst->src[2] = reg_undef;
-progress = true;
- } else if (inst->src[2].is_one()) {
-inst->opcode = BRW_OPCODE_ADD;
-inst->src[2] = reg_undef;
-progress = true;
- } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
-inst->opcode = BRW_OPCODE_ADD;
-inst->src[1].f *= inst->src[2].f;
-inst->src[2] = reg_undef;
-progress = true;
- }
- break;
   case SHADER_OPCODE_BROADCAST:
  if (is_uniform(inst->src[0])) {
 inst->opcode = BRW_OPCODE_MOV;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 17/42] intel/compiler: add new half-float register type for 3-src instructions

2019-01-15 Thread Iago Toral Quiroga
This is available since gen8.

v2: restore previously existing assertion.

Reviewed-by: Topi Pohjolainen  (v1)
---
 src/intel/compiler/brw_reg_type.c | 36 +++
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_reg_type.c 
b/src/intel/compiler/brw_reg_type.c
index 60240ba1513..09b3ea61d4c 100644
--- a/src/intel/compiler/brw_reg_type.c
+++ b/src/intel/compiler/brw_reg_type.c
@@ -138,6 +138,7 @@ enum hw_3src_reg_type {
GEN7_3SRC_TYPE_D  = 1,
GEN7_3SRC_TYPE_UD = 2,
GEN7_3SRC_TYPE_DF = 3,
+   GEN8_3SRC_TYPE_HF = 4,
 
/** When ExecutionDatatype is 1: @{ */
GEN10_ALIGN1_3SRC_REG_TYPE_HF = 0b000,
@@ -166,6 +167,14 @@ static const struct hw_3src_type {
[BRW_REGISTER_TYPE_D]  = { GEN7_3SRC_TYPE_D  },
[BRW_REGISTER_TYPE_UD] = { GEN7_3SRC_TYPE_UD },
[BRW_REGISTER_TYPE_DF] = { GEN7_3SRC_TYPE_DF },
+}, gen8_hw_3src_type[] = {
+   [0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
+
+   [BRW_REGISTER_TYPE_F]  = { GEN7_3SRC_TYPE_F  },
+   [BRW_REGISTER_TYPE_D]  = { GEN7_3SRC_TYPE_D  },
+   [BRW_REGISTER_TYPE_UD] = { GEN7_3SRC_TYPE_UD },
+   [BRW_REGISTER_TYPE_DF] = { GEN7_3SRC_TYPE_DF },
+   [BRW_REGISTER_TYPE_HF] = { GEN8_3SRC_TYPE_HF },
 }, gen10_hw_3src_align1_type[] = {
 #define E(x) BRW_ALIGN1_3SRC_EXEC_TYPE_##x
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
@@ -249,6 +258,20 @@ brw_hw_type_to_reg_type(const struct gen_device_info 
*devinfo,
unreachable("not reached");
 }
 
+static inline const struct hw_3src_type *
+get_hw_3src_type_map(const struct gen_device_info *devinfo, uint32_t *size)
+{
+   if (devinfo->gen < 8) {
+  if (size)
+ *size = ARRAY_SIZE(gen7_hw_3src_type);
+  return gen7_hw_3src_type;
+   } else {
+  if (size)
+ *size = ARRAY_SIZE(gen8_hw_3src_type);
+  return gen8_hw_3src_type;
+   }
+}
+
 /**
  * Convert a brw_reg_type enumeration value into the hardware representation
  * for a 3-src align16 instruction
@@ -257,9 +280,12 @@ unsigned
 brw_reg_type_to_a16_hw_3src_type(const struct gen_device_info *devinfo,
  enum brw_reg_type type)
 {
-   assert(type < ARRAY_SIZE(gen7_hw_3src_type));
-   assert(gen7_hw_3src_type[type].reg_type != (enum hw_3src_reg_type)INVALID);
-   return gen7_hw_3src_type[type].reg_type;
+   uint32_t map_size;
+   const struct hw_3src_type *hw_3src_type_map =
+  get_hw_3src_type_map(devinfo, &map_size);
+   assert(type < map_size);
+   assert(hw_3src_type_map[type].reg_type != (enum hw_3src_reg_type)INVALID);
+   return hw_3src_type_map[type].reg_type;
 }
 
 /**
@@ -283,8 +309,10 @@ enum brw_reg_type
 brw_a16_hw_3src_type_to_reg_type(const struct gen_device_info *devinfo,
  unsigned hw_type)
 {
+   const struct hw_3src_type *hw_3src_type_map =
+  get_hw_3src_type_map(devinfo, NULL);
for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) {
-  if (gen7_hw_3src_type[i].reg_type == hw_type) {
+  if (hw_3src_type_map[i].reg_type == hw_type) {
  return i;
   }
}
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 28/42] intel/compiler: handle 64-bit float to 8-bit integer conversions

2019-01-15 Thread Iago Toral Quiroga
These are not directly supported in hardware and brw_nir_lower_conversions
should have taken care of that before we get here. Also, while we are
at it, make sure 64-bit integer to 8-bit are also properly split by
the same lowering pass, since they have the same hardware restrictions.
---
 src/intel/compiler/brw_fs_nir.cpp | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index cf546b8ff09..e454578d99b 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -786,6 +786,10 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
case nir_op_f2f16:
case nir_op_i2f16:
case nir_op_u2f16:
+   case nir_op_i2i8:
+   case nir_op_u2u8:
+   case nir_op_f2i8:
+   case nir_op_f2u8:
   assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
   inst = bld.MOV(result, op[0]);
   inst->saturate = instr->dest.saturate;
@@ -824,8 +828,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
case nir_op_u2u32:
case nir_op_i2i16:
case nir_op_u2u16:
-   case nir_op_i2i8:
-   case nir_op_u2u8:
   inst = bld.MOV(result, op[0]);
   inst->saturate = instr->dest.saturate;
   break;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 16/42] intel/compiler: add instruction setters for Src1Type and Src2Type.

2019-01-15 Thread Iago Toral Quiroga
The original SrcType is a 3-bit field that takes a subset of the types
supported for the hardware for 3-source instructions. Since gen8,
when the half-float type was added, 3-source floating point operations
can use mixed precision mode, where not all the operands have the
same floating-point precision. While the precision for the first operand
is taken from the type in SrcType, the bits in Src1Type (bit 36) and
Src2Type (bit 35) define the precision for the other operands
(0: normal precision, 1: half precision).

Reviewed-by: Topi Pohjolainen 
---
 src/intel/compiler/brw_inst.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/intel/compiler/brw_inst.h b/src/intel/compiler/brw_inst.h
index ce89bbba72f..c45697eaa3a 100644
--- a/src/intel/compiler/brw_inst.h
+++ b/src/intel/compiler/brw_inst.h
@@ -222,6 +222,8 @@ F8(3src_src1_negate,39, 39, 40, 40)
 F8(3src_src1_abs,   38, 38, 39, 39)
 F8(3src_src0_negate,37, 37, 38, 38)
 F8(3src_src0_abs,   36, 36, 37, 37)
+F8(3src_a16_src1_type,  -1, -1, 36, 36)
+F8(3src_a16_src2_type,  -1, -1, 35, 35)
 F8(3src_a16_flag_reg_nr,34, 34, 33, 33)
 F8(3src_a16_flag_subreg_nr, 33, 33, 32, 32)
 FF(3src_a16_dst_reg_file,
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 35/42] anv/device: expose shaderFloat16 and shaderInt8 in gen8+

2019-01-15 Thread Iago Toral Quiroga
v2 (Jason):
 - Merge Float16 and Int8 into a single patch.
 - Merge extension enable.

Reviewed-by: Jason Ekstrand  (v1)
---
 src/intel/vulkan/anv_device.c  | 9 +
 src/intel/vulkan/anv_extensions.py | 1 +
 2 files changed, 10 insertions(+)

diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 523f1483e29..d9931d339e5 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -966,6 +966,15 @@ void anv_GetPhysicalDeviceFeatures2(
  break;
   }
 
+  case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR: {
+ VkPhysicalDeviceFloat16Int8FeaturesKHR *features = (void *)ext;
+ ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+
+ features->shaderFloat16 = pdevice->info.gen >= 8;
+ features->shaderInt8 = pdevice->info.gen >= 8;
+ break;
+  }
+
   default:
  anv_debug_ignored_stype(ext->sType);
  break;
diff --git a/src/intel/vulkan/anv_extensions.py 
b/src/intel/vulkan/anv_extensions.py
index 388845003aa..0f579ced692 100644
--- a/src/intel/vulkan/anv_extensions.py
+++ b/src/intel/vulkan/anv_extensions.py
@@ -105,6 +105,7 @@ EXTENSIONS = [
 Extension('VK_KHR_sampler_mirror_clamp_to_edge',  1, True),
 Extension('VK_KHR_sampler_ycbcr_conversion',  1, True),
 Extension('VK_KHR_shader_draw_parameters',1, True),
+Extension('VK_KHR_shader_float16_int8',   1, 'device->info.gen 
>= 8'),
 Extension('VK_KHR_storage_buffer_storage_class',  1, True),
 Extension('VK_KHR_surface',  25, 
'ANV_HAS_SURFACE'),
 Extension('VK_KHR_swapchain',68, 
'ANV_HAS_SURFACE'),
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 21/42] intel/compiler: set correct precision fields for 3-source float instructions

2019-01-15 Thread Iago Toral Quiroga
Source0 and Destination extract the floating-point precision automatically
from the SrcType and DstType instruction fields respectively when they are
set to types :F or :HF. For Source1 and Source2 operands, we use the new
1-bit fields Src1Type and Src2Type, where 0 means normal precision and 1
means half-precision. Since we always use the type of the destination for
all operands when we emit 3-source instructions, we only need to set Src1Type
and Src2Type to 1 when we are emitting a half-precision instruction.

v2:
 - Set the bit separately for each source based on its type so we can
   do mixed floating-point mode in the future (Topi).

Reviewed-by: Topi Pohjolainen 
---
 src/intel/compiler/brw_eu_emit.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index a785f96b650..2fa89f8a2a3 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -801,6 +801,22 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct 
brw_reg dest,
   */
  brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
  brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
+
+ /* From the Bspec: Instruction types
+  *
+  * Three source instructions can use operands with mixed-mode
+  * precision. When SrcType field is set to :f or :hf it defines
+  * precision for source 0 only, and fields Src1Type and Src2Type
+  * define precision for other source operands:
+  *
+  *   0b = :f. Single precision Float (32-bit).
+  *   1b = :hf. Half precision Float (16-bit).
+  */
+ if (src1.type == BRW_REGISTER_TYPE_HF)
+brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
+
+ if (src2.type == BRW_REGISTER_TYPE_HF)
+brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
   }
}
 
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 30/42] intel/compiler: implement isign for int8

2019-01-15 Thread Iago Toral Quiroga
Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs_nir.cpp | 25 +
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index a739562c3ab..a3d193b8a44 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -912,11 +912,28 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
*  Predicated OR sets 1 if val is positive.
*/
   uint32_t bit_size = nir_dest_bit_size(instr->dest.dest);
-  assert(bit_size == 32 || bit_size == 16);
 
-  fs_reg zero = bit_size == 32 ? brw_imm_d(0) : brw_imm_w(0);
-  fs_reg one = bit_size == 32 ? brw_imm_d(1) : brw_imm_w(1);
-  fs_reg shift = bit_size == 32 ? brw_imm_d(31) : brw_imm_w(15);
+  fs_reg zero, one, shift;
+  switch (bit_size) {
+  case 32:
+ zero = brw_imm_d(0);
+ one = brw_imm_d(1);
+ shift = brw_imm_d(31);
+ break;
+  case 16:
+ zero = brw_imm_w(0);
+ one = brw_imm_w(1);
+ shift = brw_imm_w(15);
+ break;
+  case 8: {
+ zero = setup_imm_b(bld, 0);
+ one = setup_imm_b(bld, 1);
+ shift = setup_imm_b(bld, 7);
+ break;
+  }
+  default:
+ unreachable("unsupported bit-size");
+  };
 
   bld.CMP(bld.null_reg_d(), op[0], zero, BRW_CONDITIONAL_G);
   bld.ASR(result, op[0], shift);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 36/42] intel/compiler: implement is_zero, is_one, is_negative_one for 8-bit/16-bit

2019-01-15 Thread Iago Toral Quiroga
There are no 8-bit immediates, so assert in that case.
16-bit immediates are replicated in each word of a 32-bit immediate, so
we only need to check the lower 16-bits.

v2:
 - Fix is_zero with half-float to consider -0 as well (Jason).
 - Fix is_negative_one for word type.
---
 src/intel/compiler/brw_shader.cpp | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/src/intel/compiler/brw_shader.cpp 
b/src/intel/compiler/brw_shader.cpp
index 97966c951a1..3c636c9d3a4 100644
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -704,11 +704,20 @@ backend_reg::is_zero() const
if (file != IMM)
   return false;
 
+   assert(type_sz(type) > 1);
+
switch (type) {
+   case BRW_REGISTER_TYPE_HF:
+  assert((d & 0xffff) == ((d >> 16) & 0xffff));
+  return (d & 0xffff) == 0 || (d & 0xffff) == 0x8000;
case BRW_REGISTER_TYPE_F:
   return f == 0;
case BRW_REGISTER_TYPE_DF:
   return df == 0;
+   case BRW_REGISTER_TYPE_W:
+   case BRW_REGISTER_TYPE_UW:
+  assert((d & 0xffff) == ((d >> 16) & 0xffff));
+  return (d & 0xffff) == 0;
case BRW_REGISTER_TYPE_D:
case BRW_REGISTER_TYPE_UD:
   return d == 0;
@@ -726,11 +735,20 @@ backend_reg::is_one() const
if (file != IMM)
   return false;
 
+   assert(type_sz(type) > 1);
+
switch (type) {
+   case BRW_REGISTER_TYPE_HF:
+  assert((d & 0xffff) == ((d >> 16) & 0xffff));
+  return (d & 0xffff) == 0x3c00;
case BRW_REGISTER_TYPE_F:
   return f == 1.0f;
case BRW_REGISTER_TYPE_DF:
   return df == 1.0;
+   case BRW_REGISTER_TYPE_W:
+   case BRW_REGISTER_TYPE_UW:
+  assert((d & 0xffff) == ((d >> 16) & 0xffff));
+  return (d & 0xffff) == 1;
case BRW_REGISTER_TYPE_D:
case BRW_REGISTER_TYPE_UD:
   return d == 1;
@@ -748,11 +766,19 @@ backend_reg::is_negative_one() const
if (file != IMM)
   return false;
 
+   assert(type_sz(type) > 1);
+
switch (type) {
+   case BRW_REGISTER_TYPE_HF:
+  assert((d & 0xffff) == ((d >> 16) & 0xffff));
+  return (d & 0xffff) == 0xbc00;
case BRW_REGISTER_TYPE_F:
   return f == -1.0;
case BRW_REGISTER_TYPE_DF:
   return df == -1.0;
+   case BRW_REGISTER_TYPE_W:
+  assert((d & 0xffff) == ((d >> 16) & 0xffff));
+  return (d & 0xffff) == 0xffff;
case BRW_REGISTER_TYPE_D:
   return d == -1;
case BRW_REGISTER_TYPE_Q:
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 19/42] intel/compiler: don't compact 3-src instructions with Src1Type or Src2Type bits

2019-01-15 Thread Iago Toral Quiroga
We are now using these bits, so don't assert that they are not set, just
avoid compaction in that case.

Reviewed-by: Topi Pohjolainen 
---
 src/intel/compiler/brw_eu_compact.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_eu_compact.c 
b/src/intel/compiler/brw_eu_compact.c
index ae14ef10ec0..20fed254331 100644
--- a/src/intel/compiler/brw_eu_compact.c
+++ b/src/intel/compiler/brw_eu_compact.c
@@ -928,8 +928,11 @@ has_3src_unmapped_bits(const struct gen_device_info 
*devinfo,
   assert(!brw_inst_bits(src, 127, 126) &&
  !brw_inst_bits(src, 105, 105) &&
  !brw_inst_bits(src, 84, 84) &&
- !brw_inst_bits(src, 36, 35) &&
  !brw_inst_bits(src, 7,  7));
+
+  /* Src1Type and Src2Type, used for mixed-precision floating point */
+  if (brw_inst_bits(src, 36, 35))
+ return true;
}
 
return false;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 41/42] intel/compiler: fix combine constants for Align16 with half-float prior to gen9

2019-01-15 Thread Iago Toral Quiroga
There is a hardware restriction where <0,1,0>:HF in Align16 doesn't replicate
a single 16-bit channel, but instead it replicates a full 32-bit channel.
---
 .../compiler/brw_fs_combine_constants.cpp | 24 +--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp 
b/src/intel/compiler/brw_fs_combine_constants.cpp
index 54017e5668b..56e414d3f4e 100644
--- a/src/intel/compiler/brw_fs_combine_constants.cpp
+++ b/src/intel/compiler/brw_fs_combine_constants.cpp
@@ -301,7 +301,26 @@ fs_visitor::opt_combine_constants()
*/
   exec_node *n = (imm->inst ? imm->inst :
   imm->block->last_non_control_flow_inst()->next);
-  const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0);
+
+  /* Prior to gen9 we also have to deal with this restriction:
+   *
+   * "In Align16 mode, the channel selects and channel enables apply to a
+   *  pair of half-floats, because these parameters are defined for DWord
+   *  elements ONLY. This is applicable when both source and destination
+   *  are half-floats."
+   *
+   * This means that when we emit a 3-src instruction such as MAD or LRP,
+   * for which we use Align16, if we need to promote an HF constant to a
+   * register we need to be aware that the  <0,1,0>:HF region would still
+   * read 2 HF slots and not replicate the single one like we want.
+   * We fix this by populating both HF slots with the constant we need to
+   * read.
+   */
+  const uint32_t width =
+ devinfo->gen < 9 &&
+ imm->type == BRW_REGISTER_TYPE_HF &&
+ (!imm->inst || imm->inst->is_3src(devinfo)) ? 2 : 1;
+  const fs_builder ibld = bld.at(imm->block, n).exec_all().group(width, 0);
 
   reg = retype(reg, imm->type);
   if (imm->type == BRW_REGISTER_TYPE_F) {
@@ -314,7 +333,8 @@ fs_visitor::opt_combine_constants()
   imm->subreg_offset = reg.offset;
 
   /* Keep offsets 32-bit aligned since we are mixing 32-bit and 16-bit
-   * constants into the same register
+   * constants into the same register (and we are writing 32-bit slots
+   * prior to gen9 for HF constants anyway).
*
* TODO: try to pack pairs of HF constants into each 32-bit slot
*/
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 38/42] intel/compiler: fix cmod propagation for non 32-bit types

2019-01-15 Thread Iago Toral Quiroga
v2:
 - Do not propagate if the bit-size changes
---
 src/intel/compiler/brw_fs_cmod_propagation.cpp | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp 
b/src/intel/compiler/brw_fs_cmod_propagation.cpp
index 7bb5c9afbc9..57d4e645c05 100644
--- a/src/intel/compiler/brw_fs_cmod_propagation.cpp
+++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp
@@ -244,8 +244,7 @@ opt_cmod_propagation_local(const gen_device_info *devinfo,
 /* CMP's result is the same regardless of dest type. */
 if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
 scan_inst->opcode == BRW_OPCODE_CMP &&
-(inst->dst.type == BRW_REGISTER_TYPE_D ||
- inst->dst.type == BRW_REGISTER_TYPE_UD)) {
+brw_reg_type_is_integer(inst->dst.type)) {
inst->remove(block);
progress = true;
break;
@@ -258,9 +257,14 @@ opt_cmod_propagation_local(const gen_device_info *devinfo,
break;
 
 /* Comparisons operate differently for ints and floats */
-if (scan_inst->dst.type != inst->dst.type &&
-(scan_inst->dst.type == BRW_REGISTER_TYPE_F ||
- inst->dst.type == BRW_REGISTER_TYPE_F))
+if (brw_reg_type_is_floating_point(scan_inst->dst.type) !=
+brw_reg_type_is_floating_point(inst->dst.type))
+   break;
+
+/* Comparison result may be altered if the bit-size changes
+ * since that affects range, denorms, etc
+ */
+if (type_sz(scan_inst->dst.type) != type_sz(inst->dst.type))
break;
 
 /* If the instruction generating inst's source also wrote the
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 20/42] intel/compiler: allow half-float on 3-source instructions since gen8

2019-01-15 Thread Iago Toral Quiroga
Reviewed-by: Topi Pohjolainen 
---
 src/intel/compiler/brw_eu_emit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index e21df4624b3..a785f96b650 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -755,7 +755,8 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct 
brw_reg dest,
   assert(dest.type == BRW_REGISTER_TYPE_F  ||
  dest.type == BRW_REGISTER_TYPE_DF ||
  dest.type == BRW_REGISTER_TYPE_D  ||
- dest.type == BRW_REGISTER_TYPE_UD);
+ dest.type == BRW_REGISTER_TYPE_UD ||
+ (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8));
   if (devinfo->gen == 6) {
  brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
 dest.file == 
BRW_MESSAGE_REGISTER_FILE);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 26/42] intel/compiler: split is_partial_write() into two variants

2019-01-15 Thread Iago Toral Quiroga
This function is used in two different scenarios that for 32-bit
instructions are the same, but for 16-bit instructions are not.

One scenario is that in which we are working at a SIMD8 register
level and we need to know if a register is fully defined or written.
This is useful, for example, in the context of liveness analysis or
register allocation, where we work with units of registers.

The other scenario is that in which we want to know if an instruction
is writing a full scalar component or just some subset of it. This is
useful, for example, in the context of some optimization passes
like copy propagation.

For 32-bit instructions (or larger), a SIMD8 dispatch will always write
at least a full SIMD8 register (32B) if the write is not partial. The
function is_partial_write() checks this to determine if we have a partial
write. However, when we deal with 16-bit instructions, that logic disables
some optimizations that should be safe. For example, a SIMD8 16-bit MOV will
only update half of a SIMD register, but it is still a complete write of the
variable for a SIMD8 dispatch, so we should not prevent copy propagation in
this scenario because we don't write all 32 bytes in the SIMD register
or because the write starts at offset 16B (where we pack components Y or
W of 16-bit vectors).

This is a problem for SIMD8 executions (VS, TCS, TES, GS) of 16-bit
instructions, which lose a number of optimizations because of this, most
important of which is copy-propagation.

This patch splits is_partial_write() into is_partial_reg_write(), which
represents the current is_partial_write(), useful for things like
liveness analysis, and is_partial_var_write(), which considers
the dispatch size to check if we are writing a full variable (rather
than a full register) to decide if the write is partial or not, which
is what we really want in many optimization passes.

Then the patch goes on and rewrites all uses of is_partial_write() to use
one or the other version. Specifically, we use is_partial_var_write()
in the following places: copy propagation, cmod propagation, common
subexpression elimination, saturate propagation and sel peephole.

Notice that the semantics of is_partial_var_write() exactly match the
current implementation of is_partial_write() for anything that is
32-bit or larger, so no changes are expected for 32-bit instructions.

Tested against ~5000 tests involving 16-bit instructions in CTS produced
the following changes in instruction counts:

       | Patched | Master  |    %    |
-------+---------+---------+---------+
SIMD8  | 621,900 | 706,721 | -12.00% |
-------+---------+---------+---------+
SIMD16 |  93,252 |  93,252 |   0.00% |


As expected, the change only affects SIMD8 dispatches.

Reviewed-by: Topi Pohjolainen 
---
 src/intel/compiler/brw_fs.cpp | 31 +++
 .../compiler/brw_fs_cmod_propagation.cpp  | 20 ++--
 .../compiler/brw_fs_copy_propagation.cpp  |  8 ++---
 src/intel/compiler/brw_fs_cse.cpp |  3 +-
 .../compiler/brw_fs_dead_code_eliminate.cpp   |  2 +-
 src/intel/compiler/brw_fs_live_variables.cpp  |  2 +-
 src/intel/compiler/brw_fs_reg_allocate.cpp|  2 +-
 .../compiler/brw_fs_register_coalesce.cpp |  2 +-
 .../compiler/brw_fs_saturate_propagation.cpp  |  7 +++--
 src/intel/compiler/brw_fs_sel_peephole.cpp|  4 +--
 src/intel/compiler/brw_ir_fs.h|  3 +-
 11 files changed, 54 insertions(+), 30 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index d6096cd667d..77c955ac435 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -716,14 +716,33 @@ fs_visitor::limit_dispatch_width(unsigned n, const char 
*msg)
  * it.
  */
 bool
-fs_inst::is_partial_write() const
+fs_inst::is_partial_reg_write() const
 {
return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
-   (this->exec_size * type_sz(this->dst.type)) < 32 ||
!this->dst.is_contiguous() ||
+   (this->exec_size * type_sz(this->dst.type)) < REG_SIZE ||
this->dst.offset % REG_SIZE != 0);
 }
 
+/**
+ * Returns true if the instruction has a flag that means it won't
+ * update an entire variable for the given dispatch width.
+ *
+ * This is only different from is_partial_reg_write() for SIMD8
+ * dispatches of 16-bit (or smaller) instructions.
+ */
+bool
+fs_inst::is_partial_var_write(uint32_t dispatch_width) const
+{
+   const uint32_t type_size = type_sz(this->dst.type);
+   uint32_t var_size = MIN2(REG_SIZE, dispatch_width * type_size);
+
+   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
+   !this->dst.is_contiguous() ||
+   (this->exec_size * type_sz(this->dst.type)) < var_size ||
+   this->dst.offset % var_size != 0);
+}
+
 unsigned
 fs_inst::components_read(unsigned i) const
 {
@@ -2896,7 +2915,7 @@ fs_v

[Mesa-dev] [PATCH v3 12/42] compiler/nir: add lowering for 16-bit flrp

2019-01-15 Thread Iago Toral Quiroga
Reviewed-by: Jason Ekstrand 
---
 src/compiler/nir/nir.h| 1 +
 src/compiler/nir/nir_opt_algebraic.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 19056e79206..adcc8e36cc9 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2106,6 +2106,7 @@ typedef struct nir_shader_compiler_options {
bool lower_fdiv;
bool lower_ffma;
bool fuse_ffma;
+   bool lower_flrp16;
bool lower_flrp32;
/** Lowers flrp when it does not support doubles */
bool lower_flrp64;
diff --git a/src/compiler/nir/nir_opt_algebraic.py 
b/src/compiler/nir/nir_opt_algebraic.py
index cd969de1f88..40eb3de02c3 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -124,6 +124,7 @@ optimizations = [
(('~flrp', 0.0, a, b), ('fmul', a, b)),
(('~flrp', a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 
'options->lower_flrp32'),
(('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
+   (('flrp@16', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 
'options->lower_flrp16'),
(('flrp@32', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 
'options->lower_flrp32'),
(('flrp@64', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 
'options->lower_flrp64'),
(('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 07/42] intel/compiler: lower 16-bit extended math to 32-bit prior to gen9

2019-01-15 Thread Iago Toral Quiroga
Extended math doesn't support half-float on these generations.

Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_nir.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index f0fe7f870c2..3b2909da33e 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -631,6 +631,8 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED 
void *data)
if (alu->dest.dest.ssa.bit_size != 16)
   return 0;
 
+   const struct brw_compiler *compiler = (const struct brw_compiler *) data;
+
switch (alu->op) {
case nir_op_idiv:
case nir_op_imod:
@@ -643,6 +645,15 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED 
void *data)
case nir_op_fround_even:
case nir_op_ftrunc:
   return 32;
+   case nir_op_frcp:
+   case nir_op_frsq:
+   case nir_op_fsqrt:
+   case nir_op_fpow:
+   case nir_op_fexp2:
+   case nir_op_flog2:
+   case nir_op_fsin:
+   case nir_op_fcos:
+  return compiler->devinfo->gen < 9 ? 32 : 0;
default:
   return 0;
}
@@ -770,7 +781,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, 
nir_shader *nir)
   OPT(nir_opt_large_constants, NULL, 32);
}
 
-   OPT(nir_lower_bit_size, lower_bit_size_callback, NULL);
+   OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)compiler);
 
if (is_scalar) {
   OPT(nir_lower_load_const_to_scalar);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 10/42] compiler/nir: add lowering option for 16-bit fmod

2019-01-15 Thread Iago Toral Quiroga
Reviewed-by: Jason Ekstrand 
---
 src/compiler/nir/nir.h| 1 +
 src/compiler/nir/nir_opt_algebraic.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 3cb2d166cb3..19056e79206 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2112,6 +2112,7 @@ typedef struct nir_shader_compiler_options {
bool lower_fpow;
bool lower_fsat;
bool lower_fsqrt;
+   bool lower_fmod16;
bool lower_fmod32;
bool lower_fmod64;
/** Lowers ibitfield_extract/ubitfield_extract to ibfe/ubfe. */
diff --git a/src/compiler/nir/nir_opt_algebraic.py 
b/src/compiler/nir/nir_opt_algebraic.py
index 75a3d2ad238..cd969de1f88 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -636,6 +636,7 @@ optimizations = [
(('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
 
# Misc. lowering
+   (('fmod@16', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 
'options->lower_fmod16'),
    (('fmod@32', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 
'options->lower_fmod32'),
    (('fmod@64', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 
'options->lower_fmod64'),
    (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 
'options->lower_fmod32'),
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 34/42] anv/pipeline: support Float16 and Int8 capabilities in gen8+

2019-01-15 Thread Iago Toral Quiroga
v2:
 - Merge Float16 and Int8 in a single patch (Jason)

Reviewed-by: Jason Ekstrand  (v1)
---
 src/intel/vulkan/anv_pipeline.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 899160746d4..663d1c77fa5 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -136,8 +136,10 @@ anv_shader_compile_to_nir(struct anv_device *device,
   .caps = {
  .device_group = true,
  .draw_parameters = true,
+ .float16 = device->instance->physicalDevice.info.gen >= 8,
  .float64 = device->instance->physicalDevice.info.gen >= 8,
  .image_write_without_format = true,
+ .int8 = device->instance->physicalDevice.info.gen >= 8,
  .int16 = device->instance->physicalDevice.info.gen >= 8,
  .int64 = device->instance->physicalDevice.info.gen >= 8,
  .min_lod = true,
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 40/42] intel/compiler: support half-float in the combine constants pass

2019-01-15 Thread Iago Toral Quiroga
Reviewed-by: Topi Pohjolainen 
---
 .../compiler/brw_fs_combine_constants.cpp | 60 +++
 1 file changed, 49 insertions(+), 11 deletions(-)

diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp 
b/src/intel/compiler/brw_fs_combine_constants.cpp
index 7343f77bb45..54017e5668b 100644
--- a/src/intel/compiler/brw_fs_combine_constants.cpp
+++ b/src/intel/compiler/brw_fs_combine_constants.cpp
@@ -36,6 +36,7 @@
 
 #include "brw_fs.h"
 #include "brw_cfg.h"
+#include "util/half_float.h"
 
 using namespace brw;
 
@@ -114,8 +115,9 @@ struct imm {
 */
exec_list *uses;
 
-   /** The immediate value.  We currently only handle floats. */
+   /** The immediate value.  We currently only handle float and half-float. */
float val;
+   brw_reg_type type;
 
/**
 * The GRF register and subregister number where we've decided to store the
@@ -145,10 +147,10 @@ struct table {
 };
 
 static struct imm *
-find_imm(struct table *table, float val)
+find_imm(struct table *table, float val, brw_reg_type type)
 {
for (int i = 0; i < table->len; i++) {
-  if (table->imm[i].val == val) {
+  if (table->imm[i].val == val && table->imm[i].type == type) {
  return &table->imm[i];
   }
}
@@ -190,6 +192,20 @@ compare(const void *_a, const void *_b)
return a->first_use_ip - b->first_use_ip;
 }
 
+static bool
+needs_negate(float reg_val, float imm_val, brw_reg_type type)
+{
+   /* reg_val represents the immediate value in the register in its original
+* bit-size, while imm_val is always a valid 32-bit float value.
+*/
+   if (type == BRW_REGISTER_TYPE_HF) {
+  uint32_t reg_val_ud = *((uint32_t *) &reg_val);
+  reg_val = _mesa_half_to_float(reg_val_ud & 0xffff);
+   }
+
+   return signbit(imm_val) != signbit(reg_val);
+}
+
 bool
 fs_visitor::opt_combine_constants()
 {
@@ -215,12 +231,20 @@ fs_visitor::opt_combine_constants()
 
   for (int i = 0; i < inst->sources; i++) {
  if (inst->src[i].file != IMM ||
- inst->src[i].type != BRW_REGISTER_TYPE_F)
+ (inst->src[i].type != BRW_REGISTER_TYPE_F &&
+  inst->src[i].type != BRW_REGISTER_TYPE_HF))
 continue;
 
- float val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f :
- fabs(inst->src[i].f);
- struct imm *imm = find_imm(&table, val);
+ float val;
+ if (inst->src[i].type == BRW_REGISTER_TYPE_F) {
+val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f :
+fabs(inst->src[i].f);
+ } else {
+val = !inst->can_do_source_mods(devinfo) ?
+   _mesa_half_to_float(inst->src[i].d & 0xffff) :
+   fabs(_mesa_half_to_float(inst->src[i].d & 0xffff));
+ }
+ struct imm *imm = find_imm(&table, val, inst->src[i].type);
 
  if (imm) {
 bblock_t *intersection = cfg_t::intersect(block, imm->block);
@@ -238,6 +262,7 @@ fs_visitor::opt_combine_constants()
 imm->uses = new(const_ctx) exec_list();
 imm->uses->push_tail(link(const_ctx, &inst->src[i]));
 imm->val = val;
+imm->type = inst->src[i].type;
 imm->uses_by_coissue = could_coissue(devinfo, inst);
 imm->must_promote = must_promote_imm(devinfo, inst);
 imm->first_use_ip = ip;
@@ -278,12 +303,23 @@ fs_visitor::opt_combine_constants()
   imm->block->last_non_control_flow_inst()->next);
   const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0);
 
-  ibld.MOV(reg, brw_imm_f(imm->val));
+  reg = retype(reg, imm->type);
+  if (imm->type == BRW_REGISTER_TYPE_F) {
+ ibld.MOV(reg, brw_imm_f(imm->val));
+  } else {
+ const uint16_t val_hf = _mesa_float_to_half(imm->val);
+ ibld.MOV(reg, retype(brw_imm_uw(val_hf), BRW_REGISTER_TYPE_HF));
+  }
   imm->nr = reg.nr;
   imm->subreg_offset = reg.offset;
 
+  /* Keep offsets 32-bit aligned since we are mixing 32-bit and 16-bit
+   * constants into the same register
+   *
+   * TODO: try to pack pairs of HF constants into each 32-bit slot
+   */
   reg.offset += sizeof(float);
-  if (reg.offset == 8 * sizeof(float)) {
+  if (reg.offset == REG_SIZE) {
  reg.nr = alloc.allocate(1);
  reg.offset = 0;
   }
@@ -295,12 +331,14 @@ fs_visitor::opt_combine_constants()
   foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
  fs_reg *reg = link->reg;
  assert((isnan(reg->f) && isnan(table.imm[i].val)) ||
-fabsf(reg->f) == fabs(table.imm[i].val));
+fabsf(reg->f) == fabs(table.imm[i].val) ||
+table.imm[i].type == BRW_REGISTER_TYPE_HF);
 
  reg->file = VGRF;
+ reg->type = table.imm[i].type;
  reg->offset = table.imm[i].subreg_offset;
  reg->stride = 0;
- reg->negate = signbit(reg->f) != signbit(tab

[Mesa-dev] [PATCH v3 13/42] intel/compiler: lower 16-bit flrp

2019-01-15 Thread Iago Toral Quiroga
Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_compiler.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/intel/compiler/brw_compiler.c 
b/src/intel/compiler/brw_compiler.c
index f885e79c3e6..04a1a7cac4e 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -33,6 +33,7 @@
.lower_sub = true, \
.lower_fdiv = true,\
.lower_scmp = true,\
+   .lower_flrp16 = true,  \
.lower_fmod16 = true,  \
.lower_fmod32 = true,  \
.lower_fmod64 = false, \
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 29/42] intel/compiler: handle conversions between int and half-float on atom

2019-01-15 Thread Iago Toral Quiroga
Reviewed-by: Topi Pohjolainen 
---
 src/intel/compiler/brw_fs_nir.cpp | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index e454578d99b..a739562c3ab 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -784,13 +784,20 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
*/
 
case nir_op_f2f16:
-   case nir_op_i2f16:
-   case nir_op_u2f16:
case nir_op_i2i8:
case nir_op_u2u8:
+  assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
+  inst = bld.MOV(result, op[0]);
+  inst->saturate = instr->dest.saturate;
+  break;
+
+   case nir_op_i2f16:
+   case nir_op_u2f16:
case nir_op_f2i8:
case nir_op_f2u8:
   assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
+   case nir_op_f2i16:
+   case nir_op_f2u16:
   inst = bld.MOV(result, op[0]);
   inst->saturate = instr->dest.saturate;
   break;
@@ -822,8 +829,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
case nir_op_f2f32:
case nir_op_f2i32:
case nir_op_f2u32:
-   case nir_op_f2i16:
-   case nir_op_f2u16:
case nir_op_i2i32:
case nir_op_u2u32:
case nir_op_i2i16:
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 31/42] intel/compiler: ask for an integer type if requesting an 8-bit type

2019-01-15 Thread Iago Toral Quiroga
---
 src/intel/compiler/brw_fs_nir.cpp | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index a3d193b8a44..ccf1891b925 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -346,7 +346,9 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl)
  reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
   unsigned size = array_elems * reg->num_components;
   const brw_reg_type reg_type =
- brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
+ brw_reg_type_from_bit_size(reg->bit_size,
+reg->bit_size == 8 ? BRW_REGISTER_TYPE_D :
+ BRW_REGISTER_TYPE_F);
   nir_locals[reg->index] = bld.vgrf(reg_type, size);
}
 
@@ -4281,7 +4283,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, 
nir_intrinsic_instr *instr
   fs_reg value = get_nir_src(instr->src[0]);
   if (instr->intrinsic == nir_intrinsic_vote_feq) {
  const unsigned bit_size = nir_src_bit_size(instr->src[0]);
- value.type = brw_reg_type_from_bit_size(bit_size, 
BRW_REGISTER_TYPE_F);
+ value.type =
+brw_reg_type_from_bit_size(bit_size,
+   bit_size == 8 ? BRW_REGISTER_TYPE_D :
+   BRW_REGISTER_TYPE_F);
   }
 
   fs_reg uniformized = bld.emit_uniformize(value);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 06/42] intel/compiler: lower some 16-bit float operations to 32-bit

2019-01-15 Thread Iago Toral Quiroga
The hardware doesn't support half-float for these.

Reviewed-by: Topi Pohjolainen 
Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_nir.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 572ab824a94..f0fe7f870c2 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -637,6 +637,11 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED 
void *data)
case nir_op_irem:
case nir_op_udiv:
case nir_op_umod:
+   case nir_op_fceil:
+   case nir_op_ffloor:
+   case nir_op_ffract:
+   case nir_op_fround_even:
+   case nir_op_ftrunc:
   return 32;
default:
   return 0;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 23/42] intel/compiler: fix ddx and ddy for 16-bit float

2019-01-15 Thread Iago Toral Quiroga
We were assuming 32-bit elements. Also, in SIMD8 we pack 2 vector components
in a single SIMD register, so for example, component Y of a 16-bit vec2
starts at byte offset 16B. This means that when we compute the offset of
the elements to be differentiated we should not stomp whatever base offset we
have, but instead add to it.

v2
 - Use byte_offset() helper (Jason)
 - Merge the fix for SIMD8: using byte_offset() fixes that too.

Reviewed-by: Jason Ekstrand  (v1)
---
 src/intel/compiler/brw_fs_generator.cpp | 37 -
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/src/intel/compiler/brw_fs_generator.cpp 
b/src/intel/compiler/brw_fs_generator.cpp
index 5fc6cf5f8cc..d0cc4a6d231 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1315,10 +1315,9 @@ fs_generator::generate_ddx(const fs_inst *inst,
   width = BRW_WIDTH_4;
}
 
-   struct brw_reg src0 = src;
+   struct brw_reg src0 = byte_offset(src, type_sz(src.type));;
struct brw_reg src1 = src;
 
-   src0.subnr   = sizeof(float);
src0.vstride = vstride;
src0.width   = width;
src0.hstride = BRW_HORIZONTAL_STRIDE_0;
@@ -1337,23 +1336,25 @@ void
 fs_generator::generate_ddy(const fs_inst *inst,
struct brw_reg dst, struct brw_reg src)
 {
+   const uint32_t type_size = type_sz(src.type);
+
if (inst->opcode == FS_OPCODE_DDY_FINE) {
   /* produce accurate derivatives */
   if (devinfo->gen >= 11) {
  src = stride(src, 0, 2, 1);
- struct brw_reg src_0  = byte_offset(src,  0 * sizeof(float));
- struct brw_reg src_2  = byte_offset(src,  2 * sizeof(float));
- struct brw_reg src_4  = byte_offset(src,  4 * sizeof(float));
- struct brw_reg src_6  = byte_offset(src,  6 * sizeof(float));
- struct brw_reg src_8  = byte_offset(src,  8 * sizeof(float));
- struct brw_reg src_10 = byte_offset(src, 10 * sizeof(float));
- struct brw_reg src_12 = byte_offset(src, 12 * sizeof(float));
- struct brw_reg src_14 = byte_offset(src, 14 * sizeof(float));
-
- struct brw_reg dst_0  = byte_offset(dst,  0 * sizeof(float));
- struct brw_reg dst_4  = byte_offset(dst,  4 * sizeof(float));
- struct brw_reg dst_8  = byte_offset(dst,  8 * sizeof(float));
- struct brw_reg dst_12 = byte_offset(dst, 12 * sizeof(float));
+ struct brw_reg src_0  = byte_offset(src,  0 * type_size);
+ struct brw_reg src_2  = byte_offset(src,  2 * type_size);
+ struct brw_reg src_4  = byte_offset(src,  4 * type_size);
+ struct brw_reg src_6  = byte_offset(src,  6 * type_size);
+ struct brw_reg src_8  = byte_offset(src,  8 * type_size);
+ struct brw_reg src_10 = byte_offset(src, 10 * type_size);
+ struct brw_reg src_12 = byte_offset(src, 12 * type_size);
+ struct brw_reg src_14 = byte_offset(src, 14 * type_size);
+
+ struct brw_reg dst_0  = byte_offset(dst,  0 * type_size);
+ struct brw_reg dst_4  = byte_offset(dst,  4 * type_size);
+ struct brw_reg dst_8  = byte_offset(dst,  8 * type_size);
+ struct brw_reg dst_12 = byte_offset(dst, 12 * type_size);
 
  brw_push_insn_state(p);
  brw_set_default_exec_size(p, BRW_EXECUTE_4);
@@ -1380,10 +1381,8 @@ fs_generator::generate_ddy(const fs_inst *inst,
   }
} else {
   /* replicate the derivative at the top-left pixel to other pixels */
-  struct brw_reg src0 = stride(src, 4, 4, 0);
-  struct brw_reg src1 = stride(src, 4, 4, 0);
-  src0.subnr = 0 * sizeof(float);
-  src1.subnr = 2 * sizeof(float);
+  struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size);
+  struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size);
 
   brw_ADD(p, dst, negate(src0), src1);
}
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 37/42] intel/compiler: add a brw_reg_type_is_integer helper

2019-01-15 Thread Iago Toral Quiroga
Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_reg_type.h | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/src/intel/compiler/brw_reg_type.h 
b/src/intel/compiler/brw_reg_type.h
index ffbec90d3fe..a3365b7e34c 100644
--- a/src/intel/compiler/brw_reg_type.h
+++ b/src/intel/compiler/brw_reg_type.h
@@ -82,6 +82,24 @@ brw_reg_type_is_floating_point(enum brw_reg_type type)
}
 }
 
+static inline bool
+brw_reg_type_is_integer(enum brw_reg_type type)
+{
+   switch (type) {
+   case BRW_REGISTER_TYPE_Q:
+   case BRW_REGISTER_TYPE_UQ:
+   case BRW_REGISTER_TYPE_D:
+   case BRW_REGISTER_TYPE_UD:
+   case BRW_REGISTER_TYPE_W:
+   case BRW_REGISTER_TYPE_UW:
+   case BRW_REGISTER_TYPE_B:
+   case BRW_REGISTER_TYPE_UV:
+  return true;
+   default:
+  return false;
+   }
+}
+
 unsigned
 brw_reg_type_to_hw_type(const struct gen_device_info *devinfo,
 enum brw_reg_file file, enum brw_reg_type type);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 25/42] intel/compiler: workaround for SIMD8 half-float MAD in gen8

2019-01-15 Thread Iago Toral Quiroga
Broadwell hardware has a bug that manifests in SIMD8 executions of
16-bit MAD instructions when any of the sources is a Y or W component.
We pack these components in the same SIMD register as components X and
Z respectively, but starting at offset 16B (so they live in the second
half of the register). The problem does not exist in SKL or later.

We work around this issue by moving any such sources to a temporary
starting at offset 0B. We want to do this after the main optimization loop
to prevent copy-propagation and friends from undoing the fix.

Reviewed-by: Topi Pohjolainen 
---
 src/intel/compiler/brw_fs.cpp | 48 +++
 src/intel/compiler/brw_fs.h   |  1 +
 2 files changed, 49 insertions(+)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 0b3ec94e2d2..d6096cd667d 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6540,6 +6540,48 @@ fs_visitor::optimize()
validate();
 }
 
+/**
+ * Broadwell hardware has a bug that manifests in SIMD8 executions of 16-bit
+ * MAD instructions when any of the sources is a Y or W component. We pack
+ * these components in the same SIMD register as components X and Z
+ * respectively, but starting at offset 16B (so they live in the second half
+ * of the register).
+ *
+ * We work around this issue by moving any such sources to a temporary
+ * starting at offset 0B. We want to do this after the main optimization loop
+ * to prevent copy-propagation and friends to undo the fix.
+ */
+void
+fs_visitor::fixup_hf_mad()
+{
+   if (devinfo->gen != 8)
+  return;
+
+   bool progress = false;
+
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+  if (inst->opcode != BRW_OPCODE_MAD ||
+  inst->dst.type != BRW_REGISTER_TYPE_HF ||
+  inst->exec_size > 8)
+ continue;
+
+  for (int i = 0; i < 3; i++) {
+ if (inst->src[i].offset > 0) {
+assert(inst->src[i].type == BRW_REGISTER_TYPE_HF);
+const fs_builder ibld =
+   bld.at(block, inst).exec_all().group(inst->exec_size, 0);
+fs_reg tmp = ibld.vgrf(inst->src[i].type);
+ibld.MOV(tmp, inst->src[i]);
+inst->src[i] = tmp;
+progress = true;
+ }
+  }
+   }
+
+   if (progress)
+  invalidate_live_intervals();
+}
+
 /**
  * Three source instruction must have a GRF/MRF destination register.
  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
@@ -6698,6 +6740,7 @@ fs_visitor::run_vs()
assign_curb_setup();
assign_vs_urb_setup();
 
+   fixup_hf_mad();
fixup_3src_null_dest();
allocate_registers(8, true);
 
@@ -6782,6 +6825,7 @@ fs_visitor::run_tcs_single_patch()
assign_curb_setup();
assign_tcs_single_patch_urb_setup();
 
+   fixup_hf_mad();
fixup_3src_null_dest();
allocate_registers(8, true);
 
@@ -6816,6 +6860,7 @@ fs_visitor::run_tes()
assign_curb_setup();
assign_tes_urb_setup();
 
+   fixup_hf_mad();
fixup_3src_null_dest();
allocate_registers(8, true);
 
@@ -6865,6 +6910,7 @@ fs_visitor::run_gs()
assign_curb_setup();
assign_gs_urb_setup();
 
+   fixup_hf_mad();
fixup_3src_null_dest();
allocate_registers(8, true);
 
@@ -6965,6 +7011,7 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
 
   assign_urb_setup();
 
+  fixup_hf_mad();
   fixup_3src_null_dest();
   allocate_registers(8, allow_spilling);
 
@@ -7009,6 +7056,7 @@ fs_visitor::run_cs(unsigned min_dispatch_width)
 
assign_curb_setup();
 
+   fixup_hf_mad();
fixup_3src_null_dest();
allocate_registers(min_dispatch_width, true);
 
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 68287bcdcea..1879d4bc7f7 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -103,6 +103,7 @@ public:
void setup_vs_payload();
void setup_gs_payload();
void setup_cs_payload();
+   void fixup_hf_mad();
void fixup_3src_null_dest();
void assign_curb_setup();
void calculate_urb_setup();
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 33/42] compiler/spirv: add support for Float16 and Int8 capabilities

2019-01-15 Thread Iago Toral Quiroga
v2:
 - Merge Float16 and Int8 capabilities into a single patch (Jason)

Reviewed-by: Jason Ekstrand  (v1)
---
 src/compiler/shader_info.h| 2 ++
 src/compiler/spirv/spirv_to_nir.c | 8 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/compiler/shader_info.h b/src/compiler/shader_info.h
index 87a2c805d37..1d45433312a 100644
--- a/src/compiler/shader_info.h
+++ b/src/compiler/shader_info.h
@@ -37,12 +37,14 @@ struct spirv_supported_capabilities {
bool descriptor_array_dynamic_indexing;
bool device_group;
bool draw_parameters;
+   bool float16;
bool float64;
bool geometry_streams;
bool gcn_shader;
bool image_ms_array;
bool image_read_without_format;
bool image_write_without_format;
+   bool int8;
bool int16;
bool int64;
bool int64_atomics;
diff --git a/src/compiler/spirv/spirv_to_nir.c 
b/src/compiler/spirv/spirv_to_nir.c
index 76a997ee341..731b1cbea5b 100644
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -3518,8 +3518,6 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, 
SpvOp opcode,
   case SpvCapabilityLinkage:
   case SpvCapabilityVector16:
   case SpvCapabilityFloat16Buffer:
-  case SpvCapabilityFloat16:
-  case SpvCapabilityInt8:
   case SpvCapabilitySparseResidency:
  vtn_warn("Unsupported SPIR-V capability: %s",
   spirv_capability_to_string(cap));
@@ -3536,12 +3534,18 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, 
SpvOp opcode,
   case SpvCapabilityFloat64:
  spv_check_supported(float64, cap);
  break;
+  case SpvCapabilityFloat16:
+ spv_check_supported(float16, cap);
+ break;
   case SpvCapabilityInt64:
  spv_check_supported(int64, cap);
  break;
   case SpvCapabilityInt16:
  spv_check_supported(int16, cap);
  break;
+  case SpvCapabilityInt8:
+ spv_check_supported(int8, cap);
+ break;
 
   case SpvCapabilityTransformFeedback:
  spv_check_supported(transform_feedback, cap);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 14/42] compiler/nir: add lowering for 16-bit ldexp

2019-01-15 Thread Iago Toral Quiroga
v2 (Topi):
 - Make bit-size handling order be 16-bit, 32-bit, 64-bit
 - Clamp lower exponent range at -28 instead of -30.

Reviewed-by: Topi Pohjolainen 
Reviewed-by: Jason Ekstrand 
---
 src/compiler/nir/nir_opt_algebraic.py | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_opt_algebraic.py 
b/src/compiler/nir/nir_opt_algebraic.py
index 40eb3de02c3..71c626e1b3f 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -790,7 +790,9 @@ for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 
'i']):
 
 def fexp2i(exp, bits):
# We assume that exp is already in the right range.
-   if bits == 32:
+   if bits == 16:
+  return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
+   elif bits == 32:
   return ('ishl', ('iadd', exp, 127), 23)
elif bits == 64:
   return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
@@ -808,7 +810,9 @@ def ldexp(f, exp, bits):
# handles a range on exp of [-252, 254] which allows you to create any
# value (including denorms if the hardware supports it) and to adjust the
# exponent of any normal value to anything you want.
-   if bits == 32:
+   if bits == 16:
+  exp = ('imin', ('imax', exp, -28), 30)
+   elif bits == 32:
   exp = ('imin', ('imax', exp, -252), 254)
elif bits == 64:
   exp = ('imin', ('imax', exp, -2044), 2046)
@@ -828,6 +832,7 @@ def ldexp(f, exp, bits):
return ('fmul', ('fmul', f, pow2_1), pow2_2)
 
 optimizations += [
+   (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'),
(('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'),
(('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'),
 ]
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 42/42] intel/compiler: allow propagating HF immediates to MAD/LRP

2019-01-15 Thread Iago Toral Quiroga
Even if we don't do 3-src algebraic optimizations for MAD and LRP in
the backend any more, the combine constants pass can still do a fine
job grouping these constants into single registers for better
register pressure.

v2:
 - updated comment to reference register pressure benefits rather than
   algebraic optimizations.
---
 src/intel/compiler/brw_fs_copy_propagation.cpp | 16 +++-
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp 
b/src/intel/compiler/brw_fs_copy_propagation.cpp
index 4e20ddb683a..5695678b766 100644
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -772,16 +772,14 @@ fs_visitor::try_constant_propagate(fs_inst *inst, 
acp_entry *entry)
 
   case BRW_OPCODE_MAD:
   case BRW_OPCODE_LRP:
- /* 3-src instructions can't take IMM registers, however, for 32-bit
-  * floating instructions we rely on the combine constants pass to fix
-  * it up. For anything else, we shouldn't be promoting immediates
-  * until we can make the pass capable of combining constants of
-  * different sizes.
+ /* 3-src instructions can't take IMM registers, but we allow this
+  * here anyway and rely on the combine constants pass to fix it up
+  * later, hopefully leading to better register pressure.
   */
- if (val.type == BRW_REGISTER_TYPE_F) {
-inst->src[i] = val;
-progress = true;
- }
+ assert(val.type == BRW_REGISTER_TYPE_F ||
+val.type == BRW_REGISTER_TYPE_HF);
+ inst->src[i] = val;
+ progress = true;
  break;
 
   default:
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 18/42] intel/compiler: add a helper function to query hardware type table

2019-01-15 Thread Iago Toral Quiroga
We open coded this in a couple of places, so a helper function is probably
sensible. Plus it makes it more consistent with the 3src hardware type case.

Suggested-by: Topi Pohjolainen 
---
 src/intel/compiler/brw_reg_type.c | 34 ---
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/src/intel/compiler/brw_reg_type.c 
b/src/intel/compiler/brw_reg_type.c
index 09b3ea61d4c..0c9f522eca0 100644
--- a/src/intel/compiler/brw_reg_type.c
+++ b/src/intel/compiler/brw_reg_type.c
@@ -193,6 +193,20 @@ static const struct hw_3src_type {
 #undef E
 };
 
+static inline const struct hw_type *
+get_hw_type_map(const struct gen_device_info *devinfo, uint32_t *size)
+{
+   if (devinfo->gen >= 11) {
+  if (size)
+ *size = ARRAY_SIZE(gen11_hw_type);
+  return gen11_hw_type;
+   } else {
+  if (size)
+ *size = ARRAY_SIZE(gen4_hw_type);
+  return gen4_hw_type;
+   }
+}
+
 /**
  * Convert a brw_reg_type enumeration value into the hardware representation.
  *
@@ -203,16 +217,10 @@ brw_reg_type_to_hw_type(const struct gen_device_info 
*devinfo,
 enum brw_reg_file file,
 enum brw_reg_type type)
 {
-   const struct hw_type *table;
-
-   if (devinfo->gen >= 11) {
-  assert(type < ARRAY_SIZE(gen11_hw_type));
-  table = gen11_hw_type;
-   } else {
-  assert(type < ARRAY_SIZE(gen4_hw_type));
-  table = gen4_hw_type;
-   }
+   uint32_t table_size;
+   const struct hw_type *table = get_hw_type_map(devinfo, &table_size);
 
+   assert(type < table_size);
assert(devinfo->has_64bit_types || brw_reg_type_to_size(type) < 8 ||
   type == BRW_REGISTER_TYPE_NF);
 
@@ -234,13 +242,7 @@ enum brw_reg_type
 brw_hw_type_to_reg_type(const struct gen_device_info *devinfo,
 enum brw_reg_file file, unsigned hw_type)
 {
-   const struct hw_type *table;
-
-   if (devinfo->gen >= 11) {
-  table = gen11_hw_type;
-   } else {
-  table = gen4_hw_type;
-   }
+   const struct hw_type *table = get_hw_type_map(devinfo, NULL);
 
if (file == BRW_IMMEDIATE_VALUE) {
   for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) {
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 05/42] intel/compiler: assert restrictions on conversions to half-float

2019-01-15 Thread Iago Toral Quiroga
There are some hardware restrictions that brw_nir_lower_conversions should
have taken care of before we get here.

v2:
 - rebased on top of regioning lowering pass

Reviewed-by: Topi Pohjolainen  (v1)
---
 src/intel/compiler/brw_fs_nir.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index e1d0e318b35..d742f55a957 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -784,6 +784,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
*/
 
case nir_op_f2f16:
+   case nir_op_i2f16:
+   case nir_op_u2f16:
+  assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
   inst = bld.MOV(result, op[0]);
   inst->saturate = instr->dest.saturate;
   break;
@@ -821,8 +824,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
case nir_op_u2u32:
case nir_op_i2i16:
case nir_op_u2u16:
-   case nir_op_i2f16:
-   case nir_op_u2f16:
case nir_op_i2i8:
case nir_op_u2u8:
   inst = bld.MOV(result, op[0]);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 11/42] intel/compiler: lower 16-bit fmod

2019-01-15 Thread Iago Toral Quiroga
Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_compiler.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/intel/compiler/brw_compiler.c 
b/src/intel/compiler/brw_compiler.c
index fe632c5badc..f885e79c3e6 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -33,6 +33,7 @@
.lower_sub = true, \
.lower_fdiv = true,\
.lower_scmp = true,\
+   .lower_fmod16 = true,  \
.lower_fmod32 = true,  \
.lower_fmod64 = false, \
.lower_bitfield_extract = true,\
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 04/42] intel/compiler: handle b2i/b2f with other integer conversion opcodes

2019-01-15 Thread Iago Toral Quiroga
Since we handle booleans as integers this makes more sense.

v2:
 - rebased to incorporate new boolean conversion opcodes

v3:
 - rebased on top of regioning lowering pass

Reviewed-by: Jason Ekstrand  (v1)
Reviewed-by: Topi Pohjolainen  (v2)
---
 src/intel/compiler/brw_fs_nir.cpp | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index a59debf2b78..e1d0e318b35 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -788,6 +788,14 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
   inst->saturate = instr->dest.saturate;
   break;
 
+   case nir_op_f2f64:
+   case nir_op_f2i64:
+   case nir_op_f2u64:
+  assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */
+  inst = bld.MOV(result, op[0]);
+  inst->saturate = instr->dest.saturate;
+  break;
+
case nir_op_b2i8:
case nir_op_b2i16:
case nir_op_b2i32:
@@ -798,14 +806,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
   op[0].type = BRW_REGISTER_TYPE_D;
   op[0].negate = !op[0].negate;
   /* fallthrough */
-   case nir_op_f2f64:
-   case nir_op_f2i64:
-   case nir_op_f2u64:
-  assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */
-  inst = bld.MOV(result, op[0]);
-  inst->saturate = instr->dest.saturate;
-  break;
-
case nir_op_i2f64:
case nir_op_i2i64:
case nir_op_u2f64:
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 15/42] intel/compiler: Extended Math is limited to SIMD8 on half-float

2019-01-15 Thread Iago Toral Quiroga
From the Skylake PRM, Extended Math Function:

  "The execution size must be no more than 8 when half-floats
   are used in source or destination operand."

Earlier generations do not support Extended Math with half-float.

v2:
 - Rewrite the code to make it more readable (Jason).

v3:
 - Use if-ladders or just if+return exclusively (Topi).

Reviewed-by: Topi Pohjolainen  (v1)
---
 src/intel/compiler/brw_fs.cpp | 27 ++-
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 0359eb079f7..0b3ec94e2d2 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5493,18 +5493,27 @@ get_lowered_simd_width(const struct gen_device_info 
*devinfo,
case SHADER_OPCODE_EXP2:
case SHADER_OPCODE_LOG2:
case SHADER_OPCODE_SIN:
-   case SHADER_OPCODE_COS:
+   case SHADER_OPCODE_COS: {
   /* Unary extended math instructions are limited to SIMD8 on Gen4 and
-   * Gen6.
+   * Gen6. Extended Math Function is limited to SIMD8 with half-float.
*/
-  return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
-  devinfo->gen == 5 || devinfo->is_g4x ? MIN2(16, inst->exec_size) 
:
-  MIN2(8, inst->exec_size));
+  if (devinfo->gen == 6 || (devinfo->gen == 4 && !devinfo->is_g4x))
+ return MIN2(8, inst->exec_size);
+  if (inst->dst.type == BRW_REGISTER_TYPE_HF)
+ return MIN2(8, inst->exec_size);
+  return MIN2(16, inst->exec_size);
+   }
 
-   case SHADER_OPCODE_POW:
-  /* SIMD16 is only allowed on Gen7+. */
-  return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
-  MIN2(8, inst->exec_size));
+   case SHADER_OPCODE_POW: {
+  /* SIMD16 is only allowed on Gen7+. Extended Math Function is limited
+   * to SIMD8 with half-float
+   */
+  if (devinfo->gen < 7)
+ return MIN2(8, inst->exec_size);
+  if (inst->dst.type == BRW_REGISTER_TYPE_HF)
+ return MIN2(8, inst->exec_size);
+  return MIN2(16, inst->exec_size);
+   }
 
case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 08/42] intel/compiler: implement 16-bit fsign

2019-01-15 Thread Iago Toral Quiroga
v2:
 - make 16-bit be its own separate case (Jason)

Reviewed-by: Topi Pohjolainen 
---
 src/intel/compiler/brw_fs_nir.cpp | 18 +-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index d742f55a957..cf546b8ff09 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -844,7 +844,22 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
 : bld.MOV(result, brw_imm_f(1.0f));
 
  set_predicate(BRW_PREDICATE_NORMAL, inst);
-  } else if (type_sz(op[0].type) < 8) {
+  } else if (type_sz(op[0].type) == 2) {
+ /* AND(val, 0x8000) gives the sign bit.
+  *
+  * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not 
zero.
+  */
+ fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
+ bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);
+
+ fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UW);
+ op[0].type = BRW_REGISTER_TYPE_UW;
+ result.type = BRW_REGISTER_TYPE_UW;
+ bld.AND(result_int, op[0], brw_imm_uw(0x8000u));
+
+ inst = bld.OR(result_int, result_int, brw_imm_uw(0x3c00u));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+  } else if (type_sz(op[0].type) == 4) {
  /* AND(val, 0x8000) gives the sign bit.
   *
   * Predicated OR ORs 1.0 (0x3f80) with the sign bit if val is not
@@ -866,6 +881,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
   * - The sign is encoded in the high 32-bit of each DF
   * - We need to produce a DF result.
   */
+ assert(type_sz(op[0].type) == 8);
 
  fs_reg zero = vgrf(glsl_type::double_type);
  bld.MOV(zero, setup_imm_df(bld, 0.0));
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 22/42] intel/compiler: don't propagate HF immediates to 3-src instructions

2019-01-15 Thread Iago Toral Quiroga
3-src instructions don't support immediates, but since 36bc5f06dd22,
we allow them on MAD and LRP relying on the combine constants pass to
fix it up later. However, that pass is specialized for 32-bit float
immediates and can't handle HF constants at present, so this patch
ensures that copy-propagation only does this for 32-bit constants.

Reviewed-by: Topi Pohjolainen 
---
 src/intel/compiler/brw_fs_copy_propagation.cpp | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp 
b/src/intel/compiler/brw_fs_copy_propagation.cpp
index c23ce1ef426..77f2749ba04 100644
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -772,8 +772,16 @@ fs_visitor::try_constant_propagate(fs_inst *inst, 
acp_entry *entry)
 
   case BRW_OPCODE_MAD:
   case BRW_OPCODE_LRP:
- inst->src[i] = val;
- progress = true;
+ /* 3-src instructions can't take IMM registers, however, for 32-bit
+  * floating instructions we rely on the combine constants pass to fix
+  * it up. For anything else, we shouldn't be promoting immediates
+  * until we can make the pass capable of combining constants of
+  * different sizes.
+  */
+ if (val.type == BRW_REGISTER_TYPE_F) {
+inst->src[i] = val;
+progress = true;
+ }
  break;
 
   default:
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 24/42] intel/compiler: fix ddy for half-float in gen8

2019-01-15 Thread Iago Toral Quiroga
We use Align16 mode for this, since it is more convenient, but the PRM
for Broadwell states in Volume 3D Media GPGPU, Chapter 'Register region
restrictions', Section '1. Special Restrictions':

   "In Align16 mode, the channel selects and channel enables apply to a
pair of half-floats, because these parameters are defined for DWord
elements ONLY. This is applicable when both source and destination
are half-floats."

This means that we cannot select individual HF elements using swizzles
like we do with 32-bit floats so we can't implement the required
regioning for this.

Use the gen11 path for this instead, which uses Align1 mode.

The restriction is not present in gen9 or gen10, where the Align16
implementation seems to work just fine.

Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs_generator.cpp | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs_generator.cpp 
b/src/intel/compiler/brw_fs_generator.cpp
index d0cc4a6d231..4310f0b7fdc 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1339,8 +1339,14 @@ fs_generator::generate_ddy(const fs_inst *inst,
const uint32_t type_size = type_sz(src.type);
 
if (inst->opcode == FS_OPCODE_DDY_FINE) {
-  /* produce accurate derivatives */
-  if (devinfo->gen >= 11) {
+  /* produce accurate derivatives. We can do this easily in Align16
+   * but this is not supported in gen11+ and gen8 Align16 swizzles
+   * for Half-Float operands work in units of 32-bit and always
+   * select pairs of consecutive half-float elements, so we can't use
+   * it for this.
+   */
+  if (devinfo->gen >= 11 ||
+  (devinfo->gen == 8 && src.type == BRW_REGISTER_TYPE_HF)) {
  src = stride(src, 0, 2, 1);
  struct brw_reg src_0  = byte_offset(src,  0 * type_size);
  struct brw_reg src_2  = byte_offset(src,  2 * type_size);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 00/42] intel: VK_KHR_shader_float16_int8 implementation

2019-01-15 Thread Iago Toral Quiroga
The changes in this version address review feedback to v2 and, most importantly,
rebase on top of relevant changes in master, specifically Curro's regioning
lowering pass. This new regioning pass simplifies some of the NIR translation
code (specifically the code for translating regioning restrictions on
conversions for atom platforms) making some of the previous work in this series
unnecessary. The regioning restrictions for conversions between integer and
half-float added with this series are now implemented as part of this
framework instead of doing it at NIR translation time. This version of the
series also dropped the SPIR-V compiler patches that have already been merged.

As always, a branch with these patches is available for testing in the
itoral/VK_KHR_shader_float16_int8 branch of the Igalia Mesa repository at
https://github.com/Igalia/mesa.

Iago Toral Quiroga (42):
  intel/compiler: handle conversions between int and half-float on atom
  intel/compiler: add a NIR pass to lower conversions
  intel/compiler: split float to 64-bit opcodes from int to 64-bit
  intel/compiler: handle b2i/b2f with other integer conversion opcodes
  intel/compiler: assert restrictions on conversions to half-float
  intel/compiler: lower some 16-bit float operations to 32-bit
  intel/compiler: lower 16-bit extended math to 32-bit prior to gen9
  intel/compiler: implement 16-bit fsign
  intel/compiler: allow extended math functions with HF operands
  compiler/nir: add lowering option for 16-bit fmod
  intel/compiler: lower 16-bit fmod
  compiler/nir: add lowering for 16-bit flrp
  intel/compiler: lower 16-bit flrp
  compiler/nir: add lowering for 16-bit ldexp
  intel/compiler: Extended Math is limited to SIMD8 on half-float
  intel/compiler: add instruction setters for Src1Type and Src2Type.
  intel/compiler: add new half-float register type for 3-src
instructions
  intel/compiler: add a helper function to query hardware type table
  intel/compiler: don't compact 3-src instructions with Src1Type or
Src2Type bits
  intel/compiler: allow half-float on 3-source instructions since gen8
  intel/compiler: set correct precision fields for 3-source float
instructions
  intel/compiler: don't propagate HF immediates to 3-src instructions
  intel/compiler: fix ddx and ddy for 16-bit float
  intel/compiler: fix ddy for half-float in gen8
  intel/compiler: workaround for SIMD8 half-float MAD in gen8
  intel/compiler: split is_partial_write() into two variants
  intel/compiler: activate 16-bit bit-size lowerings also for 8-bit
  intel/compiler: handle 64-bit float to 8-bit integer conversions
  intel/compiler: handle conversions between int and half-float on atom
  intel/compiler: implement isign for int8
  intel/compiler: ask for an integer type if requesting an 8-bit type
  intel/eu: force stride of 2 on NULL register for Byte instructions
  compiler/spirv: add support for Float16 and Int8 capabilities
  anv/pipeline: support Float16 and Int8 capabilities in gen8+
  anv/device: expose shaderFloat16 and shaderInt8 in gen8+
  intel/compiler: implement is_zero, is_one, is_negative_one for
8-bit/16-bit
  intel/compiler: add a brw_reg_type_is_integer helper
  intel/compiler: fix cmod propagation for non 32-bit types
  intel/compiler: remove MAD/LRP algebraic optimizations from the
backend
  intel/compiler: support half-float in the combine constants pass
  intel/compiler: fix combine constants for Align16 with half-float
prior to gen9
  intel/compiler: allow propagating HF immediates to MAD/LRP

 src/compiler/nir/nir.h|   2 +
 src/compiler/nir/nir_opt_algebraic.py |  11 +-
 src/compiler/shader_info.h|   2 +
 src/compiler/spirv/spirv_to_nir.c |   8 +-
 src/intel/Makefile.sources|   1 +
 src/intel/compiler/brw_compiler.c |   2 +
 src/intel/compiler/brw_eu_compact.c   |   5 +-
 src/intel/compiler/brw_eu_emit.c  |  36 +++-
 src/intel/compiler/brw_fs.cpp | 143 ++--
 src/intel/compiler/brw_fs.h   |   1 +
 .../compiler/brw_fs_cmod_propagation.cpp  |  34 ++--
 .../compiler/brw_fs_combine_constants.cpp |  82 +++--
 .../compiler/brw_fs_copy_propagation.cpp  |  14 +-
 src/intel/compiler/brw_fs_cse.cpp |   3 +-
 .../compiler/brw_fs_dead_code_eliminate.cpp   |   2 +-
 src/intel/compiler/brw_fs_generator.cpp   |  47 +++---
 src/intel/compiler/brw_fs_live_variables.cpp  |   2 +-
 src/intel/compiler/brw_fs_nir.cpp |  85 --
 src/intel/compiler/brw_fs_reg_allocate.cpp|   2 +-
 .../compiler/brw_fs_register_coalesce.cpp |   2 +-
 .../compiler/brw_fs_saturate_propagation.cpp  |   7 +-
 src/intel/compiler/brw_fs_sel_peephole.cpp|   4 +-
 src/intel/compiler/brw_inst.h |   2 +
 src/intel/compiler/brw_ir_fs.h|  36 +++-
 src/intel/compiler/brw_nir.c

[Mesa-dev] [PATCH v3 01/42] intel/compiler: handle conversions between int and half-float on atom

2019-01-15 Thread Iago Toral Quiroga
v2: adapted to work with the new regioning lowering pass

Reviewed-by: Topi Pohjolainen  (v1)
---
 src/intel/compiler/brw_ir_fs.h | 33 ++---
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h
index 3c23fb375e4..ba4d6a95720 100644
--- a/src/intel/compiler/brw_ir_fs.h
+++ b/src/intel/compiler/brw_ir_fs.h
@@ -497,9 +497,10 @@ is_unordered(const fs_inst *inst)
 }
 
 /**
- * Return whether the following regioning restriction applies to the specified
- * instruction.  From the Cherryview PRM Vol 7. "Register Region
- * Restrictions":
+ * Return whether one of the following regioning restrictions applies to the
+ * specified instruction.
+ *
+ * From the Cherryview PRM Vol 7. "Register Region Restrictions":
  *
  * "When source or destination datatype is 64b or operation is integer DWord
  *  multiply, regioning in Align1 must follow these rules:
@@ -508,6 +509,14 @@ is_unordered(const fs_inst *inst)
  *  2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
  *  3. Source and Destination offset must be the same, except the case of
  * scalar source."
+ *
+ * From the Cherryview PRM Vol 7. "Register Region Restrictions":
+ *
+ *"Conversion between Integer and HF (Half Float) must be DWord
+ * aligned and strided by a DWord on the destination."
+ *
+ *The same restriction is listed for other hardware platforms, however,
+ *empirical testing suggests that only atom platforms are affected.
  */
 static inline bool
 has_dst_aligned_region_restriction(const gen_device_info *devinfo,
@@ -518,10 +527,20 @@ has_dst_aligned_region_restriction(const gen_device_info 
*devinfo,
  (inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD);
 
if (type_sz(inst->dst.type) > 4 || type_sz(exec_type) > 4 ||
-   (type_sz(exec_type) == 4 && is_int_multiply))
-  return devinfo->is_cherryview || gen_device_info_is_9lp(devinfo);
-   else
-  return false;
+   (type_sz(exec_type) == 4 && is_int_multiply)) {
+  if (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))
+ return true;
+   }
+
+   const bool dst_type_is_hf = inst->dst.type == BRW_REGISTER_TYPE_HF;
+   const bool exec_type_is_hf = exec_type == BRW_REGISTER_TYPE_HF;
+   if ((dst_type_is_hf && !brw_reg_type_is_floating_point(exec_type)) ||
+   (exec_type_is_hf && !brw_reg_type_is_floating_point(inst->dst.type))) {
+  if (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))
+ return true;
+   }
+
+   return false;
 }
 
 #endif
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 09/42] intel/compiler: allow extended math functions with HF operands

2019-01-15 Thread Iago Toral Quiroga
The PRM states that half-float operands are supported since gen9.

Reviewed-by: Topi Pohjolainen 
---
 src/intel/compiler/brw_eu_emit.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 45e2552783b..e21df4624b3 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -1874,8 +1874,10 @@ void gen6_math(struct brw_codegen *p,
   assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
  (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
} else {
-  assert(src0.type == BRW_REGISTER_TYPE_F);
-  assert(src1.type == BRW_REGISTER_TYPE_F);
+  assert(src0.type == BRW_REGISTER_TYPE_F ||
+ (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
+  assert(src1.type == BRW_REGISTER_TYPE_F ||
+ (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
}
 
/* Source modifiers are ignored for extended math instructions on Gen6. */
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 02/42] intel/compiler: add a NIR pass to lower conversions

2019-01-15 Thread Iago Toral Quiroga
Some conversions are not directly supported in hardware and need to be
split in two conversion instructions going through an intermediary type.
Doing this at the NIR level reduces the complexity of the backend a bit.

v2:
 - Consider fp16 rounding conversion opcodes
 - Properly handle swizzles on conversion sources.

Reviewed-by: Topi Pohjolainen  (v1)
---
 src/intel/Makefile.sources|   1 +
 src/intel/compiler/brw_nir.c  |   1 +
 src/intel/compiler/brw_nir.h  |   2 +
 .../compiler/brw_nir_lower_conversions.c  | 158 ++
 src/intel/compiler/meson.build|   1 +
 5 files changed, 163 insertions(+)
 create mode 100644 src/intel/compiler/brw_nir_lower_conversions.c

diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
index 94a28d370e8..9975daa3ad1 100644
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -83,6 +83,7 @@ COMPILER_FILES = \
compiler/brw_nir_analyze_boolean_resolves.c \
compiler/brw_nir_analyze_ubo_ranges.c \
compiler/brw_nir_attribute_workarounds.c \
+   compiler/brw_nir_lower_conversions.c \
compiler/brw_nir_lower_cs_intrinsics.c \
compiler/brw_nir_lower_image_load_store.c \
compiler/brw_nir_lower_mem_access_bit_sizes.c \
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 92d7fe4bede..572ab824a94 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -882,6 +882,7 @@ brw_postprocess_nir(nir_shader *nir, const struct 
brw_compiler *compiler,
OPT(nir_opt_move_comparisons);
 
OPT(nir_lower_bool_to_int32);
+   OPT(brw_nir_lower_conversions);
 
OPT(nir_lower_locals_to_regs);
 
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index bc81950d47e..662b2627e95 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -114,6 +114,8 @@ void brw_nir_lower_tcs_outputs(nir_shader *nir, const 
struct brw_vue_map *vue,
GLenum tes_primitive_mode);
 void brw_nir_lower_fs_outputs(nir_shader *nir);
 
+bool brw_nir_lower_conversions(nir_shader *nir);
+
 bool brw_nir_lower_image_load_store(nir_shader *nir,
 const struct gen_device_info *devinfo);
 void brw_nir_rewrite_image_intrinsic(nir_intrinsic_instr *intrin,
diff --git a/src/intel/compiler/brw_nir_lower_conversions.c 
b/src/intel/compiler/brw_nir_lower_conversions.c
new file mode 100644
index 000..583167c7753
--- /dev/null
+++ b/src/intel/compiler/brw_nir_lower_conversions.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+static nir_op
+get_conversion_op(nir_alu_type src_type,
+  unsigned src_bit_size,
+  nir_alu_type dst_type,
+  unsigned dst_bit_size,
+  nir_rounding_mode rounding_mode)
+{
+   nir_alu_type src_full_type = (nir_alu_type) (src_type | src_bit_size);
+   nir_alu_type dst_full_type = (nir_alu_type) (dst_type | dst_bit_size);
+
+   return nir_type_conversion_op(src_full_type, dst_full_type, rounding_mode);
+}
+
+static nir_rounding_mode
+get_opcode_rounding_mode(nir_op op)
+{
+   switch (op) {
+   case nir_op_f2f16_rtz:
+  return nir_rounding_mode_rtz;
+   case nir_op_f2f16_rtne:
+  return nir_rounding_mode_rtne;
+   default:
+  return nir_rounding_mode_undef;
+   }
+}
+
+static void
+split_conversion(nir_builder *b, nir_alu_instr *alu, nir_op op1, nir_op op2)
+{
+   b->cursor = nir_before_instr(&alu->instr);
+   assert(alu->dest.write_mask == 1);
+   nir_ssa_def *src = nir_ssa_for_alu_src(b, alu, 0);
+   nir_ssa_def *tmp = nir_build_alu(b, op1, src, NULL, NULL, NULL);
+   nir_ssa_def *res = nir_build_alu(b, op2, tmp, NULL, NULL, NUL

[Mesa-dev] [PATCH v4] anv/device: fix maximum number of images supported

2019-01-14 Thread Iago Toral Quiroga
We had defined MAX_IMAGES as 8, which we used to size the array for
image push constant data. The comment there stated that this was for
gen8, but anv_nir_apply_pipeline_layout runs for all gens and writes
that array, asserting that we don't exceed that number of images,
which imposes a limit of MAX_IMAGES on all gens.

Furthermore, despite this, we are exposing up to 64 images per shader
stage on all gens, gen8 included.

This patch lowers the number of images we expose in gen8 to 8 and
keeps 64 images for gen9+ while making sure that only pre-SKL gens
use push constant space to handle images.

v2:
 - <= instead of < in the assert (Eric, Lionel)
 - Change the way the assertion is written (Eric)

v3:
 - Revert the way the assertion is written to the form it had in v1,
   the version in v2 was not equivalent and was incorrect. (Lionel)

v4:
 - gen9+ doesn't need push constants for images at all (Jason)
---
 src/intel/vulkan/anv_device.c |  7 --
 .../vulkan/anv_nir_apply_pipeline_layout.c|  4 +--
 src/intel/vulkan/anv_private.h|  5 ++--
 src/intel/vulkan/genX_cmd_buffer.c| 25 +--
 4 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 523f1483e29..f85458b672e 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -987,9 +987,12 @@ void anv_GetPhysicalDeviceProperties(
const uint32_t max_samplers = (devinfo->gen >= 8 || devinfo->is_haswell) ?
  128 : 16;
 
+   const uint32_t max_images = devinfo->gen < 9 ? MAX_GEN8_IMAGES : MAX_IMAGES;
+
VkSampleCountFlags sample_counts =
   isl_device_get_sample_counts(&pdevice->isl_dev);
 
+
VkPhysicalDeviceLimits limits = {
   .maxImageDimension1D  = (1 << 14),
   .maxImageDimension2D  = (1 << 14),
@@ -1009,7 +1012,7 @@ void anv_GetPhysicalDeviceProperties(
   .maxPerStageDescriptorUniformBuffers  = 64,
   .maxPerStageDescriptorStorageBuffers  = 64,
   .maxPerStageDescriptorSampledImages   = max_samplers,
-  .maxPerStageDescriptorStorageImages   = 64,
+  .maxPerStageDescriptorStorageImages   = max_images,
   .maxPerStageDescriptorInputAttachments= 64,
   .maxPerStageResources = 250,
   .maxDescriptorSetSamplers = 6 * max_samplers, /* number 
of stages * maxPerStageDescriptorSamplers */
@@ -1018,7 +1021,7 @@ void anv_GetPhysicalDeviceProperties(
   .maxDescriptorSetStorageBuffers   = 6 * 64,   /* number 
of stages * maxPerStageDescriptorStorageBuffers */
   .maxDescriptorSetStorageBuffersDynamic= MAX_DYNAMIC_BUFFERS / 2,
   .maxDescriptorSetSampledImages= 6 * max_samplers, /* number 
of stages * maxPerStageDescriptorSampledImages */
-  .maxDescriptorSetStorageImages= 6 * 64,   /* number 
of stages * maxPerStageDescriptorStorageImages */
+  .maxDescriptorSetStorageImages= 6 * max_images,   /* number 
of stages * maxPerStageDescriptorStorageImages */
   .maxDescriptorSetInputAttachments = 256,
   .maxVertexInputAttributes = MAX_VBS,
   .maxVertexInputBindings   = MAX_VBS,
diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c 
b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
index b3daf702bc0..623984b0f8c 100644
--- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
+++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
@@ -528,8 +528,8 @@ anv_nir_apply_pipeline_layout(const struct 
anv_physical_device *pdevice,
   }
}
 
-   if (map->image_count > 0) {
-  assert(map->image_count <= MAX_IMAGES);
+   if (map->image_count > 0 && pdevice->compiler->devinfo->gen < 9) {
+  assert(map->image_count <= MAX_GEN8_IMAGES);
   assert(shader->num_uniforms == prog_data->nr_params * 4);
   state.first_image_uniform = shader->num_uniforms;
   uint32_t *param = brw_stage_prog_data_add_params(prog_data,
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 770254e93ea..47878adb066 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -157,7 +157,8 @@ struct gen_l3_config;
 #define MAX_SCISSORS16
 #define MAX_PUSH_CONSTANTS_SIZE 128
 #define MAX_DYNAMIC_BUFFERS 16
-#define MAX_IMAGES 8
+#define MAX_IMAGES 64
+#define MAX_GEN8_IMAGES 8
 #define MAX_PUSH_DESCRIPTORS 32 /* Minimum requirement */
 
 /* The kernel relocation API has a limitation of a 32-bit delta value
@@ -1883,7 +1884,7 @@ struct anv_push_constants {
uint32_t base_work_group_id[3];
 
/* Image data for image_load_store on pre-SKL */
-   struct brw_image_param images[MAX_IMAGES];
+   struct brw_image_param images[MAX_GEN8_IMAGES];
 };
 
 struct anv_dynamic_state {
diff --git a/src/intel/vulkan/genX_cmd_buffer.c 
b/src/intel/vulkan/genX_cmd_b

  1   2   3   4   5   6   7   8   9   10   >