[Mesa-dev] [PATCH] intel/common: Improve the comments for sample positions

2017-10-12 Thread Jason Ekstrand
These are pulled directly from brw_multisample_state.h
---
 src/intel/common/gen_sample_positions.h | 65 +
 1 file changed, 65 insertions(+)

diff --git a/src/intel/common/gen_sample_positions.h 
b/src/intel/common/gen_sample_positions.h
index b86a7d8..f0ce95d 100644
--- a/src/intel/common/gen_sample_positions.h
+++ b/src/intel/common/gen_sample_positions.h
@@ -23,16 +23,38 @@
 #ifndef GEN_SAMPLE_POSITIONS_H
 #define GEN_SAMPLE_POSITIONS_H
 
+/*
+ * This file defines the standard multisample positions used by both GL and
+ * Vulkan.  These correspond to the Vulkan "standard sample locations".
+ */
+
+/**
+ * 1x MSAA has a single sample at the center: (0.5, 0.5) -> (0x8, 0x8).
+ */
 #define GEN_SAMPLE_POS_1X(prefix) \
 prefix##0XOffset   = 0.5; \
 prefix##0YOffset   = 0.5;
 
+/**
+ * 2x MSAA sample positions are (0.25, 0.25) and (0.75, 0.75):
+ *   4 c
+ * 4 0
+ * c   1
+ */
 #define GEN_SAMPLE_POS_2X(prefix) \
 prefix##0XOffset   = 0.25; \
 prefix##0YOffset   = 0.25; \
 prefix##1XOffset   = 0.75; \
 prefix##1YOffset   = 0.75;
 
+/**
+ * Sample positions:
+ *   2 6 a e
+ * 2   0
+ * 6   1
+ * a 2
+ * e 3
+ */
 #define GEN_SAMPLE_POS_4X(prefix) \
 prefix##0XOffset   = 0.375; \
 prefix##0YOffset   = 0.125; \
@@ -43,6 +65,28 @@ prefix##2YOffset   = 0.625; \
 prefix##3XOffset   = 0.625; \
 prefix##3YOffset   = 0.875;
 
+/**
+ * Sample positions:
+ *
+ * From the Ivy Bridge PRM, Vol2 Part1 p304 (3DSTATE_MULTISAMPLE:
+ * Programming Notes):
+ * "When programming the sample offsets (for NUMSAMPLES_4 or _8 and
+ * MSRASTMODE_xxx_PATTERN), the order of the samples 0 to 3 (or 7
+ * for 8X) must have monotonically increasing distance from the
+ * pixel center. This is required to get the correct centroid
+ * computation in the device."
+ *
+ * Sample positions:
+ *   1 3 5 7 9 b d f
+ * 1   7
+ * 3 3
+ * 5 0
+ * 7 5
+ * 9 2
+ * b   1
+ * d   4
+ * f   6
+ */
 #define GEN_SAMPLE_POS_8X(prefix) \
 prefix##0XOffset   = 0.5625; \
 prefix##0YOffset   = 0.3125; \
@@ -61,6 +105,27 @@ prefix##6YOffset   = 0.9375; \
 prefix##7XOffset   = 0.9375; \
 prefix##7YOffset   = 0.0625;
 
+/**
+ * Sample positions:
+ *
+ *0 1 2 3 4 5 6 7 8 9 a b c d e f
+ * 0   15
+ * 1  9
+ * 2 10
+ * 37
+ * 4   13
+ * 51
+ * 64
+ * 7  3
+ * 8 12
+ * 90
+ * a2
+ * b6
+ * c 11
+ * d  5
+ * e  8
+ * f 14
+ */
 #define GEN_SAMPLE_POS_16X(prefix) \
 prefix##0XOffset   = 0.5625; \
 prefix##0YOffset   = 0.5625; \
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] anv: Get rid of gen fall-through

2017-10-12 Thread Jason Ekstrand
In the early days of the Vulkan driver, we thought it would be a good
idea to just make genN just fall back to the genN-1 code if it didn't
need to be any different for genN.  While this seemed like a good idea,
it ultimately ended up being far simpler to just recompile everything.
We haven't been using the fall-through functionality for some time so
we're better off just deleting it so it doesn't accidentally start
causing problems.
---
 src/intel/vulkan/anv_entrypoints_gen.py | 34 -
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/intel/vulkan/anv_entrypoints_gen.py 
b/src/intel/vulkan/anv_entrypoints_gen.py
index bf376a8..a4ecbf2 100644
--- a/src/intel/vulkan/anv_entrypoints_gen.py
+++ b/src/intel/vulkan/anv_entrypoints_gen.py
@@ -160,31 +160,31 @@ anv_resolve_entrypoint(const struct gen_device_info 
*devinfo, uint32_t index)
   return anv_layer.entrypoints[index];
}
 
+   const struct anv_dispatch_table *genX_table;
switch (devinfo->gen) {
case 10:
-  if (gen10_layer.entrypoints[index])
- return gen10_layer.entrypoints[index];
-  /* fall through */
+  genX_table = _layer;
+  break;
case 9:
-  if (gen9_layer.entrypoints[index])
- return gen9_layer.entrypoints[index];
-  /* fall through */
+  genX_table = _layer;
+  break;
case 8:
-  if (gen8_layer.entrypoints[index])
- return gen8_layer.entrypoints[index];
-  /* fall through */
+  genX_table = _layer;
+  break;
case 7:
-  if (devinfo->is_haswell && gen75_layer.entrypoints[index])
- return gen75_layer.entrypoints[index];
-
-  if (gen7_layer.entrypoints[index])
- return gen7_layer.entrypoints[index];
-  /* fall through */
-   case 0:
-  return anv_layer.entrypoints[index];
+  if (devinfo->is_haswell)
+ genX_table = _layer;
+  else
+ genX_table = _layer;
+  break;
default:
   unreachable("unsupported gen\\n");
}
+
+   if (genX_table->entrypoints[index])
+  return genX_table->entrypoints[index];
+   else
+  return anv_layer.entrypoints[index];
 }
 
 /* Hash table stats:
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 50/52] spirv: Rework barriers

2017-10-12 Thread Jason Ekstrand
Our previous handling of barriers always used the big hammer and didn't
correctly emit memory barriers when specified along with a control
barrier.  This commit completely reworks the way we emit barriers to
make things both more precise and more correct.
---
 src/compiler/spirv/spirv_to_nir.c | 132 --
 1 file changed, 114 insertions(+), 18 deletions(-)

diff --git a/src/compiler/spirv/spirv_to_nir.c 
b/src/compiler/spirv/spirv_to_nir.c
index 079ff0f..a729ef4 100644
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -2571,36 +2571,132 @@ vtn_handle_composite(struct vtn_builder *b, SpvOp 
opcode,
 }
 
 static void
+vtn_emit_barrier(struct vtn_builder *b, nir_intrinsic_op op)
+{
+   nir_intrinsic_instr *intrin = nir_intrinsic_instr_create(b->shader, op);
+   nir_builder_instr_insert(>nb, >instr);
+}
+
+static void
+vtn_emit_memory_barrier(struct vtn_builder *b, SpvScope scope,
+SpvMemorySemanticsMask semantics)
+{
+   static const SpvMemorySemanticsMask all_memory_semantics =
+  SpvMemorySemanticsUniformMemoryMask |
+  SpvMemorySemanticsWorkgroupMemoryMask |
+  SpvMemorySemanticsAtomicCounterMemoryMask |
+  SpvMemorySemanticsImageMemoryMask;
+
+   /* If we're not actually doing a memory barrier, bail */
+   if (!(semantics & all_memory_semantics))
+  return;
+
+   /* GL and Vulkan don't have these */
+   assert(scope != SpvScopeCrossDevice);
+
+   if (scope == SpvScopeSubgroup)
+  return; /* Nothing to do here */
+
+   if (scope == SpvScopeWorkgroup) {
+  vtn_emit_barrier(b, nir_intrinsic_group_memory_barrier);
+  return;
+   }
+
+   /* There's only two scopes thing left */
+   assert(scope == SpvScopeInvocation || scope == SpvScopeDevice);
+
+   if ((semantics & all_memory_semantics) == all_memory_semantics) {
+  vtn_emit_barrier(b, nir_intrinsic_memory_barrier);
+  return;
+   }
+
+   /* Issue a bunch of more specific barriers */
+   uint32_t bits = semantics;
+   while (bits) {
+  SpvMemorySemanticsMask semantic = 1 << u_bit_scan();
+  switch (semantic) {
+  case SpvMemorySemanticsUniformMemoryMask:
+ vtn_emit_barrier(b, nir_intrinsic_memory_barrier_buffer);
+ break;
+  case SpvMemorySemanticsWorkgroupMemoryMask:
+ vtn_emit_barrier(b, nir_intrinsic_memory_barrier_shared);
+ break;
+  case SpvMemorySemanticsAtomicCounterMemoryMask:
+ vtn_emit_barrier(b, nir_intrinsic_memory_barrier_atomic_counter);
+ break;
+  case SpvMemorySemanticsImageMemoryMask:
+ vtn_emit_barrier(b, nir_intrinsic_memory_barrier_image);
+ break;
+  default:
+ break;;
+  }
+   }
+}
+
+static void
 vtn_handle_barrier(struct vtn_builder *b, SpvOp opcode,
const uint32_t *w, unsigned count)
 {
-   nir_intrinsic_op intrinsic_op;
switch (opcode) {
case SpvOpEmitVertex:
case SpvOpEmitStreamVertex:
-  intrinsic_op = nir_intrinsic_emit_vertex;
-  break;
case SpvOpEndPrimitive:
-   case SpvOpEndStreamPrimitive:
-  intrinsic_op = nir_intrinsic_end_primitive;
-  break;
-   case SpvOpMemoryBarrier:
-  intrinsic_op = nir_intrinsic_memory_barrier;
-  break;
-   case SpvOpControlBarrier:
-  intrinsic_op = nir_intrinsic_barrier;
+   case SpvOpEndStreamPrimitive: {
+  nir_intrinsic_op intrinsic_op;
+  switch (opcode) {
+  case SpvOpEmitVertex:
+  case SpvOpEmitStreamVertex:
+ intrinsic_op = nir_intrinsic_emit_vertex;
+ break;
+  case SpvOpEndPrimitive:
+  case SpvOpEndStreamPrimitive:
+ intrinsic_op = nir_intrinsic_end_primitive;
+ break;
+  default:
+ unreachable("Invalid opcode");
+  }
+
+  nir_intrinsic_instr *intrin =
+ nir_intrinsic_instr_create(b->shader, intrinsic_op);
+
+  switch (opcode) {
+  case SpvOpEmitStreamVertex:
+  case SpvOpEndStreamPrimitive:
+ nir_intrinsic_set_stream_id(intrin, w[1]);
+ break;
+  default:
+ break;
+  }
+
+  nir_builder_instr_insert(>nb, >instr);
   break;
-   default:
-  unreachable("unknown barrier instruction");
}
 
-   nir_intrinsic_instr *intrin =
-  nir_intrinsic_instr_create(b->shader, intrinsic_op);
+   case SpvOpMemoryBarrier: {
+  SpvScope scope = vtn_constant_value(b, w[1])->values[0].u32[0];
+  SpvMemorySemanticsMask semantics =
+ vtn_constant_value(b, w[2])->values[0].u32[0];
+  vtn_emit_memory_barrier(b, scope, semantics);
+  return;
+   }
+
+   case SpvOpControlBarrier: {
+  SpvScope execution_scope =
+ vtn_constant_value(b, w[1])->values[0].u32[0];
+  if (execution_scope == SpvScopeWorkgroup)
+ vtn_emit_barrier(b, nir_intrinsic_barrier);
 
-   if (opcode == SpvOpEmitStreamVertex || opcode == SpvOpEndStreamPrimitive)
-  nir_intrinsic_set_stream_id(intrin, w[1]);
+  SpvScope memory_scope =
+ 

[Mesa-dev] [PATCH v2 47/52] nir/lower_subgroups: Lower ballot intrinsics to the specified bit size

2017-10-12 Thread Jason Ekstrand
Ballot intrinsics return a bitfield of subgroups.  In GLSL and some
SPIR-V extensions, they return a uint64_t.  In SPV_KHR_shader_ballot,
they return a uvec4.  Also, some back-ends would rather pass around
32-bit values because it's easier than messing with 64-bit all the time.
To solve this mess, we make nir_lower_subgroups take a new parameter
called ballot_bit_size and it lowers whichever thing it gets in from the
source language (uint64_t or uvec4) to a scalar with the specified
number of bits.  This replaces a chunk of the old lowering code.
---
 src/compiler/nir/nir.h |   3 +-
 src/compiler/nir/nir_lower_subgroups.c | 101 +++--
 src/compiler/nir/nir_opt_intrinsics.c  |  18 --
 src/intel/compiler/brw_compiler.c  |   1 -
 src/intel/compiler/brw_nir.c   |   1 +
 5 files changed, 98 insertions(+), 26 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 17efc9b..47c3f21 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1850,8 +1850,6 @@ typedef struct nir_shader_compiler_options {
 */
bool use_interpolated_input_intrinsics;
 
-   unsigned max_subgroup_size;
-
unsigned max_unroll_iterations;
 } nir_shader_compiler_options;
 
@@ -2467,6 +2465,7 @@ bool nir_lower_samplers_as_deref(nir_shader *shader,
  const struct gl_shader_program 
*shader_program);
 
 typedef struct nir_lower_subgroups_options {
+   uint8_t ballot_bit_size;
bool lower_to_scalar:1;
bool lower_vote_trivial:1;
bool lower_subgroup_masks:1;
diff --git a/src/compiler/nir/nir_lower_subgroups.c 
b/src/compiler/nir/nir_lower_subgroups.c
index 02738c4..1969740 100644
--- a/src/compiler/nir/nir_lower_subgroups.c
+++ b/src/compiler/nir/nir_lower_subgroups.c
@@ -28,6 +28,43 @@
  * \file nir_opt_intrinsics.c
  */
 
+/* Converts a uint32_t or uint64_t value to uint64_t or uvec4 */
+static nir_ssa_def *
+uint_to_ballot_type(nir_builder *b, nir_ssa_def *value,
+unsigned num_components, unsigned bit_size,
+uint32_t extend_val)
+{
+   assert(value->num_components == 1);
+   assert(value->bit_size == 32 || value->bit_size == 64);
+
+   nir_ssa_def *extend = nir_imm_int(b, extend_val);
+   if (num_components > 1) {
+  /* SPIR-V uses a uvec4 for ballot values */
+  assert(num_components == 4);
+  assert(bit_size == 32);
+
+  if (value->bit_size == 32) {
+ return nir_vec4(b, value, extend, extend, extend);
+  } else {
+ assert(value->bit_size == 64);
+ return nir_vec4(b, nir_unpack_64_2x32_split_x(b, value),
+nir_unpack_64_2x32_split_y(b, value),
+extend, extend);
+  }
+   } else {
+  /* GLSL uses a uint64_t for ballot values */
+  assert(num_components == 1);
+  assert(bit_size == 64);
+
+  if (value->bit_size == 32) {
+ return nir_pack_64_2x32_split(b, value, extend);
+  } else {
+ assert(value->bit_size == 64);
+ return value;
+  }
+   }
+}
+
 static nir_ssa_def *
 lower_read_invocation_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
 {
@@ -86,24 +123,78 @@ lower_subgroups_intrin(nir_builder *b, nir_intrinsic_instr 
*intrin,
   if (!options->lower_subgroup_masks)
  return NULL;
 
+  uint64_t mask;
+  switch (intrin->intrinsic) {
+  case nir_intrinsic_load_subgroup_eq_mask:
+ mask = 1ull;
+ break;
+  case nir_intrinsic_load_subgroup_ge_mask:
+  case nir_intrinsic_load_subgroup_lt_mask:
+ mask = ~0ull;
+ break;
+  case nir_intrinsic_load_subgroup_gt_mask:
+  case nir_intrinsic_load_subgroup_le_mask:
+ mask = ~1ull;
+ break;
+  default:
+ unreachable("you seriously can't tell this is unreachable?");
+  }
+
   nir_ssa_def *count = nir_load_subgroup_invocation(b);
+  nir_ssa_def *shifted;
+  if (options->ballot_bit_size == 32 && intrin->dest.ssa.bit_size == 32) {
+ assert(intrin->dest.ssa.num_components == 4);
+ shifted = nir_ishl(b, nir_imm_int(b, mask), count);
+  } else {
+ /* We're either working with 64-bit types natively or we're in OpenGL
+  * where we want a uint64_t as our final value.  In either case we
+  * know that we have 64-bit types.  In the first case, we need to use
+  * 64 bits because of the native subgroup size.  In the second, we
+  * want a 64-bit result and a 64-bit shift is likely more efficient
+  * than messing around with 32-bit shifts and packing.
+  */
+ assert(options->ballot_bit_size == 64 ||
+intrin->dest.ssa.bit_size == 64);
+ shifted = nir_ishl(b, nir_imm_int64(b, mask), count);
+  }
+
+  nir_ssa_def *ballot =
+ uint_to_ballot_type(b, shifted,
+ intrin->dest.ssa.num_components,
+ 

[Mesa-dev] [PATCH v2 51/52] nir: Validate base types on array dereferences

2017-10-12 Thread Jason Ekstrand
We were already validating that the parent type goes along with the
child type but we weren't actually validating that the parent type is
reasonable.  This fixes that.
---
 src/compiler/nir/nir_validate.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c
index cdbe6a6..fc74dea 100644
--- a/src/compiler/nir/nir_validate.c
+++ b/src/compiler/nir/nir_validate.c
@@ -397,7 +397,8 @@ validate_alu_instr(nir_alu_instr *instr, validate_state 
*state)
 }
 
 static void
-validate_deref_chain(nir_deref *deref, validate_state *state)
+validate_deref_chain(nir_deref *deref, nir_variable_mode mode,
+ validate_state *state)
 {
validate_assert(state, deref->child == NULL || ralloc_parent(deref->child) 
== deref);
 
@@ -405,6 +406,19 @@ validate_deref_chain(nir_deref *deref, validate_state 
*state)
while (deref != NULL) {
   switch (deref->deref_type) {
   case nir_deref_type_array:
+ if (mode == nir_var_shared) {
+/* Shared variables have a bit more relaxed rules because we need
+ * to be able to handle array derefs on vectors.  Fortunately,
+ * nir_lower_io handles these just fine.
+ */
+validate_assert(state, glsl_type_is_array(parent->type) ||
+   glsl_type_is_matrix(parent->type) ||
+   glsl_type_is_vector(parent->type));
+ } else {
+/* Most of NIR cannot handle array derefs on vectors */
+validate_assert(state, glsl_type_is_array(parent->type) ||
+   glsl_type_is_matrix(parent->type));
+ }
  validate_assert(state, deref->type == 
glsl_get_array_element(parent->type));
  if (nir_deref_as_array(deref)->deref_array_type ==
  nir_deref_array_type_indirect)
@@ -451,7 +465,7 @@ validate_deref_var(void *parent_mem_ctx, nir_deref_var 
*deref, validate_state *s
 
validate_var_use(deref->var, state);
 
-   validate_deref_chain(>deref, state);
+   validate_deref_chain(>deref, deref->var->data.mode, state);
 }
 
 static void
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 52/52] compiler/nir_types: Handle vectors in glsl_get_array_element

2017-10-12 Thread Jason Ekstrand
Most of NIR doesn't allow doing array indexing on a vector (though it
does on a matrix).  However, nir_lower_io handles it just fine and this
behavior is needed for shared variables in Vulkan.  This commit makes
glsl_get_array_element do something sensible for vector types and makes
nir_validate happy with them.
---
 src/compiler/nir_types.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/compiler/nir_types.cpp b/src/compiler/nir_types.cpp
index 5583bc0..978f7d7 100644
--- a/src/compiler/nir_types.cpp
+++ b/src/compiler/nir_types.cpp
@@ -39,6 +39,8 @@ glsl_get_array_element(const glsl_type* type)
 {
if (type->is_matrix())
   return type->column_type();
+   else if (type->is_vector())
+  return type->get_scalar_type();
return type->fields.array;
 }
 
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 49/52] spirv: Add a vtn_constant_value helper

2017-10-12 Thread Jason Ekstrand
---
 src/compiler/spirv/vtn_private.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/src/compiler/spirv/vtn_private.h b/src/compiler/spirv/vtn_private.h
index 8458462..e7a7c36 100644
--- a/src/compiler/spirv/vtn_private.h
+++ b/src/compiler/spirv/vtn_private.h
@@ -557,6 +557,12 @@ vtn_value(struct vtn_builder *b, uint32_t value_id,
return val;
 }
 
+static inline nir_constant *
+vtn_constant_value(struct vtn_builder *b, uint32_t value_id)
+{
+   return vtn_value(b, value_id, vtn_value_type_constant)->constant;
+}
+
 void _vtn_warn(const char *file, int line, const char *msg, ...);
 #define vtn_warn(...) _vtn_warn(__FILE__, __LINE__, __VA_ARGS__)
 
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 44/52] nir: Add a ssa_dest_init_for_type helper

2017-10-12 Thread Jason Ekstrand
This would be useful a number of places
---
 src/compiler/nir/nir.h | 9 +
 1 file changed, 9 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 1154c42..17efc9b 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2227,6 +2227,15 @@ void nir_ssa_dest_init(nir_instr *instr, nir_dest *dest,
 void nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
   unsigned num_components, unsigned bit_size,
   const char *name);
+static inline void
+nir_ssa_dest_init_for_type(nir_instr *instr, nir_dest *dest,
+   const struct glsl_type *type,
+   const char *name)
+{
+   assert(glsl_type_is_vector_or_scalar(type));
+   nir_ssa_dest_init(instr, dest, glsl_get_components(type),
+ glsl_get_bit_size(type), name);
+}
 void nir_ssa_def_rewrite_uses(nir_ssa_def *def, nir_src new_src);
 void nir_ssa_def_rewrite_uses_after(nir_ssa_def *def, nir_src new_src,
 nir_instr *after_me);
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 42/52] nir/opt_intrinsics: Rework progress

2017-10-12 Thread Jason Ekstrand
This commit fixes two issues:  First, we were returning false regardless
of whether or not the function made progress.  Second, we were calling
nir_metadata_preserve far more often than needed; we only need to call
it once per impl.
---
 src/compiler/nir/nir_opt_intrinsics.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/compiler/nir/nir_opt_intrinsics.c 
b/src/compiler/nir/nir_opt_intrinsics.c
index f12dc87..26a0f96 100644
--- a/src/compiler/nir/nir_opt_intrinsics.c
+++ b/src/compiler/nir/nir_opt_intrinsics.c
@@ -121,8 +121,6 @@ opt_intrinsics_impl(nir_function_impl *impl)
  nir_ssa_def_rewrite_uses(>dest.ssa,
   nir_src_for_ssa(replacement));
  nir_instr_remove(instr);
- nir_metadata_preserve(impl, nir_metadata_block_index |
- nir_metadata_dominance);
  progress = true;
   }
}
@@ -136,9 +134,15 @@ nir_opt_intrinsics(nir_shader *shader)
bool progress = false;
 
nir_foreach_function(function, shader) {
-  if (function->impl)
- progress |= opt_intrinsics_impl(function->impl);
+  if (!function->impl)
+ continue;
+
+  if (opt_intrinsics_impl(function->impl)) {
+ progress = true;
+ nir_metadata_preserve(function->impl, nir_metadata_block_index |
+   nir_metadata_dominance);
+  }
}
 
-   return false;
+   return progress;
 }
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 43/52] nir: Add a new subgroups lowering pass

2017-10-12 Thread Jason Ekstrand
This commit pulls nir_lower_read_invocations_to_scalar along with most
of the guts of nir_opt_intrinsics (which mostly does subgroup lowering)
into a new nir_lower_subgroups pass.  There are various other bits of
subgroup lowering that we're going to want to do so it makes a bit more
sense to keep it all together in one pass.  We also move it in i965 to
happen after nir_lower_system_values to ensure that because we want to
handle the subgroup mask system value intrinsics here.
---
 src/compiler/Makefile.sources  |   2 +-
 src/compiler/nir/nir.h |  12 +-
 .../nir/nir_lower_read_invocation_to_scalar.c  | 112 --
 src/compiler/nir/nir_lower_subgroups.c | 161 +
 src/compiler/nir/nir_opt_intrinsics.c  |  51 +--
 src/intel/compiler/brw_compiler.c  |   3 -
 src/intel/compiler/brw_nir.c   |   8 +-
 7 files changed, 184 insertions(+), 165 deletions(-)
 delete mode 100644 src/compiler/nir/nir_lower_read_invocation_to_scalar.c
 create mode 100644 src/compiler/nir/nir_lower_subgroups.c

diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 2724a41..912c003 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -232,11 +232,11 @@ NIR_FILES = \
nir/nir_lower_passthrough_edgeflags.c \
nir/nir_lower_patch_vertices.c \
nir/nir_lower_phis_to_scalar.c \
-   nir/nir_lower_read_invocation_to_scalar.c \
nir/nir_lower_regs_to_ssa.c \
nir/nir_lower_returns.c \
nir/nir_lower_samplers.c \
nir/nir_lower_samplers_as_deref.c \
+   nir/nir_lower_subgroups.c \
nir/nir_lower_system_values.c \
nir/nir_lower_tex.c \
nir/nir_lower_to_source_mods.c \
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 5af1503..1154c42 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1831,9 +1831,6 @@ typedef struct nir_shader_compiler_options {
bool lower_extract_byte;
bool lower_extract_word;
 
-   bool lower_vote_trivial;
-   bool lower_subgroup_masks;
-
/**
 * Does the driver support real 32-bit integers?  (Otherwise, integers
 * are simulated by floats.)
@@ -2460,6 +2457,15 @@ bool nir_lower_samplers(nir_shader *shader,
 bool nir_lower_samplers_as_deref(nir_shader *shader,
  const struct gl_shader_program 
*shader_program);
 
+typedef struct nir_lower_subgroups_options {
+   bool lower_to_scalar:1;
+   bool lower_vote_trivial:1;
+   bool lower_subgroup_masks:1;
+} nir_lower_subgroups_options;
+
+bool nir_lower_subgroups(nir_shader *shader,
+ const nir_lower_subgroups_options *options);
+
 bool nir_lower_system_values(nir_shader *shader);
 
 typedef struct nir_lower_tex_options {
diff --git a/src/compiler/nir/nir_lower_read_invocation_to_scalar.c 
b/src/compiler/nir/nir_lower_read_invocation_to_scalar.c
deleted file mode 100644
index 69e7c0a..000
--- a/src/compiler/nir/nir_lower_read_invocation_to_scalar.c
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright © 2017 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "nir.h"
-#include "nir_builder.h"
-
-/** @file nir_lower_read_invocation_to_scalar.c
- *
- * Replaces nir_intrinsic_read_invocation/nir_intrinsic_read_first_invocation
- * operations with num_components != 1 with individual per-channel operations.
- */
-
-static void
-lower_read_invocation_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
-{
-   b->cursor = nir_before_instr(>instr);
-
-   nir_ssa_def *value = nir_ssa_for_src(b, intrin->src[0], 
intrin->num_components);
-   nir_ssa_def *reads[4];
-
-   for (unsigned i = 0; i < intrin->num_components; i++) {
-  nir_intrinsic_instr *chan_intrin =
- nir_intrinsic_instr_create(b->shader, 

[Mesa-dev] [PATCH v2 48/52] nir, intel/compiler: Use a fixed subgroup size

2017-10-12 Thread Jason Ekstrand
The GL_ARB_shader_ballot spec says that gl_SubGroupSizeARB is declared
as a uniform.  This means that it cannot change across an invocation
such as a draw call or a compute dispatch.  For compute shaders, we're
ok because we only ever use one dispatch size.  For fragment, however,
the hardware dynamically chooses between SIMD8 and SIMD16 which violates
the spec.  Instead, let's just pick a subgroup size based on the shader
stage.  The fixed size we choose for compute shaders is a bit higher
than strictly needed but there's no real harm in that.  The advantage is
that, if they do anything interesting with the value, NIR will see it as
an immediate and can optimize better.
---
 src/compiler/nir/nir.h | 1 +
 src/compiler/nir/nir_lower_subgroups.c | 5 +
 src/intel/compiler/brw_fs_nir.cpp  | 4 
 src/intel/compiler/brw_nir.c   | 2 ++
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 47c3f21..1a87d66 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2465,6 +2465,7 @@ bool nir_lower_samplers_as_deref(nir_shader *shader,
  const struct gl_shader_program 
*shader_program);
 
 typedef struct nir_lower_subgroups_options {
+   uint8_t subgroup_size;
uint8_t ballot_bit_size;
bool lower_to_scalar:1;
bool lower_vote_trivial:1;
diff --git a/src/compiler/nir/nir_lower_subgroups.c 
b/src/compiler/nir/nir_lower_subgroups.c
index 1969740..f9424c9 100644
--- a/src/compiler/nir/nir_lower_subgroups.c
+++ b/src/compiler/nir/nir_lower_subgroups.c
@@ -109,6 +109,11 @@ lower_subgroups_intrin(nir_builder *b, nir_intrinsic_instr 
*intrin,
  return nir_imm_int(b, NIR_TRUE);
   break;
 
+   case nir_intrinsic_load_subgroup_size:
+  if (options->subgroup_size)
+ return nir_imm_int(b, options->subgroup_size);
+  break;
+
case nir_intrinsic_read_invocation:
case nir_intrinsic_read_first_invocation:
   if (options->lower_to_scalar)
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index b0dacb1..58f2698 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4183,10 +4183,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   break;
}
 
-   case nir_intrinsic_load_subgroup_size:
-  bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(dispatch_width));
-  break;
-
case nir_intrinsic_load_subgroup_invocation:
   bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
   nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 57f8de7..560b2f2 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -637,6 +637,8 @@ brw_preprocess_nir(const struct brw_compiler *compiler, 
nir_shader *nir)
OPT(nir_lower_system_values);
 
const nir_lower_subgroups_options subgroups_options = {
+  .subgroup_size = nir->stage == MESA_SHADER_COMPUTE ? 32 :
+   nir->stage == MESA_SHADER_FRAGMENT ? 16 : 8,
   .ballot_bit_size = 32,
   .lower_to_scalar = true,
   .lower_subgroup_masks = true,
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 46/52] nir/lower_system_values: Lower SUBGROUP_*_MASK based on type

2017-10-12 Thread Jason Ekstrand
The SUBGROUP_*_MASK system values are uint64_t when coming in from GLSL
but uvec4 when coming in from SPIR-V.  Lowering based on type allows us
to nicely handle both.
---
 src/compiler/nir/nir_lower_system_values.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_lower_system_values.c 
b/src/compiler/nir/nir_lower_system_values.c
index c21a468..f3db3847 100644
--- a/src/compiler/nir/nir_lower_system_values.c
+++ b/src/compiler/nir/nir_lower_system_values.c
@@ -124,8 +124,9 @@ convert_block(nir_block *block, nir_builder *b)
  nir_intrinsic_op op =
 nir_intrinsic_from_system_value(var->data.location);
  nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, op);
- nir_ssa_dest_init(>instr, >dest, 1, 64, NULL);
- load->num_components = 1;
+ nir_ssa_dest_init_for_type(>instr, >dest,
+var->type, NULL);
+ load->num_components = load->dest.ssa.num_components;
  nir_builder_instr_insert(b, >instr);
  sysval = >dest.ssa;
  break;
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 39/52] nir/lower_wpos_ytransform: Support system value intrinsics

2017-10-12 Thread Jason Ekstrand
---
 src/compiler/nir/nir_lower_wpos_ytransform.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/src/compiler/nir/nir_lower_wpos_ytransform.c 
b/src/compiler/nir/nir_lower_wpos_ytransform.c
index 771c6ff..425e4b8 100644
--- a/src/compiler/nir/nir_lower_wpos_ytransform.c
+++ b/src/compiler/nir/nir_lower_wpos_ytransform.c
@@ -314,6 +314,10 @@ lower_wpos_ytransform_block(lower_wpos_ytransform_state 
*state, nir_block *block
assert(dvar->deref.child == NULL);
lower_load_sample_pos(state, intr);
 }
+ } else if (intr->intrinsic == nir_intrinsic_load_frag_coord) {
+lower_fragcoord(state, intr);
+ } else if (intr->intrinsic == nir_intrinsic_load_sample_pos) {
+lower_load_sample_pos(state, intr);
  } else if (intr->intrinsic == nir_intrinsic_interp_var_at_offset) {
 lower_interp_var_at_offset(state, intr);
  }
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 40/52] i965/program: Move nir_lower_system_values higher up

2017-10-12 Thread Jason Ekstrand
We want this to get called before nir_lower_subgroups which is going in
brw_preprocess_nir.  Now that nir_lower_wpos_ytransform can handle
system values, this should be safe to do.
---
 src/mesa/drivers/dri/i965/brw_program.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_program.c 
b/src/mesa/drivers/dri/i965/brw_program.c
index 3b54b37..ebb6998 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -89,6 +89,8 @@ brw_create_nir(struct brw_context *brw,
 
nir = brw_preprocess_nir(brw->screen->compiler, nir);
 
+   NIR_PASS_V(nir, nir_lower_system_values);
+
if (stage == MESA_SHADER_FRAGMENT) {
   static const struct nir_lower_wpos_ytransform_options wpos_options = {
  .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
@@ -104,7 +106,6 @@ brw_create_nir(struct brw_context *brw,
   }
}
 
-   NIR_PASS_V(nir, nir_lower_system_values);
NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);
 
return nir;
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 45/52] nir: Make ballot intrinsics variable-size

2017-10-12 Thread Jason Ekstrand
This way they can return either a uvec4 or a uint64_t.  At the moment,
this is a no-op since we still always return a uint64_t.
---
 src/compiler/glsl/glsl_to_nir.cpp  |  1 +
 src/compiler/nir/nir_intrinsics.h  | 12 ++--
 src/compiler/nir/nir_lower_system_values.c |  1 +
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/compiler/glsl/glsl_to_nir.cpp 
b/src/compiler/glsl/glsl_to_nir.cpp
index 5e9544f..6110aa9 100644
--- a/src/compiler/glsl/glsl_to_nir.cpp
+++ b/src/compiler/glsl/glsl_to_nir.cpp
@@ -1165,6 +1165,7 @@ nir_visitor::visit(ir_call *ir)
   case nir_intrinsic_ballot: {
  nir_ssa_dest_init(>instr, >dest,
ir->return_deref->type->vector_elements, 64, NULL);
+ instr->num_components = ir->return_deref->type->vector_elements;
 
  ir_rvalue *value = (ir_rvalue *) ir->actual_parameters.get_head();
  instr->src[0] = nir_src_for_ssa(evaluate_rvalue(value));
diff --git a/src/compiler/nir/nir_intrinsics.h 
b/src/compiler/nir/nir_intrinsics.h
index 54a51f8..c346c0e 100644
--- a/src/compiler/nir/nir_intrinsics.h
+++ b/src/compiler/nir/nir_intrinsics.h
@@ -102,7 +102,7 @@ INTRINSIC(shader_clock, 0, ARR(0), true, 2, 0, 0, xx, xx, 
xx, NIR_INTRINSIC_CAN_
  *
  * GLSL functions from ARB_shader_ballot.
  */
-INTRINSIC(ballot, 1, ARR(1), true, 1, 0, 0, xx, xx, xx, 
NIR_INTRINSIC_CAN_ELIMINATE)
+INTRINSIC(ballot, 1, ARR(1), true, 0, 0, 0, xx, xx, xx, 
NIR_INTRINSIC_CAN_ELIMINATE)
 INTRINSIC(read_invocation, 2, ARR(0, 1), true, 0, 0, 0, xx, xx, xx, 
NIR_INTRINSIC_CAN_ELIMINATE)
 INTRINSIC(read_first_invocation, 1, ARR(0), true, 0, 0, 0, xx, xx, xx, 
NIR_INTRINSIC_CAN_ELIMINATE)
 
@@ -350,11 +350,11 @@ SYSTEM_VALUE(layer_id, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(view_index, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(subgroup_size, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(subgroup_invocation, 1, 0, xx, xx, xx)
-SYSTEM_VALUE(subgroup_eq_mask, 1, 0, xx, xx, xx)
-SYSTEM_VALUE(subgroup_ge_mask, 1, 0, xx, xx, xx)
-SYSTEM_VALUE(subgroup_gt_mask, 1, 0, xx, xx, xx)
-SYSTEM_VALUE(subgroup_le_mask, 1, 0, xx, xx, xx)
-SYSTEM_VALUE(subgroup_lt_mask, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_eq_mask, 0, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_ge_mask, 0, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_gt_mask, 0, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_le_mask, 0, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_lt_mask, 0, 0, xx, xx, xx)
 SYSTEM_VALUE(subgroup_id, 1, 0, xx, xx, xx)
 
 /* Blend constant color values.  Float values are clamped. */
diff --git a/src/compiler/nir/nir_lower_system_values.c 
b/src/compiler/nir/nir_lower_system_values.c
index ba20d30..c21a468 100644
--- a/src/compiler/nir/nir_lower_system_values.c
+++ b/src/compiler/nir/nir_lower_system_values.c
@@ -125,6 +125,7 @@ convert_block(nir_block *block, nir_builder *b)
 nir_intrinsic_from_system_value(var->data.location);
  nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, op);
  nir_ssa_dest_init(>instr, >dest, 1, 64, NULL);
+ load->num_components = 1;
  nir_builder_instr_insert(b, >instr);
  sysval = >dest.ssa;
  break;
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 41/52] intel/compiler: Call nir_lower_system_values in brw_preprocess_nir

2017-10-12 Thread Jason Ekstrand
---
 src/intel/compiler/brw_nir.c| 2 ++
 src/intel/vulkan/anv_pipeline.c | 2 --
 src/mesa/drivers/dri/i965/brw_program.c | 2 --
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index a04f4af..0a41768 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -635,6 +635,8 @@ brw_preprocess_nir(const struct brw_compiler *compiler, 
nir_shader *nir)
/* Lower a bunch of stuff */
OPT(nir_lower_var_copies);
 
+   OPT(nir_lower_system_values);
+
OPT(nir_lower_clip_cull_distance_arrays);
 
nir_variable_mode indirect_mask = 0;
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 1fd54dc..491d640 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -194,8 +194,6 @@ anv_shader_compile_to_nir(struct anv_pipeline *pipeline,
 
nir = brw_preprocess_nir(compiler, nir);
 
-   NIR_PASS_V(nir, nir_lower_system_values);
-
if (stage == MESA_SHADER_FRAGMENT)
   NIR_PASS_V(nir, anv_nir_lower_input_attachments);
 
diff --git a/src/mesa/drivers/dri/i965/brw_program.c 
b/src/mesa/drivers/dri/i965/brw_program.c
index ebb6998..6925121 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -89,8 +89,6 @@ brw_create_nir(struct brw_context *brw,
 
nir = brw_preprocess_nir(brw->screen->compiler, nir);
 
-   NIR_PASS_V(nir, nir_lower_system_values);
-
if (stage == MESA_SHADER_FRAGMENT) {
   static const struct nir_lower_wpos_ytransform_options wpos_options = {
  .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 38/52] anv/pipeline: Call nir_lower_system_valaues after brw_preprocess_nir

2017-10-12 Thread Jason Ekstrand
We currently have a bug where nir_lower_system_values gets called before
nir_lower_var_copies so it will miss any system value uses which come
from a copy_var intrinsic.  Moving it to after brw_preprocess_nir fixes
this problem.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/vulkan/anv_pipeline.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 9645d68..1fd54dc 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -188,13 +188,14 @@ anv_shader_compile_to_nir(struct anv_pipeline *pipeline,
NIR_PASS_V(nir, nir_propagate_invariant);
NIR_PASS_V(nir, nir_lower_io_to_temporaries,
   entry_point->impl, true, false);
-   NIR_PASS_V(nir, nir_lower_system_values);
 
/* Vulkan uses the separate-shader linking model */
nir->info.separate_shader = true;
 
nir = brw_preprocess_nir(compiler, nir);
 
+   NIR_PASS_V(nir, nir_lower_system_values);
+
if (stage == MESA_SHADER_FRAGMENT)
   NIR_PASS_V(nir, anv_nir_lower_input_attachments);
 
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 33/52] intel/eu: Explicitly set EXECUTE_1 where needed

2017-10-12 Thread Jason Ekstrand
---
 src/intel/compiler/brw_eu_emit.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 679832a..0146770 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -1896,6 +1896,7 @@ void brw_oword_block_write_scratch(struct brw_codegen *p,
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 
   /* set message header global offset field (reg 0, element 2) */
+  brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_MOV(p,
  retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
  mrf.nr,
@@ -2015,6 +2016,7 @@ brw_oword_block_read_scratch(struct brw_codegen *p,
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 
   /* set message header global offset field (reg 0, element 2) */
+  brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
 
   brw_pop_insn_state(p);
@@ -2113,6 +2115,7 @@ void brw_oword_block_read(struct brw_codegen *p,
brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 
/* set message header global offset field (reg 0, element 2) */
+   brw_set_default_exec_size(p, BRW_EXECUTE_1);
brw_MOV(p,
   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
   mrf.nr,
@@ -2361,6 +2364,7 @@ void brw_urb_WRITE(struct brw_codegen *p,
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+  brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
   BRW_REGISTER_TYPE_UD),
retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
@@ -2420,6 +2424,7 @@ brw_send_indirect_message(struct brw_codegen *p,
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+  brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 
   /* Load the indirect descriptor to an address register using OR so the
@@ -2464,6 +2469,7 @@ brw_send_indirect_surface_message(struct brw_codegen *p,
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+  brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 
   /* Mask out invalid bits from the surface index to avoid hangs e.g. when
@@ -3191,6 +3197,7 @@ brw_find_live_channel(struct brw_codegen *p, struct 
brw_reg dst,
  struct brw_reg exec_mask =
 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);
 
+ brw_set_default_exec_size(p, BRW_EXECUTE_1);
  if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0x) {
 /* Unfortunately, ce0 does not take into account the thread
  * dispatch mask, which may be a problem in cases where it's not
@@ -3212,6 +3219,7 @@ brw_find_live_channel(struct brw_codegen *p, struct 
brw_reg dst,
   } else {
  const struct brw_reg flag = brw_flag_reg(1, 0);
 
+ brw_set_default_exec_size(p, BRW_EXECUTE_1);
  brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
 
  /* Run enough instructions returning zero with execution masking and
@@ -3237,6 +3245,7 @@ brw_find_live_channel(struct brw_codegen *p, struct 
brw_reg dst,
   * instructions.
   */
  const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
+ brw_set_default_exec_size(p, BRW_EXECUTE_1);
  brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
   }
} else {
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 35/52] intel/fs: Don't use automatic exec size inference

2017-10-12 Thread Jason Ekstrand
The automatic exec size inference can accidentally mess things up if
we're not careful.  For instance, if we have

add(4)g38.2<4>Dg38.1<8,2,4>Dg38.2<8,2,4>D

then the destination register will end up having a width of 2 with a
horizontal stride of 4 and a vertical stride of 8.  The EU emit code
sees the width of 2 and decides that we really wanted an exec size of 2
which doesn't do what we wanted.
---
 src/intel/compiler/brw_fs_generator.cpp | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/intel/compiler/brw_fs_generator.cpp 
b/src/intel/compiler/brw_fs_generator.cpp
index cffcc32..5d183e4 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -190,6 +190,12 @@ fs_generator::fs_generator(const struct brw_compiler 
*compiler, void *log_data,
 {
p = rzalloc(mem_ctx, struct brw_codegen);
brw_init_codegen(devinfo, p, mem_ctx);
+
+   /* In the FS code generator, we are very careful to ensure that we always
+* set the right execution size so we don't need the EU code to "help" us
+* by trying to infer it.  Sometimes, it infers the wrong thing.
+*/
+   p->automatic_exec_sizes = false;
 }
 
 fs_generator::~fs_generator()
@@ -395,17 +401,17 @@ fs_generator::generate_fb_write(fs_inst *inst, struct 
brw_reg payload)
   struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), 
BRW_REGISTER_TYPE_UD));
 
   /* Check runtime bit to detect if we have to send AA data or not */
-  brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_push_insn_state(p);
-  brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
+  brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+  brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_AND(p,
   v1_null_ud,
   retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
   brw_imm_ud(1<<26));
   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, 
BRW_CONDITIONAL_NZ);
-  brw_pop_insn_state(p);
 
   int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
+  brw_pop_insn_state(p);
   {
  /* Don't send AA data */
  fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 34/52] intel/fs: Explicitly set EXECUTE_1 where needed

2017-10-12 Thread Jason Ekstrand
---
 src/intel/compiler/brw_fs.cpp   | 2 +-
 src/intel/compiler/brw_fs_generator.cpp | 7 +++
 src/intel/compiler/brw_fs_nir.cpp   | 8 
 src/intel/compiler/brw_fs_visitor.cpp   | 7 +++
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 0ed0431..ddb52ce 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -4286,7 +4286,7 @@ emit_surface_header(const fs_builder , const fs_reg 
_mask)
fs_builder ubld = bld.exec_all().group(8, 0);
const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
ubld.MOV(dst, brw_imm_d(0));
-   ubld.MOV(component(dst, 7), sample_mask);
+   ubld.group(1, 0).MOV(component(dst, 7), sample_mask);
return dst;
 }
 
diff --git a/src/intel/compiler/brw_fs_generator.cpp 
b/src/intel/compiler/brw_fs_generator.cpp
index ae3df85..cffcc32 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -323,6 +323,7 @@ fs_generator::generate_fb_write(fs_inst *inst, struct 
brw_reg payload)
if (inst->header_size != 0) {
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+  brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_flag_reg(p, 0, 0);
@@ -395,11 +396,14 @@ fs_generator::generate_fb_write(fs_inst *inst, struct 
brw_reg payload)
 
   /* Check runtime bit to detect if we have to send AA data or not */
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+  brw_push_insn_state(p);
+  brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
   brw_AND(p,
   v1_null_ud,
   retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
   brw_imm_ud(1<<26));
   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, 
BRW_CONDITIONAL_NZ);
+  brw_pop_insn_state(p);
 
   int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
   {
@@ -941,6 +945,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg 
dst, struct brw_reg src
  /* Explicitly set up the message header by copying g0 to the MRF. */
  brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
 
+ brw_set_default_exec_size(p, BRW_EXECUTE_1);
  if (inst->offset) {
 /* Set the offset bits in DWord 2. */
 brw_MOV(p, get_element_ud(header_reg, 2),
@@ -994,6 +999,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg 
dst, struct brw_reg src
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
+  brw_set_default_exec_size(p, BRW_EXECUTE_1);
 
   if (brw_regs_equal(_reg, _reg)) {
  brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
@@ -1441,6 +1447,7 @@ fs_generator::generate_mov_dispatch_to_flags(fs_inst 
*inst)
 
brw_push_insn_state(p);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_exec_size(p, BRW_EXECUTE_1);
brw_MOV(p, flags, dispatch_mask);
brw_pop_insn_state(p);
 }
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 0bdd6c9..b0dacb1 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4200,7 +4200,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   unreachable("not reached");
 
case nir_intrinsic_vote_any: {
-  const fs_builder ubld = bld.exec_all();
+  const fs_builder ubld = bld.exec_all().group(1, 0);
 
   /* The any/all predicates do not consider channel enables. To prevent
* dead channels from affecting the result, we initialize the flag with
@@ -4232,7 +4232,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   break;
}
case nir_intrinsic_vote_all: {
-  const fs_builder ubld = bld.exec_all();
+  const fs_builder ubld = bld.exec_all().group(1, 0);
 
   /* The any/all predicates do not consider channel enables. To prevent
* dead channels from affecting the result, we initialize the flag with
@@ -4266,7 +4266,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
case nir_intrinsic_vote_eq: {
   fs_reg value = get_nir_src(instr->src[0]);
   fs_reg uniformized = bld.emit_uniformize(value);
-  const fs_builder ubld = bld.exec_all();
+  const fs_builder ubld = bld.exec_all().group(1, 0);
 
   /* The any/all predicates do not consider channel enables. To prevent
* dead channels from affecting the result, we initialize the flag with
@@ -4305,7 +4305,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   if (dispatch_width == 32)
  flag.type = BRW_REGISTER_TYPE_UD;
 
-  

[Mesa-dev] [PATCH v2 37/52] anv/pipeline: Drop nir_lower_clip_cull_distance_arrays

2017-10-12 Thread Jason Ekstrand
We already handle it in brw_preprocess_nir
---
 src/intel/vulkan/anv_pipeline.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index e08bdd9..9645d68 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -195,8 +195,6 @@ anv_shader_compile_to_nir(struct anv_pipeline *pipeline,
 
nir = brw_preprocess_nir(compiler, nir);
 
-   NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays);
-
if (stage == MESA_SHADER_FRAGMENT)
   NIR_PASS_V(nir, anv_nir_lower_input_attachments);
 
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 28/52] intel/cs: Push subgroup ID instead of base thread ID

2017-10-12 Thread Jason Ekstrand
We're going to want subgroup ID for SPIR-V subgroups eventually anyway.
We really only want to push one and calculate the other from it.  It
makes a bit more sense to push the subgroup ID because it's simpler to
calculate and because it's a real API thing.  The only advantage to
pushing the base thread ID is to avoid a single SHL in the shader.
---
 src/compiler/nir/nir_intrinsics.h|  4 +---
 src/intel/compiler/brw_compiler.h|  2 +-
 src/intel/compiler/brw_fs.cpp| 30 
 src/intel/compiler/brw_fs.h  |  2 +-
 src/intel/compiler/brw_fs_nir.cpp|  8 +++
 src/intel/compiler/brw_nir.h |  3 ++-
 src/intel/compiler/brw_nir_lower_cs_intrinsics.c | 15 
 src/intel/vulkan/anv_cmd_buffer.c|  6 ++---
 src/mesa/drivers/dri/i965/gen6_constant_state.c  |  6 ++---
 9 files changed, 40 insertions(+), 36 deletions(-)

diff --git a/src/compiler/nir/nir_intrinsics.h 
b/src/compiler/nir/nir_intrinsics.h
index 9389b74..54a51f8 100644
--- a/src/compiler/nir/nir_intrinsics.h
+++ b/src/compiler/nir/nir_intrinsics.h
@@ -355,6 +355,7 @@ SYSTEM_VALUE(subgroup_ge_mask, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(subgroup_gt_mask, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(subgroup_le_mask, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(subgroup_lt_mask, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_id, 1, 0, xx, xx, xx)
 
 /* Blend constant color values.  Float values are clamped. */
 SYSTEM_VALUE(blend_const_color_r_float, 1, 0, xx, xx, xx)
@@ -364,9 +365,6 @@ SYSTEM_VALUE(blend_const_color_a_float, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(blend_const_color_rgba_unorm, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(blend_const_color__unorm, 1, 0, xx, xx, xx)
 
-/* Intel specific system values */
-SYSTEM_VALUE(intel_thread_local_id, 1, 0, xx, xx, xx)
-
 /**
  * Barycentric coordinate intrinsics.
  *
diff --git a/src/intel/compiler/brw_compiler.h 
b/src/intel/compiler/brw_compiler.h
index 508d4ba..23c2172 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -543,7 +543,7 @@ enum brw_param_builtin {
BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X,
BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y,
 
-   BRW_PARAM_BUILTIN_THREAD_LOCAL_ID,
+   BRW_PARAM_BUILTIN_SUBGROUP_ID,
 };
 
 #define BRW_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index a548df7..0ed0431 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1002,7 +1002,7 @@ fs_visitor::import_uniforms(fs_visitor *v)
this->push_constant_loc = v->push_constant_loc;
this->pull_constant_loc = v->pull_constant_loc;
this->uniforms = v->uniforms;
-   this->thread_local_id = v->thread_local_id;
+   this->subgroup_id = v->subgroup_id;
 }
 
 void
@@ -1937,14 +1937,14 @@ set_push_pull_constant_loc(unsigned uniform, int 
*chunk_start,
 }
 
 static int
-get_thread_local_id_param_index(const brw_stage_prog_data *prog_data)
+get_subgroup_id_param_index(const brw_stage_prog_data *prog_data)
 {
if (prog_data->nr_params == 0)
   return -1;
 
/* The local thread id is always the last parameter in the list */
uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
-   if (last_param == BRW_PARAM_BUILTIN_THREAD_LOCAL_ID)
+   if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
   return prog_data->nr_params - 1;
 
return -1;
@@ -2025,7 +2025,7 @@ fs_visitor::assign_constant_locations()
   }
}
 
-   int thread_local_id_index = 
get_thread_local_id_param_index(stage_prog_data);
+   int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data);
 
/* Only allow 16 registers (128 uniform components) as push constants.
 *
@@ -2036,7 +2036,7 @@ fs_visitor::assign_constant_locations()
 * brw_curbe.c.
 */
unsigned int max_push_components = 16 * 8;
-   if (thread_local_id_index >= 0)
+   if (subgroup_id_index >= 0)
   max_push_components--; /* Save a slot for the thread ID */
 
/* We push small arrays, but no bigger than 16 floats.  This is big enough
@@ -2081,8 +2081,8 @@ fs_visitor::assign_constant_locations()
   if (!is_live[u])
  continue;
 
-  /* Skip thread_local_id_index to put it in the last push register. */
-  if (thread_local_id_index == (int)u)
+  /* Skip subgroup_id_index to put it in the last push register. */
+  if (subgroup_id_index == (int)u)
  continue;
 
   set_push_pull_constant_loc(u, _start, _chunk_bitsize,
@@ -2096,8 +2096,8 @@ fs_visitor::assign_constant_locations()
}
 
/* Add the CS local thread ID uniform at the end of the push constants */
-   if (thread_local_id_index >= 0)
-  push_constant_loc[thread_local_id_index] = num_push_constants++;
+   if (subgroup_id_index >= 0)
+  push_constant_loc[subgroup_id_index] = num_push_constants++;
 
/* As the uniforms are going to be reordered, stash the old array and
 * create two 

[Mesa-dev] [PATCH v2 36/52] anv/pipeline: Dump shader immedately after spirv_to_nir

2017-10-12 Thread Jason Ekstrand
---
 src/intel/vulkan/anv_pipeline.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 7bfdb5c..e08bdd9 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -83,6 +83,15 @@ void anv_DestroyShaderModule(
 
 #define SPIR_V_MAGIC_NUMBER 0x07230203
 
+static const uint64_t stage_to_debug[] = {
+   [MESA_SHADER_VERTEX] = DEBUG_VS,
+   [MESA_SHADER_TESS_CTRL] = DEBUG_TCS,
+   [MESA_SHADER_TESS_EVAL] = DEBUG_TES,
+   [MESA_SHADER_GEOMETRY] = DEBUG_GS,
+   [MESA_SHADER_FRAGMENT] = DEBUG_WM,
+   [MESA_SHADER_COMPUTE] = DEBUG_CS,
+};
+
 /* Eventually, this will become part of anv_CreateShader.  Unfortunately,
  * we can't do that yet because we don't have the ability to copy nir.
  */
@@ -144,6 +153,12 @@ anv_shader_compile_to_nir(struct anv_pipeline *pipeline,
 
free(spec_entries);
 
+   if (unlikely(INTEL_DEBUG & stage_to_debug[stage])) {
+  fprintf(stderr, "NIR (from SPIR-V) for %s shader:\n",
+  gl_shader_stage_name(stage));
+  nir_print_shader(nir, stderr);
+   }
+
/* We have to lower away local constant initializers right before we
 * inline functions.  That way they get properly initialized at the top
 * of the function and not at the top of its caller.
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 29/52] intel/compiler/fs: Set up subgroup invocation as a system value

2017-10-12 Thread Jason Ekstrand
Subgroup invocation is computed using a vector immediate and some
dispatch-aware arithmetic.  Unfortunately, due to the vector arithmetic,
and the fact that it's frequently read 16-wide, it's not something that
can easily be CSEd by the back-end compiler.  There are a few different
possible approaches to this problem:

 1) Emit the code to calculate the subgroup invocation on-the-fly and
trust NIR to do the CSE.  This is what we were doing.

 2) Add a back-end instruction for the subgroup ID.  This has the
advantage of helping the back-end compiler with CSE but has the
downside of very poor scheduling for the calculation because it has
to be emitted in the back-end.

 3) Emit the calculation at the top of the program and re-use the
result.  This gets rid of the CSE problem but comes at the cost of
an extra live register.

This commit switches us from 1) to 3).  We choose to store the subgroup
invocation values as a W type to reduce the impact of the extra live
register.  Trusting NIR and using 1) was fine but we're soon going to
want to use the subgroup invocation value for other things in the
back-end compiler and this makes it much easier to do without having to
worry about CSE problems.
---
 src/intel/compiler/brw_fs_nir.cpp | 34 +-
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 5e79bb4..0bdd6c9 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -231,6 +231,24 @@ fs_visitor::nir_emit_system_values()
   nir_system_values[i] = fs_reg();
}
 
+   /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
+* never end up using it.
+*/
+   {
+  const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
+  fs_reg  = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
+  reg = abld.vgrf(BRW_REGISTER_TYPE_W);
+
+  const fs_builder allbld8 = abld.group(8, 0).exec_all();
+  allbld8.MOV(reg, brw_imm_v(0x76543210));
+  if (dispatch_width > 8)
+ allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
+  if (dispatch_width > 16) {
+ const fs_builder allbld16 = abld.group(16, 0).exec_all();
+ allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
+  }
+   }
+
nir_foreach_function(function, nir) {
   assert(strcmp(function->name, "main") == 0);
   assert(function->impl);
@@ -4169,20 +4187,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(dispatch_width));
   break;
 
-   case nir_intrinsic_load_subgroup_invocation: {
-  fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
-  dest = retype(dest, BRW_REGISTER_TYPE_UD);
-  const fs_builder allbld8 = bld.group(8, 0).exec_all();
-  allbld8.MOV(tmp, brw_imm_v(0x76543210));
-  if (dispatch_width > 8)
- allbld8.ADD(byte_offset(tmp, 16), tmp, brw_imm_uw(8u));
-  if (dispatch_width > 16) {
- const fs_builder allbld16 = bld.group(16, 0).exec_all();
- allbld16.ADD(byte_offset(tmp, 32), tmp, brw_imm_uw(16u));
-  }
-  bld.MOV(dest, tmp);
+   case nir_intrinsic_load_subgroup_invocation:
+  bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
+  nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
   break;
-   }
 
case nir_intrinsic_load_subgroup_eq_mask:
case nir_intrinsic_load_subgroup_ge_mask:
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 32/52] intel/eu: Make automatic exec sizes a configurable option

2017-10-12 Thread Jason Ekstrand
We have had a feature in codegen for some time that tries to
automatically infer the execution size of an instruction from the width
of its destination.  For things such as fixed function GS, clipper, and
SF programs, this is very useful because they tend to have lots of
hand-rolled register setup and trying to specify the exec size all the
time would be prohibitive.  For things that come from a higher-level IR,
however, it's easier to just set the right size all the time and the
automatic exec sizes can, in fact, cause problems.  This commit makes it
optional while enabling it by default.
---
 src/intel/compiler/brw_eu.c  |  1 +
 src/intel/compiler/brw_eu.h  | 10 ++
 src/intel/compiler/brw_eu_emit.c | 32 ++--
 3 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/src/intel/compiler/brw_eu.c b/src/intel/compiler/brw_eu.c
index b0bdc38..bc297a2 100644
--- a/src/intel/compiler/brw_eu.c
+++ b/src/intel/compiler/brw_eu.c
@@ -296,6 +296,7 @@ brw_init_codegen(const struct gen_device_info *devinfo,
memset(p, 0, sizeof(*p));
 
p->devinfo = devinfo;
+   p->automatic_exec_sizes = true;
/*
 * Set the initial instruction store array size to 1024, if found that
 * isn't enough, then it will double the store size at brw_next_insn()
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index 8e597b2..8abebeb 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -65,6 +65,16 @@ struct brw_codegen {
bool compressed_stack[BRW_EU_MAX_INSN_STACK];
brw_inst *current;
 
+   /** Whether or not the user wants automatic exec sizes
+*
+* If true, codegen will try to automatically infer the exec size of an
+* instruction from the width of the destination register.  If false, it
+* will take whatever is set by brw_set_default_exec_size verbatim.
+*
+* This is set to true by default in brw_init_codegen.
+*/
+   bool automatic_exec_sizes;
+
bool single_program_flow;
const struct gen_device_info *devinfo;
 
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index dc0be9a..679832a 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -141,22 +141,26 @@ brw_set_dest(struct brw_codegen *p, brw_inst *inst, 
struct brw_reg dest)
 
/* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
 * or 16 (SIMD16), as that's normally correct.  However, when dealing with
-* small registers, we automatically reduce it to match the register size.
-*
-* In platforms that support fp64 we can emit instructions with a width of
-* 4 that need two SIMD8 registers and an exec_size of 8 or 16. In these
-* cases we need to make sure that these instructions have their exec sizes
-* set properly when they are emitted and we can't rely on this code to fix
-* it.
+* small registers, it can be useful for us toautomatically reduce it to
+* match the register size.
 */
-   bool fix_exec_size;
-   if (devinfo->gen >= 6)
-  fix_exec_size = dest.width < BRW_EXECUTE_4;
-   else
-  fix_exec_size = dest.width < BRW_EXECUTE_8;
+   if (p->automatic_exec_sizes) {
+  /*
+   * In platforms that support fp64 we can emit instructions with a width
+   * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
+   * these cases we need to make sure that these instructions have their
+   * exec sizes set properly when they are emitted and we can't rely on
+   * this code to fix it.
+   */
+  bool fix_exec_size;
+  if (devinfo->gen >= 6)
+ fix_exec_size = dest.width < BRW_EXECUTE_4;
+  else
+ fix_exec_size = dest.width < BRW_EXECUTE_8;
 
-   if (fix_exec_size)
-  brw_inst_set_exec_size(devinfo, inst, dest.width);
+  if (fix_exec_size)
+ brw_inst_set_exec_size(devinfo, inst, dest.width);
+   }
 }
 
 void
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 27/52] intel/cs: Re-run final NIR optimizations for each SIMD size

2017-10-12 Thread Jason Ekstrand
With the advent of SPIR-V subgroup operations, compute shaders will have
to be slightly different depending on the SIMD size at which they
execute.  In order to allow us to do dispatch-width specific things in
NIR, we re-run the final NIR stages for each sIMD width.

As a side-effect of this change, we start using ralloc on fs_visitor so
we need to add DECLARE_RALLOC_OPERATORS to fs_visitor.
---
 src/intel/compiler/brw_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index d3ab385..9ff06b6 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -60,7 +60,7 @@ offset(const fs_reg , const brw::fs_builder , 
unsigned delta)
 class fs_visitor : public backend_shader
 {
 public:
-   DECLARE_RALLOC_CXX_OPERATORS(fs_reg)
+   DECLARE_RALLOC_CXX_OPERATORS(fs_visitor)
 
fs_visitor(const struct brw_compiler *compiler, void *log_data,
   void *mem_ctx,
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 30/52] intel/fs: Rework zero-length URB write handling

2017-10-12 Thread Jason Ekstrand
Originally we tried to handle this case based on slots_valid.  However,
there are a number of ways that this can go wrong.  For one, we throw
away any trailing slots which either aren't written or are set to
VARYING_SLOT_PAD.  Second, even if PSIZ is a valid slot, we may not
actually write anything there.  Between the lot of these, it was
possible to end up in a case where we tried to do a regular URB write
but ended up with a length of 1 which is invalid.  This commit moves it
to the end and makes it based on a new boolean flag urb_written.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs_visitor.cpp | 60 ++-
 1 file changed, 31 insertions(+), 29 deletions(-)

diff --git a/src/intel/compiler/brw_fs_visitor.cpp 
b/src/intel/compiler/brw_fs_visitor.cpp
index 9fd4c20..9a19dc2 100644
--- a/src/intel/compiler/brw_fs_visitor.cpp
+++ b/src/intel/compiler/brw_fs_visitor.cpp
@@ -566,34 +566,6 @@ fs_visitor::emit_urb_writes(const fs_reg _vertex_count)
else
   urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 
-   /* If we don't have any valid slots to write, just do a minimal urb write
-* send to terminate the shader.  This includes 1 slot of undefined data,
-* because it's invalid to write 0 data:
-*
-* From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
-* Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
-* Write Data Payload:
-*
-*"The write data payload can be between 1 and 8 message phases long."
-*/
-   if (vue_map->slots_valid == 0) {
-  /* For GS, just turn EmitVertex() into a no-op.  We don't want it to
-   * end the thread, and emit_gs_thread_end() already emits a SEND with
-   * EOT at the end of the program for us.
-   */
-  if (stage == MESA_SHADER_GEOMETRY)
- return;
-
-  fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
-  bld.exec_all().MOV(payload, urb_handle);
-
-  fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, 
payload);
-  inst->eot = true;
-  inst->mlen = 2;
-  inst->offset = 1;
-  return;
-   }
-
opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
int header_size = 1;
fs_reg per_slot_offsets;
@@ -645,6 +617,7 @@ fs_visitor::emit_urb_writes(const fs_reg _vertex_count)
   last_slot--;
}
 
+   bool urb_written = false;
for (slot = 0; slot < vue_map->num_slots; slot++) {
   int varying = vue_map->slot_to_varying[slot];
   switch (varying) {
@@ -730,7 +703,7 @@ fs_visitor::emit_urb_writes(const fs_reg _vertex_count)
* the last slot or if we need to flush (see BAD_FILE varying case
* above), emit a URB write send now to flush out the data.
*/
-  if (length == 8 || slot == last_slot)
+  if (length == 8 || (length > 0 && slot == last_slot))
  flush = true;
   if (flush) {
  fs_reg *payload_sources =
@@ -755,8 +728,37 @@ fs_visitor::emit_urb_writes(const fs_reg _vertex_count)
  urb_offset = starting_urb_offset + slot + 1;
  length = 0;
  flush = false;
+ urb_written = true;
   }
}
+
+   /* If we don't have any valid slots to write, just do a minimal urb write
+* send to terminate the shader.  This includes 1 slot of undefined data,
+* because it's invalid to write 0 data:
+*
+* From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
+* Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
+* Write Data Payload:
+*
+*"The write data payload can be between 1 and 8 message phases long."
+*/
+   if (!urb_written) {
+  /* For GS, just turn EmitVertex() into a no-op.  We don't want it to
+   * end the thread, and emit_gs_thread_end() already emits a SEND with
+   * EOT at the end of the program for us.
+   */
+  if (stage == MESA_SHADER_GEOMETRY)
+ return;
+
+  fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
+  bld.exec_all().MOV(payload, urb_handle);
+
+  fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, 
payload);
+  inst->eot = true;
+  inst->mlen = 2;
+  inst->offset = 1;
+  return;
+   }
 }
 
 void
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 21/52] intel/cs: Drop min_dispatch_width checks from compile_cs

2017-10-12 Thread Jason Ekstrand
The only things that adjust min_dispatch_width are render target writes
which don't happen in compute shaders so they're pointless.
---
 src/intel/compiler/brw_fs.cpp | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 676496f..1a44d00 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6816,8 +6816,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
  NULL, /* Never used in core profile */
  shader, 16, shader_time_index);
if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
-   !fail_msg && v8.max_dispatch_width >= 16 &&
-   min_dispatch_width <= 16) {
+   !fail_msg && min_dispatch_width <= 16) {
   /* Try a SIMD16 compile */
   if (min_dispatch_width <= 8)
  v16.import_uniforms();
@@ -6841,8 +6840,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
fs_visitor v32(compiler, log_data, mem_ctx, key, _data->base,
  NULL, /* Never used in core profile */
  shader, 32, shader_time_index);
-   if (!fail_msg && v8.max_dispatch_width >= 32 &&
-   (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
+   if (!fail_msg && (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
   /* Try a SIMD32 compile */
   if (min_dispatch_width <= 8)
  v32.import_uniforms();
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 31/52] intel/eu: Use EXECUTE_1 for JMPI

2017-10-12 Thread Jason Ekstrand
The PRM says "The execution size must be 1."  In 73137997e23ff6c11, the
execution size was set to 1 when it should have been BRW_EXECUTE_1
(which maps to 0).  Later, in dc2d3a7f5c217a7cee9, JMPI was used for
line AA on gen6 and earlier and we started manually stomping the
exeution size to BRW_EXECUTE_1 in the generator.  This commit fixes the
original bug and makes brw_JMPI just do the right thing.

Reviewed-by: Matt Turner 
Fixes: 73137997e23ff6c1145d036315d1a9ad96651281
---
 src/intel/compiler/brw_eu_emit.c| 2 +-
 src/intel/compiler/brw_fs_generator.cpp | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 2b38d95..dc0be9a 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -1103,7 +1103,7 @@ brw_JMPI(struct brw_codegen *p, struct brw_reg index,
struct brw_reg ip = brw_ip_reg();
brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
 
-   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
+   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
brw_inst_set_pred_control(devinfo, inst, predicate_control);
diff --git a/src/intel/compiler/brw_fs_generator.cpp 
b/src/intel/compiler/brw_fs_generator.cpp
index 2622a91..ae3df85 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -402,7 +402,6 @@ fs_generator::generate_fb_write(fs_inst *inst, struct 
brw_reg payload)
   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, 
BRW_CONDITIONAL_NZ);
 
   int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
-  brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
   {
  /* Don't send AA data */
  fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 23/52] intel/cs: Ignore runtime_check_aads_emit for CS

2017-10-12 Thread Jason Ekstrand
It's only set on gen4-5 which clearly don't support compute shaders.
---
 src/intel/compiler/brw_fs.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 78998fd..385f500 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6870,8 +6870,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
}
 
fs_generator g(compiler, log_data, mem_ctx, (void*) key, _data->base,
-  v8.promoted_constants, v8.runtime_check_aads_emit,
-  MESA_SHADER_COMPUTE);
+  v8.promoted_constants, false, MESA_SHADER_COMPUTE);
if (INTEL_DEBUG & DEBUG_CS) {
   char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
shader->info.label ? shader->info.label :
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 25/52] intel/cs: Rework the way thread local ID is handled

2017-10-12 Thread Jason Ekstrand
Previously, brw_nir_lower_intrinsics added the param and then emitted a
load_uniform intrinsic to load it directly.  This commit switches things
over to use a specific NIR intrinsic for the thread id.  The one thing I
don't like about this approach is that we have to copy thread_local_id
over to the new visitor in import_uniforms.
---
 src/compiler/nir/nir_intrinsics.h|  3 ++
 src/intel/compiler/brw_fs.cpp|  4 +-
 src/intel/compiler/brw_fs.h  |  1 +
 src/intel/compiler/brw_fs_nir.cpp| 14 +++
 src/intel/compiler/brw_nir.h |  3 +-
 src/intel/compiler/brw_nir_lower_cs_intrinsics.c | 53 +---
 6 files changed, 32 insertions(+), 46 deletions(-)

diff --git a/src/compiler/nir/nir_intrinsics.h 
b/src/compiler/nir/nir_intrinsics.h
index 0de7080..9389b74 100644
--- a/src/compiler/nir/nir_intrinsics.h
+++ b/src/compiler/nir/nir_intrinsics.h
@@ -364,6 +364,9 @@ SYSTEM_VALUE(blend_const_color_a_float, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(blend_const_color_rgba_unorm, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(blend_const_color__unorm, 1, 0, xx, xx, xx)
 
+/* Intel specific system values */
+SYSTEM_VALUE(intel_thread_local_id, 1, 0, xx, xx, xx)
+
 /**
  * Barycentric coordinate intrinsics.
  *
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 39a9e21..5a35a33 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1002,6 +1002,7 @@ fs_visitor::import_uniforms(fs_visitor *v)
this->push_constant_loc = v->push_constant_loc;
this->pull_constant_loc = v->pull_constant_loc;
this->uniforms = v->uniforms;
+   this->thread_local_id = v->thread_local_id;
 }
 
 void
@@ -6779,8 +6780,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
 {
nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
shader = brw_nir_apply_sampler_key(shader, compiler, >tex, true);
-
-   brw_nir_lower_cs_intrinsics(shader, prog_data);
+   brw_nir_lower_cs_intrinsics(shader);
shader = brw_postprocess_nir(shader, compiler, true);
 
prog_data->local_size[0] = shader->info.cs.local_size[0];
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index da32593..f51a4d8 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -315,6 +315,7 @@ public:
 */
int *push_constant_loc;
 
+   fs_reg thread_local_id;
fs_reg frag_depth;
fs_reg frag_stencil;
fs_reg sample_mask;
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index d7e352d..c05c89f 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -88,6 +88,16 @@ fs_visitor::nir_setup_uniforms()
}
 
uniforms = nir->num_uniforms / 4;
+
+   if (stage == MESA_SHADER_COMPUTE) {
+  /* Add a uniform for the thread local id.  It must be the last uniform
+   * on the list.
+   */
+  assert(uniforms == prog_data->nr_params);
+  uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
+  *param = BRW_PARAM_BUILTIN_THREAD_LOCAL_ID;
+  thread_local_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
+   }
 }
 
 static bool
@@ -3409,6 +3419,10 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder ,
   cs_prog_data->uses_barrier = true;
   break;
 
+   case nir_intrinsic_load_intel_thread_local_id:
+  bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), thread_local_id);
+  break;
+
case nir_intrinsic_load_local_invocation_id:
case nir_intrinsic_load_work_group_id: {
   gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index 1493b74..3e40712 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -95,8 +95,7 @@ void brw_nir_analyze_boolean_resolves(nir_shader *nir);
 nir_shader *brw_preprocess_nir(const struct brw_compiler *compiler,
nir_shader *nir);
 
-bool brw_nir_lower_cs_intrinsics(nir_shader *nir,
- struct brw_cs_prog_data *prog_data);
+bool brw_nir_lower_cs_intrinsics(nir_shader *nir);
 void brw_nir_lower_vs_inputs(nir_shader *nir,
  bool use_legacy_snorm_formula,
  const uint8_t *vs_attrib_wa_flags);
diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c 
b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
index 9b4a0fd..2bf0d54 100644
--- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
+++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
@@ -26,47 +26,12 @@
 
 struct lower_intrinsics_state {
nir_shader *nir;
-   struct brw_cs_prog_data *prog_data;
nir_function_impl *impl;
bool progress;
nir_builder builder;
-   int thread_local_id_index;
+   unsigned local_workgroup_size;
 };
 
-static nir_ssa_def *
-read_thread_local_id(struct lower_intrinsics_state 

[Mesa-dev] [PATCH v2 22/52] intel/cs: Stop setting dispatch_grf_start_reg

2017-10-12 Thread Jason Ekstrand
Nothing ever reads it for compute shaders because it's always 1.
---
 src/intel/compiler/brw_compiler.h | 1 -
 src/intel/compiler/brw_fs.cpp | 2 --
 2 files changed, 3 deletions(-)

diff --git a/src/intel/compiler/brw_compiler.h 
b/src/intel/compiler/brw_compiler.h
index 014202d..508d4ba 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -734,7 +734,6 @@ struct brw_push_const_block {
 struct brw_cs_prog_data {
struct brw_stage_prog_data base;
 
-   GLuint dispatch_grf_start_reg_16;
unsigned local_size[3];
unsigned simd_size;
unsigned threads;
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 1a44d00..78998fd 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6808,7 +6808,6 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
  cfg = v8.cfg;
  cs_set_simd_size(prog_data, 8);
  cs_fill_push_const_info(compiler->devinfo, prog_data);
- prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
   }
}
 
@@ -6833,7 +6832,6 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
  cfg = v16.cfg;
  cs_set_simd_size(prog_data, 16);
  cs_fill_push_const_info(compiler->devinfo, prog_data);
- prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
   }
}
 
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 24/52] intel/fs: Mark 64-bit values as being contiguous

2017-10-12 Thread Jason Ekstrand
This isn't often a problem , when we're in a compute shader, we must
push the thread local ID so we decrement the amount of available push
space by 1 and it's no longer even and 64-bit data can, in theory, span
it.  By marking those uniforms contiguous, we ensure that they never get
split in half between push and pull constants.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs.cpp | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 385f500..39a9e21 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1974,7 +1974,7 @@ fs_visitor::assign_constant_locations()
 
/* For each uniform slot, a value of true indicates that the given slot and
 * the next slot must remain contiguous.  This is used to keep us from
-* splitting arrays apart.
+* splitting arrays and 64-bit values apart.
 */
bool contiguous[uniforms];
memset(contiguous, 0, sizeof(contiguous));
@@ -2011,6 +2011,9 @@ fs_visitor::assign_constant_locations()
 if (constant_nr >= 0 && constant_nr < (int) uniforms) {
int regs_read = inst->components_read(i) *
   type_sz(inst->src[i].type) / 4;
+   assert(regs_read <= 2);
+   if (regs_read == 2)
+  contiguous[constant_nr] = true;
for (int j = 0; j < regs_read; j++) {
   is_live[constant_nr + j] = true;
   bitsize_access[constant_nr + j] =
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 13/52] i965/fs: Return a fs_reg from shuffle_64bit_data_for_32bit_write

2017-10-12 Thread Jason Ekstrand
All callers of this function allocate a fs_reg expressly to pass into
it.  It's much easier if we just let the helper allocate the register.
While we're here, we switch it to doing the MOVs with an integer type so
that we don't accidentally canonicalize floats on half of a double.
---
 src/intel/compiler/brw_fs.h   |  7 +++
 src/intel/compiler/brw_fs_nir.cpp | 34 +-
 2 files changed, 12 insertions(+), 29 deletions(-)

diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 2040575..b070d38 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -493,10 +493,9 @@ void shuffle_32bit_load_result_to_64bit_data(const 
brw::fs_builder ,
  const fs_reg ,
  uint32_t components);
 
-void shuffle_64bit_data_for_32bit_write(const brw::fs_builder ,
-const fs_reg ,
-const fs_reg ,
-uint32_t components);
+fs_reg shuffle_64bit_data_for_32bit_write(const brw::fs_builder ,
+  const fs_reg ,
+  uint32_t components);
 fs_reg setup_imm_df(const brw::fs_builder ,
 double v);
 
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index accfafb..47884c8 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2646,10 +2646,8 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder ,
 * expected by our 32-bit URB write messages. We use a temporary
 * for that.
 */
-   fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type);
unsigned channel = iter * 2 + i;
-   shuffle_64bit_data_for_32bit_write(bld,
-  retype(dest, BRW_REGISTER_TYPE_F),
+   fs_reg dest = shuffle_64bit_data_for_32bit_write(bld,
   retype(offset(value, bld, 2 * channel), 
BRW_REGISTER_TYPE_DF),
   1);
 
@@ -3506,14 +3504,9 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder ,
   unsigned type_size = 4;
   if (nir_src_bit_size(instr->src[0]) == 64) {
  type_size = 8;
- fs_reg tmp =
-   fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
- shuffle_64bit_data_for_32bit_write(
-bld,
-retype(tmp, BRW_REGISTER_TYPE_F),
+ val_reg = shuffle_64bit_data_for_32bit_write(bld,
 retype(val_reg, BRW_REGISTER_TYPE_DF),
 instr->num_components);
- val_reg = tmp;
   }
 
   unsigned type_slots = type_size / 4;
@@ -4011,13 +4004,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   unsigned type_size = 4;
   if (nir_src_bit_size(instr->src[0]) == 64) {
  type_size = 8;
- fs_reg tmp =
-   fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
- shuffle_64bit_data_for_32bit_write(bld,
-retype(tmp, BRW_REGISTER_TYPE_F),
+ val_reg = shuffle_64bit_data_for_32bit_write(bld,
 retype(val_reg, BRW_REGISTER_TYPE_DF),
 instr->num_components);
- val_reg = tmp;
   }
 
   unsigned type_slots = type_size / 4;
@@ -4075,11 +4064,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   unsigned num_components = instr->num_components;
   unsigned first_component = nir_intrinsic_component(instr);
   if (nir_src_bit_size(instr->src[0]) == 64) {
- fs_reg tmp =
-fs_reg(VGRF, alloc.allocate(2 * num_components),
-   BRW_REGISTER_TYPE_F);
- shuffle_64bit_data_for_32bit_write(
-bld, tmp, retype(src, BRW_REGISTER_TYPE_DF), num_components);
+ fs_reg tmp = shuffle_64bit_data_for_32bit_write(bld,
+retype(src, BRW_REGISTER_TYPE_DF), num_components);
  src = retype(tmp, src.type);
  num_components *= 2;
   }
@@ -4770,24 +4756,22 @@ shuffle_32bit_load_result_to_64bit_data(const 
fs_builder ,
  * 64-bit data they are about to write. Because of this the function checks
  * that the src and dst regions involved in the operation do not overlap.
  */
-void
+fs_reg
 shuffle_64bit_data_for_32bit_write(const fs_builder ,
-   const fs_reg ,
const fs_reg ,
uint32_t components)
 {
assert(type_sz(src.type) == 8);
-   assert(type_sz(dst.type) == 4);
 
-   assert(!regions_overlap(
- dst, 2 * components * dst.component_size(bld.dispatch_width()),
- src, components * src.component_size(bld.dispatch_width(;
+   fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D, 2 * components);
 
for (unsigned i = 0; i < 

[Mesa-dev] [PATCH v2 17/52] intel/fs: Uniformize the index in readInvocation

2017-10-12 Thread Jason Ekstrand
The index is any value provided by the shader and this can be called in
non-uniform control flow so we can't just take component 0.  Found by
inspection.
---
 src/intel/compiler/brw_fs_nir.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 653d6d8..333bb13 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4298,7 +4298,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   fs_reg tmp = bld.vgrf(value.type);
 
   bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
-  component(invocation, 0));
+  bld.emit_uniformize(invocation));
 
   bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
   fs_reg(component(tmp, 0)));
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 19/52] intel/fs: Assign constant locations if they haven't been assigned

2017-10-12 Thread Jason Ekstrand
Before, we bailing in assign_constant_locations based on the minimum
dispatch size.  The more direct thing to do is simply to check for
whether or not we have constant locations and bail if we do.  For
nir_setup_uniforms, it's completely safe to do it multiple times because
we just copy a value from the NIR shader.
---
 src/intel/compiler/brw_fs.cpp | 4 +++-
 src/intel/compiler/brw_fs_nir.cpp | 5 -
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index e96b077..e629541 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1962,8 +1962,10 @@ void
 fs_visitor::assign_constant_locations()
 {
/* Only the first compile gets to decide on locations. */
-   if (dispatch_width != min_dispatch_width)
+   if (push_constant_loc) {
+  assert(pull_constant_loc);
   return;
+   }
 
bool is_live[uniforms];
memset(is_live, 0, sizeof(is_live));
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index b36a1b9..d7e352d 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -81,8 +81,11 @@ fs_visitor::nir_setup_outputs()
 void
 fs_visitor::nir_setup_uniforms()
 {
-   if (dispatch_width != min_dispatch_width)
+   /* Only the first compile gets to set up uniforms. */
+   if (push_constant_loc) {
+  assert(pull_constant_loc);
   return;
+   }
 
uniforms = nir->num_uniforms / 4;
 }
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 16/52] intel/fs: Protect opt_algebraic from OOB BROADCAST indices

2017-10-12 Thread Jason Ekstrand
---
 src/intel/compiler/brw_fs.cpp | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index c72ed17..e96b077 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2422,8 +2422,14 @@ fs_visitor::opt_algebraic()
 progress = true;
  } else if (inst->src[1].file == IMM) {
 inst->opcode = BRW_OPCODE_MOV;
-inst->src[0] = component(inst->src[0],
- inst->src[1].ud);
+/* It's possible that the selected component will be too large and
+ * overflow the register.  If this happens and we some how manage
+ * to constant fold it in and get here, it would cause an assert
+ * in component() below.  Instead, just let it wrap around if it
+ * goes over exec_size.
+ */
+const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
+inst->src[0] = component(inst->src[0], comp);
 inst->sources = 1;
 inst->force_writemask_all = true;
 progress = true;
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 20/52] intel/fs: Remove min_dispatch_width from fs_visitor

2017-10-12 Thread Jason Ekstrand
It's 8 for everything except compute shaders.  For compute shaders,
there's no need to duplicate the computation and it's just a possible
source of error.
---
 src/intel/compiler/brw_fs.cpp | 42 +++
 src/intel/compiler/brw_fs.h   |  5 ++---
 src/intel/compiler/brw_fs_visitor.cpp | 11 -
 3 files changed, 25 insertions(+), 33 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index e629541..676496f 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5884,7 +5884,7 @@ fs_visitor::fixup_3src_null_dest()
 }
 
 void
-fs_visitor::allocate_registers(bool allow_spilling)
+fs_visitor::allocate_registers(unsigned min_dispatch_width, bool 
allow_spilling)
 {
bool allocated_without_spills;
 
@@ -6019,7 +6019,7 @@ fs_visitor::run_vs()
assign_vs_urb_setup();
 
fixup_3src_null_dest();
-   allocate_registers(true);
+   allocate_registers(8, true);
 
return !failed;
 }
@@ -6099,7 +6099,7 @@ fs_visitor::run_tcs_single_patch()
assign_tcs_single_patch_urb_setup();
 
fixup_3src_null_dest();
-   allocate_registers(true);
+   allocate_registers(8, true);
 
return !failed;
 }
@@ -6133,7 +6133,7 @@ fs_visitor::run_tes()
assign_tes_urb_setup();
 
fixup_3src_null_dest();
-   allocate_registers(true);
+   allocate_registers(8, true);
 
return !failed;
 }
@@ -6182,7 +6182,7 @@ fs_visitor::run_gs()
assign_gs_urb_setup();
 
fixup_3src_null_dest();
-   allocate_registers(true);
+   allocate_registers(8, true);
 
return !failed;
 }
@@ -6253,7 +6253,7 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
   assign_urb_setup();
 
   fixup_3src_null_dest();
-  allocate_registers(allow_spilling);
+  allocate_registers(8, allow_spilling);
 
   if (failed)
  return false;
@@ -6263,9 +6263,10 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
 }
 
 bool
-fs_visitor::run_cs()
+fs_visitor::run_cs(unsigned min_dispatch_width)
 {
assert(stage == MESA_SHADER_COMPUTE);
+   assert(dispatch_width >= min_dispatch_width);
 
setup_cs_payload();
 
@@ -6296,7 +6297,7 @@ fs_visitor::run_cs()
assign_curb_setup();
 
fixup_3src_null_dest();
-   allocate_registers(true);
+   allocate_registers(min_dispatch_width, true);
 
if (failed)
   return false;
@@ -6786,8 +6787,11 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
   shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *
   shader->info.cs.local_size[2];
 
-   unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
-   unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);
+   unsigned min_dispatch_width =
+  DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
+   min_dispatch_width = MAX2(8, min_dispatch_width);
+   min_dispatch_width = util_next_power_of_two(min_dispatch_width);
+   assert(min_dispatch_width <= 32);
 
cfg_t *cfg = NULL;
const char *fail_msg = NULL;
@@ -6797,8 +6801,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
fs_visitor v8(compiler, log_data, mem_ctx, key, _data->base,
  NULL, /* Never used in core profile */
  shader, 8, shader_time_index);
-   if (simd_required <= 8) {
-  if (!v8.run_cs()) {
+   if (min_dispatch_width <= 8) {
+  if (!v8.run_cs(min_dispatch_width)) {
  fail_msg = v8.fail_msg;
   } else {
  cfg = v8.cfg;
@@ -6813,11 +6817,11 @@ brw_compile_cs(const struct brw_compiler *compiler, 
void *log_data,
  shader, 16, shader_time_index);
if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
!fail_msg && v8.max_dispatch_width >= 16 &&
-   simd_required <= 16) {
+   min_dispatch_width <= 16) {
   /* Try a SIMD16 compile */
-  if (simd_required <= 8)
+  if (min_dispatch_width <= 8)
  v16.import_uniforms();
-  if (!v16.run_cs()) {
+  if (!v16.run_cs(min_dispatch_width)) {
  compiler->shader_perf_log(log_data,
"SIMD16 shader failed to compile: %s",
v16.fail_msg);
@@ -6838,14 +6842,14 @@ brw_compile_cs(const struct brw_compiler *compiler, 
void *log_data,
  NULL, /* Never used in core profile */
  shader, 32, shader_time_index);
if (!fail_msg && v8.max_dispatch_width >= 32 &&
-   (simd_required > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
+   (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
   /* Try a SIMD32 compile */
-  if (simd_required <= 8)
+  if (min_dispatch_width <= 8)
  v32.import_uniforms();
-  else if (simd_required <= 16)
+  else if (min_dispatch_width <= 16)
  v32.import_uniforms();
 
-  if (!v32.run_cs()) {
+  if (!v32.run_cs(min_dispatch_width)) {
  compiler->shader_perf_log(log_data,

[Mesa-dev] [PATCH v2 18/52] intel/fs: Retype dest to match value in read[First]Invocation

2017-10-12 Thread Jason Ekstrand
This is what we really wanted all along.  Always retyping to D works
because that's what get_nir_src() always gives us, at least for 32-bit
types.  The SPIR-V variants of these operations accept arbitrary types
and we need this if we're going to handle 64 or 16-bit values.
---
 src/intel/compiler/brw_fs_nir.cpp | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 333bb13..b36a1b9 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4300,15 +4300,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
   bld.emit_uniformize(invocation));
 
-  bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
-  fs_reg(component(tmp, 0)));
+  bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0)));
   break;
}
 
case nir_intrinsic_read_first_invocation: {
   const fs_reg value = get_nir_src(instr->src[0]);
-  bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
-  bld.emit_uniformize(value));
+  bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
   break;
}
 
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 26/52] intel/cs: Re-run final NIR optimizations for each SIMD size

2017-10-12 Thread Jason Ekstrand
With the advent of SPIR-V subgroup operations, compute shaders will have
to be slightly different depending on the SIMD size at which they
execute.  In order to allow us to do dispatch-width specific things in
NIR, we re-run the final NIR stages for each sIMD width.

One side-effect of this change is that we start rallocing fs_visitors
which means we need DECLARE_RALLOC_CXX_OPERATORS.
---
 src/intel/compiler/brw_fs.cpp | 103 ++
 src/intel/compiler/brw_fs.h   |   2 +
 2 files changed, 66 insertions(+), 39 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 5a35a33..a548df7 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6768,6 +6768,20 @@ cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, 
unsigned size)
cs_prog_data->threads = (group_size + size - 1) / size;
 }
 
+static nir_shader *
+compile_cs_to_nir(const struct brw_compiler *compiler,
+  void *mem_ctx,
+  const struct brw_cs_prog_key *key,
+  struct brw_cs_prog_data *prog_data,
+  const nir_shader *src_shader,
+  unsigned dispatch_width)
+{
+   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+   shader = brw_nir_apply_sampler_key(shader, compiler, >tex, true);
+   brw_nir_lower_cs_intrinsics(shader);
+   return brw_postprocess_nir(shader, compiler, true);
+}
+
 const unsigned *
 brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
void *mem_ctx,
@@ -6778,17 +6792,12 @@ brw_compile_cs(const struct brw_compiler *compiler, 
void *log_data,
unsigned *final_assembly_size,
char **error_str)
 {
-   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
-   shader = brw_nir_apply_sampler_key(shader, compiler, >tex, true);
-   brw_nir_lower_cs_intrinsics(shader);
-   shader = brw_postprocess_nir(shader, compiler, true);
-
-   prog_data->local_size[0] = shader->info.cs.local_size[0];
-   prog_data->local_size[1] = shader->info.cs.local_size[1];
-   prog_data->local_size[2] = shader->info.cs.local_size[2];
+   prog_data->local_size[0] = src_shader->info.cs.local_size[0];
+   prog_data->local_size[1] = src_shader->info.cs.local_size[1];
+   prog_data->local_size[2] = src_shader->info.cs.local_size[2];
unsigned local_workgroup_size =
-  shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *
-  shader->info.cs.local_size[2];
+  src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
+  src_shader->info.cs.local_size[2];
 
unsigned min_dispatch_width =
   DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
@@ -6796,71 +6805,87 @@ brw_compile_cs(const struct brw_compiler *compiler, 
void *log_data,
min_dispatch_width = util_next_power_of_two(min_dispatch_width);
assert(min_dispatch_width <= 32);
 
+
+   fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
cfg_t *cfg = NULL;
const char *fail_msg = NULL;
+   unsigned promoted_constants;
 
/* Now the main event: Visit the shader IR and generate our CS IR for it.
 */
-   fs_visitor v8(compiler, log_data, mem_ctx, key, _data->base,
- NULL, /* Never used in core profile */
- shader, 8, shader_time_index);
if (min_dispatch_width <= 8) {
-  if (!v8.run_cs(min_dispatch_width)) {
- fail_msg = v8.fail_msg;
+  nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key,
+   prog_data, src_shader, 8);
+  v8 = new(mem_ctx) fs_visitor(compiler, log_data, mem_ctx, key,
+   _data->base,
+   NULL, /* Never used in core profile */
+   nir8, 8, shader_time_index);
+  if (!v8->run_cs(min_dispatch_width)) {
+ fail_msg = v8->fail_msg;
   } else {
- cfg = v8.cfg;
+ cfg = v8->cfg;
  cs_set_simd_size(prog_data, 8);
  cs_fill_push_const_info(compiler->devinfo, prog_data);
+ promoted_constants = v8->promoted_constants;
   }
}
 
-   fs_visitor v16(compiler, log_data, mem_ctx, key, _data->base,
- NULL, /* Never used in core profile */
- shader, 16, shader_time_index);
if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
!fail_msg && min_dispatch_width <= 16) {
   /* Try a SIMD16 compile */
-  if (min_dispatch_width <= 8)
- v16.import_uniforms();
-  if (!v16.run_cs(min_dispatch_width)) {
+  nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key,
+prog_data, src_shader, 16);
+  v16 = new(mem_ctx) fs_visitor(compiler, log_data, mem_ctx, key,
+_data->base,
+NULL, /* Never used in core profile */
+nir16, 16, 

[Mesa-dev] [PATCH v2 14/52] i965/fs/nir: Minor refactor of store_output

2017-10-12 Thread Jason Ekstrand
Stop retyping the output of shuffle_64bit_data_for_32bit_write.  It's
always BRW_REGISTER_TYPE_D which is perfectly fine for writing out.
Also, when we change get_nir_src to return something with a 64-bit type
for 64-bit values, the retyping will not be at all what we want.  Also,
retyping the output based on src.type before we whack it back to 32 bits
is a problem because the output is always 32 bits.
---
 src/intel/compiler/brw_fs_nir.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 47884c8..138d292 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4058,18 +4058,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
 
   nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
   assert(const_offset && "Indirect output stores not allowed");
-  fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
-  4 * const_offset->u32[0]), src.type);
 
   unsigned num_components = instr->num_components;
   unsigned first_component = nir_intrinsic_component(instr);
   if (nir_src_bit_size(instr->src[0]) == 64) {
  fs_reg tmp = shuffle_64bit_data_for_32bit_write(bld,
 retype(src, BRW_REGISTER_TYPE_DF), num_components);
- src = retype(tmp, src.type);
+ src = tmp;
  num_components *= 2;
   }
 
+  fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
+  4 * const_offset->u32[0]), src.type);
   for (unsigned j = 0; j < num_components; j++) {
  bld.MOV(offset(new_dest, bld, j + first_component),
  offset(src, bld, j));
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 10/52] i965/fs/nir: Use the nir_src_bit_size helper

2017-10-12 Thread Jason Ekstrand
Reviewed-by: Lionel Landwerlin 
---
 src/intel/compiler/brw_fs_nir.cpp | 12 +++-
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index e331637..35a9828 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3458,9 +3458,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder ,
* expected by our 32-bit write messages.
*/
   unsigned type_size = 4;
-  unsigned bit_size = instr->src[0].is_ssa ?
- instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
-  if (bit_size == 64) {
+  if (nir_src_bit_size(instr->src[0]) == 64) {
  type_size = 8;
  fs_reg tmp =
fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
@@ -3965,9 +3963,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
* expected by our 32-bit write messages.
*/
   unsigned type_size = 4;
-  unsigned bit_size = instr->src[0].is_ssa ?
- instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
-  if (bit_size == 64) {
+  if (nir_src_bit_size(instr->src[0]) == 64) {
  type_size = 8;
  fs_reg tmp =
fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
@@ -4032,9 +4028,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
 
   unsigned num_components = instr->num_components;
   unsigned first_component = nir_intrinsic_component(instr);
-  unsigned bit_size = instr->src[0].is_ssa ?
- instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
-  if (bit_size == 64) {
+  if (nir_src_bit_size(instr->src[0]) == 64) {
  fs_reg tmp =
 fs_reg(VGRF, alloc.allocate(2 * num_components),
BRW_REGISTER_TYPE_F);
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 15/52] i965/fs/nir: Don't stomp 64-bit values to D in get_nir_src

2017-10-12 Thread Jason Ekstrand
---
 src/intel/compiler/brw_fs_nir.cpp | 33 +
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 138d292..653d6d8 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -1441,11 +1441,19 @@ fs_visitor::get_nir_src(const nir_src )
src.reg.base_offset * src.reg.reg->num_components);
}
 
-   /* to avoid floating-point denorm flushing problems, set the type by
-* default to D - instructions that need floating point semantics will set
-* this to F if they need to
-*/
-   return retype(reg, BRW_REGISTER_TYPE_D);
+   if (nir_src_bit_size(src) == 64 && devinfo->gen == 7) {
+  /* The only 64-bit type available on gen7 is DF, so use that. */
+  reg.type = BRW_REGISTER_TYPE_DF;
+   } else {
+  /* To avoid floating-point denorm flushing problems, set the type by
+   * default to an integer type - instructions that need floating point
+   * semantics will set this to F if they need to
+   */
+  reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src),
+BRW_REGISTER_TYPE_D);
+   }
+
+   return reg;
 }
 
 /**
@@ -1455,6 +1463,10 @@ fs_reg
 fs_visitor::get_nir_src_imm(const nir_src )
 {
nir_const_value *val = nir_src_as_const_value(src);
+   /* This function shouldn't be called on anything which can even
+* possibly be 64 bits as it can't do what it claims.
+*/
+   assert(nir_src_bit_size(src) == 32);
return val ? fs_reg(brw_imm_d(val->i32[0])) : get_nir_src(src);
 }
 
@@ -2648,8 +2660,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder ,
 */
unsigned channel = iter * 2 + i;
fs_reg dest = shuffle_64bit_data_for_32bit_write(bld,
-  retype(offset(value, bld, 2 * channel), 
BRW_REGISTER_TYPE_DF),
-  1);
+  offset(value, bld, channel), 1);
 
srcs[header_regs + (i + first_component) * 2] = dest;
srcs[header_regs + (i + first_component) * 2 + 1] =
@@ -3505,8 +3516,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder ,
   if (nir_src_bit_size(instr->src[0]) == 64) {
  type_size = 8;
  val_reg = shuffle_64bit_data_for_32bit_write(bld,
-retype(val_reg, BRW_REGISTER_TYPE_DF),
-instr->num_components);
+val_reg, instr->num_components);
   }
 
   unsigned type_slots = type_size / 4;
@@ -4005,8 +4015,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   if (nir_src_bit_size(instr->src[0]) == 64) {
  type_size = 8;
  val_reg = shuffle_64bit_data_for_32bit_write(bld,
-retype(val_reg, BRW_REGISTER_TYPE_DF),
-instr->num_components);
+val_reg, instr->num_components);
   }
 
   unsigned type_slots = type_size / 4;
@@ -4063,7 +4072,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   unsigned first_component = nir_intrinsic_component(instr);
   if (nir_src_bit_size(instr->src[0]) == 64) {
  fs_reg tmp = shuffle_64bit_data_for_32bit_write(bld,
-retype(src, BRW_REGISTER_TYPE_DF), num_components);
+src, num_components);
  src = tmp;
  num_components *= 2;
   }
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 08/52] i965/fs: Extend the live ranges of VGRFs which leave loops

2017-10-12 Thread Jason Ekstrand
No Shader-db changes.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs_live_variables.cpp | 55 
 1 file changed, 55 insertions(+)

diff --git a/src/intel/compiler/brw_fs_live_variables.cpp 
b/src/intel/compiler/brw_fs_live_variables.cpp
index c449672..380060d 100644
--- a/src/intel/compiler/brw_fs_live_variables.cpp
+++ b/src/intel/compiler/brw_fs_live_variables.cpp
@@ -223,6 +223,61 @@ fs_live_variables::compute_start_end()
  }
   }
}
+
+   /* Due to the explicit way the SIMD data is handled on GEN, we need to be a
+* bit more careful with live ranges and loops.  Consider the following
+* example:
+*
+*vec4 color2;
+*while (1) {
+*   vec4 color = texture();
+*   if (...) {
+*  color2 = color * 2;
+*  break;
+*   }
+*}
+*gl_FragColor = color2;
+*
+* In this case, the definition of color2 dominates the use because the
+* loop only has the one exit.  This means that the live range interval for
+* color2 goes from the statement in the if to it's use below the loop.
+* Now suppose that the texture operation has a header register that gets
+* assigned one of the registers used for color2.  If the loop condition is
+* non-uniform and some of the threads will take the and others will
+* continue.  In this case, the next pass through the loop, the WE_all
+* setup of the header register will stomp the disabled channels of color2
+* and corrupt the value.
+*
+* This same problem can occur if you have a mix of 64, 32, and 16-bit
+* registers because the channels do not line up or if you have a SIMD16
+* program and the first half of one value overlaps the second half of the
+* other.
+*
+* To solve this problem, we take any VGRFs whose live ranges cross the
+* while instruction of a loop and extend their live ranges to the top of
+* the loop.  This more accurately models the hardware because the value in
+* the VGRF needs to be carried through subsequent loop iterations in order
+* to remain valid when we finally do break.
+*/
+   foreach_block (block, cfg) {
+  if (block->end()->opcode != BRW_OPCODE_WHILE)
+ continue;
+
+  /* This is a WHILE instrution. Find the DO block. */
+  bblock_t *do_block = NULL;
+  foreach_list_typed(bblock_link, child_link, link, >children) {
+ if (child_link->block->start_ip < block->end_ip) {
+assert(do_block == NULL);
+do_block = child_link->block;
+ }
+  }
+  assert(do_block);
+
+  for (int i = 0; i < num_vars; i++) {
+ if (start[i] < block->end_ip && end[i] > block->end_ip)
+start[i] = MIN2(start[i], do_block->start_ip);
+  }
+   }
 }
 
 fs_live_variables::fs_live_variables(fs_visitor *v, const cfg_t *cfg)
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 01/52] intel/fs: Pass builders instead of blocks into emit_[un]zip

2017-10-12 Thread Jason Ekstrand
This makes it far more explicit where we're inserting the instructions
rather than the magic "before and after" stuff that the emit_[un]zip
helpers did based on block and inst.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs.cpp | 50 ---
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 66cb331..dc29765 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5029,8 +5029,7 @@ needs_src_copy(const fs_builder , const fs_inst 
*inst, unsigned i)
  * will be emitted before the given \p inst in \p block.
  */
 static fs_reg
-emit_unzip(const fs_builder , bblock_t *block, fs_inst *inst,
-   unsigned i)
+emit_unzip(const fs_builder , fs_inst *inst, unsigned i)
 {
/* Specified channel group from the source region. */
const fs_reg src = horiz_offset(inst->src[i], lbld.group());
@@ -5045,8 +5044,7 @@ emit_unzip(const fs_builder , bblock_t *block, 
fs_inst *inst,
   const fs_reg tmp = lbld.vgrf(inst->src[i].type, 
inst->components_read(i));
 
   for (unsigned k = 0; k < inst->components_read(i); ++k)
- cbld.at(block, inst)
- .MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
+ cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
 
   return tmp;
 
@@ -5116,36 +5114,43 @@ needs_dst_copy(const fs_builder , const fs_inst 
*inst)
  * be emitted around the given \p inst in \p block.
  */
 static fs_reg
-emit_zip(const fs_builder , bblock_t *block, fs_inst *inst)
+emit_zip(const fs_builder _before, const fs_builder _after,
+ fs_inst *inst)
 {
-   /* Builder of the right width to perform the copy avoiding uninitialized
-* data if the lowered execution size is greater than the original
-* execution size of the instruction.
-*/
-   const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
-   inst->exec_size), 0);
+   assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
+   assert(lbld_before.group() == lbld_after.group());
 
/* Specified channel group from the destination region. */
-   const fs_reg dst = horiz_offset(inst->dst, lbld.group());
+   const fs_reg dst = horiz_offset(inst->dst, lbld_after.group());
const unsigned dst_size = inst->size_written /
   inst->dst.component_size(inst->exec_size);
 
-   if (needs_dst_copy(lbld, inst)) {
-  const fs_reg tmp = lbld.vgrf(inst->dst.type, dst_size);
+   if (needs_dst_copy(lbld_after, inst)) {
+  const fs_reg tmp = lbld_after.vgrf(inst->dst.type, dst_size);
 
   if (inst->predicate) {
  /* Handle predication by copying the original contents of
   * the destination into the temporary before emitting the
   * lowered instruction.
   */
- for (unsigned k = 0; k < dst_size; ++k)
-cbld.at(block, inst)
-.MOV(offset(tmp, lbld, k), offset(dst, inst->exec_size, k));
+ for (unsigned k = 0; k < dst_size; ++k) {
+lbld_before.group(MIN2(lbld_before.dispatch_width(),
+   inst->exec_size), 0)
+   .MOV(offset(tmp, lbld_before, k),
+offset(dst, inst->exec_size, k));
+ }
   }
 
-  for (unsigned k = 0; k < dst_size; ++k)
- cbld.at(block, inst->next)
- .MOV(offset(dst, inst->exec_size, k), offset(tmp, lbld, k));
+  for (unsigned k = 0; k < dst_size; ++k) {
+ /* Use a builder of the right width to perform the copy avoiding
+  * uninitialized data if the lowered execution size is greater than
+  * the original execution size of the instruction.
+  */
+ lbld_after.group(MIN2(lbld_after.dispatch_width(),
+   inst->exec_size), 0)
+   .MOV(offset(dst, inst->exec_size, k),
+offset(tmp, lbld_after, k));
+  }
 
   return tmp;
 
@@ -5201,9 +5206,10 @@ fs_visitor::lower_simd_width()
 const fs_builder lbld = ibld.group(lower_width, i);
 
 for (unsigned j = 0; j < inst->sources; j++)
-   split_inst.src[j] = emit_unzip(lbld, block, inst, j);
+   split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);
 
-split_inst.dst = emit_zip(lbld, block, inst);
+split_inst.dst = emit_zip(lbld.at(block, inst),
+  lbld.at(block, inst->next), inst);
 split_inst.size_written =
split_inst.dst.component_size(lower_width) * dst_size;
 
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 06/52] intel/fs: Use an explicit D type for vote any/all/eq intrinsics

2017-10-12 Thread Jason Ekstrand
They return a boolean so this is the right type.  Unfortunately,
get_nir_dest has the annoying behavior of giving us a float type by
default.  This is mostly to work around the fact that gen7 has 64-bit
float but no Q types.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs_nir.cpp | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index ffb2d6a..3d9edf7 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4154,6 +4154,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
  ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
   }
   bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), 
BRW_CONDITIONAL_NZ);
+
+  dest.type = BRW_REGISTER_TYPE_D;
   bld.MOV(dest, brw_imm_d(-1));
   set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
@@ -4176,6 +4178,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
  ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0x));
   }
   bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), 
BRW_CONDITIONAL_NZ);
+
+  dest.type = BRW_REGISTER_TYPE_D;
   bld.MOV(dest, brw_imm_d(-1));
   set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
@@ -4200,6 +4204,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
  ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0x));
   }
   bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
+
+  dest.type = BRW_REGISTER_TYPE_D;
   bld.MOV(dest, brw_imm_d(-1));
   set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 11/52] i965/fs: Add brw_reg_type_from_bit_size utility method

2017-10-12 Thread Jason Ekstrand
From: Alejandro Piñeiro 

Returns the brw_type for a given ssa.bit_size, and a reference type.
So if bit_size is 64, and the reference type is BRW_REGISTER_TYPE_F,
it returns BRW_REGISTER_TYPE_DF. The same applies if bit_size is 32
and reference type is BRW_REGISTER_TYPE_HF it returns BRW_REGISTER_TYPE_F

v2 (Jason Ekstrand):
 - Use better unreachable() messages
 - Add Q types

Signed-off-by: Jose Maria Casanova Crespo 
Signed-off-by: Alejandro Piñeiro 
---
 src/intel/compiler/brw_fs_nir.cpp | 69 ---
 1 file changed, 64 insertions(+), 5 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 35a9828..58824ab 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -227,6 +227,65 @@ fs_visitor::nir_emit_system_values()
}
 }
 
+/*
+ * Returns a type based on a reference_type (word, float, half-float) and a
+ * given bit_size.
+ *
+ * Reference BRW_REGISTER_TYPE are HF,F,DF,W,D,UW,UD.
+ *
+ * @FIXME: 64-bit return types are always DF on integer types to maintain
+ * compability with uses of DF previously to the introduction of int64
+ * support.
+ */
+static brw_reg_type
+brw_reg_type_from_bit_size(const unsigned bit_size,
+   const brw_reg_type reference_type)
+{
+   switch(reference_type) {
+   case BRW_REGISTER_TYPE_HF:
+   case BRW_REGISTER_TYPE_F:
+   case BRW_REGISTER_TYPE_DF:
+  switch(bit_size) {
+  case 16:
+ return BRW_REGISTER_TYPE_HF;
+  case 32:
+ return BRW_REGISTER_TYPE_F;
+  case 64:
+ return BRW_REGISTER_TYPE_DF;
+  default:
+ unreachable("Invalid bit size");
+  }
+   case BRW_REGISTER_TYPE_W:
+   case BRW_REGISTER_TYPE_D:
+   case BRW_REGISTER_TYPE_Q:
+  switch(bit_size) {
+  case 16:
+ return BRW_REGISTER_TYPE_W;
+  case 32:
+ return BRW_REGISTER_TYPE_D;
+  case 64:
+ return BRW_REGISTER_TYPE_DF;
+  default:
+ unreachable("Invalid bit size");
+  }
+   case BRW_REGISTER_TYPE_UW:
+   case BRW_REGISTER_TYPE_UD:
+   case BRW_REGISTER_TYPE_UQ:
+  switch(bit_size) {
+  case 16:
+ return BRW_REGISTER_TYPE_UW;
+  case 32:
+ return BRW_REGISTER_TYPE_UD;
+  case 64:
+ return BRW_REGISTER_TYPE_DF;
+  default:
+ unreachable("Invalid bit size");
+  }
+   default:
+  unreachable("Unknown type");
+   }
+}
+
 void
 fs_visitor::nir_emit_impl(nir_function_impl *impl)
 {
@@ -240,7 +299,7 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl)
  reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
   unsigned size = array_elems * reg->num_components;
   const brw_reg_type reg_type =
- reg->bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
+ brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
   nir_locals[reg->index] = bld.vgrf(reg_type, size);
}
 
@@ -1341,7 +1400,7 @@ fs_visitor::nir_emit_load_const(const fs_builder ,
 nir_load_const_instr *instr)
 {
const brw_reg_type reg_type =
-  instr->def.bit_size == 32 ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
+  brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D);
fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
 
switch (instr->def.bit_size) {
@@ -1369,8 +1428,8 @@ fs_visitor::get_nir_src(const nir_src )
fs_reg reg;
if (src.is_ssa) {
   if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
- const brw_reg_type reg_type = src.ssa->bit_size == 32 ?
-BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
+ const brw_reg_type reg_type =
+brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D);
  reg = bld.vgrf(reg_type, src.ssa->num_components);
   } else {
  reg = nir_ssa_values[src.ssa->index];
@@ -1404,7 +1463,7 @@ fs_visitor::get_nir_dest(const nir_dest )
 {
if (dest.is_ssa) {
   const brw_reg_type reg_type =
- dest.ssa.bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
+ brw_reg_type_from_bit_size(dest.ssa.bit_size, BRW_REGISTER_TYPE_F);
   nir_ssa_values[dest.ssa.index] =
  bld.vgrf(reg_type, dest.ssa.num_components);
   return nir_ssa_values[dest.ssa.index];
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 07/52] intel/fs: Use a pair of 1-wide MOVs instead of SEL for any/all

2017-10-12 Thread Jason Ekstrand
For some reason, the any/all predicates don't work properly with SIMD32.
In particular, it appears that a SEL with a QtrCtrl of 2H doesn't read
the correct subset of the flag register and you end up getting garbage
in the second half.  Work around this by using a pair of 1-wide MOVs and
scattering the result.  This fixes the any/all instructions for SIMD32.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs_nir.cpp | 42 ++-
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 3d9edf7..e331637 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4155,12 +4155,20 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   }
   bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), 
BRW_CONDITIONAL_NZ);
 
-  dest.type = BRW_REGISTER_TYPE_D;
-  bld.MOV(dest, brw_imm_d(-1));
+  /* For some reason, the any/all predicates don't work properly with
+   * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
+   * doesn't read the correct subset of the flag register and you end up
+   * getting garbage in the second half.  Work around this by using a pair
+   * of 1-wide MOVs and scattering the result.
+   */
+  fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
+  ubld.MOV(res1, brw_imm_d(0));
   set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
BRW_PREDICATE_ALIGN1_ANY32H,
-bld.SEL(dest, dest, brw_imm_d(0)));
+ubld.MOV(res1, brw_imm_d(-1)));
+
+  bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
   break;
}
case nir_intrinsic_vote_all: {
@@ -4179,12 +4187,20 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   }
   bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), 
BRW_CONDITIONAL_NZ);
 
-  dest.type = BRW_REGISTER_TYPE_D;
-  bld.MOV(dest, brw_imm_d(-1));
+  /* For some reason, the any/all predicates don't work properly with
+   * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
+   * doesn't read the correct subset of the flag register and you end up
+   * getting garbage in the second half.  Work around this by using a pair
+   * of 1-wide MOVs and scattering the result.
+   */
+  fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
+  ubld.MOV(res1, brw_imm_d(0));
   set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
BRW_PREDICATE_ALIGN1_ALL32H,
-bld.SEL(dest, dest, brw_imm_d(0)));
+ubld.MOV(res1, brw_imm_d(-1)));
+
+  bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
   break;
}
case nir_intrinsic_vote_eq: {
@@ -4205,12 +4221,20 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   }
   bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
 
-  dest.type = BRW_REGISTER_TYPE_D;
-  bld.MOV(dest, brw_imm_d(-1));
+  /* For some reason, the any/all predicates don't work properly with
+   * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
+   * doesn't read the correct subset of the flag register and you end up
+   * getting garbage in the second half.  Work around this by using a pair
+   * of 1-wide MOVs and scattering the result.
+   */
+  fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
+  ubld.MOV(res1, brw_imm_d(0));
   set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
BRW_PREDICATE_ALIGN1_ALL32H,
-bld.SEL(dest, dest, brw_imm_d(0)));
+ubld.MOV(res1, brw_imm_d(-1)));
+
+  bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
   break;
}
 
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 03/52] intel/fs: Handle flag read/write aliasing in needs_src_copy

2017-10-12 Thread Jason Ekstrand
In order to implement the ballot intrinsic, we do a MOV from flag
register to some GRF.  If that GRF is used in a SEL, cmod propagation
helpfully changes it into a MOV from the flag register with a cmod.
This is perfectly valid but when lower_simd_width comes along, it simply
splits into two instructions which both have conditional modifiers.
This is a problem since we're reading the flag register.  This commit
makes us check whether or not flags_written() overlaps with the flag
values that we are reading via the instruction source and, if we have
any interference, will force us to emit a copy of the source.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 0eebc70..c72ed17 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5019,7 +5019,9 @@ needs_src_copy(const fs_builder , const fs_inst 
*inst, unsigned i)
 {
return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
 (inst->components_read(i) == 1 &&
- lbld.dispatch_width() <= inst->exec_size));
+ lbld.dispatch_width() <= inst->exec_size)) ||
+  (inst->flags_written() &
+   flag_mask(inst->src[i], type_sz(inst->src[i].type)));
 }
 
 /**
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 12/52] i965/fs/nir: Simplify 64-bit store_output

2017-10-12 Thread Jason Ekstrand
The swizzles weren't doing any good because swiz is just XYZW.  Also, we
were emitting an extra set of MOVs because shuffle_64bit_data_for_32bit
already does a MOV for us.  Finally, the temporary was only ever used
inside the inner loop so there's no need for it to actually be an array.
---
 src/intel/compiler/brw_fs_nir.cpp | 25 ++---
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 58824ab..accfafb 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2568,7 +2568,6 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder ,
  instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
   fs_reg indirect_offset = get_indirect_offset(instr);
   unsigned imm_offset = instr->const_index[0];
-  unsigned swiz = BRW_SWIZZLE_XYZW;
   unsigned mask = instr->const_index[1];
   unsigned header_regs = 0;
   fs_reg srcs[7];
@@ -2598,13 +2597,6 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder ,
  }
   }
 
-  /* 64-bit data needs to me shuffled before we can write it to the URB.
-   * We will use this temporary to shuffle the components in each
-   * iteration.
-   */
-  fs_reg tmp =
- fs_reg(VGRF, alloc.allocate(2 * iter_components), value.type);
-
   mask = mask << first_component;
 
   for (unsigned iter = 0; iter < num_iterations; iter++) {
@@ -2648,26 +2640,21 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder 
,
continue;
 
 if (!is_64bit) {
-   srcs[header_regs + i + first_component] =
-  offset(value, bld, BRW_GET_SWZ(swiz, i));
+   srcs[header_regs + i + first_component] = offset(value, bld, i);
 } else {
/* We need to shuffle the 64-bit data to match the layout
 * expected by our 32-bit URB write messages. We use a temporary
 * for that.
 */
-   unsigned channel = BRW_GET_SWZ(swiz, iter * 2 + i);
+   fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type);
+   unsigned channel = iter * 2 + i;
shuffle_64bit_data_for_32bit_write(bld,
-  retype(offset(tmp, bld, 2 * i), BRW_REGISTER_TYPE_F),
+  retype(dest, BRW_REGISTER_TYPE_F),
   retype(offset(value, bld, 2 * channel), 
BRW_REGISTER_TYPE_DF),
   1);
 
-   /* Now copy the data to the destination */
-   fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type);
-   unsigned idx = 2 * i;
-   bld.MOV(dest, offset(tmp, bld, idx));
-   bld.MOV(offset(dest, bld, 1), offset(tmp, bld, idx + 1));
-   srcs[header_regs + idx + first_component * 2] = dest;
-   srcs[header_regs + idx + 1 + first_component * 2] =
+   srcs[header_regs + (i + first_component) * 2] = dest;
+   srcs[header_regs + (i + first_component) * 2 + 1] =
   offset(dest, bld, 1);
 }
  }
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 02/52] intel/fs: Be more explicit about our placement of [un]zip

2017-10-12 Thread Jason Ekstrand
Before, we were careful to place the zip after the last of the split
instructions but did unzip on-demand.  This changes things so that the
unzips go before all of the split instructions and the unzip comes
explicitly after all the split instructions.  As a side-effect of this
change, we now emit the split instruction from highest SIMD group to
lowest instead of low to high.  We could have kept the old behavior, but
it shouldn't matter and this made the code easier.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs.cpp | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index dc29765..0eebc70 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5190,6 +5190,7 @@ fs_visitor::lower_simd_width()
 
  assert(!inst->writes_accumulator && !inst->mlen);
 
+ exec_node * const after_inst = inst->next;
  for (unsigned i = 0; i < n; i++) {
 /* Emit a copy of the original instruction with the lowered width.
  * If the EOT flag was set throw it away except for the last
@@ -5197,7 +5198,7 @@ fs_visitor::lower_simd_width()
  */
 fs_inst split_inst = *inst;
 split_inst.exec_size = lower_width;
-split_inst.eot = inst->eot && i == n - 1;
+split_inst.eot = inst->eot && i == 0;
 
 /* Select the correct channel enables for the i-th group, then
  * transform the sources and destination and emit the lowered
@@ -5209,11 +5210,11 @@ fs_visitor::lower_simd_width()
split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);
 
 split_inst.dst = emit_zip(lbld.at(block, inst),
-  lbld.at(block, inst->next), inst);
+  lbld.at(block, after_inst), inst);
 split_inst.size_written =
split_inst.dst.component_size(lower_width) * dst_size;
 
-lbld.emit(split_inst);
+lbld.at(block, inst->next).emit(split_inst);
  }
 
  inst->remove(block);
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 05/52] intel/fs: Don't stomp f0.1 in SIMD16 ballot

2017-10-12 Thread Jason Ekstrand
In fragment shaders f0.1 is used for discards so doing ballot after a
discard can potentially cause the discard to not happen.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs_nir.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 513ff3e..ffb2d6a 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4211,8 +4211,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
case nir_intrinsic_ballot: {
   const fs_reg value = retype(get_nir_src(instr->src[0]),
   BRW_REGISTER_TYPE_UD);
-  const struct brw_reg flag = retype(brw_flag_reg(0, 0),
- BRW_REGISTER_TYPE_UD);
+  struct brw_reg flag = brw_flag_reg(0, 0);
+  if (dispatch_width == 32)
+ flag.type = BRW_REGISTER_TYPE_UD;
 
   bld.exec_all().MOV(flag, brw_imm_ud(0u));
   bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 04/52] intel/fs: Use ANY/ALL32 predicates in SIMD32

2017-10-12 Thread Jason Ekstrand
We have ANY/ALL32 predicates and, for the most part, they work just
fine.  (See the next commit for more details.)  Also, due to the way
that flag registers are handled in hardware, instruction splitting is
able to split the CMP correctly.  Specifically, that hardware looks at
the execution group and knows to shift it's flag usage up correctly so a
2H instruction will write to f0.1 instead of f0.0.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs_nir.cpp | 42 ---
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 7ed44f5..513ff3e 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4146,12 +4146,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
* dead channels from affecting the result, we initialize the flag with
* with the identity value for the logical operation.
*/
-  ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
+  if (dispatch_width == 32) {
+ /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
+ ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(0));
+  } else {
+ ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
+  }
   bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), 
BRW_CONDITIONAL_NZ);
   bld.MOV(dest, brw_imm_d(-1));
-  set_predicate(dispatch_width == 8 ?
-BRW_PREDICATE_ALIGN1_ANY8H :
-BRW_PREDICATE_ALIGN1_ANY16H,
+  set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
+dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
+   BRW_PREDICATE_ALIGN1_ANY32H,
 bld.SEL(dest, dest, brw_imm_d(0)));
   break;
}
@@ -4162,12 +4168,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
* dead channels from affecting the result, we initialize the flag with
* with the identity value for the logical operation.
*/
-  ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0x));
+  if (dispatch_width == 32) {
+ /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
+ ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(0x));
+  } else {
+ ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0x));
+  }
   bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), 
BRW_CONDITIONAL_NZ);
   bld.MOV(dest, brw_imm_d(-1));
-  set_predicate(dispatch_width == 8 ?
-BRW_PREDICATE_ALIGN1_ALL8H :
-BRW_PREDICATE_ALIGN1_ALL16H,
+  set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
+dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
+   BRW_PREDICATE_ALIGN1_ALL32H,
 bld.SEL(dest, dest, brw_imm_d(0)));
   break;
}
@@ -4180,12 +4192,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
* dead channels from affecting the result, we initialize the flag with
* with the identity value for the logical operation.
*/
-  ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0x));
+  if (dispatch_width == 32) {
+ /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
+ ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(0x));
+  } else {
+ ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0x));
+  }
   bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
   bld.MOV(dest, brw_imm_d(-1));
-  set_predicate(dispatch_width == 8 ?
-BRW_PREDICATE_ALIGN1_ALL8H :
-BRW_PREDICATE_ALIGN1_ALL16H,
+  set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
+dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
+   BRW_PREDICATE_ALIGN1_ALL32H,
 bld.SEL(dest, dest, brw_imm_d(0)));
   break;
}
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 09/52] intel/fs: Restrict live intervals to the subset possibly reachable from any definition.

2017-10-12 Thread Jason Ekstrand
From: Francisco Jerez 

Currently the liveness analysis pass would extend a live interval up
to the top of the program when no unconditional and complete
definition of the variable is found that dominates all of its uses.

This can lead to a serious performance problem in shaders containing
many partial writes, like scalar arithmetic, FP64 and soon FP16
operations.  The number of oversize live intervals in such workloads
can cause the compilation time of the shader to explode because of the
worse than quadratic behavior of the register allocator and scheduler
when running out of registers, and it can also cause the running time
of the shader to explode due to the amount of spilling it leads to,
which is orders of magnitude slower than GRF memory.

This patch fixes it by computing the intersection of our current live
intervals with the subset of the program that can possibly be reached
from any definition of the variable.  Extending the storage allocation
of the variable beyond that is pretty useless because its value is
guaranteed to be undefined at a point that cannot be reached from any
definition.

No significant change in the running time of shader-db (with 5%
statistical significance).

shader-db results on IVB:

  total cycles in shared programs: 61108780 -> 60932856 (-0.29%)
  cycles in affected programs: 16335482 -> 16159558 (-1.08%)
  helped: 5121
  HURT: 4347

  total spills in shared programs: 1309 -> 1288 (-1.60%)
  spills in affected programs: 249 -> 228 (-8.43%)
  helped: 3
  HURT: 0

  total fills in shared programs: 1652 -> 1597 (-3.33%)
  fills in affected programs: 262 -> 207 (-20.99%)
  helped: 4
  HURT: 0

  LOST:   2
  GAINED: 209

shader-db results on BDW:

  total cycles in shared programs: 67617262 -> 67361220 (-0.38%)
  cycles in affected programs: 23397142 -> 23141100 (-1.09%)
  helped: 8045
  HURT: 6488

  total spills in shared programs: 1456 -> 1252 (-14.01%)
  spills in affected programs: 465 -> 261 (-43.87%)
  helped: 3
  HURT: 0

  total fills in shared programs: 1720 -> 1465 (-14.83%)
  fills in affected programs: 471 -> 216 (-54.14%)
  helped: 4
  HURT: 0

  LOST:   2
  GAINED: 162

shader-db results on SKL:

  total cycles in shared programs: 65436248 -> 65245186 (-0.29%)
  cycles in affected programs: 22560936 -> 22369874 (-0.85%)
  helped: 8457
  HURT: 6247

  total spills in shared programs: 437 -> 437 (0.00%)
  spills in affected programs: 0 -> 0
  helped: 0
  HURT: 0

  total fills in shared programs: 870 -> 854 (-1.84%)
  fills in affected programs: 16 -> 0
  helped: 1
  HURT: 0

  LOST:   0
  GAINED: 107
Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs_live_variables.cpp | 34 
 src/intel/compiler/brw_fs_live_variables.h   | 12 ++
 2 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs_live_variables.cpp 
b/src/intel/compiler/brw_fs_live_variables.cpp
index 380060d..6330cff 100644
--- a/src/intel/compiler/brw_fs_live_variables.cpp
+++ b/src/intel/compiler/brw_fs_live_variables.cpp
@@ -83,9 +83,11 @@ fs_live_variables::setup_one_write(struct block_data *bd, 
fs_inst *inst,
/* The def[] bitset marks when an initialization in a block completely
 * screens off previous updates of that variable (VGRF channel).
 */
-   if (inst->dst.file == VGRF && !inst->is_partial_write()) {
-  if (!BITSET_TEST(bd->use, var))
+   if (inst->dst.file == VGRF) {
+  if (!inst->is_partial_write() && !BITSET_TEST(bd->use, var))
  BITSET_SET(bd->def, var);
+
+  BITSET_SET(bd->defout, var);
}
 }
 
@@ -199,6 +201,28 @@ fs_live_variables::compute_live_variables()
  }
   }
}
+
+   /* Propagate defin and defout down the CFG to calculate the union of live
+* variables potentially defined along any possible control flow path.
+*/
+   do {
+  cont = false;
+
+  foreach_block (block, cfg) {
+ const struct block_data *bd = _data[block->num];
+
+foreach_list_typed(bblock_link, child_link, link, >children) {
+struct block_data *child_bd = _data[child_link->block->num];
+
+   for (int i = 0; i < bitset_words; i++) {
+   const BITSET_WORD new_def = bd->defout[i] & ~child_bd->defin[i];
+   child_bd->defin[i] |= new_def;
+   child_bd->defout[i] |= new_def;
+   cont |= new_def;
+   }
+}
+  }
+   } while (cont);
 }
 
 /**
@@ -212,12 +236,12 @@ fs_live_variables::compute_start_end()
   struct block_data *bd = _data[block->num];
 
   for (int i = 0; i < num_vars; i++) {
- if (BITSET_TEST(bd->livein, i)) {
+ if (BITSET_TEST(bd->livein, i) && BITSET_TEST(bd->defin, i)) {
 start[i] = MIN2(start[i], block->start_ip);
 end[i] = MAX2(end[i], block->start_ip);
  }
 
- if (BITSET_TEST(bd->liveout, i)) {
+ if (BITSET_TEST(bd->liveout, i) && 

[Mesa-dev] [PATCH v2 00/52] nir, intel: Prerequisites for subgroups

2017-10-12 Thread Jason Ekstrand
A little over a month ago, I sent a 44 patch series with a bunch of the
prerequisite patches for implementing SPIR-V subgroup support.  This is a
re-spin of that series with a few more patches.  Most of the new fixes are
either because of rebasing on top of my uniform reworks or are fixes for
SIMD32.  As of now, I have all but 8 of the subgroups tests passing with
SIMD32 and those 8 appear to be issues with spilling but I'm not 100% sure.

Some of the patches in here overlap a bit with stuff that Connor did in his
series for radv.  In particular, I've taken a different approach which I
like better to sorting out uint64_t vs. uvec4 for ballot intrinsics.

Cc: Matt Turner 
Cc: Francisco Jerez 
Cc: Connor Abbott 

Alejandro Piñeiro (1):
  i965/fs: Add brw_reg_type_from_bit_size utility method

Francisco Jerez (1):
  intel/fs: Restrict live intervals to the subset possibly reachable
from any definition.

Jason Ekstrand (50):
  intel/fs: Pass builders instead of blocks into emit_[un]zip
  intel/fs: Be more explicit about our placement of [un]zip
  intel/fs: Handle flag read/write aliasing in needs_src_copy
  intel/fs: Use ANY/ALL32 predicates in SIMD32
  intel/fs: Don't stomp f0.1 in SIMD16 ballot
  intel/fs: Use an explicit D type for vote any/all/eq intrinsics
  intel/fs: Use a pair of 1-wide MOVs instead of SEL for any/all
  i965/fs: Extend the live ranges of VGRFs which leave loops
  i965/fs/nir: Use the nir_src_bit_size helper
  i965/fs/nir: Simplify 64-bit store_output
  i965/fs: Return a fs_reg from shuffle_64bit_data_for_32bit_write
  i965/fs/nir: Minor refactor of store_output
  i965/fs/nir: Don't stomp 64-bit values to D in get_nir_src
  intel/fs: Protect opt_algebraic from OOB BROADCAST indices
  intel/fs: Uniformize the index in readInvocation
  intel/fs: Retype dest to match value in read[First]Invocation
  intel/fs: Assign constant locations if they haven't been assigned
  intel/fs: Remove min_dispatch_width from fs_visitor
  intel/cs: Drop min_dispatch_width checks from compile_cs
  intel/cs: Stop setting dispatch_grf_start_reg
  intel/cs: Ignore runtime_check_aads_emit for CS
  intel/fs: Mark 64-bit values as being contiguous
  intel/cs: Rework the way thread local ID is handled
  intel/cs: Re-run final NIR optimizations for each SIMD size
  intel/cs: Re-run final NIR optimizations for each SIMD size
  intel/cs: Push subgroup ID instead of base thread ID
  intel/compiler/fs: Set up subgroup invocation as a system value
  intel/fs: Rework zero-length URB write handling
  intel/eu: Use EXECUTE_1 for JMPI
  intel/eu: Make automatic exec sizes a configurable option
  intel/eu: Explicitly set EXECUTE_1 where needed
  intel/fs: Explicitly set EXECUTE_1 where needed
  intel/fs: Don't use automatic exec size inference
  anv/pipeline: Dump shader immedately after spirv_to_nir
  anv/pipeline: Drop nir_lower_clip_cull_distance_arrays
  anv/pipeline: Call nir_lower_system_valaues after brw_preprocess_nir
  nir/lower_wpos_ytransform: Support system value intrinsics
  i965/program: Move nir_lower_system_values higher up
  intel/compiler: Call nir_lower_system_values in brw_preprocess_nir
  nir/opt_intrinsics: Rework progress
  nir: Add a new subgroups lowering pass
  nir: Add a ssa_dest_init_for_type helper
  nir: Make ballot intrinsics variable-size
  nir/lower_system_values: Lower SUBGROUP_*_MASK based on type
  nir/lower_subgroups: Lower ballot intrinsics to the specified bit size
  nir,intel/compiler: Use a fixed subgroup size
  spirv: Add a vtn_constant_value helper
  spirv: Rework barriers
  nir: Validate base types on array dereferences
  compiler/nir_types: Handle vectors in glsl_get_array_element

 src/compiler/Makefile.sources  |   2 +-
 src/compiler/glsl/glsl_to_nir.cpp  |   1 +
 src/compiler/nir/nir.h |  25 +-
 src/compiler/nir/nir_intrinsics.h  |  13 +-
 .../nir/nir_lower_read_invocation_to_scalar.c  | 112 ---
 src/compiler/nir/nir_lower_subgroups.c | 257 
 src/compiler/nir/nir_lower_system_values.c |   4 +-
 src/compiler/nir/nir_lower_wpos_ytransform.c   |   4 +
 src/compiler/nir/nir_opt_intrinsics.c  |  83 +
 src/compiler/nir/nir_validate.c|  18 +-
 src/compiler/nir_types.cpp |   2 +
 src/compiler/spirv/spirv_to_nir.c  | 132 ++--
 src/compiler/spirv/vtn_private.h   |   6 +
 src/intel/compiler/brw_compiler.c  |   4 -
 src/intel/compiler/brw_compiler.h  |   3 +-
 src/intel/compiler/brw_eu.c|   1 +
 src/intel/compiler/brw_eu.h|  10 +
 src/intel/compiler/brw_eu_emit.c   |  43 ++-
 src/intel/compiler/brw_fs.cpp  | 246 +--
 src/intel/compiler/brw_fs.h|  15 +-
 

[Mesa-dev] [AppVeyor] mesa master #5788 failed

2017-10-12 Thread AppVeyor



Build mesa 5788 failed


Commit b8ab78d1af by Jason Ekstrand on 10/11/2017 7:13 PM:

anv/pipeline_cache: Rework to use multialloc and blob\n\nThis gets rid of all of our hand-rolled size calculation and\nserialization code and replaces it with safe "standards" that are used\nelsewhere in anv and mesa.  This should be significantly safer than\nrolling our own.\n\nReviewed-by: Jordan Justen 


Configure your notification preferences

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 08/11] compiler/blob: Make some parameters void instead of uint8_t

2017-10-12 Thread Jason Ekstrand
On Wed, Oct 11, 2017 at 11:26 PM, Nicolai Hähnle  wrote:

> I've sent some minor comments on patches #3, #5, and #6. Also, spot the
> typo in the title of patch #4 :)
>
> With those addressed, patches 1-8 are:
>
> Reviewed-by: Nicolai Hähnle 
>

Thanks!


> On 11.10.2017 22:38, Jason Ekstrand wrote:
>
>> There are certain advantages to using uint8_t internally such as
>> well-defined arithmetic on all platforms.  However, interfaces that
>> work in terms of raw data should use a void* type.
>> ---
>>   src/compiler/blob.c | 6 +++---
>>   src/compiler/blob.h | 4 ++--
>>   2 files changed, 5 insertions(+), 5 deletions(-)
>>
>> diff --git a/src/compiler/blob.c b/src/compiler/blob.c
>> index 4ebe94b..f523423 100644
>> --- a/src/compiler/blob.c
>> +++ b/src/compiler/blob.c
>> @@ -236,7 +236,7 @@ blob_write_string(struct blob *blob, const char *str)
>>   }
>> void
>> -blob_reader_init(struct blob_reader *blob, const uint8_t *data, size_t
>> size)
>> +blob_reader_init(struct blob_reader *blob, const void *data, size_t size)
>>   {
>>  blob->data = data;
>>  blob->end = blob->data + size;
>> @@ -278,9 +278,9 @@ blob_read_bytes(struct blob_reader *blob, size_t size)
>>   }
>> void
>> -blob_copy_bytes(struct blob_reader *blob, uint8_t *dest, size_t size)
>> +blob_copy_bytes(struct blob_reader *blob, void *dest, size_t size)
>>   {
>> -   const uint8_t *bytes;
>> +   const void *bytes;
>>bytes = blob_read_bytes(blob, size);
>>  if (bytes == NULL)
>> diff --git a/src/compiler/blob.h b/src/compiler/blob.h
>> index 547d49b..71ffcfe 100644
>> --- a/src/compiler/blob.h
>> +++ b/src/compiler/blob.h
>> @@ -269,7 +269,7 @@ blob_write_string(struct blob *blob, const char *str);
>>* current value is unchanged before and after the call.
>>*/
>>   void
>> -blob_reader_init(struct blob_reader *blob, const uint8_t *data, size_t
>> size);
>> +blob_reader_init(struct blob_reader *blob, const void *data, size_t
>> size);
>> /**
>>* Read some unstructured, fixed-size data from the current location,
>> (and
>> @@ -289,7 +289,7 @@ blob_read_bytes(struct blob_reader *blob, size_t
>> size);
>>* it to \dest (and update the current location to just past this data)
>>*/
>>   void
>> -blob_copy_bytes(struct blob_reader *blob, uint8_t *dest, size_t size);
>> +blob_copy_bytes(struct blob_reader *blob, void *dest, size_t size);
>> /**
>>* Read a uint32_t from the current location, (and update the current
>> location
>>
>>
>
> --
> Lerne, wie die Welt wirklich ist,
> Aber vergiss niemals, wie sie sein sollte.
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 03/11] compiler: Move blob up a level

2017-10-12 Thread Jason Ekstrand
On Thu, Oct 12, 2017 at 3:27 PM, Jordan Justen 
wrote:

> Didn't you have the same patch written by Connor in your nir-serialize
> series? :)
>

Yes. but I didn't figure "move some files" was worth cherry-picking.
Besides, mine now has meson fixes. :P

--Jason


> -Jordan
>
> On 2017-10-11 13:38:43, Jason Ekstrand wrote:
> > We're going to want to use the blob for Vulkan pipeline caching so it
> > makes sense to have it in libcompiler not libglsl.
> > ---
> >  src/compiler/Makefile.sources| 4 ++--
> >  src/compiler/{glsl => }/blob.c   | 0
> >  src/compiler/{glsl => }/blob.h   | 0
> >  src/mesa/state_tracker/st_shader_cache.h | 2 +-
> >  4 files changed, 3 insertions(+), 3 deletions(-)
> >  rename src/compiler/{glsl => }/blob.c (100%)
> >  rename src/compiler/{glsl => }/blob.h (100%)
> >
> > diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.
> sources
> > index 36906f4..b500368 100644
> > --- a/src/compiler/Makefile.sources
> > +++ b/src/compiler/Makefile.sources
> > @@ -1,4 +1,6 @@
> >  LIBCOMPILER_FILES = \
> > +   blob.c \
> > +   blob.h \
> > builtin_type_macros.h \
> > glsl_types.cpp \
> > glsl_types.h \
> > @@ -17,8 +19,6 @@ LIBGLSL_FILES = \
> > glsl/ast_function.cpp \
> > glsl/ast_to_hir.cpp \
> > glsl/ast_type.cpp \
> > -   glsl/blob.c \
> > -   glsl/blob.h \
> > glsl/builtin_functions.cpp \
> > glsl/builtin_functions.h \
> > glsl/builtin_int64.h \
> > diff --git a/src/compiler/glsl/blob.c b/src/compiler/blob.c
> > similarity index 100%
> > rename from src/compiler/glsl/blob.c
> > rename to src/compiler/blob.c
> > diff --git a/src/compiler/glsl/blob.h b/src/compiler/blob.h
> > similarity index 100%
> > rename from src/compiler/glsl/blob.h
> > rename to src/compiler/blob.h
> > diff --git a/src/mesa/state_tracker/st_shader_cache.h
> b/src/mesa/state_tracker/st_shader_cache.h
> > index f9e4615..090d7d8 100644
> > --- a/src/mesa/state_tracker/st_shader_cache.h
> > +++ b/src/mesa/state_tracker/st_shader_cache.h
> > @@ -22,7 +22,7 @@
> >   */
> >
> >  #include "st_context.h"
> > -#include "compiler/glsl/blob.h"
> > +#include "compiler/blob.h"
> >  #include "main/mtypes.h"
> >  #include "pipe/p_state.h"
> >  #include "util/disk_cache.h"
> > --
> > 2.5.0.400.gff86faf
> >
> > ___
> > mesa-dev mailing list
> > mesa-dev@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 03/11] compiler: Move blob up a level

2017-10-12 Thread Jason Ekstrand
On Wed, Oct 11, 2017 at 11:21 PM, Nicolai Hähnle  wrote:

> On 11.10.2017 22:38, Jason Ekstrand wrote:
>
>> We're going to want to use the blob for Vulkan pipeline caching so it
>> makes sense to have it in libcompiler not libglsl.
>> ---
>>   src/compiler/Makefile.sources| 4 ++--
>>   src/compiler/{glsl => }/blob.c   | 0
>>   src/compiler/{glsl => }/blob.h   | 0
>>   src/mesa/state_tracker/st_shader_cache.h | 2 +-
>>
>
> This needs to update the Meson build system as well.
>

Good catch!


> (That's why I complained about having duplicated source file lists...)
>

Dylan is working on that problem...


> Cheers,
> Nicolai
>
>
>   4 files changed, 3 insertions(+), 3 deletions(-)
>>   rename src/compiler/{glsl => }/blob.c (100%)
>>   rename src/compiler/{glsl => }/blob.h (100%)
>>
>> diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.source
>> s
>> index 36906f4..b500368 100644
>> --- a/src/compiler/Makefile.sources
>> +++ b/src/compiler/Makefile.sources
>> @@ -1,4 +1,6 @@
>>   LIBCOMPILER_FILES = \
>> +   blob.c \
>> +   blob.h \
>> builtin_type_macros.h \
>> glsl_types.cpp \
>> glsl_types.h \
>> @@ -17,8 +19,6 @@ LIBGLSL_FILES = \
>> glsl/ast_function.cpp \
>> glsl/ast_to_hir.cpp \
>> glsl/ast_type.cpp \
>> -   glsl/blob.c \
>> -   glsl/blob.h \
>> glsl/builtin_functions.cpp \
>> glsl/builtin_functions.h \
>> glsl/builtin_int64.h \
>> diff --git a/src/compiler/glsl/blob.c b/src/compiler/blob.c
>> similarity index 100%
>> rename from src/compiler/glsl/blob.c
>> rename to src/compiler/blob.c
>> diff --git a/src/compiler/glsl/blob.h b/src/compiler/blob.h
>> similarity index 100%
>> rename from src/compiler/glsl/blob.h
>> rename to src/compiler/blob.h
>> diff --git a/src/mesa/state_tracker/st_shader_cache.h
>> b/src/mesa/state_tracker/st_shader_cache.h
>> index f9e4615..090d7d8 100644
>> --- a/src/mesa/state_tracker/st_shader_cache.h
>> +++ b/src/mesa/state_tracker/st_shader_cache.h
>> @@ -22,7 +22,7 @@
>>*/
>> #include "st_context.h"
>> -#include "compiler/glsl/blob.h"
>> +#include "compiler/blob.h"
>>   #include "main/mtypes.h"
>>   #include "pipe/p_state.h"
>>   #include "util/disk_cache.h"
>>
>>
>
> --
> Lerne, wie die Welt wirklich ist,
> Aber vergiss niemals, wie sie sein sollte.
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 00/11] anv: Convert the pipeline cache to use blob

2017-10-12 Thread Jason Ekstrand
On Thu, Oct 12, 2017 at 5:52 PM, Jordan Justen 
wrote:

> I think my requests to split 2 patches should be pretty easy, so even
> with those changes:
>
> Series Reviewed-by: Jordan Justen 
>

Thanks!


> On 2017-10-11 13:38:40, Jason Ekstrand wrote:
> > I started trying to debug a random segfault in the pipeline cache that I
> > was seeing in some of the synchronization tests.  After taking a longer
> > look, It became obvious that doing so with the current implementation is
> > nuts.  Instead, we're much better off if we take advantage of the blob
> > structure and all of it's nice safety guarantees.
> >
> > Connor Abbott (1):
> >   compiler/blob: make blob_reserve_bytes() more useful
> >
> > Jason Ekstrand (10):
> >   glsl/blob: Return false from ensure_can_read on overrun
> >   glsl/blob: Return false from grow_to_fit if we've ever failed
> >   compiler: Move blob up a level
> >   compiler/blob: Switch to init/finsih instead of create/destroy
> >   compiler/blob: Add a concept of a fixed-allocation blob
> >   compiler/blob: Constify the reader
> >   compiler/blob: Make some parameters void instead of uint8_t
> >   anv/multialloc: Add new add_size helper
> >   anv/pipeline: Declare bind maps closer to their use
> >   anv/pipeline_cache: Rework to use multialloc and blob
> >
> >  src/compiler/Makefile.sources|   4 +-
> >  src/compiler/{glsl => }/blob.c   |  91 +++---
> >  src/compiler/{glsl => }/blob.h   | 104 ---
> >  src/compiler/glsl/shader_cache.cpp   |  39 ++--
> >  src/compiler/glsl/tests/blob_test.c  |   4 +-
> >  src/intel/vulkan/anv_pipeline.c  |  18 +-
> >  src/intel/vulkan/anv_pipeline_cache.c| 298
> +++
> >  src/intel/vulkan/anv_private.h   |   8 +-
> >  src/mesa/state_tracker/st_shader_cache.c |  23 +--
> >  src/mesa/state_tracker/st_shader_cache.h |   2 +-
> >  10 files changed, 333 insertions(+), 258 deletions(-)
> >  rename src/compiler/{glsl => }/blob.c (79%)
> >  rename src/compiler/{glsl => }/blob.h (76%)
> >
> > --
> > 2.5.0.400.gff86faf
> >
> > ___
> > mesa-dev mailing list
> > mesa-dev@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] mesa/bufferobj: fix atomic offset/size get

2017-10-12 Thread Tapani Pälli



On 10/12/2017 11:14 PM, Dave Airlie wrote:

On 12 October 2017 at 18:22, Tapani Pälli  wrote:



On 10/12/2017 11:14 AM, Dave Airlie wrote:



On 12 Oct. 2017 15:40, "Tapani Pälli" > wrote:



 On 10/12/2017 02:34 AM, Dave Airlie wrote:

 From: Dave Airlie >

 When I realigned the bufferobj code, I didn't see the getters
 were different, realign the getters to work the same as ssbo.


 Alternatively you could set these values as 0 in
 bind_buffer_base_atomic_buffer()? Not sure if it's any better but
 then value would match internally what it has been before these
changes.


Before these changes the ssbo and atomic code was gratuitously different,
this is just the last piece of making them consistent.



Right .. what I mean is that before the refactoring Size and Offset values
in the structure were stored as 0, now they are stored as -1 even though
here we return different value. I haven't checked if anything in Mesa would
assume 0 though .. so feel free to ignore my ramblings :) I just wanted to
note this because I tried to fix this and it following change fixes the bug
as well:



For atomics that is true, but for ssbo it was false. The idea of refactoring it
was to align all the code to be same for both, since there is no reason for
differences. Your change would reintroduce differences where none are needed.

My change to get.c aligns the gets
for
GL_ATOMIC_COUNTER_BUFFER_START
GL_ATOMIC_COUNTER_BUFFER_SIZE
with the ones above
for
GL_SHADER_STORAGE_BUFFER_START
GL_SHADER_STORAGE_BUFFER_SIZE

So the code is consistent across both types of buffer.



Right, got it! Thanks;

// Tapani
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 00/11] anv: Convert the pipeline cache to use blob

2017-10-12 Thread Jordan Justen
I think my requests to split 2 patches should be pretty easy, so even
with those changes:

Series Reviewed-by: Jordan Justen 

On 2017-10-11 13:38:40, Jason Ekstrand wrote:
> I started trying to debug a random segfault in the pipeline cache that I
> was seeing in some of the synchronization tests.  After taking a longer
> look, It became obvious that doing so with the current implementation is
> nuts.  Instead, we're much better off if we take advantage of the blob
> structure and all of it's nice safety guarantees.
> 
> Connor Abbott (1):
>   compiler/blob: make blob_reserve_bytes() more useful
> 
> Jason Ekstrand (10):
>   glsl/blob: Return false from ensure_can_read on overrun
>   glsl/blob: Return false from grow_to_fit if we've ever failed
>   compiler: Move blob up a level
>   compiler/blob: Switch to init/finsih instead of create/destroy
>   compiler/blob: Add a concept of a fixed-allocation blob
>   compiler/blob: Constify the reader
>   compiler/blob: Make some parameters void instead of uint8_t
>   anv/multialloc: Add new add_size helper
>   anv/pipeline: Declare bind maps closer to their use
>   anv/pipeline_cache: Rework to use multialloc and blob
> 
>  src/compiler/Makefile.sources|   4 +-
>  src/compiler/{glsl => }/blob.c   |  91 +++---
>  src/compiler/{glsl => }/blob.h   | 104 ---
>  src/compiler/glsl/shader_cache.cpp   |  39 ++--
>  src/compiler/glsl/tests/blob_test.c  |   4 +-
>  src/intel/vulkan/anv_pipeline.c  |  18 +-
>  src/intel/vulkan/anv_pipeline_cache.c| 298 
> +++
>  src/intel/vulkan/anv_private.h   |   8 +-
>  src/mesa/state_tracker/st_shader_cache.c |  23 +--
>  src/mesa/state_tracker/st_shader_cache.h |   2 +-
>  10 files changed, 333 insertions(+), 258 deletions(-)
>  rename src/compiler/{glsl => }/blob.c (79%)
>  rename src/compiler/{glsl => }/blob.h (76%)
> 
> -- 
> 2.5.0.400.gff86faf
> 
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] include: Revert out the update of the Khronos GLX extension header.

2017-10-12 Thread Mark Janes
Tested-by: Mark Janes 
Reviewed-by: Mark Janes 

I'd like to get this pushed right away, because Mesa CI is basically
offline due to the subsequent build failures.

Eric Anholt  writes:

> They made a mistake in the MESA_swap_control XML, which I'm pursuing in
> their github.  Until then, we can just back this piece out.
> ---
>  include/GL/glxext.h | 12 +---
>  1 file changed, 1 insertion(+), 11 deletions(-)
>
> diff --git a/include/GL/glxext.h b/include/GL/glxext.h
> index 8f6abab2d520..0f60a380c21b 100644
> --- a/include/GL/glxext.h
> +++ b/include/GL/glxext.h
> @@ -34,7 +34,7 @@ extern "C" {
>  **   https://github.com/KhronosGroup/OpenGL-Registry
>  */
>  
> -#define GLX_GLXEXT_VERSION 20170926
> +#define GLX_GLXEXT_VERSION 20170728
>  
>  /* Generated C header for:
>   * API: glx
> @@ -503,16 +503,6 @@ Bool glXSet3DfxModeMESA (int mode);
>  #endif
>  #endif /* GLX_MESA_set_3dfx_mode */
>  
> -#ifndef GLX_MESA_swap_control
> -#define GLX_MESA_swap_control 1
> -typedef int ( *PFNGLXGETSWAPINTERVALMESAPROC) (void);
> -typedef void ( *PFNGLXSWAPINTERVALMESAPROC) (unsigned int interval);
> -#ifdef GLX_GLXEXT_PROTOTYPES
> -int glXGetSwapIntervalMESA (void);
> -void glXSwapIntervalMESA (unsigned int interval);
> -#endif
> -#endif /* GLX_MESA_swap_control */
> -
>  #ifndef GLX_NV_copy_buffer
>  #define GLX_NV_copy_buffer 1
>  typedef void ( *PFNGLXCOPYBUFFERSUBDATANVPROC) (Display *dpy, GLXContext 
> readCtx, GLXContext writeCtx, GLenum readTarget, GLenum writeTarget, GLintptr 
> readOffset, GLintptr writeOffset, GLsizeiptr size);
> -- 
> 2.14.2
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] i965: Fix src0 vs src1 typo

2017-10-12 Thread Matt Turner
On Tue, Oct 10, 2017 at 4:43 AM, Eero Tamminen
 wrote:
> Hi,
>
> On 03.10.2017 08:20, Matt Turner wrote:
>>
>> A typo caused us to copy src0's reg file to src1 rather than reading
>> src1's as intended. This caused us to fail to compact instructions like
>>
>> mov(8)   g4<1>D0D  { align1 1Q };
>>
>> because src1 was set to immediate rather than architecture file. Fixing
>> this reenables compaction (after the precompact() pass changes the data
>> types):
>>
>> mov(8)   g4<1>UD   0xUD{ align1 1Q compacted };
>>
>> Fixes: 1cb0a7941b27 ("i965: Switch to using the logical register types")
>
>
> FYI: the original commit regressed SynMark v7 CSDof test performance by 1-2%
> on GEN9+, and this fixes that performance regression.

Very interesting! Thanks.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 00/21] intel/compiler: Rework the world of push/pull params

2017-10-12 Thread Kenneth Graunke
On Friday, October 6, 2017 10:27:02 AM PDT Jordan Justen wrote:
> Series Reviewed-by: Jordan Justen 
> 
> Although, I think you said you might rewrite patch 13 (the
> thread_local_id_index patch). If you just add the small stage check I
> mentioned then you can add my r-b for it.
> 
> -Jordan

Series is:
Reviewed-by: Kenneth Graunke 

though I'll admit that some of the patches in the middle are probably
more of an Ack than a review...


signature.asc
Description: This is a digitally signed message part.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 15/21] anv/pipeline: Whack nir->num_uniforms to MAX_PUSH_CONSTANT_SIZE

2017-10-12 Thread Kenneth Graunke
On Friday, September 29, 2017 2:25:15 PM PDT Jason Ekstrand wrote:
> This way any image uniforms end up having locations higher than
> MAX_PUSH_CONSTANT_SIZE.  There's no bug here at the moment, but this
> consistency will make the next commit easier.  Also, because
> nir_apply_pipeline_layout properly increments nir->num_uniforms when
> it expands the param array, we no longer need to stomp it to match
> prog_data::nr_params because it already does.
> ---
>  src/intel/vulkan/anv_pipeline.c | 6 ++
>  1 file changed, 2 insertions(+), 4 deletions(-)
> 
> diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
> index 191ae55..691cdf8 100644
> --- a/src/intel/vulkan/anv_pipeline.c
> +++ b/src/intel/vulkan/anv_pipeline.c
> @@ -399,6 +399,7 @@ anv_pipeline_compile(struct anv_pipeline *pipeline,
> * them the maximum possible number
> */
>assert(nir->num_uniforms <= MAX_PUSH_CONSTANTS_SIZE);
> +  nir->num_uniforms = MAX_PUSH_CONSTANTS_SIZE;
>prog_data->nr_params += MAX_PUSH_CONSTANTS_SIZE / sizeof(float);
> }
>  
> @@ -431,10 +432,7 @@ anv_pipeline_compile(struct anv_pipeline *pipeline,
> if (pipeline->layout)
>anv_nir_apply_pipeline_layout(pipeline, nir, prog_data, map);
>  
> -   /* nir_lower_io will only handle the push constants; we need to set this
> -* to the full number of possible uniforms.
> -*/
> -   nir->num_uniforms = prog_data->nr_params * 4;
> +   assert(nir->num_uniforms == prog_data->nr_params * 4);
>  
> return nir;
>  }
> 

I don't like this.  There are a bunch of places in the compiler that
assume that nir->num_uniforms / 4 is the number of uniforms...which
is already pretty bogus, to be honest, but...now it's like we've given
up all pretense of having units or having a meaningful value here...

Maybe we should just get rid of it altogether...


signature.asc
Description: This is a digitally signed message part.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/3] meta: Unset the textures_used_by_txf bitfield.

2017-10-12 Thread Jason Ekstrand
On Wed, Oct 11, 2017 at 12:15 PM, Kenneth Graunke 
wrote:

> Drivers that use Meta are happily using blitting data using texelFetch
> and GL_SKIP_DECODE_EXT, but the GL_EXT_texture_sRGB spec unfortunately
> makes GL_SKIP_DECODE_EXT not necessarily work with texelFetch.
>
> As a hack, just unset the texture_used_by_txf bitfield so we can
> continue with the old desired behavior.
> ---
>  src/mesa/drivers/common/meta.c | 12 
>  1 file changed, 12 insertions(+)
>
> diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/
> meta.c
> index 73143842485..658a62885bd 100644
> --- a/src/mesa/drivers/common/meta.c
> +++ b/src/mesa/drivers/common/meta.c
> @@ -87,6 +87,7 @@
>  #include "main/glformats.h"
>  #include "util/bitscan.h"
>  #include "util/ralloc.h"
> +#include "compiler/nir/nir.h"
>
>  /** Return offset in bytes of the field within a vertex struct */
>  #define OFFSET(FIELD) ((void *) offsetof(struct vertex, FIELD))
> @@ -195,6 +196,17 @@ _mesa_meta_compile_and_link_program(struct
> gl_context *ctx,
>
> _mesa_meta_link_program_with_debug(ctx, sh_prog);
>
> +   struct gl_program *fp =
> +  sh_prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
> +
> +   /* texelFetch() can break GL_SKIP_DECODE_EXT, but many meta passes want
> +* to use both together; pretend that we're not using texelFetch to
> hack
> +* around this bad interaction.
> +*/
> +   fp->info.textures_used_by_txf = 0;
> +   if (fp->nir)
> +  fp->nir->info.textures_used_by_txf = 0;
> +
>

This is garbage but also, it's meta, so I'm not that inclined to care.  The
biggest problem I see here is that it will break if we ever switching
things to run nir_gather_info post-linking.  I doubt we will, but it might
be worth a small addition to the above comment.  With that, all three are

Reviewed-by: Jason Ekstrand 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 06/11] compiler/blob: make blob_reserve_bytes() more useful

2017-10-12 Thread Jordan Justen
On 2017-10-11 13:38:46, Jason Ekstrand wrote:
> From: Connor Abbott 
> 
> Despite the name, it could only be used if you immediately wrote to the
> pointer. Noboby was using it outside of one test, so clearly this
> behavior wasn't that useful. Instead, make it return an offset into the
> data buffer so that the result isn't invalidated if you later write to
> the blob. In conjunction with blob_overwrite_bytes(), this will be
> useful for leaving a placeholder and then filling it in later, which
> we'll need to do for handling phi nodes when serializing NIR.
> 
> v2 (Jason Ekstrand):
>  - Improve the blob_overwrite_uint32 documentation
>  - Detect overflow in the offset + to_write computation
>  - Add a blob_reserve_uint32 helper
>  - Add a blob_overwrite_intptr helper

Why not add blob_reserve_uint32 and blob_overwrite_intptr in a
separate patch? I think fixing the alignment issue is worth
highlighting in its own patch.

-Jordan

> ---
>  src/compiler/blob.c | 37 +
>  src/compiler/blob.h | 54 
> ++---
>  src/compiler/glsl/tests/blob_test.c |  4 +--
>  3 files changed, 73 insertions(+), 22 deletions(-)
> 
> diff --git a/src/compiler/blob.c b/src/compiler/blob.c
> index 59ad8a3..c5ed9f5 100644
> --- a/src/compiler/blob.c
> +++ b/src/compiler/blob.c
> @@ -130,7 +130,7 @@ blob_overwrite_bytes(struct blob *blob,
>   size_t to_write)
>  {
> /* Detect an attempt to overwrite data out of bounds. */
> -   if (blob->size < offset + to_write)
> +   if (offset + to_write < offset || blob->size < offset + to_write)
>return false;
>  
> VG(VALGRIND_CHECK_MEM_IS_DEFINED(bytes, to_write));
> @@ -156,20 +156,34 @@ blob_write_bytes(struct blob *blob, const void *bytes, 
> size_t to_write)
> return true;
>  }
>  
> -uint8_t *
> +ssize_t
>  blob_reserve_bytes(struct blob *blob, size_t to_write)
>  {
> -   uint8_t *ret;
> +   ssize_t ret;
>  
> if (! grow_to_fit (blob, to_write))
> -  return NULL;
> +  return -1;
>  
> -   ret = blob->data + blob->size;
> +   ret = blob->size;
> blob->size += to_write;
>  
> return ret;
>  }
>  
> +ssize_t
> +blob_reserve_uint32(struct blob *blob)
> +{
> +   align_blob(blob, sizeof(uint32_t));
> +   return blob_reserve_bytes(blob, sizeof(uint32_t));
> +}
> +
> +ssize_t
> +blob_reserve_intptr(struct blob *blob)
> +{
> +   align_blob(blob, sizeof(intptr_t));
> +   return blob_reserve_bytes(blob, sizeof(intptr_t));
> +}
> +
>  bool
>  blob_write_uint32(struct blob *blob, uint32_t value)
>  {
> @@ -178,11 +192,15 @@ blob_write_uint32(struct blob *blob, uint32_t value)
> return blob_write_bytes(blob, , sizeof(value));
>  }
>  
> +#define ASSERT_ALIGNED(_offset, _align) \
> +   assert(ALIGN((_offset), (_align)) == (_offset))
> +
>  bool
>  blob_overwrite_uint32 (struct blob *blob,
> size_t offset,
> uint32_t value)
>  {
> +   ASSERT_ALIGNED(offset, sizeof(value));
> return blob_overwrite_bytes(blob, offset, , sizeof(value));
>  }
>  
> @@ -203,6 +221,15 @@ blob_write_intptr(struct blob *blob, intptr_t value)
>  }
>  
>  bool
> +blob_overwrite_intptr (struct blob *blob,
> +   size_t offset,
> +   intptr_t value)
> +{
> +   ASSERT_ALIGNED(offset, sizeof(value));
> +   return blob_overwrite_bytes(blob, offset, , sizeof(value));
> +}
> +
> +bool
>  blob_write_string(struct blob *blob, const char *str)
>  {
> return blob_write_bytes(blob, str, strlen(str) + 1);
> diff --git a/src/compiler/blob.h b/src/compiler/blob.h
> index 1ef6d99..ad4b6fa 100644
> --- a/src/compiler/blob.h
> +++ b/src/compiler/blob.h
> @@ -126,24 +126,32 @@ blob_write_bytes(struct blob *blob, const void *bytes, 
> size_t to_write);
>   * Reserve space in \blob for a number of bytes.
>   *
>   * Space will be allocated within the blob for these byes, but the bytes will
> - * be left uninitialized. The caller is expected to use the return value to
> - * write directly (and immediately) to these bytes.
> + * be left uninitialized. The caller is expected to use \sa
> + * blob_overwrite_bytes to write to these bytes.
>   *
> - * \note The return value is valid immediately upon return, but can be
> - * invalidated by any other call to a blob function. So the caller should 
> call
> - * blob_reserve_byes immediately before writing through the returned pointer.
> - *
> - * This function is intended to be used when interfacing with an existing API
> - * that is not aware of the blob API, (so that blob_write_bytes cannot be
> - * called).
> - *
> - * \return A pointer to space allocated within \blob to which \to_write bytes
> - * can be written, (or NULL in case of any allocation error).
> + * \return An offset to space allocated within \blob to which \to_write bytes
> + * can be written, (or -1 in case of any allocation error).
>   */
> -uint8_t *
> +ssize_t
>  

Re: [Mesa-dev] [PATCH 05/11] compiler/blob: Add a concept of a fixed-allocation blob

2017-10-12 Thread Jordan Justen
I think the commit message should include something like:

With a fixed allocation blob, the data buffer is managed by the
caller, and can't be grown by blob writes.

I also think it'd be better to add the NULL fixed allocation blob
support in a separate patch. (It looks like the real motivation is the
ability to determine the blob size using a NULL buffer, so it seems
like that deserves its own patch. :)

-Jordan

On 2017-10-11 13:38:45, Jason Ekstrand wrote:
> ---
>  src/compiler/blob.c | 24 +---
>  src/compiler/blob.h | 23 ++-
>  2 files changed, 43 insertions(+), 4 deletions(-)
> 
> diff --git a/src/compiler/blob.c b/src/compiler/blob.c
> index 0b42871..59ad8a3 100644
> --- a/src/compiler/blob.c
> +++ b/src/compiler/blob.c
> @@ -52,6 +52,11 @@ grow_to_fit(struct blob *blob, size_t additional)
> if (blob->size + additional <= blob->allocated)
>return true;
>  
> +   if (blob->fixed_allocation) {
> +  blob->out_of_memory = true;
> +  return false;
> +   }
> +
> if (blob->allocated == 0)
>to_allocate = BLOB_INITIAL_SIZE;
> else
> @@ -86,7 +91,8 @@ align_blob(struct blob *blob, size_t alignment)
>if (!grow_to_fit(blob, new_size - blob->size))
>   return false;
>  
> -  memset(blob->data + blob->size, 0, new_size - blob->size);
> +  if (blob->data)
> + memset(blob->data + blob->size, 0, new_size - blob->size);
>blob->size = new_size;
> }
>  
> @@ -104,6 +110,16 @@ blob_init(struct blob *blob)
>  {
> blob->data = NULL;
> blob->allocated = 0;
> +   blob->fixed_allocation = false;
> +   blob->size = 0;
> +}
> +
> +void
> +blob_init_fixed(struct blob *blob, void *data, size_t size)
> +{
> +   blob->data = data;
> +   blob->allocated = size;
> +   blob->fixed_allocation = true;
> blob->size = 0;
>  }
>  
> @@ -119,7 +135,8 @@ blob_overwrite_bytes(struct blob *blob,
>  
> VG(VALGRIND_CHECK_MEM_IS_DEFINED(bytes, to_write));
>  
> -   memcpy(blob->data + offset, bytes, to_write);
> +   if (blob->data)
> +  memcpy(blob->data + offset, bytes, to_write);
>  
> return true;
>  }
> @@ -132,7 +149,8 @@ blob_write_bytes(struct blob *blob, const void *bytes, 
> size_t to_write)
>  
> VG(VALGRIND_CHECK_MEM_IS_DEFINED(bytes, to_write));
>  
> -   memcpy(blob->data + blob->size, bytes, to_write);
> +   if (blob->data)
> +  memcpy(blob->data + blob->size, bytes, to_write);
> blob->size += to_write;
>  
> return true;
> diff --git a/src/compiler/blob.h b/src/compiler/blob.h
> index fd13a16..1ef6d99 100644
> --- a/src/compiler/blob.h
> +++ b/src/compiler/blob.h
> @@ -56,6 +56,12 @@ struct blob {
> /** The number of bytes that have actual data written to them. */
> size_t size;
>  
> +   /** True if \c data a fixed allocation that we cannot resize
> +*
> +* \see blob_init_fixed
> +*/
> +   bool fixed_allocation;
> +
> /**
>  * True if we've ever failed to realloc or if we go pas the end of a fixed
>  * allocation blob.
> @@ -85,12 +91,27 @@ void
>  blob_init(struct blob *blob);
>  
>  /**
> + * Init a new, fixed-size blob.
> + *
> + * A fixed-size blob has a fixed block of data that will not be freed on
> + * blob_finish and will never be grown.  If we hit the end, we simply start
> + * returning false from the write functions.
> + *
> + * If a fixed-size blob has a NULL data pointer then the blob no data is
> + * written but it otherwise operates normally.  This can be used to determine
> + * the size that will be required to write a given data structure.
> + */
> +void
> +blob_init_fixed(struct blob *blob, void *data, size_t size);
> +
> +/**
>   * Destroy a blob and free its memory.
>   */
>  static inline void
>  blob_finish(struct blob *blob)
>  {
> -   free(blob->data);
> +   if (!blob->fixed_allocation)
> +  free(blob->data);
>  }
>  
>  /**
> -- 
> 2.5.0.400.gff86faf
> 
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 17/21] meson: build nouveau (gallium) driver

2017-10-12 Thread Dylan Baker
Quoting Eric Anholt (2017-10-12 15:33:43)
> Dylan Baker  writes:
> 
> > [ Unknown signature status ]
> > Quoting Eric Anholt (2017-10-12 12:47:19)
> >> Dylan Baker  writes:
> >> 
> >> > Tested with a GK107.
> >> >
> >> > v2: - Add target for nouveau standalone compiler. This target is not
> >> >   built by default.
> >> 
> >> Looks like this missed the update of meson_options.txt.
> >
> > I configured it like the Intel tools, they're not built by default but can 
> > be
> > built with 'ninja -C build src/intel/tools/aubinator', for example.
> 
> I meant the driver itself.  I noticed that it wasn't in the list when I
> went to add vc4.

Oh, duh. Yeah, I'll fix that.


signature.asc
Description: signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/4] radv: update ia_multi_vgt when executing secondary buffers

2017-10-12 Thread Bas Nieuwenhuizen
Why don't we use the approach from patch 3 for this?

Otherwise the series is r-b?

On Wed, Oct 11, 2017 at 10:25 AM, Samuel Pitoiset
 wrote:
> Signed-off-by: Samuel Pitoiset 
> ---
>  src/amd/vulkan/radv_cmd_buffer.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> b/src/amd/vulkan/radv_cmd_buffer.c
> index 9d6fb9fe40..f6f9847a14 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -2680,6 +2680,7 @@ void radv_CmdExecuteCommands(
>
> primary->state.last_primitive_reset_en = 
> secondary->state.last_primitive_reset_en;
> primary->state.last_primitive_reset_index = 
> secondary->state.last_primitive_reset_index;
> +   primary->state.last_ia_multi_vgt_param = 
> secondary->state.last_ia_multi_vgt_param;
> }
>
> /* if we execute secondary we need to mark some stuff to reset dirty 
> */
> --
> 2.14.2
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] include: Revert out the update of the Khronos GLX extension header.

2017-10-12 Thread Eric Anholt
They made a mistake in the MESA_swap_control XML, which I'm pursuing in
their github.  Until then, we can just back this piece out.
---
 include/GL/glxext.h | 12 +---
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/include/GL/glxext.h b/include/GL/glxext.h
index 8f6abab2d520..0f60a380c21b 100644
--- a/include/GL/glxext.h
+++ b/include/GL/glxext.h
@@ -34,7 +34,7 @@ extern "C" {
 **   https://github.com/KhronosGroup/OpenGL-Registry
 */
 
-#define GLX_GLXEXT_VERSION 20170926
+#define GLX_GLXEXT_VERSION 20170728
 
 /* Generated C header for:
  * API: glx
@@ -503,16 +503,6 @@ Bool glXSet3DfxModeMESA (int mode);
 #endif
 #endif /* GLX_MESA_set_3dfx_mode */
 
-#ifndef GLX_MESA_swap_control
-#define GLX_MESA_swap_control 1
-typedef int ( *PFNGLXGETSWAPINTERVALMESAPROC) (void);
-typedef void ( *PFNGLXSWAPINTERVALMESAPROC) (unsigned int interval);
-#ifdef GLX_GLXEXT_PROTOTYPES
-int glXGetSwapIntervalMESA (void);
-void glXSwapIntervalMESA (unsigned int interval);
-#endif
-#endif /* GLX_MESA_swap_control */
-
 #ifndef GLX_NV_copy_buffer
 #define GLX_NV_copy_buffer 1
 typedef void ( *PFNGLXCOPYBUFFERSUBDATANVPROC) (Display *dpy, GLXContext 
readCtx, GLXContext writeCtx, GLenum readTarget, GLenum writeTarget, GLintptr 
readOffset, GLintptr writeOffset, GLsizeiptr size);
-- 
2.14.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radv: do not allocate CMASK for non-MSSA images with 128 bit formats

2017-10-12 Thread Bas Nieuwenhuizen
r-b

On Thu, Oct 12, 2017 at 4:55 PM, Samuel Pitoiset
 wrote:
> This saves some useless CMASK initializations/eliminations in
> the Vulkan SSAO demo.
>
> Signed-off-by: Samuel Pitoiset 
> ---
>  src/amd/vulkan/radv_cmd_buffer.c | 2 +-
>  src/amd/vulkan/radv_image.c  | 8 
>  src/amd/vulkan/radv_meta_clear.c | 5 -
>  3 files changed, 9 insertions(+), 6 deletions(-)
>
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> b/src/amd/vulkan/radv_cmd_buffer.c
> index 99a48242c9..a7efcdc218 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -3537,7 +3537,7 @@ static void radv_handle_image_transition(struct 
> radv_cmd_buffer *cmd_buffer,
>dst_queue_mask, range,
>pending_clears);
>
> -   if (image->cmask.size)
> +   if (image->cmask.size || image->fmask.size)
> radv_handle_cmask_image_transition(cmd_buffer, image, 
> src_layout,
>dst_layout, src_queue_mask,
>dst_queue_mask, range);
> diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c
> index 7c3e55b1b8..0564454c77 100644
> --- a/src/amd/vulkan/radv_image.c
> +++ b/src/amd/vulkan/radv_image.c
> @@ -819,6 +819,14 @@ radv_image_can_enable_dcc(struct radv_image *image)
>  static inline bool
>  radv_image_can_enable_cmask(struct radv_image *image)
>  {
> +   if (image->surface.bpe > 8 && image->info.samples == 1) {
> +   /* Do not enable CMASK for non-MSAA images (fast color clear)
> +* because 128 bit formats are not supported, but FMASK might
> +* still be used.
> +*/
> +   return false;
> +   }
> +
> return radv_image_can_enable_dcc_or_cmask(image) &&
>image->info.levels == 1 &&
>image->info.depth == 1 &&
> diff --git a/src/amd/vulkan/radv_meta_clear.c 
> b/src/amd/vulkan/radv_meta_clear.c
> index d148a75c19..402271ae4f 100644
> --- a/src/amd/vulkan/radv_meta_clear.c
> +++ b/src/amd/vulkan/radv_meta_clear.c
> @@ -1029,11 +1029,6 @@ emit_fast_color_clear(struct radv_cmd_buffer 
> *cmd_buffer,
> radv_set_dcc_need_cmask_elim_pred(cmd_buffer, iview->image,
>   !can_avoid_fast_clear_elim);
> } else {
> -
> -   if (iview->image->surface.bpe > 8) {
> -   /* 128 bit formats not supported */
> -   return false;
> -   }
> radv_fill_buffer(cmd_buffer, iview->image->bo,
>  iview->image->offset + 
> iview->image->cmask.offset,
>  iview->image->cmask.size, 0);
> --
> 2.14.2
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 4/11] compiler/blob: Switch to init/finsih instead of create/destroy

2017-10-12 Thread Jordan Justen
Typo 'finsih' in subject.

On 2017-10-11 13:52:17, Jason Ekstrand wrote:
>  
>  /**
>   * Destroy a blob and free its memory.

Function comment needs an update.

-Jordan

>   */
>  static inline void
> -blob_destroy(struct blob *blob)
> +blob_finish(struct blob *blob)
>  {
> free(blob->data);
> -   free(blob);
>  }
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 17/21] meson: build nouveau (gallium) driver

2017-10-12 Thread Eric Anholt
Dylan Baker  writes:

> [ Unknown signature status ]
> Quoting Eric Anholt (2017-10-12 12:47:19)
>> Dylan Baker  writes:
>> 
>> > Tested with a GK107.
>> >
>> > v2: - Add target for nouveau standalone compiler. This target is not
>> >   built by default.
>> 
>> Looks like this missed the update of meson_options.txt.
>
> I configured it like the Intel tools, they're not built by default but can be
> built with 'ninja -C build src/intel/tools/aubinator', for example.

I meant the driver itself.  I noticed that it wasn't in the list when I
went to add vc4.


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 18/21] meson: build softpipe

2017-10-12 Thread Eric Anholt
Dylan Baker  writes:

> [ Unknown signature status ]
> Quoting Eric Anholt (2017-10-12 12:27:21)
>> Dylan Baker  writes:
>> 
>> > This doesn't include llvmpipe.
>> >
>> > v2: - Fix inconsistent use of with_gallium_swrast and
>> >   with_gallium_softpipe.
>> >
>> > Signed-off-by: Dylan Baker 
>> > ---
>> 
>> > diff --git a/meson_options.txt b/meson_options.txt
>> > index d29b12e5959..bb7492b9bff 100644
>> > --- a/meson_options.txt
>> > +++ b/meson_options.txt
>> > @@ -22,11 +22,11 @@ option('platforms', type : 'string', value : 
>> > 'x11,wayland,drm',
>> > description : 'comma separated list of window systems to support. 
>> > wayland, x11, surfaceless, drm, etc.')
>> >  option('dri3', type : 'combo', value : 'auto', choices : ['auto', 'yes', 
>> > 'no'],
>> > description : 'enable support for dri3')
>> > -option('dri-drivers', type : 'string', value : 'swrast,i965',
>> > +option('dri-drivers', type : 'string', value : 'i965',
>> > description : 'comma separated list of dri drivers to build.')
>> 
>> Looks like a stray change.
>
> Actually, that's required since meson doesn't allow classic swrast and 
> softpipe
> to be built at the same time, and I think most people would prefer
> softpipe/llvmpipe. Would you prefer me to split those into separate patches?

Oh, right, these are the defaults.  I was thinking as if it was a list
of available drivers.  r-b.



signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 03/11] compiler: Move blob up a level

2017-10-12 Thread Jordan Justen
Didn't you have the same patch written by Connor in your nir-serialize
series? :)

-Jordan

On 2017-10-11 13:38:43, Jason Ekstrand wrote:
> We're going to want to use the blob for Vulkan pipeline caching so it
> makes sense to have it in libcompiler not libglsl.
> ---
>  src/compiler/Makefile.sources| 4 ++--
>  src/compiler/{glsl => }/blob.c   | 0
>  src/compiler/{glsl => }/blob.h   | 0
>  src/mesa/state_tracker/st_shader_cache.h | 2 +-
>  4 files changed, 3 insertions(+), 3 deletions(-)
>  rename src/compiler/{glsl => }/blob.c (100%)
>  rename src/compiler/{glsl => }/blob.h (100%)
> 
> diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
> index 36906f4..b500368 100644
> --- a/src/compiler/Makefile.sources
> +++ b/src/compiler/Makefile.sources
> @@ -1,4 +1,6 @@
>  LIBCOMPILER_FILES = \
> +   blob.c \
> +   blob.h \
> builtin_type_macros.h \
> glsl_types.cpp \
> glsl_types.h \
> @@ -17,8 +19,6 @@ LIBGLSL_FILES = \
> glsl/ast_function.cpp \
> glsl/ast_to_hir.cpp \
> glsl/ast_type.cpp \
> -   glsl/blob.c \
> -   glsl/blob.h \
> glsl/builtin_functions.cpp \
> glsl/builtin_functions.h \
> glsl/builtin_int64.h \
> diff --git a/src/compiler/glsl/blob.c b/src/compiler/blob.c
> similarity index 100%
> rename from src/compiler/glsl/blob.c
> rename to src/compiler/blob.c
> diff --git a/src/compiler/glsl/blob.h b/src/compiler/blob.h
> similarity index 100%
> rename from src/compiler/glsl/blob.h
> rename to src/compiler/blob.h
> diff --git a/src/mesa/state_tracker/st_shader_cache.h 
> b/src/mesa/state_tracker/st_shader_cache.h
> index f9e4615..090d7d8 100644
> --- a/src/mesa/state_tracker/st_shader_cache.h
> +++ b/src/mesa/state_tracker/st_shader_cache.h
> @@ -22,7 +22,7 @@
>   */
>  
>  #include "st_context.h"
> -#include "compiler/glsl/blob.h"
> +#include "compiler/blob.h"
>  #include "main/mtypes.h"
>  #include "pipe/p_state.h"
>  #include "util/disk_cache.h"
> -- 
> 2.5.0.400.gff86faf
> 
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] Mesa 17.3.0 release plan

2017-10-12 Thread Christian Gmeiner
Hi Emil

2017-10-12 15:47 GMT+02:00 Emil Velikov :
> Hi all,
>
> As you've know the Mesa 17.3.0 release plan has been available for a while
> on the mesa3d.org website [1].
>
> In case you've missed it here it is:
>
>  Oct 20 2017 - Feature freeze/Release candidate 1
>  Oct 27 2017 - Release candidate 2
>  Nov 03 2017 - Release candidate 3
>  Nov 10 2017 - Release candidate 4/final release
>
> This gives us just over a week to the branch point.
>
> As always, please let me know of must have features that you'll like in.
>

I want to land occlusion query and half-float texture support for
etnaviv. Will send out patches during the weekend.

greets
--
Christian Gmeiner, MSc

https://christian-gmeiner.info
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 18/21] meson: build softpipe

2017-10-12 Thread Dylan Baker
Quoting Eric Anholt (2017-10-12 12:27:21)
> Dylan Baker  writes:
> 
> > This doesn't include llvmpipe.
> >
> > v2: - Fix inconsistent use of with_gallium_swrast and
> >   with_gallium_softpipe.
> >
> > Signed-off-by: Dylan Baker 
> > ---
> 
> > diff --git a/meson_options.txt b/meson_options.txt
> > index d29b12e5959..bb7492b9bff 100644
> > --- a/meson_options.txt
> > +++ b/meson_options.txt
> > @@ -22,11 +22,11 @@ option('platforms', type : 'string', value : 
> > 'x11,wayland,drm',
> > description : 'comma separated list of window systems to support. 
> > wayland, x11, surfaceless, drm, etc.')
> >  option('dri3', type : 'combo', value : 'auto', choices : ['auto', 'yes', 
> > 'no'],
> > description : 'enable support for dri3')
> > -option('dri-drivers', type : 'string', value : 'swrast,i965',
> > +option('dri-drivers', type : 'string', value : 'i965',
> > description : 'comma separated list of dri drivers to build.')
> 
> Looks like a stray change.

Actually, that's required since meson doesn't allow classic swrast and softpipe
to be built at the same time, and I think most people would prefer
softpipe/llvmpipe. Would you prefer me to split those into separate patches?

Dylan

> 
> Other than that, 18-20 are:
> 
> Reviewed-by: Eric Anholt 


signature.asc
Description: signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 17/21] meson: build nouveau (gallium) driver

2017-10-12 Thread Dylan Baker
Quoting Eric Anholt (2017-10-12 12:47:19)
> Dylan Baker  writes:
> 
> > Tested with a GK107.
> >
> > v2: - Add target for nouveau standalone compiler. This target is not
> >   built by default.
> 
> Looks like this missed the update of meson_options.txt.

I configured it like the Intel tools, they're not built by default but can be
built with 'ninja -C build src/intel/tools/aubinator', for example.

Dylan


signature.asc
Description: signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 10/21] meson: split and simplify depdendncies

2017-10-12 Thread Dylan Baker
Quoting Eric Anholt (2017-10-12 12:23:19)
> Dylan Baker  writes:
> 
> > Rather than group dependencies in complex groups, use a flatter
> > structure with split dependencies to avoid checking for the same
> > dependencies twice.
> 
> meson's going to be caching the dependency checks in the future, so I
> wouldn't go out of your way to avoid checking twice, unless it cleans
> things up.  In fact, I hope we get to move some of the dependency
> checking logic into the subdirs using them, for deps that aren't shared
> between many pieces of the build.
> 
> However, you've built a lot on top of this patch, and it doesn't hurt,
> so I'm fine with it.  A comment below, then r-b.

I saw that, and I'd like to make use of that feature once it lands, but for now
I'd rather leave it as-is. We're going to want/need to bump the meson version
dependency anyway for a few features and a lot of bug fixes around LLVM.

Dylan

> 
> > @@ -484,10 +515,19 @@ else
> >dep_clock = cc.find_library('rt')
> >  endif
> >  
> > +# TODO: conditionalize libdrm requirement
> 
> I think you can drop this comment now.
> 
> > +dep_libdrm = dependency('libdrm', version : '>= 2.4.75',
> > +required : with_dri2 or with_dri3)
> > +if dep_libdrm.found()
> > +  pre_args += '-DHAVE_LIBDRM'
> > +endif


signature.asc
Description: signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] radv: do not allocate CMASK for non-MSSA images with 128 bit formats

2017-10-12 Thread Samuel Pitoiset
This saves some useless CMASK initializations/eliminations in
the Vulkan SSAO demo.

Signed-off-by: Samuel Pitoiset 
---
 src/amd/vulkan/radv_cmd_buffer.c | 2 +-
 src/amd/vulkan/radv_image.c  | 8 
 src/amd/vulkan/radv_meta_clear.c | 5 -
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 99a48242c9..a7efcdc218 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -3537,7 +3537,7 @@ static void radv_handle_image_transition(struct 
radv_cmd_buffer *cmd_buffer,
   dst_queue_mask, range,
   pending_clears);
 
-   if (image->cmask.size)
+   if (image->cmask.size || image->fmask.size)
radv_handle_cmask_image_transition(cmd_buffer, image, 
src_layout,
   dst_layout, src_queue_mask,
   dst_queue_mask, range);
diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c
index 7c3e55b1b8..0564454c77 100644
--- a/src/amd/vulkan/radv_image.c
+++ b/src/amd/vulkan/radv_image.c
@@ -819,6 +819,14 @@ radv_image_can_enable_dcc(struct radv_image *image)
 static inline bool
 radv_image_can_enable_cmask(struct radv_image *image)
 {
+   if (image->surface.bpe > 8 && image->info.samples == 1) {
+   /* Do not enable CMASK for non-MSAA images (fast color clear)
+* because 128 bit formats are not supported, but FMASK might
+* still be used.
+*/
+   return false;
+   }
+
return radv_image_can_enable_dcc_or_cmask(image) &&
   image->info.levels == 1 &&
   image->info.depth == 1 &&
diff --git a/src/amd/vulkan/radv_meta_clear.c b/src/amd/vulkan/radv_meta_clear.c
index d148a75c19..402271ae4f 100644
--- a/src/amd/vulkan/radv_meta_clear.c
+++ b/src/amd/vulkan/radv_meta_clear.c
@@ -1029,11 +1029,6 @@ emit_fast_color_clear(struct radv_cmd_buffer *cmd_buffer,
radv_set_dcc_need_cmask_elim_pred(cmd_buffer, iview->image,
  !can_avoid_fast_clear_elim);
} else {
-
-   if (iview->image->surface.bpe > 8) {
-   /* 128 bit formats not supported */
-   return false;
-   }
radv_fill_buffer(cmd_buffer, iview->image->bo,
 iview->image->offset + 
iview->image->cmask.offset,
 iview->image->cmask.size, 0);
-- 
2.14.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/2] meson: Build i915

2017-10-12 Thread Dylan Baker
Both patches are:
Reviewed-by: Dylan Baker 

Quoting Ville Syrjala (2017-10-12 09:34:55)
> From: Ville Syrjälä 
> 
> Build i915 with meson. More or less copied from i965, with all
> the unneeded cruft removed, and the libdrm_intel dependency added.
> 
> Cc: Dylan Baker 
> Cc: Eric Anholt 
> Signed-off-by: Ville Syrjälä 
> ---
>  meson.build   |  7 +++
>  meson_options.txt |  2 +-
>  src/mesa/drivers/dri/i915/meson.build | 97 
> +++
>  src/mesa/drivers/dri/meson.build  |  3 ++
>  4 files changed, 108 insertions(+), 1 deletion(-)
>  create mode 100644 src/mesa/drivers/dri/i915/meson.build
> 
> diff --git a/meson.build b/meson.build
> index 4ba00283cec7..02264aeed4ef 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -72,16 +72,23 @@ if (with_gles1 or with_gles2) and not with_opengl
>  endif
>  
>  with_dri = false
> +with_dri_i915 = false
>  with_dri_i965 = false
>  with_dri_swrast = false
>  _drivers = get_option('dri-drivers')
>  if _drivers != ''
>_split = _drivers.split(',')
> +  with_dri_i915 = _split.contains('i915')
>with_dri_i965 = _split.contains('i965')
>with_dri_swrast = _split.contains('swrast')
>with_dri = true
>  endif
>  
> +dep_libdrm_intel = []
> +if with_dri_i915
> +  dep_libdrm_intel = dependency('libdrm_intel', version : '>= 2.4.75')
> +endif
> +
>  if not with_dri
>with_gles1 = false
>with_gles2 = false
> diff --git a/meson_options.txt b/meson_options.txt
> index 029626d69a47..abd5135742ac 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -22,7 +22,7 @@ option('platforms', type : 'string', value : 'x11,wayland',
> description : 'comma separated list of window systems to support. 
> wayland, x11, surfaceless, drm, etc.')
>  option('dri3', type : 'combo', value : 'auto', choices : ['auto', 'yes', 
> 'no'],
> description : 'enable support for dri3')
> -option('dri-drivers', type : 'string', value : 'swrast,i965',
> +option('dri-drivers', type : 'string', value : 'swrast,i915,i965',
> description : 'comma separated list of dri drivers to build.')
>  option('dri-drivers-path', type : 'string', value : '',
> description : 'Location of dri drivers. Default: $libdir/dri.')
> diff --git a/src/mesa/drivers/dri/i915/meson.build 
> b/src/mesa/drivers/dri/i915/meson.build
> new file mode 100644
> index ..1971419a6b71
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i915/meson.build
> @@ -0,0 +1,97 @@
> +# Copyright © 2017 Intel Corporation
> +
> +# Permission is hereby granted, free of charge, to any person obtaining a 
> copy
> +# of this software and associated documentation files (the "Software"), to 
> deal
> +# in the Software without restriction, including without limitation the 
> rights
> +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> +# copies of the Software, and to permit persons to whom the Software is
> +# furnished to do so, subject to the following conditions:
> +
> +# The above copyright notice and this permission notice shall be included in
> +# all copies or substantial portions of the Software.
> +
> +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
> FROM,
> +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
> THE
> +# SOFTWARE.
> +
> +files_i915 = files(
> +  'i830_context.c',
> +  'i830_context.h',
> +  'i830_reg.h',
> +  'i830_state.c',
> +  'i830_texblend.c',
> +  'i830_texstate.c',
> +  'i830_vtbl.c',
> +  'i915_context.c',
> +  'i915_context.h',
> +  'i915_debug_fp.c',
> +  'i915_debug.h',
> +  'i915_fragprog.c',
> +  'i915_program.c',
> +  'i915_program.h',
> +  'i915_reg.h',
> +  'i915_state.c',
> +  'i915_tex_layout.c',
> +  'i915_texstate.c',
> +  'i915_vtbl.c',
> +  'intel_batchbuffer.c',
> +  'intel_batchbuffer.h',
> +  'intel_blit.c',
> +  'intel_blit.h',
> +  'intel_buffer_objects.c',
> +  'intel_buffer_objects.h',
> +  'intel_buffers.c',
> +  'intel_buffers.h',
> +  'intel_chipset.h',
> +  'intel_clear.c',
> +  'intel_clear.h',
> +  'intel_context.c',
> +  'intel_context.h',
> +  'intel_extensions.c',
> +  'intel_extensions.h',
> +  'intel_fbo.c',
> +  'intel_fbo.h',
> +  'intel_mipmap_tree.c',
> +  'intel_mipmap_tree.h',
> +  'intel_pixel_bitmap.c',
> +  'intel_pixel.c',
> +  'intel_pixel_copy.c',
> +  'intel_pixel_draw.c',
> +  'intel_pixel.h',
> +  'intel_pixel_read.c',
> +  'intel_reg.h',
> +  'intel_regions.c',
> +  'intel_regions.h',
> +  'intel_render.c',
> +  'intel_screen.c',
> +  

Re: [Mesa-dev] [PATCH] mesa: Disallow GL_RED/GL_RG with half-floats on GLES2.

2017-10-12 Thread Mark Janes
Tested-by: Mark Janes 

Eric Anholt  writes:

> Sure, you'd think that the combination of GL_OES_texture_half_float and
> GL_EXT_texture_rg would mean that GL_RG16F exists, but it doesn't.
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103227
> Fixes: c16a7443e999 ("mesa: Expose GL_OES_required_internalformat on GLES 
> contexts.")
> ---
>
> Apparently the last Intel CI run I had (and the one before that that I
> thought had come back clean) didn't cover the older GLES2-only
> platforms.
>
>  src/mesa/main/glformats.c | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
> index 7b4b405a814f..1e797c24c2ac 100644
> --- a/src/mesa/main/glformats.c
> +++ b/src/mesa/main/glformats.c
> @@ -3119,6 +3119,8 @@ _mesa_es3_error_check_format_and_type(const struct 
> gl_context *ctx,
>case GL_HALF_FLOAT_OES:
>   switch (internalFormat) {
>  case GL_RG16F:
> +   if (ctx->Version <= 20)
> +  return GL_INVALID_OPERATION;
> break;
>  case GL_RG:
> if (ctx->Extensions.ARB_texture_rg &&
> @@ -3207,6 +3209,8 @@ _mesa_es3_error_check_format_and_type(const struct 
> gl_context *ctx,
>case GL_HALF_FLOAT_OES:
>   switch (internalFormat) {
>   case GL_R16F:
> +if (ctx->Version <= 20)
> +   return GL_INVALID_OPERATION;
>  break;
>   case GL_RG:
>   case GL_RED:
> -- 
> 2.14.2
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] mesa/bufferobj: fix atomic offset/size get

2017-10-12 Thread Dave Airlie
On 12 October 2017 at 18:22, Tapani Pälli  wrote:
>
>
> On 10/12/2017 11:14 AM, Dave Airlie wrote:
>>
>>
>> On 12 Oct. 2017 15:40, "Tapani Pälli" > > wrote:
>>
>>
>>
>> On 10/12/2017 02:34 AM, Dave Airlie wrote:
>>
>> From: Dave Airlie >
>>
>> When I realigned the bufferobj code, I didn't see the getters
>> were different, realign the getters to work the same as ssbo.
>>
>>
>> Alternatively you could set these values as 0 in
>> bind_buffer_base_atomic_buffer()? Not sure if it's any better but
>> then value would match internally what it has been before these
>> changes.
>>
>>
>> Before these changes the ssbo and atomic code was gratuitously different,
>> this is just the last piece of making them consistent.
>
>
> Right .. what I mean is that before the refactoring Size and Offset values
> in the structure were stored as 0, now they are stored as -1 even though
> here we return different value. I haven't checked if anything in Mesa would
> assume 0 though .. so feel free to ignore my ramblings :) I just wanted to
> note this because I tried to fix this and it following change fixes the bug
> as well:
>

For atomics that is true, but for ssbo it was false. The idea of refactoring it
was to align all the code to be same for both, since there is no reason for
differences. Your change would reintroduce differences where none are needed.

My change to get.c aligns the gets
for
GL_ATOMIC_COUNTER_BUFFER_START
GL_ATOMIC_COUNTER_BUFFER_SIZE
with the ones above
for
GL_SHADER_STORAGE_BUFFER_START
GL_SHADER_STORAGE_BUFFER_SIZE

So the code is consistent across both types of buffer.

Dave.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 17/21] meson: build nouveau (gallium) driver

2017-10-12 Thread Eric Anholt
Dylan Baker  writes:

> Tested with a GK107.
>
> v2: - Add target for nouveau standalone compiler. This target is not
>   built by default.

Looks like this missed the update of meson_options.txt.


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 18/21] meson: build softpipe

2017-10-12 Thread Eric Anholt
Dylan Baker  writes:

> This doesn't include llvmpipe.
>
> v2: - Fix inconsistent use of with_gallium_swrast and
>   with_gallium_softpipe.
>
> Signed-off-by: Dylan Baker 
> ---

> diff --git a/meson_options.txt b/meson_options.txt
> index d29b12e5959..bb7492b9bff 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -22,11 +22,11 @@ option('platforms', type : 'string', value : 
> 'x11,wayland,drm',
> description : 'comma separated list of window systems to support. 
> wayland, x11, surfaceless, drm, etc.')
>  option('dri3', type : 'combo', value : 'auto', choices : ['auto', 'yes', 
> 'no'],
> description : 'enable support for dri3')
> -option('dri-drivers', type : 'string', value : 'swrast,i965',
> +option('dri-drivers', type : 'string', value : 'i965',
> description : 'comma separated list of dri drivers to build.')

Looks like a stray change.

Other than that, 18-20 are:

Reviewed-by: Eric Anholt 


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 10/21] meson: split and simplify depdendncies

2017-10-12 Thread Eric Anholt
Dylan Baker  writes:

> Rather than group dependencies in complex groups, use a flatter
> structure with split dependencies to avoid checking for the same
> dependencies twice.

meson's going to be caching the dependency checks in the future, so I
wouldn't go out of your way to avoid checking twice, unless it cleans
things up.  In fact, I hope we get to move some of the dependency
checking logic into the subdirs using them, for deps that aren't shared
between many pieces of the build.

However, you've built a lot on top of this patch, and it doesn't hurt,
so I'm fine with it.  A comment below, then r-b.

> @@ -484,10 +515,19 @@ else
>dep_clock = cc.find_library('rt')
>  endif
>  
> +# TODO: conditionalize libdrm requirement

I think you can drop this comment now.

> +dep_libdrm = dependency('libdrm', version : '>= 2.4.75',
> +required : with_dri2 or with_dri3)
> +if dep_libdrm.found()
> +  pre_args += '-DHAVE_LIBDRM'
> +endif


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] CI for Android builds

2017-10-12 Thread Rob Herring
I've made some improvements to my mesa Android CI job[1]. Previously,
it just built using master twice a day and only emailed me. Now anyone
(with commit rights) can push to mesa branches master or android/* and
it will trigger a build emailing the last commit's author/committer on
failure.

It's slower than I'd like for a couple of reasons. It takes ~15
minutes just to clone an AOSP tree and does that from scratch each
time. The AOSP trees have daily bandwidth limits on them and will
start to refuse a given IP address if you clone too many times in one
day. So I've limited the build frequency to every 2 hours (just a
guess whether this will be low enough). That also means the wrong
person could be emailed for the failure if subsequent commits happen
in that window (but master is supposed to be the last resort, push to
android/* first!). I could point to a Linaro mirror instead, but want
to see how this works first. It should also be possible to craft a
trimmed down manifest with just the repos we need to build.

Also, if master or any previous build was already failing, then you
won't get an email. So you have to fix master first. Maybe that's a
feature. :) I think I need to split master and android/* branch builds
to separate jobs to handle this case. The alternative is every push to
master will spam people after the Android build breaks.

Rob

[1] https://ci.linaro.org/view/All/job/robher-aosp/
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 07/21] meson: Add option to toggle LLVM

2017-10-12 Thread Eric Anholt
Dylan Baker  writes:

> Signed-off-by: Dylan Baker 

5-7 are

Reviewed-by: Eric Anholt 


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 15/20] meson: build radeonsi gallium driver

2017-10-12 Thread Eric Anholt
Dylan Baker  writes:

> This hooks up the bits necessary to build gallium dri drivers, with
> radeonSI as the first example driver. This isn't tested yet.
> ---
>  src/gallium/meson.build |  3 +-
>  src/gallium/targets/dri/meson.build | 90 
> +
>  2 files changed, 91 insertions(+), 2 deletions(-)
>  create mode 100644 src/gallium/targets/dri/meson.build
>
> diff --git a/src/gallium/meson.build b/src/gallium/meson.build
> index d500cf5493c..6c0e9782ae9 100644
> --- a/src/gallium/meson.build
> +++ b/src/gallium/meson.build
> @@ -54,9 +54,8 @@ subdir('state_trackers/dri')
>  # TODO: winsys/sw/xlib
>  # TODO: clover
>  if with_dri
> -  #subdir('targets/dri')
> +  subdir('targets/dri')
>  endif
> -# TODO: dricommon
>  # TODO: xlib-glx
>  # TODO: OMX
>  # TODO: osmesa
> diff --git a/src/gallium/targets/dri/meson.build 
> b/src/gallium/targets/dri/meson.build
> new file mode 100644
> index 000..382b00db5dc
> --- /dev/null
> +++ b/src/gallium/targets/dri/meson.build
> @@ -0,0 +1,90 @@
> +# Copyright © 2017 Dylan Baker
> +
> +# Permission is hereby granted, free of charge, to any person obtaining a 
> copy
> +# of this software and associated documentation files (the "Software"), to 
> deal
> +# in the Software without restriction, including without limitation the 
> rights
> +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> +# copies of the Software, and to permit persons to whom the Software is
> +# furnished to do so, subject to the following conditions:
> +
> +# The above copyright notice and this permission notice shall be included in
> +# all copies or substantial portions of the Software.
> +
> +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
> FROM,
> +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
> THE
> +# SOFTWARE.
> +
> +# TODO: support non-static targets
> +# Static targets are always enabled in autotools (unless you modify
> +# configure.ac)
> +
> +gallium_dri_c_args = [
> +  '-DGALLIUM_DDEBUG',
> +  '-DGALLIUM_NOOP',
> +  '-DGALLIUM_RBUG',
> +  '-DGALLIUME_TRACE',
> +]
> +gallium_dri_ld_args = []
> +gallium_dri_link_with = []
> +gallium_dri_depends = []
> +gallium_dri_link_depends = []
> +gallium_dri_drivers = []
> +gallium_dri_sources = []
> +
> +if with_ld_version_script
> +  gallium_dri_ld_args += ['-Wl,--version-script', 
> join_paths(meson.current_source_dir(), 'dri.sym')]
> +  gallium_dri_link_depends += files('dri.sym')
> +endif
> +if with_ld_dynamic_list
> +  gallium_dri_ld_args += ['-Wl,--dynamic-list', 
> join_paths(meson.current_source_dir(), '../dri-vdpau.dyn')]
> +  gallium_dri_link_depends += files('../dri-vdpau.dyn')
> +endif
> +
> +if with_dri
> +  gallium_dri_link_with += libswdri
> +endif
> +if with_gallium_drisw_kms
> +  gallium_dri_link_with += libswkmsdri
> +endif
> +
> +if with_gallium_radeonsi
> +  gallium_dri_c_args += '-DGALLIUM_RADEONSI'

> +  gallium_dri_sources += si_driinfo_h

This seems out of place -- nothing here includes that file, right?

Other than that, my eyes are glazing over, but I think patches 10-17
are:

Reviewed-by: Eric Anholt 


> +  gallium_dri_link_with += [
> +libradeonsi, libnir, libradeonwinsys, libamdgpuwinsys, libradeon,
> +libamd_common,
> +  ]
> +  gallium_dri_drivers += 'radeonsi_dri.so'
> +endif


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 09/20] meson: split and simplify depdendncies

2017-10-12 Thread Eric Anholt
Dylan Baker  writes:

> Rather than group dependencies in complex groups, use a flatter
> structure with split dependencies to avoid checking for the same
> dependencies twice.

In the subject: *dependencies*


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 40/43] i965/fs: Use untyped_surface_read for 16-bit load_ssbo

2017-10-12 Thread Jose Maria Casanova Crespo
SSBO loads were using byte_scattered read messages as they allow
reading 16-bit size components. byte_scattered messages can only
operate one component at a time so we needed to emit as many messages
as components.

But for vec2 and vec4 of 16-bit, being multiple of 32-bit we can use the
untyped_surface_read message to read pairs of 16-bit components using only
one message. Once each pair is read it is unshuffled to return the proper
16-bit components.

On 16-bit scalar and vec3 16-bit the not paired component is read using
only one byte_scattered_read message.

v2: Removed use of stride = 2 on sources (Jason Ekstrand)
Rework optimization using unshuffle 16 reads (Chema Casanova)
---
 src/intel/compiler/brw_fs_nir.cpp | 37 ++---
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 774bd97968..332fc4bfb8 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2303,16 +2303,39 @@ do_untyped_vector_read(const fs_builder ,
  bld.ADD(read_offset, read_offset, brw_imm_ud(16));
   }
} else if (type_sz(dest.type) == 2) {
-  for (unsigned i = 0; i < num_components; i++) {
+  assert(dest.stride == 1);
+
+  int component_pairs = num_components / 2;
+  /* Pairs of 16-bit components can be read with untyped read */
+  if (component_pairs > 0) {
+ fs_reg read_result = emit_untyped_read(bld, surf_index,
+offset_reg,
+1 /* dims */,
+component_pairs,
+BRW_PREDICATE_NONE);
+ shuffle_32bit_load_result_to_16bit_data(bld,
+   retype(dest, BRW_REGISTER_TYPE_HF),
+   retype(read_result, BRW_REGISTER_TYPE_F),
+   component_pairs * 2);
+  }
+  /* Last component of vec3 and scalar 16-bit read needs to be read
+   * using one byte_scattered_read message
+   */
+  if (num_components % 2) {
  fs_reg base_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
  bld.ADD(base_offset,
  offset_reg,
- brw_imm_ud(i * type_sz(dest.type)));
- fs_reg read_reg = emit_byte_scattered_read(bld, surf_index, 
base_offset,
-1 /* dims */,
-1,
-BRW_PREDICATE_NONE);
- bld.MOV(offset(dest,bld,i), subscript(read_reg, dest.type, 0));
+ brw_imm_ud((num_components - 1) * type_sz(dest.type)));
+ fs_reg read_result = emit_byte_scattered_read(bld, surf_index,
+   base_offset,
+   1 /* dims */,
+   1,
+   BRW_PREDICATE_NONE);
+ read_result.type = dest.type;
+ read_result.stride = 2;
+
+ bld.MOV(offset(dest, bld, num_components - 1),
+ read_result);
   }
} else {
   unreachable("Unsupported type");
-- 
2.13.6

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 42/43] anv: Enable SPV_KHR_16bit_storage on gen 8+

2017-10-12 Thread Jose Maria Casanova Crespo
From: Eduardo Lima Mitev 

v2: minor changes after rebase against recent master (Alejandro)
---
 src/intel/vulkan/anv_pipeline.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index b35bad1050..82ed4ecc1d 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -130,6 +130,7 @@ anv_shader_compile_to_nir(struct anv_pipeline *pipeline,
   .image_write_without_format = true,
   .multiview = true,
   .variable_pointers = true,
+  .storage_16bit = device->instance->physicalDevice.info.gen >= 8,
};
 
nir_function *entry_point =
-- 
2.13.6

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 41/43] i965/fs: Predicate byte scattered writes if needed

2017-10-12 Thread Jose Maria Casanova Crespo
From: Alejandro Piñeiro 

While on Untyped Surface messages the bits of the execution mask are
ANDed with the corresponding bits of the Pixel/Sample Mask, that is
not the case for byte scattered writes. That is needed to avoid ssbo
stores writing on helper invocations. So when that can affect, we load
the sample mask, and predicate the send message.

Note: the need for this patch was tested with a custom test. Right now
the 16 bit storage CTS tests doesnt need this path in order to get a
full pass.
---
 src/intel/compiler/brw_fs_nir.cpp | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 332fc4bfb8..c04d2a4eb4 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4315,11 +4315,23 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
  * to rely on byte scattered in order to write 16-bit elements.
  * The byte_scattered_write message needs that every written 16-bit
  * type to be aligned 32-bits (stride=2).
+ * Additionally, while on Untyped Surface messages the
+ * bits of the execution mask are ANDed with the corresponding
+ * bits of the Pixel/Sample Mask, that is not the case for byte
+ * scattered writes. That is needed to avoid ssbo stores writing
+ * on helper invocations. So when that can affect, we load the
+ * sample mask, and predicate the send message.
  */
+brw_predicate pred = BRW_PREDICATE_NONE;
+
+if (stage == MESA_SHADER_FRAGMENT) {
+   bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+   pred = BRW_PREDICATE_NORMAL;
+}
 emit_byte_scattered_write(bld, surf_index, offset_reg,
   current_val_reg,
   1 /* dims */, length * type_slots,
-  BRW_PREDICATE_NONE);
+  pred);
  } else {
 unsigned write_size = length * type_slots;
 
-- 
2.13.6

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 39/43] i965/fs: Enables 16-bit load_ubo with sampler

2017-10-12 Thread Jose Maria Casanova Crespo
load_ubo is using 32-bit loads as uniforms surfaces have a 32-bit
surface format defined. So when reading 16-bit components with the
sampler we need to unshuffle two 16-bit components from each 32-bit
component.

Using the sampler avoids the use of the byte_scattered_read message
that needs one message for each component and is supposed to be
slower.

In the case of SKL+ we take advance of a hardware feature that
automatically defines a channel mask based on the rlen value, so on
SKL+ we only use half of the registers without using a header in the
payload.
---
 src/intel/compiler/brw_fs.cpp   | 31 +++
 src/intel/compiler/brw_fs_generator.cpp | 10 --
 src/intel/compiler/brw_fs_nir.cpp   | 11 +++
 3 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 1d6fbdd06a..45608c1e47 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -185,9 +185,17 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
,
 * a double this means we are only loading 2 elements worth of data.
 * We also want to use a 32-bit data type for the dst of the load operation
 * so other parts of the driver don't get confused about the size of the
-* result.
+* result. On the case of 16-bit data we only need half of the 32-bit
+* components on SKL+ as we take advance of using message return size to
+* define an xy channel mask.
 */
-   fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+   fs_reg vec4_result;
+   if (type_sz(dst.type) == 2 && (devinfo->gen >= 9)) {
+  vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
+  vec4_result = retype(vec4_result, BRW_REGISTER_TYPE_HF);
+   } else {
+  vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+   }
fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
 vec4_result, surf_index, vec4_offset);
inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
@@ -198,8 +206,23 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
,
}
 
vec4_result.type = dst.type;
-   bld.MOV(dst, offset(vec4_result, bld,
-   (const_offset & 0xf) / type_sz(vec4_result.type)));
+
+   if (type_sz(dst.type) == 2) {
+  /* 16-bit types need to be unshuffled as each pair of 16-bit components
+   * is packed on a 32-bit compoment because we are using a 32-bit format
+   * in the surface of uniform that is read by the sampler.
+   * TODO: On BDW+ mark when an uniform has 16-bit type so we could setup a
+   * surface format of 16-bit and use the 16-bit return format at the
+   * sampler.
+   */
+  vec4_result.stride = 2;
+  bld.MOV(dst, byte_offset(offset(vec4_result, bld,
+  (const_offset & 0x7) / 4),
+   (const_offset & 0x7) / 2 % 2 * 2));
+   } else {
+  bld.MOV(dst, offset(vec4_result, bld,
+  (const_offset & 0xf) / type_sz(vec4_result.type)));
+   }
 }
 
 /**
diff --git a/src/intel/compiler/brw_fs_generator.cpp 
b/src/intel/compiler/brw_fs_generator.cpp
index c766e72ecf..83852107ce 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1366,12 +1366,18 @@ 
fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
uint32_t simd_mode, rlen, mlen;
if (inst->exec_size == 16) {
   mlen = 2;
-  rlen = 8;
+  if (type_sz(dst.type) == 2 && (devinfo->gen >= 9))
+ rlen = 4;
+  else
+ rlen = 8;
   simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
} else {
   assert(inst->exec_size == 8);
   mlen = 1;
-  rlen = 4;
+  if (type_sz(dst.type) == 2 && (devinfo->gen >= 9))
+ rlen = 2;
+  else
+ rlen = 4;
   simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
}
 
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index c07b3e4d8d..774bd97968 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4056,14 +4056,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   if (const_offset == NULL) {
  fs_reg base_offset = retype(get_nir_src(instr->src[1]),
  BRW_REGISTER_TYPE_UD);
- if (type_sz(dest.type) == 2) {
-do_untyped_vector_read(bld, dest, surf_index, base_offset,
-   instr->num_components);
- } else {
-for (int i = 0; i < instr->num_components; i++)
-   VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), 
surf_index,
-  base_offset, i * type_sz(dest.type));
- }
+ for (int i = 0; i < instr->num_components; i++)
+VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
+   

[Mesa-dev] [PATCH v3 43/43] anv: Enable VK_KHR_16bit_storage

2017-10-12 Thread Jose Maria Casanova Crespo
From: Alejandro Piñeiro 

It uses VK_KHR_get_physical_device_properties2 functionality to expose
if the extension is supported or not.

v2: update due rebase against master (Alejandro)

Signed-off-by: Jose Maria Casanova Crespo 
Signed-off-by: Alejandro Piñeiro storageBuffer16BitAccess = pdevice->info.gen >= 8;
+ features->uniformAndStorageBuffer16BitAccess = pdevice->info.gen >= 8;
+ features->storagePushConstant16 = pdevice->info.gen >= 8;
+ features->storageInputOutput16 = pdevice->info.gen >= 8;
+ break;
+  }
+
   default:
  anv_debug_ignored_stype(ext->sType);
  break;
diff --git a/src/intel/vulkan/anv_extensions.py 
b/src/intel/vulkan/anv_extensions.py
index a828a668d6..714c034839 100644
--- a/src/intel/vulkan/anv_extensions.py
+++ b/src/intel/vulkan/anv_extensions.py
@@ -50,6 +50,7 @@ class Extension:
 # the those extension strings, then tests dEQP-VK.api.info.instance.extensions
 # and dEQP-VK.api.info.device fail due to the duplicated strings.
 EXTENSIONS = [
+Extension('VK_KHR_16bit_storage', 1, True),
 Extension('VK_KHR_bind_memory2',  1, True),
 Extension('VK_KHR_dedicated_allocation',  1, True),
 Extension('VK_KHR_descriptor_update_template',1, True),
-- 
2.13.6

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 38/43] i965/fs: Optimize 16-bit SSBO stores by packing two into a 32-bit reg

2017-10-12 Thread Jose Maria Casanova Crespo
From: Eduardo Lima Mitev 

Currently, we use byte-scattered write messages for storing 16-bit
into an SSBO. This is because untyped surface messages have a fixed
32-bit size.

This patch optimizes these 16-bit writes by combining 2 values (e.g,
two consecutive components) into a 32-bit register, packing the two
16-bit words.

16-bit single component values will continue to use byte-scattered
write messages.

This optimization reduces the number of SEND messages used for storing
16-bit values potentially by 2 or 4, which cuts down execution time
significantly because byte-scattered writes are an expensive
operation.

v2: Removed use of stride = 2 on sources (Jason Ekstrand)
Rework optimization using shuffle 16 write and enable writes
of 16bit vec4 with only one message of 32-bits. (Chema Casanova)

Signed-off-by: Jose Maria Casanova Crespo 
Signed-off-by: Eduardo Lima 
---
 src/intel/compiler/brw_fs_nir.cpp | 64 +++
 1 file changed, 52 insertions(+), 12 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 2d0b3e139e..c07b3e4d8d 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4218,6 +4218,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
 instr->num_components);
  val_reg = tmp;
   }
+  if (bit_size == 16) {
+ val_reg=retype(val_reg, BRW_REGISTER_TYPE_HF);
+  }
 
   /* 16-bit types would use a minimum of 1 slot */
   unsigned type_slots = MAX2(type_size / 4, 1);
@@ -4231,6 +4234,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
  unsigned first_component = ffs(writemask) - 1;
  unsigned length = ffs(~(writemask >> first_component)) - 1;
 
+ fs_reg current_val_reg =
+offset(val_reg, bld, first_component * type_slots);
+
  /* We can't write more than 2 64-bit components at once. Limit the
   * length of the write to what we can do and let the next iteration
   * handle the rest
@@ -4238,11 +4244,40 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
  if (type_size > 4) {
 length = MIN2(2, length);
  } else if (type_size == 2) {
-/* For 16-bit types we are using byte scattered writes, that can
- * only write one component per call. So we limit the length, and
- * let the write happening in several iterations.
+/* For 16-bit types we pack two consecutive values into a
+ * 32-bit word and use an untyped write message. For single values
+ * we need to use byte-scattered writes because untyped writes work
+ * on multiples of 32 bits.
+ *
+ * For example, if there is a 3-component vector we submit one
+ * untyped-write message of 32-bit (first two components), and one
+ * byte-scattered write message (the last component).
  */
-length = 1;
+if (length >= 2) {
+   /* pack two consecutive 16-bit words into a 32-bit register,
+* using the same original source register.
+*/
+   length -= length % 2;
+   fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, length / 2);
+   shuffle_16bit_data_for_32bit_write(bld,
+  tmp,
+  current_val_reg,
+  length);
+   current_val_reg = tmp;
+
+} else {
+   /* For single 16-bit values, we just limit the length to 1 and
+* use a byte-scattered write message below.
+*/
+   length = 1;
+   fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
+   shuffle_16bit_data_for_32bit_write(bld,
+  tmp,
+  current_val_reg,
+  length);
+   current_val_reg = tmp;
+
+}
  }
 
  fs_reg offset_reg;
@@ -4257,24 +4292,29 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
 brw_imm_ud(type_size * first_component));
  }
 
- if (type_size == 2) {
+ if (type_size == 2 && length == 1) {
 /* Untyped Surface messages have a fixed 32-bit size, so we need
  * to rely on byte scattered in order to write 16-bit elements.
  * The byte_scattered_write message needs that every written 16-bit
  * type to be aligned 32-bits (stride=2).
  */
-fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
-val_reg.type 

[Mesa-dev] [PATCH v3 37/43] i965/fs: Enable 16-bit render target write on SKL and CHV

2017-10-12 Thread Jose Maria Casanova Crespo
Once the infrastruture to support Render Target Messages with 16-bit
payload is available, this patch enables it on SKL and CHV platforms.

Enabling it allows 16-bit payload that use half of the register on
SIMD16 and avoids the spurious conversion from 16-bit to 32-bit needed
on BDW, just to be converted again to 16-bit.

In the case of CHV there is no support for UINT so in this case the
half precision data format is not enabled and the fallback of the
32-bit payload is used.

From PRM CHV, vol 07, section "Pixel Data Port" page 260:

"Half Precision Render Target Write messages do not support UNIT
formats." where UNIT is a typo for UINT.

v2: Removed use of stride = 2 on sources (Jason Ekstrand)

Signed-off-by: Jose Maria Casanova Crespo 
Signed-off-by: Eduardo Lima 
---
 src/intel/compiler/brw_fs_nir.cpp | 46 +++
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 3dbdcc0955..2d0b3e139e 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -55,19 +55,24 @@ fs_visitor::nir_setup_outputs()
   return;
 
if (stage == MESA_SHADER_FRAGMENT) {
-  /*
+  /* On HW that doesn't support half-precision render-target-write
+   * messages (e.g, some gen8 HW like Broadwell), we need a workaround
+   * to support 16-bit outputs from pixel shaders.
+   *
* The following code uses the outputs map to save the variable's
* original output type, so later we can retrieve it and retype
* the output accordingly while emitting the FS 16-bit outputs.
*/
-  nir_foreach_variable(var, >outputs) {
- const enum glsl_base_type base_type =
-glsl_get_base_type(var->type->without_array());
-
- if (glsl_base_type_is_16bit(base_type)) {
-outputs[var->data.driver_location] =
-   retype(outputs[var->data.driver_location],
-  brw_type_for_base_type(var->type));
+  if (devinfo->gen == 8) {
+ nir_foreach_variable(var, >outputs) {
+const enum glsl_base_type base_type =
+   glsl_get_base_type(var->type->without_array());
+
+if (glsl_base_type_is_16bit(base_type)) {
+   outputs[var->data.driver_location] =
+  retype(outputs[var->data.driver_location],
+ brw_type_for_base_type(var->type));
+}
  }
   }
   return;
@@ -3246,14 +3251,27 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder ,
   const unsigned location = nir_intrinsic_base(instr) +
  SET_FIELD(const_offset->u32[0], BRW_NIR_FRAG_OUTPUT_LOCATION);
 
+  /* This flag discriminates HW where we have support for half-precision
+   * render target write messages (aka, the data-format bit), so 16-bit
+   * render target payloads can be used. It is available since skylake
+   * and cherryview. In the case of cherryview there is no support for
+   * UINT formats.
+   */
+  bool enable_hp_rtw = is_16bit &&
+ (devinfo->gen >= 9 || (devinfo->is_cherryview &&
+outputs[location].type != 
BRW_REGISTER_TYPE_UW));
+
   if (is_16bit) {
- /* The outputs[location] should already have the original output type
-  * stored from nir_setup_outputs.
+ /* outputs[location] should already have the original output type
+  * stored from nir_setup_outputs, in case the HW doesn't support
+  * half-precision RTW messages.
+  * If HP RTW is enabled we just use HF to copy 16-bit values.
   */
- src = retype(src, outputs[location].type);
+ src = retype(src, enable_hp_rtw ?
+  BRW_REGISTER_TYPE_HF : outputs[location].type);
   }
 
-  fs_reg new_dest = retype(alloc_frag_output(this, location, false),
+  fs_reg new_dest = retype(alloc_frag_output(this, location, 
enable_hp_rtw),
src.type);
 
   /* This is a workaround to support 16-bits outputs on HW that doesn't
@@ -3263,7 +3281,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder ,
* render target with a 16-bit surface format will force the correct
* conversion of the 32-bit output values to 16-bit.
*/
-  if (is_16bit) {
+  if (is_16bit && !enable_hp_rtw) {
  new_dest.type = brw_reg_type_from_bit_size(32, src.type);
   }
   for (unsigned j = 0; j < instr->num_components; j++)
-- 
2.13.6

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 30/43] i965/fs: Support 16-bit types at load_input and store_output

2017-10-12 Thread Jose Maria Casanova Crespo
Enables the support of 16-bit types on load_input and
store_outputs intrinsics intra-stages.

The approach was based on re-using the 32-bit URB read
and writes between stages, shuffling pairs of 16-bit values into
32-bit values at load_store intrinsic and un-shuffling the values
at load_inputs.

shuffle_32bit_load_result_to_16bit_data and
shuffle_32bit_load_result_to_16bit_data are implemented in a similar
way than the analogous functions for handling 64-bit types.
---
 src/intel/compiler/brw_fs.h   |  11 
 src/intel/compiler/brw_fs_nir.cpp | 119 +-
 2 files changed, 129 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index b9476e69ed..90ada3ef4b 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -498,6 +498,17 @@ void shuffle_64bit_data_for_32bit_write(const 
brw::fs_builder ,
 const fs_reg ,
 const fs_reg ,
 uint32_t components);
+
+void shuffle_32bit_load_result_to_16bit_data(const brw::fs_builder ,
+ const fs_reg ,
+ const fs_reg ,
+ uint32_t components);
+
+void shuffle_16bit_data_for_32bit_write(const brw::fs_builder ,
+const fs_reg ,
+const fs_reg ,
+uint32_t components);
+
 fs_reg setup_imm_df(const brw::fs_builder ,
 double v);
 
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 83ff0607a7..9c694a1c53 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2124,12 +2124,17 @@ fs_visitor::emit_gs_input_load(const fs_reg ,
   first_component = first_component / 2;
}
 
+   if (type_sz(dst.type) == 2) {
+  num_components = DIV_ROUND_UP(num_components, 2);
+  tmp_dst = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);
+   }
+
for (unsigned iter = 0; iter < num_iterations; iter++) {
   if (offset_const) {
  /* Constant indexing - use global offset. */
  if (first_component != 0) {
 unsigned read_components = num_components + first_component;
-fs_reg tmp = bld.vgrf(dst.type, read_components);
+fs_reg tmp = bld.vgrf(tmp_dst.type, read_components);
 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
 inst->size_written = read_components *
  tmp.component_size(inst->exec_size);
@@ -2179,6 +2184,11 @@ fs_visitor::emit_gs_input_load(const fs_reg ,
 bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
   }
 
+  if (type_sz(dst.type) == 2) {
+ shuffle_32bit_load_result_to_16bit_data(
+bld, dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), 
orig_num_components);
+  }
+
   if (num_iterations > 1) {
  num_components = orig_num_components - 2;
  if(offset_const) {
@@ -2484,6 +2494,11 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder ,
  dst = tmp;
   }
 
+  if (type_sz(dst.type) == 2) {
+ num_components = DIV_ROUND_UP(num_components, 2);
+ dst = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);
+  }
+
   for (unsigned iter = 0; iter < num_iterations; iter++) {
  if (indirect_offset.file == BAD_FILE) {
 /* Constant indexing - use global offset. */
@@ -2539,6 +2554,11 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder ,
 }
  }
 
+ if (type_sz(orig_dst.type) == 2) {
+shuffle_32bit_load_result_to_16bit_data(
+   bld, orig_dst, dst, instr->num_components);
+ }
+
  /* Copy the temporary to the destination to deal with writemasking.
   *
   * Also attempt to deal with gl_PointSize being in the .w component.
@@ -2629,6 +2649,8 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder ,
   fs_reg value = get_nir_src(instr->src[0]);
   bool is_64bit = (instr->src[0].is_ssa ?
  instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
+  bool is_16bit = (instr->src[0].is_ssa ?
+ instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 16;
   fs_reg indirect_offset = get_indirect_offset(instr);
   unsigned imm_offset = instr->const_index[0];
   unsigned swiz = BRW_SWIZZLE_XYZW;
@@ -2659,6 +2681,10 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder ,
 num_iterations = 2;
 iter_components = 2;
  }
+  } else {
+ if (is_16bit) {
+iter_components = DIV_ROUND_UP(num_components, 2);
+ }
   }
 
   /* 64-bit data needs to me shuffled before we can write it to the URB.

  1   2   3   >