Re: [Mesa-dev] [PATCH v3 2/2] i965/vec4: load dvec3/4 uniforms first in the push constant buffer

2017-05-04 Thread Samuel Iglesias Gonsálvez
On Wed, 2017-05-03 at 16:47 -0700, Francisco Jerez wrote:
> Samuel Iglesias Gonsálvez  writes:
> 
> > Reorder the uniforms to load first the dvec4-aligned variables
> > in the push constant buffer and then push the vec4-aligned ones.
> > 
> > This fixes a bug were the dvec3/4 might be loaded one part on a GRF
> > and
> > the rest in next GRF, so the region parameters to read that could
> > break
> > the HW rules.
> > 
> > v2:
> > - Fix broken logic.
> > - Add a comment to explain what should be needed to optimise the
> > usage
> > of the push constant buffer slots, as this patch does not pack the
> > uniforms.
> > 
> > Signed-off-by: Samuel Iglesias Gonsálvez 
> > Cc: "17.1" 
> > ---
> >  src/intel/compiler/brw_vec4.cpp | 97
> > +++--
> >  1 file changed, 74 insertions(+), 23 deletions(-)
> > 
> > diff --git a/src/intel/compiler/brw_vec4.cpp
> > b/src/intel/compiler/brw_vec4.cpp
> > index 0909ddb586..18bfd48fa1 100644
> > --- a/src/intel/compiler/brw_vec4.cpp
> > +++ b/src/intel/compiler/brw_vec4.cpp
> > @@ -583,16 +583,44 @@ vec4_visitor::split_uniform_registers()
> > }
> >  }
> >  
> > +/* This function returns the register number where we placed the
> > uniform */
> > +static int
> > +set_push_constant_loc(const int nr_uniforms, int
> > *new_uniform_count,
> > +  const int src, const int size,
> > +  int *new_loc, int *new_chan,
> > +  int *new_chans_used)
> > +{
> > +   int dst;
> > +   /* Find the lowest place we can slot this uniform in. */
> > +   for (dst = 0; dst < nr_uniforms; dst++) {
> > +  if (new_chans_used[dst] + size <= 4)
> > + break;
> > +   }
> > +
> > +   assert(dst < nr_uniforms);
> > +
> > +   new_loc[src] = dst;
> > +   new_chan[src] = new_chans_used[dst];
> > +   new_chans_used[dst] += size;
> > +
> > +   *new_uniform_count = MAX2(*new_uniform_count, dst + 1);
> > +   return dst;
> > +}
> > +
> >  void
> >  vec4_visitor::pack_uniform_registers()
> >  {
> > uint8_t chans_used[this->uniforms];
> > int new_loc[this->uniforms];
> > int new_chan[this->uniforms];
> > +   bool is_aligned_to_dvec4[this->uniforms];
> > +   int new_chans_used[this->uniforms];
> >  
> > memset(chans_used, 0, sizeof(chans_used));
> > memset(new_loc, 0, sizeof(new_loc));
> > memset(new_chan, 0, sizeof(new_chan));
> > +   memset(new_chans_used, 0, sizeof(new_chans_used));
> > +   memset(is_aligned_to_dvec4, 0, sizeof(is_aligned_to_dvec4));
> >  
> > /* Find which uniform vectors are actually used by the
> > program.  We
> >  * expect unused vector elements when we've moved array access
> > out
> > @@ -631,10 +659,19 @@ vec4_visitor::pack_uniform_registers()
> >  
> >  unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle,
> > c) + 1;
> >  unsigned used = MAX2(chans_used[reg], channel *
> > channel_size);
> > -if (used <= 4)
> > -   chans_used[reg] = used;
> > -else
> > -   chans_used[reg + 1] = used - 4;
> > +/* FIXME: Marked all channels as used, so each uniform
> > will
> > + * fully use one or two vec4s. If we want to pack
> > them, we need
> > + * to, among other changes, set chans_used[reg] =
> > used;
> > + * chans_used[reg+1] = used - 4; and fix the swizzle
> > at the
> > + * end in order to set the proper location.
> > + */
> > +if (used <= 4) {
> > +   chans_used[reg] = 4;
> 
> Uhm...  So this change prevents the uniform packing pass from
> actually
> packing anything?  Might affect more applications negatively than
> broken
> FP64 would.  Are you planning to send a v3 that fixes the issue
> without
> disabling the optimization?

Yes, I am planning to send a v3 of this patch with the optimization in-
place.

>   May be worth holding this off until then.
> Even if that means it will miss the v17.1 release it will probably
> make
> it for the next bug-fix release.
> 

OK, thanks!

Sam


> > +} else {
> > +   is_aligned_to_dvec4[reg] = true;
> > +   is_aligned_to_dvec4[reg + 1] = true;
> > +   chans_used[reg + 1] = 4;
> > +}
> >   }
> >    }
> >  
> > @@ -659,42 +696,56 @@ vec4_visitor::pack_uniform_registers()
> >  
> > int new_uniform_count = 0;
> >  
> > +   /* As the uniforms are going to be reordered, take the data
> > from a temporary
> > +* copy of the original param[].
> > +*/
> > +   gl_constant_value **param = ralloc_array(NULL,
> > gl_constant_value*,
> > +stage_prog_data-
> > >nr_params);
> > +   memcpy(param, stage_prog_data->param,
> > +  sizeof(gl_constant_value*) * stage_prog_data-
> > >nr_params);
> > +
> > /* Now, figure out a packing of the live uniform vectors into
> 

Re: [Mesa-dev] [PATCH v3 2/2] i965/vec4: load dvec3/4 uniforms first in the push constant buffer

2017-05-03 Thread Francisco Jerez
Samuel Iglesias Gonsálvez  writes:

> Reorder the uniforms to load first the dvec4-aligned variables
> in the push constant buffer and then push the vec4-aligned ones.
>
> This fixes a bug were the dvec3/4 might be loaded one part on a GRF and
> the rest in next GRF, so the region parameters to read that could break
> the HW rules.
>
> v2:
> - Fix broken logic.
> - Add a comment to explain what should be needed to optimise the usage
> of the push constant buffer slots, as this patch does not pack the
> uniforms.
>
> Signed-off-by: Samuel Iglesias Gonsálvez 
> Cc: "17.1" 
> ---
>  src/intel/compiler/brw_vec4.cpp | 97 
> +++--
>  1 file changed, 74 insertions(+), 23 deletions(-)
>
> diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
> index 0909ddb586..18bfd48fa1 100644
> --- a/src/intel/compiler/brw_vec4.cpp
> +++ b/src/intel/compiler/brw_vec4.cpp
> @@ -583,16 +583,44 @@ vec4_visitor::split_uniform_registers()
> }
>  }
>  
> +/* This function returns the register number where we placed the uniform */
> +static int
> +set_push_constant_loc(const int nr_uniforms, int *new_uniform_count,
> +  const int src, const int size,
> +  int *new_loc, int *new_chan,
> +  int *new_chans_used)
> +{
> +   int dst;
> +   /* Find the lowest place we can slot this uniform in. */
> +   for (dst = 0; dst < nr_uniforms; dst++) {
> +  if (new_chans_used[dst] + size <= 4)
> + break;
> +   }
> +
> +   assert(dst < nr_uniforms);
> +
> +   new_loc[src] = dst;
> +   new_chan[src] = new_chans_used[dst];
> +   new_chans_used[dst] += size;
> +
> +   *new_uniform_count = MAX2(*new_uniform_count, dst + 1);
> +   return dst;
> +}
> +
>  void
>  vec4_visitor::pack_uniform_registers()
>  {
> uint8_t chans_used[this->uniforms];
> int new_loc[this->uniforms];
> int new_chan[this->uniforms];
> +   bool is_aligned_to_dvec4[this->uniforms];
> +   int new_chans_used[this->uniforms];
>  
> memset(chans_used, 0, sizeof(chans_used));
> memset(new_loc, 0, sizeof(new_loc));
> memset(new_chan, 0, sizeof(new_chan));
> +   memset(new_chans_used, 0, sizeof(new_chans_used));
> +   memset(is_aligned_to_dvec4, 0, sizeof(is_aligned_to_dvec4));
>  
> /* Find which uniform vectors are actually used by the program.  We
>  * expect unused vector elements when we've moved array access out
> @@ -631,10 +659,19 @@ vec4_visitor::pack_uniform_registers()
>  
>  unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle, c) + 1;
>  unsigned used = MAX2(chans_used[reg], channel * channel_size);
> -if (used <= 4)
> -   chans_used[reg] = used;
> -else
> -   chans_used[reg + 1] = used - 4;
> +/* FIXME: Marked all channels as used, so each uniform will
> + * fully use one or two vec4s. If we want to pack them, we need
> + * to, among other changes, set chans_used[reg] = used;
> + * chans_used[reg+1] = used - 4; and fix the swizzle at the
> + * end in order to set the proper location.
> + */
> +if (used <= 4) {
> +   chans_used[reg] = 4;

Uhm...  So this change prevents the uniform packing pass from actually
packing anything?  Might affect more applications negatively than broken
FP64 would.  Are you planning to send a v3 that fixes the issue without
disabling the optimization?  May be worth holding this off until then.
Even if that means it will miss the v17.1 release it will probably make
it for the next bug-fix release.

> +} else {
> +   is_aligned_to_dvec4[reg] = true;
> +   is_aligned_to_dvec4[reg + 1] = true;
> +   chans_used[reg + 1] = 4;
> +}
>   }
>}
>  
> @@ -659,42 +696,56 @@ vec4_visitor::pack_uniform_registers()
>  
> int new_uniform_count = 0;
>  
> +   /* As the uniforms are going to be reordered, take the data from a 
> temporary
> +* copy of the original param[].
> +*/
> +   gl_constant_value **param = ralloc_array(NULL, gl_constant_value*,
> +stage_prog_data->nr_params);
> +   memcpy(param, stage_prog_data->param,
> +  sizeof(gl_constant_value*) * stage_prog_data->nr_params);
> +
> /* Now, figure out a packing of the live uniform vectors into our
> -* push constants.
> +* push constants. Start with dvec{3,4} because they are aligned to
> +* dvec4 size (2 vec4).
>  */
> for (int src = 0; src < uniforms; src++) {
>int size = chans_used[src];
>  
> -  if (size == 0)
> +  if (size == 0 || !is_aligned_to_dvec4[src])
>   continue;
>  
> -  int dst;
> -  /* Find the lowest place we can slot this uniform in. */
> -  for (dst = 0; dst < src; dst++) {
> - if 

[Mesa-dev] [PATCH v3 2/2] i965/vec4: load dvec3/4 uniforms first in the push constant buffer

2017-05-03 Thread Samuel Iglesias Gonsálvez
Reorder the uniforms to load first the dvec4-aligned variables
in the push constant buffer and then push the vec4-aligned ones.

This fixes a bug were the dvec3/4 might be loaded one part on a GRF and
the rest in next GRF, so the region parameters to read that could break
the HW rules.

v2:
- Fix broken logic.
- Add a comment to explain what should be needed to optimise the usage
of the push constant buffer slots, as this patch does not pack the
uniforms.

Signed-off-by: Samuel Iglesias Gonsálvez 
Cc: "17.1" 
---
 src/intel/compiler/brw_vec4.cpp | 97 +++--
 1 file changed, 74 insertions(+), 23 deletions(-)

diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
index 0909ddb586..18bfd48fa1 100644
--- a/src/intel/compiler/brw_vec4.cpp
+++ b/src/intel/compiler/brw_vec4.cpp
@@ -583,16 +583,44 @@ vec4_visitor::split_uniform_registers()
}
 }
 
+/* This function returns the register number where we placed the uniform */
+static int
+set_push_constant_loc(const int nr_uniforms, int *new_uniform_count,
+  const int src, const int size,
+  int *new_loc, int *new_chan,
+  int *new_chans_used)
+{
+   int dst;
+   /* Find the lowest place we can slot this uniform in. */
+   for (dst = 0; dst < nr_uniforms; dst++) {
+  if (new_chans_used[dst] + size <= 4)
+ break;
+   }
+
+   assert(dst < nr_uniforms);
+
+   new_loc[src] = dst;
+   new_chan[src] = new_chans_used[dst];
+   new_chans_used[dst] += size;
+
+   *new_uniform_count = MAX2(*new_uniform_count, dst + 1);
+   return dst;
+}
+
 void
 vec4_visitor::pack_uniform_registers()
 {
uint8_t chans_used[this->uniforms];
int new_loc[this->uniforms];
int new_chan[this->uniforms];
+   bool is_aligned_to_dvec4[this->uniforms];
+   int new_chans_used[this->uniforms];
 
memset(chans_used, 0, sizeof(chans_used));
memset(new_loc, 0, sizeof(new_loc));
memset(new_chan, 0, sizeof(new_chan));
+   memset(new_chans_used, 0, sizeof(new_chans_used));
+   memset(is_aligned_to_dvec4, 0, sizeof(is_aligned_to_dvec4));
 
/* Find which uniform vectors are actually used by the program.  We
 * expect unused vector elements when we've moved array access out
@@ -631,10 +659,19 @@ vec4_visitor::pack_uniform_registers()
 
 unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle, c) + 1;
 unsigned used = MAX2(chans_used[reg], channel * channel_size);
-if (used <= 4)
-   chans_used[reg] = used;
-else
-   chans_used[reg + 1] = used - 4;
+/* FIXME: Marked all channels as used, so each uniform will
+ * fully use one or two vec4s. If we want to pack them, we need
+ * to, among other changes, set chans_used[reg] = used;
+ * chans_used[reg+1] = used - 4; and fix the swizzle at the
+ * end in order to set the proper location.
+ */
+if (used <= 4) {
+   chans_used[reg] = 4;
+} else {
+   is_aligned_to_dvec4[reg] = true;
+   is_aligned_to_dvec4[reg + 1] = true;
+   chans_used[reg + 1] = 4;
+}
  }
   }
 
@@ -659,42 +696,56 @@ vec4_visitor::pack_uniform_registers()
 
int new_uniform_count = 0;
 
+   /* As the uniforms are going to be reordered, take the data from a temporary
+* copy of the original param[].
+*/
+   gl_constant_value **param = ralloc_array(NULL, gl_constant_value*,
+stage_prog_data->nr_params);
+   memcpy(param, stage_prog_data->param,
+  sizeof(gl_constant_value*) * stage_prog_data->nr_params);
+
/* Now, figure out a packing of the live uniform vectors into our
-* push constants.
+* push constants. Start with dvec{3,4} because they are aligned to
+* dvec4 size (2 vec4).
 */
for (int src = 0; src < uniforms; src++) {
   int size = chans_used[src];
 
-  if (size == 0)
+  if (size == 0 || !is_aligned_to_dvec4[src])
  continue;
 
-  int dst;
-  /* Find the lowest place we can slot this uniform in. */
-  for (dst = 0; dst < src; dst++) {
- if (chans_used[dst] + size <= 4)
-break;
+  int dst = set_push_constant_loc(uniforms, _uniform_count,
+  src, size, new_loc, new_chan,
+  new_chans_used);
+  if (dst != src) {
+ /* Move the references to the data */
+ for (int j = 0; j < size; j++) {
+stage_prog_data->param[dst * 4 + new_chan[src] + j] =
+   param[src * 4 + j];
+ }
   }
+   }
 
-  if (src == dst) {
- new_loc[src] = dst;
- new_chan[src] = 0;
-  } else {
- new_loc[src] = dst;
- new_chan[src] = chans_used[dst];
+   /* Continue with the rest of