Re: [Mesa-dev] [PATCH v4 28/44] i965/fs: Use untyped_surface_read for 16-bit load_ssbo (v2)

2017-12-05 Thread Chema Casanova
On 05/12/17 23:47, Jason Ekstrand wrote:
> On Tue, Dec 5, 2017 at 1:36 PM, Jose Maria Casanova Crespo
> > wrote:
> 
> SSBO loads were using byte_scattered read messages as they allow
> reading 16-bit size components. byte_scattered messages can only
> operate one component at a time so we needed to emit as many messages
> as components.
> 
> But for vec2 and vec4 of 16-bit, being multiple of 32-bit we can use the
> untyped_surface_read message to read pairs of 16-bit components using
> only one message. Once each pair is read it is unshuffled to return the
> proper 16-bit components. vec3 case is assimilated to vec4 but the 4th
> component is ignored.
> 
> 16-bit scalars are read using one byte_scattered_read message.
> 
> v2: Removed use of stride = 2 on sources (Jason Ekstrand)
>     Rework optimization using unshuffle 16 reads (Chema Casanova)
> v3: Use W and D types insead of HF and F in shuffle to avoid rounding
>     erros (Jason Ekstrand)
>     Use untyped_surface_read for 16-bit vec3. (Jason Ekstrand)
> 
> CC: Jason Ekstrand >
> ---
>  src/intel/compiler/brw_fs_nir.cpp | 29 ++---
>  1 file changed, 22 insertions(+), 7 deletions(-)
> 
> diff --git a/src/intel/compiler/brw_fs_nir.cpp
> b/src/intel/compiler/brw_fs_nir.cpp
> index e11e75e6332..8deec082d59 100644
> --- a/src/intel/compiler/brw_fs_nir.cpp
> +++ b/src/intel/compiler/brw_fs_nir.cpp
> @@ -2303,16 +2303,31 @@ do_untyped_vector_read(const fs_builder ,
>                         unsigned num_components)
>  {
>     if (type_sz(dest.type) <= 2) {
> -      fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
> -      bld.MOV(read_offset, offset_reg);
> -      for (unsigned i = 0; i < num_components; i++) {
> -         fs_reg read_reg =
> -            emit_byte_scattered_read(bld, surf_index, read_offset,
> +      assert(dest.stride == 1);
> +
> +      if (num_components > 1) {
> +         /* Pairs of 16-bit components can be read with untyped
> read, for 16-bit
> +          * vec3 4th component is ignored.
> +          */
> +         fs_reg read_result =
> +            emit_untyped_read(bld, surf_index, offset_reg,
> +                              1 /* dims */,
> DIV_ROUND_UP(num_components, 2),
> +                              BRW_PREDICATE_NONE);
> +         shuffle_32bit_load_result_to_16bit_data(bld,
> +               retype(dest, BRW_REGISTER_TYPE_W),
> +               retype(read_result, BRW_REGISTER_TYPE_D),
> +               num_components);
> +      } else {
> +         assert(num_components == 1);
> +         /* scalar 16-bit are read using one byte_scattered_read
> message */
> +         fs_reg read_result =
> +            emit_byte_scattered_read(bld, surf_index, offset_reg,
>                                       1 /* dims */, 1,
>                                       type_sz(dest.type) * 8 /*
> bit_size */,
>                                       BRW_PREDICATE_NONE);
> -         bld.MOV(offset(dest, bld, i), subscript(read_reg,
> dest.type, 0));
> -         bld.ADD(read_offset, read_offset,
> brw_imm_ud(type_sz(dest.type)));
> +         read_result.type = dest.type;
> +         read_result.stride = 2;
> +         bld.MOV(dest, read_result);
> 
> 
> If read_reg has a 32-bit type, you could use subscript here.  Meh.

Fixed locally.

> Reviewed-by: Jason Ekstrand   
> 
>        }
>     } else if (type_sz(dest.type) == 4) {
>        fs_reg read_result = emit_untyped_read(bld, surf_index,
> offset_reg,
> --
> 2.11.0
> 
> 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v4 28/44] i965/fs: Use untyped_surface_read for 16-bit load_ssbo (v2)

2017-12-05 Thread Jason Ekstrand
On Tue, Dec 5, 2017 at 1:36 PM, Jose Maria Casanova Crespo <
jmcasan...@igalia.com> wrote:

> SSBO loads were using byte_scattered read messages as they allow
> reading 16-bit size components. byte_scattered messages can only
> operate one component at a time so we needed to emit as many messages
> as components.
>
> But for vec2 and vec4 of 16-bit, being multiple of 32-bit we can use the
> untyped_surface_read message to read pairs of 16-bit components using
> only one message. Once each pair is read it is unshuffled to return the
> proper 16-bit components. vec3 case is assimilated to vec4 but the 4th
> component is ignored.
>
> 16-bit scalars are read using one byte_scattered_read message.
>
> v2: Removed use of stride = 2 on sources (Jason Ekstrand)
> Rework optimization using unshuffle 16 reads (Chema Casanova)
> v3: Use W and D types insead of HF and F in shuffle to avoid rounding
> erros (Jason Ekstrand)
> Use untyped_surface_read for 16-bit vec3. (Jason Ekstrand)
>
> CC: Jason Ekstrand 
> ---
>  src/intel/compiler/brw_fs_nir.cpp | 29 ++---
>  1 file changed, 22 insertions(+), 7 deletions(-)
>
> diff --git a/src/intel/compiler/brw_fs_nir.cpp
> b/src/intel/compiler/brw_fs_nir.cpp
> index e11e75e6332..8deec082d59 100644
> --- a/src/intel/compiler/brw_fs_nir.cpp
> +++ b/src/intel/compiler/brw_fs_nir.cpp
> @@ -2303,16 +2303,31 @@ do_untyped_vector_read(const fs_builder ,
> unsigned num_components)
>  {
> if (type_sz(dest.type) <= 2) {
> -  fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
> -  bld.MOV(read_offset, offset_reg);
> -  for (unsigned i = 0; i < num_components; i++) {
> - fs_reg read_reg =
> -emit_byte_scattered_read(bld, surf_index, read_offset,
> +  assert(dest.stride == 1);
> +
> +  if (num_components > 1) {
> + /* Pairs of 16-bit components can be read with untyped read, for
> 16-bit
> +  * vec3 4th component is ignored.
> +  */
> + fs_reg read_result =
> +emit_untyped_read(bld, surf_index, offset_reg,
> +  1 /* dims */, DIV_ROUND_UP(num_components,
> 2),
> +  BRW_PREDICATE_NONE);
> + shuffle_32bit_load_result_to_16bit_data(bld,
> +   retype(dest, BRW_REGISTER_TYPE_W),
> +   retype(read_result, BRW_REGISTER_TYPE_D),
> +   num_components);
> +  } else {
> + assert(num_components == 1);
> + /* scalar 16-bit are read using one byte_scattered_read message
> */
> + fs_reg read_result =
> +emit_byte_scattered_read(bld, surf_index, offset_reg,
>   1 /* dims */, 1,
>   type_sz(dest.type) * 8 /* bit_size
> */,
>   BRW_PREDICATE_NONE);
> - bld.MOV(offset(dest, bld, i), subscript(read_reg, dest.type, 0));
> - bld.ADD(read_offset, read_offset, brw_imm_ud(type_sz(dest.type))
> );
> + read_result.type = dest.type;
> + read_result.stride = 2;
> + bld.MOV(dest, read_result);
>

If read_reg has a 32-bit type, you could use subscript here.  Meh.

Reviewed-by: Jason Ekstrand 


>}
> } else if (type_sz(dest.type) == 4) {
>fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
> --
> 2.11.0
>
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v4 28/44] i965/fs: Use untyped_surface_read for 16-bit load_ssbo (v2)

2017-12-05 Thread Jose Maria Casanova Crespo
SSBO loads were using byte_scattered read messages as they allow
reading 16-bit size components. byte_scattered messages can only
operate one component at a time so we needed to emit as many messages
as components.

But for vec2 and vec4 of 16-bit, being multiple of 32-bit we can use the
untyped_surface_read message to read pairs of 16-bit components using
only one message. Once each pair is read it is unshuffled to return the
proper 16-bit components. vec3 case is assimilated to vec4 but the 4th
component is ignored.

16-bit scalars are read using one byte_scattered_read message.

v2: Removed use of stride = 2 on sources (Jason Ekstrand)
Rework optimization using unshuffle 16 reads (Chema Casanova)
v3: Use W and D types insead of HF and F in shuffle to avoid rounding
erros (Jason Ekstrand)
Use untyped_surface_read for 16-bit vec3. (Jason Ekstrand)

CC: Jason Ekstrand 
---
 src/intel/compiler/brw_fs_nir.cpp | 29 ++---
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index e11e75e6332..8deec082d59 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2303,16 +2303,31 @@ do_untyped_vector_read(const fs_builder ,
unsigned num_components)
 {
if (type_sz(dest.type) <= 2) {
-  fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
-  bld.MOV(read_offset, offset_reg);
-  for (unsigned i = 0; i < num_components; i++) {
- fs_reg read_reg =
-emit_byte_scattered_read(bld, surf_index, read_offset,
+  assert(dest.stride == 1);
+
+  if (num_components > 1) {
+ /* Pairs of 16-bit components can be read with untyped read, for 
16-bit
+  * vec3 4th component is ignored.
+  */
+ fs_reg read_result =
+emit_untyped_read(bld, surf_index, offset_reg,
+  1 /* dims */, DIV_ROUND_UP(num_components, 2),
+  BRW_PREDICATE_NONE);
+ shuffle_32bit_load_result_to_16bit_data(bld,
+   retype(dest, BRW_REGISTER_TYPE_W),
+   retype(read_result, BRW_REGISTER_TYPE_D),
+   num_components);
+  } else {
+ assert(num_components == 1);
+ /* scalar 16-bit are read using one byte_scattered_read message */
+ fs_reg read_result =
+emit_byte_scattered_read(bld, surf_index, offset_reg,
  1 /* dims */, 1,
  type_sz(dest.type) * 8 /* bit_size */,
  BRW_PREDICATE_NONE);
- bld.MOV(offset(dest, bld, i), subscript(read_reg, dest.type, 0));
- bld.ADD(read_offset, read_offset, brw_imm_ud(type_sz(dest.type)));
+ read_result.type = dest.type;
+ read_result.stride = 2;
+ bld.MOV(dest, read_result);
   }
} else if (type_sz(dest.type) == 4) {
   fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v4 28/44] i965/fs: Use untyped_surface_read for 16-bit load_ssbo

2017-12-01 Thread Chema Casanova
On 01/12/17 11:49, Jason Ekstrand wrote:
> On Wed, Nov 29, 2017 at 6:57 PM, Jose Maria Casanova Crespo
> > wrote:
> 
> SSBO loads were using byte_scattered read messages as they allow
> reading 16-bit size components. byte_scattered messages can only
> operate one component at a time so we needed to emit as many messages
> as components.
> 
> But for vec2 and vec4 of 16-bit, being multiple of 32-bit we can use the
> untyped_surface_read message to read pairs of 16-bit components
> using only
> one message. Once each pair is read it is unshuffled to return the
> proper
> 16-bit components.
> 
> On 16-bit scalar and vec3 16-bit the not paired component is read using
> only one byte_scattered_read message.
> 
> 
> My gut tells me that, for vec3's, we'd be better off with a single
> untyped read than one untyped read and one byte scattered read.  Also,
> are there alignment issues with untyped surface reads/writes that might
> cause us problems on vec3's?  I don't know what the alignment rules are
> for 16-bit vec3's in Vulkan.

I think that untyped_read will work perfectly fine with vec3 as there
are not special rules for 16-bits. The only thing would be that we would
writing always the unused 4th component, so we decided to play save and
just modify what was expected and only scattered write allowed that with
that approach:

"* A three- or four-component vector, with components of size N, has a
base alignment of 4 N."

I was trying for this V4 of the series, to use untyped_surface_read for
all the cases, but I focused on scalar ones, without success. But for
vec3 it should be easy to do if we can assume to write random data at
the 4th component.

>  
> 
> v2: Removed use of stride = 2 on sources (Jason Ekstrand)
>     Rework optimization using unshuffle 16 reads (Chema Casanova)
> ---
>  src/intel/compiler/brw_fs_nir.cpp | 43
> ++-
>  1 file changed, 33 insertions(+), 10 deletions(-)
> 
> diff --git a/src/intel/compiler/brw_fs_nir.cpp
> b/src/intel/compiler/brw_fs_nir.cpp
> index fa7aa9c247..57e79853ef 100644
> --- a/src/intel/compiler/brw_fs_nir.cpp
> +++ b/src/intel/compiler/brw_fs_nir.cpp
> @@ -2354,16 +2354,39 @@ do_untyped_vector_read(const fs_builder ,
>           bld.ADD(read_offset, read_offset, brw_imm_ud(16));
>        }
>     } else if (type_sz(dest.type) == 2) {
> -      fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
> -      bld.MOV(read_offset, offset_reg);
> -      for (unsigned i = 0; i < num_components; i++) {
> -         fs_reg read_reg = emit_byte_scattered_read(bld,
> surf_index, read_offset,
> -                                                    1 /* dims */,
> -                                                    1,
> -                                                    16 /*bit_size */,
> -                                                   
> BRW_PREDICATE_NONE);
> -         bld.MOV(offset(dest,bld,i), subscript(read_reg, dest.type,
> 0));
> -         bld.ADD(read_offset, read_offset,
> brw_imm_ud(type_sz(dest.type)));
> +      assert(dest.stride == 1);
> +
> +      int component_pairs = num_components / 2;
> +      /* Pairs of 16-bit components can be read with untyped read */
> +      if (component_pairs > 0) {
> +         fs_reg read_result = emit_untyped_read(bld, surf_index,
> +                                                offset_reg,
> +                                                1 /* dims */,
> +                                                component_pairs,
> +                                                BRW_PREDICATE_NONE);
> +         shuffle_32bit_load_result_to_16bit_data(bld,
> +               retype(dest, BRW_REGISTER_TYPE_HF),
> +               retype(read_result, BRW_REGISTER_TYPE_F),
> 
> 
> I'd rather we use W and D rather than HF and F.  Rounding errors scare me.

Ok.

Thanks for the review.

Chema

> +               component_pairs * 2);
> +      }
> +      /* Last component of vec3 and scalar 16-bit read needs to be read
> +       * using one byte_scattered_read message
> +       */
> +      if (num_components % 2) {
> +         fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
> +         bld.ADD(read_offset,
> +                 offset_reg,
> +                 brw_imm_ud((num_components - 1) *
> type_sz(dest.type)));
> +         fs_reg read_result = emit_byte_scattered_read(bld, surf_index,
> +                                                       read_offset,
> +                                                       1 /* dims */,
> +                                                       1,
> +                                                       16 /*
> bit_size */,
> +     

Re: [Mesa-dev] [PATCH v4 28/44] i965/fs: Use untyped_surface_read for 16-bit load_ssbo

2017-12-01 Thread Jason Ekstrand
On Wed, Nov 29, 2017 at 6:57 PM, Jose Maria Casanova Crespo <
jmcasan...@igalia.com> wrote:

> SSBO loads were using byte_scattered read messages as they allow
> reading 16-bit size components. byte_scattered messages can only
> operate one component at a time so we needed to emit as many messages
> as components.
>
> But for vec2 and vec4 of 16-bit, being multiple of 32-bit we can use the
> untyped_surface_read message to read pairs of 16-bit components using only
> one message. Once each pair is read it is unshuffled to return the proper
> 16-bit components.
>
> On 16-bit scalar and vec3 16-bit the not paired component is read using
> only one byte_scattered_read message.
>

My gut tells me that, for vec3's, we'd be better off with a single untyped
read than one untyped read and one byte scattered read.  Also, are there
alignment issues with untyped surface reads/writes that might cause us
problems on vec3's?  I don't know what the alignment rules are for 16-bit
vec3's in Vulkan.


> v2: Removed use of stride = 2 on sources (Jason Ekstrand)
> Rework optimization using unshuffle 16 reads (Chema Casanova)
> ---
>  src/intel/compiler/brw_fs_nir.cpp | 43 ++
> -
>  1 file changed, 33 insertions(+), 10 deletions(-)
>
> diff --git a/src/intel/compiler/brw_fs_nir.cpp
> b/src/intel/compiler/brw_fs_nir.cpp
> index fa7aa9c247..57e79853ef 100644
> --- a/src/intel/compiler/brw_fs_nir.cpp
> +++ b/src/intel/compiler/brw_fs_nir.cpp
> @@ -2354,16 +2354,39 @@ do_untyped_vector_read(const fs_builder ,
>   bld.ADD(read_offset, read_offset, brw_imm_ud(16));
>}
> } else if (type_sz(dest.type) == 2) {
> -  fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
> -  bld.MOV(read_offset, offset_reg);
> -  for (unsigned i = 0; i < num_components; i++) {
> - fs_reg read_reg = emit_byte_scattered_read(bld, surf_index,
> read_offset,
> -1 /* dims */,
> -1,
> -16 /*bit_size */,
> -BRW_PREDICATE_NONE);
> - bld.MOV(offset(dest,bld,i), subscript(read_reg, dest.type, 0));
> - bld.ADD(read_offset, read_offset, brw_imm_ud(type_sz(dest.type))
> );
> +  assert(dest.stride == 1);
> +
> +  int component_pairs = num_components / 2;
> +  /* Pairs of 16-bit components can be read with untyped read */
> +  if (component_pairs > 0) {
> + fs_reg read_result = emit_untyped_read(bld, surf_index,
> +offset_reg,
> +1 /* dims */,
> +component_pairs,
> +BRW_PREDICATE_NONE);
> + shuffle_32bit_load_result_to_16bit_data(bld,
> +   retype(dest, BRW_REGISTER_TYPE_HF),
> +   retype(read_result, BRW_REGISTER_TYPE_F),
>

I'd rather we use W and D rather than HF and F.  Rounding errors scare me.


> +   component_pairs * 2);
> +  }
> +  /* Last component of vec3 and scalar 16-bit read needs to be read
> +   * using one byte_scattered_read message
> +   */
> +  if (num_components % 2) {
> + fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
> + bld.ADD(read_offset,
> + offset_reg,
> + brw_imm_ud((num_components - 1) * type_sz(dest.type)));
> + fs_reg read_result = emit_byte_scattered_read(bld, surf_index,
> +   read_offset,
> +   1 /* dims */,
> +   1,
> +   16 /* bit_size */,
> +
>  BRW_PREDICATE_NONE);
> + read_result.type = dest.type;
> + read_result.stride = 2;
> +
> + bld.MOV(offset(dest, bld, num_components - 1), read_result);
>}
> } else {
>unreachable("Unsupported type");
> --
> 2.14.3
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v4 28/44] i965/fs: Use untyped_surface_read for 16-bit load_ssbo

2017-11-29 Thread Jose Maria Casanova Crespo
SSBO loads were using byte_scattered read messages as they allow
reading 16-bit size components. byte_scattered messages can only
operate one component at a time so we needed to emit as many messages
as components.

But for vec2 and vec4 of 16-bit, being multiple of 32-bit we can use the
untyped_surface_read message to read pairs of 16-bit components using only
one message. Once each pair is read it is unshuffled to return the proper
16-bit components.

On 16-bit scalar and vec3 16-bit the not paired component is read using
only one byte_scattered_read message.

v2: Removed use of stride = 2 on sources (Jason Ekstrand)
Rework optimization using unshuffle 16 reads (Chema Casanova)
---
 src/intel/compiler/brw_fs_nir.cpp | 43 ++-
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index fa7aa9c247..57e79853ef 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2354,16 +2354,39 @@ do_untyped_vector_read(const fs_builder ,
  bld.ADD(read_offset, read_offset, brw_imm_ud(16));
   }
} else if (type_sz(dest.type) == 2) {
-  fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
-  bld.MOV(read_offset, offset_reg);
-  for (unsigned i = 0; i < num_components; i++) {
- fs_reg read_reg = emit_byte_scattered_read(bld, surf_index, 
read_offset,
-1 /* dims */,
-1,
-16 /*bit_size */,
-BRW_PREDICATE_NONE);
- bld.MOV(offset(dest,bld,i), subscript(read_reg, dest.type, 0));
- bld.ADD(read_offset, read_offset, brw_imm_ud(type_sz(dest.type)));
+  assert(dest.stride == 1);
+
+  int component_pairs = num_components / 2;
+  /* Pairs of 16-bit components can be read with untyped read */
+  if (component_pairs > 0) {
+ fs_reg read_result = emit_untyped_read(bld, surf_index,
+offset_reg,
+1 /* dims */,
+component_pairs,
+BRW_PREDICATE_NONE);
+ shuffle_32bit_load_result_to_16bit_data(bld,
+   retype(dest, BRW_REGISTER_TYPE_HF),
+   retype(read_result, BRW_REGISTER_TYPE_F),
+   component_pairs * 2);
+  }
+  /* Last component of vec3 and scalar 16-bit read needs to be read
+   * using one byte_scattered_read message
+   */
+  if (num_components % 2) {
+ fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
+ bld.ADD(read_offset,
+ offset_reg,
+ brw_imm_ud((num_components - 1) * type_sz(dest.type)));
+ fs_reg read_result = emit_byte_scattered_read(bld, surf_index,
+   read_offset,
+   1 /* dims */,
+   1,
+   16 /* bit_size */,
+   BRW_PREDICATE_NONE);
+ read_result.type = dest.type;
+ read_result.stride = 2;
+
+ bld.MOV(offset(dest, bld, num_components - 1), read_result);
   }
} else {
   unreachable("Unsupported type");
-- 
2.14.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev