subject:"\[Mesa\-dev\] \[PATCH v4 23\/44\] i965\/fs\: Enables 16\-bit load

Re: [Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler (v2)

2017-12-05 Thread Jason Ekstrand

Looks good to me.

On Tue, Dec 5, 2017 at 3:47 PM, Jose Maria Casanova Crespo <
jmcasan...@igalia.com> wrote:

> From: Jason Ekstrand 
>
> load_ubo is using 32-bit loads as uniforms surfaces have a 32-bit
> surface format defined. So when reading 16-bit components with the
> sampler we need to unshuffle two 16-bit components from each 32-bit
> component.
>
> Using the sampler avoids the use of the byte_scattered_read message
> that needs one message for each component and is supposed to be
> slower.
>
> v2: (Jason Ekstrand)
> - Simplify component selection and unshuffling for different bitsizes
> - Remove SKL optimization of reading only two 32-bit components when
>   reading 16-bits types.
>
> Reviewed-by: Jose Maria Casanova Crespo 
> ---
>  src/intel/compiler/brw_fs.cpp | 21 ++---
>  1 file changed, 14 insertions(+), 7 deletions(-)
>
> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
> index 91399c6c1d..93bb6b4673 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -191,14 +191,21 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const
> fs_builder ,
>  vec4_result, surf_index, vec4_offset);
> inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
>
> -   if (type_sz(dst.type) == 8) {
> -  shuffle_32bit_load_result_to_64bit_data(
> - bld, retype(vec4_result, dst.type), vec4_result, 2);
> +   fs_reg dw = offset(vec4_result, bld, (const_offset & 0xf) / 4);
> +   switch (type_sz(dst.type)) {
> +   case 2:
> +  shuffle_32bit_load_result_to_16bit_data(bld, dst, dw, 1);
> +  bld.MOV(dst, subscript(dw, dst.type, (const_offset / 2) & 1));
> +  break;
> +   case 4:
> +  bld.MOV(dst, retype(dw, dst.type));
> +  break;
> +   case 8:
> +  shuffle_32bit_load_result_to_64bit_data(bld, dst, dw, 1);
> +  break;
> +   default:
> +  unreachable("Unsupported bit_size");
> }
> -
> -   vec4_result.type = dst.type;
> -   bld.MOV(dst, offset(vec4_result, bld,
> -   (const_offset & 0xf) / type_sz(vec4_result.type)));
>  }
>
>  /**
> --
> 2.14.3
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler (v2)

2017-12-05 Thread Jose Maria Casanova Crespo

From: Jason Ekstrand 

load_ubo is using 32-bit loads as uniforms surfaces have a 32-bit
surface format defined. So when reading 16-bit components with the
sampler we need to unshuffle two 16-bit components from each 32-bit
component.

Using the sampler avoids the use of the byte_scattered_read message
that needs one message for each component and is supposed to be
slower.

v2: (Jason Ekstrand)
- Simplify component selection and unshuffling for different bitsizes
- Remove SKL optimization of reading only two 32-bit components when
  reading 16-bits types.

Reviewed-by: Jose Maria Casanova Crespo 
---
 src/intel/compiler/brw_fs.cpp | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 91399c6c1d..93bb6b4673 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -191,14 +191,21 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
,
 vec4_result, surf_index, vec4_offset);
inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
 
-   if (type_sz(dst.type) == 8) {
-  shuffle_32bit_load_result_to_64bit_data(
- bld, retype(vec4_result, dst.type), vec4_result, 2);
+   fs_reg dw = offset(vec4_result, bld, (const_offset & 0xf) / 4);
+   switch (type_sz(dst.type)) {
+   case 2:
+  shuffle_32bit_load_result_to_16bit_data(bld, dst, dw, 1);
+  bld.MOV(dst, subscript(dw, dst.type, (const_offset / 2) & 1));
+  break;
+   case 4:
+  bld.MOV(dst, retype(dw, dst.type));
+  break;
+   case 8:
+  shuffle_32bit_load_result_to_64bit_data(bld, dst, dw, 1);
+  break;
+   default:
+  unreachable("Unsupported bit_size");
}
-
-   vec4_result.type = dst.type;
-   bld.MOV(dst, offset(vec4_result, bld,
-   (const_offset & 0xf) / type_sz(vec4_result.type)));
 }
 
 /**
-- 
2.14.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler

2017-12-05 Thread Chema Casanova



On 05/12/17 22:25, Chema Casanova wrote:
> On 05/12/17 19:53, Jason Ekstrand wrote:
>> On Tue, Dec 5, 2017 at 9:08 AM, Chema Casanova > > wrote:
>>
>> El 30/11/17 a las 23:58, Jason Ekstrand escribió:
>> > On Wed, Nov 29, 2017 at 6:50 PM, Jose Maria Casanova Crespo
>> > 
>> >> wrote:
>> >
>> >     load_ubo is using 32-bit loads as uniforms surfaces have a 32-bit
>> >     surface format defined. So when reading 16-bit components with the
>> >     sampler we need to unshuffle two 16-bit components from each
>> 32-bit
>> >     component.
>> >
>> >     Using the sampler avoids the use of the byte_scattered_read
>> message
>> >     that needs one message for each component and is supposed to be
>> >     slower.
>> >
>> >     In the case of SKL+ we take advantage of a hardware feature that
>> >     automatically defines a channel mask based on the rlen value,
>> so on
>> >     SKL+ we only use half of the registers without using a header
>> in the
>> >     payload.
>> >     ---
>> >      src/intel/compiler/brw_fs.cpp           | 31
>> >     +++
>> >      src/intel/compiler/brw_fs_generator.cpp | 10 --
>> >      2 files changed, 35 insertions(+), 6 deletions(-)
>> >
>> >     diff --git a/src/intel/compiler/brw_fs.cpp
>> >     b/src/intel/compiler/brw_fs.cpp
>> >     index 1ca4d416b2..9c543496ba 100644
>> >     --- a/src/intel/compiler/brw_fs.cpp
>> >     +++ b/src/intel/compiler/brw_fs.cpp
>> >     @@ -184,9 +184,17 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const
>> >     fs_builder ,
>> >          * a double this means we are only loading 2 elements worth of
>> >     data.
>> >          * We also want to use a 32-bit data type for the dst of the
>> >     load operation
>> >          * so other parts of the driver don't get confused about the
>> >     size of the
>> >     -    * result.
>> >     +    * result. On the case of 16-bit data we only need half of the
>> >     32-bit
>> >     +    * components on SKL+ as we take advance of using message
>> >     return size to
>> >     +    * define an xy channel mask.
>> >          */
>> >     -   fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
>> >     +   fs_reg vec4_result;
>> >     +   if (type_sz(dst.type) == 2 && (devinfo->gen >= 9)) {
>> >     +      vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
>> >     +      vec4_result = retype(vec4_result, BRW_REGISTER_TYPE_HF);
>> >     +   } else {
>> >     +      vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
>> >     +   }
>> >
>> >         fs_inst *inst =
>> >     bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
>> >                                  vec4_result, surf_index,
>> vec4_offset);
>> >         inst->size_written = 4 *
>> >     vec4_result.component_size(inst->exec_size);
>> >     @@ -197,8 +205,23 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const
>> >     fs_builder ,
>> >         }
>> >
>> >         vec4_result.type = dst.type;
>> >     -   bld.MOV(dst, offset(vec4_result, bld,
>> >     -                       (const_offset & 0xf) /
>> >     type_sz(vec4_result.type)));
>> >     +
>> >     +   if (type_sz(dst.type) == 2) {
>> >     +      /* 16-bit types need to be unshuffled as each pair of
>> >     16-bit components
>> >     +       * is packed on a 32-bit compoment because we are using a
>> >     32-bit format
>> >     +       * in the surface of uniform that is read by the sampler.
>> >     +       * TODO: On BDW+ mark when an uniform has 16-bit type so we
>> >     could setup a
>> >     +       * surface format of 16-bit and use the 16-bit return
>> >     format at the
>> >     +       * sampler.
>> >     +       */
>> >     +      vec4_result.stride = 2;
>> >     +      bld.MOV(dst, byte_offset(offset(vec4_result, bld,
>> >     +                                      (const_offset & 0x7) / 4),
>> >     +                               (const_offset & 0x7) / 2 % 2 *
>> 2));
>> >     +   } else {
>> >     +      bld.MOV(dst, offset(vec4_result, bld,
>> >     +                          (const_offset & 0xf) /
>> >     type_sz(vec4_result.type)));
>> >     +   }
>> >
>> >
>> > This seems overly complicated.  How about something like
>>
>> > fs_reg dw = offset(vec4_result, bld, (const_offset & 0xf) / 4);
>> > switch (type_sz(dst.type)) {
>> > case 2:
>> >    shuffle_32bit_load_result_to_16bit_data(bld, dst, dw, 1);
>> >    bld.MOV(dst, subscript(dw, dst.type, (const_offset / 2) & 1));
>> >

Re: [Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler

2017-12-05 Thread Chema Casanova

On 05/12/17 19:53, Jason Ekstrand wrote:
> On Tue, Dec 5, 2017 at 9:08 AM, Chema Casanova  > wrote:
> 
> El 30/11/17 a las 23:58, Jason Ekstrand escribió:
> > On Wed, Nov 29, 2017 at 6:50 PM, Jose Maria Casanova Crespo
> > 
> >> wrote:
> >
> >     load_ubo is using 32-bit loads as uniforms surfaces have a 32-bit
> >     surface format defined. So when reading 16-bit components with the
> >     sampler we need to unshuffle two 16-bit components from each
> 32-bit
> >     component.
> >
> >     Using the sampler avoids the use of the byte_scattered_read
> message
> >     that needs one message for each component and is supposed to be
> >     slower.
> >
> >     In the case of SKL+ we take advantage of a hardware feature that
> >     automatically defines a channel mask based on the rlen value,
> so on
> >     SKL+ we only use half of the registers without using a header
> in the
> >     payload.
> >     ---
> >      src/intel/compiler/brw_fs.cpp           | 31
> >     +++
> >      src/intel/compiler/brw_fs_generator.cpp | 10 --
> >      2 files changed, 35 insertions(+), 6 deletions(-)
> >
> >     diff --git a/src/intel/compiler/brw_fs.cpp
> >     b/src/intel/compiler/brw_fs.cpp
> >     index 1ca4d416b2..9c543496ba 100644
> >     --- a/src/intel/compiler/brw_fs.cpp
> >     +++ b/src/intel/compiler/brw_fs.cpp
> >     @@ -184,9 +184,17 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const
> >     fs_builder ,
> >          * a double this means we are only loading 2 elements worth of
> >     data.
> >          * We also want to use a 32-bit data type for the dst of the
> >     load operation
> >          * so other parts of the driver don't get confused about the
> >     size of the
> >     -    * result.
> >     +    * result. On the case of 16-bit data we only need half of the
> >     32-bit
> >     +    * components on SKL+ as we take advance of using message
> >     return size to
> >     +    * define an xy channel mask.
> >          */
> >     -   fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
> >     +   fs_reg vec4_result;
> >     +   if (type_sz(dst.type) == 2 && (devinfo->gen >= 9)) {
> >     +      vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
> >     +      vec4_result = retype(vec4_result, BRW_REGISTER_TYPE_HF);
> >     +   } else {
> >     +      vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
> >     +   }
> >
> >         fs_inst *inst =
> >     bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
> >                                  vec4_result, surf_index,
> vec4_offset);
> >         inst->size_written = 4 *
> >     vec4_result.component_size(inst->exec_size);
> >     @@ -197,8 +205,23 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const
> >     fs_builder ,
> >         }
> >
> >         vec4_result.type = dst.type;
> >     -   bld.MOV(dst, offset(vec4_result, bld,
> >     -                       (const_offset & 0xf) /
> >     type_sz(vec4_result.type)));
> >     +
> >     +   if (type_sz(dst.type) == 2) {
> >     +      /* 16-bit types need to be unshuffled as each pair of
> >     16-bit components
> >     +       * is packed on a 32-bit compoment because we are using a
> >     32-bit format
> >     +       * in the surface of uniform that is read by the sampler.
> >     +       * TODO: On BDW+ mark when an uniform has 16-bit type so we
> >     could setup a
> >     +       * surface format of 16-bit and use the 16-bit return
> >     format at the
> >     +       * sampler.
> >     +       */
> >     +      vec4_result.stride = 2;
> >     +      bld.MOV(dst, byte_offset(offset(vec4_result, bld,
> >     +                                      (const_offset & 0x7) / 4),
> >     +                               (const_offset & 0x7) / 2 % 2 *
> 2));
> >     +   } else {
> >     +      bld.MOV(dst, offset(vec4_result, bld,
> >     +                          (const_offset & 0xf) /
> >     type_sz(vec4_result.type)));
> >     +   }
> >
> >
> > This seems overly complicated.  How about something like
> 
> > fs_reg dw = offset(vec4_result, bld, (const_offset & 0xf) / 4);
> > switch (type_sz(dst.type)) {
> > case 2:
> >    shuffle_32bit_load_result_to_16bit_data(bld, dst, dw, 1);
> >    bld.MOV(dst, subscript(dw, dst.type, (const_offset / 2) & 1));
> >    break;
> > case 4:
> >    bld.MOV(dst, dw);
> >    break;
> > case 8:
> >    shuffle_32bit_load_result_to_64bit_data(bld, dst,

Re: [Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler

2017-12-05 Thread Jason Ekstrand

On Tue, Dec 5, 2017 at 9:08 AM, Chema Casanova 
wrote:

> El 30/11/17 a las 23:58, Jason Ekstrand escribió:
> > On Wed, Nov 29, 2017 at 6:50 PM, Jose Maria Casanova Crespo
> > > wrote:
> >
> > load_ubo is using 32-bit loads as uniforms surfaces have a 32-bit
> > surface format defined. So when reading 16-bit components with the
> > sampler we need to unshuffle two 16-bit components from each 32-bit
> > component.
> >
> > Using the sampler avoids the use of the byte_scattered_read message
> > that needs one message for each component and is supposed to be
> > slower.
> >
> > In the case of SKL+ we take advantage of a hardware feature that
> > automatically defines a channel mask based on the rlen value, so on
> > SKL+ we only use half of the registers without using a header in the
> > payload.
> > ---
> >  src/intel/compiler/brw_fs.cpp   | 31
> > +++
> >  src/intel/compiler/brw_fs_generator.cpp | 10 --
> >  2 files changed, 35 insertions(+), 6 deletions(-)
> >
> > diff --git a/src/intel/compiler/brw_fs.cpp
> > b/src/intel/compiler/brw_fs.cpp
> > index 1ca4d416b2..9c543496ba 100644
> > --- a/src/intel/compiler/brw_fs.cpp
> > +++ b/src/intel/compiler/brw_fs.cpp
> > @@ -184,9 +184,17 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const
> > fs_builder ,
> >  * a double this means we are only loading 2 elements worth of
> > data.
> >  * We also want to use a 32-bit data type for the dst of the
> > load operation
> >  * so other parts of the driver don't get confused about the
> > size of the
> > -* result.
> > +* result. On the case of 16-bit data we only need half of the
> > 32-bit
> > +* components on SKL+ as we take advance of using message
> > return size to
> > +* define an xy channel mask.
> >  */
> > -   fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
> > +   fs_reg vec4_result;
> > +   if (type_sz(dst.type) == 2 && (devinfo->gen >= 9)) {
> > +  vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
> > +  vec4_result = retype(vec4_result, BRW_REGISTER_TYPE_HF);
> > +   } else {
> > +  vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
> > +   }
> >
> > fs_inst *inst =
> > bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
> >  vec4_result, surf_index, vec4_offset);
> > inst->size_written = 4 *
> > vec4_result.component_size(inst->exec_size);
> > @@ -197,8 +205,23 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const
> > fs_builder ,
> > }
> >
> > vec4_result.type = dst.type;
> > -   bld.MOV(dst, offset(vec4_result, bld,
> > -   (const_offset & 0xf) /
> > type_sz(vec4_result.type)));
> > +
> > +   if (type_sz(dst.type) == 2) {
> > +  /* 16-bit types need to be unshuffled as each pair of
> > 16-bit components
> > +   * is packed on a 32-bit compoment because we are using a
> > 32-bit format
> > +   * in the surface of uniform that is read by the sampler.
> > +   * TODO: On BDW+ mark when an uniform has 16-bit type so we
> > could setup a
> > +   * surface format of 16-bit and use the 16-bit return
> > format at the
> > +   * sampler.
> > +   */
> > +  vec4_result.stride = 2;
> > +  bld.MOV(dst, byte_offset(offset(vec4_result, bld,
> > +  (const_offset & 0x7) / 4),
> > +   (const_offset & 0x7) / 2 % 2 * 2));
> > +   } else {
> > +  bld.MOV(dst, offset(vec4_result, bld,
> > +  (const_offset & 0xf) /
> > type_sz(vec4_result.type)));
> > +   }
> >
> >
> > This seems overly complicated.  How about something like
>
> > fs_reg dw = offset(vec4_result, bld, (const_offset & 0xf) / 4);
> > switch (type_sz(dst.type)) {
> > case 2:
> >shuffle_32bit_load_result_to_16bit_data(bld, dst, dw, 1);
> >bld.MOV(dst, subscript(dw, dst.type, (const_offset / 2) & 1));
> >break;
> > case 4:
> >bld.MOV(dst, dw);
> >break;
> > case 8:
> >shuffle_32bit_load_result_to_64bit_data(bld, dst, dw, 1);
> >break;
> > default:
> >unreachable();
> > }
>
> This implementation it is really more clear. Tested and works perfectly
> fine.
>
> >
> >
> >  }
> >
> >  /**
> > diff --git a/src/intel/compiler/brw_fs_generator.cpp
> > b/src/intel/compiler/brw_fs_generator.cpp
> > index a3861cd68e..00a4e29147 100644
> > --- a/src/intel/compiler/brw_fs_generator.cpp
> > +++ b/src/intel/compiler/brw_fs_generator.cpp
> > @@ -1381,12 +1381,18 @@
> > fs_generator::generate_varying_pull_constant_load_gen7(fs_inst
> *inst,
> > uint32_t

Re: [Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler

2017-12-05 Thread Chema Casanova

El 30/11/17 a las 23:58, Jason Ekstrand escribió:
> On Wed, Nov 29, 2017 at 6:50 PM, Jose Maria Casanova Crespo
> > wrote:
>
> load_ubo is using 32-bit loads as uniforms surfaces have a 32-bit
> surface format defined. So when reading 16-bit components with the
> sampler we need to unshuffle two 16-bit components from each 32-bit
> component.
>
> Using the sampler avoids the use of the byte_scattered_read message
> that needs one message for each component and is supposed to be
> slower.
>
> In the case of SKL+ we take advantage of a hardware feature that
> automatically defines a channel mask based on the rlen value, so on
> SKL+ we only use half of the registers without using a header in the
> payload.
> ---
>  src/intel/compiler/brw_fs.cpp           | 31
> +++
>  src/intel/compiler/brw_fs_generator.cpp | 10 --
>  2 files changed, 35 insertions(+), 6 deletions(-)
>
> diff --git a/src/intel/compiler/brw_fs.cpp
> b/src/intel/compiler/brw_fs.cpp
> index 1ca4d416b2..9c543496ba 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -184,9 +184,17 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const
> fs_builder ,
>      * a double this means we are only loading 2 elements worth of
> data.
>      * We also want to use a 32-bit data type for the dst of the
> load operation
>      * so other parts of the driver don't get confused about the
> size of the
> -    * result.
> +    * result. On the case of 16-bit data we only need half of the
> 32-bit
> +    * components on SKL+ as we take advance of using message
> return size to
> +    * define an xy channel mask.
>      */
> -   fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
> +   fs_reg vec4_result;
> +   if (type_sz(dst.type) == 2 && (devinfo->gen >= 9)) {
> +      vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
> +      vec4_result = retype(vec4_result, BRW_REGISTER_TYPE_HF);
> +   } else {
> +      vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
> +   }
>
>     fs_inst *inst =
> bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
>                              vec4_result, surf_index, vec4_offset);
>     inst->size_written = 4 *
> vec4_result.component_size(inst->exec_size);
> @@ -197,8 +205,23 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const
> fs_builder ,
>     }
>
>     vec4_result.type = dst.type;
> -   bld.MOV(dst, offset(vec4_result, bld,
> -                       (const_offset & 0xf) /
> type_sz(vec4_result.type)));
> +
> +   if (type_sz(dst.type) == 2) {
> +      /* 16-bit types need to be unshuffled as each pair of
> 16-bit components
> +       * is packed on a 32-bit compoment because we are using a
> 32-bit format
> +       * in the surface of uniform that is read by the sampler.
> +       * TODO: On BDW+ mark when an uniform has 16-bit type so we
> could setup a
> +       * surface format of 16-bit and use the 16-bit return
> format at the
> +       * sampler.
> +       */
> +      vec4_result.stride = 2;
> +      bld.MOV(dst, byte_offset(offset(vec4_result, bld,
> +                                      (const_offset & 0x7) / 4),
> +                               (const_offset & 0x7) / 2 % 2 * 2));
> +   } else {
> +      bld.MOV(dst, offset(vec4_result, bld,
> +                          (const_offset & 0xf) /
> type_sz(vec4_result.type)));
> +   }
>
>
> This seems overly complicated.  How about something like

> fs_reg dw = offset(vec4_result, bld, (const_offset & 0xf) / 4);
> switch (type_sz(dst.type)) {
> case 2:
>    shuffle_32bit_load_result_to_16bit_data(bld, dst, dw, 1);
>    bld.MOV(dst, subscript(dw, dst.type, (const_offset / 2) & 1));
>    break;
> case 4:
>    bld.MOV(dst, dw);
>    break;
> case 8:
>    shuffle_32bit_load_result_to_64bit_data(bld, dst, dw, 1);
>    break;
> default:
>    unreachable();
> }

This implementation it is really more clear. Tested and works perfectly
fine.

>  
>
>  }
>
>  /**
> diff --git a/src/intel/compiler/brw_fs_generator.cpp
> b/src/intel/compiler/brw_fs_generator.cpp
> index a3861cd68e..00a4e29147 100644
> --- a/src/intel/compiler/brw_fs_generator.cpp
> +++ b/src/intel/compiler/brw_fs_generator.cpp
> @@ -1381,12 +1381,18 @@
> fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
>     uint32_t simd_mode, rlen, mlen;
>     if (inst->exec_size == 16) {
>        mlen = 2;
> -      rlen = 8;
> +      if (type_sz(dst.type) == 2 && (devinfo->gen >= 9))
> +         rlen = 4;
> +      else
> +         rlen = 8;
>
>
> I'm not sure what I think of this.  We intentionally use a vec4 today
> instead

Re: [Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler

2017-11-30 Thread Jason Ekstrand

On Wed, Nov 29, 2017 at 6:50 PM, Jose Maria Casanova Crespo <
jmcasan...@igalia.com> wrote:

> load_ubo is using 32-bit loads as uniforms surfaces have a 32-bit
> surface format defined. So when reading 16-bit components with the
> sampler we need to unshuffle two 16-bit components from each 32-bit
> component.
>
> Using the sampler avoids the use of the byte_scattered_read message
> that needs one message for each component and is supposed to be
> slower.
>
> In the case of SKL+ we take advantage of a hardware feature that
> automatically defines a channel mask based on the rlen value, so on
> SKL+ we only use half of the registers without using a header in the
> payload.
> ---
>  src/intel/compiler/brw_fs.cpp   | 31
> +++
>  src/intel/compiler/brw_fs_generator.cpp | 10 --
>  2 files changed, 35 insertions(+), 6 deletions(-)
>
> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
> index 1ca4d416b2..9c543496ba 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -184,9 +184,17 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const
> fs_builder ,
>  * a double this means we are only loading 2 elements worth of data.
>  * We also want to use a 32-bit data type for the dst of the load
> operation
>  * so other parts of the driver don't get confused about the size of
> the
> -* result.
> +* result. On the case of 16-bit data we only need half of the 32-bit
> +* components on SKL+ as we take advance of using message return size
> to
> +* define an xy channel mask.
>  */
> -   fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
> +   fs_reg vec4_result;
> +   if (type_sz(dst.type) == 2 && (devinfo->gen >= 9)) {
> +  vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
> +  vec4_result = retype(vec4_result, BRW_REGISTER_TYPE_HF);
> +   } else {
> +  vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
> +   }
>
fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
>  vec4_result, surf_index, vec4_offset);
> inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
> @@ -197,8 +205,23 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const
> fs_builder ,
> }
>
> vec4_result.type = dst.type;
> -   bld.MOV(dst, offset(vec4_result, bld,
> -   (const_offset & 0xf) / type_sz(vec4_result.type)));
> +
> +   if (type_sz(dst.type) == 2) {
> +  /* 16-bit types need to be unshuffled as each pair of 16-bit
> components
> +   * is packed on a 32-bit compoment because we are using a 32-bit
> format
> +   * in the surface of uniform that is read by the sampler.
> +   * TODO: On BDW+ mark when an uniform has 16-bit type so we could
> setup a
> +   * surface format of 16-bit and use the 16-bit return format at the
> +   * sampler.
> +   */
> +  vec4_result.stride = 2;
> +  bld.MOV(dst, byte_offset(offset(vec4_result, bld,
> +  (const_offset & 0x7) / 4),
> +   (const_offset & 0x7) / 2 % 2 * 2));
> +   } else {
> +  bld.MOV(dst, offset(vec4_result, bld,
> +  (const_offset & 0xf) /
> type_sz(vec4_result.type)));
> +   }
>

This seems overly complicated.  How about something like

fs_reg dw = offset(vec4_result, bld, (const_offset & 0xf) / 4);
switch (type_sz(dst.type)) {
case 2:
   shuffle_32bit_load_result_to_16bit_data(bld, dst, dw, 1);
   bld.MOV(dst, subscript(dw, dst.type, (const_offset / 2) & 1));
   break;
case 4:
   bld.MOV(dst, dw);
   break;
case 8:
   shuffle_32bit_load_result_to_64bit_data(bld, dst, dw, 1);
   break;
default:
   unreachable();
}


>  }
>
>  /**
> diff --git a/src/intel/compiler/brw_fs_generator.cpp
> b/src/intel/compiler/brw_fs_generator.cpp
> index a3861cd68e..00a4e29147 100644
> --- a/src/intel/compiler/brw_fs_generator.cpp
> +++ b/src/intel/compiler/brw_fs_generator.cpp
> @@ -1381,12 +1381,18 @@ fs_generator::generate_varying
> _pull_constant_load_gen7(fs_inst *inst,
> uint32_t simd_mode, rlen, mlen;
> if (inst->exec_size == 16) {
>mlen = 2;
> -  rlen = 8;
> +  if (type_sz(dst.type) == 2 && (devinfo->gen >= 9))
> + rlen = 4;
> +  else
> + rlen = 8;
>

I'm not sure what I think of this.  We intentionally use a vec4 today
instead of a single float because it lets us potentially batch up a bunch
of loads in a single texture operation.  Are we sure that we never need
more than 2 of those result registers?

--Jason

   simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
> } else {
>assert(inst->exec_size == 8);
>mlen = 1;
> -  rlen = 4;
> +  if (type_sz(dst.type) == 2 && (devinfo->gen >= 9))
> + rlen = 2;
> +  else
> + rlen = 4;
>simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
> }
>
> --
> 2.14.3
>
> ___
> mesa-dev mailing list
>

[Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler

2017-11-29 Thread Jose Maria Casanova Crespo

load_ubo is using 32-bit loads as uniforms surfaces have a 32-bit
surface format defined. So when reading 16-bit components with the
sampler we need to unshuffle two 16-bit components from each 32-bit
component.

Using the sampler avoids the use of the byte_scattered_read message
that needs one message for each component and is supposed to be
slower.

In the case of SKL+ we take advantage of a hardware feature that
automatically defines a channel mask based on the rlen value, so on
SKL+ we only use half of the registers without using a header in the
payload.
---
 src/intel/compiler/brw_fs.cpp   | 31 +++
 src/intel/compiler/brw_fs_generator.cpp | 10 --
 2 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 1ca4d416b2..9c543496ba 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -184,9 +184,17 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
,
 * a double this means we are only loading 2 elements worth of data.
 * We also want to use a 32-bit data type for the dst of the load operation
 * so other parts of the driver don't get confused about the size of the
-* result.
+* result. On the case of 16-bit data we only need half of the 32-bit
+* components on SKL+ as we take advance of using message return size to
+* define an xy channel mask.
 */
-   fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+   fs_reg vec4_result;
+   if (type_sz(dst.type) == 2 && (devinfo->gen >= 9)) {
+  vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
+  vec4_result = retype(vec4_result, BRW_REGISTER_TYPE_HF);
+   } else {
+  vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+   }
fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
 vec4_result, surf_index, vec4_offset);
inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
@@ -197,8 +205,23 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
,
}
 
vec4_result.type = dst.type;
-   bld.MOV(dst, offset(vec4_result, bld,
-   (const_offset & 0xf) / type_sz(vec4_result.type)));
+
+   if (type_sz(dst.type) == 2) {
+  /* 16-bit types need to be unshuffled as each pair of 16-bit components
+   * is packed on a 32-bit compoment because we are using a 32-bit format
+   * in the surface of uniform that is read by the sampler.
+   * TODO: On BDW+ mark when an uniform has 16-bit type so we could setup a
+   * surface format of 16-bit and use the 16-bit return format at the
+   * sampler.
+   */
+  vec4_result.stride = 2;
+  bld.MOV(dst, byte_offset(offset(vec4_result, bld,
+  (const_offset & 0x7) / 4),
+   (const_offset & 0x7) / 2 % 2 * 2));
+   } else {
+  bld.MOV(dst, offset(vec4_result, bld,
+  (const_offset & 0xf) / type_sz(vec4_result.type)));
+   }
 }
 
 /**
diff --git a/src/intel/compiler/brw_fs_generator.cpp 
b/src/intel/compiler/brw_fs_generator.cpp
index a3861cd68e..00a4e29147 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1381,12 +1381,18 @@ 
fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
uint32_t simd_mode, rlen, mlen;
if (inst->exec_size == 16) {
   mlen = 2;
-  rlen = 8;
+  if (type_sz(dst.type) == 2 && (devinfo->gen >= 9))
+ rlen = 4;
+  else
+ rlen = 8;
   simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
} else {
   assert(inst->exec_size == 8);
   mlen = 1;
-  rlen = 4;
+  if (type_sz(dst.type) == 2 && (devinfo->gen >= 9))
+ rlen = 2;
+  else
+ rlen = 4;
   simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
}
 
-- 
2.14.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler (v2)

[Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler (v2)

Re: [Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler

Re: [Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler

Re: [Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler

Re: [Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler

Re: [Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler

[Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler

8 matches

Site Navigation

Mail list logo

Footer information