Re: [Mesa-dev] [PATCH v4 28/44] i965/fs: Use untyped_surface_read for 16-bit load_ssbo (v2)
On 05/12/17 23:47, Jason Ekstrand wrote: > On Tue, Dec 5, 2017 at 1:36 PM, Jose Maria Casanova Crespo >> wrote: > > SSBO loads were using byte_scattered read messages as they allow > reading 16-bit size components. byte_scattered messages can only > operate one component at a time so we needed to emit as many messages > as components. > > But for vec2 and vec4 of 16-bit, being multiple of 32-bit we can use the > untyped_surface_read message to read pairs of 16-bit components using > only one message. Once each pair is read it is unshuffled to return the > proper 16-bit components. vec3 case is assimilated to vec4 but the 4th > component is ignored. > > 16-bit scalars are read using one byte_scattered_read message. > > v2: Removed use of stride = 2 on sources (Jason Ekstrand) > Rework optimization using unshuffle 16 reads (Chema Casanova) > v3: Use W and D types insead of HF and F in shuffle to avoid rounding > erros (Jason Ekstrand) > Use untyped_surface_read for 16-bit vec3. (Jason Ekstrand) > > CC: Jason Ekstrand > > --- > src/intel/compiler/brw_fs_nir.cpp | 29 ++--- > 1 file changed, 22 insertions(+), 7 deletions(-) > > diff --git a/src/intel/compiler/brw_fs_nir.cpp > b/src/intel/compiler/brw_fs_nir.cpp > index e11e75e6332..8deec082d59 100644 > --- a/src/intel/compiler/brw_fs_nir.cpp > +++ b/src/intel/compiler/brw_fs_nir.cpp > @@ -2303,16 +2303,31 @@ do_untyped_vector_read(const fs_builder , > unsigned num_components) > { > if (type_sz(dest.type) <= 2) { > - fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); > - bld.MOV(read_offset, offset_reg); > - for (unsigned i = 0; i < num_components; i++) { > - fs_reg read_reg = > - emit_byte_scattered_read(bld, surf_index, read_offset, > + assert(dest.stride == 1); > + > + if (num_components > 1) { > + /* Pairs of 16-bit components can be read with untyped > read, for 16-bit > + * vec3 4th component is ignored. > + */ > + fs_reg read_result = > + emit_untyped_read(bld, surf_index, offset_reg, > + 1 /* dims */, > DIV_ROUND_UP(num_components, 2), > + BRW_PREDICATE_NONE); > + shuffle_32bit_load_result_to_16bit_data(bld, > + retype(dest, BRW_REGISTER_TYPE_W), > + retype(read_result, BRW_REGISTER_TYPE_D), > + num_components); > + } else { > + assert(num_components == 1); > + /* scalar 16-bit are read using one byte_scattered_read > message */ > + fs_reg read_result = > + emit_byte_scattered_read(bld, surf_index, offset_reg, > 1 /* dims */, 1, > type_sz(dest.type) * 8 /* > bit_size */, > BRW_PREDICATE_NONE); > - bld.MOV(offset(dest, bld, i), subscript(read_reg, > dest.type, 0)); > - bld.ADD(read_offset, read_offset, > brw_imm_ud(type_sz(dest.type))); > + read_result.type = dest.type; > + read_result.stride = 2; > + bld.MOV(dest, read_result); > > > If read_reg has a 32-bit type, you could use subscript here. Meh. Fixed locally. > Reviewed-by: Jason Ekstrand > > } > } else if (type_sz(dest.type) == 4) { > fs_reg read_result = emit_untyped_read(bld, surf_index, > offset_reg, > -- > 2.11.0 > > ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH v4 28/44] i965/fs: Use untyped_surface_read for 16-bit load_ssbo (v2)
On Tue, Dec 5, 2017 at 1:36 PM, Jose Maria Casanova Crespo < jmcasan...@igalia.com> wrote: > SSBO loads were using byte_scattered read messages as they allow > reading 16-bit size components. byte_scattered messages can only > operate one component at a time so we needed to emit as many messages > as components. > > But for vec2 and vec4 of 16-bit, being multiple of 32-bit we can use the > untyped_surface_read message to read pairs of 16-bit components using > only one message. Once each pair is read it is unshuffled to return the > proper 16-bit components. vec3 case is assimilated to vec4 but the 4th > component is ignored. > > 16-bit scalars are read using one byte_scattered_read message. > > v2: Removed use of stride = 2 on sources (Jason Ekstrand) > Rework optimization using unshuffle 16 reads (Chema Casanova) > v3: Use W and D types insead of HF and F in shuffle to avoid rounding > erros (Jason Ekstrand) > Use untyped_surface_read for 16-bit vec3. (Jason Ekstrand) > > CC: Jason Ekstrand> --- > src/intel/compiler/brw_fs_nir.cpp | 29 ++--- > 1 file changed, 22 insertions(+), 7 deletions(-) > > diff --git a/src/intel/compiler/brw_fs_nir.cpp > b/src/intel/compiler/brw_fs_nir.cpp > index e11e75e6332..8deec082d59 100644 > --- a/src/intel/compiler/brw_fs_nir.cpp > +++ b/src/intel/compiler/brw_fs_nir.cpp > @@ -2303,16 +2303,31 @@ do_untyped_vector_read(const fs_builder , > unsigned num_components) > { > if (type_sz(dest.type) <= 2) { > - fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); > - bld.MOV(read_offset, offset_reg); > - for (unsigned i = 0; i < num_components; i++) { > - fs_reg read_reg = > -emit_byte_scattered_read(bld, surf_index, read_offset, > + assert(dest.stride == 1); > + > + if (num_components > 1) { > + /* Pairs of 16-bit components can be read with untyped read, for > 16-bit > + * vec3 4th component is ignored. > + */ > + fs_reg read_result = > +emit_untyped_read(bld, surf_index, offset_reg, > + 1 /* dims */, DIV_ROUND_UP(num_components, > 2), > + BRW_PREDICATE_NONE); > + shuffle_32bit_load_result_to_16bit_data(bld, > + retype(dest, BRW_REGISTER_TYPE_W), > + retype(read_result, BRW_REGISTER_TYPE_D), > + num_components); > + } else { > + assert(num_components == 1); > + /* scalar 16-bit are read using one byte_scattered_read message > */ > + fs_reg read_result = > +emit_byte_scattered_read(bld, surf_index, offset_reg, > 1 /* dims */, 1, > type_sz(dest.type) * 8 /* bit_size > */, > BRW_PREDICATE_NONE); > - bld.MOV(offset(dest, bld, i), subscript(read_reg, dest.type, 0)); > - bld.ADD(read_offset, read_offset, brw_imm_ud(type_sz(dest.type)) > ); > + read_result.type = dest.type; > + read_result.stride = 2; > + bld.MOV(dest, read_result); > If read_reg has a 32-bit type, you could use subscript here. Meh. Reviewed-by: Jason Ekstrand >} > } else if (type_sz(dest.type) == 4) { >fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg, > -- > 2.11.0 > > ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 28/44] i965/fs: Use untyped_surface_read for 16-bit load_ssbo (v2)
SSBO loads were using byte_scattered read messages as they allow reading 16-bit size components. byte_scattered messages can only operate one component at a time so we needed to emit as many messages as components. But for vec2 and vec4 of 16-bit, being multiple of 32-bit we can use the untyped_surface_read message to read pairs of 16-bit components using only one message. Once each pair is read it is unshuffled to return the proper 16-bit components. vec3 case is assimilated to vec4 but the 4th component is ignored. 16-bit scalars are read using one byte_scattered_read message. v2: Removed use of stride = 2 on sources (Jason Ekstrand) Rework optimization using unshuffle 16 reads (Chema Casanova) v3: Use W and D types insead of HF and F in shuffle to avoid rounding erros (Jason Ekstrand) Use untyped_surface_read for 16-bit vec3. (Jason Ekstrand) CC: Jason Ekstrand--- src/intel/compiler/brw_fs_nir.cpp | 29 ++--- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index e11e75e6332..8deec082d59 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -2303,16 +2303,31 @@ do_untyped_vector_read(const fs_builder , unsigned num_components) { if (type_sz(dest.type) <= 2) { - fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); - bld.MOV(read_offset, offset_reg); - for (unsigned i = 0; i < num_components; i++) { - fs_reg read_reg = -emit_byte_scattered_read(bld, surf_index, read_offset, + assert(dest.stride == 1); + + if (num_components > 1) { + /* Pairs of 16-bit components can be read with untyped read, for 16-bit + * vec3 4th component is ignored. + */ + fs_reg read_result = +emit_untyped_read(bld, surf_index, offset_reg, + 1 /* dims */, DIV_ROUND_UP(num_components, 2), + BRW_PREDICATE_NONE); + shuffle_32bit_load_result_to_16bit_data(bld, + retype(dest, BRW_REGISTER_TYPE_W), + retype(read_result, BRW_REGISTER_TYPE_D), + num_components); + } else { + assert(num_components == 1); + /* scalar 16-bit are read using one byte_scattered_read message */ + fs_reg read_result = +emit_byte_scattered_read(bld, surf_index, offset_reg, 1 /* dims */, 1, type_sz(dest.type) * 8 /* bit_size */, BRW_PREDICATE_NONE); - bld.MOV(offset(dest, bld, i), subscript(read_reg, dest.type, 0)); - bld.ADD(read_offset, read_offset, brw_imm_ud(type_sz(dest.type))); + read_result.type = dest.type; + read_result.stride = 2; + bld.MOV(dest, read_result); } } else if (type_sz(dest.type) == 4) { fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg, -- 2.11.0 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH v4 28/44] i965/fs: Use untyped_surface_read for 16-bit load_ssbo
On 01/12/17 11:49, Jason Ekstrand wrote: > On Wed, Nov 29, 2017 at 6:57 PM, Jose Maria Casanova Crespo >> wrote: > > SSBO loads were using byte_scattered read messages as they allow > reading 16-bit size components. byte_scattered messages can only > operate one component at a time so we needed to emit as many messages > as components. > > But for vec2 and vec4 of 16-bit, being multiple of 32-bit we can use the > untyped_surface_read message to read pairs of 16-bit components > using only > one message. Once each pair is read it is unshuffled to return the > proper > 16-bit components. > > On 16-bit scalar and vec3 16-bit the not paired component is read using > only one byte_scattered_read message. > > > My gut tells me that, for vec3's, we'd be better off with a single > untyped read than one untyped read and one byte scattered read. Also, > are there alignment issues with untyped surface reads/writes that might > cause us problems on vec3's? I don't know what the alignment rules are > for 16-bit vec3's in Vulkan. I think that untyped_read will work perfectly fine with vec3 as there are not special rules for 16-bits. The only thing would be that we would writing always the unused 4th component, so we decided to play save and just modify what was expected and only scattered write allowed that with that approach: "* A three- or four-component vector, with components of size N, has a base alignment of 4 N." I was trying for this V4 of the series, to use untyped_surface_read for all the cases, but I focused on scalar ones, without success. But for vec3 it should be easy to do if we can assume to write random data at the 4th component. > > > v2: Removed use of stride = 2 on sources (Jason Ekstrand) > Rework optimization using unshuffle 16 reads (Chema Casanova) > --- > src/intel/compiler/brw_fs_nir.cpp | 43 > ++- > 1 file changed, 33 insertions(+), 10 deletions(-) > > diff --git a/src/intel/compiler/brw_fs_nir.cpp > b/src/intel/compiler/brw_fs_nir.cpp > index fa7aa9c247..57e79853ef 100644 > --- a/src/intel/compiler/brw_fs_nir.cpp > +++ b/src/intel/compiler/brw_fs_nir.cpp > @@ -2354,16 +2354,39 @@ do_untyped_vector_read(const fs_builder , > bld.ADD(read_offset, read_offset, brw_imm_ud(16)); > } > } else if (type_sz(dest.type) == 2) { > - fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); > - bld.MOV(read_offset, offset_reg); > - for (unsigned i = 0; i < num_components; i++) { > - fs_reg read_reg = emit_byte_scattered_read(bld, > surf_index, read_offset, > - 1 /* dims */, > - 1, > - 16 /*bit_size */, > - > BRW_PREDICATE_NONE); > - bld.MOV(offset(dest,bld,i), subscript(read_reg, dest.type, > 0)); > - bld.ADD(read_offset, read_offset, > brw_imm_ud(type_sz(dest.type))); > + assert(dest.stride == 1); > + > + int component_pairs = num_components / 2; > + /* Pairs of 16-bit components can be read with untyped read */ > + if (component_pairs > 0) { > + fs_reg read_result = emit_untyped_read(bld, surf_index, > + offset_reg, > + 1 /* dims */, > + component_pairs, > + BRW_PREDICATE_NONE); > + shuffle_32bit_load_result_to_16bit_data(bld, > + retype(dest, BRW_REGISTER_TYPE_HF), > + retype(read_result, BRW_REGISTER_TYPE_F), > > > I'd rather we use W and D rather than HF and F. Rounding errors scare me. Ok. Thanks for the review. Chema > + component_pairs * 2); > + } > + /* Last component of vec3 and scalar 16-bit read needs to be read > + * using one byte_scattered_read message > + */ > + if (num_components % 2) { > + fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); > + bld.ADD(read_offset, > + offset_reg, > + brw_imm_ud((num_components - 1) * > type_sz(dest.type))); > + fs_reg read_result = emit_byte_scattered_read(bld, surf_index, > + read_offset, > + 1 /* dims */, > + 1, > + 16 /* > bit_size */, > +
Re: [Mesa-dev] [PATCH v4 28/44] i965/fs: Use untyped_surface_read for 16-bit load_ssbo
On Wed, Nov 29, 2017 at 6:57 PM, Jose Maria Casanova Crespo < jmcasan...@igalia.com> wrote: > SSBO loads were using byte_scattered read messages as they allow > reading 16-bit size components. byte_scattered messages can only > operate one component at a time so we needed to emit as many messages > as components. > > But for vec2 and vec4 of 16-bit, being multiple of 32-bit we can use the > untyped_surface_read message to read pairs of 16-bit components using only > one message. Once each pair is read it is unshuffled to return the proper > 16-bit components. > > On 16-bit scalar and vec3 16-bit the not paired component is read using > only one byte_scattered_read message. > My gut tells me that, for vec3's, we'd be better off with a single untyped read than one untyped read and one byte scattered read. Also, are there alignment issues with untyped surface reads/writes that might cause us problems on vec3's? I don't know what the alignment rules are for 16-bit vec3's in Vulkan. > v2: Removed use of stride = 2 on sources (Jason Ekstrand) > Rework optimization using unshuffle 16 reads (Chema Casanova) > --- > src/intel/compiler/brw_fs_nir.cpp | 43 ++ > - > 1 file changed, 33 insertions(+), 10 deletions(-) > > diff --git a/src/intel/compiler/brw_fs_nir.cpp > b/src/intel/compiler/brw_fs_nir.cpp > index fa7aa9c247..57e79853ef 100644 > --- a/src/intel/compiler/brw_fs_nir.cpp > +++ b/src/intel/compiler/brw_fs_nir.cpp > @@ -2354,16 +2354,39 @@ do_untyped_vector_read(const fs_builder , > bld.ADD(read_offset, read_offset, brw_imm_ud(16)); >} > } else if (type_sz(dest.type) == 2) { > - fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); > - bld.MOV(read_offset, offset_reg); > - for (unsigned i = 0; i < num_components; i++) { > - fs_reg read_reg = emit_byte_scattered_read(bld, surf_index, > read_offset, > -1 /* dims */, > -1, > -16 /*bit_size */, > -BRW_PREDICATE_NONE); > - bld.MOV(offset(dest,bld,i), subscript(read_reg, dest.type, 0)); > - bld.ADD(read_offset, read_offset, brw_imm_ud(type_sz(dest.type)) > ); > + assert(dest.stride == 1); > + > + int component_pairs = num_components / 2; > + /* Pairs of 16-bit components can be read with untyped read */ > + if (component_pairs > 0) { > + fs_reg read_result = emit_untyped_read(bld, surf_index, > +offset_reg, > +1 /* dims */, > +component_pairs, > +BRW_PREDICATE_NONE); > + shuffle_32bit_load_result_to_16bit_data(bld, > + retype(dest, BRW_REGISTER_TYPE_HF), > + retype(read_result, BRW_REGISTER_TYPE_F), > I'd rather we use W and D rather than HF and F. Rounding errors scare me. > + component_pairs * 2); > + } > + /* Last component of vec3 and scalar 16-bit read needs to be read > + * using one byte_scattered_read message > + */ > + if (num_components % 2) { > + fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); > + bld.ADD(read_offset, > + offset_reg, > + brw_imm_ud((num_components - 1) * type_sz(dest.type))); > + fs_reg read_result = emit_byte_scattered_read(bld, surf_index, > + read_offset, > + 1 /* dims */, > + 1, > + 16 /* bit_size */, > + > BRW_PREDICATE_NONE); > + read_result.type = dest.type; > + read_result.stride = 2; > + > + bld.MOV(offset(dest, bld, num_components - 1), read_result); >} > } else { >unreachable("Unsupported type"); > -- > 2.14.3 > > ___ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev > ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 28/44] i965/fs: Use untyped_surface_read for 16-bit load_ssbo
SSBO loads were using byte_scattered read messages as they allow reading 16-bit size components. byte_scattered messages can only operate one component at a time so we needed to emit as many messages as components. But for vec2 and vec4 of 16-bit, being multiple of 32-bit we can use the untyped_surface_read message to read pairs of 16-bit components using only one message. Once each pair is read it is unshuffled to return the proper 16-bit components. On 16-bit scalar and vec3 16-bit the not paired component is read using only one byte_scattered_read message. v2: Removed use of stride = 2 on sources (Jason Ekstrand) Rework optimization using unshuffle 16 reads (Chema Casanova) --- src/intel/compiler/brw_fs_nir.cpp | 43 ++- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index fa7aa9c247..57e79853ef 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -2354,16 +2354,39 @@ do_untyped_vector_read(const fs_builder , bld.ADD(read_offset, read_offset, brw_imm_ud(16)); } } else if (type_sz(dest.type) == 2) { - fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); - bld.MOV(read_offset, offset_reg); - for (unsigned i = 0; i < num_components; i++) { - fs_reg read_reg = emit_byte_scattered_read(bld, surf_index, read_offset, -1 /* dims */, -1, -16 /*bit_size */, -BRW_PREDICATE_NONE); - bld.MOV(offset(dest,bld,i), subscript(read_reg, dest.type, 0)); - bld.ADD(read_offset, read_offset, brw_imm_ud(type_sz(dest.type))); + assert(dest.stride == 1); + + int component_pairs = num_components / 2; + /* Pairs of 16-bit components can be read with untyped read */ + if (component_pairs > 0) { + fs_reg read_result = emit_untyped_read(bld, surf_index, +offset_reg, +1 /* dims */, +component_pairs, +BRW_PREDICATE_NONE); + shuffle_32bit_load_result_to_16bit_data(bld, + retype(dest, BRW_REGISTER_TYPE_HF), + retype(read_result, BRW_REGISTER_TYPE_F), + component_pairs * 2); + } + /* Last component of vec3 and scalar 16-bit read needs to be read + * using one byte_scattered_read message + */ + if (num_components % 2) { + fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.ADD(read_offset, + offset_reg, + brw_imm_ud((num_components - 1) * type_sz(dest.type))); + fs_reg read_result = emit_byte_scattered_read(bld, surf_index, + read_offset, + 1 /* dims */, + 1, + 16 /* bit_size */, + BRW_PREDICATE_NONE); + read_result.type = dest.type; + read_result.stride = 2; + + bld.MOV(offset(dest, bld, num_components - 1), read_result); } } else { unreachable("Unsupported type"); -- 2.14.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev