Iago Toral <ito...@igalia.com> writes: > On Tue, 2016-05-10 at 19:10 -0700, Francisco Jerez wrote: >> Samuel Iglesias Gonsálvez <sigles...@igalia.com> writes: >> >> > From: Iago Toral Quiroga <ito...@igalia.com> >> > >> > There are a few places where we need to shuffle the result of a 32-bit load >> > into valid 64-bit data, so extract this logic into a separate helper that >> > we >> > can reuse. >> > >> > Also, the shuffling needs to operate with WE_all set, which we were missing >> > before, because we are changing the layout of the data across the various >> > channels. Otherwise we will run into problems in non-uniform control-flow >> > scenarios. >> > --- >> > src/mesa/drivers/dri/i965/brw_fs.cpp | 95 >> > +++++++++++++++++++++----------- >> > src/mesa/drivers/dri/i965/brw_fs.h | 5 ++ >> > src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++-------------- >> > 3 files changed, 73 insertions(+), 73 deletions(-) >> > >> > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp >> > b/src/mesa/drivers/dri/i965/brw_fs.cpp >> > index dff13ea..709e4b8 100644 >> > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp >> > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp >> > @@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const >> > fs_builder &bld, >> > >> > vec4_result.type = dst.type; >> > >> > - /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. >> > If we >> > - * are reading doubles this means that we get this: >> > - * >> > - * r0: x0 x0 x0 x0 x0 x0 x0 x0 >> > - * r1: x1 x1 x1 x1 x1 x1 x1 x1 >> > - * r2: y0 y0 y0 y0 y0 y0 y0 y0 >> > - * r3: y1 y1 y1 y1 y1 y1 y1 y1 >> > - * >> > - * Fix this up so we return valid double elements: >> > - * >> > - * r0: x0 x1 x0 x1 x0 x1 x0 x1 >> > - * r1: x0 x1 x0 x1 x0 x1 x0 x1 >> > - * r2: y0 y1 y0 y1 y0 y1 y0 y1 >> > - * r3: y0 y1 y0 y1 y0 y1 y0 y1 >> > - */ >> > - if (type_sz(dst.type) == 8) { >> > - int multiplier = bld.dispatch_width() / 8; >> > - fs_reg fixed_res = >> > - fs_reg(VGRF, alloc.allocate(2 * multiplier), >> > BRW_REGISTER_TYPE_F); >> > - /* We only have 2 doubles in a 32-bit vec4 */ >> > - for (int i = 0; i < 2; i++) { >> > - fs_reg vec4_float = >> > - horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F), >> > - multiplier * 16 * i); >> > - >> > - bld.MOV(stride(fixed_res, 2), vec4_float); >> > - bld.MOV(stride(horiz_offset(fixed_res, 1), 2), >> > - horiz_offset(vec4_float, 8 * multiplier)); >> > - >> > - bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i), >> > - retype(fixed_res, BRW_REGISTER_TYPE_DF)); >> > - } >> > - } >> > + if (type_sz(dst.type) == 8) >> > + SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, >> > vec4_result, 2); >> > >> > int type_slots = MAX2(type_sz(dst.type) / 4, 1); >> > bld.MOV(dst, offset(vec4_result, bld, >> > @@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const >> > fs_builder &bld, >> > } >> > >> > /** >> > + * This helper takes the result of a load operation that reads 32-bit >> > elements >> > + * in this format: >> > + * >> > + * x x x x x x x x >> > + * y y y y y y y y >> > + * z z z z z z z z >> > + * w w w w w w w w >> > + * >> > + * and shuffles the data to get this: >> > + * >> > + * x y x y x y x y >> > + * x y x y x y x y >> > + * z w z w z w z w >> > + * z w z w z w z w >> > + * >> > + * Which is exactly what we want if the load is reading 64-bit components >> > + * like doubles, where x represents the low 32-bit of the x double >> > component >> > + * and y represents the high 32-bit of the x double component (likewise >> > with >> > + * z and w for double component y). The parameter @components represents >> > + * the number of 64-bit components present in @src. This would typically >> > be >> > + * 2 at most, since we can only fit 2 double elements in the result of a >> > + * vec4 load. >> > + * >> > + * Notice that @dst and @src can be the same register. >> > + */ >> > +void >> > +fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder &bld, >> >> I don't see any reason to make this an fs_visitor method. Declare this >> as a static function local to brw_fs_nir.cpp what should improve >> encapsulation and reduce the amount of boilerplate. Also please don't >> write it in capitals unless you want people to shout the name of your >> function while discussing out loud about it. ;) >> >> > + const fs_reg dst, >> > + const fs_reg src, >> > + uint32_t components) >> > +{ >> > + int multiplier = bld.dispatch_width() / 8; >> >> This definition is redundant with the changes below taken into account. >> >> > + >> > + /* A temporary that we will use to shuffle the 32-bit data of each >> > + * component in the vector into valid 64-bit data >> > + */ >> > + fs_reg tmp = >> > + fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F); >> >> I don't think there is any reason to do this inside a temporary instead >> of writing into the destination register directly. > > src and dst can be the same register, which is in fact how we will > usually call this since we just want to take whatever we have loaded and > shuffle it in-place. With that in mind: > Ah! That sounds like a valid reason to do the work into a temporary, but because the temporary is going to contain packed 64-bit packed data in the same format as the destination you should probably allocate it like:
| /* We calculate the result into a temporary first in order to allow | * source and destination to overlap. | */ | const fs_reg tmp = bld.vgrf(dst.type); >> > + >> > + /* We are going to manipulate the data in elements of 32-bit */ >> > + fs_reg src_data = retype(src, BRW_REGISTER_TYPE_F); >> > + >> > + /* We are going to manipulate the dst in elements of 64-bit */ >> > + fs_reg dst_data = retype(dst, BRW_REGISTER_TYPE_DF); >> > + >> >> I would turn these into assertions on the register types rather than >> overriding the type passed by the caller -- If the caller didn't intend >> 'dst' to be a vector of 64bit element type or if it didn't intend 'src' >> to be a vector of 32bit type this will simply ignore what the caller's >> intention and emit incorrect code. We should probably kill the program >> with an assertion failure instead so the problem doesn't go unnoticed. >> >> > + /* Shuffle the data */ >> > + for (unsigned i = 0; i < components; i++) { >> > + fs_reg component_i = horiz_offset(src_data, multiplier * 16 * i); >> >> Mark as 'const'. And horiz_offset() isn't really meant to step out of >> the given SIMD-wide vector, it will work or not depending on what the >> stride value is. You should be using offset(src_data, bld, 2 * i) >> instead which takes the SIMD width into account correctly for you. >> >> > + >> > + bld.MOV(stride(tmp, 2), component_i)->force_writemask_all = true; > > this writes two registers so if src == dst, the src data for the > MOV below would have been overwritten by this. > >> Looks like you still need to update this to use subscript(). The >> force_writemask_all flag shouldn't be necessary because you won't be >> crossing channel boundaries at all. >> >> > + bld.MOV(stride(horiz_offset(tmp, 1), 2), >> > + horiz_offset(component_i, 8 * multiplier)) >> > + ->force_writemask_all = true; >> > + >> >> Use 'subscript(...)' instead of 'stride(...)', 'offset(component_i, bld, >> 1)' instead of 'horiz_offset(...)', and drop the force_writemask_all. >> >> >> > + bld.MOV(horiz_offset(dst_data, multiplier * 8 * i), >> > + retype(tmp, BRW_REGISTER_TYPE_DF))->force_writemask_all = >> > true; >> >> Use 'offset(dst, bld, i)' instead of 'horiz_offset()' to index the >> destination, and fold it into the instructions above (or declare a >> temporary as you did for component_i). >> >> I notice that the comments above amount to an almost full rewrite of the >> patch -- Would you mind sending a v2 before I give my R-b? >> >> Thanks. >> >> > + } >> > +} >> > + >> > +/** >> > * A helper for MOV generation for fixing up broken hardware SEND >> > dependency >> > * handling. >> > */ >> > diff --git a/src/mesa/drivers/dri/i965/brw_fs.h >> > b/src/mesa/drivers/dri/i965/brw_fs.h >> > index 1f18112..c95b30a 100644 >> > --- a/src/mesa/drivers/dri/i965/brw_fs.h >> > +++ b/src/mesa/drivers/dri/i965/brw_fs.h >> > @@ -106,6 +106,11 @@ public: >> > uint32_t const_offset); >> > void DEP_RESOLVE_MOV(const brw::fs_builder &bld, int grf); >> > >> > + void SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const brw::fs_builder >> > &bld, >> > + const fs_reg dst, >> > + const fs_reg src, >> > + uint32_t components); >> > + >> > bool run_fs(bool do_rep_send); >> > bool run_vs(gl_clip_plane *clip_planes); >> > bool run_tes(); >> > diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp >> > b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp >> > index 5a12d63..9cd751b 100644 >> > --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp >> > +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp >> > @@ -3082,44 +3082,19 @@ fs_visitor::nir_emit_intrinsic(const fs_builder >> > &bld, nir_intrinsic_instr *instr >> > for (int i = 0; i < instr->num_components; i++) >> > bld.MOV(offset(dest, bld, i), offset(read_result, bld, i)); >> > } else { >> > - /* Reading a dvec, then the untyped read below gives us this: >> > + /* Reading a dvec, so we need to: >> > * >> > - * x0 x0 x0 x0 x0 x0 x0 x0 >> > - * x1 x1 x1 x1 x1 x1 x1 x1 >> > - * y0 y0 y0 y0 y0 y0 y0 y0 >> > - * y1 y1 y1 y1 y1 y1 y1 y1 >> > - * >> > - * But that is not valid 64-bit data, instead we want: >> > - * >> > - * x0 x1 x0 x1 x0 x1 x0 x1 >> > - * x0 x1 x0 x1 x0 x1 x0 x1 >> > - * y0 y1 y0 y1 y0 y1 y0 y1 >> > - * y0 y1 y0 y1 y0 y1 y0 y1 >> > - * >> > - * Also, that would only load half of a dvec4. So, we have to: >> > - * >> > - * 1. Multiply num_components by 2, to account for the fact that >> > we need >> > - * to read 64-bit components. >> > + * 1. Multiply num_components by 2, to account for the fact that >> > we >> > + * need to read 64-bit components. >> > * 2. Shuffle the result of the load to form valid 64-bit >> > elements >> > * 3. Emit a second load (for components z/w) if needed. >> > - * >> > - * FIXME: extract the shuffling logic and share it with >> > - * varying_pull_constant_load >> > */ >> > - int multiplier = bld.dispatch_width() / 8; >> > - >> > fs_reg read_offset = vgrf(glsl_type::uint_type); >> > bld.MOV(read_offset, offset_reg); >> > >> > int num_components = instr->num_components; >> > int iters = num_components <= 2 ? 1 : 2; >> > >> > - /* A temporary that we will use to shuffle the 32-bit data of >> > each >> > - * component in the vector into valid 64-bit data >> > - */ >> > - fs_reg tmp = >> > - fs_reg(VGRF, alloc.allocate(2 * multiplier), >> > BRW_REGISTER_TYPE_F); >> > - >> > /* Load the dvec, the first iteration loads components x/y, the >> > second >> > * iteration, if needed, loads components z/w >> > */ >> > @@ -3138,18 +3113,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder >> > &bld, nir_intrinsic_instr *instr >> > read_result.type = BRW_REGISTER_TYPE_F; >> > >> > /* Shuffle the 32-bit load result into valid 64-bit data */ >> > - int multiplier = bld.dispatch_width() / 8; >> > - for (int i = 0; i < iter_components; i++) { >> > - fs_reg component_i = >> > - horiz_offset(read_result, multiplier * 16 * i); >> > - >> > - bld.MOV(stride(tmp, 2), component_i); >> > - bld.MOV(stride(horiz_offset(tmp, 1), 2), >> > - horiz_offset(component_i, 8 * multiplier)); >> > - >> > - bld.MOV(horiz_offset(dest, multiplier * 8 * (i + it * 2)), >> > - retype(tmp, BRW_REGISTER_TYPE_DF)); >> > - } >> > + SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, >> > + offset(dest, bld, it >> > * 2), >> > + read_result, >> > iter_components); >> > >> > bld.ADD(read_offset, read_offset, brw_imm_ud(16)); >> > } >> > -- >> > 2.5.0 >> > >> > _______________________________________________ >> > mesa-dev mailing list >> > mesa-dev@lists.freedesktop.org >> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev >> _______________________________________________ >> mesa-dev mailing list >> mesa-dev@lists.freedesktop.org >> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev