Michael Schellenberger Costa <mschellenbergerco...@googlemail.com> writes:
> Hi Curro, > > Am 04.05.2016 um 06:26 schrieb Francisco Jerez: >> Instead of using the LOAD_PAYLOAD instruction (emitted through the >> emit_transpose() helper that is no longer useful and this commit >> removes) which had to be marked force_writemask_all in some cases, >> emit a series of moves to apply proper channel enable signals to the >> destination. Until now lower_simd_width() had mainly been used to >> lower things that invariably had a basic block-local temporary as >> destination so it didn't seem like a big deal, but I found it to be >> the reason for several Piglit regressions in my SIMD32 branch and >> Igalia discovered the same issue independently while working on FP64 >> support. >> --- >> This is taken from the following WIP series: >> >> https://cgit.freedesktop.org/~currojerez/mesa/log/?h=i965-late-simd-lowering >> >> See also: >> https://lists.freedesktop.org/archives/mesa-dev/2016-May/115596.html >> >> src/mesa/drivers/dri/i965/brw_fs.cpp | 58 >> +++++++++++------------------------- >> 1 file changed, 18 insertions(+), 40 deletions(-) >> >> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp >> b/src/mesa/drivers/dri/i965/brw_fs.cpp >> index 4b6aa67..34599cb 100644 >> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp >> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp >> @@ -4625,29 +4625,6 @@ get_lowered_simd_width(const struct brw_device_info >> *devinfo, >> } >> } >> >> -/** >> - * The \p rows array of registers represents a \p num_rows by \p num_columns >> - * matrix in row-major order, write it in column-major order into the >> register >> - * passed as destination. \p stride gives the separation between matrix >> - * elements in the input in fs_builder::dispatch_width() units. >> - */ >> -static void >> -emit_transpose(const fs_builder &bld, >> - const fs_reg &dst, const fs_reg *rows, >> - unsigned num_rows, unsigned num_columns, unsigned stride) >> -{ >> - fs_reg *const components = new fs_reg[num_rows * num_columns]; >> - >> - for (unsigned i = 0; i < num_columns; ++i) { >> - for (unsigned j = 0; j < num_rows; ++j) >> - components[num_rows * i + j] = offset(rows[j], bld, stride * i); >> - } >> - >> - bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0); >> - >> - delete[] components; >> -} >> - >> bool >> fs_visitor::lower_simd_width() >> { >> @@ -4698,16 +4675,19 @@ fs_visitor::lower_simd_width() >> if (inst->src[j].file != BAD_FILE && >> !is_uniform(inst->src[j])) { >> /* Get the i-th copy_width-wide chunk of the source. */ >> - const fs_reg src = horiz_offset(inst->src[j], copy_width >> * i); >> + const fs_builder cbld = lbld.group(copy_width, 0); >> + const fs_reg src = offset(inst->src[j], cbld, i); >> const unsigned src_size = inst->components_read(j); >> >> - /* Use a trivial transposition to copy one every n >> - * copy_width-wide components of the register into a >> - * temporary passed as source to the lowered instruction. >> + /* Copy one every n copy_width-wide components of the > That first sentence is really unclear. Shouldnt it sound "Copy each > copy_width-wide component of..." > Nope, it really is copying one every n copy_width chunks of the register, any suggestions to make the wording more clear? > Michael >> + * register into a temporary passed as source to the >> lowered >> + * instruction. >> */ >> split_inst.src[j] = lbld.vgrf(inst->src[j].type, >> src_size); >> - emit_transpose(lbld.group(copy_width, 0), >> - split_inst.src[j], &src, 1, src_size, n); >> + >> + for (unsigned k = 0; k < src_size; ++k) >> + cbld.MOV(offset(split_inst.src[j], lbld, k), >> + offset(src, cbld, n * k)); >> } >> } >> >> @@ -4726,20 +4706,18 @@ fs_visitor::lower_simd_width() >> } >> >> if (inst->regs_written) { >> - /* Distance between useful channels in the temporaries, skipping >> - * garbage if the lowered instruction is wider than the >> original. >> - */ >> - const unsigned m = lower_width / copy_width; >> + const fs_builder lbld = ibld.group(lower_width, 0); >> >> /* Interleave the components of the result from the lowered >> - * instructions. We need to set exec_all() when copying more >> than >> - * one half per component, because LOAD_PAYLOAD (in terms of >> which >> - * emit_transpose is implemented) can only use the same channel >> - * enable signals for all of its non-header sources. >> + * instructions. >> */ >> - emit_transpose(ibld.exec_all(inst->exec_size > copy_width) >> - .group(copy_width, 0), >> - inst->dst, dsts, n, dst_size, m); >> + for (unsigned i = 0; i < dst_size; ++i) { >> + for (unsigned j = 0; j < n; ++j) { >> + const fs_builder cbld = ibld.group(copy_width, j); >> + cbld.MOV(offset(inst->dst, cbld, n * i + j), >> + offset(dsts[j], lbld, i)); >> + } >> + } >> } >> >> inst->remove(block);
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev