El 25/08/17 a las 20:09, Francisco Jerez escribió: > Alejandro Piñeiro <apinhe...@igalia.com> writes: > >> Although it is possible to emit them directly as AND/OR on brw_fs_nir, >> having specific opcodes makes it easier to remove duplicate settings >> later. >> >> Signed-off-by: Alejandro Piñeiro <apinhe...@igalia.com> >> Signed-off-by: Jose Maria Casanova Crespo <jmcasan...@igalia.com> >> --- >> src/intel/compiler/brw_eu.h | 3 +++ >> src/intel/compiler/brw_eu_defines.h | 9 +++++++++ >> src/intel/compiler/brw_eu_emit.c | 19 +++++++++++++++++++ >> src/intel/compiler/brw_fs_generator.cpp | 8 ++++++++ >> src/intel/compiler/brw_shader.cpp | 5 +++++ >> 5 files changed, 44 insertions(+) >> >> diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h >> index a3a9c63239d..0a7f8020398 100644 >> --- a/src/intel/compiler/brw_eu.h >> +++ b/src/intel/compiler/brw_eu.h >> @@ -500,6 +500,9 @@ brw_broadcast(struct brw_codegen *p, >> struct brw_reg src, >> struct brw_reg idx); >> >> +void >> +brw_rounding_mode(struct brw_codegen *p, >> + enum brw_rnd_mode mode); >> /*********************************************************************** >> * brw_eu_util.c: >> */ >> diff --git a/src/intel/compiler/brw_eu_defines.h >> b/src/intel/compiler/brw_eu_defines.h >> index 1af835d47ed..50435df2fcf 100644 >> --- a/src/intel/compiler/brw_eu_defines.h >> +++ b/src/intel/compiler/brw_eu_defines.h >> @@ -388,6 +388,9 @@ enum opcode { >> SHADER_OPCODE_TYPED_SURFACE_WRITE, >> SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, >> >> + SHADER_OPCODE_RND_MODE_RTE, >> + SHADER_OPCODE_RND_MODE_RTZ, >> + > We don't need an opcode for each possible rounding mode (there's also RU > and RD). How about you add a single SHADER_OPCODE_RND_MODE opcode > taking an immediate with the right rounding mode? I like the proposal. It is better having a unique opcode for setting the rounding mode. Changed for v2 of this patch. > Also, you should be marking the rounding mode opcodes as > has_side_effects(), because otherwise you're giving the scheduler the > freedom of moving your rounding mode update instruction past the > instruction you wanted it to have an effect on... Well pointed, we already realized that it was missing while debugging the latency problem of the control register. It was hiding the problem re-scheduling the cr0 modification to the beginning of the shader. >> SHADER_OPCODE_MEMORY_FENCE, >> >> SHADER_OPCODE_GEN4_SCRATCH_READ, >> @@ -1233,4 +1236,10 @@ enum brw_message_target { >> /* R0 */ >> # define GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT 27 >> >> +enum PACKED brw_rnd_mode { >> + BRW_RND_MODE_UNSPECIFIED, >> + BRW_RND_MODE_RTE, >> + BRW_RND_MODE_RTZ, > Since you're introducing a back-end-specific rounding mode enum already, > why not use the hardware values right away so you avoid hard-coding > magic constants below. At the end we removed MODE_UNSPECIFIED as it isn't really needed and we include Round Up and Down in the enum assignation based using the PRM values. Also renamed RTE for RTNE to maintain coherence with nir conversion modifiers using the same acronym as PRM.
+enum PACKED brw_rnd_mode { + BRW_RND_MODE_RTNE = 0, /* Round to Nearest or Even */ + BRW_RND_MODE_RU = 1, /* Round Up, toward +inf */ + BRW_RND_MODE_RD = 2, /* Round Down, toward -inf */ + BRW_RND_MODE_RTZ = 3 /* Round Toward Zero */ +}; I have a doubt about how to avoid the magic constants to close the v2 of this patch. One approach would be using the same code structure and taking advantage using the codification of rounding field. This way we just formula and expect that the C compiler optimizer to guess that the immediate value is a constant. switch (mode) { case BRW_RND_MODE_RTZ: - inst = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0), brw_imm_ud(0x00000030u)); + inst = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0), brw_imm_ud(((unsigned int) mode << 4))); break; - case BRW_RND_MODE_RTE: - inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0), brw_imm_ud(0xffffffcfu)); + case BRW_RND_MODE_RTNE: + inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0), brw_imm_ud(((unsigned int) mode << 4) | ~0x00000030u)); break; default: Another approach could be to implement a general solution for all supported rounding modes by the hw including Round Up and Down using bitwise operations. /** * Changes the floating point rounding mode updating the control register * field defined at cr0.0[5-6] bits. This function supports the changes to * RTNE (00), RU (01), RD (10) and RTZ (11) rounding using bitwise operations. * Only RTNE and RTZ rounding are enabled at nir. */ void brw_rounding_mode(struct brw_codegen *p, enum brw_rnd_mode mode) { const unsigned int mask = 0x0000000030u; const unsigned int enable_bits = ((unsigned int) mode) << 4; const unsigned int disable_bits = enable_bits | ~mask; /* Used by RTNE rounding to set field to 00 */ if (disable_bits != ~0) { brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0), brw_imm_ud(disable_bits)); } /* Used by RTZ rounding to set field to 11 */ if (enable_bits) { brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0), brw_imm_ud(enable_bits)); } } Any preference ? Thanks for the review. Chema > >> +}; >> + >> #endif /* BRW_EU_DEFINES_H */ >> diff --git a/src/intel/compiler/brw_eu_emit.c >> b/src/intel/compiler/brw_eu_emit.c >> index 0b0d67a5c56..07ad3d9384b 100644 >> --- a/src/intel/compiler/brw_eu_emit.c >> +++ b/src/intel/compiler/brw_eu_emit.c >> @@ -3723,3 +3723,22 @@ brw_WAIT(struct brw_codegen *p) >> brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1); >> brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); >> } >> + >> +void >> +brw_rounding_mode(struct brw_codegen *p, >> + enum brw_rnd_mode mode) >> +{ >> + switch (mode) { >> + case BRW_RND_MODE_UNSPECIFIED: >> + /* nothing to do here */ >> + break; >> + case BRW_RND_MODE_RTZ: >> + brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0), brw_imm_ud(0x00000030u)); >> + break; >> + case BRW_RND_MODE_RTE: >> + brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0), brw_imm_ud(0xffffffcfu)); > This has undefined behavior because the ALU instructions you use to set > cr0 have non-zero latency, so the rounding mode change won't take effect > till ~8 cycles after the instruction is issued. Any instructions issued > in that window will pick up the wrong rounding mode. This is likely one > of the reasons for your observations off-list regarding some shaders > using the right or wrong rounding mode non-deterministically depending > on the scheduler's behaviour. > > Here's a spec quote from the SKL PRM suggesting a workaround you should > probably include in this commit: > > | Implementation Restriction on Register Access: When the control > | register is used as an explicit source and/or destination, hardware > | does not ensure execution pipeline coherency. Software must set the > | thread control field to ‘switch’ for an instruction that uses control > | register as an explicit operand. This is important as the control > | register is an implicit source for most instructions. For example, > | fields like FPMode and Accumulator Disable control the arithmetic > | and/or logic instructions. Therefore, if the instruction updating the > | control register doesn’t set ‘switch’, subsequent instructions may > | have undefined results. > > >> + break; >> + default: >> + unreachable("Not reached"); >> + } >> +} >> diff --git a/src/intel/compiler/brw_fs_generator.cpp >> b/src/intel/compiler/brw_fs_generator.cpp >> index 2ade486705b..e0bd191ea7e 100644 >> --- a/src/intel/compiler/brw_fs_generator.cpp >> +++ b/src/intel/compiler/brw_fs_generator.cpp >> @@ -2139,6 +2139,14 @@ fs_generator::generate_code(const cfg_t *cfg, int >> dispatch_width) >> brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F)); >> break; >> >> + case SHADER_OPCODE_RND_MODE_RTE: >> + brw_rounding_mode(p, BRW_RND_MODE_RTE); >> + break; >> + >> + case SHADER_OPCODE_RND_MODE_RTZ: >> + brw_rounding_mode(p, BRW_RND_MODE_RTZ); >> + break; >> + >> default: >> unreachable("Unsupported opcode"); >> >> diff --git a/src/intel/compiler/brw_shader.cpp >> b/src/intel/compiler/brw_shader.cpp >> index c62b8ba6140..f22e204e262 100644 >> --- a/src/intel/compiler/brw_shader.cpp >> +++ b/src/intel/compiler/brw_shader.cpp >> @@ -486,6 +486,11 @@ brw_instruction_name(const struct gen_device_info >> *devinfo, enum opcode op) >> return "tes_add_indirect_urb_offset"; >> case TES_OPCODE_GET_PRIMITIVE_ID: >> return "tes_get_primitive_id"; >> + >> + case SHADER_OPCODE_RND_MODE_RTE: >> + return "round_mode_rte"; >> + case SHADER_OPCODE_RND_MODE_RTZ: >> + return "round_mode_rtz"; >> } >> >> unreachable("not reached"); >> -- >> 2.11.0 >> >> _______________________________________________ >> mesa-dev mailing list >> mesa-dev@lists.freedesktop.org >> https://lists.freedesktop.org/mailman/listinfo/mesa-dev >> >> >> _______________________________________________ >> mesa-dev mailing list >> mesa-dev@lists.freedesktop.org >> https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev