Grr! I've done it again. ENOPATCH. > -----Original Message----- > From: Roger Sayle <ro...@nextmovesoftware.com> > Sent: 06 October 2023 14:58 > To: 'gcc-patches@gcc.gnu.org' <gcc-patches@gcc.gnu.org> > Cc: 'Uros Bizjak' <ubiz...@gmail.com> > Subject: [X86 PATCH] Implement doubleword right shifts by 1 bit using s[ha]r+rcr. > > > This patch tweaks the i386 back-end's ix86_split_ashr and ix86_split_lshr > functions to implement doubleword right shifts by 1 bit, using a shift of the > highpart that sets the carry flag followed by a rotate-carry-right > (RCR) instruction on the lowpart. > > Conceptually this is similar to the recent left shift patch, but with two > complicating factors. The first is that although the RCR sequence is shorter, and is > a ~3x performance improvement on AMD, my micro-benchmarking shows it > ~10% slower on Intel. Hence this patch also introduces a new > X86_TUNE_USE_RCR tuning parameter. The second is that I believe this is the > first time a "rotate-right-through-carry" and a right shift that sets the carry flag > from the least significant bit has been modelled in GCC RTL (on a MODE_CC > target). For this I've used the i386 back-end's UNSPEC_CC_NE which seems > appropriate. Finally rcrsi2 and rcrdi2 are separate define_insns so that we can > use their generator functions. > > For the pair of functions: > unsigned __int128 foo(unsigned __int128 x) { return x >> 1; } > __int128 bar(__int128 x) { return x >> 1; } > > with -O2 -march=znver4 we previously generated: > > foo: movq %rdi, %rax > movq %rsi, %rdx > shrdq $1, %rsi, %rax > shrq %rdx > ret > bar: movq %rdi, %rax > movq %rsi, %rdx > shrdq $1, %rsi, %rax > sarq %rdx > ret > > with this patch we now generate: > > foo: movq %rsi, %rdx > movq %rdi, %rax > shrq %rdx > rcrq %rax > ret > bar: movq %rsi, %rdx > movq %rdi, %rax > sarq %rdx > rcrq %rax > ret > > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap and > make -k check, both with and without --target_board=unix{-m32} with no new > failures. And to provide additional testing, I've also bootstrapped and regression > tested a version of this patch where the RCR is always generated (independent of > the -march target) again with no regressions. Ok for mainline? > > > 2023-10-06 Roger Sayle <ro...@nextmovesoftware.com> > > gcc/ChangeLog > * config/i386/i386-expand.c (ix86_split_ashr): Split shifts by > one into ashr[sd]i3_carry followed by rcr[sd]i2, if TARGET_USE_RCR > or -Oz. > (ix86_split_lshr): Likewise, split shifts by one bit into > lshr[sd]i3_carry followed by rcr[sd]i2, if TARGET_USE_RCR or -Oz. > * config/i386/i386.h (TARGET_USE_RCR): New backend macro. > * config/i386/i386.md (rcrsi2): New define_insn for rcrl. > (rcrdi2): New define_insn for rcrq. > (<anyshiftrt><mode>3_carry): New define_insn for right shifts that > set the carry flag from the least significant bit, modelled using > UNSPEC_CC_NE. > * config/i386/x86-tune.def (X86_TUNE_USE_RCR): New tuning parameter > controlling use of rcr 1 vs. shrd, which is significantly faster on > AMD processors. > > gcc/testsuite/ChangeLog > * gcc.target/i386/rcr-1.c: New 64-bit test case. > * gcc.target/i386/rcr-2.c: New 32-bit test case. > > > Thanks in advance, > Roger > --
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index e42ff27..399eb8e 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -6496,6 +6496,22 @@ ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode) emit_insn (gen_ashr3 (low[0], low[0], GEN_INT (count - half_width))); } + else if (count == 1 + && (TARGET_USE_RCR || optimize_size > 1)) + { + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + if (mode == DImode) + { + emit_insn (gen_ashrsi3_carry (high[0], high[0])); + emit_insn (gen_rcrsi2 (low[0], low[0])); + } + else + { + emit_insn (gen_ashrdi3_carry (high[0], high[0])); + emit_insn (gen_rcrdi2 (low[0], low[0])); + } + } else { gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; @@ -6561,6 +6577,22 @@ ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode) emit_insn (gen_lshr3 (low[0], low[0], GEN_INT (count - half_width))); } + else if (count == 1 + && (TARGET_USE_RCR || optimize_size > 1)) + { + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + if (mode == DImode) + { + emit_insn (gen_lshrsi3_carry (high[0], high[0])); + emit_insn (gen_rcrsi2 (low[0], low[0])); + } + else + { + emit_insn (gen_lshrdi3_carry (high[0], high[0])); + emit_insn (gen_rcrdi2 (low[0], low[0])); + } + } else { gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 3e8488f..6544a16 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -449,6 +449,7 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; #define TARGET_DEST_FALSE_DEP_FOR_GLC \ ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC] #define TARGET_SLOW_STC ix86_tune_features[X86_TUNE_SLOW_STC] +#define TARGET_USE_RCR ix86_tune_features[X86_TUNE_USE_RCR] /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index eef8a0e..01d62ea 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -15804,6 +15804,59 @@ [(parallel [(set (strict_low_part (match_dup 0)) (bswap:HI (match_dup 0))) (clobber (reg:CC FLAGS_REG))])]) + +;; Rotations through carry flag +(define_insn "rcrsi2" + [(set (match_operand:SI 0 "register_operand" "=r") + (plus:SI + (lshiftrt:SI (match_operand:SI 1 "register_operand" "0") + (const_int 1)) + (ashift:SI (ltu:SI (reg:CCC FLAGS_REG) (const_int 0)) + (const_int 31)))) + (clobber (reg:CC FLAGS_REG))] + "" + "rcr{l}\t%0" + [(set_attr "type" "ishift1") + (set_attr "memory" "none") + (set_attr "length_immediate" "0") + (set_attr "mode" "SI")]) + +(define_insn "rcrdi2" + [(set (match_operand:DI 0 "register_operand" "=r") + (plus:DI + (lshiftrt:DI (match_operand:DI 1 "register_operand" "0") + (const_int 1)) + (ashift:DI (ltu:DI (reg:CCC FLAGS_REG) (const_int 0)) + (const_int 63)))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT" + "rcr{q}\t%0" + [(set_attr "type" "ishift1") + (set_attr "length_immediate" "0") + (set_attr "mode" "DI")]) + +;; Versions of sar and shr that set the carry flag. +(define_insn "<insn><mode>3_carry" + [(set (reg:CCC FLAGS_REG) + (unspec:CCC [(and:SWI48 (match_operand:SWI48 1 "register_operand" "0") + (const_int 1)) + (const_int 0)] UNSPEC_CC_NE)) + (set (match_operand:SWI48 0 "register_operand" "=r") + (any_shiftrt:SWI48 (match_dup 1) (const_int 1)))] + "" +{ + if (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)) + return "<shift>{<imodesuffix>}\t%0"; + return "<shift>{<imodesuffix>}\t{1, %0|%0, 1}"; +} + [(set_attr "type" "ishift1") + (set (attr "length_immediate") + (if_then_else + (ior (match_test "TARGET_SHIFT1") + (match_test "optimize_function_for_size_p (cfun)")) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "<MODE>")]) ;; Bit set / bit test instructions diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 4b2c5d5..3636a4a 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -717,3 +717,6 @@ DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL) /* X86_TUNE_SLOW_STC: This disables use of stc, clc and cmc carry flag modifications on architectures where theses operations are slow. */ DEF_TUNE (X86_TUNE_SLOW_STC, "slow_stc", m_PENT4) + +/* X86_TUNE_USE_RCR: Controls use of rcr 1 instruction instead of shrd. */ +DEF_TUNE (X86_TUNE_USE_RCR, "use_rcr", m_AMD_MULTIPLE) diff --git a/gcc/testsuite/gcc.target/i386/rcr-1.c b/gcc/testsuite/gcc.target/i386/rcr-1.c new file mode 100644 index 0000000..8f369ef --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/rcr-1.c @@ -0,0 +1,6 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-Oz" } */ +unsigned __int128 foo(unsigned __int128 x) { return x >> 1; } +__int128 bar(__int128 x) { return x >> 1; } +/* { dg-final { scan-assembler-times "rcrq" 2 } } */ +/* { dg-final { scan-assembler-not "shrdq" } } */ diff --git a/gcc/testsuite/gcc.target/i386/rcr-2.c b/gcc/testsuite/gcc.target/i386/rcr-2.c new file mode 100644 index 0000000..c8ed50e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/rcr-2.c @@ -0,0 +1,6 @@ +/* { dg-do compile { target ia32 } } */ +/* { dg-options "-Oz -mno-stv" } */ +unsigned long long foo(unsigned long long x) { return x >> 1; } +long long bar(long long x) { return x >> 1; } +/* { dg-final { scan-assembler-times "rcrl" 2 } } */ +/* { dg-final { scan-assembler-not "shrdl" } } */