Grr!  I've done it again.  ENOPATCH.

> -----Original Message-----
> From: Roger Sayle <ro...@nextmovesoftware.com>
> Sent: 06 October 2023 14:58
> To: 'gcc-patches@gcc.gnu.org' <gcc-patches@gcc.gnu.org>
> Cc: 'Uros Bizjak' <ubiz...@gmail.com>
> Subject: [X86 PATCH] Implement doubleword right shifts by 1 bit using
s[ha]r+rcr.
> 
> 
> This patch tweaks the i386 back-end's ix86_split_ashr and ix86_split_lshr
> functions to implement doubleword right shifts by 1 bit, using a shift of
the
> highpart that sets the carry flag followed by a rotate-carry-right
> (RCR) instruction on the lowpart.
> 
> Conceptually this is similar to the recent left shift patch, but with two
> complicating factors.  The first is that although the RCR sequence is
shorter, and is
> a ~3x performance improvement on AMD, my micro-benchmarking shows it
> ~10% slower on Intel.  Hence this patch also introduces a new
> X86_TUNE_USE_RCR tuning parameter.  The second is that I believe this is
the
> first time a "rotate-right-through-carry" and a right shift that sets the
carry flag
> from the least significant bit has been modelled in GCC RTL (on a MODE_CC
> target).  For this I've used the i386 back-end's UNSPEC_CC_NE which seems
> appropriate.  Finally rcrsi2 and rcrdi2 are separate define_insns so that
we can
> use their generator functions.
> 
> For the pair of functions:
> unsigned __int128 foo(unsigned __int128 x) { return x >> 1; }
> __int128 bar(__int128 x) { return x >> 1; }
> 
> with -O2 -march=znver4 we previously generated:
> 
> foo:    movq    %rdi, %rax
>         movq    %rsi, %rdx
>         shrdq   $1, %rsi, %rax
>         shrq    %rdx
>         ret
> bar:    movq    %rdi, %rax
>         movq    %rsi, %rdx
>         shrdq   $1, %rsi, %rax
>         sarq    %rdx
>         ret
> 
> with this patch we now generate:
> 
> foo:    movq    %rsi, %rdx
>         movq    %rdi, %rax
>         shrq    %rdx
>         rcrq    %rax
>         ret
> bar:    movq    %rsi, %rdx
>         movq    %rdi, %rax
>         sarq    %rdx
>         rcrq    %rax
>         ret
> 
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap and
> make -k check, both with and without --target_board=unix{-m32} with no new
> failures.  And to provide additional testing, I've also bootstrapped and
regression
> tested a version of this patch where the RCR is always generated
(independent of
> the -march target) again with no regressions.  Ok for mainline?
> 
> 
> 2023-10-06  Roger Sayle  <ro...@nextmovesoftware.com>
> 
> gcc/ChangeLog
>         * config/i386/i386-expand.cc (ix86_split_ashr): Split shifts by
>         one into ashr[sd]i3_carry followed by rcr[sd]i2, if TARGET_USE_RCR
>         or -Oz.
>         (ix86_split_lshr): Likewise, split shifts by one bit into
>         lshr[sd]i3_carry followed by rcr[sd]i2, if TARGET_USE_RCR or -Oz.
>         * config/i386/i386.h (TARGET_USE_RCR): New backend macro.
>         * config/i386/i386.md (rcrsi2): New define_insn for rcrl.
>         (rcrdi2): New define_insn for rcrq.
>         (<insn><mode>3_carry): New define_insn for right shifts that
>         set the carry flag from the least significant bit, modelled using
>         UNSPEC_CC_NE.
>         * config/i386/x86-tune.def (X86_TUNE_USE_RCR): New tuning
parameter
>         controlling use of rcr 1 vs. shrd, which is significantly faster
on
>         AMD processors.
> 
> gcc/testsuite/ChangeLog
>         * gcc.target/i386/rcr-1.c: New 64-bit test case.
>         * gcc.target/i386/rcr-2.c: New 32-bit test case.
> 
> 
> Thanks in advance,
> Roger
> --

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index e42ff27..399eb8e 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -6496,6 +6496,22 @@ ix86_split_ashr (rtx *operands, rtx scratch, 
machine_mode mode)
            emit_insn (gen_ashr3 (low[0], low[0],
                                  GEN_INT (count - half_width)));
        }
+      else if (count == 1
+              && (TARGET_USE_RCR || optimize_size > 1))
+       {
+         if (!rtx_equal_p (operands[0], operands[1]))
+           emit_move_insn (operands[0], operands[1]);
+         if (mode == DImode)
+           {
+             emit_insn (gen_ashrsi3_carry (high[0], high[0]));
+             emit_insn (gen_rcrsi2 (low[0], low[0]));
+           }
+         else
+           {
+             emit_insn (gen_ashrdi3_carry (high[0], high[0]));
+             emit_insn (gen_rcrdi2 (low[0], low[0]));
+           }
+       }
       else
        {
          gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
@@ -6561,6 +6577,22 @@ ix86_split_lshr (rtx *operands, rtx scratch, 
machine_mode mode)
            emit_insn (gen_lshr3 (low[0], low[0],
                                  GEN_INT (count - half_width)));
        }
+      else if (count == 1
+              && (TARGET_USE_RCR || optimize_size > 1))
+       {
+         if (!rtx_equal_p (operands[0], operands[1]))
+           emit_move_insn (operands[0], operands[1]);
+         if (mode == DImode)
+           {
+             emit_insn (gen_lshrsi3_carry (high[0], high[0]));
+             emit_insn (gen_rcrsi2 (low[0], low[0]));
+           }
+         else
+           {
+             emit_insn (gen_lshrdi3_carry (high[0], high[0]));
+             emit_insn (gen_rcrdi2 (low[0], low[0]));
+           }
+       }
       else
        {
          gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 3e8488f..6544a16 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -449,6 +449,7 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_DEST_FALSE_DEP_FOR_GLC \
        ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC]
 #define TARGET_SLOW_STC ix86_tune_features[X86_TUNE_SLOW_STC]
+#define TARGET_USE_RCR ix86_tune_features[X86_TUNE_USE_RCR]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index eef8a0e..01d62ea 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -15804,6 +15804,59 @@
  [(parallel [(set (strict_low_part (match_dup 0))
                  (bswap:HI (match_dup 0)))
             (clobber (reg:CC FLAGS_REG))])])
+
+;; Rotations through carry flag
+(define_insn "rcrsi2"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+       (plus:SI
+         (lshiftrt:SI (match_operand:SI 1 "register_operand" "0")
+                      (const_int 1))
+         (ashift:SI (ltu:SI (reg:CCC FLAGS_REG) (const_int 0))
+                    (const_int 31))))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "rcr{l}\t%0"
+  [(set_attr "type" "ishift1")
+   (set_attr "memory" "none")
+   (set_attr "length_immediate" "0")
+   (set_attr "mode" "SI")])
+
+(define_insn "rcrdi2"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+       (plus:DI
+         (lshiftrt:DI (match_operand:DI 1 "register_operand" "0")
+                      (const_int 1))
+         (ashift:DI (ltu:DI (reg:CCC FLAGS_REG) (const_int 0))
+                    (const_int 63))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT"
+  "rcr{q}\t%0"
+  [(set_attr "type" "ishift1")
+   (set_attr "length_immediate" "0")
+   (set_attr "mode" "DI")])
+
+;; Versions of sar and shr that set the carry flag.
+(define_insn "<insn><mode>3_carry"
+  [(set (reg:CCC FLAGS_REG)
+       (unspec:CCC [(and:SWI48 (match_operand:SWI48 1 "register_operand" "0")
+                               (const_int 1))
+                    (const_int 0)] UNSPEC_CC_NE))
+   (set (match_operand:SWI48 0 "register_operand" "=r")
+       (any_shiftrt:SWI48 (match_dup 1) (const_int 1)))]
+  ""
+{
+  if (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
+    return "<shift>{<imodesuffix>}\t%0";
+  return "<shift>{<imodesuffix>}\t{1, %0|%0, 1}";
+}
+  [(set_attr "type" "ishift1")
+   (set (attr "length_immediate")
+     (if_then_else
+       (ior (match_test "TARGET_SHIFT1")
+           (match_test "optimize_function_for_size_p (cfun)"))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "<MODE>")])
 
 ;; Bit set / bit test instructions
 
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 4b2c5d5..3636a4a 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -717,3 +717,6 @@ DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", 
~m_KNL)
 /* X86_TUNE_SLOW_STC: This disables use of stc, clc and cmc carry flag
   modifications on architectures where theses operations are slow.  */
 DEF_TUNE (X86_TUNE_SLOW_STC, "slow_stc", m_PENT4)
+
+/* X86_TUNE_USE_RCR: Controls use of rcr 1 instruction instead of shrd.  */
+DEF_TUNE (X86_TUNE_USE_RCR, "use_rcr", m_AMD_MULTIPLE)
diff --git a/gcc/testsuite/gcc.target/i386/rcr-1.c 
b/gcc/testsuite/gcc.target/i386/rcr-1.c
new file mode 100644
index 0000000..8f369ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/rcr-1.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-Oz" } */
+unsigned __int128 foo(unsigned __int128 x) { return x >> 1; }
+__int128 bar(__int128 x) { return x >> 1; }
+/* { dg-final { scan-assembler-times "rcrq" 2 } } */
+/* { dg-final { scan-assembler-not "shrdq" } } */
diff --git a/gcc/testsuite/gcc.target/i386/rcr-2.c 
b/gcc/testsuite/gcc.target/i386/rcr-2.c
new file mode 100644
index 0000000..c8ed50e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/rcr-2.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-Oz -mno-stv" } */
+unsigned long long foo(unsigned long long x) { return x >> 1; }
+long long bar(long long x) { return x >> 1; }
+/* { dg-final { scan-assembler-times "rcrl" 2 } } */
+/* { dg-final { scan-assembler-not "shrdl" } } */

Reply via email to