This patch improves the speed of ARC's ashrsi3 and lshrsi3, on CPUs
without a barrel shifter, when not optimizing for size.  The current
implementations of right shifts by a constant are optimal for code
size, but at significant performance cost.  By emitting an extra
instruction or two, when not optimizing for size, we can improve
performance (sometimes dramatically).

[al]shrsi3 #5   Before 4 insns@12 cycles, after 5 insns@5 cycles

Without -mswap
[al]shrsi3 #29  Before 4 insns@60 cycles, after 5 insns@31 cycles

With -mswap
lshrsi3 #29     Before 4 insns@60 cycles, after 6 insns@16 cycles


This patch has been minimally tested by building a cross-compiler
to arc-linux, hosted on x86_64-pc-linux-gnu, with no new failures
from "make -k check" in the compile-only tests.
Ok for mainline (after 3rd-party testing)?


2024-07-11  Roger Sayle  <ro...@nextmovesoftware.com>

gcc/ChangeLog
        * config/arc/arc.cc (arc_split_ashr): When not optimizing for
        size: fully unroll ashr #5; on TARGET_SWAP, for shifts between
        19 and 29, perform ashr #16 using two instructions and then
        recursively perform the remaining shift; and for shifts by
        odd amounts, perform a single shift and then the remainder
        of the shift using a loop doing two bits per iteration.
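        (arc_split_lshr): Likewise, except that the lshr #16 case
        applies to shifts between 20 and 29, requires TARGET_V2 as
        well as TARGET_SWAP, and uses a single instruction.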


Thanks in advance,
Roger
--

diff --git a/gcc/config/arc/arc.cc b/gcc/config/arc/arc.cc
index 686de0ff2d5..b56e65d2d3e 100644
--- a/gcc/config/arc/arc.cc
+++ b/gcc/config/arc/arc.cc
@@ -4556,7 +4556,8 @@ arc_split_ashr (rtx *operands)
   if (CONST_INT_P (operands[2]))
     {
       int n = INTVAL (operands[2]) & 0x1f;
-      if (n <= 4)
+      if (n <= 4
+         || (n == 5 && !optimize_function_for_size_p (cfun)))
        {
          if (n != 0)
            {
@@ -4577,6 +4578,17 @@ arc_split_ashr (rtx *operands)
            emit_insn (gen_ashrsi3_cnt1 (operands[0], operands[0]));
          return;
        }
+      else if (n >= 19 && n <= 29 && TARGET_SWAP
+              && !optimize_function_for_size_p (cfun))
+       {
+         emit_insn (gen_rotrsi2_cnt16 (operands[0], operands[1]));
+         emit_insn (gen_extendhisi2 (operands[0],
+                                     gen_lowpart (HImode, operands[0])));
+         operands[1] = operands[0];
+         operands[2] = GEN_INT (n - 16);
+         arc_split_ashr (operands);
+         return;
+       }
       else if (n == 30)
        {
          rtx tmp = gen_reg_rtx (SImode);
@@ -4592,6 +4604,13 @@ arc_split_ashr (rtx *operands)
          emit_insn (gen_sbc (operands[0], operands[0], operands[0]));
          return;
        }
+      else if ((n & 1) != 0 && !optimize_function_for_size_p (cfun))
+       {
+         emit_insn (gen_ashrsi3_cnt1 (operands[0], operands[1]));
+         emit_insn (gen_ashrsi3_loop (operands[0], operands[0],
+                                      GEN_INT (n - 1)));
+         return;
+       }
     }
 
   emit_insn (gen_ashrsi3_loop (operands[0], operands[1], operands[2]));
@@ -4604,7 +4623,8 @@ arc_split_lshr (rtx *operands)
   if (CONST_INT_P (operands[2]))
     {
       int n = INTVAL (operands[2]) & 0x1f;
-      if (n <= 4)
+      if (n <= 4
+         || (n == 5 && !optimize_function_for_size_p (cfun)))
        {
          if (n != 0)
            {
@@ -4623,6 +4643,15 @@ arc_split_lshr (rtx *operands)
            emit_insn (gen_lshrsi3_cnt1 (operands[0], operands[0]));
          return;
        }
+      else if (n >= 20 && n <= 29 && TARGET_SWAP && TARGET_V2
+              && !optimize_function_for_size_p (cfun))
+       {
+         emit_insn (gen_lshrsi2_cnt16 (operands[0], operands[1]));
+         operands[1] = operands[0];
+         operands[2] = GEN_INT (n - 16);
+         arc_split_lshr (operands);
+         return;
+       }
       else if (n == 30)
        {
          rtx tmp = gen_reg_rtx (SImode);
@@ -4638,6 +4667,13 @@ arc_split_lshr (rtx *operands)
          emit_insn (gen_scc_ltu_cc_c (operands[0]));
          return;
        }
+      else if ((n & 1) != 0 && !optimize_function_for_size_p (cfun))
+       {
+         emit_insn (gen_lshrsi3_cnt1 (operands[0], operands[1]));
+         emit_insn (gen_lshrsi3_loop (operands[0], operands[0],
+                                      GEN_INT (n - 1)));
+         return;
+       }
     }
 
   emit_insn (gen_lshrsi3_loop (operands[0], operands[1], operands[2]));

Reply via email to