https://gcc.gnu.org/g:dbbd059b49edc936769737204f5c270d8d6ff553

commit r15-367-gdbbd059b49edc936769737204f5c270d8d6ff553
Author: Jeff Law <j...@ventanamicro.com>
Date:   Fri May 10 13:49:44 2024 -0600

    [RISC-V] Use shNadd for constant synthesis
    
    So here's the next idiom to improve constant synthesis.
    
    The basic idea here is to try and use shNadd to generate the constant when
    profitable.
    
    Let's take 0x300000801.  Right now that generates:
    
            li      a0,3145728
            addi    a0,a0,1
            slli    a0,a0,12
            addi    a0,a0,-2047
    
    But we can do better.  The constant is evenly divisible by 9 resulting in
    0x55555639 which doesn't look terribly interesting.  But that constant can be
    generated with two instructions, then we can use a sh3add to multiply it by 9.
    So the updated sequence looks like:
    
            li      a0,1431654400
            addi    a0,a0,1593
            sh3add  a0,a0,a0
    
    This doesn't trigger a whole lot, but I haven't really set up a test to
    explore the most likely space where this might be useful.  The tests were
    found exploring a different class of constant synthesis problems.
    
    If you were to dive into the before/after you'd see that the shNadd
    interacts quite nicely with the recent bseti work.  The joys of recursion.
    
    Probably the most controversial thing in here is using the "FMA" opcode to
    stand in for when we want to use shNadd.  Essentially when we synthesize a
    constant we generate a series of RTL opcodes and constants for emission by
    another routine.  We don't really have a way to say we want a shift-add.
    But you can think of shift-add as a limited form of multiply-accumulate.
    It's a bit of a stretch, but not crazy bad IMHO.
    
    Other approaches would be to store our own enum rather than an RTL opcode,
    or to store an actual generator function rather than any kind of opcode.
    
    It wouldn't take much pushback over (ab)using FMA in this manner to get me
    to use our own enums rather than RTL opcodes for this stuff.
    
    gcc/
    
            * config/riscv/riscv.cc (riscv_build_integer_1): Recognize cases
            where we can use shNadd to improve constant synthesis.
            (riscv_move_integer): Handle code generation for shNadd.
    
    gcc/testsuite
            * gcc.target/riscv/synthesis-1.c: Also count shNadd instructions.
            * gcc.target/riscv/synthesis-3.c: New test.

Diff:
---
 gcc/config/riscv/riscv.cc                    | 42 +++++++++++++++
 gcc/testsuite/gcc.target/riscv/synthesis-1.c |  2 +-
 gcc/testsuite/gcc.target/riscv/synthesis-3.c | 81 ++++++++++++++++++++++++++++
 3 files changed, 124 insertions(+), 1 deletion(-)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 2860137af718..9c98b1da0357 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -880,6 +880,40 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
        }
     }
 
+  if (cost > 2 && TARGET_64BIT && TARGET_ZBA)
+    {
+      if ((value % 9) == 0
+         && (alt_cost
+             = riscv_build_integer_1 (alt_codes, value / 9, mode) + 1) < cost)
+       {
+          alt_codes[alt_cost - 1].code = FMA;
+          alt_codes[alt_cost - 1].value = 9;
+          alt_codes[alt_cost - 1].use_uw = false;
+          memcpy (codes, alt_codes, sizeof (alt_codes));
+          cost = alt_cost;
+       }
+      if ((value % 5) == 0
+         && (alt_cost
+             = riscv_build_integer_1 (alt_codes, value / 5, mode) + 1) < cost)
+       {
+          alt_codes[alt_cost - 1].code = FMA;
+          alt_codes[alt_cost - 1].value = 5;
+          alt_codes[alt_cost - 1].use_uw = false;
+          memcpy (codes, alt_codes, sizeof (alt_codes));
+          cost = alt_cost;
+       }
+      if ((value % 3) == 0
+         && (alt_cost
+             = riscv_build_integer_1 (alt_codes, value / 3, mode) + 1) < cost)
+       {
+          alt_codes[alt_cost - 1].code = FMA;
+          alt_codes[alt_cost - 1].value = 3;
+          alt_codes[alt_cost - 1].use_uw = false;
+          memcpy (codes, alt_codes, sizeof (alt_codes));
+          cost = alt_cost;
+       }
+    }
+
   /* Final cases, particularly focused on bseti.  */
   if (cost > 2 && TARGET_ZBS)
     {
@@ -2542,6 +2576,14 @@ riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value,
              x = gen_rtx_fmt_ee (AND, mode, x, GEN_INT (value));
              x = riscv_emit_set (t, x);
            }
+         else if (codes[i].code == FMA)
+           {
+             HOST_WIDE_INT value = exact_log2 (codes[i].value - 1);
+             rtx ashift = gen_rtx_fmt_ee (ASHIFT, mode, x, GEN_INT (value));
+             x = gen_rtx_fmt_ee (PLUS, mode, ashift, x);
+             rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
+             x = riscv_emit_set (t, x);
+           }
          else
            x = gen_rtx_fmt_ee (codes[i].code, mode,
                                x, GEN_INT (codes[i].value));
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-1.c b/gcc/testsuite/gcc.target/riscv/synthesis-1.c
index 3384e488ade2..9176d5f4989f 100644
--- a/gcc/testsuite/gcc.target/riscv/synthesis-1.c
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-1.c
@@ -12,7 +12,7 @@
    total number of instructions. 
 
    This isn't expected to change much and any change is worthy of a look.  */
-/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|ret|slli)" 5822 } } */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|ret|sh1add|sh2add|sh3add|slli)" 5822 } } */
 
  unsigned long foo_0x3(void) { return 0x3UL; }
  unsigned long foo_0x5(void) { return 0x5UL; }
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-3.c b/gcc/testsuite/gcc.target/riscv/synthesis-3.c
new file mode 100644
index 000000000000..5d92ac8e3099
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-3.c
@@ -0,0 +1,81 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } */
+/* { dg-options "-march=rv64gc_zba_zbb_zbs" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions. 
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|ret|sh1add|sh2add|sh3add|slli)" 248 } } */
+
+unsigned long foo_0x1200000120000000 (void) { return 0x1200000120000000UL; }
+unsigned long foo_0x120000402000000 (void) { return 0x120000402000000UL; }
+unsigned long foo_0x1200100020000 (void) { return 0x1200100020000UL; }
+unsigned long foo_0x14000000a0000000 (void) { return 0x14000000a0000000UL; }
+unsigned long foo_0x140000082000000 (void) { return 0x140000082000000UL; }
+unsigned long foo_0x14000080200000 (void) { return 0x14000080200000UL; }
+unsigned long foo_0x1400080020000 (void) { return 0x1400080020000UL; }
+unsigned long foo_0x140080002000 (void) { return 0x140080002000UL; }
+unsigned long foo_0x1800000120000000 (void) { return 0x1800000120000000UL; }
+unsigned long foo_0x180000102000000 (void) { return 0x180000102000000UL; }
+unsigned long foo_0x180000801 (void) { return 0x180000801UL; }
+unsigned long foo_0x180000802 (void) { return 0x180000802UL; }
+unsigned long foo_0x180001001 (void) { return 0x180001001UL; }
+unsigned long foo_0x18000100200000 (void) { return 0x18000100200000UL; }
+unsigned long foo_0x180001002 (void) { return 0x180001002UL; }
+unsigned long foo_0x180003000 (void) { return 0x180003000UL; }
+unsigned long foo_0x1800100020000 (void) { return 0x1800100020000UL; }
+unsigned long foo_0x180081000 (void) { return 0x180081000UL; }
+unsigned long foo_0x182001000 (void) { return 0x182001000UL; }
+unsigned long foo_0x2400000240000000 (void) { return 0x2400000240000000UL; }
+unsigned long foo_0x24000080400000 (void) { return 0x24000080400000UL; }
+unsigned long foo_0x2400200040000 (void) { return 0x2400200040000UL; }
+unsigned long foo_0x2800000140000000 (void) { return 0x2800000140000000UL; }
+unsigned long foo_0x280000104000000 (void) { return 0x280000104000000UL; }
+unsigned long foo_0x28000100400000 (void) { return 0x28000100400000UL; }
+unsigned long foo_0x2800100040000 (void) { return 0x2800100040000UL; }
+unsigned long foo_0x280011000 (void) { return 0x280011000UL; }
+unsigned long foo_0x280100004000 (void) { return 0x280100004000UL; }
+unsigned long foo_0x280401000 (void) { return 0x280401000UL; }
+unsigned long foo_0x290001000 (void) { return 0x290001000UL; }
+unsigned long foo_0x30000000c0000000 (void) { return 0x30000000c0000000UL; }
+unsigned long foo_0x300000084000000 (void) { return 0x300000084000000UL; }
+unsigned long foo_0x300000801 (void) { return 0x300000801UL; }
+unsigned long foo_0x30000080400000 (void) { return 0x30000080400000UL; }
+unsigned long foo_0x300002004 (void) { return 0x300002004UL; }
+unsigned long foo_0x300006000 (void) { return 0x300006000UL; }
+unsigned long foo_0x3000080040000 (void) { return 0x3000080040000UL; }
+unsigned long foo_0x300021000 (void) { return 0x300021000UL; }
+unsigned long foo_0x300080004000 (void) { return 0x300080004000UL; }
+unsigned long foo_0x300102000 (void) { return 0x300102000UL; }
+unsigned long foo_0x300801000 (void) { return 0x300801000UL; }
+unsigned long foo_0x304002000 (void) { return 0x304002000UL; }
+unsigned long foo_0x320001000 (void) { return 0x320001000UL; }
+unsigned long foo_0x4800000480000000 (void) { return 0x4800000480000000UL; }
+unsigned long foo_0x48000100800000 (void) { return 0x48000100800000UL; }
+unsigned long foo_0x4800400080000 (void) { return 0x4800400080000UL; }
+unsigned long foo_0x5000000280000000 (void) { return 0x5000000280000000UL; }
+unsigned long foo_0x500000208000000 (void) { return 0x500000208000000UL; }
+unsigned long foo_0x50000200800000 (void) { return 0x50000200800000UL; }
+unsigned long foo_0x5000200080000 (void) { return 0x5000200080000UL; }
+unsigned long foo_0x500200008000 (void) { return 0x500200008000UL; }
+unsigned long foo_0x6000000180000000 (void) { return 0x6000000180000000UL; }
+unsigned long foo_0x600000108000000 (void) { return 0x600000108000000UL; }
+unsigned long foo_0x60000100800000 (void) { return 0x60000100800000UL; }
+unsigned long foo_0x6000100080000 (void) { return 0x6000100080000UL; }
+unsigned long foo_0x600100008000 (void) { return 0x600100008000UL; }
+unsigned long foo_0x900000090000000 (void) { return 0x900000090000000UL; }
+unsigned long foo_0x90000201000000 (void) { return 0x90000201000000UL; }
+unsigned long foo_0x900080010000 (void) { return 0x900080010000UL; }
+unsigned long foo_0x90200001000 (void) { return 0x90200001000UL; }
+unsigned long foo_0xc00000090000000 (void) { return 0xc00000090000000UL; }
+unsigned long foo_0xc0000081000000 (void) { return 0xc0000081000000UL; }
+unsigned long foo_0xc000080100000 (void) { return 0xc000080100000UL; }
+unsigned long foo_0xc00080010000 (void) { return 0xc00080010000UL; }
+unsigned long foo_0xc0080001000 (void) { return 0xc0080001000UL; }

Reply via email to