https://gcc.gnu.org/g:07f793b32359e37191b2bda2a4e8cac8428648e8

commit 07f793b32359e37191b2bda2a4e8cac8428648e8
Author: Christoph Müllner <christoph.muell...@vrull.eu>
Date:   Tue May 7 15:16:21 2024 -0600

    [committed] [RISC-V] Allow uarchs to set TARGET_OVERLAP_OP_BY_PIECES_P
    
    This is almost exclusively work from the VRULL team.
    
    As we've discussed in the Tuesday meeting in the past, we'd like to have a knob
    in the tuning structure to indicate that overlapped stores during
    move_by_pieces expansion of memcpy & friends are acceptable.
    
    This patch adds that capability in our tuning structure.  It's off for all
    the uarchs upstream, but we have been using it inside Ventana for our uarch
    with success.  So technically it's NFC upstream, but puts in the infrastructure
    multiple organizations likely need.
    
    gcc/
    
            * config/riscv/riscv.cc (struct riscv_tune_param): Add new
            "overlap_op_by_pieces" field.
            (rocket_tune_info, sifive_7_tune_info): Set it.
            (sifive_p400_tune_info, sifive_p600_tune_info): Likewise.
            (thead_c906_tune_info, xiangshan_nanhu_tune_info): Likewise.
            (generic_ooo_tune_info, optimize_size_tune_info): Likewise.
            (riscv_overlap_op_by_pieces): New function.
            (TARGET_OVERLAP_OP_BY_PIECES_P): Define.
    
    gcc/testsuite/
    
            * gcc.target/riscv/memcpy-nonoverlapping.c: New test.
            * gcc.target/riscv/memset-nonoverlapping.c: New test.
    
    (cherry picked from commit 300393484dbfa9fd3891174ea47aa3fb41915abc)

Diff:
---
 gcc/config/riscv/riscv.cc                          | 18 ++++++++
 .../gcc.target/riscv/memcpy-nonoverlapping.c       | 54 ++++++++++++++++++++++
 .../gcc.target/riscv/memset-nonoverlapping.c       | 45 ++++++++++++++++++
 3 files changed, 117 insertions(+)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 545e68566dc7..a9b57d411841 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -288,6 +288,7 @@ struct riscv_tune_param
   unsigned short fmv_cost;
   bool slow_unaligned_access;
   bool use_divmod_expansion;
+  bool overlap_op_by_pieces;
   unsigned int fusible_ops;
   const struct cpu_vector_cost *vec_costs;
 };
@@ -427,6 +428,7 @@ static const struct riscv_tune_param rocket_tune_info = {
   8,                                           /* fmv_cost */
   true,                                                /* slow_unaligned_access */
   false,                                       /* use_divmod_expansion */
+  false,                                       /* overlap_op_by_pieces */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   NULL,                                                /* vector cost */
 };
@@ -444,6 +446,7 @@ static const struct riscv_tune_param sifive_7_tune_info = {
   8,                                           /* fmv_cost */
   true,                                                /* slow_unaligned_access */
   false,                                       /* use_divmod_expansion */
+  false,                                       /* overlap_op_by_pieces */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   NULL,                                                /* vector cost */
 };
@@ -461,6 +464,7 @@ static const struct riscv_tune_param sifive_p400_tune_info = {
   4,                                           /* fmv_cost */
   true,                                                /* slow_unaligned_access */
   false,                                       /* use_divmod_expansion */
+  false,                                       /* overlap_op_by_pieces */
   RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI,  /* fusible_ops */
   &generic_vector_cost,                                /* vector cost */
 };
@@ -478,6 +482,7 @@ static const struct riscv_tune_param sifive_p600_tune_info = {
   4,                                           /* fmv_cost */
   true,                                                /* slow_unaligned_access */
   false,                                       /* use_divmod_expansion */
+  false,                                       /* overlap_op_by_pieces */
   RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI,  /* fusible_ops */
   &generic_vector_cost,                                /* vector cost */
 };
@@ -495,6 +500,7 @@ static const struct riscv_tune_param thead_c906_tune_info = {
   8,           /* fmv_cost */
   false,            /* slow_unaligned_access */
   false,       /* use_divmod_expansion */
+  false,                                       /* overlap_op_by_pieces */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   NULL,                                                /* vector cost */
 };
@@ -512,6 +518,7 @@ static const struct riscv_tune_param xiangshan_nanhu_tune_info = {
   3,                                           /* fmv_cost */
   true,                                                /* slow_unaligned_access */
   false,                                       /* use_divmod_expansion */
+  false,                                       /* overlap_op_by_pieces */
   RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH,          /* fusible_ops */
   NULL,                                                /* vector cost */
 };
@@ -529,6 +536,7 @@ static const struct riscv_tune_param generic_ooo_tune_info = {
   4,                                           /* fmv_cost */
   false,                                       /* slow_unaligned_access */
   false,                                       /* use_divmod_expansion */
+  false,                                       /* overlap_op_by_pieces */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   &generic_vector_cost,                                /* vector cost */
 };
@@ -546,6 +554,7 @@ static const struct riscv_tune_param optimize_size_tune_info = {
   8,                                           /* fmv_cost */
   false,                                       /* slow_unaligned_access */
   false,                                       /* use_divmod_expansion */
+  false,                                       /* overlap_op_by_pieces */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
   NULL,                                                /* vector cost */
 };
@@ -9979,6 +9988,12 @@ riscv_slow_unaligned_access (machine_mode, unsigned int)
   return riscv_slow_unaligned_access_p;
 }
 
+static bool
+riscv_overlap_op_by_pieces (void)
+{
+  return tune_param->overlap_op_by_pieces;
+}
+
 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
 
 static bool
@@ -11420,6 +11435,9 @@ riscv_get_raw_result_mode (int regno)
 #undef TARGET_SLOW_UNALIGNED_ACCESS
 #define TARGET_SLOW_UNALIGNED_ACCESS riscv_slow_unaligned_access
 
+#undef TARGET_OVERLAP_OP_BY_PIECES_P
+#define TARGET_OVERLAP_OP_BY_PIECES_P riscv_overlap_op_by_pieces
+
 #undef TARGET_SECONDARY_MEMORY_NEEDED
 #define TARGET_SECONDARY_MEMORY_NEEDED riscv_secondary_memory_needed
 
diff --git a/gcc/testsuite/gcc.target/riscv/memcpy-nonoverlapping.c b/gcc/testsuite/gcc.target/riscv/memcpy-nonoverlapping.c
new file mode 100644
index 000000000000..1c99e13fc269
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/memcpy-nonoverlapping.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-options "-mcpu=sifive-u74 -march=rv64gc -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Oz" "-Og" } } */
+
+
+#define COPY_N(N)                              \
+void copy##N (char *src, char *dst)            \
+{                                              \
+  dst = __builtin_assume_aligned (dst, 4096);  \
+  src = __builtin_assume_aligned (src, 4096);  \
+  __builtin_memcpy (dst, src, N);              \
+}
+
+/* Emits 1x {ld,sd} and 1x {lhu,lbu,sh,sb}.  */
+COPY_N(11)
+
+/* Emits 1x {ld,sd} and 1x {lw,lbu,sw,sb}.  */
+COPY_N(13)
+
+/* Emits 1x {ld,sd} and 1x {lw,lhu,sw,sh}.  */
+COPY_N(14)
+
+/* Emits 1x {ld,sd} and 1x {lw,lhu,lbu,sw,sh,sb}.  */
+COPY_N(15)
+
+/* Emits 2x {ld,sd} and 1x {lhu,lbu,sh,sb}.  */
+COPY_N(19)
+
+/* Emits 2x {ld,sd} and 1x {lw,lhu,lbu,sw,sh,sb}.  */
+COPY_N(23)
+
+/* The by-pieces infrastructure handles up to 24 bytes.
+   So the code below is emitted via cpymemsi/block_move_straight.  */
+
+/* Emits 3x {ld,sd} and 1x {lhu,lbu,sh,sb}.  */
+COPY_N(27)
+
+/* Emits 3x {ld,sd} and 1x {lw,lbu,sw,sb}.  */
+COPY_N(29)
+
+/* Emits 3x {ld,sd} and 1x {lw,lhu,lbu,sw,sh,sb}.  */
+COPY_N(31)
+
+/* { dg-final { scan-assembler-times "ld\t" 17 } } */
+/* { dg-final { scan-assembler-times "sd\t" 17 } } */
+
+/* { dg-final { scan-assembler-times "lw\t" 6 } } */
+/* { dg-final { scan-assembler-times "sw\t" 6 } } */
+
+/* { dg-final { scan-assembler-times "lhu\t" 7 } } */
+/* { dg-final { scan-assembler-times "sh\t" 7 } } */
+
+/* { dg-final { scan-assembler-times "lbu\t" 8 } } */
+/* { dg-final { scan-assembler-times "sb\t" 8 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/memset-nonoverlapping.c b/gcc/testsuite/gcc.target/riscv/memset-nonoverlapping.c
new file mode 100644
index 000000000000..c4311c7a8d03
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/memset-nonoverlapping.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-mcpu=sifive-u74 -march=rv64gc -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Oz" "-Og" } } */
+
+#define ZERO_N(N)                              \
+void zero##N (char *dst)                       \
+{                                              \
+  dst = __builtin_assume_aligned (dst, 4096);  \
+  __builtin_memset (dst, 0, N);                        \
+}
+
+/* Emits 1x sd and 1x {sh,sb}.  */
+ZERO_N(11)
+
+/* Emits 1x sd and 1x {sw,sb}.  */
+ZERO_N(13)
+
+/* Emits 1x sd and 1x {sw,sh}.  */
+ZERO_N(14)
+
+/* Emits 1x sd and 1x {sw,sh,sb}.  */
+ZERO_N(15)
+
+/* Emits 2x sd and 1x {sh,sb}.  */
+ZERO_N(19)
+
+/* Emits 2x sd and 1x {sw,sh,sb}.  */
+ZERO_N(23)
+
+/* The by-pieces infrastructure handles up to 24 bytes.
+   So the code below is emitted via cpymemsi/block_move_straight.  */
+
+/* Emits 3x sd and 1x {sh,sb}.  */
+ZERO_N(27)
+
+/* Emits 3x sd and 1x {sw,sb}.  */
+ZERO_N(29)
+
+/* Emits 3x sd and 1x {sw,sh,sb}.  */
+ZERO_N(31)
+
+/* { dg-final { scan-assembler-times "sd\t" 17 } } */
+/* { dg-final { scan-assembler-times "sw\t" 6 } } */
+/* { dg-final { scan-assembler-times "sh\t" 7 } } */
+/* { dg-final { scan-assembler-times "sb\t" 8 } } */

Reply via email to