From: Christoph Müllner <christoph.muell...@vrull.eu> This patch adds the field overlap_op_by_pieces to struct riscv_tune_param, which allows the overlap_op_by_pieces infrastructure to be enabled per tuning model.
gcc/ChangeLog: * config/riscv/riscv.cc (struct riscv_tune_param): New field. (riscv_overlap_op_by_pieces): New function. (TARGET_OVERLAP_OP_BY_PIECES_P): Connect to riscv_overlap_op_by_pieces. Signed-off-by: Christoph Müllner <christoph.muell...@vrull.eu> --- gcc/config/riscv/riscv.cc | 17 +++++- .../gcc.target/riscv/memcpy-nonoverlapping.c | 54 +++++++++++++++++++ .../gcc.target/riscv/memcpy-overlapping.c | 50 +++++++++++++++++ .../gcc.target/riscv/memset-nonoverlapping.c | 45 ++++++++++++++++ .../gcc.target/riscv/memset-overlapping.c | 43 +++++++++++++++ 5 files changed, 208 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/riscv/memcpy-nonoverlapping.c create mode 100644 gcc/testsuite/gcc.target/riscv/memcpy-overlapping.c create mode 100644 gcc/testsuite/gcc.target/riscv/memset-nonoverlapping.c create mode 100644 gcc/testsuite/gcc.target/riscv/memset-overlapping.c diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index a0c00cfb66f..7357cf51cdf 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -243,6 +243,7 @@ struct riscv_tune_param unsigned short fmv_cost; bool slow_unaligned_access; unsigned int fusible_ops; + bool overlap_op_by_pieces; }; /* Information about one micro-arch we know about. */ @@ -331,6 +332,7 @@ static const struct riscv_tune_param rocket_tune_info = { 8, /* fmv_cost */ true, /* slow_unaligned_access */ RISCV_FUSE_NOTHING, /* fusible_ops */ + false, /* overlap_op_by_pieces */ }; /* Costs to use when optimizing for Sifive 7 Series. */ @@ -346,6 +348,7 @@ static const struct riscv_tune_param sifive_7_tune_info = { 8, /* fmv_cost */ true, /* slow_unaligned_access */ RISCV_FUSE_NOTHING, /* fusible_ops */ + false, /* overlap_op_by_pieces */ }; /* Costs to use when optimizing for T-HEAD c906. 
*/ @@ -361,6 +364,7 @@ static const struct riscv_tune_param thead_c906_tune_info = { 8, /* fmv_cost */ false, /* slow_unaligned_access */ RISCV_FUSE_NOTHING, /* fusible_ops */ + false, /* overlap_op_by_pieces */ }; /* Costs to use when optimizing for size. */ @@ -376,6 +380,7 @@ static const struct riscv_tune_param optimize_size_tune_info = { 8, /* fmv_cost */ false, /* slow_unaligned_access */ RISCV_FUSE_NOTHING, /* fusible_ops */ + false, /* overlap_op_by_pieces */ }; /* Costs to use when optimizing for Ventana Micro VT1. */ @@ -393,7 +398,8 @@ static const struct riscv_tune_param ventana_vt1_tune_info = { ( RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH | /* fusible_ops */ RISCV_FUSE_ZEXTWS | RISCV_FUSE_LDINDEXED | RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI | - RISCV_FUSE_LUI_LD | RISCV_FUSE_AUIPC_LD ) + RISCV_FUSE_LUI_LD | RISCV_FUSE_AUIPC_LD ), + true, /* overlap_op_by_pieces */ }; static tree riscv_handle_fndecl_attribute (tree *, tree, tree, int, bool *); @@ -6444,6 +6450,12 @@ riscv_slow_unaligned_access (machine_mode, unsigned int) return riscv_slow_unaligned_access_p; } +static bool +riscv_overlap_op_by_pieces (void) +{ + return tune_param->overlap_op_by_pieces; +} + /* Implement TARGET_CAN_CHANGE_MODE_CLASS. 
*/ static bool @@ -6974,6 +6986,9 @@ riscv_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor, #undef TARGET_SLOW_UNALIGNED_ACCESS #define TARGET_SLOW_UNALIGNED_ACCESS riscv_slow_unaligned_access +#undef TARGET_OVERLAP_OP_BY_PIECES_P +#define TARGET_OVERLAP_OP_BY_PIECES_P riscv_overlap_op_by_pieces + #undef TARGET_SECONDARY_MEMORY_NEEDED #define TARGET_SECONDARY_MEMORY_NEEDED riscv_secondary_memory_needed diff --git a/gcc/testsuite/gcc.target/riscv/memcpy-nonoverlapping.c b/gcc/testsuite/gcc.target/riscv/memcpy-nonoverlapping.c new file mode 100644 index 00000000000..1c99e13fc26 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/memcpy-nonoverlapping.c @@ -0,0 +1,54 @@ +/* { dg-do compile } */ +/* { dg-options "-mcpu=sifive-u74 -march=rv64gc -mabi=lp64" } */ +/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Oz" "-Og" } } */ + + +#define COPY_N(N) \ +void copy##N (char *src, char *dst) \ +{ \ + dst = __builtin_assume_aligned (dst, 4096); \ + src = __builtin_assume_aligned (src, 4096); \ + __builtin_memcpy (dst, src, N); \ +} + +/* Emits 1x {ld,sd} and 1x {lhu,lbu,sh,sb}. */ +COPY_N(11) + +/* Emits 1x {ld,sd} and 1x {lw,lbu,sw,sb}. */ +COPY_N(13) + +/* Emits 1x {ld,sd} and 1x {lw,lhu,sw,sh}. */ +COPY_N(14) + +/* Emits 1x {ld,sd} and 1x {lw,lhu,lbu,sw,sh,sb}. */ +COPY_N(15) + +/* Emits 2x {ld,sd} and 1x {lhu,lbu,sh,sb}. */ +COPY_N(19) + +/* Emits 2x {ld,sd} and 1x {lw,lhu,lbu,sw,sh,sb}. */ +COPY_N(23) + +/* The by-pieces infrastructure handles up to 24 bytes. + So the code below is emitted via cpymemsi/block_move_straight. */ + +/* Emits 3x {ld,sd} and 1x {lhu,lbu,sh,sb}. */ +COPY_N(27) + +/* Emits 3x {ld,sd} and 1x {lw,lbu,sw,sb}. */ +COPY_N(29) + +/* Emits 3x {ld,sd} and 1x {lw,lhu,lbu,sw,sh,sb}. 
*/ +COPY_N(31) + +/* { dg-final { scan-assembler-times "ld\t" 17 } } */ +/* { dg-final { scan-assembler-times "sd\t" 17 } } */ + +/* { dg-final { scan-assembler-times "lw\t" 6 } } */ +/* { dg-final { scan-assembler-times "sw\t" 6 } } */ + +/* { dg-final { scan-assembler-times "lhu\t" 7 } } */ +/* { dg-final { scan-assembler-times "sh\t" 7 } } */ + +/* { dg-final { scan-assembler-times "lbu\t" 8 } } */ +/* { dg-final { scan-assembler-times "sb\t" 8 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/memcpy-overlapping.c b/gcc/testsuite/gcc.target/riscv/memcpy-overlapping.c new file mode 100644 index 00000000000..ffb7248bfd1 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/memcpy-overlapping.c @@ -0,0 +1,50 @@ +/* { dg-do compile } */ +/* { dg-options "-mcpu=ventana-vt1 -march=rv64gc -mabi=lp64" } */ +/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Oz" "-Og" } } */ + +#define COPY_N(N) \ +void copy##N (char *src, char *dst) \ +{ \ + dst = __builtin_assume_aligned (dst, 4096); \ + src = __builtin_assume_aligned (src, 4096); \ + __builtin_memcpy (dst, src, N); \ +} + +/* Emits 1x {ld,sd} and 1x {lw,sw}. */ +COPY_N(11) + +/* Emits 2x {ld,sd}. */ +COPY_N(13) + +/* Emits 2x {ld,sd}. */ +COPY_N(14) + +/* Emits 2x {ld,sd}. */ +COPY_N(15) + +/* Emits 2x {ld,sd} and 1x {lw,sw}. */ +COPY_N(19) + +/* Emits 3x ld and 3x sd. */ +COPY_N(23) + +/* The by-pieces infrastructure handles up to 24 bytes. + So the code below is emitted via cpymemsi/block_move_straight. */ + +/* Emits 3x {ld,sd} and 1x {lhu,lbu,sh,sb}. */ +COPY_N(27) + +/* Emits 3x {ld,sd} and 1x {lw,lbu,sw,sb}. */ +COPY_N(29) + +/* Emits 3x {ld,sd} and 2x {lw,sw}. 
*/ +COPY_N(31) + +/* { dg-final { scan-assembler-times "ld\t" 21 } } */ +/* { dg-final { scan-assembler-times "sd\t" 21 } } */ + +/* { dg-final { scan-assembler-times "lw\t" 5 } } */ +/* { dg-final { scan-assembler-times "sw\t" 5 } } */ + +/* { dg-final { scan-assembler-times "lbu\t" 2 } } */ +/* { dg-final { scan-assembler-times "sb\t" 2 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/memset-nonoverlapping.c b/gcc/testsuite/gcc.target/riscv/memset-nonoverlapping.c new file mode 100644 index 00000000000..c4311c7a8d0 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/memset-nonoverlapping.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-options "-mcpu=sifive-u74 -march=rv64gc -mabi=lp64" } */ +/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Oz" "-Og" } } */ + +#define ZERO_N(N) \ +void zero##N (char *dst) \ +{ \ + dst = __builtin_assume_aligned (dst, 4096); \ + __builtin_memset (dst, 0, N); \ +} + +/* Emits 1x sd and 1x {sh,sb}. */ +ZERO_N(11) + +/* Emits 1x sd and 1x {sw,sb}. */ +ZERO_N(13) + +/* Emits 1x sd and 1x {sw,sh}. */ +ZERO_N(14) + +/* Emits 1x sd and 1x {sw,sh,sb}. */ +ZERO_N(15) + +/* Emits 2x sd and 1x {sh,sb}. */ +ZERO_N(19) + +/* Emits 2x sd and 1x {sw,sh,sb}. */ +ZERO_N(23) + +/* The by-pieces infrastructure handles up to 24 bytes. + So the code below is emitted via cpymemsi/block_move_straight. */ + +/* Emits 3x sd and 1x {sh,sb}. */ +ZERO_N(27) + +/* Emits 3x sd and 1x {sw,sb}. */ +ZERO_N(29) + +/* Emits 3x sd and 1x {sw,sh,sb}. 
*/ +ZERO_N(31) + +/* { dg-final { scan-assembler-times "sd\t" 17 } } */ +/* { dg-final { scan-assembler-times "sw\t" 6 } } */ +/* { dg-final { scan-assembler-times "sh\t" 7 } } */ +/* { dg-final { scan-assembler-times "sb\t" 8 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/memset-overlapping.c b/gcc/testsuite/gcc.target/riscv/memset-overlapping.c new file mode 100644 index 00000000000..793766b5262 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/memset-overlapping.c @@ -0,0 +1,43 @@ +/* { dg-do compile } */ +/* { dg-options "-mcpu=ventana-vt1 -march=rv64gc -mabi=lp64" } */ +/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Oz" "-Og" } } */ + +#define ZERO_N(N) \ +void zero##N (char *dst) \ +{ \ + dst = __builtin_assume_aligned (dst, 4096); \ + __builtin_memset (dst, 0, N); \ +} + +/* Emits 1x sd and 1x sw. */ +ZERO_N(11) + +/* Emits 2x sd. */ +ZERO_N(13) + +/* Emits 2x sd. */ +ZERO_N(14) + +/* Emits 2x sd. */ +ZERO_N(15) + +/* Emits 2x sd and 1x sw. */ +ZERO_N(19) + +/* Emits 3x sd. */ +ZERO_N(23) + +/* The by-pieces infrastructure handles up to 24 bytes. + So the code below is emitted via cpymemsi/block_move_straight. */ + +/* Emits 3x sd and 1x sw. */ +ZERO_N(27) + +/* Emits 4x sd. */ +ZERO_N(29) + +/* Emits 4x sd. */ +ZERO_N(31) + +/* { dg-final { scan-assembler-times "sd\t" 23 } } */ +/* { dg-final { scan-assembler-times "sw\t" 3 } } */ -- 2.38.1