Ping.
On 8/11/21 11:02 AM, Pat Haugen via Gcc-patches wrote:
> Enable store fusion on Power10.
>
> Use the SCHED_REORDER hook to implement Power10 specific ready list
> reordering.
> As of now this is just store fusion.
>
> Things changed in this version of the patch
> - Separate patch for additional load/store checks
> - Move option check from is_fusable_store() to caller
> - Misc coding style changes pointed out in review (parens/braces)
> - Add testcases
>
> Bootstrap/regtest on powerpc64(32/64) and powerpc64le(Power10) with no new
> regressions.
> Ok for master?
>
> -Pat
>
>
> 2021-08-11 Pat Haugen <pthau...@linux.ibm.com>
>
> gcc/ChangeLog:
>
> * config/rs6000/rs6000-cpus.def (ISA_3_1_MASKS_SERVER): Add new flag.
> (POWERPC_MASKS): Likewise.
> * config/rs6000/rs6000.c (rs6000_option_override_internal): Enable
> store fusion for Power10.
> (is_fusable_store): New.
> (power10_sched_reorder): Likewise.
> (rs6000_sched_reorder): Do Power10 specific reordering.
> (rs6000_sched_reorder2): Likewise.
> * config/rs6000/rs6000.opt: Add new option.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/powerpc/fusion-p10-stst.c: New test.
> * gcc.target/powerpc/fusion-p10-stst2.c: New test.
>
>
>
> diff --git a/gcc/config/rs6000/rs6000-cpus.def
> b/gcc/config/rs6000/rs6000-cpus.def
> index 6758296c0fd..f5812da0184 100644
> --- a/gcc/config/rs6000/rs6000-cpus.def
> +++ b/gcc/config/rs6000/rs6000-cpus.def
> @@ -90,7 +90,8 @@
> | OPTION_MASK_P10_FUSION_2LOGICAL \
> | OPTION_MASK_P10_FUSION_LOGADD \
> | OPTION_MASK_P10_FUSION_ADDLOG \
> - | OPTION_MASK_P10_FUSION_2ADD)
> + | OPTION_MASK_P10_FUSION_2ADD \
> + | OPTION_MASK_P10_FUSION_2STORE)
>
> /* Flags that need to be turned off if -mno-power9-vector. */
> #define OTHER_P9_VECTOR_MASKS (OPTION_MASK_FLOAT128_HW
> \
> @@ -143,6 +144,7 @@
> | OPTION_MASK_P10_FUSION_LOGADD \
> | OPTION_MASK_P10_FUSION_ADDLOG \
> | OPTION_MASK_P10_FUSION_2ADD \
> + | OPTION_MASK_P10_FUSION_2STORE \
> | OPTION_MASK_HTM \
> | OPTION_MASK_ISEL \
> | OPTION_MASK_MFCRF \
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index 60f406a4ff6..402cc924e3f 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -4495,6 +4495,10 @@ rs6000_option_override_internal (bool global_init_p)
> && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0)
> rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD;
>
> + if (TARGET_POWER10
> + && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0)
> + rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE;
> +
> /* Turn off vector pair/mma options on non-power10 systems. */
> else if (!TARGET_POWER10 && TARGET_MMA)
> {
> @@ -18874,6 +18878,91 @@ power9_sched_reorder2 (rtx_insn **ready, int lastpos)
> return cached_can_issue_more;
> }
>
> +/* Determine if INSN is a store to memory that can be fused with a similar
> + adjacent store. */
> +
> +static bool
> +is_fusable_store (rtx_insn *insn, rtx *str_mem)
> +{
> + /* Insn must be a non-prefixed base+disp form store. */
> + if (is_store_insn (insn, str_mem)
> + && get_attr_prefixed (insn) == PREFIXED_NO
> + && get_attr_update (insn) == UPDATE_NO
> + && get_attr_indexed (insn) == INDEXED_NO)
> + {
> + /* Further restictions by mode and size. */
> + machine_mode mode = GET_MODE (*str_mem);
> + HOST_WIDE_INT size;
> + if (MEM_SIZE_KNOWN_P (*str_mem))
> + size = MEM_SIZE (*str_mem);
> + else
> + return false;
> +
> + if (INTEGRAL_MODE_P (mode))
> + /* Must be word or dword size. */
> + return (size == 4 || size == 8);
> + else if (FLOAT_MODE_P (mode))
> + /* Must be dword size. */
> + return (size == 8);
> + }
> +
> + return false;
> +}
> +
> +/* Do Power10 specific reordering of the ready list. */
> +
> +static int
> +power10_sched_reorder (rtx_insn **ready, int lastpos)
> +{
> + rtx mem1;
> +
> + /* Do store fusion during sched2 only. */
> + if (!reload_completed)
> + return cached_can_issue_more;
> +
> + /* If the prior insn finished off a store fusion pair then simply
> + reset the counter and return, nothing more to do. */
> + if (load_store_pendulum != 0)
> + {
> + load_store_pendulum = 0;
> + return cached_can_issue_more;
> + }
> +
> + /* Try to pair certain store insns to adjacent memory locations
> + so that the hardware will fuse them to a single operation. */
> + if (TARGET_P10_FUSION && TARGET_P10_FUSION_2STORE
> + && is_fusable_store (last_scheduled_insn, &mem1))
> + {
> + int pos;
> + rtx mem2;
> +
> + /* A fusable store was just scheduled. Scan the ready list for another
> + store that it can fuse with. */
> + pos = lastpos;
> + while (pos >= 0)
> + {
> + /* GPR stores can be ascending or descending offsets, FPR/VSR stores
> + must be ascending only. */
> + if (is_fusable_store (ready[pos], &mem2)
> + && ((INTEGRAL_MODE_P (GET_MODE (mem1))
> + && adjacent_mem_locations (mem1, mem2))
> + || (FLOAT_MODE_P (GET_MODE (mem1))
> + && (adjacent_mem_locations (mem1, mem2) == mem1))))
> + {
> + /* Found a fusable store. Move it to the end of the ready list
> + so it is scheduled next. */
> + move_to_end_of_ready (ready, pos, lastpos);
> +
> + load_store_pendulum = -1;
> + break;
> + }
> + pos--;
> + }
> + }
> +
> + return cached_can_issue_more;
> +}
> +
> /* We are about to begin issuing insns for this clock cycle. */
>
> static int
> @@ -18900,6 +18989,10 @@ rs6000_sched_reorder (FILE *dump ATTRIBUTE_UNUSED,
> int sched_verbose,
> if (rs6000_tune == PROCESSOR_POWER6)
> load_store_pendulum = 0;
>
> + /* Do Power10 dependent reordering. */
> + if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn)
> + power10_sched_reorder (ready, n_ready - 1);
> +
> return rs6000_issue_rate ();
> }
>
> @@ -18921,6 +19014,10 @@ rs6000_sched_reorder2 (FILE *dump, int
> sched_verbose, rtx_insn **ready,
> && recog_memoized (last_scheduled_insn) >= 0)
> return power9_sched_reorder2 (ready, *pn_ready - 1);
>
> + /* Do Power10 dependent reordering. */
> + if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn)
> + return power10_sched_reorder (ready, *pn_ready - 1);
> +
> return cached_can_issue_more;
> }
>
> diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
> index 0538db387dc..3753de19557 100644
> --- a/gcc/config/rs6000/rs6000.opt
> +++ b/gcc/config/rs6000/rs6000.opt
> @@ -514,6 +514,10 @@ mpower10-fusion-2add
> Target Undocumented Mask(P10_FUSION_2ADD) Var(rs6000_isa_flags)
> Fuse dependent pairs of add or vaddudm instructions for better performance
> on power10.
>
> +mpower10-fusion-2store
> +Target Undocumented Mask(P10_FUSION_2STORE) Var(rs6000_isa_flags)
> +Fuse certain store operations together for better performance on power10.
> +
> mcrypto
> Target Mask(CRYPTO) Var(rs6000_isa_flags)
> Use ISA 2.07 Category:Vector.AES and Category:Vector.SHA2 instructions.
> diff --git a/gcc/testsuite/gcc.target/powerpc/fusion-p10-stst.c
> b/gcc/testsuite/gcc.target/powerpc/fusion-p10-stst.c
> new file mode 100644
> index 00000000000..528a7e542ab
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/fusion-p10-stst.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
> +
> +/* Verify store fusion is enabled */
> +
> +void fuse_stw (int *i, int a, int b, int c)
> +{
> + i[1] = a;
> + i[5] = b;
> + i[2] = c;
> +}
> +
> +void fuse_std (long *i, long a, long b, long c)
> +{
> + i[1] = a;
> + i[5] = b;
> + i[2] = c;
> +}
> +
> +void fuse_stfd (double *i, double a, double b, double c)
> +{
> + i[1] = a;
> + i[5] = b;
> + i[2] = c;
> +}
> +
> +/* { dg-final { scan-assembler-times {stw 4,4\(3\)\n\tstw 6,8\(3\)} 1 {
> target lp64 } } } */
> +/* { dg-final { scan-assembler-times {stw 4,4\(3\)\n\tstw 6,8\(3\)} 2 {
> target ilp32 } } } */
> +/* { dg-final { scan-assembler-times {std 4,8\(3\)\n\tstd 6,16\(3\)} 1 {
> target lp64 } } } */
> +/* { dg-final { scan-assembler-times {stfd 1,8\(3\)\n\tstfd 3,16\(3\)} 1 } }
> */
> +
> diff --git a/gcc/testsuite/gcc.target/powerpc/fusion-p10-stst2.c
> b/gcc/testsuite/gcc.target/powerpc/fusion-p10-stst2.c
> new file mode 100644
> index 00000000000..62f1a92c2b1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/fusion-p10-stst2.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mdejagnu-cpu=power10 -mno-power10-fusion -O2" } */
> +
> +/* Verify store fusion is disabled */
> +
> +void fuse_stw (int *i, int a, int b, int c)
> +{
> + i[1] = a;
> + i[5] = b;
> + i[2] = c;
> +}
> +
> +void fuse_std (long *i, long a, long b, long c)
> +{
> + i[1] = a;
> + i[5] = b;
> + i[2] = c;
> +}
> +
> +void fuse_stfd (double *i, double a, double b, double c)
> +{
> + i[1] = a;
> + i[5] = b;
> + i[2] = c;
> +}
> +
> +
> +/* { dg-final { scan-assembler-not {stw 4,4\(3\)\n\tstw 6,8\(3\)} } } */
> +/* { dg-final { scan-assembler-not {std 4,8\(3\)\n\tstd 6,16\(3\)} { target
> lp64 } } } */
> +/* { dg-final { scan-assembler-not {stfd 1,8\(3\)\n\tstfd 3,16\(3\)} } } */
>