Wilco Dijkstra <[email protected]> writes:
> The early scheduler takes up ~33% of the total build time, however it doesn't
> provide a meaningful performance gain. This is partly because modern OoO cores
> need far less scheduling, partly because the scheduler tends to create many
> unnecessary spills by increasing register pressure. Building applications
> 56% faster is far more useful than ~0.1% improvement on SPEC, so switch off
> early scheduling on AArch64. Codesize reduces by ~0.2%.
>
> The combine_and_move pass runs if the scheduler is disabled and aggressively
> combines moves. The movsf/df patterns allow all FP immediates since they
> rely on a split pattern, however splits do not happen this late. To fix this,
> use a more accurate check that blocks creation of literal loads during
> combine_and_move. Fix various tests that depend on scheduling by explicitly
> adding -fschedule-insns.
>
> Passes bootstrap & regress, OK for commit?
I'm in favour of this. Obviously the numbers are what count, but
also from first principles:
- I can't remember the last time a scheduling model was added to the port.
- We've (consciously) never added scheduling types for SVE.
- It doesn't make logical sense to schedule for Neoverse V3 (say)
as though it were a Cortex-A57.
So at this point, it seems better for scheduling to be opt-in rather
than opt-out. (That is, we can switch to a tune-based default if
anyone does add a new scheduling model in future.)
Let's see what others think.
Please split the md changes out into a separate pre-patch though.
What do you think about disabling late scheduling as well?
Thanks,
Richard
> gcc/ChangeLog:
> * common/config/aarch64/aarch64-common.cc: Switch off fschedule_insns.
> * config/aarch64/aarch64.md (movhf_aarch64): Use
> aarch64_valid_fp_move.
> (movsf_aarch64): Likewise.
> (movdf_aarch64): Likewise.
> * config/aarch64/aarch64.cc (aarch64_valid_fp_move): New function.
> * config/aarch64/aarch64-protos.h (aarch64_valid_fp_move): Likewise.
>
> gcc/testsuite/ChangeLog:
> * testsuite/gcc.target/aarch64/ldp_aligned.c: Fix test.
> * testsuite/gcc.target/aarch64/ldp_always.c: Likewise.
> * testsuite/gcc.target/aarch64/ldp_stp_10.c: Add -fschedule-insns.
> * testsuite/gcc.target/aarch64/ldp_stp_12.c: Likewise.
> * testsuite/gcc.target/aarch64/ldp_stp_13.c: Remove test.
> * testsuite/gcc.target/aarch64/ldp_stp_21.c: Add -fschedule-insns.
> * testsuite/gcc.target/aarch64/ldp_stp_8.c: Likewise.
> * testsuite/gcc.target/aarch64/ldp_vec_v2sf.c: Likewise.
> * testsuite/gcc.target/aarch64/ldp_vec_v2si.c: Likewise.
> * testsuite/gcc.target/aarch64/test_frame_16.c: Fix test.
> * testsuite/gcc.target/aarch64/sve/vcond_12.c: Add -fschedule-insns.
> * testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c: Likewise.
>
> ---
>
> diff --git a/gcc/common/config/aarch64/aarch64-common.cc
> b/gcc/common/config/aarch64/aarch64-common.cc
> index
> 2bfc597e333b6018970a9ee6e370a66b6d0960ef..845747e31e821c2f3970fd39ea70f046eddbe920
> 100644
> --- a/gcc/common/config/aarch64/aarch64-common.cc
> +++ b/gcc/common/config/aarch64/aarch64-common.cc
> @@ -54,6 +54,8 @@ static const struct default_options
> aarch_option_optimization_table[] =
> { OPT_LEVELS_ALL, OPT_fomit_frame_pointer, NULL, 0 },
> /* Enable -fsched-pressure by default when optimizing. */
> { OPT_LEVELS_1_PLUS, OPT_fsched_pressure, NULL, 1 },
> + /* Disable early scheduling due to high compile-time overheads. */
> + { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
> /* Enable redundant extension instructions removal at -O2 and higher. */
> { OPT_LEVELS_2_PLUS, OPT_free, NULL, 1 },
> { OPT_LEVELS_2_PLUS, OPT_mearly_ra_, NULL, AARCH64_EARLY_RA_ALL },
> diff --git a/gcc/config/aarch64/aarch64-protos.h
> b/gcc/config/aarch64/aarch64-protos.h
> index
> 250c5b96a21ea1c969a0e77e420525eec90e4de4..b30329d7f85f5b962dca43cf12ca938898425874
> 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -758,6 +758,7 @@ bool aarch64_advsimd_struct_mode_p (machine_mode mode);
> opt_machine_mode aarch64_vq_mode (scalar_mode);
> opt_machine_mode aarch64_full_sve_mode (scalar_mode);
> bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
> +bool aarch64_valid_fp_move (rtx, rtx, machine_mode);
> bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
> bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT,
> HOST_WIDE_INT);
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index
> 2647293f7cf020378dacc37b7bfbccc856573e44..965ec18412a6486e6ac4ff2e4a7d742bf61e5d75
> 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -11223,6 +11223,36 @@ aarch64_can_const_movi_rtx_p (rtx x, machine_mode
> mode)
> return aarch64_simd_valid_mov_imm (v_op);
> }
>
> +/* Return TRUE if DST and SRC with mode MODE is a valid fp move. */
> +bool
> +aarch64_valid_fp_move (rtx dst, rtx src, machine_mode mode)
> +{
> + if (!TARGET_FLOAT)
> + return false;
> +
> + if (aarch64_reg_or_fp_zero (src, mode))
> + return true;
> +
> + if (!register_operand (dst, mode))
> + return false;
> +
> + if (MEM_P (src))
> + return true;
> +
> + if (!DECIMAL_FLOAT_MODE_P (mode))
> + {
> + if (aarch64_can_const_movi_rtx_p (src, mode)
> + || aarch64_float_const_representable_p (src)
> + || aarch64_float_const_zero_rtx_p (src))
> + return true;
> +
> + /* This requires a split which is only allowed before regalloc. */
> + if (aarch64_float_const_rtx_p (src))
> + return can_create_pseudo_p () && !ira_in_progress;
> + }
> +
> + return can_create_pseudo_p ();
> +}
>
> /* Return the fixed registers used for condition codes. */
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index
> 20956fc49d8232763b127629ded17037ad7d7960..5d3fa9628952031f52474291e160b957d774b011
> 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1644,8 +1644,7 @@ (define_expand "mov<mode>"
> (define_insn "*mov<mode>_aarch64"
> [(set (match_operand:HFBF 0 "nonimmediate_operand")
> (match_operand:HFBF 1 "general_operand"))]
> - "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
> - || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
> + "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)"
> {@ [ cons: =0 , 1 ; attrs: type , arch ]
> [ w , Y ; neon_move , simd ] movi\t%0.4h, #0
> [ w , ?rY ; f_mcr , fp16 ] fmov\t%h0, %w1
> @@ -1668,8 +1667,7 @@ (define_insn "*mov<mode>_aarch64"
> (define_insn "*mov<mode>_aarch64"
> [(set (match_operand:SFD 0 "nonimmediate_operand")
> (match_operand:SFD 1 "general_operand"))]
> - "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
> - || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
> + "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)"
> {@ [ cons: =0 , 1 ; attrs: type , arch ]
> [ w , Y ; neon_move , simd ] movi\t%0.2s, #0
> [ w , ?rY ; f_mcr , * ] fmov\t%s0, %w1
> @@ -1689,8 +1687,7 @@ (define_insn "*mov<mode>_aarch64"
> (define_insn "*mov<mode>_aarch64"
> [(set (match_operand:DFD 0 "nonimmediate_operand")
> (match_operand:DFD 1 "general_operand"))]
> - "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
> - || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
> + "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)"
> {@ [ cons: =0 , 1 ; attrs: type , arch ]
> [ w , Y ; neon_move , simd ] movi\t%d0, #0
> [ w , ?rY ; f_mcr , * ] fmov\t%d0, %x1
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
> b/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
> index
> 75495d71df28235b2bb2dc634c3e5121d398bac2..8ec2b0392b80d4c0d8b47a512ba291e3bade3be3
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
> @@ -14,25 +14,11 @@ TYPE ldp_aligned_##TYPE(char* ptr){ \
> return a_0 + a_1; \
> }
>
> -#define LDP_TEST_ADJUST_ALIGNED(TYPE) \
> -TYPE ldp_aligned_adjust_##TYPE(char* ptr){ \
> - TYPE a_0, a_1, a_2, a_3, a_4; \
> - TYPE *arr = (TYPE*) ((uintptr_t)ptr & ~(2 * 8 * _Alignof(TYPE) - 1)); \
> - a_0 = arr[100]; \
> - a_1 = arr[101]; \
> - a_2 = arr[102]; \
> - a_3 = arr[103]; \
> - a_4 = arr[110]; \
> - return a_0 + a_1 + a_2 + a_3 + a_4; \
> -}
> -
> LDP_TEST_ALIGNED(int32_t);
> LDP_TEST_ALIGNED(int64_t);
> LDP_TEST_ALIGNED(v4si);
> -LDP_TEST_ADJUST_ALIGNED(int32_t);
> -LDP_TEST_ADJUST_ALIGNED(int64_t);
>
> -/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 3 } } */
> -/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 3 } } */
> +/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 1 } } */
> +/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 1 } } */
> /* { dg-final { scan-assembler-times "ldp\tq\[0-9\]+, q\[0-9\]" 1 } } */
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_always.c
> b/gcc/testsuite/gcc.target/aarch64/ldp_always.c
> index
> 9cada57db8947e8ace4ad0bdacc14c80ee0fe9b5..5ffb98a886ecb659bb5c7a5e7ef013cacd14ffb7
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_always.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_always.c
> @@ -24,43 +24,14 @@ TYPE ldp_unaligned_##TYPE(char* ptr){ \
> return a_0 + a_1; \
> }
>
> -#define LDP_TEST_ADJUST_ALIGNED(TYPE) \
> -TYPE ldp_aligned_adjust_##TYPE(char* ptr){ \
> - TYPE a_0, a_1, a_2, a_3, a_4; \
> - TYPE *arr = (TYPE*) ((uintptr_t)ptr & ~(2 * 8 * _Alignof(TYPE) - 1)); \
> - a_0 = arr[100]; \
> - a_1 = arr[101]; \
> - a_2 = arr[102]; \
> - a_3 = arr[103]; \
> - a_4 = arr[110]; \
> - return a_0 + a_1 + a_2 + a_3 + a_4; \
> -}
> -
> -#define LDP_TEST_ADJUST_UNALIGNED(TYPE) \
> -TYPE ldp_unaligned_adjust_##TYPE(char* ptr){ \
> - TYPE a_0, a_1, a_2, a_3, a_4; \
> - TYPE *arr = (TYPE*) ((uintptr_t)ptr & ~(2 * 8 * _Alignof(TYPE) - 1)); \
> - TYPE *a = arr+1; \
> - a_0 = a[100]; \
> - a_1 = a[101]; \
> - a_2 = a[102]; \
> - a_3 = a[103]; \
> - a_4 = a[110]; \
> - return a_0 + a_1 + a_2 + a_3 + a_4; \
> -}
> -
> LDP_TEST_ALIGNED(int32_t);
> LDP_TEST_ALIGNED(int64_t);
> LDP_TEST_ALIGNED(v4si);
> LDP_TEST_UNALIGNED(int32_t);
> LDP_TEST_UNALIGNED(int64_t);
> LDP_TEST_UNALIGNED(v4si);
> -LDP_TEST_ADJUST_ALIGNED(int32_t);
> -LDP_TEST_ADJUST_ALIGNED(int64_t);
> -LDP_TEST_ADJUST_UNALIGNED(int32_t);
> -LDP_TEST_ADJUST_UNALIGNED(int64_t);
>
> -/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 6 } } */
> -/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 6 } } */
> +/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 2 } } */
> +/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 2 } } */
> /* { dg-final { scan-assembler-times "ldp\tq\[0-9\]+, q\[0-9\]" 2 } } */
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c
> b/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c
> index
> 31f392901d2ca9e9e31cb20735fdf86eb040ee88..ac4828af76175388aa0112458476b02064c4e8fc
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c
> @@ -1,4 +1,4 @@
> -/* { dg-options "-O2" } */
> +/* { dg-options "-O2 -fschedule-insns" } */
>
> int
> load (int *arr)
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c
> b/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c
> index
> 718e82b53f0ccfd09a19afa26ebdb88654359e33..495e199270a60f797a8de21bbe6b8a771f927f23
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c
> @@ -1,4 +1,4 @@
> -/* { dg-options "-O2" } */
> +/* { dg-options "-O2 -fschedule-insns" } */
>
> void
> store_offset (int *array, int x, int y)
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_13.c
> b/gcc/testsuite/gcc.target/aarch64/ldp_stp_13.c
> deleted file mode 100644
> index
> 9cc3942f153773e8ffe9bcaf07f6b32dc0d5f95e..0000000000000000000000000000000000000000
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_13.c
> +++ /dev/null
> @@ -1,18 +0,0 @@
> -/* { dg-do compile } */
> -/* { dg-options "-O2 -mabi=ilp32" } */
> -
> -long long
> -load_long (long long int *arr)
> -{
> - return arr[400] << 1 + arr[401] << 1 + arr[403] << 1 + arr[404] << 1;
> -}
> -
> -/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]+, " 2 } } */
> -
> -int
> -load (int *arr)
> -{
> - return arr[527] << 1 + arr[400] << 1 + arr[401] << 1 + arr[528] << 1;
> -}
> -
> -/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]+, " 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c
> b/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c
> index
> d54c322ce860688de734721718a9c57185d4be63..ac7bc164840ddff765fe599c525aa1d62f217401
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c
> @@ -1,4 +1,4 @@
> -/* { dg-options "-O2" } */
> +/* { dg-options "-O2 -fschedule-insns" } */
>
> #pragma GCC target "+nosimd+fp"
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c
> b/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c
> index
> b25678323b85046d4a320d534be24aee429274b8..2adf151491b76fbdae8382852feefd810ab3611a
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c
> @@ -1,4 +1,4 @@
> -/* { dg-options "-O2" } */
> +/* { dg-options "-O2 -fschedule-insns" } */
>
> typedef float __attribute__ ((vector_size (8))) fvec;
> typedef int __attribute__ ((vector_size (8))) ivec;
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c
> b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c
> index
> fbdae1c6cff1aef40db644361381ce511f0be64a..7a87fe7dd0a4715230733e25acd791dcd082f360
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c
> @@ -1,5 +1,5 @@
> /* { dg-do compile } */
> -/* { dg-options "-O2" } */
> +/* { dg-options "-O2 -fschedule-insns" } */
>
> typedef float __attribute__((vector_size(8))) vec;
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c
> b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c
> index
> 7714cd6cd9e8fa7dc1febf484d6726d44c246408..068f53e28ce5c5d1e60105a7c2b4001fa96f5153
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c
> @@ -1,5 +1,5 @@
> /* { dg-do compile } */
> -/* { dg-options "-O2" } */
> +/* { dg-options "-O2 -fschedule-insns" } */
>
> typedef int __attribute__((vector_size(8))) vec;
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c
> b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c
> index
> 41ad0bcea00f287757dd510b21915decafbc48c1..14eacce09c0585ec2132cd5dd185626e051ca588
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c
> @@ -1,5 +1,5 @@
> /* { dg-do compile } */
> -/* { dg-options "-O2" } */
> +/* { dg-options "-O2 -fschedule-insns" } */
>
> #include <arm_sve.h>
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c
> b/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c
> index
> de650bf39e27b5cdb0f06d04b5d7948b3cc94a54..59dcc0abecf57455bb43ba47a65a2bfd3eae1929
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c
> @@ -1,5 +1,5 @@
> /* { dg-do compile } */
> -/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
> +/* { dg-options "-O2 -ftree-vectorize -ffast-math -fschedule-insns" } */
>
> #include <stdint.h>
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_16.c
> b/gcc/testsuite/gcc.target/aarch64/test_frame_16.c
> index
> 28f3826adadd5eaa6486659e4d6b6d7c5960b9d2..0f67458f71856afc54741960e0ac045ad5447395
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_16.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_16.c
> @@ -17,7 +17,7 @@ double vararg_outgoing (int x1, ...)
> double a1 = x1, a2 = x1 * 2, a3 = x1 * 3, a4 = x1 * 4, a5 = x1 * 5, a6 =
> x1 * 6;
> __builtin_va_list vl;
> __builtin_va_start (vl, x1);
> - outgoing (vl, a1, a2, a3, a4, a5, a6, REP64 (1));
> + outgoing (vl, a1, a2, a3, a4, a5, a6, REP64 (1), REP8 (1));
> __builtin_va_end (vl);
> return a1 + a2 + a3 + a4 + a5 + a6;
> }