From: Siddhesh Poyarekar <siddh...@sourceware.org> Hi,
Jim Wilson posted a patch for this in September[1], and it appears from the ensuing discussion that the patch was an acceptable fix for falkor. Kugan followed up[2] with a test case since that was requested during initial review. Jim has moved on from Linaro, so I'm pinging this patch with the hope that it is OK for inclusion since it was posted before the freeze and is also isolated in impact to just falkor. Siddhesh [1] https://gcc.gnu.org/ml/gcc-patches/2017-09/msg01547.html [2] https://gcc.gnu.org/ml/gcc-patches/2017-11/msg00050.html On Falkor, because of an idiosyncrasy of how the pipelines are designed, a quad-word store using a reg+reg addressing mode is almost twice as slow as an add followed by a quad-word store with a single reg addressing mode. So we get better performance if we disallow addressing modes using register offsets with quad-word stores. Using lmbench compiled with -O2 -ftree-vectorize as my benchmark, I see a 13% performance increase on stream copy using this patch, and a 16% performance increase on stream scale using this patch. I also see a small performance increase on SPEC CPU2006 of around 0.2% for int and 0.4% for FP at -O3. 2018-01-17 Jim Wilson <jim.wil...@linaro.org> Kugan Vivekanandarajah <kugan.vivekanandara...@linaro.org> gcc/ * config/aarch64/aarch64-protos.h (aarch64_falkor_movti_target_operand_p): Declare. * config/aarch64/aarch64.c (aarch64_falkor_movti_target_operand_p): New. * config/aarch64/constraints.md (Utf): New. * config/aarch64/aarch64.md (movti_aarch64): Use Utf constraint instead of m. (movtf_aarch64): Likewise. * config/aarch64/aarch64-simd.md (aarch64_simd_mov<mode>): Use Utf constraint instead of m. gcc/testsuite/ * gcc.target/aarch64/pr82533.c: New test case. 
--- gcc/config/aarch64/aarch64-protos.h | 1 + gcc/config/aarch64/aarch64-simd.md | 4 ++-- gcc/config/aarch64/aarch64.c | 10 ++++++++++ gcc/config/aarch64/aarch64.md | 6 +++--- gcc/config/aarch64/constraints.md | 6 ++++++ gcc/testsuite/gcc.target/aarch64/pr82533.c | 11 +++++++++++ 6 files changed, 33 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/pr82533.c diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 2d705d2..088d864 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -433,6 +433,7 @@ bool aarch64_simd_mem_operand_p (rtx); bool aarch64_sve_ld1r_operand_p (rtx); bool aarch64_sve_ldr_operand_p (rtx); bool aarch64_sve_struct_memory_operand_p (rtx); +bool aarch64_falkor_movti_target_operand_p (rtx); rtx aarch64_simd_vect_par_cnst_half (machine_mode, int, bool); rtx aarch64_tls_get_addr (void); tree aarch64_fold_builtin (tree, int, tree *, bool); diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 3d1f6a0..f7daac3 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -131,9 +131,9 @@ (define_insn "*aarch64_simd_mov<VQ:mode>" [(set (match_operand:VQ 0 "nonimmediate_operand" - "=w, Umq, m, w, ?r, ?w, ?r, w") + "=w, Umq, Utf, w, ?r, ?w, ?r, w") (match_operand:VQ 1 "general_operand" - "m, Dz, w, w, w, r, r, Dn"))] + "m, Dz, w, w, w, r, r, Dn"))] "TARGET_SIMD && (register_operand (operands[0], <MODE>mode) || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))" diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 2e70f3a..0db7a4f 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -13477,6 +13477,16 @@ aarch64_sve_struct_memory_operand_p (rtx op) && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last)); } +/* Return TRUE if OP is a good address mode for movti target on falkor. 
*/ +bool +aarch64_falkor_movti_target_operand_p (rtx op) +{ + if ((enum attr_tune) aarch64_tune == TUNE_FALKOR) + return MEM_P (op) && ! (GET_CODE (XEXP (op, 0)) == PLUS + && ! CONST_INT_P (XEXP (XEXP (op, 0), 1))); + return MEM_P (op); +} + /* Emit a register copy from operand to operand, taking care not to early-clobber source registers in the process. diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index edb6a75..696fd12 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1079,7 +1079,7 @@ (define_insn "*movti_aarch64" [(set (match_operand:TI 0 - "nonimmediate_operand" "=r, w,r,w,r,m,m,w,m") + "nonimmediate_operand" "=r, w,r,w,r,m,m,w,Utf") (match_operand:TI 1 "aarch64_movti_operand" " rn,r,w,w,m,r,Z,m,w"))] "(register_operand (operands[0], TImode) @@ -1226,9 +1226,9 @@ (define_insn "*movtf_aarch64" [(set (match_operand:TF 0 - "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,m,?r,m ,m") + "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,Utf,?r,m ,m") (match_operand:TF 1 - "general_operand" " w,?r, ?r,w ,Y,Y ,m,w,m ,?r,Y"))] + "general_operand" " w,?r, ?r,w ,Y,Y ,m,w ,m ,?r,Y"))] "TARGET_FLOAT && (register_operand (operands[0], TFmode) || aarch64_reg_or_fp_zero (operands[1], TFmode))" "@ diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md index 6cc4cad..d9f2921 100644 --- a/gcc/config/aarch64/constraints.md +++ b/gcc/config/aarch64/constraints.md @@ -229,6 +229,12 @@ (and (match_code "mem") (match_test "aarch64_sve_ldr_operand_p (op)"))) +(define_memory_constraint "Utf" + "@internal + A good address for a falkor movti target operand." 
+ (and (match_code "mem") + (match_test "aarch64_falkor_movti_target_operand_p (op)"))) + (define_memory_constraint "Utv" "@internal An address valid for loading/storing opaque structure diff --git a/gcc/testsuite/gcc.target/aarch64/pr82533.c b/gcc/testsuite/gcc.target/aarch64/pr82533.c new file mode 100644 index 0000000..fa28ffa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr82533.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-mcpu=falkor -O2 -ftree-vectorize" } */ + +void +copy (int N, double *c, double *a) +{ + for (int i = 0; i < N; ++i) + c[i] = a[i]; +} + +/* { dg-final { scan-assembler-not "str\tq\[0-9\]+, \\\[x\[0-9\]+, x\[0-9\]+\\\]" } } */ -- 2.1.4