Re: [PATCH][AArch64][2/5] Implement adrp+add fusion

2014-11-21 Thread Marcus Shawcroft
On 18 November 2014 10:33, Kyrill Tkachov kyrylo.tkac...@arm.com wrote:

 2014-11-18  Kyrylo Tkachov  kyrylo.tkac...@arm.com

 * config/aarch64/aarch64.c: Include tm-constrs.h
 (AARCH64_FUSE_ADRP_ADD): Define.
 (cortexa57_tunings): Add AARCH64_FUSE_ADRP_ADD to fuseable_ops.
 (cortexa53_tunings): Likewise.
 (aarch_macro_fusion_pair_p): Handle AARCH64_FUSE_ADRP_ADD.

OK /Marcus


[PATCH][AArch64][2/5] Implement adrp+add fusion

2014-11-18 Thread Kyrill Tkachov

Hi all,

This patch is just rebased on top of the changes from the previous patch 
in the series.
Otherwise it's the same as 
https://gcc.gnu.org/ml/gcc-patches/2014-11/msg01263.html with some style 
cleanup.


There can be cases where we miss fusion of adrp+add because although 
they are generated together (in aarch64_load_symref_appropriately),
combine can sometimes combine the losym part with the instruction after 
it and we end up with an instruction stream where there is an insn between 
the two, preventing the fusion in sched1.
We still catch enough cases to make this approach worthwhile and the 
above-mentioned exceptions can be mitigated in the future (for example, 
by somehow delaying the generation of the adrp+add RTL until after combine 
but before sched1).


Tested and bootstrapped on aarch64-none-linux-gnu.
Ok for trunk?

2014-11-18  Kyrylo Tkachov  kyrylo.tkac...@arm.com

* config/aarch64/aarch64.c: Include tm-constrs.h
(AARCH64_FUSE_ADRP_ADD): Define.
(cortexa57_tunings): Add AARCH64_FUSE_ADRP_ADD to fuseable_ops.
(cortexa53_tunings): Likewise.
(aarch_macro_fusion_pair_p): Handle AARCH64_FUSE_ADRP_ADD.

commit 248ec70cfac6cb552a427b4336a3340bb25a5e53
Author: Kyrylo Tkachov kyrylo.tkac...@arm.com
Date:   Thu Nov 6 12:05:26 2014 +

[AArch64] Fuse ADRP+ADD

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 502ba6d..03ae7c4 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -77,6 +77,7 @@
 #include "dumpfile.h"
 #include "builtins.h"
 #include "rtl-iter.h"
+#include "tm-constrs.h"
 
 /* Defined for convenience.  */
 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
@@ -304,6 +305,7 @@ static const struct cpu_vector_cost cortexa57_vector_cost =
 
 #define AARCH64_FUSE_NOTHING	(0)
 #define AARCH64_FUSE_MOV_MOVK	(1 << 0)
+#define AARCH64_FUSE_ADRP_ADD	(1 << 1)
 
 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 __extension__
@@ -327,7 +329,7 @@ static const struct tune_params cortexa53_tunings =
   generic_vector_cost,
   NAMED_PARAM (memmov_cost, 4),
   NAMED_PARAM (issue_rate, 2),
-  NAMED_PARAM (fuseable_ops, AARCH64_FUSE_MOV_MOVK)
+  NAMED_PARAM (fuseable_ops, (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD))
 };
 
 static const struct tune_params cortexa57_tunings =
@@ -338,7 +340,7 @@ static const struct tune_params cortexa57_tunings =
   cortexa57_vector_cost,
   NAMED_PARAM (memmov_cost, 4),
   NAMED_PARAM (issue_rate, 3),
-  NAMED_PARAM (fuseable_ops, AARCH64_FUSE_MOV_MOVK)
+  NAMED_PARAM (fuseable_ops, (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD))
 };
 
 static const struct tune_params thunderx_tunings =
@@ -10037,6 +10039,32 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
 }
 }
 
+  if (simple_sets_p
+      && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_ADD))
+    {
+
+      /*  We're trying to match:
+          prev (adrp) == (set (reg r1)
+                              (high (symbol_ref ("SYM"))))
+          curr (add) == (set (reg r0)
+                             (lo_sum (reg r1)
+                                     (symbol_ref ("SYM"))))
+          Note that r0 need not necessarily be the same as r1, especially
+          during pre-regalloc scheduling.  */
+
+      if (satisfies_constraint_Ush (SET_SRC (prev_set))
+          && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
+        {
+          if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
+              && REG_P (XEXP (SET_SRC (curr_set), 0))
+              && REGNO (XEXP (SET_SRC (curr_set), 0))
+                 == REGNO (SET_DEST (prev_set))
+              && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
+                              XEXP (SET_SRC (curr_set), 1)))
+            return true;
+        }
+    }
+
   return false;
 }