Currently we generate a sub-optimal epilogue when there is a frame
pointer and there is an outgoing area.
Take gcc.target/aarch64/test_frame_12.c for example. The epilogue for test12 is:
.L12:
      sub     sp, x29, #16
      ldp     x29, x30, [sp, 16]
      add     sp, sp, 432
      ret
while the optimized version should be:
.L12:
      add     sp, x29, 0
      ldp     x29, x30, [sp], 416
      ret
When there is a frame pointer, the prologue sets it up to point to the base
address of our register save area. In the epilogue we can take advantage of
this and skip over the outgoing area, if there is one, so we can always use
a load with write-back for the stack adjustment when there is a frame pointer.

ok to install?

thanks.

gcc/
  * config/aarch64/aarch64.c (aarch64_expand_epilogue): Don't subtract
  outgoing area size when restoring stack_pointer_rtx.

gcc/testsuite/
  * gcc.target/aarch64/test_frame_12.c: Match optimized instruction sequences.
From 9d8cbfa071df773ef5edfed499c0dc90be8eebfa Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.w...@arm.com>
Date: Tue, 17 Jun 2014 22:19:33 +0100
Subject: [PATCH 14/19] [AArch64/GCC][15/20] Optimize epilogue when there is
 frame pointer

Currently we generate a sub-optimal epilogue when there is a frame
pointer and there is an outgoing area.

take gcc.target/aarch64/test_frame_12.c for example:

the epilogue for test12 is:

.L12:
sub     sp, x29, #16
ldp     x29, x30, [sp, 16]
add     sp, sp, 432
ret

while the optimized version should be:

.L12:
add     sp, x29, 0
ldp     x29, x30, [sp], 416
ret

When there is a frame pointer, the prologue sets it up to point to the base
address of our register save area. In the epilogue we can take advantage of
this and skip over the outgoing area, if there is one, so we can always use
a load with write-back for the stack adjustment when there is a frame pointer.

2014-06-16  Jiong Wang <jiong.w...@arm.com>
	    Marcus Shawcroft  <marcus.shawcr...@arm.com>
gcc/
  * config/aarch64/aarch64.c (aarch64_expand_epilogue): Don't subtract
  outgoing area size when restoring stack_pointer_rtx.

gcc/testsuite/
  * gcc.target/aarch64/test_frame_12.c: Match optimized instruction sequences.
---
 gcc/config/aarch64/aarch64.c                     |   24 +++++++---------------
 gcc/testsuite/gcc.target/aarch64/test_frame_12.c |    4 ++++
 2 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 425c865..65a84e8 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2360,7 +2360,8 @@ aarch64_expand_epilogue (bool for_sibcall)
     {
       insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
 				       hard_frame_pointer_rtx,
-				       GEN_INT (- fp_offset)));
+				       GEN_INT (0)));
+      offset = offset - fp_offset;
       RTX_FRAME_RELATED_P (insn) = 1;
       /* As SP is set to (FP - fp_offset), according to the rules in
 	 dwarf2cfi.c:dwarf2out_frame_debug_expr, CFA should be calculated
@@ -2368,27 +2369,16 @@ aarch64_expand_epilogue (bool for_sibcall)
       cfa_reg = stack_pointer_rtx;
     }
 
-  aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM);
+  aarch64_restore_callee_saves (DFmode, frame_pointer_needed ? 0 : fp_offset,
+				V0_REGNUM, V31_REGNUM);
 
   if (offset > 0)
     {
       if (frame_pointer_needed)
 	{
-	  if (fp_offset)
-	    {
-	      aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM,
-					    R30_REGNUM);
-	      insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
-					       GEN_INT (offset)));
-	      RTX_FRAME_RELATED_P (insn) = 1;
-	    }
-	  else
-	    {
-	      aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM,
-					    R28_REGNUM);
-	      aarch64_popwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset,
-				      cfa_reg);
-	    }
+	  aarch64_restore_callee_saves (DImode, 0, R0_REGNUM, R28_REGNUM);
+	  aarch64_popwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset,
+				  cfa_reg);
 	}
       else
 	{
diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_12.c b/gcc/testsuite/gcc.target/aarch64/test_frame_12.c
index 3649527..81f0070 100644
--- a/gcc/testsuite/gcc.target/aarch64/test_frame_12.c
+++ b/gcc/testsuite/gcc.target/aarch64/test_frame_12.c
@@ -12,4 +12,8 @@ t_frame_pattern_outgoing (test12, 400, , 8, a[8])
 t_frame_run (test12)
 
 /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */
+
+/* Check epilogue using write-back.  */
+/* { dg-final { scan-assembler-times "ldp\tx29, x30, \\\[sp\\\], \[0-9\]+" 3 } } */
+
 /* { dg-final { cleanup-saved-temps } } */
-- 
1.7.9.5

Reply via email to