On 22/07/14 15:52, Jiong Wang wrote:
> Currently we generate a sub-optimal epilogue when there
> is a frame pointer and there is an outgoing area.
>      
> take gcc.target/aarch64/test_frame_12.c for example:
>      
> the epilogue for test_12 is:
>      
>        .L12:
>        sub     sp, x29, #16
>        ldp     x29, x30, [sp, 16]
>        add     sp, sp, 432
>        ret
>      
> while the optimized version should be:
>      
>        .L12:
>        add     sp, x29, 0
>        ldp     x29, x30, [sp], 416
>        ret

Even better would be

        ldp     x29, x30, [x29]
        add     sp, sp, #432

since now the two instructions can dual-issue.

R.

>      
> When there is a frame pointer, the prologue sets it up to point to the
> base address of our reg save area, so in the epilogue we can make use of
> this and skip the outgoing area, if there is one. Thus we can always use
> load write-back for the stack adjustment when there is a frame pointer.
> 
> ok to install?
> 
> thanks.
> 
> gcc/
>    * config/aarch64/aarch64.c (aarch64_expand_epilogue): Don't subtract
>    outgoing area size when restoring stack_pointer_rtx.
> 
> gcc/testsuite/
>    * gcc.target/aarch64/test_frame_12.c: Match optimized instruction
>    sequences.
> 
> 
> 0014-AArch64-GCC-15-20-Optimize-epilogue-when-there-is-fr.patch
> 
> 
> From 9d8cbfa071df773ef5edfed499c0dc90be8eebfa Mon Sep 17 00:00:00 2001
> From: Jiong Wang <jiong.w...@arm.com>
> Date: Tue, 17 Jun 2014 22:19:33 +0100
> Subject: [PATCH 14/19] [AArch64/GCC][15/20] Optimize epilogue when there is
>  frame pointer
> 
> Currently we generate a sub-optimal epilogue when there
> is a frame pointer and there is an outgoing area.
> 
> take gcc.target/aarch64/test_frame_12.c for example:
> 
> the epilogue for test_12 is:
> 
> .L12:
> sub     sp, x29, #16
> ldp     x29, x30, [sp, 16]
> add     sp, sp, 432
> ret
> 
> while the optimized version should be:
> 
> .L12:
> add     sp, x29, 0
> ldp     x29, x30, [sp], 416
> ret
> 
> When there is a frame pointer, the prologue sets it up to point to the base
> address of our reg save area, so in the epilogue we can make use of this and
> skip the outgoing area, if there is one. Thus we can always use load
> write-back for the stack adjustment when there is a frame pointer.
> 
> 2014-06-16  Jiong Wang <jiong.w...@arm.com>
>           Marcus Shawcroft  <marcus.shawcr...@arm.com>
> gcc/
>   * config/aarch64/aarch64.c (aarch64_expand_epilogue): Don't subtract
>   outgoing area size when restoring stack_pointer_rtx.
> 
> gcc/testsuite/
>   * gcc.target/aarch64/test_frame_12.c: Match optimized instruction sequences.
> ---
>  gcc/config/aarch64/aarch64.c                     |   24 +++++++---------------
>  gcc/testsuite/gcc.target/aarch64/test_frame_12.c |    4 ++++
>  2 files changed, 11 insertions(+), 17 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 425c865..65a84e8 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -2360,7 +2360,8 @@ aarch64_expand_epilogue (bool for_sibcall)
>      {
>        insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
>                                      hard_frame_pointer_rtx,
> -                                    GEN_INT (- fp_offset)));
> +                                    GEN_INT (0)));
> +      offset = offset - fp_offset;
>        RTX_FRAME_RELATED_P (insn) = 1;
>        /* As SP is set to (FP - fp_offset), according to the rules in
>        dwarf2cfi.c:dwarf2out_frame_debug_expr, CFA should be calculated
> @@ -2368,27 +2369,16 @@ aarch64_expand_epilogue (bool for_sibcall)
>        cfa_reg = stack_pointer_rtx;
>      }
>  
> -  aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM);
> +  aarch64_restore_callee_saves (DFmode, frame_pointer_needed ? 0 : fp_offset,
> +                             V0_REGNUM, V31_REGNUM);
>  
>    if (offset > 0)
>      {
>        if (frame_pointer_needed)
>       {
> -       if (fp_offset)
> -         {
> -           aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM,
> -                                         R30_REGNUM);
> -           insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
> -                                            GEN_INT (offset)));
> -           RTX_FRAME_RELATED_P (insn) = 1;
> -         }
> -       else
> -         {
> -           aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM,
> -                                         R28_REGNUM);
> -           aarch64_popwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset,
> -                                   cfa_reg);
> -         }
> +       aarch64_restore_callee_saves (DImode, 0, R0_REGNUM, R28_REGNUM);
> +       aarch64_popwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset,
> +                               cfa_reg);
>       }
>        else
>       {
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_12.c b/gcc/testsuite/gcc.target/aarch64/test_frame_12.c
> index 3649527..81f0070 100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_12.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_12.c
> @@ -12,4 +12,8 @@ t_frame_pattern_outgoing (test12, 400, , 8, a[8])
>  t_frame_run (test12)
>  
>  /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */
> +
> +/* Check epilogue using write-back.  */
> +/* { dg-final { scan-assembler-times "ldp\tx29, x30, \\\[sp\\\], \[0-9\]+" 3 } } */
> +
>  /* { dg-final { cleanup-saved-temps } } */
> 


Reply via email to