Kyrill Tkachov wrote: > On 14/10/15 13:30, Wilco Dijkstra wrote: > > Enable instruction fusion of dependent AESE; AESMC and AESD; AESIMC pairs. > > This can give up to 2x > > speedup on many AArch64 implementations. Also model the crypto instructions > > on Cortex-A57 according > > to the Optimization Guide. > > > > Passes regression tests. > > arm-wise this is ok, but I'd like a follow up patch to enable this fusion > for the arm port as well. It should be fairly simple. > Just add a new enum value to fuse_ops inside tune_params in arm-protos.h > and update the arm implementation in aarch_macro_fusion_pair_p similar > to your aarch64 implementation.
I sent out a patch for AArch32 as well. Assuming you're still OK, could you commit this please? Wilco > > ChangeLog: > > 2015-10-14 Wilco Dijkstra <wdijk...@arm.com> > > > > * gcc/config/aarch64/aarch64.c (cortexa53_tunings): Add AES fusion. > > (cortexa57_tunings): Likewise. > > (cortexa72_tunings): Likewise. > > (aarch_macro_fusion_pair_p): Add support for AES fusion. > > * gcc/config/aarch64/aarch64-fusion-pairs.def: Add AES_AESMC entry. > > * gcc/config/arm/aarch-common.c (aarch_crypto_can_dual_issue): > > Allow virtual registers before reload so early scheduling works. > > * gcc/config/arm/cortex-a57.md (cortex_a57_crypto_simple): Use > > correct latency and pipeline. > > (cortex_a57_crypto_complex): Likewise. > > (cortex_a57_crypto_xor): Likewise. > > (define_bypass): Add AES bypass. > > > > > > --- > > gcc/config/aarch64/aarch64-fusion-pairs.def | 1 + > > gcc/config/aarch64/aarch64.c | 10 +++++++--- > > gcc/config/arm/aarch-common.c | 7 +++++-- > > gcc/config/arm/cortex-a57.md | 17 +++++++++++------ > > 4 files changed, 24 insertions(+), 11 deletions(-) > > > > diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def > > b/gcc/config/aarch64/aarch64-fusion-pairs.def > > index 53bbef4..fea79fc 100644 > > --- a/gcc/config/aarch64/aarch64-fusion-pairs.def > > +++ b/gcc/config/aarch64/aarch64-fusion-pairs.def > > @@ -33,4 +33,5 @@ AARCH64_FUSION_PAIR ("adrp+add", ADRP_ADD) > > AARCH64_FUSION_PAIR ("movk+movk", MOVK_MOVK) > > AARCH64_FUSION_PAIR ("adrp+ldr", ADRP_LDR) > > AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH) > > +AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC) > > > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > > index 230902d..96368c6 100644 > > --- a/gcc/config/aarch64/aarch64.c > > +++ b/gcc/config/aarch64/aarch64.c > > @@ -376,7 +376,7 @@ static const struct tune_params cortexa53_tunings = > > &generic_branch_cost, > > 4, /* memmov_cost */ > > 2, /* issue_rate */ > > - (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD > > + 
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD > > | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ > > 8, /* function_align. */ > > 8, /* jump_align. */ > > @@ -398,7 +398,7 @@ static const struct tune_params cortexa57_tunings = > > &generic_branch_cost, > > 4, /* memmov_cost */ > > 3, /* issue_rate */ > > - (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD > > + (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD > > | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */ > > 16, /* function_align. */ > > 8, /* jump_align. */ > > @@ -420,7 +420,7 @@ static const struct tune_params cortexa72_tunings = > > &generic_branch_cost, > > 4, /* memmov_cost */ > > 3, /* issue_rate */ > > - (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD > > + (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD > > | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */ > > 16, /* function_align. */ > > 8, /* jump_align. */ > > @@ -12843,6 +12843,10 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, > > rtx_insn *curr) > > } > > } > > > > + if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_AES_AESMC) > > + && aarch_crypto_can_dual_issue (prev, curr)) > > + return true; > > + > > if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH) > > && any_condjump_p (curr)) > > { > > diff --git a/gcc/config/arm/aarch-common.c b/gcc/config/arm/aarch-common.c > > index 5dd8222..e191ab6 100644 > > --- a/gcc/config/arm/aarch-common.c > > +++ b/gcc/config/arm/aarch-common.c > > @@ -63,8 +63,11 @@ aarch_crypto_can_dual_issue (rtx_insn *producer_insn, > > rtx_insn *consumer_insn) > > { > > unsigned int regno = REGNO (SET_DEST (producer_set)); > > > > - return REGNO (SET_DEST (consumer_set)) == regno > > - && REGNO (XVECEXP (consumer_src, 0, 0)) == regno; > > + /* Before reload the registers are virtual, so the destination of > > + consumer_set doesn't need to match. 
*/ > > + > > + return (REGNO (SET_DEST (consumer_set)) == regno || !reload_completed) > > + && REGNO (XVECEXP (consumer_src, 0, 0)) == regno; > > } > > > > return 0; > > diff --git a/gcc/config/arm/cortex-a57.md b/gcc/config/arm/cortex-a57.md > > index a32c848..eab9d99 100644 > > --- a/gcc/config/arm/cortex-a57.md > > +++ b/gcc/config/arm/cortex-a57.md > > @@ -745,20 +745,20 @@ > > neon_fp_sqrt_s_q, neon_fp_sqrt_d_q")) > > "ca57_cx2_block*3") > > > > -(define_insn_reservation "cortex_a57_crypto_simple" 4 > > +(define_insn_reservation "cortex_a57_crypto_simple" 3 > > (and (eq_attr "tune" "cortexa57") > > (eq_attr "type" > > "crypto_aese,crypto_aesmc,crypto_sha1_fast,crypto_sha256_fast")) > > - "ca57_cx2") > > + "ca57_cx1") > > > > -(define_insn_reservation "cortex_a57_crypto_complex" 7 > > +(define_insn_reservation "cortex_a57_crypto_complex" 6 > > (and (eq_attr "tune" "cortexa57") > > (eq_attr "type" "crypto_sha1_slow,crypto_sha256_slow")) > > - "ca57_cx2+(ca57_cx2_issue,ca57_cx2)") > > + "ca57_cx1*2") > > > > -(define_insn_reservation "cortex_a57_crypto_xor" 7 > > +(define_insn_reservation "cortex_a57_crypto_xor" 6 > > (and (eq_attr "tune" "cortexa57") > > (eq_attr "type" "crypto_sha1_xor")) > > - "(ca57_cx1+ca57_cx2)") > > + "(ca57_cx1*2)|(ca57_cx2*2)") > > > > ;; We lie with calls. They take up all issue slots, but are otherwise > > ;; not harmful. > > @@ -795,3 +795,8 @@ > > (define_bypass 1 "cortex_a57_*" > > "cortex_a57_call,cortex_a57_branch") > > > > +;; AESE+AESMC and AESD+AESIMC pairs forward with zero latency > > +(define_bypass 0 "cortex_a57_crypto_simple" > > + "cortex_a57_crypto_simple" > > + "aarch_crypto_can_dual_issue") > > +