The branch master has been updated
       via  10646160125ac1328d892f1dd27f2847892d33c5 (commit)
      from  f0f4a46c4f5c82d4d9d0fb8a51d546c3135668a2 (commit)


- Log -----------------------------------------------------------------
commit 10646160125ac1328d892f1dd27f2847892d33c5
Author: fangming.fang <fangming.f...@arm.com>
Date:   Fri Mar 19 06:45:57 2021 +0000

    Optimize RSA on armv8
    
    Add a Neon path for RSA on armv8. This optimisation targets
    Cortex-A72 and Neoverse N1, two cores that are important in
    infrastructure deployments. Other platforms are not impacted.
    
    A72
                             old (ops/s)   new (ops/s)   improved
    rsa   512 sign              9828.6        9738.7        -1%
    rsa   512 verify          121497.2      122367.7         1%
    rsa  1024 sign              1818          1816.9         0%
    rsa  1024 verify           37175.6       37161.3         0%
    rsa  2048 sign               267.3         267.4         0%
    rsa  2048 verify           10127.6       10119.6         0%
    rsa  3072 sign                86.8          87            0%
    rsa  3072 verify            4604.2        4956.2         8%
    rsa  4096 sign                38.3          38.5          1%
    rsa  4096 verify            2619.8        2972.1        13%
    rsa  7680 sign                 5             7           40%
    rsa  7680 verify             756           929.4        23%
    rsa 15360 sign                 0.8           1           25%
    rsa 15360 verify             190.4         246          29%
    
    N1
                             old (ops/s)   new (ops/s)   improved
    rsa   512 sign             12599.2       12596.7         0%
    rsa   512 verify          148636.1      148656.2         0%
    rsa  1024 sign              2150.6        2148.9         0%
    rsa  1024 verify           42353.5       42265.2         0%
    rsa  2048 sign               305.5         305.3         0%
    rsa  2048 verify           11209.7       11205.2         0%
    rsa  3072 sign                97.8          98.2          0%
    rsa  3072 verify            5061.3        5990.7        18%
    rsa  4096 sign                42.8          43            0%
    rsa  4096 verify            2867.6        3509.8        22%
    rsa  7680 sign                 5.5           8.4         53%
    rsa  7680 verify             823.5        1058.3        29%
    rsa 15360 sign                 0.9           1.1         22%
    rsa 15360 verify             207           273.9        32%
    
    CustomizedGitHooks: yes
    Change-Id: I01c732cc429d793c4eb5ffd27ccd30ff9cebf8af
    Jira: SECLIB-540
    
    Reviewed-by: Tomas Mraz <to...@openssl.org>
    Reviewed-by: Paul Dale <pa...@openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/14761)

-----------------------------------------------------------------------

Summary of changes:
 crypto/armcap.c             |   7 +
 crypto/bn/asm/armv8-mont.pl | 381 ++++++++++++++++++++++++++++++++++++++++++++
 crypto/bn/build.info        |   1 +
 3 files changed, 389 insertions(+)
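
For orientation before the diff: the patched bn_mul_mont prologue gains a
runtime dispatch. In rough C terms it behaves as below; BN_ULONG is taken as
a 64-bit word, and the helper names are illustrative stand-ins for the
assembly labels, not OpenSSL API:

    #include <stdint.h>

    extern unsigned int OPENSSL_armv8_rsa_neonized;  /* set in armcap.c */

    /* Illustrative prototypes standing in for the asm code paths. */
    int bn_mul_mont_generic(uint64_t *, const uint64_t *, const uint64_t *,
                            const uint64_t *, const uint64_t *, int);
    int bn_mul8x_mont_neon(uint64_t *, const uint64_t *, const uint64_t *,
                           const uint64_t *, const uint64_t *, int);
    int bn_sqr8x_mont(uint64_t *, const uint64_t *, const uint64_t *,
                      const uint64_t *, const uint64_t *, int);
    int bn_mul4x_mont(uint64_t *, const uint64_t *, const uint64_t *,
                      const uint64_t *, const uint64_t *, int);

    int bn_mul_mont_dispatch(uint64_t *rp, const uint64_t *ap,
                             const uint64_t *bp, const uint64_t *np,
                             const uint64_t *n0, int num)
    {
        if (num % 4 != 0)              /* tst $num,#3 ; b.ne .Lmul_mont */
            return bn_mul_mont_generic(rp, ap, bp, np, n0, num);
        /* NEON only for large operands on flagged CPUs; the asm also
         * compiles the check out entirely for __KERNEL__ builds. */
        if (num > 32 && OPENSSL_armv8_rsa_neonized)
            return bn_mul8x_mont_neon(rp, ap, bp, np, n0, num);
        if (num % 8 == 0)              /* tst $num,#7 ; b.eq __bn_sqr8x_mont */
            return bn_sqr8x_mont(rp, ap, bp, np, n0, num);
        return bn_mul4x_mont(rp, ap, bp, np, n0, num);   /* num % 4 == 0 */
    }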

diff --git a/crypto/armcap.c b/crypto/armcap.c
index 0e7c0842ad..dc2326f8f6 100644
--- a/crypto/armcap.c
+++ b/crypto/armcap.c
@@ -19,6 +19,7 @@
 
 unsigned int OPENSSL_armcap_P = 0;
 unsigned int OPENSSL_arm_midr = 0;
+unsigned int OPENSSL_armv8_rsa_neonized = 0;
 
 #if __ARM_MAX_ARCH__<7
 void OPENSSL_cpuid_setup(void)
@@ -237,6 +238,12 @@ void OPENSSL_cpuid_setup(void)
 # ifdef __aarch64__
     if (OPENSSL_armcap_P & ARMV8_CPUID)
         OPENSSL_arm_midr = _armv8_cpuid_probe();
+
+    if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) ||
+         MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)) &&
+        (OPENSSL_armcap_P & ARMV7_NEON)) {
+            OPENSSL_armv8_rsa_neonized = 1;
+    }
 # endif
 }
 #endif
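
The new capability flag keys on the implementer and part-number fields of
MIDR_EL1. A rough model of the MIDR_IS_CPU_MODEL check follows; the field
offsets are architectural, while the part-number constants mirror the Linux
kernel's cputype.h values and are assumptions here, since the real macros
live in OpenSSL's arm_arch.h, which this diff does not touch:

    #include <stdint.h>

    /* MIDR_EL1 layout (architectural): implementer in bits [31:24],
     * part number in bits [15:4]. */
    #define MIDR_IMPLEMENTER(m)   (((m) >> 24) & 0xff)
    #define MIDR_PARTNUM(m)       (((m) >> 4) & 0xfff)

    #define CPU_IMP_ARM           0x41    /* 'A' */
    #define CPU_PART_CORTEX_A72   0xd08   /* assumed, per kernel cputype.h */
    #define CPU_PART_NEOVERSE_N1  0xd0c   /* assumed, per kernel cputype.h */

    static int is_cpu_model(uint32_t midr, uint32_t imp, uint32_t part)
    {
        return MIDR_IMPLEMENTER(midr) == imp && MIDR_PARTNUM(midr) == part;
    }

    /* e.g.: is_cpu_model(midr, CPU_IMP_ARM, CPU_PART_NEOVERSE_N1) */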
diff --git a/crypto/bn/asm/armv8-mont.pl b/crypto/bn/asm/armv8-mont.pl
index e8bdfa3bb8..0867ccabee 100755
--- a/crypto/bn/asm/armv8-mont.pl
+++ b/crypto/bn/asm/armv8-mont.pl
@@ -67,16 +67,34 @@ $n0="x4";   # const BN_ULONG *n0,
 $num="x5";     # int num);
 
 $code.=<<___;
+#ifndef        __KERNEL__
+# include "arm_arch.h"
+.extern OPENSSL_armv8_rsa_neonized
+.hidden OPENSSL_armv8_rsa_neonized
+#endif
 .text
 
 .globl bn_mul_mont
 .type  bn_mul_mont,%function
 .align 5
 bn_mul_mont:
+.Lbn_mul_mont:
+       tst     $num,#3
+       b.ne    .Lmul_mont
+       cmp     $num,#32
+       b.le    .Lscalar_impl
+#ifndef        __KERNEL__
+       adrp    x17,OPENSSL_armv8_rsa_neonized
+       ldr     w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
+       cbnz    w17, bn_mul8x_mont_neon
+#endif
+
+.Lscalar_impl:
        tst     $num,#7
        b.eq    __bn_sqr8x_mont
        tst     $num,#3
        b.eq    __bn_mul4x_mont
+
 .Lmul_mont:
        stp     x29,x30,[sp,#-64]!
        add     x29,sp,#0
@@ -274,6 +292,369 @@ bn_mul_mont:
 .size  bn_mul_mont,.-bn_mul_mont
 ___
 {
+my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
+my ($Z,$Temp)=("v4.16b","v5");
+my @ACC=map("v$_",(6..13));
+my ($Bi,$Ni,$M0)=map("v$_",(28..30));
+my $sBi="s28";
+my $sM0="s30";
+my $zero="v14";
+my $temp="v15";
+my $ACCTemp="v16";
+
+my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
+my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
+
+$code.=<<___;
+.type  bn_mul8x_mont_neon,%function
+.align 5
+bn_mul8x_mont_neon:
+       stp     x29,x30,[sp,#-80]!
+       mov     x16,sp
+       stp     d8,d9,[sp,#16]
+       stp     d10,d11,[sp,#32]
+       stp     d12,d13,[sp,#48]
+       stp     d14,d15,[sp,#64]
+       lsl     $num,$num,#1
+       eor     $zero.16b,$zero.16b,$zero.16b
+
+.align 4
+.LNEON_8n:
+       eor     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
+       sub     $toutptr,sp,#128
+       eor     @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
+       sub     $toutptr,$toutptr,$num,lsl#4
+       eor     @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
+       and     $toutptr,$toutptr,#-64
+       eor     @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
+       mov     sp,$toutptr             // alloca
+       eor     @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
+       add     $toutptr,$toutptr,#256
+       eor     @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
+       sub     $inner,$num,#8
+       eor     @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
+       eor     @ACC[7].16b,@ACC[7].16b,@ACC[7].16b
+
+.LNEON_8n_init:
+       st1     {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
+       subs    $inner,$inner,#8
+       st1     {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
+       st1     {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
+       st1     {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
+       bne     .LNEON_8n_init
+
+       add     $tinptr,sp,#256
+       ld1     {$A0.4s,$A1.4s},[$aptr],#32
+       add     $bnptr,sp,#8
+       ldr     $sM0,[$n0],#4
+       mov     $outer,$num
+       b       .LNEON_8n_outer
+
+.align 4
+.LNEON_8n_outer:
+       ldr     $sBi,[$bptr],#4   // *b++
+       uxtl    $Bi.4s,$Bi.4h
+       add     $toutptr,sp,#128
+       ld1     {$N0.4s,$N1.4s},[$nptr],#32
+
+       umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
+       umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
+       umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
+       shl     $Ni.2d,@ACC[0].2d,#16
+       ext     $Ni.16b,$Ni.16b,$Ni.16b,#8
+       umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
+       add     $Ni.2d,$Ni.2d,@ACC[0].2d
+       umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
+       mul     $Ni.2s,$Ni.2s,$M0.2s
+       umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
+       st1     {$Bi.2s},[sp]           // put aside smashed b[8*i+0]
+       umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
+       uxtl    $Ni.4s,$Ni.4h
+       umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+for ($i=0; $i<7;) {
+$code.=<<___;
+       ldr     $sBi,[$bptr],#4   // *b++
+       umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
+       umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
+       uxtl    $Bi.4s,$Bi.4h
+       umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
+       ushr    $temp.2d,@ACC[0].2d,#16
+       umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
+       umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
+       ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+       add     @ACC[0].2d,@ACC[0].2d,$temp.2d
+       umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
+       ushr    @ACC[0].2d,@ACC[0].2d,#16
+       umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
+       umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
+       add     $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
+       ins     @ACC[1].d[0],$ACCTemp.d[0]
+       st1     {$Ni.2s},[$bnptr],#8    // put aside smashed m[8*i+$i]
+___
+       push(@ACC,shift(@ACC)); $i++;
+$code.=<<___;
+       umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
+       ld1     {@ACC[7].2d},[$tinptr],#16
+       umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
+       umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
+       shl     $Ni.2d,@ACC[0].2d,#16
+       ext     $Ni.16b,$Ni.16b,$Ni.16b,#8
+       umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
+       add     $Ni.2d,$Ni.2d,@ACC[0].2d
+       umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
+       mul     $Ni.2s,$Ni.2s,$M0.2s
+       umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
+       st1     {$Bi.2s},[$bnptr],#8    // put aside smashed b[8*i+$i]
+       umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
+       uxtl    $Ni.4s,$Ni.4h
+       umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+}
+$code.=<<___;
+       ld1     {$Bi.2s},[sp]           // pull smashed b[8*i+0]
+       umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
+       ld1     {$A0.4s,$A1.4s},[$aptr],#32
+       umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
+       umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
+       mov     $Temp.16b,@ACC[0].16b
+       ushr    $Temp.2d,$Temp.2d,#16
+       ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+       umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
+       umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
+       add     @ACC[0].2d,@ACC[0].2d,$Temp.2d
+       umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
+       ushr    @ACC[0].2d,@ACC[0].2d,#16
+       eor     $temp.16b,$temp.16b,$temp.16b
+       ins     @ACC[0].d[1],$temp.d[0]
+       umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
+       umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
+       add     @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
+       st1     {$Ni.2s},[$bnptr],#8    // put aside smashed m[8*i+$i]
+       add     $bnptr,sp,#8            // rewind
+___
+       push(@ACC,shift(@ACC));
+$code.=<<___;
+       sub     $inner,$num,#8
+       b       .LNEON_8n_inner
+
+.align 4
+.LNEON_8n_inner:
+       subs    $inner,$inner,#8
+       umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
+       ld1     {@ACC[7].2d},[$tinptr]
+       umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
+       ld1     {$Ni.2s},[$bnptr],#8    // pull smashed m[8*i+0]
+       umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
+       ld1     {$N0.4s,$N1.4s},[$nptr],#32
+       umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
+       b.eq    .LInner_jump
+       add     $tinptr,$tinptr,#16     // don't advance in last iteration
+.LInner_jump:
+       umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
+       umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
+       umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
+       umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+       ld1     {$Bi.2s},[$bnptr],#8    // pull smashed b[8*i+$i]
+       umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
+       umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
+       umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
+       umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
+       umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
+       umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
+       umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
+       umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
+       st1     {@ACC[0].2d},[$toutptr],#16
+___
+       push(@ACC,shift(@ACC));
+$code.=<<___;
+       umlal   @ACC[0].2d,$Bi.2s,$A0.s[0]
+       ld1     {@ACC[7].2d},[$tinptr]
+       umlal   @ACC[1].2d,$Bi.2s,$A0.s[1]
+       ld1     {$Ni.2s},[$bnptr],#8    // pull smashed m[8*i+$i]
+       umlal   @ACC[2].2d,$Bi.2s,$A0.s[2]
+       b.eq    .LInner_jump$i
+       add     $tinptr,$tinptr,#16     // don't advance in last iteration
+.LInner_jump$i:
+       umlal   @ACC[3].2d,$Bi.2s,$A0.s[3]
+       umlal   @ACC[4].2d,$Bi.2s,$A1.s[0]
+       umlal   @ACC[5].2d,$Bi.2s,$A1.s[1]
+       umlal   @ACC[6].2d,$Bi.2s,$A1.s[2]
+       umlal   @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+}
+$code.=<<___;
+       b.ne    .LInner_after_rewind$i
+       sub     $aptr,$aptr,$num,lsl#2  // rewind
+.LInner_after_rewind$i:
+       umlal   @ACC[0].2d,$Ni.2s,$N0.s[0]
+       ld1     {$Bi.2s},[sp]           // pull smashed b[8*i+0]
+       umlal   @ACC[1].2d,$Ni.2s,$N0.s[1]
+       ld1     {$A0.4s,$A1.4s},[$aptr],#32
+       umlal   @ACC[2].2d,$Ni.2s,$N0.s[2]
+       add     $bnptr,sp,#8            // rewind
+       umlal   @ACC[3].2d,$Ni.2s,$N0.s[3]
+       umlal   @ACC[4].2d,$Ni.2s,$N1.s[0]
+       umlal   @ACC[5].2d,$Ni.2s,$N1.s[1]
+       umlal   @ACC[6].2d,$Ni.2s,$N1.s[2]
+       st1     {@ACC[0].2d},[$toutptr],#16
+       umlal   @ACC[7].2d,$Ni.2s,$N1.s[3]
+
+       bne     .LNEON_8n_inner
+___
+       push(@ACC,shift(@ACC));
+$code.=<<___;
+       add     $tinptr,sp,#128
+       st1     {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
+       eor     $N0.16b,$N0.16b,$N0.16b // $N0
+       st1     {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
+       eor     $N1.16b,$N1.16b,$N1.16b // $N1
+       st1     {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
+       st1     {@ACC[6].2d},[$toutptr]
+
+       subs    $outer,$outer,#8
+       ld1     {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
+       ld1     {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
+       ld1     {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
+       ld1     {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
+
+       b.eq    .LInner_8n_jump_2steps
+       sub     $nptr,$nptr,$num,lsl#2  // rewind
+       b       .LNEON_8n_outer
+
+.LInner_8n_jump_2steps:
+       add     $toutptr,sp,#128
+       st1     {$N0.2d,$N1.2d}, [sp],#32       // start wiping stack frame
+       mov     $Temp.16b,@ACC[0].16b
+       ushr    $temp.2d,@ACC[0].2d,#16
+       ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+       st1     {$N0.2d,$N1.2d}, [sp],#32
+       add     @ACC[0].2d,@ACC[0].2d,$temp.2d
+       st1     {$N0.2d,$N1.2d}, [sp],#32
+       ushr    $temp.2d,@ACC[0].2d,#16
+       st1     {$N0.2d,$N1.2d}, [sp],#32
+       zip1    @ACC[0].4h,$Temp.4h,@ACC[0].4h
+       ins     $temp.d[1],$zero.d[0]
+
+       mov     $inner,$num
+       b       .LNEON_tail_entry
+
+.align 4
+.LNEON_tail:
+       add     @ACC[0].2d,@ACC[0].2d,$temp.2d
+       mov     $Temp.16b,@ACC[0].16b
+       ushr    $temp.2d,@ACC[0].2d,#16
+       ext     @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+       ld1     {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
+       add     @ACC[0].2d,@ACC[0].2d,$temp.2d
+       ld1     {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
+       ushr    $temp.2d,@ACC[0].2d,#16
+       ld1     {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
+       zip1    @ACC[0].4h,$Temp.4h,@ACC[0].4h
+       ins     $temp.d[1],$zero.d[0]
+
+.LNEON_tail_entry:
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+       add     @ACC[1].2d,@ACC[1].2d,$temp.2d
+       st1     {@ACC[0].s}[0], [$toutptr],#4
+       ushr    $temp.2d,@ACC[1].2d,#16
+       mov     $Temp.16b,@ACC[1].16b
+       ext     @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
+       add     @ACC[1].2d,@ACC[1].2d,$temp.2d
+       ushr    $temp.2d,@ACC[1].2d,#16
+       zip1    @ACC[1].4h,$Temp.4h,@ACC[1].4h
+       ins     $temp.d[1],$zero.d[0]
+___
+       push(@ACC,shift(@ACC));
+}
+       push(@ACC,shift(@ACC));
+$code.=<<___;
+       ld1     {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
+       subs    $inner,$inner,#8
+       st1     {@ACC[7].s}[0], [$toutptr],#4
+       bne     .LNEON_tail
+
+       st1     {$temp.s}[0], [$toutptr],#4     // top-most bit
+       sub     $nptr,$nptr,$num,lsl#2          // rewind $nptr
+       subs    $aptr,sp,#0                     // clear carry flag
+       add     $bptr,sp,$num,lsl#2
+
+.LNEON_sub:
+       ldp     w4,w5,[$aptr],#8
+       ldp     w6,w7,[$aptr],#8
+       ldp     w8,w9,[$nptr],#8
+       ldp     w10,w11,[$nptr],#8
+       sbcs    w8,w4,w8
+       sbcs    w9,w5,w9
+       sbcs    w10,w6,w10
+       sbcs    w11,w7,w11
+       sub     x17,$bptr,$aptr
+       stp     w8,w9,[$rptr],#8
+       stp     w10,w11,[$rptr],#8
+       cbnz    x17,.LNEON_sub
+
+       ldr     w10, [$aptr]            // load top-most bit
+       mov     x11,sp
+       eor     v0.16b,v0.16b,v0.16b
+       sub     x11,$bptr,x11           // this is num*4
+       eor     v1.16b,v1.16b,v1.16b
+       mov     $aptr,sp
+       sub     $rptr,$rptr,x11         // rewind $rptr
+       mov     $nptr,$bptr             // second 3/4th of frame
+       sbcs    w10,w10,wzr             // result is carry flag
+
+.LNEON_copy_n_zap:
+       ldp     w4,w5,[$aptr],#8
+       ldp     w6,w7,[$aptr],#8
+       ldp     w8,w9,[$rptr],#8
+       ldp     w10,w11,[$rptr]
+       sub     $rptr,$rptr,#8
+       b.cs    .LCopy_1
+       mov     w8,w4
+       mov     w9,w5
+       mov     w10,w6
+       mov     w11,w7
+.LCopy_1:
+       st1     {v0.2d,v1.2d}, [$nptr],#32              // wipe
+       st1     {v0.2d,v1.2d}, [$nptr],#32              // wipe
+       ldp     w4,w5,[$aptr],#8
+       ldp     w6,w7,[$aptr],#8
+       stp     w8,w9,[$rptr],#8
+       stp     w10,w11,[$rptr],#8
+       sub     $aptr,$aptr,#32
+       ldp     w8,w9,[$rptr],#8
+       ldp     w10,w11,[$rptr]
+       sub     $rptr,$rptr,#8
+       b.cs    .LCopy_2
+       mov     w8, w4
+       mov     w9, w5
+       mov     w10, w6
+       mov     w11, w7
+.LCopy_2:
+       st1     {v0.2d,v1.2d}, [$aptr],#32              // wipe
+       st1     {v0.2d,v1.2d}, [$nptr],#32              // wipe
+       sub     x17,$bptr,$aptr         // preserves carry
+       stp     w8,w9,[$rptr],#8
+       stp     w10,w11,[$rptr],#8
+       cbnz    x17,.LNEON_copy_n_zap
+
+       mov     sp,x16
+       ldp     d14,d15,[sp,#64]
+       ldp     d12,d13,[sp,#48]
+       ldp     d10,d11,[sp,#32]
+       ldp     d8,d9,[sp,#16]
+       ldr     x29,[sp],#80
+       ret                     // bx lr
+
+.size  bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+___
+}
+{
 ########################################################################
 # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
 
diff --git a/crypto/bn/build.info b/crypto/bn/build.info
index 3c32e83067..d0c1034bde 100644
--- a/crypto/bn/build.info
+++ b/crypto/bn/build.info
@@ -177,3 +177,4 @@ INCLUDE[armv4-mont.o]=..
 GENERATE[armv4-gf2m.S]=asm/armv4-gf2m.pl
 INCLUDE[armv4-gf2m.o]=..
 GENERATE[armv8-mont.S]=asm/armv8-mont.pl
+INCLUDE[armv8-mont.o]=..
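
One constant deserves a gloss: the n0 argument in the perlasm context above
(const BN_ULONG *n0) points at the Montgomery constant -n[0]^-1 mod 2^64,
which OpenSSL precomputes when the BN_MONT_CTX is set up. A minimal
illustrative derivation for one 64-bit word (not OpenSSL's code):

    #include <stdint.h>

    /* n0 = -n^-1 mod 2^64 for odd n, via Newton-Hensel iteration; each
     * step doubles the number of correct low-order bits of the inverse. */
    static uint64_t mont_n0(uint64_t n)
    {
        uint64_t x = n;             /* n*n == 1 (mod 8): 3 bits correct */
        for (int i = 0; i < 5; i++)
            x *= 2 - n * x;         /* 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits */
        return 0 - x;               /* so that n * n0 == -1 (mod 2^64) */
    }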
