Script 'mail_helper' called by obssrc Hello community, here is the log from the commit of package openssl-1_1 for openSUSE:Factory checked in at 2022-02-05 23:22:59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/openssl-1_1 (Old) and /work/SRC/openSUSE:Factory/.openssl-1_1.new.1898 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "openssl-1_1" Sat Feb 5 23:22:59 2022 rev:31 rq:951364 version:1.1.1m Changes: -------- --- /work/SRC/openSUSE:Factory/openssl-1_1/openssl-1_1.changes 2022-01-07 12:44:26.195764436 +0100 +++ /work/SRC/openSUSE:Factory/.openssl-1_1.new.1898/openssl-1_1.changes 2022-02-05 23:23:11.627920344 +0100 @@ -1,0 +2,21 @@ +Thu Feb 3 15:58:29 UTC 2022 - Pedro Monreal <pmonr...@suse.com> + +- Enable zlib compression support [bsc#1195149] + +------------------------------------------------------------------- +Fri Jan 28 17:33:51 UTC 2022 - Pedro Monreal <pmonr...@suse.com> + +- Backport cryptographic improvements from OpenSSL 3 [jsc#SLE-19742] + * Optimize RSA on armv8: openssl-1_1-Optimize-RSA-armv8.patch + * Optimize AES-XTS mode for aarch64: + openssl-1_1-Optimize-AES-XTS-aarch64.patch + * Optimize AES-GCM for uarchs with unroll and new instructions: + openssl-1_1-Optimize-AES-GCM-uarchs.patch + +------------------------------------------------------------------- +Fri Jan 28 17:33:23 UTC 2022 - Pedro Monreal <pmonr...@suse.com> + +- POWER10 performance enhancements for cryptography [jsc#SLE-18136] + * openssl-1_1-Optimize-ppc64.patch + +------------------------------------------------------------------- New: ---- openssl-1_1-Optimize-AES-GCM-uarchs.patch openssl-1_1-Optimize-AES-XTS-aarch64.patch openssl-1_1-Optimize-RSA-armv8.patch openssl-1_1-Optimize-ppc64.patch ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ openssl-1_1.spec ++++++ --- /var/tmp/diff_new_pack.Zh2R15/_old 2022-02-05 23:23:12.803912300 +0100 +++ /var/tmp/diff_new_pack.Zh2R15/_new 2022-02-05 23:23:12.807912273 +0100 @@ -1,7 +1,7 @@ # # spec file for package openssl-1_1 # -# Copyright (c) 2021 SUSE LLC +# Copyright (c) 2022 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -112,7 +112,14 @@ Patch55: openssl-1_1-disable-test_srp-sslapi.patch Patch56: openssl-add_rfc3526_rfc7919.patch Patch57: openssl-1_1-use-include-directive.patch +#PATCH-FIX-UPSTREAM jsc#SLE-18136 POWER10 performance enhancements for cryptography +Patch69: openssl-1_1-Optimize-ppc64.patch +#PATCH-FIX-UPSTREAM jsc#SLE-19742 Backport Arm improvements from OpenSSL 3 +Patch70: openssl-1_1-Optimize-RSA-armv8.patch +Patch71: openssl-1_1-Optimize-AES-XTS-aarch64.patch +Patch72: openssl-1_1-Optimize-AES-GCM-uarchs.patch BuildRequires: pkgconfig +BuildRequires: pkgconfig(zlib) %if 0%{?sle_version} >= 150400 || 0%{?suse_version} >= 1550 Requires: crypto-policies %endif @@ -154,6 +161,7 @@ License: OpenSSL Group: Development/Libraries/C and C++ Requires: libopenssl1_1 = %{version} +Requires: pkgconfig(zlib) Recommends: %{name} = %{version} # we need to have around only the exact version we are able to operate with Conflicts: libopenssl-devel < %{version} @@ -214,6 +222,7 @@ enable-ec_nistp_64_gcc_128 \ %endif enable-camellia \ + zlib \ no-ec2m \ --prefix=%{_prefix} \ --libdir=%{_lib} \ @@ -245,6 +254,7 @@ export MALLOC_CHECK_=3 export MALLOC_PERTURB_=$(($RANDOM % 255 + 1)) #export HARNESS_VERBOSE=1 +#export OPENSSL_FORCE_FIPS_MODE=1 LD_LIBRARY_PATH=`pwd` make test -j1 # show ciphers @@ -330,7 +340,7 @@ # invalidates a HMAC that may have been created earlier. # solution: create the hashes _after_ the macro runs. # -# this shows up earlier because otherwise the %%expand of +# this shows up earlier because otherwise the expand of # the macro is too late. # remark: This is the same as running # openssl dgst -sha256 -hmac 'ppaksykemnsecgtsttplmamstKMEs' ++++++ openssl-1_1-Optimize-AES-GCM-uarchs.patch ++++++ ++++ 7710 lines (skipped) ++++++ openssl-1_1-Optimize-AES-XTS-aarch64.patch ++++++ ++++ 1617 lines (skipped) ++++++ openssl-1_1-Optimize-RSA-armv8.patch ++++++ >From 5ea64b456b1a27ae046f23d632a968a7583bb9eb Mon Sep 17 00:00:00 2001 From: "Fangming.Fang" <fangming.f...@arm.com> Date: Tue, 28 Apr 2020 02:33:50 +0000 Subject: [PATCH] Read MIDR_EL1 system register on aarch64 MIDR_EL1 system register exposes microarchitecture information so that people can make micro-arch related optimization such as exposing as much instruction level parallelism as possible. MIDR_EL1 register can be read only if HWCAP_CPUID feature is supported. Change-Id: Iabb8a36c5d31b184dba6399f378598058d394d4e Reviewed-by: Paul Dale <paul.d...@oracle.com> Reviewed-by: Tomas Mraz <tm...@fedoraproject.org> (Merged from https://github.com/openssl/openssl/pull/11744) --- crypto/arm64cpuid.pl | 7 +++++++ crypto/arm_arch.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ crypto/armcap.c | 11 +++++++++++ 3 files changed, 62 insertions(+) Index: openssl-1.1.1d/crypto/arm64cpuid.pl =================================================================== --- openssl-1.1.1d.orig/crypto/arm64cpuid.pl +++ openssl-1.1.1d/crypto/arm64cpuid.pl @@ -78,6 +78,13 @@ _armv8_sha512_probe: ret .size _armv8_sha512_probe,.-_armv8_sha512_probe +.globl _armv8_cpuid_probe +.type _armv8_cpuid_probe,%function +_armv8_cpuid_probe: + mrs x0, midr_el1 + ret +.size _armv8_cpuid_probe,.-_armv8_cpuid_probe + .globl OPENSSL_cleanse .type OPENSSL_cleanse,%function .align 5 Index: openssl-1.1.1d/crypto/arm_arch.h =================================================================== --- openssl-1.1.1d.orig/crypto/arm_arch.h +++ openssl-1.1.1d/crypto/arm_arch.h @@ -71,6 +71,7 @@ # ifndef __ASSEMBLER__ extern unsigned int OPENSSL_armcap_P; +extern unsigned int OPENSSL_arm_midr; # endif # define ARMV7_NEON (1<<0) @@ -80,5 +81,48 @@ extern unsigned int OPENSSL_armcap_P; # define ARMV8_SHA256 (1<<4) # define ARMV8_PMULL (1<<5) # define ARMV8_SHA512 (1<<6) +# define ARMV8_CPUID (1<<7) +/* + * MIDR_EL1 system register + * + * 63___ _ ___32_31___ _ ___24_23_____20_19_____16_15__ _ __4_3_______0 + * | | | | | | | + * |RES0 | Implementer | Variant | Arch | PartNum |Revision| + * |____ _ _____|_____ _ _____|_________|_______ _|____ _ ___|________| + * + */ + +# define ARM_CPU_IMP_ARM 0x41 + +# define ARM_CPU_PART_CORTEX_A72 0xD08 +# define ARM_CPU_PART_N1 0xD0C + +# define MIDR_PARTNUM_SHIFT 4 +# define MIDR_PARTNUM_MASK (0xfff << MIDR_PARTNUM_SHIFT) +# define MIDR_PARTNUM(midr) \ + (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT) + +# define MIDR_IMPLEMENTER_SHIFT 24 +# define MIDR_IMPLEMENTER_MASK (0xff << MIDR_IMPLEMENTER_SHIFT) +# define MIDR_IMPLEMENTER(midr) \ + (((midr) & MIDR_IMPLEMENTER_MASK) >> MIDR_IMPLEMENTER_SHIFT) + +# define MIDR_ARCHITECTURE_SHIFT 16 +# define MIDR_ARCHITECTURE_MASK (0xf << MIDR_ARCHITECTURE_SHIFT) +# define MIDR_ARCHITECTURE(midr) \ + (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT) + +# define MIDR_CPU_MODEL_MASK \ + (MIDR_IMPLEMENTER_MASK | \ + MIDR_PARTNUM_MASK | \ + MIDR_ARCHITECTURE_MASK) + +# define MIDR_CPU_MODEL(imp, partnum) \ + (((imp) << MIDR_IMPLEMENTER_SHIFT) | \ + (0xf << MIDR_ARCHITECTURE_SHIFT) | \ + ((partnum) << MIDR_PARTNUM_SHIFT)) + +# define MIDR_IS_CPU_MODEL(midr, imp, partnum) \ + (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum)) #endif Index: openssl-1.1.1d/crypto/armcap.c =================================================================== --- openssl-1.1.1d.orig/crypto/armcap.c +++ openssl-1.1.1d/crypto/armcap.c @@ -18,6 +18,8 @@ #include "arm_arch.h" unsigned int OPENSSL_armcap_P = 0; +unsigned int OPENSSL_arm_midr = 0; +unsigned int OPENSSL_armv8_rsa_neonized = 0; #if __ARM_MAX_ARCH__<7 void OPENSSL_cpuid_setup(void) @@ -48,6 +50,7 @@ void _armv8_sha256_probe(void); void _armv8_pmull_probe(void); # ifdef __aarch64__ void _armv8_sha512_probe(void); +unsigned int _armv8_cpuid_probe(void); # endif uint32_t _armv7_tick(void); @@ -95,6 +98,7 @@ void OPENSSL_cpuid_setup(void) __attribu # define HWCAP_CE_PMULL (1 << 4) # define HWCAP_CE_SHA1 (1 << 5) # define HWCAP_CE_SHA256 (1 << 6) +# define HWCAP_CPUID (1 << 11) # define HWCAP_CE_SHA512 (1 << 21) # endif @@ -155,6 +159,9 @@ void OPENSSL_cpuid_setup(void) # ifdef __aarch64__ if (hwcap & HWCAP_CE_SHA512) OPENSSL_armcap_P |= ARMV8_SHA512; + + if (hwcap & HWCAP_CPUID) + OPENSSL_armcap_P |= ARMV8_CPUID; # endif } # endif @@ -210,5 +217,16 @@ void OPENSSL_cpuid_setup(void) sigaction(SIGILL, &ill_oact, NULL); sigprocmask(SIG_SETMASK, &oset, NULL); + +# ifdef __aarch64__ + if (OPENSSL_armcap_P & ARMV8_CPUID) + OPENSSL_arm_midr = _armv8_cpuid_probe(); + + if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) || + MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)) && + (OPENSSL_armcap_P & ARMV7_NEON)) { + OPENSSL_armv8_rsa_neonized = 1; + } +# endif } #endif Index: openssl-1.1.1d/crypto/bn/asm/armv8-mont.pl =================================================================== --- openssl-1.1.1d.orig/crypto/bn/asm/armv8-mont.pl +++ openssl-1.1.1d/crypto/bn/asm/armv8-mont.pl @@ -64,16 +64,34 @@ $n0="x4"; # const BN_ULONG *n0, $num="x5"; # int num); $code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +.extern OPENSSL_armv8_rsa_neonized +.hidden OPENSSL_armv8_rsa_neonized +#endif .text .globl bn_mul_mont .type bn_mul_mont,%function .align 5 bn_mul_mont: +.Lbn_mul_mont: + tst $num,#3 + b.ne .Lmul_mont + cmp $num,#32 + b.le .Lscalar_impl +#ifndef __KERNEL__ + adrp x17,OPENSSL_armv8_rsa_neonized + ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized] + cbnz w17, bn_mul8x_mont_neon +#endif + +.Lscalar_impl: tst $num,#7 b.eq __bn_sqr8x_mont tst $num,#3 b.eq __bn_mul4x_mont + .Lmul_mont: stp x29,x30,[sp,#-64]! add x29,sp,#0 @@ -271,6 +289,369 @@ bn_mul_mont: .size bn_mul_mont,.-bn_mul_mont ___ { +my ($A0,$A1,$N0,$N1)=map("v$_",(0..3)); +my ($Z,$Temp)=("v4.16b","v5"); +my @ACC=map("v$_",(6..13)); +my ($Bi,$Ni,$M0)=map("v$_",(28..30)); +my $sBi="s28"; +my $sM0="s30"; +my $zero="v14"; +my $temp="v15"; +my $ACCTemp="v16"; + +my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5)); +my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11)); + +$code.=<<___; +.type bn_mul8x_mont_neon,%function +.align 5 +bn_mul8x_mont_neon: + stp x29,x30,[sp,#-80]! + mov x16,sp + stp d8,d9,[sp,#16] + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + lsl $num,$num,#1 + eor $zero.16b,$zero.16b,$zero.16b + +.align 4 +.LNEON_8n: + eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b + sub $toutptr,sp,#128 + eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b + sub $toutptr,$toutptr,$num,lsl#4 + eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b + and $toutptr,$toutptr,#-64 + eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b + mov sp,$toutptr // alloca + eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b + add $toutptr,$toutptr,#256 + eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b + sub $inner,$num,#8 + eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b + eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b + +.LNEON_8n_init: + st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32 + subs $inner,$inner,#8 + st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32 + st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32 + st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32 + bne .LNEON_8n_init + + add $tinptr,sp,#256 + ld1 {$A0.4s,$A1.4s},[$aptr],#32 + add $bnptr,sp,#8 + ldr $sM0,[$n0],#4 + mov $outer,$num + b .LNEON_8n_outer + +.align 4 +.LNEON_8n_outer: + ldr $sBi,[$bptr],#4 // *b++ + uxtl $Bi.4s,$Bi.4h + add $toutptr,sp,#128 + ld1 {$N0.4s,$N1.4s},[$nptr],#32 + + umlal @ACC[0].2d,$Bi.2s,$A0.s[0] + umlal @ACC[1].2d,$Bi.2s,$A0.s[1] + umlal @ACC[2].2d,$Bi.2s,$A0.s[2] + shl $Ni.2d,@ACC[0].2d,#16 + ext $Ni.16b,$Ni.16b,$Ni.16b,#8 + umlal @ACC[3].2d,$Bi.2s,$A0.s[3] + add $Ni.2d,$Ni.2d,@ACC[0].2d + umlal @ACC[4].2d,$Bi.2s,$A1.s[0] + mul $Ni.2s,$Ni.2s,$M0.2s + umlal @ACC[5].2d,$Bi.2s,$A1.s[1] + st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0] + umlal @ACC[6].2d,$Bi.2s,$A1.s[2] + uxtl $Ni.4s,$Ni.4h + umlal @ACC[7].2d,$Bi.2s,$A1.s[3] +___ +for ($i=0; $i<7;) { +$code.=<<___; + ldr $sBi,[$bptr],#4 // *b++ + umlal @ACC[0].2d,$Ni.2s,$N0.s[0] + umlal @ACC[1].2d,$Ni.2s,$N0.s[1] + uxtl $Bi.4s,$Bi.4h + umlal @ACC[2].2d,$Ni.2s,$N0.s[2] + ushr $temp.2d,@ACC[0].2d,#16 + umlal @ACC[3].2d,$Ni.2s,$N0.s[3] + umlal @ACC[4].2d,$Ni.2s,$N1.s[0] + ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 + add @ACC[0].2d,@ACC[0].2d,$temp.2d + umlal @ACC[5].2d,$Ni.2s,$N1.s[1] + ushr @ACC[0].2d,@ACC[0].2d,#16 + umlal @ACC[6].2d,$Ni.2s,$N1.s[2] + umlal @ACC[7].2d,$Ni.2s,$N1.s[3] + add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d + ins @ACC[1].d[0],$ACCTemp.d[0] + st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i] +___ + push(@ACC,shift(@ACC)); $i++; +$code.=<<___; + umlal @ACC[0].2d,$Bi.2s,$A0.s[0] + ld1 {@ACC[7].2d},[$tinptr],#16 + umlal @ACC[1].2d,$Bi.2s,$A0.s[1] + umlal @ACC[2].2d,$Bi.2s,$A0.s[2] + shl $Ni.2d,@ACC[0].2d,#16 + ext $Ni.16b,$Ni.16b,$Ni.16b,#8 + umlal @ACC[3].2d,$Bi.2s,$A0.s[3] + add $Ni.2d,$Ni.2d,@ACC[0].2d + umlal @ACC[4].2d,$Bi.2s,$A1.s[0] + mul $Ni.2s,$Ni.2s,$M0.2s + umlal @ACC[5].2d,$Bi.2s,$A1.s[1] + st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i] + umlal @ACC[6].2d,$Bi.2s,$A1.s[2] + uxtl $Ni.4s,$Ni.4h + umlal @ACC[7].2d,$Bi.2s,$A1.s[3] +___ +} +$code.=<<___; + ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0] + umlal @ACC[0].2d,$Ni.2s,$N0.s[0] + ld1 {$A0.4s,$A1.4s},[$aptr],#32 + umlal @ACC[1].2d,$Ni.2s,$N0.s[1] + umlal @ACC[2].2d,$Ni.2s,$N0.s[2] + mov $Temp.16b,@ACC[0].16b + ushr $Temp.2d,$Temp.2d,#16 + ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 + umlal @ACC[3].2d,$Ni.2s,$N0.s[3] + umlal @ACC[4].2d,$Ni.2s,$N1.s[0] + add @ACC[0].2d,@ACC[0].2d,$Temp.2d + umlal @ACC[5].2d,$Ni.2s,$N1.s[1] + ushr @ACC[0].2d,@ACC[0].2d,#16 + eor $temp.16b,$temp.16b,$temp.16b + ins @ACC[0].d[1],$temp.d[0] + umlal @ACC[6].2d,$Ni.2s,$N1.s[2] + umlal @ACC[7].2d,$Ni.2s,$N1.s[3] + add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d + st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i] + add $bnptr,sp,#8 // rewind +___ + push(@ACC,shift(@ACC)); +$code.=<<___; + sub $inner,$num,#8 + b .LNEON_8n_inner + +.align 4 +.LNEON_8n_inner: + subs $inner,$inner,#8 + umlal @ACC[0].2d,$Bi.2s,$A0.s[0] + ld1 {@ACC[7].2d},[$tinptr] + umlal @ACC[1].2d,$Bi.2s,$A0.s[1] + ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0] + umlal @ACC[2].2d,$Bi.2s,$A0.s[2] + ld1 {$N0.4s,$N1.4s},[$nptr],#32 + umlal @ACC[3].2d,$Bi.2s,$A0.s[3] + b.eq .LInner_jump + add $tinptr,$tinptr,#16 // don't advance in last iteration +.LInner_jump: + umlal @ACC[4].2d,$Bi.2s,$A1.s[0] + umlal @ACC[5].2d,$Bi.2s,$A1.s[1] + umlal @ACC[6].2d,$Bi.2s,$A1.s[2] + umlal @ACC[7].2d,$Bi.2s,$A1.s[3] +___ +for ($i=1; $i<8; $i++) { +$code.=<<___; + ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i] + umlal @ACC[0].2d,$Ni.2s,$N0.s[0] + umlal @ACC[1].2d,$Ni.2s,$N0.s[1] + umlal @ACC[2].2d,$Ni.2s,$N0.s[2] + umlal @ACC[3].2d,$Ni.2s,$N0.s[3] + umlal @ACC[4].2d,$Ni.2s,$N1.s[0] + umlal @ACC[5].2d,$Ni.2s,$N1.s[1] + umlal @ACC[6].2d,$Ni.2s,$N1.s[2] + umlal @ACC[7].2d,$Ni.2s,$N1.s[3] + st1 {@ACC[0].2d},[$toutptr],#16 +___ + push(@ACC,shift(@ACC)); +$code.=<<___; + umlal @ACC[0].2d,$Bi.2s,$A0.s[0] + ld1 {@ACC[7].2d},[$tinptr] + umlal @ACC[1].2d,$Bi.2s,$A0.s[1] + ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i] + umlal @ACC[2].2d,$Bi.2s,$A0.s[2] + b.eq .LInner_jump$i + add $tinptr,$tinptr,#16 // don't advance in last iteration +.LInner_jump$i: + umlal @ACC[3].2d,$Bi.2s,$A0.s[3] + umlal @ACC[4].2d,$Bi.2s,$A1.s[0] + umlal @ACC[5].2d,$Bi.2s,$A1.s[1] + umlal @ACC[6].2d,$Bi.2s,$A1.s[2] + umlal @ACC[7].2d,$Bi.2s,$A1.s[3] +___ +} +$code.=<<___; + b.ne .LInner_after_rewind$i + sub $aptr,$aptr,$num,lsl#2 // rewind +.LInner_after_rewind$i: + umlal @ACC[0].2d,$Ni.2s,$N0.s[0] + ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0] + umlal @ACC[1].2d,$Ni.2s,$N0.s[1] + ld1 {$A0.4s,$A1.4s},[$aptr],#32 + umlal @ACC[2].2d,$Ni.2s,$N0.s[2] + add $bnptr,sp,#8 // rewind + umlal @ACC[3].2d,$Ni.2s,$N0.s[3] + umlal @ACC[4].2d,$Ni.2s,$N1.s[0] + umlal @ACC[5].2d,$Ni.2s,$N1.s[1] + umlal @ACC[6].2d,$Ni.2s,$N1.s[2] + st1 {@ACC[0].2d},[$toutptr],#16 + umlal @ACC[7].2d,$Ni.2s,$N1.s[3] + + bne .LNEON_8n_inner +___ + push(@ACC,shift(@ACC)); +$code.=<<___; + add $tinptr,sp,#128 + st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32 + eor $N0.16b,$N0.16b,$N0.16b // $N0 + st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32 + eor $N1.16b,$N1.16b,$N1.16b // $N1 + st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32 + st1 {@ACC[6].2d},[$toutptr] + + subs $outer,$outer,#8 + ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32 + ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32 + ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32 + ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32 + + b.eq .LInner_8n_jump_2steps + sub $nptr,$nptr,$num,lsl#2 // rewind + b .LNEON_8n_outer + +.LInner_8n_jump_2steps: + add $toutptr,sp,#128 + st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame + mov $Temp.16b,@ACC[0].16b + ushr $temp.2d,@ACC[0].2d,#16 + ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 + st1 {$N0.2d,$N1.2d}, [sp],#32 + add @ACC[0].2d,@ACC[0].2d,$temp.2d + st1 {$N0.2d,$N1.2d}, [sp],#32 + ushr $temp.2d,@ACC[0].2d,#16 + st1 {$N0.2d,$N1.2d}, [sp],#32 + zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h + ins $temp.d[1],$zero.d[0] + + mov $inner,$num + b .LNEON_tail_entry + +.align 4 +.LNEON_tail: + add @ACC[0].2d,@ACC[0].2d,$temp.2d + mov $Temp.16b,@ACC[0].16b + ushr $temp.2d,@ACC[0].2d,#16 + ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 + ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32 + add @ACC[0].2d,@ACC[0].2d,$temp.2d + ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32 + ushr $temp.2d,@ACC[0].2d,#16 + ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32 + zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h + ins $temp.d[1],$zero.d[0] + +.LNEON_tail_entry: +___ +for ($i=1; $i<8; $i++) { +$code.=<<___; + add @ACC[1].2d,@ACC[1].2d,$temp.2d + st1 {@ACC[0].s}[0], [$toutptr],#4 + ushr $temp.2d,@ACC[1].2d,#16 + mov $Temp.16b,@ACC[1].16b + ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8 + add @ACC[1].2d,@ACC[1].2d,$temp.2d + ushr $temp.2d,@ACC[1].2d,#16 + zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h + ins $temp.d[1],$zero.d[0] +___ + push(@ACC,shift(@ACC)); +} + push(@ACC,shift(@ACC)); +$code.=<<___; + ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32 + subs $inner,$inner,#8 + st1 {@ACC[7].s}[0], [$toutptr],#4 + bne .LNEON_tail + + st1 {$temp.s}[0], [$toutptr],#4 // top-most bit + sub $nptr,$nptr,$num,lsl#2 // rewind $nptr + subs $aptr,sp,#0 // clear carry flag + add $bptr,sp,$num,lsl#2 + +.LNEON_sub: + ldp w4,w5,[$aptr],#8 + ldp w6,w7,[$aptr],#8 + ldp w8,w9,[$nptr],#8 + ldp w10,w11,[$nptr],#8 + sbcs w8,w4,w8 + sbcs w9,w5,w9 + sbcs w10,w6,w10 + sbcs w11,w7,w11 + sub x17,$bptr,$aptr + stp w8,w9,[$rptr],#8 + stp w10,w11,[$rptr],#8 + cbnz x17,.LNEON_sub + + ldr w10, [$aptr] // load top-most bit + mov x11,sp + eor v0.16b,v0.16b,v0.16b + sub x11,$bptr,x11 // this is num*4 + eor v1.16b,v1.16b,v1.16b + mov $aptr,sp + sub $rptr,$rptr,x11 // rewind $rptr + mov $nptr,$bptr // second 3/4th of frame + sbcs w10,w10,wzr // result is carry flag + +.LNEON_copy_n_zap: + ldp w4,w5,[$aptr],#8 + ldp w6,w7,[$aptr],#8 + ldp w8,w9,[$rptr],#8 + ldp w10,w11,[$rptr] + sub $rptr,$rptr,#8 + b.cs .LCopy_1 + mov w8,w4 + mov w9,w5 + mov w10,w6 + mov w11,w7 +.LCopy_1: + st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe + st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe + ldp w4,w5,[$aptr],#8 + ldp w6,w7,[$aptr],#8 + stp w8,w9,[$rptr],#8 + stp w10,w11,[$rptr],#8 + sub $aptr,$aptr,#32 + ldp w8,w9,[$rptr],#8 + ldp w10,w11,[$rptr] + sub $rptr,$rptr,#8 + b.cs .LCopy_2 + mov w8, w4 + mov w9, w5 + mov w10, w6 + mov w11, w7 +.LCopy_2: + st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe + st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe + sub x17,$bptr,$aptr // preserves carry + stp w8,w9,[$rptr],#8 + stp w10,w11,[$rptr],#8 + cbnz x17,.LNEON_copy_n_zap + + mov sp,x16 + ldp d14,d15,[sp,#64] + ldp d12,d13,[sp,#48] + ldp d10,d11,[sp,#32] + ldp d8,d9,[sp,#16] + ldr x29,[sp],#80 + ret // bx lr + +.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon +___ +} +{ ######################################################################## # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module. Index: openssl-1.1.1d/crypto/bn/build.info =================================================================== --- openssl-1.1.1d.orig/crypto/bn/build.info +++ openssl-1.1.1d/crypto/bn/build.info @@ -65,3 +65,4 @@ INCLUDE[armv4-mont.o]=.. GENERATE[armv4-gf2m.S]=asm/armv4-gf2m.pl $(PERLASM_SCHEME) INCLUDE[armv4-gf2m.o]=.. GENERATE[armv8-mont.S]=asm/armv8-mont.pl $(PERLASM_SCHEME) +INCLUDE[armv8-mont.o]=.. ++++++ openssl-1_1-Optimize-ppc64.patch ++++++ ++++ 2309 lines (skipped)