The branch master has been updated via 0e5c8d566029a995d1fa23cac0f828c39519d97f (commit) via c521e4392f8423af4bc64cdee57495e39e4e7a07 (commit) via 42efffcb70f9805e76c72005d91e5b4ce6979a5f (commit) via 127d6cf747652a182636597cbdf4d0d49d2e661a (commit) from 59bf467ccaff27ab2ffe7243b3a334675fea8ed9 (commit)
- Log ----------------------------------------------------------------- commit 0e5c8d566029a995d1fa23cac0f828c39519d97f Author: Andy Polyakov <ap...@openssl.org> Date: Mon Feb 19 15:19:49 2018 +0100 Configure: engage x25519 assembly support. Reviewed-by: Rich Salz <rs...@openssl.org> Reviewed-by: Paul Dale <paul.d...@oracle.com> (Merged from https://github.com/openssl/openssl/pull/5408) commit c521e4392f8423af4bc64cdee57495e39e4e7a07 Author: Andy Polyakov <ap...@openssl.org> Date: Mon Feb 19 15:17:30 2018 +0100 ec/curve25519.c: facilitate assembly implementations. Currently it's limited to 64-bit platforms only as minimum radix expected in assembly is 2^51. Reviewed-by: Rich Salz <rs...@openssl.org> Reviewed-by: Paul Dale <paul.d...@oracle.com> (Merged from https://github.com/openssl/openssl/pull/5408) commit 42efffcb70f9805e76c72005d91e5b4ce6979a5f Author: Andy Polyakov <ap...@openssl.org> Date: Mon Feb 19 15:09:33 2018 +0100 Add x25519-x86_64.pl module, mod 2^255-19 primitives. Reviewed-by: Rich Salz <rs...@openssl.org> Reviewed-by: Paul Dale <paul.d...@oracle.com> (Merged from https://github.com/openssl/openssl/pull/5408) commit 127d6cf747652a182636597cbdf4d0d49d2e661a Author: Andy Polyakov <ap...@openssl.org> Date: Mon Feb 19 15:00:59 2018 +0100 crypto/ec/curve25519.c: remove redundant fe[51]_cswap. 3 least significant bits of the input scalar are explicitly cleared, hence swap variable has fixed value [of zero] upon exit from the loop. Reviewed-by: Rich Salz <rs...@openssl.org> Reviewed-by: Paul Dale <paul.d...@oracle.com> (Merged from https://github.com/openssl/openssl/pull/5408) ----------------------------------------------------------------------- Summary of changes: Configurations/00-base-templates.conf | 2 +- Configure | 3 + crypto/ec/asm/x25519-x86_64.pl | 816 ++++++++++++++++++++++++++++++++++ crypto/ec/build.info | 2 + crypto/ec/curve25519.c | 323 ++++++++++++-- 5 files changed, 1109 insertions(+), 37 deletions(-) create mode 100755 crypto/ec/asm/x25519-x86_64.pl diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf index 951aeaa..99c271e 100644 --- a/Configurations/00-base-templates.conf +++ b/Configurations/00-base-templates.conf @@ -203,7 +203,7 @@ my %targets=( template => 1, cpuid_asm_src => "x86_64cpuid.s", bn_asm_src => "asm/x86_64-gcc.c x86_64-mont.s x86_64-mont5.s x86_64-gf2m.s rsaz_exp.c rsaz-x86_64.s rsaz-avx2.s", - ec_asm_src => "ecp_nistz256.c ecp_nistz256-x86_64.s", + ec_asm_src => "ecp_nistz256.c ecp_nistz256-x86_64.s x25519-x86_64.s", aes_asm_src => "aes-x86_64.s vpaes-x86_64.s bsaes-x86_64.s aesni-x86_64.s aesni-sha1-x86_64.s aesni-sha256-x86_64.s aesni-mb-x86_64.s", md5_asm_src => "md5-x86_64.s", sha1_asm_src => "sha1-x86_64.s sha256-x86_64.s sha512-x86_64.s sha1-mb-x86_64.s sha256-mb-x86_64.s", diff --git a/Configure b/Configure index 0cc554f..05618ed 100755 --- a/Configure +++ b/Configure @@ -1341,6 +1341,9 @@ unless ($disabled{asm}) { if ($target{ec_asm_src} =~ /ecp_nistz256/) { push @{$config{defines}}, "ECP_NISTZ256_ASM"; } + if ($target{ec_asm_src} =~ /x25519/) { + push @{$config{defines}}, "X25519_ASM"; + } if ($target{padlock_asm_src} ne $table{DEFAULTS}->{padlock_asm_src}) { push @{$config{defines}}, "PADLOCK_ASM"; } diff --git a/crypto/ec/asm/x25519-x86_64.pl b/crypto/ec/asm/x25519-x86_64.pl new file mode 100755 index 0000000..6c0f738 --- /dev/null +++ b/crypto/ec/asm/x25519-x86_64.pl @@ -0,0 +1,816 @@ +#!/usr/bin/env perl +# Copyright 2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# ==================================================================== +# Written by Andy Polyakov <ap...@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# X25519 lower-level primitives for x86_86. +# +# February 2018. +# +# This module implements radix 2^51 multiplication and squaring, and +# radix 2^64 multiplication, squaring, addition, subtraction and final +# reduction. Latter radix is used on ADCX/ADOX-capable processors such +# as Broadwell. On related note one should mention that there are +# vector implementations that provide significantly better performance +# on some processors(*), but they are large and overly complex. Which +# in combination with them being effectively processor-specific makes +# the undertaking hard to justify. The goal for this implementation +# is rather versatility and simplicity [and ultimately formal +# verification]. +# +# (*) For example sandy2x should provide ~30% improvement on Sandy +# Bridge, but only nominal ~5% on Haswell [and big loss on +# Broadwell and successors]. +# +###################################################################### +# Improvement coefficients: +# +# amd64-51(*) gcc-5.x(**) +# +# P4 +22% +40% +# Sandy Bridge -3% +11% +# Haswell -1% +13% +# Broadwell(***) +26% +30% +# Skylake(***) +30% +47% +# Silvermont +20% +26% +# Goldmont +40% +50% +# Bulldozer +20% +9% +# Ryzen(***) +35% +32% +# VIA +170% +120% +# +# (*) amd64-51 is popular assembly implementation with 2^51 radix, +# only multiplication and squaring subroutines were linked +# for comparison, but not complete ladder step; gain on most +# processors is because this module refrains from shld, and +# minor regression on others is because this does result in +# higher instruction count; +# (**) compiler is free to inline functions, in assembly one would +# need to implement ladder step to do that, and it will improve +# performance by several percent; +# (***) ADCX/ADOX result for 2^64 radix, there is no corresponding +# C implementation, so that comparison is always against +# 2^51 radix; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $addx = ($1>=2.23); +} + +if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $addx = ($1>=2.10); +} + +if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $addx = ($1>=12); +} + +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) { + my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 + $addx = ($ver>=3.03); +} + +$code.=<<___; +.text + +.globl x25519_fe51_mul +.type x25519_fe51_mul,\@function,3 +.align 32 +x25519_fe51_mul: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + lea -8*5(%rsp),%rsp + + mov 8*0(%rsi),%rax # f[0] + mov 8*0(%rdx),%r11 # load g[0-4] + mov 8*1(%rdx),%r12 + mov 8*2(%rdx),%r13 + mov 8*3(%rdx),%rbp + mov 8*4(%rdx),%r14 + + mov %rdi,8*4(%rsp) # offload 1st argument + mov %rax,%rdi + mulq %r11 # f[0]*g[0] + mov %r11,8*0(%rsp) # offload g[0] + mov %rax,%rbx # %rbx:%rcx = h0 + mov %rdi,%rax + mov %rdx,%rcx + mulq %r12 # f[0]*g[1] + mov %r12,8*1(%rsp) # offload g[1] + mov %rax,%r8 # %r8:%r9 = h1 + mov %rdi,%rax + lea (%r14,%r14,8),%r15 + mov %rdx,%r9 + mulq %r13 # f[0]*g[2] + mov %r13,8*2(%rsp) # offload g[2] + mov %rax,%r10 # %r10:%r11 = h2 + mov %rdi,%rax + lea (%r14,%r15,2),%rdi # g[4]*19 + mov %rdx,%r11 + mulq %rbp # f[0]*g[3] + mov %rax,%r12 # %r12:%r13 = h3 + mov 8*0(%rsi),%rax # f[0] + mov %rdx,%r13 + mulq %r14 # f[0]*g[4] + mov %rax,%r14 # %r14:%r15 = h4 + mov 8*1(%rsi),%rax # f[1] + mov %rdx,%r15 + + mulq %rdi # f[1]*g[4]*19 + add %rax,%rbx + mov 8*2(%rsi),%rax # f[2] + adc %rdx,%rcx + mulq %rdi # f[2]*g[4]*19 + add %rax,%r8 + mov 8*3(%rsi),%rax # f[3] + adc %rdx,%r9 + mulq %rdi # f[3]*g[4]*19 + add %rax,%r10 + mov 8*4(%rsi),%rax # f[4] + adc %rdx,%r11 + mulq %rdi # f[4]*g[4]*19 + imulq \$19,%rbp,%rdi # g[3]*19 + add %rax,%r12 + mov 8*1(%rsi),%rax # f[1] + adc %rdx,%r13 + mulq %rbp # f[1]*g[3] + mov 8*2(%rsp),%rbp # g[2] + add %rax,%r14 + mov 8*2(%rsi),%rax # f[2] + adc %rdx,%r15 + + mulq %rdi # f[2]*g[3]*19 + add %rax,%rbx + mov 8*3(%rsi),%rax # f[3] + adc %rdx,%rcx + mulq %rdi # f[3]*g[3]*19 + add %rax,%r8 + mov 8*4(%rsi),%rax # f[4] + adc %rdx,%r9 + mulq %rdi # f[4]*g[3]*19 + imulq \$19,%rbp,%rdi # g[2]*19 + add %rax,%r10 + mov 8*1(%rsi),%rax # f[1] + adc %rdx,%r11 + mulq %rbp # f[1]*g[2] + add %rax,%r12 + mov 8*2(%rsi),%rax # f[2] + adc %rdx,%r13 + mulq %rbp # f[2]*g[2] + mov 8*1(%rsp),%rbp # g[1] + add %rax,%r14 + mov 8*3(%rsi),%rax # f[3] + adc %rdx,%r15 + + mulq %rdi # f[3]*g[2]*19 + add %rax,%rbx + mov 8*4(%rsi),%rax # f[3] + adc %rdx,%rcx + mulq %rdi # f[4]*g[2]*19 + add %rax,%r8 + mov 8*1(%rsi),%rax # f[1] + adc %rdx,%r9 + mulq %rbp # f[1]*g[1] + imulq \$19,%rbp,%rdi + add %rax,%r10 + mov 8*2(%rsi),%rax # f[2] + adc %rdx,%r11 + mulq %rbp # f[2]*g[1] + add %rax,%r12 + mov 8*3(%rsi),%rax # f[3] + adc %rdx,%r13 + mulq %rbp # f[3]*g[1] + mov 8*0(%rsp),%rbp # g[0] + add %rax,%r14 + mov 8*4(%rsi),%rax # f[4] + adc %rdx,%r15 + + mulq %rdi # f[4]*g[1]*19 + add %rax,%rbx + mov 8*1(%rsi),%rax # f[1] + adc %rdx,%rcx + mul %rbp # f[1]*g[0] + add %rax,%r8 + mov 8*2(%rsi),%rax # f[2] + adc %rdx,%r9 + mul %rbp # f[2]*g[0] + add %rax,%r10 + mov 8*3(%rsi),%rax # f[3] + adc %rdx,%r11 + mul %rbp # f[3]*g[0] + add %rax,%r12 + mov 8*4(%rsi),%rax # f[4] + adc %rdx,%r13 + mulq %rbp # f[4]*g[0] + add %rax,%r14 + adc %rdx,%r15 + + mov 8*4(%rsp),%rdi # restore 1st argument + jmp .Lreduce51 +.size x25519_fe51_mul,.-x25519_fe51_mul + +.globl x25519_fe51_sqr +.type x25519_fe51_sqr,\@function,2 +.align 32 +x25519_fe51_sqr: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + lea -8*5(%rsp),%rsp + + mov 8*0(%rsi),%rax # g[0] + mov 8*2(%rsi),%r15 # g[2] + mov 8*4(%rsi),%rbp # g[4] + + mov %rdi,8*4(%rsp) # offload 1st argument + lea (%rax,%rax),%r14 + mulq %rax # g[0]*g[0] + mov %rax,%rbx + mov 8*1(%rsi),%rax # g[1] + mov %rdx,%rcx + mulq %r14 # 2*g[0]*g[1] + mov %rax,%r8 + mov %r15,%rax + mov %r15,8*0(%rsp) # offload g[2] + mov %rdx,%r9 + mulq %r14 # 2*g[0]*g[2] + mov %rax,%r10 + mov 8*3(%rsi),%rax + mov %rdx,%r11 + imulq \$19,%rbp,%rdi # g[4]*19 + mulq %r14 # 2*g[0]*g[3] + mov %rax,%r12 + mov %rbp,%rax + mov %rdx,%r13 + mulq %r14 # 2*g[0]*g[4] + mov %rax,%r14 + mov %rbp,%rax + mov %rdx,%r15 + + mulq %rdi # g[4]*g[4]*19 + add %rax,%r12 + mov 8*1(%rsi),%rax # g[1] + adc %rdx,%r13 + + mov 8*3(%rsi),%rsi # g[3] + lea (%rax,%rax),%rbp + mulq %rax # g[1]*g[1] + add %rax,%r10 + mov 8*0(%rsp),%rax # g[2] + adc %rdx,%r11 + mulq %rbp # 2*g[1]*g[2] + add %rax,%r12 + mov %rbp,%rax + adc %rdx,%r13 + mulq %rsi # 2*g[1]*g[3] + add %rax,%r14 + mov %rbp,%rax + adc %rdx,%r15 + imulq \$19,%rsi,%rbp # g[3]*19 + mulq %rdi # 2*g[1]*g[4]*19 + add %rax,%rbx + lea (%rsi,%rsi),%rax + adc %rdx,%rcx + + mulq %rdi # 2*g[3]*g[4]*19 + add %rax,%r10 + mov %rsi,%rax + adc %rdx,%r11 + mulq %rbp # g[3]*g[3]*19 + add %rax,%r8 + mov 8*0(%rsp),%rax # g[2] + adc %rdx,%r9 + + lea (%rax,%rax),%rsi + mulq %rax # g[2]*g[2] + add %rax,%r14 + mov %rbp,%rax + adc %rdx,%r15 + mulq %rsi # 2*g[2]*g[3]*19 + add %rax,%rbx + mov %rsi,%rax + adc %rdx,%rcx + mulq %rdi # 2*g[2]*g[4]*19 + add %rax,%r8 + adc %rdx,%r9 + + mov 8*4(%rsp),%rdi # restore 1st argument + jmp .Lreduce51 + +.align 32 +.Lreduce51: + mov \$0x7ffffffffffff,%rbp + + mov %r10,%rdx + shr \$51,%r10 + shl \$13,%r11 + and %rbp,%rdx # %rdx = g2 = h2 & mask + or %r10,%r11 # h2>>51 + add %r11,%r12 + adc \$0,%r13 # h3 += h2>>51 + + mov %rbx,%rax + shr \$51,%rbx + shl \$13,%rcx + and %rbp,%rax # %rax = g0 = h0 & mask + or %rbx,%rcx # h0>>51 + add %rcx,%r8 # h1 += h0>>51 + adc \$0,%r9 + + mov %r12,%rbx + shr \$51,%r12 + shl \$13,%r13 + and %rbp,%rbx # %rbx = g3 = h3 & mask + or %r12,%r13 # h3>>51 + add %r13,%r14 # h4 += h3>>51 + adc \$0,%r15 + + mov %r8,%rcx + shr \$51,%r8 + shl \$13,%r9 + and %rbp,%rcx # %rcx = g1 = h1 & mask + or %r8,%r9 + add %r9,%rdx # g2 += h1>>51 + + mov %r14,%r10 + shr \$51,%r14 + shl \$13,%r15 + and %rbp,%r10 # %r10 = g4 = h0 & mask + or %r14,%r15 # h0>>51 + + lea (%r15,%r15,8),%r14 + lea (%r15,%r14,2),%r15 + add %r15,%rax # g0 += (h0>>51)*19 + + mov %rdx,%r8 + and %rbp,%rdx # g2 &= mask + shr \$51,%r8 + add %r8,%rbx # g3 += g2>>51 + + mov %rax,%r9 + and %rbp,%rax # g0 &= mask + shr \$51,%r9 + add %r9,%rcx # g1 += g0>>51 + + mov %rax,8*0(%rdi) # save the result + mov %rcx,8*1(%rdi) + mov %rdx,8*2(%rdi) + mov %rbx,8*3(%rdi) + mov %r10,8*4(%rdi) + + mov 8*5(%rsp),%r15 + mov 8*6(%rsp),%r14 + mov 8*7(%rsp),%r13 + mov 8*8(%rsp),%r12 + mov 8*9(%rsp),%rbx + mov 8*10(%rsp),%rbp + lea 8*11(%rsp),%rsp + ret +.size x25519_fe51_sqr,.-x25519_fe51_sqr + +.globl x25519_fe51_mul121666 +.type x25519_fe51_mul121666,\@function,2 +.align 32 +x25519_fe51_mul121666: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + mov \$121666,%eax + lea -8*5(%rsp),%rsp + + mulq 8*0(%rsi) + mov %rax,%rbx # %rbx:%rcx = h0 + mov \$121666,%eax + mov %rdx,%rcx + mulq 8*1(%rsi) + mov %rax,%r8 # %r8:%r9 = h1 + mov \$121666,%eax + mov %rdx,%r9 + mulq 8*2(%rsi) + mov %rax,%r10 # %r10:%r11 = h2 + mov \$121666,%eax + mov %rdx,%r11 + mulq 8*3(%rsi) + mov %rax,%r12 # %r12:%r13 = h3 + mov \$121666,%eax # f[0] + mov %rdx,%r13 + mulq 8*4(%rsi) + mov %rax,%r14 # %r14:%r15 = h4 + mov %rdx,%r15 + + jmp .Lreduce51 +.size x25519_fe51_mul121666,.-x25519_fe51_mul121666 +___ +######################################################################## +# Base 2^64 subroutines modulo 2*(2^255-19) +# +if ($addx) { +my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15)); + +$code.=<<___; +.extern OPENSSL_ia32cap_P +.globl x25519_fe64_eligible +.type x25519_fe64_eligible,\@abi-omnipotent +.align 32 +x25519_fe64_eligible: + mov OPENSSL_ia32cap_P+8(%rip),%ecx + xor %eax,%eax + and \$0x80100,%ecx + cmp \$0x80100,%ecx + cmove %ecx,%eax + ret +.size x25519_fe64_eligible,.-x25519_fe64_eligible + +.globl x25519_fe64_mul +.type x25519_fe64_mul,\@function,3 +.align 32 +x25519_fe64_mul: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + push %rdi # offload dst + lea -8*2(%rsp),%rsp + + mov %rdx,%rax + mov 8*0(%rdx),%rbp # b[0] + mov 8*0(%rsi),%rdx # a[0] + mov 8*1(%rax),%rcx # b[1] + mov 8*2(%rax),$acc6 # b[2] + mov 8*3(%rax),$acc7 # b[3] + + mulx %rbp,$acc0,%rax # a[0]*b[0] + xor %edi,%edi # cf=0,of=0 + mulx %rcx,$acc1,%rbx # a[0]*b[1] + adcx %rax,$acc1 + mulx $acc6,$acc2,%rax # a[0]*b[2] + adcx %rbx,$acc2 + mulx $acc7,$acc3,$acc4 # a[0]*b[3] + mov 8*1(%rsi),%rdx # a[1] + adcx %rax,$acc3 + mov $acc6,(%rsp) # offload b[2] + adcx %rdi,$acc4 # cf=0 + + mulx %rbp,%rax,%rbx # a[1]*b[0] + adox %rax,$acc1 + adcx %rbx,$acc2 + mulx %rcx,%rax,%rbx # a[1]*b[1] + adox %rax,$acc2 + adcx %rbx,$acc3 + mulx $acc6,%rax,%rbx # a[1]*b[2] + adox %rax,$acc3 + adcx %rbx,$acc4 + mulx $acc7,%rax,$acc5 # a[1]*b[3] + mov 8*2(%rsi),%rdx # a[2] + adox %rax,$acc4 + adcx %rdi,$acc5 # cf=0 + adox %rdi,$acc5 # of=0 + + mulx %rbp,%rax,%rbx # a[2]*b[0] + adcx %rax,$acc2 + adox %rbx,$acc3 + mulx %rcx,%rax,%rbx # a[2]*b[1] + adcx %rax,$acc3 + adox %rbx,$acc4 + mulx $acc6,%rax,%rbx # a[2]*b[2] + adcx %rax,$acc4 + adox %rbx,$acc5 + mulx $acc7,%rax,$acc6 # a[2]*b[3] + mov 8*3(%rsi),%rdx # a[3] + adcx %rax,$acc5 + adox %rdi,$acc6 # of=0 + adcx %rdi,$acc6 # cf=0 + + mulx %rbp,%rax,%rbx # a[3]*b[0] + adox %rax,$acc3 + adcx %rbx,$acc4 + mulx %rcx,%rax,%rbx # a[3]*b[1] + adox %rax,$acc4 + adcx %rbx,$acc5 + mulx (%rsp),%rax,%rbx # a[3]*b[2] + adox %rax,$acc5 + adcx %rbx,$acc6 + mulx $acc7,%rax,$acc7 # a[3]*b[3] + mov \$38,%edx + adox %rax,$acc6 + adcx %rdi,$acc7 # cf=0 + adox %rdi,$acc7 # of=0 + + jmp .Lreduce64 +.size x25519_fe64_mul,.-x25519_fe64_mul + +.globl x25519_fe64_sqr +.type x25519_fe64_sqr,\@function,2 +.align 32 +x25519_fe64_sqr: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + push %rdi # offload dst + lea -8*2(%rsp),%rsp + + mov 8*0(%rsi),%rdx # a[0] + mov 8*1(%rsi),%rcx # a[1] + mov 8*2(%rsi),%rbp # a[2] + mov 8*3(%rsi),%rsi # a[3] + + ################################################################ + mulx %rdx,$acc0,$acc7 # a[0]*a[0] + mulx %rcx,$acc1,%rax # a[0]*a[1] + xor %edi,%edi # cf=0,of=0 + mulx %rbp,$acc2,%rbx # a[0]*a[2] + adcx %rax,$acc2 + mulx %rsi,$acc3,$acc4 # a[0]*a[3] + mov %rcx,%rdx # a[1] + adcx %rbx,$acc3 + adcx %rdi,$acc4 # cf=0 + + ################################################################ + mulx %rbp,%rax,%rbx # a[1]*a[2] + adox %rax,$acc3 + adcx %rbx,$acc4 + mulx %rsi,%rax,$acc5 # a[1]*a[3] + mov %rbp,%rdx # a[2] + adox %rax,$acc4 + adcx %rdi,$acc5 + + ################################################################ + mulx %rsi,%rax,$acc6 # a[2]*a[3] + mov %rcx,%rdx # a[1] + adox %rax,$acc5 + adcx %rdi,$acc6 # cf=0 + adox %rdi,$acc6 # of=0 + + adcx $acc1,$acc1 # acc1:6<<1 + adox $acc7,$acc1 + adcx $acc2,$acc2 + mulx %rdx,%rax,%rbx # a[1]*a[1] + mov %rbp,%rdx # a[2] + adcx $acc3,$acc3 + adox %rax,$acc2 + adcx $acc4,$acc4 + adox %rbx,$acc3 + mulx %rdx,%rax,%rbx # a[2]*a[2] + mov %rsi,%rdx # a[3] + adcx $acc5,$acc5 + adox %rax,$acc4 + adcx $acc6,$acc6 + adox %rbx,$acc5 + mulx %rdx,%rax,$acc7 # a[3]*a[3] + mov \$38,%edx + adox %rax,$acc6 + adcx %rdi,$acc7 # cf=0 + adox %rdi,$acc7 # of=0 + jmp .Lreduce64 + +.align 32 +.Lreduce64: + mulx $acc4,%rax,%rbx + adcx %rax,$acc0 + adox %rbx,$acc1 + mulx $acc5,%rax,%rbx + adcx %rax,$acc1 + adox %rbx,$acc2 + mulx $acc6,%rax,%rbx + adcx %rax,$acc2 + adox %rbx,$acc3 + mulx $acc7,%rax,$acc4 + adcx %rax,$acc3 + adox %rdi,$acc4 + adcx %rdi,$acc4 + + mov 8*2(%rsp),%rdi # restore dst + imulq %rdx,$acc4 + + add $acc4,$acc0 + adc \$0,$acc1 + adc \$0,$acc2 + adc \$0,$acc3 + + sbb %rax,%rax # cf -> mask + and \$38,%rax + + add %rax,$acc0 + adc \$0,$acc1 + mov $acc0,8*0(%rdi) + adc \$0,$acc2 + mov $acc1,8*1(%rdi) + adc \$0,$acc3 + mov $acc2,8*2(%rdi) + mov $acc3,8*3(%rdi) + + mov 8*3(%rsp),%r15 + mov 8*4(%rsp),%r14 + mov 8*5(%rsp),%r13 + mov 8*6(%rsp),%r12 + mov 8*7(%rsp),%rbx + mov 8*8(%rsp),%rbp + lea 8*9(%rsp),%rsp + ret +.size x25519_fe64_sqr,.-x25519_fe64_sqr + +.globl x25519_fe64_mul121666 +.type x25519_fe64_mul121666,\@function,2 +.align 32 +x25519_fe64_mul121666: + mov \$121666,%edx + mulx 8*0(%rsi),$acc0,%rcx + mulx 8*1(%rsi),$acc1,%rax + add %rcx,$acc1 + mulx 8*2(%rsi),$acc2,%rcx + adc %rax,$acc2 + mulx 8*3(%rsi),$acc3,%rax + adc %rcx,$acc3 + adc \$0,%rax + + imulq \$38,%rax,%rax + + add %rax,$acc0 + adc \$0,$acc1 + adc \$0,$acc2 + adc \$0,$acc3 + + sbb %rax,%rax # cf -> mask + and \$38,%rax + + add %rax,$acc0 + adc \$0,$acc1 + mov $acc0,8*0(%rdi) + adc \$0,$acc2 + mov $acc1,8*1(%rdi) + adc \$0,$acc3 + mov $acc2,8*2(%rdi) + mov $acc3,8*3(%rdi) + + ret +.size x25519_fe64_mul121666,.-x25519_fe64_mul121666 + +.globl x25519_fe64_add +.type x25519_fe64_add,\@function,3 +.align 32 +x25519_fe64_add: + mov 8*0(%rsi),$acc0 + mov 8*1(%rsi),$acc1 + mov 8*2(%rsi),$acc2 + mov 8*3(%rsi),$acc3 + + add 8*0(%rdx),$acc0 + adc 8*1(%rdx),$acc1 + adc 8*2(%rdx),$acc2 + adc 8*3(%rdx),$acc3 + + sbb %rax,%rax # cf -> mask + and \$38,%rax + + add %rax,$acc0 + adc \$0,$acc1 + mov $acc0,8*0(%rdi) + adc \$0,$acc2 + mov $acc1,8*1(%rdi) + adc \$0,$acc3 + mov $acc2,8*2(%rdi) + mov $acc3,8*3(%rdi) + + ret +.size x25519_fe64_add,.-x25519_fe64_add + +.globl x25519_fe64_sub +.type x25519_fe64_sub,\@function,3 +.align 32 +x25519_fe64_sub: + mov 8*0(%rsi),$acc0 + mov 8*1(%rsi),$acc1 + mov 8*2(%rsi),$acc2 + mov 8*3(%rsi),$acc3 + + sub 8*0(%rdx),$acc0 + sbb 8*1(%rdx),$acc1 + sbb 8*2(%rdx),$acc2 + sbb 8*3(%rdx),$acc3 + + sbb %rax,%rax # cf -> mask + and \$38,%rax + + sub %rax,$acc0 + sbb \$0,$acc1 + mov $acc0,8*0(%rdi) + sbb \$0,$acc2 + mov $acc1,8*1(%rdi) + sbb \$0,$acc3 + mov $acc2,8*2(%rdi) + mov $acc3,8*3(%rdi) + + ret +.size x25519_fe64_sub,.-x25519_fe64_sub + +.globl x25519_fe64_tobytes +.type x25519_fe64_tobytes,\@function,2 +.align 32 +x25519_fe64_tobytes: + mov 8*0(%rsi),$acc0 + mov 8*1(%rsi),$acc1 + mov 8*2(%rsi),$acc2 + mov 8*3(%rsi),$acc3 + + ################################# reduction modulo 2^255-19 + lea ($acc3,$acc3),%rax + sar \$63,$acc3 # most significant bit -> mask + shr \$1,%rax # most significant bit cleared + and \$19,$acc3 + + add $acc3,$acc0 + adc \$0,$acc1 + adc \$0,$acc2 + adc \$0,%rax + + lea (%rax,%rax),$acc3 + sar \$63,%rax # most significant bit -> mask + shr \$1,$acc3 # most significant bit cleared + and \$19,%rax + + add %rax,$acc0 + adc \$0,$acc1 + adc \$0,$acc2 + adc \$0,$acc3 + + mov $acc0,8*0(%rdi) + mov $acc1,8*1(%rdi) + mov $acc2,8*2(%rdi) + mov $acc3,8*3(%rdi) + + ret +.size x25519_fe64_tobytes,.-x25519_fe64_tobytes +___ +} else { +$code.=<<___; +.globl x25519_fe64_eligible +.type x25519_fe64_eligible,\@function +.align 32 +x25519_fe64_eligible: + xor %eax,%eax + ret +.size x25519_fe64_eligible,.-x25519_fe64_eligible + +.globl x25519_fe64_mul +.globl x25519_fe64_sqr +.globl x25519_fe64_mul121666 +.globl x25519_fe64_add +.globl x25519_fe64_sub +.globl x25519_fe64_tobytes +x25519_fe64_mul: +x25519_fe64_sqr: +x25519_fe64_mul121666: +x25519_fe64_add: +x25519_fe64_sub: +x25519_fe64_sub: +x25519_fe64_tobytes: + .byte 0x0f,0x0b # ud2 +___ +} +$code.=<<___; +.asciz "X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close $STDOUT; diff --git a/crypto/ec/build.info b/crypto/ec/build.info index b6549a1..1e7814f 100644 --- a/crypto/ec/build.info +++ b/crypto/ec/build.info @@ -26,6 +26,8 @@ GENERATE[ecp_nistz256-armv8.S]=asm/ecp_nistz256-armv8.pl $(PERLASM_SCHEME) INCLUDE[ecp_nistz256-armv8.o]=.. GENERATE[ecp_nistz256-ppc64.s]=asm/ecp_nistz256-ppc64.pl $(PERLASM_SCHEME) +GENERATE[x25519-x86_64.s]=asm/x25519-x86_64.pl $(PERLASM_SCHEME) + BEGINRAW[Makefile] {- $builddir -}/ecp_nistz256-%.S: {- $sourcedir -}/asm/ecp_nistz256-%.pl CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@ diff --git a/crypto/ec/curve25519.c b/crypto/ec/curve25519.c index 2c0bd5f..f354107 100644 --- a/crypto/ec/curve25519.c +++ b/crypto/ec/curve25519.c @@ -11,16 +11,19 @@ #include "ec_lcl.h" #include <openssl/sha.h> -#if !defined(PEDANTIC) && \ - !defined(__sparc__) && \ - (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16) +#if defined(X25519_ASM) \ + || ( !defined(PEDANTIC) && \ + !defined(__sparc__) && \ + (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16) ) /* * Base 2^51 implementation. */ # define BASE_2_51_IMPLEMENTED typedef uint64_t fe51[5]; +# if !defined(X25519_ASM) typedef unsigned __int128 u128; +# endif static const uint64_t MASK51 = 0x7ffffffffffff; @@ -132,6 +135,250 @@ static void fe51_tobytes(uint8_t *s, const fe51 h) s[31] = h4 >> 44; } +# ifdef X25519_ASM +void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g); +void x25519_fe51_sqr(fe51 h, const fe51 f); +void x25519_fe51_mul121666(fe51 h, fe51 f); +# define fe51_mul x25519_fe51_mul +# define fe51_sq x25519_fe51_sqr +# define fe51_mul121666 x25519_fe51_mul121666 + +# if defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_AMD64) || defined(_M_X64) + +# define BASE_2_64_IMPLEMENTED + +typedef uint64_t fe64[4]; + +int x25519_fe64_eligible(); + +/* + * There are no reference C implementations for this radix. + */ +void x25519_fe64_mul(fe64 h, const fe64 f, const fe64 g); +void x25519_fe64_sqr(fe64 h, const fe64 f); +void x25519_fe64_mul121666(fe64 h, fe64 f); +void x25519_fe64_add(fe64 h, const fe64 f, const fe64 g); +void x25519_fe64_sub(fe64 h, const fe64 f, const fe64 g); +void x25519_fe64_tobytes(uint8_t *s, const fe64 f); +# define fe64_mul x25519_fe64_mul +# define fe64_sqr x25519_fe64_sqr +# define fe64_mul121666 x25519_fe64_mul121666 +# define fe64_add x25519_fe64_add +# define fe64_sub x25519_fe64_sub +# define fe64_tobytes x25519_fe64_tobytes + +static uint64_t load_8(const uint8_t *in) +{ + uint64_t result; + + result = in[0]; + result |= ((uint64_t)in[1]) << 8; + result |= ((uint64_t)in[2]) << 16; + result |= ((uint64_t)in[3]) << 24; + result |= ((uint64_t)in[4]) << 32; + result |= ((uint64_t)in[5]) << 40; + result |= ((uint64_t)in[6]) << 48; + result |= ((uint64_t)in[7]) << 56; + + return result; +} + +static void fe64_frombytes(fe64 h, const uint8_t *s) +{ + h[0] = load_8(s); + h[1] = load_8(s + 8); + h[2] = load_8(s + 16); + h[3] = load_8(s + 24) & 0x7fffffffffffffff; +} + +static void fe64_0(fe64 h) +{ + h[0] = 0; + h[1] = 0; + h[2] = 0; + h[3] = 0; +} + +static void fe64_1(fe64 h) +{ + h[0] = 1; + h[1] = 0; + h[2] = 0; + h[3] = 0; +} + +static void fe64_copy(fe64 h, const fe64 f) +{ + h[0] = f[0]; + h[1] = f[1]; + h[2] = f[2]; + h[3] = f[3]; +} + +static void fe64_cswap(fe64 f, fe64 g, unsigned int b) +{ + int i; + uint64_t mask = 0 - (uint64_t)b; + + for (i = 0; i < 4; i++) { + uint64_t x = f[i] ^ g[i]; + x &= mask; + f[i] ^= x; + g[i] ^= x; + } +} + +static void fe64_invert(fe64 out, const fe64 z) +{ + fe64 t0; + fe64 t1; + fe64 t2; + fe64 t3; + int i; + + /* + * Compute z ** -1 = z ** (2 ** 255 - 19 - 2) with the exponent as + * 2 ** 255 - 21 = (2 ** 5) * (2 ** 250 - 1) + 11. + */ + + /* t0 = z ** 2 */ + fe64_sqr(t0, z); + + /* t1 = t0 ** (2 ** 2) = z ** 8 */ + fe64_sqr(t1, t0); + fe64_sqr(t1, t1); + + /* t1 = z * t1 = z ** 9 */ + fe64_mul(t1, z, t1); + /* t0 = t0 * t1 = z ** 11 -- stash t0 away for the end. */ + fe64_mul(t0, t0, t1); + + /* t2 = t0 ** 2 = z ** 22 */ + fe64_sqr(t2, t0); + + /* t1 = t1 * t2 = z ** (2 ** 5 - 1) */ + fe64_mul(t1, t1, t2); + + /* t2 = t1 ** (2 ** 5) = z ** ((2 ** 5) * (2 ** 5 - 1)) */ + fe64_sqr(t2, t1); + for (i = 1; i < 5; ++i) + fe64_sqr(t2, t2); + + /* t1 = t1 * t2 = z ** ((2 ** 5 + 1) * (2 ** 5 - 1)) = z ** (2 ** 10 - 1) */ + fe64_mul(t1, t2, t1); + + /* Continuing similarly... */ + + /* t2 = z ** (2 ** 20 - 1) */ + fe64_sqr(t2, t1); + for (i = 1; i < 10; ++i) + fe64_sqr(t2, t2); + + fe64_mul(t2, t2, t1); + + /* t2 = z ** (2 ** 40 - 1) */ + fe64_sqr(t3, t2); + for (i = 1; i < 20; ++i) + fe64_sqr(t3, t3); + + fe64_mul(t2, t3, t2); + + /* t2 = z ** (2 ** 10) * (2 ** 40 - 1) */ + for (i = 0; i < 10; ++i) + fe64_sqr(t2, t2); + + /* t1 = z ** (2 ** 50 - 1) */ + fe64_mul(t1, t2, t1); + + /* t2 = z ** (2 ** 100 - 1) */ + fe64_sqr(t2, t1); + for (i = 1; i < 50; ++i) + fe64_sqr(t2, t2); + + fe64_mul(t2, t2, t1); + + /* t2 = z ** (2 ** 200 - 1) */ + fe64_sqr(t3, t2); + for (i = 1; i < 100; ++i) + fe64_sqr(t3, t3); + + fe64_mul(t2, t3, t2); + + /* t2 = z ** ((2 ** 50) * (2 ** 200 - 1) */ + for (i = 0; i < 50; ++i) + fe64_sqr(t2, t2); + + /* t1 = z ** (2 ** 250 - 1) */ + fe64_mul(t1, t2, t1); + + /* t1 = z ** ((2 ** 5) * (2 ** 250 - 1)) */ + for (i = 0; i < 5; ++i) + fe64_sqr(t1, t1); + + /* Recall t0 = z ** 11; out = z ** (2 ** 255 - 21) */ + fe64_mul(out, t1, t0); +} + +/* + * Duplicate of original x25519_scalar_mult_generic, but using + * fe64_* subroutines. + */ +static void x25519_scalar_mulx(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]) +{ + fe64 x1, x2, z2, x3, z3, tmp0, tmp1; + uint8_t e[32]; + unsigned swap = 0; + int pos; + + memcpy(e, scalar, 32); + e[0] &= 0xf8; + e[31] &= 0x7f; + e[31] |= 0x40; + fe64_frombytes(x1, point); + fe64_1(x2); + fe64_0(z2); + fe64_copy(x3, x1); + fe64_1(z3); + + for (pos = 254; pos >= 0; --pos) { + unsigned int b = 1 & (e[pos / 8] >> (pos & 7)); + + swap ^= b; + fe64_cswap(x2, x3, swap); + fe64_cswap(z2, z3, swap); + swap = b; + fe64_sub(tmp0, x3, z3); + fe64_sub(tmp1, x2, z2); + fe64_add(x2, x2, z2); + fe64_add(z2, x3, z3); + fe64_mul(z3, x2, tmp0); + fe64_mul(z2, z2, tmp1); + fe64_sqr(tmp0, tmp1); + fe64_sqr(tmp1, x2); + fe64_add(x3, z3, z2); + fe64_sub(z2, z3, z2); + fe64_mul(x2, tmp1, tmp0); + fe64_sub(tmp1, tmp1, tmp0); + fe64_sqr(z2, z2); + fe64_mul121666(z3, tmp1); + fe64_sqr(x3, x3); + fe64_add(tmp0, tmp0, z3); + fe64_mul(z3, x1, z2); + fe64_mul(z2, tmp1, tmp0); + } + + fe64_invert(z2, z2); + fe64_mul(x2, x2, z2); + fe64_tobytes(out, x2); + + OPENSSL_cleanse(e, sizeof(e)); +} +# endif + +# else + static void fe51_mul(fe51 h, const fe51 f, const fe51 g) { u128 h0, h1, h2, h3, h4; @@ -192,9 +439,9 @@ static void fe51_mul(fe51 h, const fe51 f, const fe51 g) static void fe51_sq(fe51 h, const fe51 f) { -# if defined(OPENSSL_SMALL_FOOTPRINT) +# if defined(OPENSSL_SMALL_FOOTPRINT) fe51_mul(h, f, f); -# else +# else /* dedicated squaring gives 16-25% overall improvement */ uint64_t g0 = f[0]; uint64_t g1 = f[1]; @@ -241,9 +488,36 @@ static void fe51_sq(fe51 h, const fe51 f) h[2] = g2; h[3] = g3; h[4] = g4; -# endif +# endif } +static void fe51_mul121666(fe51 h, fe51 f) +{ + u128 h0 = f[0] * (u128)121666; + u128 h1 = f[1] * (u128)121666; + u128 h2 = f[2] * (u128)121666; + u128 h3 = f[3] * (u128)121666; + u128 h4 = f[4] * (u128)121666; + uint64_t g0, g1, g2, g3, g4; + + h3 += (uint64_t)(h2 >> 51); g2 = (uint64_t)h2 & MASK51; + h1 += (uint64_t)(h0 >> 51); g0 = (uint64_t)h0 & MASK51; + + h4 += (uint64_t)(h3 >> 51); g3 = (uint64_t)h3 & MASK51; + g2 += (uint64_t)(h1 >> 51); g1 = (uint64_t)h1 & MASK51; + + g0 += (uint64_t)(h4 >> 51) * 19; g4 = (uint64_t)h4 & MASK51; + g3 += g2 >> 51; g2 &= MASK51; + g1 += g0 >> 51; g0 &= MASK51; + + h[0] = g0; + h[1] = g1; + h[2] = g2; + h[3] = g3; + h[4] = g4; +} +# endif + static void fe51_add(fe51 h, const fe51 f, const fe51 g) { h[0] = f[0] + g[0]; @@ -397,32 +671,6 @@ static void fe51_invert(fe51 out, const fe51 z) fe51_mul(out, t1, t0); } -static void fe51_mul121666(fe51 h, fe51 f) -{ - u128 h0 = f[0] * (u128)121666; - u128 h1 = f[1] * (u128)121666; - u128 h2 = f[2] * (u128)121666; - u128 h3 = f[3] * (u128)121666; - u128 h4 = f[4] * (u128)121666; - uint64_t g0, g1, g2, g3, g4; - - h3 += (uint64_t)(h2 >> 51); g2 = (uint64_t)h2 & MASK51; - h1 += (uint64_t)(h0 >> 51); g0 = (uint64_t)h0 & MASK51; - - h4 += (uint64_t)(h3 >> 51); g3 = (uint64_t)h3 & MASK51; - g2 += (uint64_t)(h1 >> 51); g1 = (uint64_t)h1 & MASK51; - - g0 += (uint64_t)(h4 >> 51) * 19; g4 = (uint64_t)h4 & MASK51; - g3 += g2 >> 51; g2 &= MASK51; - g1 += g0 >> 51; g0 &= MASK51; - - h[0] = g0; - h[1] = g1; - h[2] = g2; - h[3] = g3; - h[4] = g4; -} - /* * Duplicate of original x25519_scalar_mult_generic, but using * fe51_* subroutines. @@ -435,6 +683,13 @@ static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32], unsigned swap = 0; int pos; +# ifdef BASE_2_64_IMPLEMENTED + if (x25519_fe64_eligible()) { + x25519_scalar_mulx(out, scalar, point); + return; + } +# endif + memcpy(e, scalar, 32); e[0] &= 0xf8; e[31] &= 0x7f; @@ -471,8 +726,6 @@ static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32], fe51_mul(z3, x1, z2); fe51_mul(z2, tmp1, tmp0); } - fe51_cswap(x2, x3, swap); - fe51_cswap(z2, z3, swap); fe51_invert(z2, z2); fe51_mul(x2, x2, z2); @@ -4050,8 +4303,6 @@ static void x25519_scalar_mult_generic(uint8_t out[32], fe_mul(z3, x1, z2); fe_mul(z2, tmp1, tmp0); } - fe_cswap(x2, x3, swap); - fe_cswap(z2, z3, swap); fe_invert(z2, z2); fe_mul(x2, x2, z2); _____ openssl-commits mailing list To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-commits