Module Name: src
Committed By: joerg
Date: Sat May 16 22:23:31 UTC 2015
Modified Files:
src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64: Makefile
Added Files:
src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64:
x86_64-gf2m.S x86_64-mont.S x86_64-mont5.S
Log Message:
Find CPU-specific variants of the long number routines. Regenerate.
To generate a diff of this commit:
cvs rdiff -u -r1.7 -r1.8 \
src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile
cvs rdiff -u -r0 -r1.1 \
src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-gf2m.S \
src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont.S \
src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont5.S
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile:1.7 src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile:1.8
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile:1.7 Sat May 16 19:08:37 2015
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile Sat May 16 22:23:31 2015
@@ -1,4 +1,4 @@
-# $NetBSD: Makefile,v 1.7 2015/05/16 19:08:37 joerg Exp $
+# $NetBSD: Makefile,v 1.8 2015/05/16 22:23:31 joerg Exp $
.include "bsd.own.mk"
@@ -11,6 +11,7 @@ CC+= -fno-integrated-as
regen:
for i in $$(find ${OPENSSLSRC} -name \*${MACHINE_ARCH}.pl) \
+ $$(find ${OPENSSLSRC}/crypto/bn/asm -name ${MACHINE_ARCH}-\*.pl) \
${OPENSSLSRC}/crypto/${MACHINE_ARCH}cpuid.pl ; do \
(echo "#include <machine/asm.h>"; CC=${CC:Q} perl $$i elf | sed \
-e 's/\(OPENSSL[A-Za-z0-9_+]*\)(%rip)/\1@GOTPCREL(%rip)/' \
Added files:
Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-gf2m.S
diff -u /dev/null src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-gf2m.S:1.1
--- /dev/null Sat May 16 22:23:31 2015
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-gf2m.S Sat May 16 22:23:31 2015
@@ -0,0 +1,292 @@
+#include <machine/asm.h>
+.text
+
+.type _mul_1x1,@function
+.align 16
+_mul_1x1:
+ subq $128+8,%rsp
+ movq $-1,%r9
+ leaq (%rax,%rax,1),%rsi
+ shrq $3,%r9
+ leaq (,%rax,4),%rdi
+ andq %rax,%r9
+ leaq (,%rax,8),%r12
+ sarq $63,%rax
+ leaq (%r9,%r9,1),%r10
+ sarq $63,%rsi
+ leaq (,%r9,4),%r11
+ andq %rbp,%rax
+ sarq $63,%rdi
+ movq %rax,%rdx
+ shlq $63,%rax
+ andq %rbp,%rsi
+ shrq $1,%rdx
+ movq %rsi,%rcx
+ shlq $62,%rsi
+ andq %rbp,%rdi
+ shrq $2,%rcx
+ xorq %rsi,%rax
+ movq %rdi,%rbx
+ shlq $61,%rdi
+ xorq %rcx,%rdx
+ shrq $3,%rbx
+ xorq %rdi,%rax
+ xorq %rbx,%rdx
+
+ movq %r9,%r13
+ movq $0,0(%rsp)
+ xorq %r10,%r13
+ movq %r9,8(%rsp)
+ movq %r11,%r14
+ movq %r10,16(%rsp)
+ xorq %r12,%r14
+ movq %r13,24(%rsp)
+
+ xorq %r11,%r9
+ movq %r11,32(%rsp)
+ xorq %r11,%r10
+ movq %r9,40(%rsp)
+ xorq %r11,%r13
+ movq %r10,48(%rsp)
+ xorq %r14,%r9
+ movq %r13,56(%rsp)
+ xorq %r14,%r10
+
+ movq %r12,64(%rsp)
+ xorq %r14,%r13
+ movq %r9,72(%rsp)
+ xorq %r11,%r9
+ movq %r10,80(%rsp)
+ xorq %r11,%r10
+ movq %r13,88(%rsp)
+
+ xorq %r11,%r13
+ movq %r14,96(%rsp)
+ movq %r8,%rsi
+ movq %r9,104(%rsp)
+ andq %rbp,%rsi
+ movq %r10,112(%rsp)
+ shrq $4,%rbp
+ movq %r13,120(%rsp)
+ movq %r8,%rdi
+ andq %rbp,%rdi
+ shrq $4,%rbp
+
+ movq (%rsp,%rsi,8),%xmm0
+ movq %r8,%rsi
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $4,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $60,%rbx
+ xorq %rcx,%rax
+ pslldq $1,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $12,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $52,%rbx
+ xorq %rcx,%rax
+ pslldq $2,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $20,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $44,%rbx
+ xorq %rcx,%rax
+ pslldq $3,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $28,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $36,%rbx
+ xorq %rcx,%rax
+ pslldq $4,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $36,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $28,%rbx
+ xorq %rcx,%rax
+ pslldq $5,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $44,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $20,%rbx
+ xorq %rcx,%rax
+ pslldq $6,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $52,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $12,%rbx
+ xorq %rcx,%rax
+ pslldq $7,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %rcx,%rbx
+ shlq $60,%rcx
+.byte 102,72,15,126,198
+ shrq $4,%rbx
+ xorq %rcx,%rax
+ psrldq $8,%xmm0
+ xorq %rbx,%rdx
+.byte 102,72,15,126,199
+ xorq %rsi,%rax
+ xorq %rdi,%rdx
+
+ addq $128+8,%rsp
+ .byte 0xf3,0xc3
+.Lend_mul_1x1:
+.size _mul_1x1,.-_mul_1x1
+
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,@function
+.align 16
+bn_GF2m_mul_2x2:
+ movq OPENSSL_ia32cap_P@GOTPCREL(%rip),%rax
+ btq $33,%rax
+ jnc .Lvanilla_mul_2x2
+
+.byte 102,72,15,110,198
+.byte 102,72,15,110,201
+.byte 102,72,15,110,210
+.byte 102,73,15,110,216
+ movdqa %xmm0,%xmm4
+ movdqa %xmm1,%xmm5
+.byte 102,15,58,68,193,0
+ pxor %xmm2,%xmm4
+ pxor %xmm3,%xmm5
+.byte 102,15,58,68,211,0
+.byte 102,15,58,68,229,0
+ xorps %xmm0,%xmm4
+ xorps %xmm2,%xmm4
+ movdqa %xmm4,%xmm5
+ pslldq $8,%xmm4
+ psrldq $8,%xmm5
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm0
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm0,16(%rdi)
+ .byte 0xf3,0xc3
+
+.align 16
+.Lvanilla_mul_2x2:
+ leaq -136(%rsp),%rsp
+ movq %r14,80(%rsp)
+ movq %r13,88(%rsp)
+ movq %r12,96(%rsp)
+ movq %rbp,104(%rsp)
+ movq %rbx,112(%rsp)
+.Lbody_mul_2x2:
+ movq %rdi,32(%rsp)
+ movq %rsi,40(%rsp)
+ movq %rdx,48(%rsp)
+ movq %rcx,56(%rsp)
+ movq %r8,64(%rsp)
+
+ movq $15,%r8
+ movq %rsi,%rax
+ movq %rcx,%rbp
+ call _mul_1x1
+ movq %rax,16(%rsp)
+ movq %rdx,24(%rsp)
+
+ movq 48(%rsp),%rax
+ movq 64(%rsp),%rbp
+ call _mul_1x1
+ movq %rax,0(%rsp)
+ movq %rdx,8(%rsp)
+
+ movq 40(%rsp),%rax
+ movq 56(%rsp),%rbp
+ xorq 48(%rsp),%rax
+ xorq 64(%rsp),%rbp
+ call _mul_1x1
+ movq 0(%rsp),%rbx
+ movq 8(%rsp),%rcx
+ movq 16(%rsp),%rdi
+ movq 24(%rsp),%rsi
+ movq 32(%rsp),%rbp
+
+ xorq %rdx,%rax
+ xorq %rcx,%rdx
+ xorq %rbx,%rax
+ movq %rbx,0(%rbp)
+ xorq %rdi,%rdx
+ movq %rsi,24(%rbp)
+ xorq %rsi,%rax
+ xorq %rsi,%rdx
+ xorq %rdx,%rax
+ movq %rdx,16(%rbp)
+ movq %rax,8(%rbp)
+
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbp
+ movq 112(%rsp),%rbx
+ leaq 136(%rsp),%rsp
+ .byte 0xf3,0xc3
+.Lend_mul_2x2:
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 16
Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont.S
diff -u /dev/null src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont.S:1.1
--- /dev/null Sat May 16 22:23:31 2015
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont.S Sat May 16 22:23:31 2015
@@ -0,0 +1,1375 @@
+#include <machine/asm.h>
+.text
+
+.globl bn_mul_mont
+.type bn_mul_mont,@function
+.align 16
+bn_mul_mont:
+ testl $3,%r9d
+ jnz .Lmul_enter
+ cmpl $8,%r9d
+ jb .Lmul_enter
+ cmpq %rsi,%rdx
+ jne .Lmul4x_enter
+ jmp .Lsqr4x_enter
+
+.align 16
+.Lmul_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 2(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+.Lmul_body:
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jl .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp .Lsub
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ movq %r9,%r15
+ orq %rcx,%rsi
+.align 16
+.Lcopy:
+ movq (%rsi,%r14,8),%rax
+ movq %r14,(%rsp,%r14,8)
+ movq %rax,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul_mont,.-bn_mul_mont
+.type bn_mul4x_mont,@function
+.align 16
+bn_mul4x_mont:
+.Lmul4x_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 4(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+.Lmul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.align 4
+.Louter4x:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jl .Louter4x
+ movq 16(%rsp,%r9,8),%rdi
+ movq 0(%rsp),%rax
+ pxor %xmm0,%xmm0
+ movq 8(%rsp),%rdx
+ shrq $2,%r9
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+ leaq -1(%r9),%r15
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz .Lsub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rbp,24(%rdi,%r14,8)
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ leaq -1(%r9),%r15
+ orq %rcx,%rsi
+
+ movdqu (%rsi),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,(%rdi)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x:
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqu 32(%rsi,%r14,1),%xmm1
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movdqa %xmm0,32(%rsp,%r14,1)
+ movdqu %xmm1,32(%rdi,%r14,1)
+ leaq 32(%r14),%r14
+ decq %r15
+ jnz .Lcopy4x
+
+ shlq $2,%r9
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul4x_mont,.-bn_mul4x_mont
+.type bn_sqr4x_mont,@function
+.align 16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ shll $3,%r9d
+ xorq %r10,%r10
+ movq %rsp,%r11
+ subq %r9,%r10
+ movq (%r8),%r8
+ leaq -72(%rsp,%r10,2),%rsp
+ andq $-1024,%rsp
+
+
+
+
+
+
+
+
+
+
+
+ movq %rdi,32(%rsp)
+ movq %rcx,40(%rsp)
+ movq %r8,48(%rsp)
+ movq %r11,56(%rsp)
+.Lsqr4x_body:
+
+
+
+
+
+
+
+ leaq 32(%r10),%rbp
+ leaq (%rsi,%r9,1),%rsi
+
+ movq %r9,%rcx
+
+
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ xorq %r10,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ leaq -16(%rbp),%rcx
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ mulq %r15
+ movq %rax,%r12
+ movq %rbx,%rax
+ movq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 16(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+ jmp .Lsqr4x_1st
+
+.align 16
+.Lsqr4x_1st:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,(%rdi,%rcx,1)
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,8(%rdi,%rcx,1)
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,16(%rdi,%rcx,1)
+
+
+ movq 24(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 32(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_1st
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ leaq 16(%rbp),%rbp
+ movq %r12,8(%rdi)
+ jmp .Lsqr4x_outer
+
+.align 16
+.Lsqr4x_outer:
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ movq -24(%rdi,%rbp,1),%r10
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ xorq %r10,%r10
+ addq -16(%rdi,%rbp,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ leaq -16(%rbp),%rcx
+ xorq %r12,%r12
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,8(%rdi,%rcx,1)
+
+ leaq 16(%rcx),%rcx
+ jmp .Lsqr4x_inner
+
+.align 16
+.Lsqr4x_inner:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ addq (%rdi,%rcx,1),%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,(%rdi,%rcx,1)
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 16(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_inner
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ movq %r12,8(%rdi)
+
+ addq $16,%rbp
+ jnz .Lsqr4x_outer
+
+
+ movq -32(%rsi),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi),%rbx
+ movq %rax,%r15
+
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-24(%rdi)
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi)
+
+ movq -8(%rsi),%rbx
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq $0,%rdx
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ movq %rdx,%r13
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi)
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq -16(%rsi),%rax
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ movq %r12,8(%rdi)
+
+ mulq %rbx
+ addq $16,%rbp
+ xorq %r14,%r14
+ subq %r9,%rbp
+ xorq %r15,%r15
+
+ addq %r12,%rax
+ adcq $0,%rdx
+ movq %rax,8(%rdi)
+ movq %rdx,16(%rdi)
+ movq %r15,24(%rdi)
+
+ movq -16(%rsi,%rbp,1),%rax
+ leaq 64(%rsp,%r9,2),%rdi
+ xorq %r10,%r10
+ movq -24(%rdi,%rbp,2),%r11
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+ leaq 16(%rbp),%rbp
+ movq %r8,-40(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ jmp .Lsqr4x_shift_n_add
+
+.align 16
+.Lsqr4x_shift_n_add:
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+
+ leaq (%r14,%r10,2),%r12
+ movq %r8,-8(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq 8(%rsi,%rbp,1),%rax
+ movq %r12,0(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 16(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+ movq %r8,24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ addq $32,%rbp
+ jnz .Lsqr4x_shift_n_add
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ mulq %rax
+ negq %r15
+ adcq %rax,%rbx
+ adcq %rdx,%r8
+ movq %rbx,-16(%rdi)
+ movq %r8,-8(%rdi)
+ movq 40(%rsp),%rsi
+ movq 48(%rsp),%r8
+ xorq %rcx,%rcx
+ movq %r9,0(%rsp)
+ subq %r9,%rcx
+ movq 64(%rsp),%r10
+ movq %r8,%r14
+ leaq 64(%rsp,%r9,2),%rax
+ leaq 64(%rsp,%r9,1),%rdi
+ movq %rax,8(%rsp)
+ leaq (%rsi,%r9,1),%rsi
+ xorq %rbp,%rbp
+
+ movq 0(%rsi,%rcx,1),%rax
+ movq 8(%rsi,%rcx,1),%r9
+ imulq %r10,%r14
+ movq %rax,%rbx
+ jmp .Lsqr4x_mont_outer
+
+.align 16
+.Lsqr4x_mont_outer:
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+ movq %r8,%r15
+
+ xorq %r10,%r10
+ addq 8(%rdi,%rcx,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+
+ imulq %r11,%r15
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 24(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,16(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 24(%rdi,%rcx,1),%r11
+ leaq 32(%rcx),%rcx
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ jmp .Lsqr4x_mont_inner
+
+.align 16
+.Lsqr4x_mont_inner:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,-8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq (%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 8(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 8(%rdi,%rcx,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 24(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,16(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 24(%rdi,%rcx,1),%r11
+ leaq 32(%rcx),%rcx
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ cmpq $0,%rcx
+ jne .Lsqr4x_mont_inner
+
+ subq 0(%rsp),%rcx
+ movq %r8,%r14
+
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %r9,%rax
+ adcq %rdx,%r13
+ movq %r12,-8(%rdi)
+
+ xorq %r11,%r11
+ addq (%rdi),%r10
+ adcq $0,%r11
+ movq 0(%rsi,%rcx,1),%rbx
+ addq %rbp,%r10
+ adcq $0,%r11
+
+ imulq 16(%rdi,%rcx,1),%r14
+ xorq %r12,%r12
+ movq 8(%rsi,%rcx,1),%r9
+ addq %r10,%r13
+ movq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+ movq %r13,(%rdi)
+
+ xorq %rbp,%rbp
+ addq 8(%rdi),%r12
+ adcq %rbp,%rbp
+ addq %r11,%r12
+ leaq 16(%rdi),%rdi
+ adcq $0,%rbp
+ movq %r12,-8(%rdi)
+ cmpq 8(%rsp),%rdi
+ jb .Lsqr4x_mont_outer
+
+ movq 0(%rsp),%r9
+ movq %rbp,(%rdi)
+ movq 64(%rsp,%r9,1),%rax
+ leaq 64(%rsp,%r9,1),%rbx
+ movq 40(%rsp),%rsi
+ shrq $5,%r9
+ movq 8(%rbx),%rdx
+ xorq %rbp,%rbp
+
+ movq 32(%rsp),%rdi
+ subq 0(%rsi),%rax
+ movq 16(%rbx),%r10
+ movq 24(%rbx),%r11
+ sbbq 8(%rsi),%rdx
+ leaq -1(%r9),%rcx
+ jmp .Lsqr4x_sub
+.align 16
+.Lsqr4x_sub:
+ movq %rax,0(%rdi,%rbp,8)
+ movq %rdx,8(%rdi,%rbp,8)
+ sbbq 16(%rsi,%rbp,8),%r10
+ movq 32(%rbx,%rbp,8),%rax
+ movq 40(%rbx,%rbp,8),%rdx
+ sbbq 24(%rsi,%rbp,8),%r11
+ movq %r10,16(%rdi,%rbp,8)
+ movq %r11,24(%rdi,%rbp,8)
+ sbbq 32(%rsi,%rbp,8),%rax
+ movq 48(%rbx,%rbp,8),%r10
+ movq 56(%rbx,%rbp,8),%r11
+ sbbq 40(%rsi,%rbp,8),%rdx
+ leaq 4(%rbp),%rbp
+ decq %rcx
+ jnz .Lsqr4x_sub
+
+ movq %rax,0(%rdi,%rbp,8)
+ movq 32(%rbx,%rbp,8),%rax
+ sbbq 16(%rsi,%rbp,8),%r10
+ movq %rdx,8(%rdi,%rbp,8)
+ sbbq 24(%rsi,%rbp,8),%r11
+ movq %r10,16(%rdi,%rbp,8)
+
+ sbbq $0,%rax
+ movq %r11,24(%rdi,%rbp,8)
+ xorq %rbp,%rbp
+ andq %rax,%rbx
+ notq %rax
+ movq %rdi,%rsi
+ andq %rax,%rsi
+ leaq -1(%r9),%rcx
+ orq %rsi,%rbx
+
+ pxor %xmm0,%xmm0
+ leaq 64(%rsp,%r9,8),%rsi
+ movdqu (%rbx),%xmm1
+ leaq (%rsi,%r9,8),%rsi
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm0,(%rsi)
+ movdqu %xmm1,(%rdi)
+ jmp .Lsqr4x_copy
+.align 16
+.Lsqr4x_copy:
+ movdqu 16(%rbx,%rbp,1),%xmm2
+ movdqu 32(%rbx,%rbp,1),%xmm1
+ movdqa %xmm0,80(%rsp,%rbp,1)
+ movdqa %xmm0,96(%rsp,%rbp,1)
+ movdqa %xmm0,16(%rsi,%rbp,1)
+ movdqa %xmm0,32(%rsi,%rbp,1)
+ movdqu %xmm2,16(%rdi,%rbp,1)
+ movdqu %xmm1,32(%rdi,%rbp,1)
+ leaq 32(%rbp),%rbp
+ decq %rcx
+ jnz .Lsqr4x_copy
+
+ movdqu 16(%rbx,%rbp,1),%xmm2
+ movdqa %xmm0,80(%rsp,%rbp,1)
+ movdqa %xmm0,16(%rsi,%rbp,1)
+ movdqu %xmm2,16(%rdi,%rbp,1)
+ movq 56(%rsp),%rsi
+ movq $1,%rax
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lsqr4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_sqr4x_mont,.-bn_sqr4x_mont
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 16
Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont5.S
diff -u /dev/null src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont5.S:1.1
--- /dev/null Sat May 16 22:23:31 2015
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont5.S Sat May 16 22:23:31 2015
@@ -0,0 +1,785 @@
+#include <machine/asm.h>
+.text
+
+.globl bn_mul_mont_gather5
+.type bn_mul_mont_gather5,@function
+.align 64
+bn_mul_mont_gather5:
+ testl $3,%r9d
+ jnz .Lmul_enter
+ cmpl $8,%r9d
+ jb .Lmul_enter
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ movl %r9d,%r9d
+ movl 8(%rsp),%r10d
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%rax
+ leaq 2(%r9),%r11
+ negq %r11
+ leaq (%rsp,%r11,8),%rsp
+ andq $-1024,%rsp
+
+ movq %rax,8(%rsp,%r9,8)
+.Lmul_body:
+ movq %rdx,%r12
+ movq %r10,%r11
+ shrq $3,%r10
+ andq $7,%r11
+ notq %r10
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%r10
+ leaq 96(%r12,%r11,8),%r12
+ movq 0(%rax,%r10,8),%xmm4
+ movq 8(%rax,%r10,8),%xmm5
+ movq 16(%rax,%r10,8),%xmm6
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+.byte 102,72,15,126,195
+
+ movq (%r8),%r8
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+.byte 102,72,15,126,195
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+.byte 102,72,15,126,195
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jl .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp .Lsub
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ movq %r9,%r15
+ orq %rcx,%rsi
+.align 16
+.Lcopy:
+ movq (%rsi,%r14,8),%rax
+ movq %r14,(%rsp,%r14,8)
+ movq %rax,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
+.type bn_mul4x_mont_gather5,@function
+.align 16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+ movl %r9d,%r9d
+ movl 8(%rsp),%r10d
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%rax
+ leaq 4(%r9),%r11
+ negq %r11
+ leaq (%rsp,%r11,8),%rsp
+ andq $-1024,%rsp
+
+ movq %rax,8(%rsp,%r9,8)
+.Lmul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq %r10,%r11
+ shrq $3,%r10
+ andq $7,%r11
+ notq %r10
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%r10
+ leaq 96(%r12,%r11,8),%r12
+ movq 0(%rax,%r10,8),%xmm4
+ movq 8(%rax,%r10,8),%xmm5
+ movq 16(%rax,%r10,8),%xmm6
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+.byte 102,72,15,126,195
+ movq (%r8),%r8
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.byte 102,72,15,126,195
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.align 4
+.Louter4x:
+ xorq %r15,%r15
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdx,%r13
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-40(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.byte 102,72,15,126,195
+ movq %rdi,-16(%rsp,%r15,8)
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jl .Louter4x
+ movq 16(%rsp,%r9,8),%rdi
+ movq 0(%rsp),%rax
+ pxor %xmm0,%xmm0
+ movq 8(%rsp),%rdx
+ shrq $2,%r9
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+ leaq -1(%r9),%r15
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz .Lsub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rbp,24(%rdi,%r14,8)
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ leaq -1(%r9),%r15
+ orq %rcx,%rsi
+
+ movdqu (%rsi),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,(%rdi)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x:
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqu 32(%rsi,%r14,1),%xmm1
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movdqa %xmm0,32(%rsp,%r14,1)
+ movdqu %xmm1,32(%rdi,%r14,1)
+ leaq 32(%r14),%r14
+ decq %r15
+ jnz .Lcopy4x
+
+ shlq $2,%r9
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+.globl bn_scatter5
+.type bn_scatter5,@function
+.align 16
+bn_scatter5:
+ cmpq $0,%rsi
+ jz .Lscatter_epilogue
+ leaq (%rdx,%rcx,8),%rdx
+.Lscatter:
+ movq (%rdi),%rax
+ leaq 8(%rdi),%rdi
+ movq %rax,(%rdx)
+ leaq 256(%rdx),%rdx
+ subq $1,%rsi
+ jnz .Lscatter
+.Lscatter_epilogue:
+ .byte 0xf3,0xc3
+.size bn_scatter5,.-bn_scatter5
+
+.globl bn_gather5
+.type bn_gather5,@function
+.align 16
+bn_gather5:
+ movq %rcx,%r11
+ shrq $3,%rcx
+ andq $7,%r11
+ notq %rcx
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%rcx
+ leaq 96(%rdx,%r11,8),%rdx
+ movq 0(%rax,%rcx,8),%xmm4
+ movq 8(%rax,%rcx,8),%xmm5
+ movq 16(%rax,%rcx,8),%xmm6
+ movq 24(%rax,%rcx,8),%xmm7
+ jmp .Lgather
+.align 16
+.Lgather:
+ movq -96(%rdx),%xmm0
+ movq -32(%rdx),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%rdx),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%rdx),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%rdx),%rdx
+ por %xmm3,%xmm0
+
+ movq %xmm0,(%rdi)
+ leaq 8(%rdi),%rdi
+ subq $1,%rsi
+ jnz .Lgather
+ .byte 0xf3,0xc3
+.LSEH_end_bn_gather5:
+.size bn_gather5,.-bn_gather5
+.align 64
+.Lmagic_masks:
+.long 0,0, 0,0, 0,0, -1,-1
+.long 0,0, 0,0, 0,0, 0,0
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0