Module Name:    src
Committed By:   joerg
Date:           Sat May 16 22:23:31 UTC 2015

Modified Files:
        src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64: Makefile
Added Files:
        src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64:
            x86_64-gf2m.S x86_64-mont.S x86_64-mont5.S

Log Message:
Find CPU-specific variants of the long number routines. Regenerate.


To generate a diff of this commit:
cvs rdiff -u -r1.7 -r1.8 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile
cvs rdiff -u -r0 -r1.1 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-gf2m.S \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont.S \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont5.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile:1.7 src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile:1.8
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile:1.7	Sat May 16 19:08:37 2015
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile	Sat May 16 22:23:31 2015
@@ -1,4 +1,4 @@
-#	$NetBSD: Makefile,v 1.7 2015/05/16 19:08:37 joerg Exp $
+#	$NetBSD: Makefile,v 1.8 2015/05/16 22:23:31 joerg Exp $
 
 .include "bsd.own.mk"
 
@@ -11,6 +11,7 @@ CC+= -fno-integrated-as
 
 regen:
 	for i in $$(find ${OPENSSLSRC} -name \*${MACHINE_ARCH}.pl) \
+		$$(find ${OPENSSLSRC}/crypto/bn/asm -name ${MACHINE_ARCH}-\*.pl) \
 		${OPENSSLSRC}/crypto/${MACHINE_ARCH}cpuid.pl ; do \
                 (echo "#include <machine/asm.h>"; CC=${CC:Q} perl $$i elf | sed \
 		    -e 's/\(OPENSSL[A-Za-z0-9_+]*\)(%rip)/\1@GOTPCREL(%rip)/' \

Added files:

Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-gf2m.S
diff -u /dev/null src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-gf2m.S:1.1
--- /dev/null	Sat May 16 22:23:31 2015
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-gf2m.S	Sat May 16 22:23:31 2015
@@ -0,0 +1,292 @@
+#include <machine/asm.h>
+.text	
+
+.type	_mul_1x1,@function
+.align	16
+_mul_1x1:
+	subq	$128+8,%rsp
+	movq	$-1,%r9
+	leaq	(%rax,%rax,1),%rsi
+	shrq	$3,%r9
+	leaq	(,%rax,4),%rdi
+	andq	%rax,%r9
+	leaq	(,%rax,8),%r12
+	sarq	$63,%rax
+	leaq	(%r9,%r9,1),%r10
+	sarq	$63,%rsi
+	leaq	(,%r9,4),%r11
+	andq	%rbp,%rax
+	sarq	$63,%rdi
+	movq	%rax,%rdx
+	shlq	$63,%rax
+	andq	%rbp,%rsi
+	shrq	$1,%rdx
+	movq	%rsi,%rcx
+	shlq	$62,%rsi
+	andq	%rbp,%rdi
+	shrq	$2,%rcx
+	xorq	%rsi,%rax
+	movq	%rdi,%rbx
+	shlq	$61,%rdi
+	xorq	%rcx,%rdx
+	shrq	$3,%rbx
+	xorq	%rdi,%rax
+	xorq	%rbx,%rdx
+
+	movq	%r9,%r13
+	movq	$0,0(%rsp)
+	xorq	%r10,%r13
+	movq	%r9,8(%rsp)
+	movq	%r11,%r14
+	movq	%r10,16(%rsp)
+	xorq	%r12,%r14
+	movq	%r13,24(%rsp)
+
+	xorq	%r11,%r9
+	movq	%r11,32(%rsp)
+	xorq	%r11,%r10
+	movq	%r9,40(%rsp)
+	xorq	%r11,%r13
+	movq	%r10,48(%rsp)
+	xorq	%r14,%r9
+	movq	%r13,56(%rsp)
+	xorq	%r14,%r10
+
+	movq	%r12,64(%rsp)
+	xorq	%r14,%r13
+	movq	%r9,72(%rsp)
+	xorq	%r11,%r9
+	movq	%r10,80(%rsp)
+	xorq	%r11,%r10
+	movq	%r13,88(%rsp)
+
+	xorq	%r11,%r13
+	movq	%r14,96(%rsp)
+	movq	%r8,%rsi
+	movq	%r9,104(%rsp)
+	andq	%rbp,%rsi
+	movq	%r10,112(%rsp)
+	shrq	$4,%rbp
+	movq	%r13,120(%rsp)
+	movq	%r8,%rdi
+	andq	%rbp,%rdi
+	shrq	$4,%rbp
+
+	movq	(%rsp,%rsi,8),%xmm0
+	movq	%r8,%rsi
+	andq	%rbp,%rsi
+	shrq	$4,%rbp
+	movq	(%rsp,%rdi,8),%rcx
+	movq	%r8,%rdi
+	movq	%rcx,%rbx
+	shlq	$4,%rcx
+	andq	%rbp,%rdi
+	movq	(%rsp,%rsi,8),%xmm1
+	shrq	$60,%rbx
+	xorq	%rcx,%rax
+	pslldq	$1,%xmm1
+	movq	%r8,%rsi
+	shrq	$4,%rbp
+	xorq	%rbx,%rdx
+	andq	%rbp,%rsi
+	shrq	$4,%rbp
+	pxor	%xmm1,%xmm0
+	movq	(%rsp,%rdi,8),%rcx
+	movq	%r8,%rdi
+	movq	%rcx,%rbx
+	shlq	$12,%rcx
+	andq	%rbp,%rdi
+	movq	(%rsp,%rsi,8),%xmm1
+	shrq	$52,%rbx
+	xorq	%rcx,%rax
+	pslldq	$2,%xmm1
+	movq	%r8,%rsi
+	shrq	$4,%rbp
+	xorq	%rbx,%rdx
+	andq	%rbp,%rsi
+	shrq	$4,%rbp
+	pxor	%xmm1,%xmm0
+	movq	(%rsp,%rdi,8),%rcx
+	movq	%r8,%rdi
+	movq	%rcx,%rbx
+	shlq	$20,%rcx
+	andq	%rbp,%rdi
+	movq	(%rsp,%rsi,8),%xmm1
+	shrq	$44,%rbx
+	xorq	%rcx,%rax
+	pslldq	$3,%xmm1
+	movq	%r8,%rsi
+	shrq	$4,%rbp
+	xorq	%rbx,%rdx
+	andq	%rbp,%rsi
+	shrq	$4,%rbp
+	pxor	%xmm1,%xmm0
+	movq	(%rsp,%rdi,8),%rcx
+	movq	%r8,%rdi
+	movq	%rcx,%rbx
+	shlq	$28,%rcx
+	andq	%rbp,%rdi
+	movq	(%rsp,%rsi,8),%xmm1
+	shrq	$36,%rbx
+	xorq	%rcx,%rax
+	pslldq	$4,%xmm1
+	movq	%r8,%rsi
+	shrq	$4,%rbp
+	xorq	%rbx,%rdx
+	andq	%rbp,%rsi
+	shrq	$4,%rbp
+	pxor	%xmm1,%xmm0
+	movq	(%rsp,%rdi,8),%rcx
+	movq	%r8,%rdi
+	movq	%rcx,%rbx
+	shlq	$36,%rcx
+	andq	%rbp,%rdi
+	movq	(%rsp,%rsi,8),%xmm1
+	shrq	$28,%rbx
+	xorq	%rcx,%rax
+	pslldq	$5,%xmm1
+	movq	%r8,%rsi
+	shrq	$4,%rbp
+	xorq	%rbx,%rdx
+	andq	%rbp,%rsi
+	shrq	$4,%rbp
+	pxor	%xmm1,%xmm0
+	movq	(%rsp,%rdi,8),%rcx
+	movq	%r8,%rdi
+	movq	%rcx,%rbx
+	shlq	$44,%rcx
+	andq	%rbp,%rdi
+	movq	(%rsp,%rsi,8),%xmm1
+	shrq	$20,%rbx
+	xorq	%rcx,%rax
+	pslldq	$6,%xmm1
+	movq	%r8,%rsi
+	shrq	$4,%rbp
+	xorq	%rbx,%rdx
+	andq	%rbp,%rsi
+	shrq	$4,%rbp
+	pxor	%xmm1,%xmm0
+	movq	(%rsp,%rdi,8),%rcx
+	movq	%r8,%rdi
+	movq	%rcx,%rbx
+	shlq	$52,%rcx
+	andq	%rbp,%rdi
+	movq	(%rsp,%rsi,8),%xmm1
+	shrq	$12,%rbx
+	xorq	%rcx,%rax
+	pslldq	$7,%xmm1
+	movq	%r8,%rsi
+	shrq	$4,%rbp
+	xorq	%rbx,%rdx
+	andq	%rbp,%rsi
+	shrq	$4,%rbp
+	pxor	%xmm1,%xmm0
+	movq	(%rsp,%rdi,8),%rcx
+	movq	%rcx,%rbx
+	shlq	$60,%rcx
+.byte	102,72,15,126,198
+	shrq	$4,%rbx
+	xorq	%rcx,%rax
+	psrldq	$8,%xmm0
+	xorq	%rbx,%rdx
+.byte	102,72,15,126,199
+	xorq	%rsi,%rax
+	xorq	%rdi,%rdx
+
+	addq	$128+8,%rsp
+	.byte	0xf3,0xc3
+.Lend_mul_1x1:
+.size	_mul_1x1,.-_mul_1x1
+
+.globl	bn_GF2m_mul_2x2
+.type	bn_GF2m_mul_2x2,@function
+.align	16
+bn_GF2m_mul_2x2:
+	movq	OPENSSL_ia32cap_P@GOTPCREL(%rip),%rax
+	btq	$33,%rax
+	jnc	.Lvanilla_mul_2x2
+
+.byte	102,72,15,110,198
+.byte	102,72,15,110,201
+.byte	102,72,15,110,210
+.byte	102,73,15,110,216
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm1,%xmm5
+.byte	102,15,58,68,193,0
+	pxor	%xmm2,%xmm4
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,68,211,0
+.byte	102,15,58,68,229,0
+	xorps	%xmm0,%xmm4
+	xorps	%xmm2,%xmm4
+	movdqa	%xmm4,%xmm5
+	pslldq	$8,%xmm4
+	psrldq	$8,%xmm5
+	pxor	%xmm4,%xmm2
+	pxor	%xmm5,%xmm0
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm0,16(%rdi)
+	.byte	0xf3,0xc3
+
+.align	16
+.Lvanilla_mul_2x2:
+	leaq	-136(%rsp),%rsp
+	movq	%r14,80(%rsp)
+	movq	%r13,88(%rsp)
+	movq	%r12,96(%rsp)
+	movq	%rbp,104(%rsp)
+	movq	%rbx,112(%rsp)
+.Lbody_mul_2x2:
+	movq	%rdi,32(%rsp)
+	movq	%rsi,40(%rsp)
+	movq	%rdx,48(%rsp)
+	movq	%rcx,56(%rsp)
+	movq	%r8,64(%rsp)
+
+	movq	$15,%r8
+	movq	%rsi,%rax
+	movq	%rcx,%rbp
+	call	_mul_1x1		
+	movq	%rax,16(%rsp)
+	movq	%rdx,24(%rsp)
+
+	movq	48(%rsp),%rax
+	movq	64(%rsp),%rbp
+	call	_mul_1x1		
+	movq	%rax,0(%rsp)
+	movq	%rdx,8(%rsp)
+
+	movq	40(%rsp),%rax
+	movq	56(%rsp),%rbp
+	xorq	48(%rsp),%rax
+	xorq	64(%rsp),%rbp
+	call	_mul_1x1		
+	movq	0(%rsp),%rbx
+	movq	8(%rsp),%rcx
+	movq	16(%rsp),%rdi
+	movq	24(%rsp),%rsi
+	movq	32(%rsp),%rbp
+
+	xorq	%rdx,%rax
+	xorq	%rcx,%rdx
+	xorq	%rbx,%rax
+	movq	%rbx,0(%rbp)
+	xorq	%rdi,%rdx
+	movq	%rsi,24(%rbp)
+	xorq	%rsi,%rax
+	xorq	%rsi,%rdx
+	xorq	%rdx,%rax
+	movq	%rdx,16(%rbp)
+	movq	%rax,8(%rbp)
+
+	movq	80(%rsp),%r14
+	movq	88(%rsp),%r13
+	movq	96(%rsp),%r12
+	movq	104(%rsp),%rbp
+	movq	112(%rsp),%rbx
+	leaq	136(%rsp),%rsp
+	.byte	0xf3,0xc3
+.Lend_mul_2x2:
+.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.byte	71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	16
Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont.S
diff -u /dev/null src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont.S:1.1
--- /dev/null	Sat May 16 22:23:31 2015
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont.S	Sat May 16 22:23:31 2015
@@ -0,0 +1,1375 @@
+#include <machine/asm.h>
+.text	
+
+.globl	bn_mul_mont
+.type	bn_mul_mont,@function
+.align	16
+bn_mul_mont:
+	testl	$3,%r9d
+	jnz	.Lmul_enter
+	cmpl	$8,%r9d
+	jb	.Lmul_enter
+	cmpq	%rsi,%rdx
+	jne	.Lmul4x_enter
+	jmp	.Lsqr4x_enter
+
+.align	16
+.Lmul_enter:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	movl	%r9d,%r9d
+	leaq	2(%r9),%r10
+	movq	%rsp,%r11
+	negq	%r10
+	leaq	(%rsp,%r10,8),%rsp
+	andq	$-1024,%rsp
+
+	movq	%r11,8(%rsp,%r9,8)
+.Lmul_body:
+	movq	%rdx,%r12
+	movq	(%r8),%r8
+	movq	(%r12),%rbx
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.L1st_enter
+
+.align	16
+.L1st:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	movq	%r10,%r11
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.L1st_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	1(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.L1st
+
+	addq	%rax,%r13
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+	movq	%r10,%r11
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	jmp	.Louter
+.align	16
+.Louter:
+	movq	(%r12,%r14,8),%rbx
+	xorq	%r15,%r15
+	movq	%r8,%rbp
+	movq	(%rsp),%r10
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	8(%rsp),%r10
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.Linner_enter
+
+.align	16
+.Linner:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.Linner_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	leaq	1(%r15),%r15
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.Linner
+
+	addq	%rax,%r13
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	cmpq	%r9,%r14
+	jl	.Louter
+
+	xorq	%r14,%r14
+	movq	(%rsp),%rax
+	leaq	(%rsp),%rsi
+	movq	%r9,%r15
+	jmp	.Lsub
+.align	16
+.Lsub:	sbbq	(%rcx,%r14,8),%rax
+	movq	%rax,(%rdi,%r14,8)
+	movq	8(%rsi,%r14,8),%rax
+	leaq	1(%r14),%r14
+	decq	%r15
+	jnz	.Lsub
+
+	sbbq	$0,%rax
+	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
+	movq	%r9,%r15
+	orq	%rcx,%rsi
+.align	16
+.Lcopy:
+	movq	(%rsi,%r14,8),%rax
+	movq	%r14,(%rsp,%r14,8)
+	movq	%rax,(%rdi,%r14,8)
+	leaq	1(%r14),%r14
+	subq	$1,%r15
+	jnz	.Lcopy
+
+	movq	8(%rsp,%r9,8),%rsi
+	movq	$1,%rax
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lmul_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_mul_mont,.-bn_mul_mont
+.type	bn_mul4x_mont,@function
+.align	16
+bn_mul4x_mont:
+.Lmul4x_enter:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	movl	%r9d,%r9d
+	leaq	4(%r9),%r10
+	movq	%rsp,%r11
+	negq	%r10
+	leaq	(%rsp,%r10,8),%rsp
+	andq	$-1024,%rsp
+
+	movq	%r11,8(%rsp,%r9,8)
+.Lmul4x_body:
+	movq	%rdi,16(%rsp,%r9,8)
+	movq	%rdx,%r12
+	movq	(%r8),%r8
+	movq	(%r12),%rbx
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	4(%r15),%r15
+	adcq	$0,%rdx
+	movq	%rdi,(%rsp)
+	movq	%rdx,%r13
+	jmp	.L1st4x
+.align	16
+.L1st4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	4(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	-16(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%r13
+	cmpq	%r9,%r15
+	jl	.L1st4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdi,(%rsp,%r15,8)
+
+	leaq	1(%r14),%r14
+.align	4
+.Louter4x:
+	movq	(%r12,%r14,8),%rbx
+	xorq	%r15,%r15
+	movq	(%rsp),%r10
+	movq	%r8,%rbp
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	8(%rsp),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	4(%r15),%r15
+	adcq	$0,%rdx
+	movq	%rdi,(%rsp)
+	movq	%rdx,%r13
+	jmp	.Linner4x
+.align	16
+.Linner4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-16(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	leaq	4(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	-16(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%r13
+	cmpq	%r9,%r15
+	jl	.Linner4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-16(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	leaq	1(%r14),%r14
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	addq	(%rsp,%r9,8),%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdi,(%rsp,%r15,8)
+
+	cmpq	%r9,%r14
+	jl	.Louter4x
+	movq	16(%rsp,%r9,8),%rdi
+	movq	0(%rsp),%rax
+	pxor	%xmm0,%xmm0
+	movq	8(%rsp),%rdx
+	shrq	$2,%r9
+	leaq	(%rsp),%rsi
+	xorq	%r14,%r14
+
+	subq	0(%rcx),%rax
+	movq	16(%rsi),%rbx
+	movq	24(%rsi),%rbp
+	sbbq	8(%rcx),%rdx
+	leaq	-1(%r9),%r15
+	jmp	.Lsub4x
+.align	16
+.Lsub4x:
+	movq	%rax,0(%rdi,%r14,8)
+	movq	%rdx,8(%rdi,%r14,8)
+	sbbq	16(%rcx,%r14,8),%rbx
+	movq	32(%rsi,%r14,8),%rax
+	movq	40(%rsi,%r14,8),%rdx
+	sbbq	24(%rcx,%r14,8),%rbp
+	movq	%rbx,16(%rdi,%r14,8)
+	movq	%rbp,24(%rdi,%r14,8)
+	sbbq	32(%rcx,%r14,8),%rax
+	movq	48(%rsi,%r14,8),%rbx
+	movq	56(%rsi,%r14,8),%rbp
+	sbbq	40(%rcx,%r14,8),%rdx
+	leaq	4(%r14),%r14
+	decq	%r15
+	jnz	.Lsub4x
+
+	movq	%rax,0(%rdi,%r14,8)
+	movq	32(%rsi,%r14,8),%rax
+	sbbq	16(%rcx,%r14,8),%rbx
+	movq	%rdx,8(%rdi,%r14,8)
+	sbbq	24(%rcx,%r14,8),%rbp
+	movq	%rbx,16(%rdi,%r14,8)
+
+	sbbq	$0,%rax
+	movq	%rbp,24(%rdi,%r14,8)
+	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
+	leaq	-1(%r9),%r15
+	orq	%rcx,%rsi
+
+	movdqu	(%rsi),%xmm1
+	movdqa	%xmm0,(%rsp)
+	movdqu	%xmm1,(%rdi)
+	jmp	.Lcopy4x
+.align	16
+.Lcopy4x:
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqu	32(%rsi,%r14,1),%xmm1
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
+	movdqa	%xmm0,32(%rsp,%r14,1)
+	movdqu	%xmm1,32(%rdi,%r14,1)
+	leaq	32(%r14),%r14
+	decq	%r15
+	jnz	.Lcopy4x
+
+	shlq	$2,%r9
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
+	movq	8(%rsp,%r9,8),%rsi
+	movq	$1,%rax
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lmul4x_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_mul4x_mont,.-bn_mul4x_mont
+.type	bn_sqr4x_mont,@function
+.align	16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	shll	$3,%r9d
+	xorq	%r10,%r10
+	movq	%rsp,%r11
+	subq	%r9,%r10
+	movq	(%r8),%r8
+	leaq	-72(%rsp,%r10,2),%rsp
+	andq	$-1024,%rsp
+
+
+
+
+
+
+
+
+
+
+
+	movq	%rdi,32(%rsp)
+	movq	%rcx,40(%rsp)
+	movq	%r8,48(%rsp)
+	movq	%r11,56(%rsp)
+.Lsqr4x_body:
+
+
+
+
+
+
+
+	leaq	32(%r10),%rbp
+	leaq	(%rsi,%r9,1),%rsi
+
+	movq	%r9,%rcx
+
+
+	movq	-32(%rsi,%rbp,1),%r14
+	leaq	64(%rsp,%r9,2),%rdi
+	movq	-24(%rsi,%rbp,1),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi,%rbp,1),%rbx
+	movq	%rax,%r15
+
+	mulq	%r14
+	movq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	movq	%r10,-24(%rdi,%rbp,1)
+
+	xorq	%r10,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+	movq	%r11,-16(%rdi,%rbp,1)
+
+	leaq	-16(%rbp),%rcx
+
+
+	movq	8(%rsi,%rcx,1),%rbx
+	mulq	%r15
+	movq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%rdx,%r13
+
+	xorq	%r11,%r11
+	addq	%r12,%r10
+	leaq	16(%rcx),%rcx
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	%rdx,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+	jmp	.Lsqr4x_1st
+
+.align	16
+.Lsqr4x_1st:
+	movq	(%rsi,%rcx,1),%rbx
+	xorq	%r12,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	adcq	%rdx,%r12
+
+	xorq	%r10,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+	movq	%r11,(%rdi,%rcx,1)
+
+
+	movq	8(%rsi,%rcx,1),%rbx
+	xorq	%r13,%r13
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	%rdx,%r13
+
+	xorq	%r11,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	%rdx,%r11
+	movq	%r10,8(%rdi,%rcx,1)
+
+	movq	16(%rsi,%rcx,1),%rbx
+	xorq	%r12,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	adcq	%rdx,%r12
+
+	xorq	%r10,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+	movq	%r11,16(%rdi,%rcx,1)
+
+
+	movq	24(%rsi,%rcx,1),%rbx
+	xorq	%r13,%r13
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	%rdx,%r13
+
+	xorq	%r11,%r11
+	addq	%r12,%r10
+	leaq	32(%rcx),%rcx
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	%rdx,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+
+	cmpq	$0,%rcx
+	jne	.Lsqr4x_1st
+
+	xorq	%r12,%r12
+	addq	%r11,%r13
+	adcq	$0,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	adcq	%rdx,%r12
+
+	movq	%r13,(%rdi)
+	leaq	16(%rbp),%rbp
+	movq	%r12,8(%rdi)
+	jmp	.Lsqr4x_outer
+
+.align	16
+.Lsqr4x_outer:
+	movq	-32(%rsi,%rbp,1),%r14
+	leaq	64(%rsp,%r9,2),%rdi
+	movq	-24(%rsi,%rbp,1),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi,%rbp,1),%rbx
+	movq	%rax,%r15
+
+	movq	-24(%rdi,%rbp,1),%r10
+	xorq	%r11,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	%rdx,%r11
+	movq	%r10,-24(%rdi,%rbp,1)
+
+	xorq	%r10,%r10
+	addq	-16(%rdi,%rbp,1),%r11
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+	movq	%r11,-16(%rdi,%rbp,1)
+
+	leaq	-16(%rbp),%rcx
+	xorq	%r12,%r12
+
+
+	movq	8(%rsi,%rcx,1),%rbx
+	xorq	%r13,%r13
+	addq	8(%rdi,%rcx,1),%r12
+	adcq	$0,%r13
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	%rdx,%r13
+
+	xorq	%r11,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	%rdx,%r11
+	movq	%r10,8(%rdi,%rcx,1)
+
+	leaq	16(%rcx),%rcx
+	jmp	.Lsqr4x_inner
+
+.align	16
+.Lsqr4x_inner:
+	movq	(%rsi,%rcx,1),%rbx
+	xorq	%r12,%r12
+	addq	(%rdi,%rcx,1),%r13
+	adcq	$0,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	adcq	%rdx,%r12
+
+	xorq	%r10,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+	movq	%r11,(%rdi,%rcx,1)
+
+	movq	8(%rsi,%rcx,1),%rbx
+	xorq	%r13,%r13
+	addq	8(%rdi,%rcx,1),%r12
+	adcq	$0,%r13
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	%rdx,%r13
+
+	xorq	%r11,%r11
+	addq	%r12,%r10
+	leaq	16(%rcx),%rcx
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	%rdx,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+
+	cmpq	$0,%rcx
+	jne	.Lsqr4x_inner
+
+	xorq	%r12,%r12
+	addq	%r11,%r13
+	adcq	$0,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	adcq	%rdx,%r12
+
+	movq	%r13,(%rdi)
+	movq	%r12,8(%rdi)
+
+	addq	$16,%rbp
+	jnz	.Lsqr4x_outer
+
+
+	movq	-32(%rsi),%r14
+	leaq	64(%rsp,%r9,2),%rdi
+	movq	-24(%rsi),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi),%rbx
+	movq	%rax,%r15
+
+	xorq	%r11,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	%rdx,%r11
+	movq	%r10,-24(%rdi)
+
+	xorq	%r10,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+	movq	%r11,-16(%rdi)
+
+	movq	-8(%rsi),%rbx
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+
+	xorq	%r11,%r11
+	addq	%r12,%r10
+	movq	%rdx,%r13
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	%rdx,%r11
+	movq	%r10,-8(%rdi)
+
+	xorq	%r12,%r12
+	addq	%r11,%r13
+	adcq	$0,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	movq	-16(%rsi),%rax
+	adcq	%rdx,%r12
+
+	movq	%r13,(%rdi)
+	movq	%r12,8(%rdi)
+
+	mulq	%rbx
+	addq	$16,%rbp
+	xorq	%r14,%r14
+	subq	%r9,%rbp
+	xorq	%r15,%r15
+
+	addq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rax,8(%rdi)
+	movq	%rdx,16(%rdi)
+	movq	%r15,24(%rdi)
+
+	movq	-16(%rsi,%rbp,1),%rax
+	leaq	64(%rsp,%r9,2),%rdi
+	xorq	%r10,%r10
+	movq	-24(%rdi,%rbp,2),%r11
+
+	leaq	(%r14,%r10,2),%r12
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	-16(%rdi,%rbp,2),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	-8(%rdi,%rbp,2),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi,%rbp,1),%rax
+	movq	%r12,-32(%rdi,%rbp,2)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,-24(%rdi,%rbp,2)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	0(%rdi,%rbp,2),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	8(%rdi,%rbp,2),%r11
+	adcq	%rax,%rbx
+	movq	0(%rsi,%rbp,1),%rax
+	movq	%rbx,-16(%rdi,%rbp,2)
+	adcq	%rdx,%r8
+	leaq	16(%rbp),%rbp
+	movq	%r8,-40(%rdi,%rbp,2)
+	sbbq	%r15,%r15
+	jmp	.Lsqr4x_shift_n_add
+
+.align	16
+.Lsqr4x_shift_n_add:
+	leaq	(%r14,%r10,2),%r12
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	-16(%rdi,%rbp,2),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	-8(%rdi,%rbp,2),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi,%rbp,1),%rax
+	movq	%r12,-32(%rdi,%rbp,2)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,-24(%rdi,%rbp,2)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	0(%rdi,%rbp,2),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	8(%rdi,%rbp,2),%r11
+	adcq	%rax,%rbx
+	movq	0(%rsi,%rbp,1),%rax
+	movq	%rbx,-16(%rdi,%rbp,2)
+	adcq	%rdx,%r8
+
+	leaq	(%r14,%r10,2),%r12
+	movq	%r8,-8(%rdi,%rbp,2)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	16(%rdi,%rbp,2),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	24(%rdi,%rbp,2),%r11
+	adcq	%rax,%r12
+	movq	8(%rsi,%rbp,1),%rax
+	movq	%r12,0(%rdi,%rbp,2)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,8(%rdi,%rbp,2)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	32(%rdi,%rbp,2),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	40(%rdi,%rbp,2),%r11
+	adcq	%rax,%rbx
+	movq	16(%rsi,%rbp,1),%rax
+	movq	%rbx,16(%rdi,%rbp,2)
+	adcq	%rdx,%r8
+	movq	%r8,24(%rdi,%rbp,2)
+	sbbq	%r15,%r15
+	addq	$32,%rbp
+	jnz	.Lsqr4x_shift_n_add
+
+	leaq	(%r14,%r10,2),%r12
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	-16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	-8(%rdi),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi),%rax
+	movq	%r12,-32(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,-24(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	mulq	%rax
+	negq	%r15
+	adcq	%rax,%rbx
+	adcq	%rdx,%r8
+	movq	%rbx,-16(%rdi)
+	movq	%r8,-8(%rdi)
+	movq	40(%rsp),%rsi
+	movq	48(%rsp),%r8
+	xorq	%rcx,%rcx
+	movq	%r9,0(%rsp)
+	subq	%r9,%rcx
+	movq	64(%rsp),%r10
+	movq	%r8,%r14
+	leaq	64(%rsp,%r9,2),%rax
+	leaq	64(%rsp,%r9,1),%rdi
+	movq	%rax,8(%rsp)
+	leaq	(%rsi,%r9,1),%rsi
+	xorq	%rbp,%rbp
+
+	movq	0(%rsi,%rcx,1),%rax
+	movq	8(%rsi,%rcx,1),%r9
+	imulq	%r10,%r14
+	movq	%rax,%rbx
+	jmp	.Lsqr4x_mont_outer
+
+.align	16
+.Lsqr4x_mont_outer:
+	xorq	%r11,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%r9,%rax
+	adcq	%rdx,%r11
+	movq	%r8,%r15
+
+	xorq	%r10,%r10
+	addq	8(%rdi,%rcx,1),%r11
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+
+	imulq	%r11,%r15
+
+	movq	16(%rsi,%rcx,1),%rbx
+	xorq	%r13,%r13
+	addq	%r11,%r12
+	adcq	$0,%r13
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	%rdx,%r13
+	movq	%r12,8(%rdi,%rcx,1)
+
+	xorq	%r11,%r11
+	addq	16(%rdi,%rcx,1),%r10
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%r9,%rax
+	adcq	%rdx,%r11
+
+	movq	24(%rsi,%rcx,1),%r9
+	xorq	%r12,%r12
+	addq	%r10,%r13
+	adcq	$0,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%r9,%rax
+	adcq	%rdx,%r12
+	movq	%r13,16(%rdi,%rcx,1)
+
+	xorq	%r10,%r10
+	addq	24(%rdi,%rcx,1),%r11
+	leaq	32(%rcx),%rcx
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+	jmp	.Lsqr4x_mont_inner
+
+.align	16
+.Lsqr4x_mont_inner:
+	movq	(%rsi,%rcx,1),%rbx
+	xorq	%r13,%r13
+	addq	%r11,%r12
+	adcq	$0,%r13
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	%rdx,%r13
+	movq	%r12,-8(%rdi,%rcx,1)
+
+	xorq	%r11,%r11
+	addq	(%rdi,%rcx,1),%r10
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%r9,%rax
+	adcq	%rdx,%r11
+
+	movq	8(%rsi,%rcx,1),%r9
+	xorq	%r12,%r12
+	addq	%r10,%r13
+	adcq	$0,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%r9,%rax
+	adcq	%rdx,%r12
+	movq	%r13,(%rdi,%rcx,1)
+
+	xorq	%r10,%r10
+	addq	8(%rdi,%rcx,1),%r11
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+
+
+	movq	16(%rsi,%rcx,1),%rbx
+	xorq	%r13,%r13
+	addq	%r11,%r12
+	adcq	$0,%r13
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	%rdx,%r13
+	movq	%r12,8(%rdi,%rcx,1)
+
+	xorq	%r11,%r11
+	addq	16(%rdi,%rcx,1),%r10
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%r9,%rax
+	adcq	%rdx,%r11
+
+	movq	24(%rsi,%rcx,1),%r9
+	xorq	%r12,%r12
+	addq	%r10,%r13
+	adcq	$0,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%r9,%rax
+	adcq	%rdx,%r12
+	movq	%r13,16(%rdi,%rcx,1)
+
+	xorq	%r10,%r10
+	addq	24(%rdi,%rcx,1),%r11
+	leaq	32(%rcx),%rcx
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+	cmpq	$0,%rcx
+	jne	.Lsqr4x_mont_inner
+
+	subq	0(%rsp),%rcx
+	movq	%r8,%r14
+
+	xorq	%r13,%r13
+	addq	%r11,%r12
+	adcq	$0,%r13
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%r9,%rax
+	adcq	%rdx,%r13
+	movq	%r12,-8(%rdi)
+
+	xorq	%r11,%r11
+	addq	(%rdi),%r10
+	adcq	$0,%r11
+	movq	0(%rsi,%rcx,1),%rbx
+	addq	%rbp,%r10
+	adcq	$0,%r11
+
+	imulq	16(%rdi,%rcx,1),%r14
+	xorq	%r12,%r12
+	movq	8(%rsi,%rcx,1),%r9
+	addq	%r10,%r13
+	movq	16(%rdi,%rcx,1),%r10
+	adcq	$0,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	adcq	%rdx,%r12
+	movq	%r13,(%rdi)
+
+	xorq	%rbp,%rbp
+	addq	8(%rdi),%r12
+	adcq	%rbp,%rbp
+	addq	%r11,%r12
+	leaq	16(%rdi),%rdi
+	adcq	$0,%rbp
+	movq	%r12,-8(%rdi)
+	cmpq	8(%rsp),%rdi
+	jb	.Lsqr4x_mont_outer
+
+	movq	0(%rsp),%r9
+	movq	%rbp,(%rdi)
+	movq	64(%rsp,%r9,1),%rax
+	leaq	64(%rsp,%r9,1),%rbx
+	movq	40(%rsp),%rsi
+	shrq	$5,%r9
+	movq	8(%rbx),%rdx
+	xorq	%rbp,%rbp
+
+	movq	32(%rsp),%rdi
+	subq	0(%rsi),%rax
+	movq	16(%rbx),%r10
+	movq	24(%rbx),%r11
+	sbbq	8(%rsi),%rdx
+	leaq	-1(%r9),%rcx
+	jmp	.Lsqr4x_sub
+.align	16
+.Lsqr4x_sub:
+	movq	%rax,0(%rdi,%rbp,8)
+	movq	%rdx,8(%rdi,%rbp,8)
+	sbbq	16(%rsi,%rbp,8),%r10
+	movq	32(%rbx,%rbp,8),%rax
+	movq	40(%rbx,%rbp,8),%rdx
+	sbbq	24(%rsi,%rbp,8),%r11
+	movq	%r10,16(%rdi,%rbp,8)
+	movq	%r11,24(%rdi,%rbp,8)
+	sbbq	32(%rsi,%rbp,8),%rax
+	movq	48(%rbx,%rbp,8),%r10
+	movq	56(%rbx,%rbp,8),%r11
+	sbbq	40(%rsi,%rbp,8),%rdx
+	leaq	4(%rbp),%rbp
+	decq	%rcx
+	jnz	.Lsqr4x_sub
+
+	movq	%rax,0(%rdi,%rbp,8)
+	movq	32(%rbx,%rbp,8),%rax
+	sbbq	16(%rsi,%rbp,8),%r10
+	movq	%rdx,8(%rdi,%rbp,8)
+	sbbq	24(%rsi,%rbp,8),%r11
+	movq	%r10,16(%rdi,%rbp,8)
+
+	sbbq	$0,%rax
+	movq	%r11,24(%rdi,%rbp,8)
+	xorq	%rbp,%rbp
+	andq	%rax,%rbx
+	notq	%rax
+	movq	%rdi,%rsi
+	andq	%rax,%rsi
+	leaq	-1(%r9),%rcx
+	orq	%rsi,%rbx
+
+	pxor	%xmm0,%xmm0
+	leaq	64(%rsp,%r9,8),%rsi
+	movdqu	(%rbx),%xmm1
+	leaq	(%rsi,%r9,8),%rsi
+	movdqa	%xmm0,64(%rsp)
+	movdqa	%xmm0,(%rsi)
+	movdqu	%xmm1,(%rdi)
+	jmp	.Lsqr4x_copy
+.align	16
+.Lsqr4x_copy:
+	movdqu	16(%rbx,%rbp,1),%xmm2
+	movdqu	32(%rbx,%rbp,1),%xmm1
+	movdqa	%xmm0,80(%rsp,%rbp,1)
+	movdqa	%xmm0,96(%rsp,%rbp,1)
+	movdqa	%xmm0,16(%rsi,%rbp,1)
+	movdqa	%xmm0,32(%rsi,%rbp,1)
+	movdqu	%xmm2,16(%rdi,%rbp,1)
+	movdqu	%xmm1,32(%rdi,%rbp,1)
+	leaq	32(%rbp),%rbp
+	decq	%rcx
+	jnz	.Lsqr4x_copy
+
+	movdqu	16(%rbx,%rbp,1),%xmm2
+	movdqa	%xmm0,80(%rsp,%rbp,1)
+	movdqa	%xmm0,16(%rsi,%rbp,1)
+	movdqu	%xmm2,16(%rdi,%rbp,1)
+	movq	56(%rsp),%rsi
+	movq	$1,%rax
+	movq	0(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lsqr4x_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_sqr4x_mont,.-bn_sqr4x_mont
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	16
Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont5.S
diff -u /dev/null src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont5.S:1.1
--- /dev/null	Sat May 16 22:23:31 2015
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/x86_64-mont5.S	Sat May 16 22:23:31 2015
@@ -0,0 +1,785 @@
+#include <machine/asm.h>
+.text	
+
+.globl	bn_mul_mont_gather5
+.type	bn_mul_mont_gather5,@function
+.align	64
+bn_mul_mont_gather5:
+	testl	$3,%r9d
+	jnz	.Lmul_enter
+	cmpl	$8,%r9d
+	jb	.Lmul_enter
+	jmp	.Lmul4x_enter
+
+.align	16
+.Lmul_enter:
+	movl	%r9d,%r9d
+	movl	8(%rsp),%r10d
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%rax
+	leaq	2(%r9),%r11
+	negq	%r11
+	leaq	(%rsp,%r11,8),%rsp
+	andq	$-1024,%rsp
+
+	movq	%rax,8(%rsp,%r9,8)
+.Lmul_body:
+	movq	%rdx,%r12
+	movq	%r10,%r11
+	shrq	$3,%r10
+	andq	$7,%r11
+	notq	%r10
+	leaq	.Lmagic_masks(%rip),%rax
+	andq	$3,%r10
+	leaq	96(%r12,%r11,8),%r12
+	movq	0(%rax,%r10,8),%xmm4
+	movq	8(%rax,%r10,8),%xmm5
+	movq	16(%rax,%r10,8),%xmm6
+	movq	24(%rax,%r10,8),%xmm7
+
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+.byte	102,72,15,126,195
+
+	movq	(%r8),%r8
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.L1st_enter
+
+.align	16
+.L1st:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	movq	%r10,%r11
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.L1st_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	1(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.L1st
+
+.byte	102,72,15,126,195
+
+	addq	%rax,%r13
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+	movq	%r10,%r11
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	jmp	.Louter
+.align	16
+.Louter:
+	xorq	%r15,%r15
+	movq	%r8,%rbp
+	movq	(%rsp),%r10
+
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	8(%rsp),%r10
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.Linner_enter
+
+.align	16
+.Linner:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.Linner_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	leaq	1(%r15),%r15
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.Linner
+
+.byte	102,72,15,126,195
+
+	addq	%rax,%r13
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	cmpq	%r9,%r14
+	jl	.Louter
+
+	xorq	%r14,%r14
+	movq	(%rsp),%rax
+	leaq	(%rsp),%rsi
+	movq	%r9,%r15
+	jmp	.Lsub
+.align	16
+.Lsub:	sbbq	(%rcx,%r14,8),%rax
+	movq	%rax,(%rdi,%r14,8)
+	movq	8(%rsi,%r14,8),%rax
+	leaq	1(%r14),%r14
+	decq	%r15
+	jnz	.Lsub
+
+	sbbq	$0,%rax
+	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
+	movq	%r9,%r15
+	orq	%rcx,%rsi
+.align	16
+.Lcopy:
+	movq	(%rsi,%r14,8),%rax
+	movq	%r14,(%rsp,%r14,8)
+	movq	%rax,(%rdi,%r14,8)
+	leaq	1(%r14),%r14
+	subq	$1,%r15
+	jnz	.Lcopy
+
+	movq	8(%rsp,%r9,8),%rsi
+	movq	$1,%rax
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lmul_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
+.type	bn_mul4x_mont_gather5,@function
+.align	16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+	movl	%r9d,%r9d
+	movl	8(%rsp),%r10d
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%rax
+	leaq	4(%r9),%r11
+	negq	%r11
+	leaq	(%rsp,%r11,8),%rsp
+	andq	$-1024,%rsp
+
+	movq	%rax,8(%rsp,%r9,8)
+.Lmul4x_body:
+	movq	%rdi,16(%rsp,%r9,8)
+	movq	%rdx,%r12
+	movq	%r10,%r11
+	shrq	$3,%r10
+	andq	$7,%r11
+	notq	%r10
+	leaq	.Lmagic_masks(%rip),%rax
+	andq	$3,%r10
+	leaq	96(%r12,%r11,8),%r12
+	movq	0(%rax,%r10,8),%xmm4
+	movq	8(%rax,%r10,8),%xmm5
+	movq	16(%rax,%r10,8),%xmm6
+	movq	24(%rax,%r10,8),%xmm7
+
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+.byte	102,72,15,126,195
+	movq	(%r8),%r8
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	4(%r15),%r15
+	adcq	$0,%rdx
+	movq	%rdi,(%rsp)
+	movq	%rdx,%r13
+	jmp	.L1st4x
+.align	16
+.L1st4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	4(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	-16(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%r13
+	cmpq	%r9,%r15
+	jl	.L1st4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.byte	102,72,15,126,195
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdi,(%rsp,%r15,8)
+
+	leaq	1(%r14),%r14
+.align	4
+.Louter4x:
+	xorq	%r15,%r15
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+
+	movq	(%rsp),%r10
+	movq	%r8,%rbp
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	8(%rsp),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	4(%r15),%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	jmp	.Linner4x
+.align	16
+.Linner4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-16(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	leaq	4(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	-16(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%r13,-40(%rsp,%r15,8)
+	movq	%rdx,%r13
+	cmpq	%r9,%r15
+	jl	.Linner4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-16(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	leaq	1(%r14),%r14
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.byte	102,72,15,126,195
+	movq	%rdi,-16(%rsp,%r15,8)
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	addq	(%rsp,%r9,8),%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdi,(%rsp,%r15,8)
+
+	cmpq	%r9,%r14
+	jl	.Louter4x
+	movq	16(%rsp,%r9,8),%rdi
+	movq	0(%rsp),%rax
+	pxor	%xmm0,%xmm0
+	movq	8(%rsp),%rdx
+	shrq	$2,%r9
+	leaq	(%rsp),%rsi
+	xorq	%r14,%r14
+
+	subq	0(%rcx),%rax
+	movq	16(%rsi),%rbx
+	movq	24(%rsi),%rbp
+	sbbq	8(%rcx),%rdx
+	leaq	-1(%r9),%r15
+	jmp	.Lsub4x
+.align	16
+.Lsub4x:
+	movq	%rax,0(%rdi,%r14,8)
+	movq	%rdx,8(%rdi,%r14,8)
+	sbbq	16(%rcx,%r14,8),%rbx
+	movq	32(%rsi,%r14,8),%rax
+	movq	40(%rsi,%r14,8),%rdx
+	sbbq	24(%rcx,%r14,8),%rbp
+	movq	%rbx,16(%rdi,%r14,8)
+	movq	%rbp,24(%rdi,%r14,8)
+	sbbq	32(%rcx,%r14,8),%rax
+	movq	48(%rsi,%r14,8),%rbx
+	movq	56(%rsi,%r14,8),%rbp
+	sbbq	40(%rcx,%r14,8),%rdx
+	leaq	4(%r14),%r14
+	decq	%r15
+	jnz	.Lsub4x
+
+	movq	%rax,0(%rdi,%r14,8)
+	movq	32(%rsi,%r14,8),%rax
+	sbbq	16(%rcx,%r14,8),%rbx
+	movq	%rdx,8(%rdi,%r14,8)
+	sbbq	24(%rcx,%r14,8),%rbp
+	movq	%rbx,16(%rdi,%r14,8)
+
+	sbbq	$0,%rax
+	movq	%rbp,24(%rdi,%r14,8)
+	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
+	leaq	-1(%r9),%r15
+	orq	%rcx,%rsi
+
+	movdqu	(%rsi),%xmm1
+	movdqa	%xmm0,(%rsp)
+	movdqu	%xmm1,(%rdi)
+	jmp	.Lcopy4x
+.align	16
+.Lcopy4x:
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqu	32(%rsi,%r14,1),%xmm1
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
+	movdqa	%xmm0,32(%rsp,%r14,1)
+	movdqu	%xmm1,32(%rdi,%r14,1)
+	leaq	32(%r14),%r14
+	decq	%r15
+	jnz	.Lcopy4x
+
+	shlq	$2,%r9
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
+	movq	8(%rsp,%r9,8),%rsi
+	movq	$1,%rax
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lmul4x_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+.globl	bn_scatter5
+.type	bn_scatter5,@function
+.align	16
+bn_scatter5:
+	cmpq	$0,%rsi
+	jz	.Lscatter_epilogue
+	leaq	(%rdx,%rcx,8),%rdx
+.Lscatter:
+	movq	(%rdi),%rax
+	leaq	8(%rdi),%rdi
+	movq	%rax,(%rdx)
+	leaq	256(%rdx),%rdx
+	subq	$1,%rsi
+	jnz	.Lscatter
+.Lscatter_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_scatter5,.-bn_scatter5
+
+.globl	bn_gather5
+.type	bn_gather5,@function
+.align	16
+bn_gather5:
+	movq	%rcx,%r11
+	shrq	$3,%rcx
+	andq	$7,%r11
+	notq	%rcx
+	leaq	.Lmagic_masks(%rip),%rax
+	andq	$3,%rcx
+	leaq	96(%rdx,%r11,8),%rdx
+	movq	0(%rax,%rcx,8),%xmm4
+	movq	8(%rax,%rcx,8),%xmm5
+	movq	16(%rax,%rcx,8),%xmm6
+	movq	24(%rax,%rcx,8),%xmm7
+	jmp	.Lgather
+.align	16
+.Lgather:
+	movq	-96(%rdx),%xmm0
+	movq	-32(%rdx),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%rdx),%xmm2
+	pand	%xmm5,%xmm1
+	movq	96(%rdx),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	leaq	256(%rdx),%rdx
+	por	%xmm3,%xmm0
+
+	movq	%xmm0,(%rdi)
+	leaq	8(%rdi),%rdi
+	subq	$1,%rsi
+	jnz	.Lgather
+	.byte	0xf3,0xc3
+.LSEH_end_bn_gather5:
+.size	bn_gather5,.-bn_gather5
+.align	64
+.Lmagic_masks:
+.long	0,0, 0,0, 0,0, -1,-1
+.long	0,0, 0,0, 0,0,  0,0
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0

Reply via email to