Module Name:    src
Committed By:   joerg
Date:           Sat May 16 17:32:54 UTC 2015

Modified Files:
        src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386: Makefile
            bn-586.S ghash-x86.S sha1-586.S sha512-586.S x86cpuid.S
Added Files:
        src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386: modes.inc

Log Message:
Optimize i386 support in libcrypto:
- Enable the optional SSE2 assembler versions and regenerate the checked-in files.
- Hook up the assembler version of GCM.
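
The SSE2 code paths are selected at run time, not at compile time: each
regenerated routine probes libcrypto's CPU capability vector before branching
to its SSE2 body.  A minimal C sketch of that probe, assuming only what the
regenerated assembly below shows (the "btl $26,(%eax)" test against the
8-byte OPENSSL_ia32cap_P symbol); the helper name have_sse2() is hypothetical
and not part of this commit:

    /* OPENSSL_ia32cap_P[0] mirrors CPUID(1).EDX; bit 26 is the SSE2 flag,
     * the same bit the generated code tests with "btl $26,(%eax)". */
    extern unsigned int OPENSSL_ia32cap_P[2];  /* cf. ".comm OPENSSL_ia32cap_P,8,4" */

    static int have_sse2(void)
    {
        return (OPENSSL_ia32cap_P[0] >> 26) & 1;
    }

On CPUs without SSE2 the routines fall through to the existing integer-only
code (e.g. the "jnc .L001maw_non_sse2" branch below), so older i386 hardware
keeps working unchanged.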


To generate a diff of this commit:
cvs rdiff -u -r1.6 -r1.7 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile
cvs rdiff -u -r1.5 -r1.6 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S
cvs rdiff -u -r1.2 -r1.3 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/ghash-x86.S \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha512-586.S
cvs rdiff -u -r0 -r1.1 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/modes.inc
cvs rdiff -u -r1.3 -r1.4 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha1-586.S
cvs rdiff -u -r1.8 -r1.9 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/x86cpuid.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile:1.6 src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile:1.7
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile:1.6	Mon Jul 30 10:25:24 2012
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile	Sat May 16 17:32:54 2015
@@ -1,4 +1,4 @@
-#	$NetBSD: Makefile,v 1.6 2012/07/30 10:25:24 christos Exp $
+#	$NetBSD: Makefile,v 1.7 2015/05/16 17:32:54 joerg Exp $
 
 .include "bsd.own.mk"
 
@@ -9,7 +9,7 @@ regen:
 	for i in $$(find ${OPENSSLSRC} -name \*86.pl) \
 		  ${OPENSSLSRC}/crypto/x86cpuid.pl; do \
 		perl -I${OPENSSLSRC}/crypto/perlasm \
-		-I${OPENSSLSRC}/crypto/bn/asm $$i elf -fPIC \
+		-I${OPENSSLSRC}/crypto/bn/asm $$i elf -fPIC -DOPENSSL_IA32_SSE2 \
 		| sed -e 's,^\.file.*$$,#include <machine/asm.h>,' \
 			-e 's/	call	OPENSSL_cpuid_setup/	PIC_PROLOGUE!	call	PIC_PLT(OPENSSL_cpuid_setup)!	PIC_EPILOGUE/' | tr '!' '\n' \
 		> $$(basename $$i .pl).S; \

Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S:1.5 src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S:1.6
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S:1.5	Fri Jul 27 19:34:14 2012
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S	Sat May 16 17:32:54 2015
@@ -5,6 +5,103 @@
 .align	16
 bn_mul_add_words:
 .L_bn_mul_add_words_begin:
+	call	.L000PIC_me_up
+.L000PIC_me_up:
+	popl	%eax
+	leal	_GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%eax),%eax
+	movl	OPENSSL_ia32cap_P@GOT(%eax),%eax
+	btl	$26,(%eax)
+	jnc	.L001maw_non_sse2
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	movl	12(%esp),%ecx
+	movd	16(%esp),%mm0
+	pxor	%mm1,%mm1
+	jmp	.L002maw_sse2_entry
+.align	16
+.L003maw_sse2_unrolled:
+	movd	(%eax),%mm3
+	paddq	%mm3,%mm1
+	movd	(%edx),%mm2
+	pmuludq	%mm0,%mm2
+	movd	4(%edx),%mm4
+	pmuludq	%mm0,%mm4
+	movd	8(%edx),%mm6
+	pmuludq	%mm0,%mm6
+	movd	12(%edx),%mm7
+	pmuludq	%mm0,%mm7
+	paddq	%mm2,%mm1
+	movd	4(%eax),%mm3
+	paddq	%mm4,%mm3
+	movd	8(%eax),%mm5
+	paddq	%mm6,%mm5
+	movd	12(%eax),%mm4
+	paddq	%mm4,%mm7
+	movd	%mm1,(%eax)
+	movd	16(%edx),%mm2
+	pmuludq	%mm0,%mm2
+	psrlq	$32,%mm1
+	movd	20(%edx),%mm4
+	pmuludq	%mm0,%mm4
+	paddq	%mm3,%mm1
+	movd	24(%edx),%mm6
+	pmuludq	%mm0,%mm6
+	movd	%mm1,4(%eax)
+	psrlq	$32,%mm1
+	movd	28(%edx),%mm3
+	addl	$32,%edx
+	pmuludq	%mm0,%mm3
+	paddq	%mm5,%mm1
+	movd	16(%eax),%mm5
+	paddq	%mm5,%mm2
+	movd	%mm1,8(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm7,%mm1
+	movd	20(%eax),%mm5
+	paddq	%mm5,%mm4
+	movd	%mm1,12(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm2,%mm1
+	movd	24(%eax),%mm5
+	paddq	%mm5,%mm6
+	movd	%mm1,16(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm4,%mm1
+	movd	28(%eax),%mm5
+	paddq	%mm5,%mm3
+	movd	%mm1,20(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm6,%mm1
+	movd	%mm1,24(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm3,%mm1
+	movd	%mm1,28(%eax)
+	leal	32(%eax),%eax
+	psrlq	$32,%mm1
+	subl	$8,%ecx
+	jz	.L004maw_sse2_exit
+.L002maw_sse2_entry:
+	testl	$4294967288,%ecx
+	jnz	.L003maw_sse2_unrolled
+.align	4
+.L005maw_sse2_loop:
+	movd	(%edx),%mm2
+	movd	(%eax),%mm3
+	pmuludq	%mm0,%mm2
+	leal	4(%edx),%edx
+	paddq	%mm3,%mm1
+	paddq	%mm2,%mm1
+	movd	%mm1,(%eax)
+	subl	$1,%ecx
+	psrlq	$32,%mm1
+	leal	4(%eax),%eax
+	jnz	.L005maw_sse2_loop
+.L004maw_sse2_exit:
+	movd	%mm1,%eax
+	emms
+	ret
+.align	16
+.L001maw_non_sse2:
 	pushl	%ebp
 	pushl	%ebx
 	pushl	%esi
@@ -17,9 +114,9 @@ bn_mul_add_words:
 	andl	$4294967288,%ecx
 	movl	32(%esp),%ebp
 	pushl	%ecx
-	jz	.L000maw_finish
+	jz	.L006maw_finish
 .align	16
-.L001maw_loop:
+.L007maw_loop:
 
 	movl	(%ebx),%eax
 	mull	%ebp
@@ -96,13 +193,13 @@ bn_mul_add_words:
 	subl	$8,%ecx
 	leal	32(%ebx),%ebx
 	leal	32(%edi),%edi
-	jnz	.L001maw_loop
-.L000maw_finish:
+	jnz	.L007maw_loop
+.L006maw_finish:
 	movl	32(%esp),%ecx
 	andl	$7,%ecx
-	jnz	.L002maw_finish2
-	jmp	.L003maw_end
-.L002maw_finish2:
+	jnz	.L008maw_finish2
+	jmp	.L009maw_end
+.L008maw_finish2:
 
 	movl	(%ebx),%eax
 	mull	%ebp
@@ -113,7 +210,7 @@ bn_mul_add_words:
 	decl	%ecx
 	movl	%eax,(%edi)
 	movl	%edx,%esi
-	jz	.L003maw_end
+	jz	.L009maw_end
 
 	movl	4(%ebx),%eax
 	mull	%ebp
@@ -124,7 +221,7 @@ bn_mul_add_words:
 	decl	%ecx
 	movl	%eax,4(%edi)
 	movl	%edx,%esi
-	jz	.L003maw_end
+	jz	.L009maw_end
 
 	movl	8(%ebx),%eax
 	mull	%ebp
@@ -135,7 +232,7 @@ bn_mul_add_words:
 	decl	%ecx
 	movl	%eax,8(%edi)
 	movl	%edx,%esi
-	jz	.L003maw_end
+	jz	.L009maw_end
 
 	movl	12(%ebx),%eax
 	mull	%ebp
@@ -146,7 +243,7 @@ bn_mul_add_words:
 	decl	%ecx
 	movl	%eax,12(%edi)
 	movl	%edx,%esi
-	jz	.L003maw_end
+	jz	.L009maw_end
 
 	movl	16(%ebx),%eax
 	mull	%ebp
@@ -157,7 +254,7 @@ bn_mul_add_words:
 	decl	%ecx
 	movl	%eax,16(%edi)
 	movl	%edx,%esi
-	jz	.L003maw_end
+	jz	.L009maw_end
 
 	movl	20(%ebx),%eax
 	mull	%ebp
@@ -168,7 +265,7 @@ bn_mul_add_words:
 	decl	%ecx
 	movl	%eax,20(%edi)
 	movl	%edx,%esi
-	jz	.L003maw_end
+	jz	.L009maw_end
 
 	movl	24(%ebx),%eax
 	mull	%ebp
@@ -178,7 +275,7 @@ bn_mul_add_words:
 	adcl	$0,%edx
 	movl	%eax,24(%edi)
 	movl	%edx,%esi
-.L003maw_end:
+.L009maw_end:
 	movl	%esi,%eax
 	popl	%ecx
 	popl	%edi
@@ -192,6 +289,34 @@ bn_mul_add_words:
 .align	16
 bn_mul_words:
 .L_bn_mul_words_begin:
+	call	.L010PIC_me_up
+.L010PIC_me_up:
+	popl	%eax
+	leal	_GLOBAL_OFFSET_TABLE_+[.-.L010PIC_me_up](%eax),%eax
+	movl	OPENSSL_ia32cap_P@GOT(%eax),%eax
+	btl	$26,(%eax)
+	jnc	.L011mw_non_sse2
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	movl	12(%esp),%ecx
+	movd	16(%esp),%mm0
+	pxor	%mm1,%mm1
+.align	16
+.L012mw_sse2_loop:
+	movd	(%edx),%mm2
+	pmuludq	%mm0,%mm2
+	leal	4(%edx),%edx
+	paddq	%mm2,%mm1
+	movd	%mm1,(%eax)
+	subl	$1,%ecx
+	psrlq	$32,%mm1
+	leal	4(%eax),%eax
+	jnz	.L012mw_sse2_loop
+	movd	%mm1,%eax
+	emms
+	ret
+.align	16
+.L011mw_non_sse2:
 	pushl	%ebp
 	pushl	%ebx
 	pushl	%esi
@@ -203,8 +328,8 @@ bn_mul_words:
 	movl	28(%esp),%ebp
 	movl	32(%esp),%ecx
 	andl	$4294967288,%ebp
-	jz	.L004mw_finish
-.L005mw_loop:
+	jz	.L013mw_finish
+.L014mw_loop:
 
 	movl	(%ebx),%eax
 	mull	%ecx
@@ -265,14 +390,14 @@ bn_mul_words:
 	addl	$32,%ebx
 	addl	$32,%edi
 	subl	$8,%ebp
-	jz	.L004mw_finish
-	jmp	.L005mw_loop
-.L004mw_finish:
+	jz	.L013mw_finish
+	jmp	.L014mw_loop
+.L013mw_finish:
 	movl	28(%esp),%ebp
 	andl	$7,%ebp
-	jnz	.L006mw_finish2
-	jmp	.L007mw_end
-.L006mw_finish2:
+	jnz	.L015mw_finish2
+	jmp	.L016mw_end
+.L015mw_finish2:
 
 	movl	(%ebx),%eax
 	mull	%ecx
@@ -281,7 +406,7 @@ bn_mul_words:
 	movl	%eax,(%edi)
 	movl	%edx,%esi
 	decl	%ebp
-	jz	.L007mw_end
+	jz	.L016mw_end
 
 	movl	4(%ebx),%eax
 	mull	%ecx
@@ -290,7 +415,7 @@ bn_mul_words:
 	movl	%eax,4(%edi)
 	movl	%edx,%esi
 	decl	%ebp
-	jz	.L007mw_end
+	jz	.L016mw_end
 
 	movl	8(%ebx),%eax
 	mull	%ecx
@@ -299,7 +424,7 @@ bn_mul_words:
 	movl	%eax,8(%edi)
 	movl	%edx,%esi
 	decl	%ebp
-	jz	.L007mw_end
+	jz	.L016mw_end
 
 	movl	12(%ebx),%eax
 	mull	%ecx
@@ -308,7 +433,7 @@ bn_mul_words:
 	movl	%eax,12(%edi)
 	movl	%edx,%esi
 	decl	%ebp
-	jz	.L007mw_end
+	jz	.L016mw_end
 
 	movl	16(%ebx),%eax
 	mull	%ecx
@@ -317,7 +442,7 @@ bn_mul_words:
 	movl	%eax,16(%edi)
 	movl	%edx,%esi
 	decl	%ebp
-	jz	.L007mw_end
+	jz	.L016mw_end
 
 	movl	20(%ebx),%eax
 	mull	%ecx
@@ -326,7 +451,7 @@ bn_mul_words:
 	movl	%eax,20(%edi)
 	movl	%edx,%esi
 	decl	%ebp
-	jz	.L007mw_end
+	jz	.L016mw_end
 
 	movl	24(%ebx),%eax
 	mull	%ecx
@@ -334,7 +459,7 @@ bn_mul_words:
 	adcl	$0,%edx
 	movl	%eax,24(%edi)
 	movl	%edx,%esi
-.L007mw_end:
+.L016mw_end:
 	movl	%esi,%eax
 	popl	%edi
 	popl	%esi
@@ -347,6 +472,29 @@ bn_mul_words:
 .align	16
 bn_sqr_words:
 .L_bn_sqr_words_begin:
+	call	.L017PIC_me_up
+.L017PIC_me_up:
+	popl	%eax
+	leal	_GLOBAL_OFFSET_TABLE_+[.-.L017PIC_me_up](%eax),%eax
+	movl	OPENSSL_ia32cap_P@GOT(%eax),%eax
+	btl	$26,(%eax)
+	jnc	.L018sqr_non_sse2
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	movl	12(%esp),%ecx
+.align	16
+.L019sqr_sse2_loop:
+	movd	(%edx),%mm0
+	pmuludq	%mm0,%mm0
+	leal	4(%edx),%edx
+	movq	%mm0,(%eax)
+	subl	$1,%ecx
+	leal	8(%eax),%eax
+	jnz	.L019sqr_sse2_loop
+	emms
+	ret
+.align	16
+.L018sqr_non_sse2:
 	pushl	%ebp
 	pushl	%ebx
 	pushl	%esi
@@ -356,8 +504,8 @@ bn_sqr_words:
 	movl	24(%esp),%edi
 	movl	28(%esp),%ebx
 	andl	$4294967288,%ebx
-	jz	.L008sw_finish
-.L009sw_loop:
+	jz	.L020sw_finish
+.L021sw_loop:
 
 	movl	(%edi),%eax
 	mull	%eax
@@ -402,59 +550,59 @@ bn_sqr_words:
 	addl	$32,%edi
 	addl	$64,%esi
 	subl	$8,%ebx
-	jnz	.L009sw_loop
-.L008sw_finish:
+	jnz	.L021sw_loop
+.L020sw_finish:
 	movl	28(%esp),%ebx
 	andl	$7,%ebx
-	jz	.L010sw_end
+	jz	.L022sw_end
 
 	movl	(%edi),%eax
 	mull	%eax
 	movl	%eax,(%esi)
 	decl	%ebx
 	movl	%edx,4(%esi)
-	jz	.L010sw_end
+	jz	.L022sw_end
 
 	movl	4(%edi),%eax
 	mull	%eax
 	movl	%eax,8(%esi)
 	decl	%ebx
 	movl	%edx,12(%esi)
-	jz	.L010sw_end
+	jz	.L022sw_end
 
 	movl	8(%edi),%eax
 	mull	%eax
 	movl	%eax,16(%esi)
 	decl	%ebx
 	movl	%edx,20(%esi)
-	jz	.L010sw_end
+	jz	.L022sw_end
 
 	movl	12(%edi),%eax
 	mull	%eax
 	movl	%eax,24(%esi)
 	decl	%ebx
 	movl	%edx,28(%esi)
-	jz	.L010sw_end
+	jz	.L022sw_end
 
 	movl	16(%edi),%eax
 	mull	%eax
 	movl	%eax,32(%esi)
 	decl	%ebx
 	movl	%edx,36(%esi)
-	jz	.L010sw_end
+	jz	.L022sw_end
 
 	movl	20(%edi),%eax
 	mull	%eax
 	movl	%eax,40(%esi)
 	decl	%ebx
 	movl	%edx,44(%esi)
-	jz	.L010sw_end
+	jz	.L022sw_end
 
 	movl	24(%edi),%eax
 	mull	%eax
 	movl	%eax,48(%esi)
 	movl	%edx,52(%esi)
-.L010sw_end:
+.L022sw_end:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -488,8 +636,8 @@ bn_add_words:
 	movl	32(%esp),%ebp
 	xorl	%eax,%eax
 	andl	$4294967288,%ebp
-	jz	.L011aw_finish
-.L012aw_loop:
+	jz	.L023aw_finish
+.L024aw_loop:
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -567,11 +715,11 @@ bn_add_words:
 	addl	$32,%edi
 	addl	$32,%ebx
 	subl	$8,%ebp
-	jnz	.L012aw_loop
-.L011aw_finish:
+	jnz	.L024aw_loop
+.L023aw_finish:
 	movl	32(%esp),%ebp
 	andl	$7,%ebp
-	jz	.L013aw_end
+	jz	.L025aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -582,7 +730,7 @@ bn_add_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,(%ebx)
-	jz	.L013aw_end
+	jz	.L025aw_end
 
 	movl	4(%esi),%ecx
 	movl	4(%edi),%edx
@@ -593,7 +741,7 @@ bn_add_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,4(%ebx)
-	jz	.L013aw_end
+	jz	.L025aw_end
 
 	movl	8(%esi),%ecx
 	movl	8(%edi),%edx
@@ -604,7 +752,7 @@ bn_add_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,8(%ebx)
-	jz	.L013aw_end
+	jz	.L025aw_end
 
 	movl	12(%esi),%ecx
 	movl	12(%edi),%edx
@@ -615,7 +763,7 @@ bn_add_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,12(%ebx)
-	jz	.L013aw_end
+	jz	.L025aw_end
 
 	movl	16(%esi),%ecx
 	movl	16(%edi),%edx
@@ -626,7 +774,7 @@ bn_add_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,16(%ebx)
-	jz	.L013aw_end
+	jz	.L025aw_end
 
 	movl	20(%esi),%ecx
 	movl	20(%edi),%edx
@@ -637,7 +785,7 @@ bn_add_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,20(%ebx)
-	jz	.L013aw_end
+	jz	.L025aw_end
 
 	movl	24(%esi),%ecx
 	movl	24(%edi),%edx
@@ -647,7 +795,7 @@ bn_add_words:
 	addl	%edx,%ecx
 	adcl	$0,%eax
 	movl	%ecx,24(%ebx)
-.L013aw_end:
+.L025aw_end:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -670,8 +818,8 @@ bn_sub_words:
 	movl	32(%esp),%ebp
 	xorl	%eax,%eax
 	andl	$4294967288,%ebp
-	jz	.L014aw_finish
-.L015aw_loop:
+	jz	.L026aw_finish
+.L027aw_loop:
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -749,11 +897,11 @@ bn_sub_words:
 	addl	$32,%edi
 	addl	$32,%ebx
 	subl	$8,%ebp
-	jnz	.L015aw_loop
-.L014aw_finish:
+	jnz	.L027aw_loop
+.L026aw_finish:
 	movl	32(%esp),%ebp
 	andl	$7,%ebp
-	jz	.L016aw_end
+	jz	.L028aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -764,7 +912,7 @@ bn_sub_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,(%ebx)
-	jz	.L016aw_end
+	jz	.L028aw_end
 
 	movl	4(%esi),%ecx
 	movl	4(%edi),%edx
@@ -775,7 +923,7 @@ bn_sub_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,4(%ebx)
-	jz	.L016aw_end
+	jz	.L028aw_end
 
 	movl	8(%esi),%ecx
 	movl	8(%edi),%edx
@@ -786,7 +934,7 @@ bn_sub_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,8(%ebx)
-	jz	.L016aw_end
+	jz	.L028aw_end
 
 	movl	12(%esi),%ecx
 	movl	12(%edi),%edx
@@ -797,7 +945,7 @@ bn_sub_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,12(%ebx)
-	jz	.L016aw_end
+	jz	.L028aw_end
 
 	movl	16(%esi),%ecx
 	movl	16(%edi),%edx
@@ -808,7 +956,7 @@ bn_sub_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,16(%ebx)
-	jz	.L016aw_end
+	jz	.L028aw_end
 
 	movl	20(%esi),%ecx
 	movl	20(%edi),%edx
@@ -819,7 +967,7 @@ bn_sub_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,20(%ebx)
-	jz	.L016aw_end
+	jz	.L028aw_end
 
 	movl	24(%esi),%ecx
 	movl	24(%edi),%edx
@@ -829,7 +977,7 @@ bn_sub_words:
 	subl	%edx,%ecx
 	adcl	$0,%eax
 	movl	%ecx,24(%ebx)
-.L016aw_end:
+.L028aw_end:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -852,8 +1000,8 @@ bn_sub_part_words:
 	movl	32(%esp),%ebp
 	xorl	%eax,%eax
 	andl	$4294967288,%ebp
-	jz	.L017aw_finish
-.L018aw_loop:
+	jz	.L029aw_finish
+.L030aw_loop:
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -931,11 +1079,11 @@ bn_sub_part_words:
 	addl	$32,%edi
 	addl	$32,%ebx
 	subl	$8,%ebp
-	jnz	.L018aw_loop
-.L017aw_finish:
+	jnz	.L030aw_loop
+.L029aw_finish:
 	movl	32(%esp),%ebp
 	andl	$7,%ebp
-	jz	.L019aw_end
+	jz	.L031aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -949,7 +1097,7 @@ bn_sub_part_words:
 	addl	$4,%edi
 	addl	$4,%ebx
 	decl	%ebp
-	jz	.L019aw_end
+	jz	.L031aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -963,7 +1111,7 @@ bn_sub_part_words:
 	addl	$4,%edi
 	addl	$4,%ebx
 	decl	%ebp
-	jz	.L019aw_end
+	jz	.L031aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -977,7 +1125,7 @@ bn_sub_part_words:
 	addl	$4,%edi
 	addl	$4,%ebx
 	decl	%ebp
-	jz	.L019aw_end
+	jz	.L031aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -991,7 +1139,7 @@ bn_sub_part_words:
 	addl	$4,%edi
 	addl	$4,%ebx
 	decl	%ebp
-	jz	.L019aw_end
+	jz	.L031aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -1005,7 +1153,7 @@ bn_sub_part_words:
 	addl	$4,%edi
 	addl	$4,%ebx
 	decl	%ebp
-	jz	.L019aw_end
+	jz	.L031aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -1019,7 +1167,7 @@ bn_sub_part_words:
 	addl	$4,%edi
 	addl	$4,%ebx
 	decl	%ebp
-	jz	.L019aw_end
+	jz	.L031aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -1032,20 +1180,20 @@ bn_sub_part_words:
 	addl	$4,%esi
 	addl	$4,%edi
 	addl	$4,%ebx
-.L019aw_end:
+.L031aw_end:
 	cmpl	$0,36(%esp)
-	je	.L020pw_end
+	je	.L032pw_end
 	movl	36(%esp),%ebp
 	cmpl	$0,%ebp
-	je	.L020pw_end
-	jge	.L021pw_pos
+	je	.L032pw_end
+	jge	.L033pw_pos
 
 	movl	$0,%edx
 	subl	%ebp,%edx
 	movl	%edx,%ebp
 	andl	$4294967288,%ebp
-	jz	.L022pw_neg_finish
-.L023pw_neg_loop:
+	jz	.L034pw_neg_finish
+.L035pw_neg_loop:
 
 	movl	$0,%ecx
 	movl	(%edi),%edx
@@ -1122,13 +1270,13 @@ bn_sub_part_words:
 	addl	$32,%edi
 	addl	$32,%ebx
 	subl	$8,%ebp
-	jnz	.L023pw_neg_loop
-.L022pw_neg_finish:
+	jnz	.L035pw_neg_loop
+.L034pw_neg_finish:
 	movl	36(%esp),%edx
 	movl	$0,%ebp
 	subl	%edx,%ebp
 	andl	$7,%ebp
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	$0,%ecx
 	movl	(%edi),%edx
@@ -1139,7 +1287,7 @@ bn_sub_part_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,(%ebx)
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	$0,%ecx
 	movl	4(%edi),%edx
@@ -1150,7 +1298,7 @@ bn_sub_part_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,4(%ebx)
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	$0,%ecx
 	movl	8(%edi),%edx
@@ -1161,7 +1309,7 @@ bn_sub_part_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,8(%ebx)
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	$0,%ecx
 	movl	12(%edi),%edx
@@ -1172,7 +1320,7 @@ bn_sub_part_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,12(%ebx)
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	$0,%ecx
 	movl	16(%edi),%edx
@@ -1183,7 +1331,7 @@ bn_sub_part_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,16(%ebx)
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	$0,%ecx
 	movl	20(%edi),%edx
@@ -1194,7 +1342,7 @@ bn_sub_part_words:
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,20(%ebx)
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	$0,%ecx
 	movl	24(%edi),%edx
@@ -1204,181 +1352,182 @@ bn_sub_part_words:
 	subl	%edx,%ecx
 	adcl	$0,%eax
 	movl	%ecx,24(%ebx)
-	jmp	.L020pw_end
-.L021pw_pos:
+	jmp	.L032pw_end
+.L033pw_pos:
 	andl	$4294967288,%ebp
-	jz	.L024pw_pos_finish
-.L025pw_pos_loop:
+	jz	.L036pw_pos_finish
+.L037pw_pos_loop:
 
 	movl	(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,(%ebx)
-	jnc	.L026pw_nc0
+	jnc	.L038pw_nc0
 
 	movl	4(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,4(%ebx)
-	jnc	.L027pw_nc1
+	jnc	.L039pw_nc1
 
 	movl	8(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,8(%ebx)
-	jnc	.L028pw_nc2
+	jnc	.L040pw_nc2
 
 	movl	12(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,12(%ebx)
-	jnc	.L029pw_nc3
+	jnc	.L041pw_nc3
 
 	movl	16(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,16(%ebx)
-	jnc	.L030pw_nc4
+	jnc	.L042pw_nc4
 
 	movl	20(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,20(%ebx)
-	jnc	.L031pw_nc5
+	jnc	.L043pw_nc5
 
 	movl	24(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,24(%ebx)
-	jnc	.L032pw_nc6
+	jnc	.L044pw_nc6
 
 	movl	28(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,28(%ebx)
-	jnc	.L033pw_nc7
+	jnc	.L045pw_nc7
 
 	addl	$32,%esi
 	addl	$32,%ebx
 	subl	$8,%ebp
-	jnz	.L025pw_pos_loop
-.L024pw_pos_finish:
+	jnz	.L037pw_pos_loop
+.L036pw_pos_finish:
 	movl	36(%esp),%ebp
 	andl	$7,%ebp
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,(%ebx)
-	jnc	.L034pw_tail_nc0
+	jnc	.L046pw_tail_nc0
 	decl	%ebp
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	4(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,4(%ebx)
-	jnc	.L035pw_tail_nc1
+	jnc	.L047pw_tail_nc1
 	decl	%ebp
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	8(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,8(%ebx)
-	jnc	.L036pw_tail_nc2
+	jnc	.L048pw_tail_nc2
 	decl	%ebp
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	12(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,12(%ebx)
-	jnc	.L037pw_tail_nc3
+	jnc	.L049pw_tail_nc3
 	decl	%ebp
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	16(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,16(%ebx)
-	jnc	.L038pw_tail_nc4
+	jnc	.L050pw_tail_nc4
 	decl	%ebp
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	20(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,20(%ebx)
-	jnc	.L039pw_tail_nc5
+	jnc	.L051pw_tail_nc5
 	decl	%ebp
-	jz	.L020pw_end
+	jz	.L032pw_end
 
 	movl	24(%esi),%ecx
 	subl	%eax,%ecx
 	movl	%ecx,24(%ebx)
-	jnc	.L040pw_tail_nc6
+	jnc	.L052pw_tail_nc6
 	movl	$1,%eax
-	jmp	.L020pw_end
-.L041pw_nc_loop:
+	jmp	.L032pw_end
+.L053pw_nc_loop:
 	movl	(%esi),%ecx
 	movl	%ecx,(%ebx)
-.L026pw_nc0:
+.L038pw_nc0:
 	movl	4(%esi),%ecx
 	movl	%ecx,4(%ebx)
-.L027pw_nc1:
+.L039pw_nc1:
 	movl	8(%esi),%ecx
 	movl	%ecx,8(%ebx)
-.L028pw_nc2:
+.L040pw_nc2:
 	movl	12(%esi),%ecx
 	movl	%ecx,12(%ebx)
-.L029pw_nc3:
+.L041pw_nc3:
 	movl	16(%esi),%ecx
 	movl	%ecx,16(%ebx)
-.L030pw_nc4:
+.L042pw_nc4:
 	movl	20(%esi),%ecx
 	movl	%ecx,20(%ebx)
-.L031pw_nc5:
+.L043pw_nc5:
 	movl	24(%esi),%ecx
 	movl	%ecx,24(%ebx)
-.L032pw_nc6:
+.L044pw_nc6:
 	movl	28(%esi),%ecx
 	movl	%ecx,28(%ebx)
-.L033pw_nc7:
+.L045pw_nc7:
 
 	addl	$32,%esi
 	addl	$32,%ebx
 	subl	$8,%ebp
-	jnz	.L041pw_nc_loop
+	jnz	.L053pw_nc_loop
 	movl	36(%esp),%ebp
 	andl	$7,%ebp
-	jz	.L042pw_nc_end
+	jz	.L054pw_nc_end
 	movl	(%esi),%ecx
 	movl	%ecx,(%ebx)
-.L034pw_tail_nc0:
+.L046pw_tail_nc0:
 	decl	%ebp
-	jz	.L042pw_nc_end
+	jz	.L054pw_nc_end
 	movl	4(%esi),%ecx
 	movl	%ecx,4(%ebx)
-.L035pw_tail_nc1:
+.L047pw_tail_nc1:
 	decl	%ebp
-	jz	.L042pw_nc_end
+	jz	.L054pw_nc_end
 	movl	8(%esi),%ecx
 	movl	%ecx,8(%ebx)
-.L036pw_tail_nc2:
+.L048pw_tail_nc2:
 	decl	%ebp
-	jz	.L042pw_nc_end
+	jz	.L054pw_nc_end
 	movl	12(%esi),%ecx
 	movl	%ecx,12(%ebx)
-.L037pw_tail_nc3:
+.L049pw_tail_nc3:
 	decl	%ebp
-	jz	.L042pw_nc_end
+	jz	.L054pw_nc_end
 	movl	16(%esi),%ecx
 	movl	%ecx,16(%ebx)
-.L038pw_tail_nc4:
+.L050pw_tail_nc4:
 	decl	%ebp
-	jz	.L042pw_nc_end
+	jz	.L054pw_nc_end
 	movl	20(%esi),%ecx
 	movl	%ecx,20(%ebx)
-.L039pw_tail_nc5:
+.L051pw_tail_nc5:
 	decl	%ebp
-	jz	.L042pw_nc_end
+	jz	.L054pw_nc_end
 	movl	24(%esi),%ecx
 	movl	%ecx,24(%ebx)
-.L040pw_tail_nc6:
-.L042pw_nc_end:
+.L052pw_tail_nc6:
+.L054pw_nc_end:
 	movl	$0,%eax
-.L020pw_end:
+.L032pw_end:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
 	popl	%ebp
 	ret
 .size	bn_sub_part_words,.-.L_bn_sub_part_words_begin
+.comm	OPENSSL_ia32cap_P,8,4

Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/ghash-x86.S
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/ghash-x86.S:1.2 src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/ghash-x86.S:1.3
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/ghash-x86.S:1.2	Fri Jul 27 19:34:14 2012
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/ghash-x86.S	Sat May 16 17:32:54 2015
@@ -203,418 +203,94 @@ gcm_ghash_4bit_x86:
 	popl	%ebp
 	ret
 .size	gcm_ghash_4bit_x86,.-.L_gcm_ghash_4bit_x86_begin
-.type	_mmx_gmult_4bit_inner,@function
+.globl	gcm_gmult_4bit_mmx
+.type	gcm_gmult_4bit_mmx,@function
 .align	16
-_mmx_gmult_4bit_inner:
+gcm_gmult_4bit_mmx:
+.L_gcm_gmult_4bit_mmx_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	call	.L005pic_point
+.L005pic_point:
+	popl	%eax
+	leal	.Lrem_4bit-.L005pic_point(%eax),%eax
+	movzbl	15(%edi),%ebx
 	xorl	%ecx,%ecx
 	movl	%ebx,%edx
 	movb	%dl,%cl
+	movl	$14,%ebp
 	shlb	$4,%cl
 	andl	$240,%edx
 	movq	8(%esi,%ecx,1),%mm0
 	movq	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	14(%edi),%cl
-	psllq	$60,%mm2
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
 	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
+	jmp	.L006mmx_loop
+.align	16
+.L006mmx_loop:
 	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
 	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
 	movq	%mm1,%mm2
 	psrlq	$4,%mm1
 	pxor	8(%esi,%edx,1),%mm0
-	movb	13(%edi),%cl
+	movb	(%edi,%ebp,1),%cl
 	psllq	$60,%mm2
 	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
+	decl	%ebp
 	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	12(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
 	pxor	(%esi,%edx,1),%mm1
 	movl	%ecx,%edx
-	movd	%mm0,%ebx
 	pxor	%mm2,%mm0
+	js	.L007mmx_break
 	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
 	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
 	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	11(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
 	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	10(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	9(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	8(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
 	movq	%mm1,%mm2
 	psrlq	$4,%mm1
 	pxor	8(%esi,%ecx,1),%mm0
 	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	7(%edi),%cl
-	psllq	$60,%mm2
 	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
 	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
 	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	6(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	5(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
 	pxor	%mm2,%mm0
+	jmp	.L006mmx_loop
+.align	16
+.L007mmx_break:
 	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
 	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	4(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
 	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
 	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	3(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
 	movq	%mm1,%mm2
 	psrlq	$4,%mm1
 	pxor	8(%esi,%ecx,1),%mm0
 	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	2(%edi),%cl
-	psllq	$60,%mm2
 	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
 	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
 	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	movb	1(%edi),%cl
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
-	movd	%mm0,%ebx
 	pxor	%mm2,%mm0
-	shlb	$4,%cl
 	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
 	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
 	movq	%mm1,%mm2
 	psrlq	$4,%mm1
 	pxor	8(%esi,%edx,1),%mm0
-	movb	(%edi),%cl
 	psllq	$60,%mm2
 	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
-	pxor	(%esi,%edx,1),%mm1
-	movl	%ecx,%edx
 	movd	%mm0,%ebx
-	pxor	%mm2,%mm0
-	shlb	$4,%cl
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%ecx,1),%mm0
-	psllq	$60,%mm2
-	andl	$240,%edx
-	pxor	(%eax,%ebp,8),%mm1
-	andl	$15,%ebx
-	pxor	(%esi,%ecx,1),%mm1
-	movd	%mm0,%ebp
-	pxor	%mm2,%mm0
-	psrlq	$4,%mm0
-	movq	%mm1,%mm2
-	psrlq	$4,%mm1
-	pxor	8(%esi,%edx,1),%mm0
-	psllq	$60,%mm2
-	pxor	(%eax,%ebx,8),%mm1
-	andl	$15,%ebp
 	pxor	(%esi,%edx,1),%mm1
-	movd	%mm0,%ebx
 	pxor	%mm2,%mm0
-	movl	4(%eax,%ebp,8),%edi
 	psrlq	$32,%mm0
 	movd	%mm1,%edx
 	psrlq	$32,%mm1
 	movd	%mm0,%ecx
 	movd	%mm1,%ebp
-	shll	$4,%edi
 	bswap	%ebx
 	bswap	%edx
 	bswap	%ecx
-	xorl	%edi,%ebp
 	bswap	%ebp
-	ret
-.size	_mmx_gmult_4bit_inner,.-_mmx_gmult_4bit_inner
-.globl	gcm_gmult_4bit_mmx
-.type	gcm_gmult_4bit_mmx,@function
-.align	16
-gcm_gmult_4bit_mmx:
-.L_gcm_gmult_4bit_mmx_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	movl	20(%esp),%edi
-	movl	24(%esp),%esi
-	call	.L005pic_point
-.L005pic_point:
-	popl	%eax
-	leal	.Lrem_4bit-.L005pic_point(%eax),%eax
-	movzbl	15(%edi),%ebx
-	call	_mmx_gmult_4bit_inner
-	movl	20(%esp),%edi
 	emms
 	movl	%ebx,12(%edi)
 	movl	%edx,4(%edi)
@@ -635,61 +311,926 @@ gcm_ghash_4bit_mmx:
 	pushl	%ebx
 	pushl	%esi
 	pushl	%edi
-	movl	20(%esp),%ebp
-	movl	24(%esp),%esi
-	movl	28(%esp),%edi
-	movl	32(%esp),%ecx
-	call	.L006pic_point
-.L006pic_point:
-	popl	%eax
-	leal	.Lrem_4bit-.L006pic_point(%eax),%eax
-	addl	%edi,%ecx
-	movl	%ecx,32(%esp)
-	subl	$20,%esp
-	movl	12(%ebp),%ebx
-	movl	4(%ebp),%edx
-	movl	8(%ebp),%ecx
-	movl	(%ebp),%ebp
-	jmp	.L007mmx_outer_loop
+	movl	20(%esp),%eax
+	movl	24(%esp),%ebx
+	movl	28(%esp),%ecx
+	movl	32(%esp),%edx
+	movl	%esp,%ebp
+	call	.L008pic_point
+.L008pic_point:
+	popl	%esi
+	leal	.Lrem_8bit-.L008pic_point(%esi),%esi
+	subl	$544,%esp
+	andl	$-64,%esp
+	subl	$16,%esp
+	addl	%ecx,%edx
+	movl	%eax,544(%esp)
+	movl	%edx,552(%esp)
+	movl	%ebp,556(%esp)
+	addl	$128,%ebx
+	leal	144(%esp),%edi
+	leal	400(%esp),%ebp
+	movl	-120(%ebx),%edx
+	movq	-120(%ebx),%mm0
+	movq	-128(%ebx),%mm3
+	shll	$4,%edx
+	movb	%dl,(%esp)
+	movl	-104(%ebx),%edx
+	movq	-104(%ebx),%mm2
+	movq	-112(%ebx),%mm5
+	movq	%mm0,-128(%edi)
+	psrlq	$4,%mm0
+	movq	%mm3,(%edi)
+	movq	%mm3,%mm7
+	psrlq	$4,%mm3
+	shll	$4,%edx
+	movb	%dl,1(%esp)
+	movl	-88(%ebx),%edx
+	movq	-88(%ebx),%mm1
+	psllq	$60,%mm7
+	movq	-96(%ebx),%mm4
+	por	%mm7,%mm0
+	movq	%mm2,-120(%edi)
+	psrlq	$4,%mm2
+	movq	%mm5,8(%edi)
+	movq	%mm5,%mm6
+	movq	%mm0,-128(%ebp)
+	psrlq	$4,%mm5
+	movq	%mm3,(%ebp)
+	shll	$4,%edx
+	movb	%dl,2(%esp)
+	movl	-72(%ebx),%edx
+	movq	-72(%ebx),%mm0
+	psllq	$60,%mm6
+	movq	-80(%ebx),%mm3
+	por	%mm6,%mm2
+	movq	%mm1,-112(%edi)
+	psrlq	$4,%mm1
+	movq	%mm4,16(%edi)
+	movq	%mm4,%mm7
+	movq	%mm2,-120(%ebp)
+	psrlq	$4,%mm4
+	movq	%mm5,8(%ebp)
+	shll	$4,%edx
+	movb	%dl,3(%esp)
+	movl	-56(%ebx),%edx
+	movq	-56(%ebx),%mm2
+	psllq	$60,%mm7
+	movq	-64(%ebx),%mm5
+	por	%mm7,%mm1
+	movq	%mm0,-104(%edi)
+	psrlq	$4,%mm0
+	movq	%mm3,24(%edi)
+	movq	%mm3,%mm6
+	movq	%mm1,-112(%ebp)
+	psrlq	$4,%mm3
+	movq	%mm4,16(%ebp)
+	shll	$4,%edx
+	movb	%dl,4(%esp)
+	movl	-40(%ebx),%edx
+	movq	-40(%ebx),%mm1
+	psllq	$60,%mm6
+	movq	-48(%ebx),%mm4
+	por	%mm6,%mm0
+	movq	%mm2,-96(%edi)
+	psrlq	$4,%mm2
+	movq	%mm5,32(%edi)
+	movq	%mm5,%mm7
+	movq	%mm0,-104(%ebp)
+	psrlq	$4,%mm5
+	movq	%mm3,24(%ebp)
+	shll	$4,%edx
+	movb	%dl,5(%esp)
+	movl	-24(%ebx),%edx
+	movq	-24(%ebx),%mm0
+	psllq	$60,%mm7
+	movq	-32(%ebx),%mm3
+	por	%mm7,%mm2
+	movq	%mm1,-88(%edi)
+	psrlq	$4,%mm1
+	movq	%mm4,40(%edi)
+	movq	%mm4,%mm6
+	movq	%mm2,-96(%ebp)
+	psrlq	$4,%mm4
+	movq	%mm5,32(%ebp)
+	shll	$4,%edx
+	movb	%dl,6(%esp)
+	movl	-8(%ebx),%edx
+	movq	-8(%ebx),%mm2
+	psllq	$60,%mm6
+	movq	-16(%ebx),%mm5
+	por	%mm6,%mm1
+	movq	%mm0,-80(%edi)
+	psrlq	$4,%mm0
+	movq	%mm3,48(%edi)
+	movq	%mm3,%mm7
+	movq	%mm1,-88(%ebp)
+	psrlq	$4,%mm3
+	movq	%mm4,40(%ebp)
+	shll	$4,%edx
+	movb	%dl,7(%esp)
+	movl	8(%ebx),%edx
+	movq	8(%ebx),%mm1
+	psllq	$60,%mm7
+	movq	(%ebx),%mm4
+	por	%mm7,%mm0
+	movq	%mm2,-72(%edi)
+	psrlq	$4,%mm2
+	movq	%mm5,56(%edi)
+	movq	%mm5,%mm6
+	movq	%mm0,-80(%ebp)
+	psrlq	$4,%mm5
+	movq	%mm3,48(%ebp)
+	shll	$4,%edx
+	movb	%dl,8(%esp)
+	movl	24(%ebx),%edx
+	movq	24(%ebx),%mm0
+	psllq	$60,%mm6
+	movq	16(%ebx),%mm3
+	por	%mm6,%mm2
+	movq	%mm1,-64(%edi)
+	psrlq	$4,%mm1
+	movq	%mm4,64(%edi)
+	movq	%mm4,%mm7
+	movq	%mm2,-72(%ebp)
+	psrlq	$4,%mm4
+	movq	%mm5,56(%ebp)
+	shll	$4,%edx
+	movb	%dl,9(%esp)
+	movl	40(%ebx),%edx
+	movq	40(%ebx),%mm2
+	psllq	$60,%mm7
+	movq	32(%ebx),%mm5
+	por	%mm7,%mm1
+	movq	%mm0,-56(%edi)
+	psrlq	$4,%mm0
+	movq	%mm3,72(%edi)
+	movq	%mm3,%mm6
+	movq	%mm1,-64(%ebp)
+	psrlq	$4,%mm3
+	movq	%mm4,64(%ebp)
+	shll	$4,%edx
+	movb	%dl,10(%esp)
+	movl	56(%ebx),%edx
+	movq	56(%ebx),%mm1
+	psllq	$60,%mm6
+	movq	48(%ebx),%mm4
+	por	%mm6,%mm0
+	movq	%mm2,-48(%edi)
+	psrlq	$4,%mm2
+	movq	%mm5,80(%edi)
+	movq	%mm5,%mm7
+	movq	%mm0,-56(%ebp)
+	psrlq	$4,%mm5
+	movq	%mm3,72(%ebp)
+	shll	$4,%edx
+	movb	%dl,11(%esp)
+	movl	72(%ebx),%edx
+	movq	72(%ebx),%mm0
+	psllq	$60,%mm7
+	movq	64(%ebx),%mm3
+	por	%mm7,%mm2
+	movq	%mm1,-40(%edi)
+	psrlq	$4,%mm1
+	movq	%mm4,88(%edi)
+	movq	%mm4,%mm6
+	movq	%mm2,-48(%ebp)
+	psrlq	$4,%mm4
+	movq	%mm5,80(%ebp)
+	shll	$4,%edx
+	movb	%dl,12(%esp)
+	movl	88(%ebx),%edx
+	movq	88(%ebx),%mm2
+	psllq	$60,%mm6
+	movq	80(%ebx),%mm5
+	por	%mm6,%mm1
+	movq	%mm0,-32(%edi)
+	psrlq	$4,%mm0
+	movq	%mm3,96(%edi)
+	movq	%mm3,%mm7
+	movq	%mm1,-40(%ebp)
+	psrlq	$4,%mm3
+	movq	%mm4,88(%ebp)
+	shll	$4,%edx
+	movb	%dl,13(%esp)
+	movl	104(%ebx),%edx
+	movq	104(%ebx),%mm1
+	psllq	$60,%mm7
+	movq	96(%ebx),%mm4
+	por	%mm7,%mm0
+	movq	%mm2,-24(%edi)
+	psrlq	$4,%mm2
+	movq	%mm5,104(%edi)
+	movq	%mm5,%mm6
+	movq	%mm0,-32(%ebp)
+	psrlq	$4,%mm5
+	movq	%mm3,96(%ebp)
+	shll	$4,%edx
+	movb	%dl,14(%esp)
+	movl	120(%ebx),%edx
+	movq	120(%ebx),%mm0
+	psllq	$60,%mm6
+	movq	112(%ebx),%mm3
+	por	%mm6,%mm2
+	movq	%mm1,-16(%edi)
+	psrlq	$4,%mm1
+	movq	%mm4,112(%edi)
+	movq	%mm4,%mm7
+	movq	%mm2,-24(%ebp)
+	psrlq	$4,%mm4
+	movq	%mm5,104(%ebp)
+	shll	$4,%edx
+	movb	%dl,15(%esp)
+	psllq	$60,%mm7
+	por	%mm7,%mm1
+	movq	%mm0,-8(%edi)
+	psrlq	$4,%mm0
+	movq	%mm3,120(%edi)
+	movq	%mm3,%mm6
+	movq	%mm1,-16(%ebp)
+	psrlq	$4,%mm3
+	movq	%mm4,112(%ebp)
+	psllq	$60,%mm6
+	por	%mm6,%mm0
+	movq	%mm0,-8(%ebp)
+	movq	%mm3,120(%ebp)
+	movq	(%eax),%mm6
+	movl	8(%eax),%ebx
+	movl	12(%eax),%edx
 .align	16
-.L007mmx_outer_loop:
-	xorl	12(%edi),%ebx
-	xorl	4(%edi),%edx
-	xorl	8(%edi),%ecx
-	xorl	(%edi),%ebp
-	movl	%edi,48(%esp)
-	movl	%ebx,12(%esp)
-	movl	%edx,4(%esp)
-	movl	%ecx,8(%esp)
-	movl	%ebp,(%esp)
-	movl	%esp,%edi
-	shrl	$24,%ebx
-	call	_mmx_gmult_4bit_inner
-	movl	48(%esp),%edi
-	leal	16(%edi),%edi
-	cmpl	52(%esp),%edi
-	jb	.L007mmx_outer_loop
-	movl	40(%esp),%edi
+.L009outer:
+	xorl	12(%ecx),%edx
+	xorl	8(%ecx),%ebx
+	pxor	(%ecx),%mm6
+	leal	16(%ecx),%ecx
+	movl	%ebx,536(%esp)
+	movq	%mm6,528(%esp)
+	movl	%ecx,548(%esp)
+	xorl	%eax,%eax
+	roll	$8,%edx
+	movb	%dl,%al
+	movl	%eax,%ebp
+	andb	$15,%al
+	shrl	$4,%ebp
+	pxor	%mm0,%mm0
+	roll	$8,%edx
+	pxor	%mm1,%mm1
+	pxor	%mm2,%mm2
+	movq	16(%esp,%eax,8),%mm7
+	movq	144(%esp,%eax,8),%mm6
+	movb	%dl,%al
+	movd	%mm7,%ebx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%edi
+	psrlq	$8,%mm6
+	pxor	272(%esp,%ebp,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	shrl	$4,%edi
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%ebp,8),%mm6
+	xorb	(%esp,%ebp,1),%bl
+	movb	%dl,%al
+	movd	%mm7,%ecx
+	movzbl	%bl,%ebx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%ebp
+	psrlq	$8,%mm6
+	pxor	272(%esp,%edi,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	shrl	$4,%ebp
+	pinsrw	$2,(%esi,%ebx,2),%mm2
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%edi,8),%mm6
+	xorb	(%esp,%edi,1),%cl
+	movb	%dl,%al
+	movl	536(%esp),%edx
+	movd	%mm7,%ebx
+	movzbl	%cl,%ecx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%edi
+	psrlq	$8,%mm6
+	pxor	272(%esp,%ebp,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm2,%mm6
+	shrl	$4,%edi
+	pinsrw	$2,(%esi,%ecx,2),%mm1
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%ebp,8),%mm6
+	xorb	(%esp,%ebp,1),%bl
+	movb	%dl,%al
+	movd	%mm7,%ecx
+	movzbl	%bl,%ebx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%ebp
+	psrlq	$8,%mm6
+	pxor	272(%esp,%edi,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm1,%mm6
+	shrl	$4,%ebp
+	pinsrw	$2,(%esi,%ebx,2),%mm0
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%edi,8),%mm6
+	xorb	(%esp,%edi,1),%cl
+	movb	%dl,%al
+	movd	%mm7,%ebx
+	movzbl	%cl,%ecx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%edi
+	psrlq	$8,%mm6
+	pxor	272(%esp,%ebp,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm0,%mm6
+	shrl	$4,%edi
+	pinsrw	$2,(%esi,%ecx,2),%mm2
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%ebp,8),%mm6
+	xorb	(%esp,%ebp,1),%bl
+	movb	%dl,%al
+	movd	%mm7,%ecx
+	movzbl	%bl,%ebx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%ebp
+	psrlq	$8,%mm6
+	pxor	272(%esp,%edi,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm2,%mm6
+	shrl	$4,%ebp
+	pinsrw	$2,(%esi,%ebx,2),%mm1
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%edi,8),%mm6
+	xorb	(%esp,%edi,1),%cl
+	movb	%dl,%al
+	movl	532(%esp),%edx
+	movd	%mm7,%ebx
+	movzbl	%cl,%ecx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%edi
+	psrlq	$8,%mm6
+	pxor	272(%esp,%ebp,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm1,%mm6
+	shrl	$4,%edi
+	pinsrw	$2,(%esi,%ecx,2),%mm0
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%ebp,8),%mm6
+	xorb	(%esp,%ebp,1),%bl
+	movb	%dl,%al
+	movd	%mm7,%ecx
+	movzbl	%bl,%ebx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%ebp
+	psrlq	$8,%mm6
+	pxor	272(%esp,%edi,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm0,%mm6
+	shrl	$4,%ebp
+	pinsrw	$2,(%esi,%ebx,2),%mm2
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%edi,8),%mm6
+	xorb	(%esp,%edi,1),%cl
+	movb	%dl,%al
+	movd	%mm7,%ebx
+	movzbl	%cl,%ecx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%edi
+	psrlq	$8,%mm6
+	pxor	272(%esp,%ebp,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm2,%mm6
+	shrl	$4,%edi
+	pinsrw	$2,(%esi,%ecx,2),%mm1
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%ebp,8),%mm6
+	xorb	(%esp,%ebp,1),%bl
+	movb	%dl,%al
+	movd	%mm7,%ecx
+	movzbl	%bl,%ebx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%ebp
+	psrlq	$8,%mm6
+	pxor	272(%esp,%edi,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm1,%mm6
+	shrl	$4,%ebp
+	pinsrw	$2,(%esi,%ebx,2),%mm0
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%edi,8),%mm6
+	xorb	(%esp,%edi,1),%cl
+	movb	%dl,%al
+	movl	528(%esp),%edx
+	movd	%mm7,%ebx
+	movzbl	%cl,%ecx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%edi
+	psrlq	$8,%mm6
+	pxor	272(%esp,%ebp,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm0,%mm6
+	shrl	$4,%edi
+	pinsrw	$2,(%esi,%ecx,2),%mm2
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%ebp,8),%mm6
+	xorb	(%esp,%ebp,1),%bl
+	movb	%dl,%al
+	movd	%mm7,%ecx
+	movzbl	%bl,%ebx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%ebp
+	psrlq	$8,%mm6
+	pxor	272(%esp,%edi,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm2,%mm6
+	shrl	$4,%ebp
+	pinsrw	$2,(%esi,%ebx,2),%mm1
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%edi,8),%mm6
+	xorb	(%esp,%edi,1),%cl
+	movb	%dl,%al
+	movd	%mm7,%ebx
+	movzbl	%cl,%ecx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%edi
+	psrlq	$8,%mm6
+	pxor	272(%esp,%ebp,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm1,%mm6
+	shrl	$4,%edi
+	pinsrw	$2,(%esi,%ecx,2),%mm0
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%ebp,8),%mm6
+	xorb	(%esp,%ebp,1),%bl
+	movb	%dl,%al
+	movd	%mm7,%ecx
+	movzbl	%bl,%ebx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%ebp
+	psrlq	$8,%mm6
+	pxor	272(%esp,%edi,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm0,%mm6
+	shrl	$4,%ebp
+	pinsrw	$2,(%esi,%ebx,2),%mm2
+	pxor	16(%esp,%eax,8),%mm7
+	roll	$8,%edx
+	pxor	144(%esp,%eax,8),%mm6
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%edi,8),%mm6
+	xorb	(%esp,%edi,1),%cl
+	movb	%dl,%al
+	movl	524(%esp),%edx
+	movd	%mm7,%ebx
+	movzbl	%cl,%ecx
+	psrlq	$8,%mm7
+	movq	%mm6,%mm3
+	movl	%eax,%edi
+	psrlq	$8,%mm6
+	pxor	272(%esp,%ebp,8),%mm7
+	andb	$15,%al
+	psllq	$56,%mm3
+	pxor	%mm2,%mm6
+	shrl	$4,%edi
+	pinsrw	$2,(%esi,%ecx,2),%mm1
+	pxor	16(%esp,%eax,8),%mm7
+	pxor	144(%esp,%eax,8),%mm6
+	xorb	(%esp,%ebp,1),%bl
+	pxor	%mm3,%mm7
+	pxor	400(%esp,%ebp,8),%mm6
+	movzbl	%bl,%ebx
+	pxor	%mm2,%mm2
+	psllq	$4,%mm1
+	movd	%mm7,%ecx
+	psrlq	$4,%mm7
+	movq	%mm6,%mm3
+	psrlq	$4,%mm6
+	shll	$4,%ecx
+	pxor	16(%esp,%edi,8),%mm7
+	psllq	$60,%mm3
+	movzbl	%cl,%ecx
+	pxor	%mm3,%mm7
+	pxor	144(%esp,%edi,8),%mm6
+	pinsrw	$2,(%esi,%ebx,2),%mm0
+	pxor	%mm1,%mm6
+	movd	%mm7,%edx
+	pinsrw	$3,(%esi,%ecx,2),%mm2
+	psllq	$12,%mm0
+	pxor	%mm0,%mm6
+	psrlq	$32,%mm7
+	pxor	%mm2,%mm6
+	movl	548(%esp),%ecx
+	movd	%mm7,%ebx
+	movq	%mm6,%mm3
+	psllw	$8,%mm6
+	psrlw	$8,%mm3
+	por	%mm3,%mm6
+	bswap	%edx
+	pshufw	$27,%mm6,%mm6
+	bswap	%ebx
+	cmpl	552(%esp),%ecx
+	jne	.L009outer
+	movl	544(%esp),%eax
+	movl	%edx,12(%eax)
+	movl	%ebx,8(%eax)
+	movq	%mm6,(%eax)
+	movl	556(%esp),%esp
 	emms
-	movl	%ebx,12(%edi)
-	movl	%edx,4(%edi)
-	movl	%ecx,8(%edi)
-	movl	%ebp,(%edi)
-	addl	$20,%esp
 	popl	%edi
 	popl	%esi
 	popl	%ebx
 	popl	%ebp
 	ret
 .size	gcm_ghash_4bit_mmx,.-.L_gcm_ghash_4bit_mmx_begin
+.globl	gcm_init_clmul
+.type	gcm_init_clmul,@function
+.align	16
+gcm_init_clmul:
+.L_gcm_init_clmul_begin:
+	movl	4(%esp),%edx
+	movl	8(%esp),%eax
+	call	.L010pic
+.L010pic:
+	popl	%ecx
+	leal	.Lbswap-.L010pic(%ecx),%ecx
+	movdqu	(%eax),%xmm2
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$255,%xmm2,%xmm4
+	movdqa	%xmm2,%xmm3
+	psllq	$1,%xmm2
+	pxor	%xmm5,%xmm5
+	psrlq	$63,%xmm3
+	pcmpgtd	%xmm4,%xmm5
+	pslldq	$8,%xmm3
+	por	%xmm3,%xmm2
+	pand	16(%ecx),%xmm5
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$5,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm4
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm4
+	pxor	%xmm3,%xmm0
+	pxor	%xmm4,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	movdqu	%xmm2,(%edx)
+	movdqu	%xmm0,16(%edx)
+	ret
+.size	gcm_init_clmul,.-.L_gcm_init_clmul_begin
+.globl	gcm_gmult_clmul
+.type	gcm_gmult_clmul,@function
+.align	16
+gcm_gmult_clmul:
+.L_gcm_gmult_clmul_begin:
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	call	.L011pic
+.L011pic:
+	popl	%ecx
+	leal	.Lbswap-.L011pic(%ecx),%ecx
+	movdqu	(%eax),%xmm0
+	movdqa	(%ecx),%xmm5
+	movups	(%edx),%xmm2
+.byte	102,15,56,0,197
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$5,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm4
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm4
+	pxor	%xmm3,%xmm0
+	pxor	%xmm4,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,197
+	movdqu	%xmm0,(%eax)
+	ret
+.size	gcm_gmult_clmul,.-.L_gcm_gmult_clmul_begin
+.globl	gcm_ghash_clmul
+.type	gcm_ghash_clmul,@function
+.align	16
+gcm_ghash_clmul:
+.L_gcm_ghash_clmul_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%eax
+	movl	24(%esp),%edx
+	movl	28(%esp),%esi
+	movl	32(%esp),%ebx
+	call	.L012pic
+.L012pic:
+	popl	%ecx
+	leal	.Lbswap-.L012pic(%ecx),%ecx
+	movdqu	(%eax),%xmm0
+	movdqa	(%ecx),%xmm5
+	movdqu	(%edx),%xmm2
+.byte	102,15,56,0,197
+	subl	$16,%ebx
+	jz	.L013odd_tail
+	movdqu	(%esi),%xmm3
+	movdqu	16(%esi),%xmm6
+.byte	102,15,56,0,221
+.byte	102,15,56,0,245
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm6,%xmm7
+	pshufd	$78,%xmm6,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm6,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,242,0
+.byte	102,15,58,68,250,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm6,%xmm3
+	xorps	%xmm7,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm7
+	pxor	%xmm4,%xmm6
+	movups	16(%edx),%xmm2
+	leal	32(%esi),%esi
+	subl	$32,%ebx
+	jbe	.L014even_tail
+.L015mod_loop:
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqu	(%esi),%xmm3
+	movups	(%edx),%xmm2
+	pxor	%xmm6,%xmm0
+	pxor	%xmm7,%xmm1
+	movdqu	16(%esi),%xmm6
+.byte	102,15,56,0,221
+.byte	102,15,56,0,245
+	movdqa	%xmm6,%xmm5
+	movdqa	%xmm6,%xmm7
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$5,%xmm0
+	pxor	%xmm3,%xmm0
+.byte	102,15,58,68,242,0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm4
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm4
+	pxor	%xmm3,%xmm0
+	pshufd	$78,%xmm5,%xmm3
+	pxor	%xmm4,%xmm1
+	pxor	%xmm5,%xmm3
+	pshufd	$78,%xmm2,%xmm5
+	pxor	%xmm2,%xmm5
+.byte	102,15,58,68,250,17
+	movdqa	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+.byte	102,15,58,68,221,0
+	movups	16(%edx),%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm7,%xmm3
+	movdqa	%xmm3,%xmm5
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm5
+	pxor	%xmm3,%xmm7
+	pxor	%xmm5,%xmm6
+	movdqa	(%ecx),%xmm5
+	leal	32(%esi),%esi
+	subl	$32,%ebx
+	ja	.L015mod_loop
+.L014even_tail:
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	pxor	%xmm6,%xmm0
+	pxor	%xmm7,%xmm1
+	movdqa	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$5,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm4
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm4
+	pxor	%xmm3,%xmm0
+	pxor	%xmm4,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	testl	%ebx,%ebx
+	jnz	.L016done
+	movups	(%edx),%xmm2
+.L013odd_tail:
+	movdqu	(%esi),%xmm3
+.byte	102,15,56,0,221
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$5,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm4
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm4
+	pxor	%xmm3,%xmm0
+	pxor	%xmm4,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+.L016done:
+.byte	102,15,56,0,197
+	movdqu	%xmm0,(%eax)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	gcm_ghash_clmul,.-.L_gcm_ghash_clmul_begin
+.align	64
+.Lbswap:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
 .align	64
 .Lrem_4bit:
-.long	0,0,0,29491200,0,58982400,0,38141952
-.long	0,117964800,0,113901568,0,76283904,0,88997888
-.long	0,235929600,0,265420800,0,227803136,0,206962688
-.long	0,152567808,0,148504576,0,177995776,0,190709760
+.long	0,0,0,471859200,0,943718400,0,610271232
+.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
 .align	64
-.L008rem_8bit:
+.Lrem_8bit:
 .value	0,450,900,582,1800,1738,1164,1358
 .value	3600,4050,3476,3158,2328,2266,2716,2910
 .value	7200,7650,8100,7782,6952,6890,6316,6510
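
The regenerated ghash-x86.S above folds the former _mmx_gmult_4bit_inner
helper into gcm_gmult_4bit_mmx, rewrites gcm_ghash_4bit_mmx, and adds the
PCLMULQDQ-based gcm_init_clmul, gcm_gmult_clmul and gcm_ghash_clmul entry
points.  A hedged sketch of the kind of run-time selection libcrypto's
generic GCM code performs among them (the real logic lives in
crypto/modes/gcm128.c and may differ in detail; pick_ghash() and the enum are
hypothetical, only the routine names and CPUID bit meanings are taken from
the code above):

    /* CPUID(1) feature flags as libcrypto caches them: EDX in
     * OPENSSL_ia32cap_P[0] (bit 24 = FXSR, bit 25 = SSE),
     * ECX in OPENSSL_ia32cap_P[1] (bit 1 = PCLMULQDQ). */
    extern unsigned int OPENSSL_ia32cap_P[2];

    enum ghash_impl { GHASH_4BIT_X86, GHASH_4BIT_MMX, GHASH_CLMUL };

    static enum ghash_impl pick_ghash(void)
    {
        if ((OPENSSL_ia32cap_P[0] & (1u << 24)) &&  /* FXSR */
            (OPENSSL_ia32cap_P[1] & (1u << 1)))     /* PCLMULQDQ */
            return GHASH_CLMUL;     /* gcm_init_clmul / gcm_gmult_clmul / gcm_ghash_clmul */
        if (OPENSSL_ia32cap_P[0] & (1u << 25))      /* SSE: the MMX path uses pshufw/pinsrw */
            return GHASH_4BIT_MMX;  /* gcm_gmult_4bit_mmx / gcm_ghash_4bit_mmx */
        return GHASH_4BIT_X86;      /* gcm_gmult_4bit_x86 / gcm_ghash_4bit_x86 */
    }
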
Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha512-586.S
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha512-586.S:1.2 src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha512-586.S:1.3
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha512-586.S:1.2	Fri Jul 27 19:34:15 2012
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha512-586.S	Sat May 16 17:32:54 2015
@@ -25,6 +25,278 @@ sha512_block_data_order:
 	movl	%edi,4(%esp)
 	movl	%eax,8(%esp)
 	movl	%ebx,12(%esp)
+	leal	_GLOBAL_OFFSET_TABLE_+[.-.L001K512](%ebp),%edx
+	movl	OPENSSL_ia32cap_P@GOT(%edx),%edx
+	btl	$26,(%edx)
+	jnc	.L002loop_x86
+	movq	(%esi),%mm0
+	movq	8(%esi),%mm1
+	movq	16(%esi),%mm2
+	movq	24(%esi),%mm3
+	movq	32(%esi),%mm4
+	movq	40(%esi),%mm5
+	movq	48(%esi),%mm6
+	movq	56(%esi),%mm7
+	subl	$80,%esp
+.align	16
+.L003loop_sse2:
+	movq	%mm1,8(%esp)
+	movq	%mm2,16(%esp)
+	movq	%mm3,24(%esp)
+	movq	%mm5,40(%esp)
+	movq	%mm6,48(%esp)
+	movq	%mm7,56(%esp)
+	movl	(%edi),%ecx
+	movl	4(%edi),%edx
+	addl	$8,%edi
+	bswap	%ecx
+	bswap	%edx
+	movl	%ecx,76(%esp)
+	movl	%edx,72(%esp)
+.align	16
+.L00400_14_sse2:
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	addl	$8,%edi
+	bswap	%eax
+	bswap	%ebx
+	movl	%eax,68(%esp)
+	movl	%ebx,64(%esp)
+	movq	40(%esp),%mm5
+	movq	48(%esp),%mm6
+	movq	56(%esp),%mm7
+	movq	%mm4,%mm1
+	movq	%mm4,%mm2
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	psllq	$23,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm2,%mm3
+	psllq	$23,%mm2
+	pxor	%mm1,%mm3
+	psrlq	$23,%mm1
+	pxor	%mm2,%mm3
+	psllq	$4,%mm2
+	pxor	%mm1,%mm3
+	paddq	(%ebp),%mm7
+	pxor	%mm2,%mm3
+	pxor	%mm6,%mm5
+	movq	8(%esp),%mm1
+	pand	%mm4,%mm5
+	movq	16(%esp),%mm2
+	pxor	%mm6,%mm5
+	movq	24(%esp),%mm4
+	paddq	%mm5,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	movq	%mm0,%mm6
+	paddq	72(%esp),%mm3
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	psllq	$25,%mm6
+	movq	%mm5,%mm7
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	subl	$8,%esp
+	pxor	%mm6,%mm7
+	movq	%mm0,%mm5
+	por	%mm2,%mm0
+	pand	%mm2,%mm5
+	pand	%mm1,%mm0
+	por	%mm0,%mm5
+	paddq	%mm5,%mm7
+	movq	%mm3,%mm0
+	movb	(%ebp),%dl
+	paddq	%mm7,%mm0
+	addl	$8,%ebp
+	cmpb	$53,%dl
+	jne	.L00400_14_sse2
+	movq	40(%esp),%mm5
+	movq	48(%esp),%mm6
+	movq	56(%esp),%mm7
+	movq	%mm4,%mm1
+	movq	%mm4,%mm2
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	psllq	$23,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm2,%mm3
+	psllq	$23,%mm2
+	pxor	%mm1,%mm3
+	psrlq	$23,%mm1
+	pxor	%mm2,%mm3
+	psllq	$4,%mm2
+	pxor	%mm1,%mm3
+	paddq	(%ebp),%mm7
+	pxor	%mm2,%mm3
+	pxor	%mm6,%mm5
+	movq	8(%esp),%mm1
+	pand	%mm4,%mm5
+	movq	16(%esp),%mm2
+	pxor	%mm6,%mm5
+	movq	24(%esp),%mm4
+	paddq	%mm5,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	movq	%mm0,%mm6
+	paddq	72(%esp),%mm3
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	psllq	$25,%mm6
+	movq	%mm5,%mm7
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	subl	$8,%esp
+	pxor	%mm6,%mm7
+	movq	%mm0,%mm5
+	por	%mm2,%mm0
+	movq	88(%esp),%mm6
+	pand	%mm2,%mm5
+	pand	%mm1,%mm0
+	movq	192(%esp),%mm2
+	por	%mm0,%mm5
+	paddq	%mm5,%mm7
+	movq	%mm3,%mm0
+	movb	(%ebp),%dl
+	paddq	%mm7,%mm0
+	addl	$8,%ebp
+.align	16
+.L00516_79_sse2:
+	movq	%mm2,%mm1
+	psrlq	$1,%mm2
+	movq	%mm6,%mm7
+	psrlq	$6,%mm6
+	movq	%mm2,%mm3
+	psrlq	$6,%mm2
+	movq	%mm6,%mm5
+	psrlq	$13,%mm6
+	pxor	%mm2,%mm3
+	psrlq	$1,%mm2
+	pxor	%mm6,%mm5
+	psrlq	$42,%mm6
+	pxor	%mm2,%mm3
+	movq	200(%esp),%mm2
+	psllq	$56,%mm1
+	pxor	%mm6,%mm5
+	psllq	$3,%mm7
+	pxor	%mm1,%mm3
+	paddq	128(%esp),%mm2
+	psllq	$7,%mm1
+	pxor	%mm7,%mm5
+	psllq	$42,%mm7
+	pxor	%mm1,%mm3
+	pxor	%mm7,%mm5
+	paddq	%mm5,%mm3
+	paddq	%mm2,%mm3
+	movq	%mm3,72(%esp)
+	movq	40(%esp),%mm5
+	movq	48(%esp),%mm6
+	movq	56(%esp),%mm7
+	movq	%mm4,%mm1
+	movq	%mm4,%mm2
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	psllq	$23,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm2,%mm3
+	psllq	$23,%mm2
+	pxor	%mm1,%mm3
+	psrlq	$23,%mm1
+	pxor	%mm2,%mm3
+	psllq	$4,%mm2
+	pxor	%mm1,%mm3
+	paddq	(%ebp),%mm7
+	pxor	%mm2,%mm3
+	pxor	%mm6,%mm5
+	movq	8(%esp),%mm1
+	pand	%mm4,%mm5
+	movq	16(%esp),%mm2
+	pxor	%mm6,%mm5
+	movq	24(%esp),%mm4
+	paddq	%mm5,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	movq	%mm0,%mm6
+	paddq	72(%esp),%mm3
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	psllq	$25,%mm6
+	movq	%mm5,%mm7
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	subl	$8,%esp
+	pxor	%mm6,%mm7
+	movq	%mm0,%mm5
+	por	%mm2,%mm0
+	movq	88(%esp),%mm6
+	pand	%mm2,%mm5
+	pand	%mm1,%mm0
+	movq	192(%esp),%mm2
+	por	%mm0,%mm5
+	paddq	%mm5,%mm7
+	movq	%mm3,%mm0
+	movb	(%ebp),%dl
+	paddq	%mm7,%mm0
+	addl	$8,%ebp
+	cmpb	$23,%dl
+	jne	.L00516_79_sse2
+	movq	8(%esp),%mm1
+	movq	16(%esp),%mm2
+	movq	24(%esp),%mm3
+	movq	40(%esp),%mm5
+	movq	48(%esp),%mm6
+	movq	56(%esp),%mm7
+	paddq	(%esi),%mm0
+	paddq	8(%esi),%mm1
+	paddq	16(%esi),%mm2
+	paddq	24(%esi),%mm3
+	paddq	32(%esi),%mm4
+	paddq	40(%esi),%mm5
+	paddq	48(%esi),%mm6
+	paddq	56(%esi),%mm7
+	movq	%mm0,(%esi)
+	movq	%mm1,8(%esi)
+	movq	%mm2,16(%esi)
+	movq	%mm3,24(%esi)
+	movq	%mm4,32(%esi)
+	movq	%mm5,40(%esi)
+	movq	%mm6,48(%esi)
+	movq	%mm7,56(%esi)
+	addl	$640,%esp
+	subl	$640,%ebp
+	cmpl	88(%esp),%edi
+	jb	.L003loop_sse2
+	emms
+	movl	92(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
 .align	16
 .L002loop_x86:
 	movl	(%edi),%eax
@@ -130,7 +402,7 @@ sha512_block_data_order:
 	movl	$16,%ecx
 .long	2784229001
 .align	16
-.L00300_15_x86:
+.L00600_15_x86:
 	movl	40(%esp),%ecx
 	movl	44(%esp),%edx
 	movl	%ecx,%esi
@@ -237,9 +509,9 @@ sha512_block_data_order:
 	subl	$8,%esp
 	leal	8(%ebp),%ebp
 	cmpb	$148,%dl
-	jne	.L00300_15_x86
+	jne	.L00600_15_x86
 .align	16
-.L00416_79_x86:
+.L00716_79_x86:
 	movl	312(%esp),%ecx
 	movl	316(%esp),%edx
 	movl	%ecx,%esi
@@ -412,7 +684,7 @@ sha512_block_data_order:
 	subl	$8,%esp
 	leal	8(%ebp),%ebp
 	cmpb	$23,%dl
-	jne	.L00416_79_x86
+	jne	.L00716_79_x86
 	movl	840(%esp),%esi
 	movl	844(%esp),%edi
 	movl	(%esi),%eax
@@ -561,3 +833,4 @@ sha512_block_data_order:
 .byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
 .byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
 .byte	62,0
+.comm	OPENSSL_ia32cap_P,8,4

Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha1-586.S
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha1-586.S:1.3 src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha1-586.S:1.4
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha1-586.S:1.3	Fri Jul 27 19:34:15 2012
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha1-586.S	Sat May 16 17:32:54 2015
@@ -9,6 +9,21 @@ sha1_block_data_order:
 	pushl	%ebx
 	pushl	%esi
 	pushl	%edi
+	call	.L000pic_point
+.L000pic_point:
+	popl	%ebp
+	leal	_GLOBAL_OFFSET_TABLE_+[.-.L000pic_point](%ebp),%esi
+	movl	OPENSSL_ia32cap_P@GOT(%esi),%esi
+	leal	.LK_XX_XX-.L000pic_point(%ebp),%ebp
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	testl	$512,%edx
+	jz	.L001x86
+	testl	$16777216,%eax
+	jz	.L001x86
+	jmp	.Lssse3_shortcut
+.align	16
+.L001x86:
 	movl	20(%esp),%ebp
 	movl	24(%esp),%esi
 	movl	28(%esp),%eax
@@ -17,9 +32,9 @@ sha1_block_data_order:
 	addl	%esi,%eax
 	movl	%eax,104(%esp)
 	movl	16(%ebp),%edi
-	jmp	.L000loop
+	jmp	.L002loop
 .align	16
-.L000loop:
+.L002loop:
 	movl	(%esi),%eax
 	movl	4(%esi),%ebx
 	movl	8(%esi),%ecx
@@ -1366,7 +1381,7 @@ sha1_block_data_order:
 	movl	%ebx,12(%ebp)
 	movl	%edx,%esi
 	movl	%ecx,16(%ebp)
-	jb	.L000loop
+	jb	.L002loop
 	addl	$76,%esp
 	popl	%edi
 	popl	%esi
@@ -1374,7 +1389,1251 @@ sha1_block_data_order:
 	popl	%ebp
 	ret
 .size	sha1_block_data_order,.-.L_sha1_block_data_order_begin
+.type	_sha1_block_data_order_ssse3,@function
+.align	16
+_sha1_block_data_order_ssse3:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	call	.L003pic_point
+.L003pic_point:
+	popl	%ebp
+	leal	.LK_XX_XX-.L003pic_point(%ebp),%ebp
+.Lssse3_shortcut:
+	movdqa	(%ebp),%xmm7
+	movdqa	16(%ebp),%xmm0
+	movdqa	32(%ebp),%xmm1
+	movdqa	48(%ebp),%xmm2
+	movdqa	64(%ebp),%xmm6
+	movl	20(%esp),%edi
+	movl	24(%esp),%ebp
+	movl	28(%esp),%edx
+	movl	%esp,%esi
+	subl	$208,%esp
+	andl	$-64,%esp
+	movdqa	%xmm0,112(%esp)
+	movdqa	%xmm1,128(%esp)
+	movdqa	%xmm2,144(%esp)
+	shll	$6,%edx
+	movdqa	%xmm7,160(%esp)
+	addl	%ebp,%edx
+	movdqa	%xmm6,176(%esp)
+	addl	$64,%ebp
+	movl	%edi,192(%esp)
+	movl	%ebp,196(%esp)
+	movl	%edx,200(%esp)
+	movl	%esi,204(%esp)
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+	movl	16(%edi),%edi
+	movl	%ebx,%esi
+	movdqu	-64(%ebp),%xmm0
+	movdqu	-48(%ebp),%xmm1
+	movdqu	-32(%ebp),%xmm2
+	movdqu	-16(%ebp),%xmm3
+.byte	102,15,56,0,198
+.byte	102,15,56,0,206
+.byte	102,15,56,0,214
+	movdqa	%xmm7,96(%esp)
+.byte	102,15,56,0,222
+	paddd	%xmm7,%xmm0
+	paddd	%xmm7,%xmm1
+	paddd	%xmm7,%xmm2
+	movdqa	%xmm0,(%esp)
+	psubd	%xmm7,%xmm0
+	movdqa	%xmm1,16(%esp)
+	psubd	%xmm7,%xmm1
+	movdqa	%xmm2,32(%esp)
+	psubd	%xmm7,%xmm2
+	movdqa	%xmm1,%xmm4
+	jmp	.L004loop
+.align	16
+.L004loop:
+	addl	(%esp),%edi
+	xorl	%edx,%ecx
+.byte	102,15,58,15,224,8
+	movdqa	%xmm3,%xmm6
+	movl	%eax,%ebp
+	roll	$5,%eax
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm0,64(%esp)
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	psrldq	$4,%xmm6
+	xorl	%edx,%esi
+	addl	%eax,%edi
+	pxor	%xmm0,%xmm4
+	rorl	$2,%ebx
+	addl	%esi,%edi
+	pxor	%xmm2,%xmm6
+	addl	4(%esp),%edx
+	xorl	%ecx,%ebx
+	movl	%edi,%esi
+	roll	$5,%edi
+	pxor	%xmm6,%xmm4
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	movdqa	%xmm7,48(%esp)
+	xorl	%ecx,%ebp
+	addl	%edi,%edx
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm6
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	8(%esp),%ecx
+	xorl	%ebx,%eax
+	pslldq	$12,%xmm0
+	paddd	%xmm4,%xmm4
+	movl	%edx,%ebp
+	roll	$5,%edx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	psrld	$31,%xmm6
+	xorl	%ebx,%esi
+	addl	%edx,%ecx
+	movdqa	%xmm0,%xmm7
+	rorl	$7,%edi
+	addl	%esi,%ecx
+	psrld	$30,%xmm0
+	por	%xmm6,%xmm4
+	addl	12(%esp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	pslld	$2,%xmm7
+	pxor	%xmm0,%xmm4
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	movdqa	96(%esp),%xmm0
+	xorl	%eax,%ebp
+	addl	%ecx,%ebx
+	pxor	%xmm7,%xmm4
+	movdqa	%xmm2,%xmm5
+	rorl	$7,%edx
+	addl	%ebp,%ebx
+	addl	16(%esp),%eax
+	xorl	%edi,%edx
+.byte	102,15,58,15,233,8
+	movdqa	%xmm4,%xmm7
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	paddd	%xmm4,%xmm0
+	movdqa	%xmm1,80(%esp)
+	andl	%edx,%esi
+	xorl	%edi,%edx
+	psrldq	$4,%xmm7
+	xorl	%edi,%esi
+	addl	%ebx,%eax
+	pxor	%xmm1,%xmm5
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	pxor	%xmm3,%xmm7
+	addl	20(%esp),%edi
+	xorl	%edx,%ecx
+	movl	%eax,%esi
+	roll	$5,%eax
+	pxor	%xmm7,%xmm5
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	movdqa	%xmm0,(%esp)
+	xorl	%edx,%ebp
+	addl	%eax,%edi
+	movdqa	%xmm5,%xmm1
+	movdqa	%xmm5,%xmm7
+	rorl	$7,%ebx
+	addl	%ebp,%edi
+	addl	24(%esp),%edx
+	xorl	%ecx,%ebx
+	pslldq	$12,%xmm1
+	paddd	%xmm5,%xmm5
+	movl	%edi,%ebp
+	roll	$5,%edi
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	psrld	$31,%xmm7
+	xorl	%ecx,%esi
+	addl	%edi,%edx
+	movdqa	%xmm1,%xmm0
+	rorl	$7,%eax
+	addl	%esi,%edx
+	psrld	$30,%xmm1
+	por	%xmm7,%xmm5
+	addl	28(%esp),%ecx
+	xorl	%ebx,%eax
+	movl	%edx,%esi
+	roll	$5,%edx
+	pslld	$2,%xmm0
+	pxor	%xmm1,%xmm5
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	movdqa	112(%esp),%xmm1
+	xorl	%ebx,%ebp
+	addl	%edx,%ecx
+	pxor	%xmm0,%xmm5
+	movdqa	%xmm3,%xmm6
+	rorl	$7,%edi
+	addl	%ebp,%ecx
+	addl	32(%esp),%ebx
+	xorl	%eax,%edi
+.byte	102,15,58,15,242,8
+	movdqa	%xmm5,%xmm0
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	paddd	%xmm5,%xmm1
+	movdqa	%xmm2,96(%esp)
+	andl	%edi,%esi
+	xorl	%eax,%edi
+	psrldq	$4,%xmm0
+	xorl	%eax,%esi
+	addl	%ecx,%ebx
+	pxor	%xmm2,%xmm6
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	pxor	%xmm4,%xmm0
+	addl	36(%esp),%eax
+	xorl	%edi,%edx
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	pxor	%xmm0,%xmm6
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	movdqa	%xmm1,16(%esp)
+	xorl	%edi,%ebp
+	addl	%ebx,%eax
+	movdqa	%xmm6,%xmm2
+	movdqa	%xmm6,%xmm0
+	rorl	$7,%ecx
+	addl	%ebp,%eax
+	addl	40(%esp),%edi
+	xorl	%edx,%ecx
+	pslldq	$12,%xmm2
+	paddd	%xmm6,%xmm6
+	movl	%eax,%ebp
+	roll	$5,%eax
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	psrld	$31,%xmm0
+	xorl	%edx,%esi
+	addl	%eax,%edi
+	movdqa	%xmm2,%xmm1
+	rorl	$7,%ebx
+	addl	%esi,%edi
+	psrld	$30,%xmm2
+	por	%xmm0,%xmm6
+	addl	44(%esp),%edx
+	xorl	%ecx,%ebx
+	movdqa	64(%esp),%xmm0
+	movl	%edi,%esi
+	roll	$5,%edi
+	pslld	$2,%xmm1
+	pxor	%xmm2,%xmm6
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	movdqa	112(%esp),%xmm2
+	xorl	%ecx,%ebp
+	addl	%edi,%edx
+	pxor	%xmm1,%xmm6
+	movdqa	%xmm4,%xmm7
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	48(%esp),%ecx
+	xorl	%ebx,%eax
+.byte	102,15,58,15,251,8
+	movdqa	%xmm6,%xmm1
+	movl	%edx,%ebp
+	roll	$5,%edx
+	paddd	%xmm6,%xmm2
+	movdqa	%xmm3,64(%esp)
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	psrldq	$4,%xmm1
+	xorl	%ebx,%esi
+	addl	%edx,%ecx
+	pxor	%xmm3,%xmm7
+	rorl	$7,%edi
+	addl	%esi,%ecx
+	pxor	%xmm5,%xmm1
+	addl	52(%esp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	pxor	%xmm1,%xmm7
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	movdqa	%xmm2,32(%esp)
+	xorl	%eax,%ebp
+	addl	%ecx,%ebx
+	movdqa	%xmm7,%xmm3
+	movdqa	%xmm7,%xmm1
+	rorl	$7,%edx
+	addl	%ebp,%ebx
+	addl	56(%esp),%eax
+	xorl	%edi,%edx
+	pslldq	$12,%xmm3
+	paddd	%xmm7,%xmm7
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	andl	%edx,%esi
+	xorl	%edi,%edx
+	psrld	$31,%xmm1
+	xorl	%edi,%esi
+	addl	%ebx,%eax
+	movdqa	%xmm3,%xmm2
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	psrld	$30,%xmm3
+	por	%xmm1,%xmm7
+	addl	60(%esp),%edi
+	xorl	%edx,%ecx
+	movdqa	80(%esp),%xmm1
+	movl	%eax,%esi
+	roll	$5,%eax
+	pslld	$2,%xmm2
+	pxor	%xmm3,%xmm7
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	movdqa	112(%esp),%xmm3
+	xorl	%edx,%ebp
+	addl	%eax,%edi
+	pxor	%xmm2,%xmm7
+	rorl	$7,%ebx
+	addl	%ebp,%edi
+	movdqa	%xmm7,%xmm2
+	addl	(%esp),%edx
+	pxor	%xmm4,%xmm0
+.byte	102,15,58,15,214,8
+	xorl	%ecx,%ebx
+	movl	%edi,%ebp
+	roll	$5,%edi
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm4,80(%esp)
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	movdqa	%xmm3,%xmm4
+	paddd	%xmm7,%xmm3
+	xorl	%ecx,%esi
+	addl	%edi,%edx
+	pxor	%xmm2,%xmm0
+	rorl	$7,%eax
+	addl	%esi,%edx
+	addl	4(%esp),%ecx
+	xorl	%ebx,%eax
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm3,48(%esp)
+	movl	%edx,%esi
+	roll	$5,%edx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	pslld	$2,%xmm0
+	xorl	%ebx,%ebp
+	addl	%edx,%ecx
+	psrld	$30,%xmm2
+	rorl	$7,%edi
+	addl	%ebp,%ecx
+	addl	8(%esp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	por	%xmm2,%xmm0
+	andl	%edi,%esi
+	xorl	%eax,%edi
+	movdqa	96(%esp),%xmm2
+	xorl	%eax,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	addl	12(%esp),%eax
+	movdqa	%xmm0,%xmm3
+	xorl	%edi,%edx
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	xorl	%edi,%ebp
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%ebp,%eax
+	addl	16(%esp),%edi
+	pxor	%xmm5,%xmm1
+.byte	102,15,58,15,223,8
+	xorl	%edx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm5,96(%esp)
+	xorl	%ecx,%esi
+	addl	%eax,%edi
+	movdqa	%xmm4,%xmm5
+	paddd	%xmm0,%xmm4
+	rorl	$7,%ebx
+	addl	%esi,%edi
+	pxor	%xmm3,%xmm1
+	addl	20(%esp),%edx
+	xorl	%ecx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	movdqa	%xmm1,%xmm3
+	movdqa	%xmm4,(%esp)
+	xorl	%ebx,%ebp
+	addl	%edi,%edx
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	pslld	$2,%xmm1
+	addl	24(%esp),%ecx
+	xorl	%ebx,%esi
+	psrld	$30,%xmm3
+	movl	%edx,%ebp
+	roll	$5,%edx
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	addl	%esi,%ecx
+	por	%xmm3,%xmm1
+	addl	28(%esp),%ebx
+	xorl	%eax,%ebp
+	movdqa	64(%esp),%xmm3
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	xorl	%edi,%ebp
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	movdqa	%xmm1,%xmm4
+	addl	%ebp,%ebx
+	addl	32(%esp),%eax
+	pxor	%xmm6,%xmm2
+.byte	102,15,58,15,224,8
+	xorl	%edi,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	pxor	%xmm3,%xmm2
+	movdqa	%xmm6,64(%esp)
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	movdqa	128(%esp),%xmm6
+	paddd	%xmm1,%xmm5
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	pxor	%xmm4,%xmm2
+	addl	36(%esp),%edi
+	xorl	%edx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	movdqa	%xmm2,%xmm4
+	movdqa	%xmm5,16(%esp)
+	xorl	%ecx,%ebp
+	addl	%eax,%edi
+	rorl	$7,%ebx
+	addl	%ebp,%edi
+	pslld	$2,%xmm2
+	addl	40(%esp),%edx
+	xorl	%ecx,%esi
+	psrld	$30,%xmm4
+	movl	%edi,%ebp
+	roll	$5,%edi
+	xorl	%ebx,%esi
+	addl	%edi,%edx
+	rorl	$7,%eax
+	addl	%esi,%edx
+	por	%xmm4,%xmm2
+	addl	44(%esp),%ecx
+	xorl	%ebx,%ebp
+	movdqa	80(%esp),%xmm4
+	movl	%edx,%esi
+	roll	$5,%edx
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	movdqa	%xmm2,%xmm5
+	addl	%ebp,%ecx
+	addl	48(%esp),%ebx
+	pxor	%xmm7,%xmm3
+.byte	102,15,58,15,233,8
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	pxor	%xmm4,%xmm3
+	movdqa	%xmm7,80(%esp)
+	xorl	%edi,%esi
+	addl	%ecx,%ebx
+	movdqa	%xmm6,%xmm7
+	paddd	%xmm2,%xmm6
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	pxor	%xmm5,%xmm3
+	addl	52(%esp),%eax
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	movdqa	%xmm3,%xmm5
+	movdqa	%xmm6,32(%esp)
+	xorl	%edx,%ebp
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%ebp,%eax
+	pslld	$2,%xmm3
+	addl	56(%esp),%edi
+	xorl	%edx,%esi
+	psrld	$30,%xmm5
+	movl	%eax,%ebp
+	roll	$5,%eax
+	xorl	%ecx,%esi
+	addl	%eax,%edi
+	rorl	$7,%ebx
+	addl	%esi,%edi
+	por	%xmm5,%xmm3
+	addl	60(%esp),%edx
+	xorl	%ecx,%ebp
+	movdqa	96(%esp),%xmm5
+	movl	%edi,%esi
+	roll	$5,%edi
+	xorl	%ebx,%ebp
+	addl	%edi,%edx
+	rorl	$7,%eax
+	movdqa	%xmm3,%xmm6
+	addl	%ebp,%edx
+	addl	(%esp),%ecx
+	pxor	%xmm0,%xmm4
+.byte	102,15,58,15,242,8
+	xorl	%ebx,%esi
+	movl	%edx,%ebp
+	roll	$5,%edx
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm0,96(%esp)
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	movdqa	%xmm7,%xmm0
+	paddd	%xmm3,%xmm7
+	rorl	$7,%edi
+	addl	%esi,%ecx
+	pxor	%xmm6,%xmm4
+	addl	4(%esp),%ebx
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm7,48(%esp)
+	xorl	%edi,%ebp
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%ebp,%ebx
+	pslld	$2,%xmm4
+	addl	8(%esp),%eax
+	xorl	%edi,%esi
+	psrld	$30,%xmm6
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	por	%xmm6,%xmm4
+	addl	12(%esp),%edi
+	xorl	%edx,%ebp
+	movdqa	64(%esp),%xmm6
+	movl	%eax,%esi
+	roll	$5,%eax
+	xorl	%ecx,%ebp
+	addl	%eax,%edi
+	rorl	$7,%ebx
+	movdqa	%xmm4,%xmm7
+	addl	%ebp,%edi
+	addl	16(%esp),%edx
+	pxor	%xmm1,%xmm5
+.byte	102,15,58,15,251,8
+	xorl	%ecx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	pxor	%xmm6,%xmm5
+	movdqa	%xmm1,64(%esp)
+	xorl	%ebx,%esi
+	addl	%edi,%edx
+	movdqa	%xmm0,%xmm1
+	paddd	%xmm4,%xmm0
+	rorl	$7,%eax
+	addl	%esi,%edx
+	pxor	%xmm7,%xmm5
+	addl	20(%esp),%ecx
+	xorl	%ebx,%ebp
+	movl	%edx,%esi
+	roll	$5,%edx
+	movdqa	%xmm5,%xmm7
+	movdqa	%xmm0,(%esp)
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	addl	%ebp,%ecx
+	pslld	$2,%xmm5
+	addl	24(%esp),%ebx
+	xorl	%eax,%esi
+	psrld	$30,%xmm7
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	xorl	%edi,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	por	%xmm7,%xmm5
+	addl	28(%esp),%eax
+	xorl	%edi,%ebp
+	movdqa	80(%esp),%xmm7
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	xorl	%edx,%ebp
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	movdqa	%xmm5,%xmm0
+	addl	%ebp,%eax
+	movl	%ecx,%ebp
+	pxor	%xmm2,%xmm6
+.byte	102,15,58,15,196,8
+	xorl	%edx,%ecx
+	addl	32(%esp),%edi
+	andl	%edx,%ebp
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm2,80(%esp)
+	andl	%ecx,%esi
+	rorl	$7,%ebx
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm5,%xmm1
+	addl	%ebp,%edi
+	movl	%eax,%ebp
+	pxor	%xmm0,%xmm6
+	roll	$5,%eax
+	addl	%esi,%edi
+	xorl	%edx,%ecx
+	addl	%eax,%edi
+	movdqa	%xmm6,%xmm0
+	movdqa	%xmm1,16(%esp)
+	movl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	36(%esp),%edx
+	andl	%ecx,%esi
+	pslld	$2,%xmm6
+	andl	%ebx,%ebp
+	rorl	$7,%eax
+	psrld	$30,%xmm0
+	addl	%esi,%edx
+	movl	%edi,%esi
+	roll	$5,%edi
+	addl	%ebp,%edx
+	xorl	%ecx,%ebx
+	addl	%edi,%edx
+	por	%xmm0,%xmm6
+	movl	%eax,%ebp
+	xorl	%ebx,%eax
+	movdqa	96(%esp),%xmm0
+	addl	40(%esp),%ecx
+	andl	%ebx,%ebp
+	andl	%eax,%esi
+	rorl	$7,%edi
+	addl	%ebp,%ecx
+	movdqa	%xmm6,%xmm1
+	movl	%edx,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	movl	%edi,%esi
+	xorl	%eax,%edi
+	addl	44(%esp),%ebx
+	andl	%eax,%esi
+	andl	%edi,%ebp
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%eax,%edi
+	addl	%ecx,%ebx
+	movl	%edx,%ebp
+	pxor	%xmm3,%xmm7
+.byte	102,15,58,15,205,8
+	xorl	%edi,%edx
+	addl	48(%esp),%eax
+	andl	%edi,%ebp
+	pxor	%xmm0,%xmm7
+	movdqa	%xmm3,96(%esp)
+	andl	%edx,%esi
+	rorl	$7,%ecx
+	movdqa	144(%esp),%xmm3
+	paddd	%xmm6,%xmm2
+	addl	%ebp,%eax
+	movl	%ebx,%ebp
+	pxor	%xmm1,%xmm7
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edi,%edx
+	addl	%ebx,%eax
+	movdqa	%xmm7,%xmm1
+	movdqa	%xmm2,32(%esp)
+	movl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	52(%esp),%edi
+	andl	%edx,%esi
+	pslld	$2,%xmm7
+	andl	%ecx,%ebp
+	rorl	$7,%ebx
+	psrld	$30,%xmm1
+	addl	%esi,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%edx,%ecx
+	addl	%eax,%edi
+	por	%xmm1,%xmm7
+	movl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	movdqa	64(%esp),%xmm1
+	addl	56(%esp),%edx
+	andl	%ecx,%ebp
+	andl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	movdqa	%xmm7,%xmm2
+	movl	%edi,%ebp
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%ecx,%ebx
+	addl	%edi,%edx
+	movl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	60(%esp),%ecx
+	andl	%ebx,%esi
+	andl	%eax,%ebp
+	rorl	$7,%edi
+	addl	%esi,%ecx
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	movl	%edi,%ebp
+	pxor	%xmm4,%xmm0
+.byte	102,15,58,15,214,8
+	xorl	%eax,%edi
+	addl	(%esp),%ebx
+	andl	%eax,%ebp
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm4,64(%esp)
+	andl	%edi,%esi
+	rorl	$7,%edx
+	movdqa	%xmm3,%xmm4
+	paddd	%xmm7,%xmm3
+	addl	%ebp,%ebx
+	movl	%ecx,%ebp
+	pxor	%xmm2,%xmm0
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%eax,%edi
+	addl	%ecx,%ebx
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm3,48(%esp)
+	movl	%edx,%esi
+	xorl	%edi,%edx
+	addl	4(%esp),%eax
+	andl	%edi,%esi
+	pslld	$2,%xmm0
+	andl	%edx,%ebp
+	rorl	$7,%ecx
+	psrld	$30,%xmm2
+	addl	%esi,%eax
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	xorl	%edi,%edx
+	addl	%ebx,%eax
+	por	%xmm2,%xmm0
+	movl	%ecx,%ebp
+	xorl	%edx,%ecx
+	movdqa	80(%esp),%xmm2
+	addl	8(%esp),%edi
+	andl	%edx,%ebp
+	andl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%ebp,%edi
+	movdqa	%xmm0,%xmm3
+	movl	%eax,%ebp
+	roll	$5,%eax
+	addl	%esi,%edi
+	xorl	%edx,%ecx
+	addl	%eax,%edi
+	movl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	12(%esp),%edx
+	andl	%ecx,%esi
+	andl	%ebx,%ebp
+	rorl	$7,%eax
+	addl	%esi,%edx
+	movl	%edi,%esi
+	roll	$5,%edi
+	addl	%ebp,%edx
+	xorl	%ecx,%ebx
+	addl	%edi,%edx
+	movl	%eax,%ebp
+	pxor	%xmm5,%xmm1
+.byte	102,15,58,15,223,8
+	xorl	%ebx,%eax
+	addl	16(%esp),%ecx
+	andl	%ebx,%ebp
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm5,80(%esp)
+	andl	%eax,%esi
+	rorl	$7,%edi
+	movdqa	%xmm4,%xmm5
+	paddd	%xmm0,%xmm4
+	addl	%ebp,%ecx
+	movl	%edx,%ebp
+	pxor	%xmm3,%xmm1
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	movdqa	%xmm1,%xmm3
+	movdqa	%xmm4,(%esp)
+	movl	%edi,%esi
+	xorl	%eax,%edi
+	addl	20(%esp),%ebx
+	andl	%eax,%esi
+	pslld	$2,%xmm1
+	andl	%edi,%ebp
+	rorl	$7,%edx
+	psrld	$30,%xmm3
+	addl	%esi,%ebx
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%eax,%edi
+	addl	%ecx,%ebx
+	por	%xmm3,%xmm1
+	movl	%edx,%ebp
+	xorl	%edi,%edx
+	movdqa	96(%esp),%xmm3
+	addl	24(%esp),%eax
+	andl	%edi,%ebp
+	andl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebp,%eax
+	movdqa	%xmm1,%xmm4
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edi,%edx
+	addl	%ebx,%eax
+	movl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	28(%esp),%edi
+	andl	%edx,%esi
+	andl	%ecx,%ebp
+	rorl	$7,%ebx
+	addl	%esi,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%edx,%ecx
+	addl	%eax,%edi
+	movl	%ebx,%ebp
+	pxor	%xmm6,%xmm2
+.byte	102,15,58,15,224,8
+	xorl	%ecx,%ebx
+	addl	32(%esp),%edx
+	andl	%ecx,%ebp
+	pxor	%xmm3,%xmm2
+	movdqa	%xmm6,96(%esp)
+	andl	%ebx,%esi
+	rorl	$7,%eax
+	movdqa	%xmm5,%xmm6
+	paddd	%xmm1,%xmm5
+	addl	%ebp,%edx
+	movl	%edi,%ebp
+	pxor	%xmm4,%xmm2
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%ecx,%ebx
+	addl	%edi,%edx
+	movdqa	%xmm2,%xmm4
+	movdqa	%xmm5,16(%esp)
+	movl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	36(%esp),%ecx
+	andl	%ebx,%esi
+	pslld	$2,%xmm2
+	andl	%eax,%ebp
+	rorl	$7,%edi
+	psrld	$30,%xmm4
+	addl	%esi,%ecx
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	por	%xmm4,%xmm2
+	movl	%edi,%ebp
+	xorl	%eax,%edi
+	movdqa	64(%esp),%xmm4
+	addl	40(%esp),%ebx
+	andl	%eax,%ebp
+	andl	%edi,%esi
+	rorl	$7,%edx
+	addl	%ebp,%ebx
+	movdqa	%xmm2,%xmm5
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%eax,%edi
+	addl	%ecx,%ebx
+	movl	%edx,%esi
+	xorl	%edi,%edx
+	addl	44(%esp),%eax
+	andl	%edi,%esi
+	andl	%edx,%ebp
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	xorl	%edi,%edx
+	addl	%ebx,%eax
+	addl	48(%esp),%edi
+	pxor	%xmm7,%xmm3
+.byte	102,15,58,15,233,8
+	xorl	%edx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	pxor	%xmm4,%xmm3
+	movdqa	%xmm7,64(%esp)
+	xorl	%ecx,%esi
+	addl	%eax,%edi
+	movdqa	%xmm6,%xmm7
+	paddd	%xmm2,%xmm6
+	rorl	$7,%ebx
+	addl	%esi,%edi
+	pxor	%xmm5,%xmm3
+	addl	52(%esp),%edx
+	xorl	%ecx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	movdqa	%xmm3,%xmm5
+	movdqa	%xmm6,32(%esp)
+	xorl	%ebx,%ebp
+	addl	%edi,%edx
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	pslld	$2,%xmm3
+	addl	56(%esp),%ecx
+	xorl	%ebx,%esi
+	psrld	$30,%xmm5
+	movl	%edx,%ebp
+	roll	$5,%edx
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	addl	%esi,%ecx
+	por	%xmm5,%xmm3
+	addl	60(%esp),%ebx
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	xorl	%edi,%ebp
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%ebp,%ebx
+	addl	(%esp),%eax
+	paddd	%xmm3,%xmm7
+	xorl	%edi,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	xorl	%edx,%esi
+	movdqa	%xmm7,48(%esp)
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	addl	4(%esp),%edi
+	xorl	%edx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	xorl	%ecx,%ebp
+	addl	%eax,%edi
+	rorl	$7,%ebx
+	addl	%ebp,%edi
+	addl	8(%esp),%edx
+	xorl	%ecx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	xorl	%ebx,%esi
+	addl	%edi,%edx
+	rorl	$7,%eax
+	addl	%esi,%edx
+	addl	12(%esp),%ecx
+	xorl	%ebx,%ebp
+	movl	%edx,%esi
+	roll	$5,%edx
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	addl	%ebp,%ecx
+	movl	196(%esp),%ebp
+	cmpl	200(%esp),%ebp
+	je	.L005done
+	movdqa	160(%esp),%xmm7
+	movdqa	176(%esp),%xmm6
+	movdqu	(%ebp),%xmm0
+	movdqu	16(%ebp),%xmm1
+	movdqu	32(%ebp),%xmm2
+	movdqu	48(%ebp),%xmm3
+	addl	$64,%ebp
+.byte	102,15,56,0,198
+	movl	%ebp,196(%esp)
+	movdqa	%xmm7,96(%esp)
+	addl	16(%esp),%ebx
+	xorl	%eax,%esi
+.byte	102,15,56,0,206
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	paddd	%xmm7,%xmm0
+	xorl	%edi,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	movdqa	%xmm0,(%esp)
+	addl	20(%esp),%eax
+	xorl	%edi,%ebp
+	psubd	%xmm7,%xmm0
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	xorl	%edx,%ebp
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%ebp,%eax
+	addl	24(%esp),%edi
+	xorl	%edx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	xorl	%ecx,%esi
+	addl	%eax,%edi
+	rorl	$7,%ebx
+	addl	%esi,%edi
+	addl	28(%esp),%edx
+	xorl	%ecx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	xorl	%ebx,%ebp
+	addl	%edi,%edx
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	32(%esp),%ecx
+	xorl	%ebx,%esi
+.byte	102,15,56,0,214
+	movl	%edx,%ebp
+	roll	$5,%edx
+	paddd	%xmm7,%xmm1
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	addl	%esi,%ecx
+	movdqa	%xmm1,16(%esp)
+	addl	36(%esp),%ebx
+	xorl	%eax,%ebp
+	psubd	%xmm7,%xmm1
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	xorl	%edi,%ebp
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%ebp,%ebx
+	addl	40(%esp),%eax
+	xorl	%edi,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	addl	44(%esp),%edi
+	xorl	%edx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	xorl	%ecx,%ebp
+	addl	%eax,%edi
+	rorl	$7,%ebx
+	addl	%ebp,%edi
+	addl	48(%esp),%edx
+	xorl	%ecx,%esi
+.byte	102,15,56,0,222
+	movl	%edi,%ebp
+	roll	$5,%edi
+	paddd	%xmm7,%xmm2
+	xorl	%ebx,%esi
+	addl	%edi,%edx
+	rorl	$7,%eax
+	addl	%esi,%edx
+	movdqa	%xmm2,32(%esp)
+	addl	52(%esp),%ecx
+	xorl	%ebx,%ebp
+	psubd	%xmm7,%xmm2
+	movl	%edx,%esi
+	roll	$5,%edx
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	addl	%ebp,%ecx
+	addl	56(%esp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	xorl	%edi,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	addl	60(%esp),%eax
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	xorl	%edx,%ebp
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%ebp,%eax
+	movl	192(%esp),%ebp
+	addl	(%ebp),%eax
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%ecx
+	movl	%eax,(%ebp)
+	addl	12(%ebp),%edx
+	movl	%esi,4(%ebp)
+	addl	16(%ebp),%edi
+	movl	%ecx,8(%ebp)
+	movl	%esi,%ebx
+	movl	%edx,12(%ebp)
+	movl	%edi,16(%ebp)
+	movdqa	%xmm1,%xmm4
+	jmp	.L004loop
+.align	16
+.L005done:
+	addl	16(%esp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	xorl	%edi,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	addl	20(%esp),%eax
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	xorl	%edx,%ebp
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%ebp,%eax
+	addl	24(%esp),%edi
+	xorl	%edx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	xorl	%ecx,%esi
+	addl	%eax,%edi
+	rorl	$7,%ebx
+	addl	%esi,%edi
+	addl	28(%esp),%edx
+	xorl	%ecx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	xorl	%ebx,%ebp
+	addl	%edi,%edx
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	32(%esp),%ecx
+	xorl	%ebx,%esi
+	movl	%edx,%ebp
+	roll	$5,%edx
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	addl	%esi,%ecx
+	addl	36(%esp),%ebx
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	xorl	%edi,%ebp
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%ebp,%ebx
+	addl	40(%esp),%eax
+	xorl	%edi,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	addl	44(%esp),%edi
+	xorl	%edx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	xorl	%ecx,%ebp
+	addl	%eax,%edi
+	rorl	$7,%ebx
+	addl	%ebp,%edi
+	addl	48(%esp),%edx
+	xorl	%ecx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	xorl	%ebx,%esi
+	addl	%edi,%edx
+	rorl	$7,%eax
+	addl	%esi,%edx
+	addl	52(%esp),%ecx
+	xorl	%ebx,%ebp
+	movl	%edx,%esi
+	roll	$5,%edx
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	rorl	$7,%edi
+	addl	%ebp,%ecx
+	addl	56(%esp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	xorl	%edi,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	addl	60(%esp),%eax
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	xorl	%edx,%ebp
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%ebp,%eax
+	movl	192(%esp),%ebp
+	addl	(%ebp),%eax
+	movl	204(%esp),%esp
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%ecx
+	movl	%eax,(%ebp)
+	addl	12(%ebp),%edx
+	movl	%esi,4(%ebp)
+	addl	16(%ebp),%edi
+	movl	%ecx,8(%ebp)
+	movl	%edx,12(%ebp)
+	movl	%edi,16(%ebp)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	_sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3
+.align	64
+.LK_XX_XX:
+.long	1518500249,1518500249,1518500249,1518500249
+.long	1859775393,1859775393,1859775393,1859775393
+.long	2400959708,2400959708,2400959708,2400959708
+.long	3395469782,3395469782,3395469782,3395469782
+.long	66051,67438087,134810123,202182159
 .byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
 .byte	102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
 .byte	89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
 .byte	114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.comm	OPENSSL_ia32cap_P,8,4

Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/x86cpuid.S
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/x86cpuid.S:1.8 src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/x86cpuid.S:1.9
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/x86cpuid.S:1.8	Sat Jan 11 18:20:06 2014
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/x86cpuid.S	Sat May 16 17:32:54 2015
@@ -226,6 +226,18 @@ OPENSSL_wipe_cpu:
 	movl	(%ecx),%ecx
 	btl	$1,(%ecx)
 	jnc	.L015no_x87
+	andl	$83886080,%ecx
+	cmpl	$83886080,%ecx
+	jne	.L016no_sse2
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+.L016no_sse2:
 .long	4007259865,4007259865,4007259865,4007259865,2430851995
 .L015no_x87:
 	leal	4(%esp),%eax
@@ -241,11 +253,11 @@ OPENSSL_atomic_add:
 	pushl	%ebx
 	nop
 	movl	(%edx),%eax
-.L016spin:
+.L017spin:
 	leal	(%eax,%ecx,1),%ebx
 	nop
 .long	447811568
-	jne	.L016spin
+	jne	.L017spin
 	movl	%ebx,%eax
 	popl	%ebx
 	ret
@@ -286,32 +298,32 @@ OPENSSL_cleanse:
 	movl	8(%esp),%ecx
 	xorl	%eax,%eax
 	cmpl	$7,%ecx
-	jae	.L017lot
+	jae	.L018lot
 	cmpl	$0,%ecx
-	je	.L018ret
-.L019little:
+	je	.L019ret
+.L020little:
 	movb	%al,(%edx)
 	subl	$1,%ecx
 	leal	1(%edx),%edx
-	jnz	.L019little
-.L018ret:
+	jnz	.L020little
+.L019ret:
 	ret
 .align	16
-.L017lot:
+.L018lot:
 	testl	$3,%edx
-	jz	.L020aligned
+	jz	.L021aligned
 	movb	%al,(%edx)
 	leal	-1(%ecx),%ecx
 	leal	1(%edx),%edx
-	jmp	.L017lot
-.L020aligned:
+	jmp	.L018lot
+.L021aligned:
 	movl	%eax,(%edx)
 	leal	-4(%ecx),%ecx
 	testl	$-4,%ecx
 	leal	4(%edx),%edx
-	jnz	.L020aligned
+	jnz	.L021aligned
 	cmpl	$0,%ecx
-	jne	.L019little
+	jne	.L020little
 	ret
 .size	OPENSSL_cleanse,.-.L_OPENSSL_cleanse_begin
 .globl	OPENSSL_ia32_rdrand
@@ -320,11 +332,11 @@ OPENSSL_cleanse:
 OPENSSL_ia32_rdrand:
 .L_OPENSSL_ia32_rdrand_begin:
 	movl	$8,%ecx
-.L021loop:
+.L022loop:
 .byte	15,199,240
-	jc	.L022break
-	loop	.L021loop
-.L022break:
+	jc	.L023break
+	loop	.L022loop
+.L023break:
 	cmpl	$0,%eax
 	cmovel	%ecx,%eax
 	ret

Added files:

Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/modes.inc
diff -u /dev/null src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/modes.inc:1.1
--- /dev/null	Sat May 16 17:32:54 2015
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/modes.inc	Sat May 16 17:32:54 2015
@@ -0,0 +1,4 @@
+.PATH.S: ${.PARSEDIR}
+MODES_SRCS += ghash-x86.o
+MODESCPPFLAGS = -DGHASH_ASM
+.include "../../modes.inc"

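As background for the new modes.inc fragment above: compiling ghash-x86.o with -DGHASH_ASM is what allows the generic GCM code to pick an assembler GHASH routine at run time, keyed off the CPU capability word (OPENSSL_ia32cap_P) that the other files in this diff reference. The following is only a minimal, self-contained C sketch of that build-flag-plus-capability dispatch pattern; ghash_c, ghash_asm, cpu_has_sse2, ghash_select and ghash_fn are illustrative placeholders, not symbols from this commit or from OpenSSL itself.

	/*
	 * Sketch only: how a -DGHASH_ASM style build flag can gate
	 * run-time selection of an assembler GHASH implementation.
	 * All names here are placeholders for illustration.
	 */
	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	typedef void (*ghash_fn)(uint64_t Xi[2], const uint8_t *inp, size_t len);

	/* Portable fallback; here it just folds the input into the state. */
	static void ghash_c(uint64_t Xi[2], const uint8_t *inp, size_t len)
	{
		for (size_t i = 0; i < len; i++)
			((uint8_t *)Xi)[i % 16] ^= inp[i];
	}

	#ifdef GHASH_ASM
	/* Stand-in for a routine that would be provided by an assembler file. */
	static void ghash_asm(uint64_t Xi[2], const uint8_t *inp, size_t len)
	{
		ghash_c(Xi, inp, len);	/* placeholder body */
	}

	static int cpu_has_sse2(void)
	{
		/* Placeholder; real code would consult CPUID capability bits. */
		return 1;
	}
	#endif

	/* Choose the implementation once, based on build flag and CPU features. */
	static ghash_fn ghash_select(void)
	{
	#ifdef GHASH_ASM
		if (cpu_has_sse2())
			return ghash_asm;
	#endif
		return ghash_c;
	}

	int main(void)
	{
		uint64_t Xi[2] = { 0, 0 };
		const uint8_t block[16] = "example block!!";

		ghash_select()(Xi, block, sizeof(block));
		printf("%016llx%016llx\n",
		    (unsigned long long)Xi[0], (unsigned long long)Xi[1]);
		return 0;
	}

Built with and without -DGHASH_ASM, the sketch exercises both branches of the selector; in the real build the choice is between the C GHASH and the routines in ghash-x86.S.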