Module Name:    src
Committed By:   joerg
Date:           Sat May 16 19:08:37 UTC 2015

Modified Files:
        src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64: Makefile
            aesni-sha1-x86_64.S sha1-x86_64.S

Log Message:
Explicitly pass CC down. When building with clang, force external
assembler as some of the Perl scripts use -Wa,-v. Regenerate for AVX
support.


To generate a diff of this commit:
cvs rdiff -u -r1.6 -r1.7 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile
cvs rdiff -u -r1.3 -r1.4 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/aesni-sha1-x86_64.S
cvs rdiff -u -r1.5 -r1.6 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile:1.6 src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile:1.7
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile:1.6	Sat Aug  4 11:03:34 2012
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile	Sat May 16 19:08:37 2015
@@ -1,14 +1,18 @@
-#	$NetBSD: Makefile,v 1.6 2012/08/04 11:03:34 christos Exp $
+#	$NetBSD: Makefile,v 1.7 2015/05/16 19:08:37 joerg Exp $
 
 .include "bsd.own.mk"
 
 CRYPTODIST=${NETBSDSRCDIR}/crypto
 .include "${NETBSDSRCDIR}/crypto/Makefile.openssl"
 
+.if make(regen) && ${HAVE_LLVM:U} == "yes"
+CC+= -fno-integrated-as
+.endif
+
 regen:
 	for i in $$(find ${OPENSSLSRC} -name \*${MACHINE_ARCH}.pl) \
 		${OPENSSLSRC}/crypto/${MACHINE_ARCH}cpuid.pl ; do \
-                (echo "#include <machine/asm.h>"; perl $$i elf | sed \
+                (echo "#include <machine/asm.h>"; CC=${CC:Q} perl $$i elf | sed \
 		    -e 's/\(OPENSSL[A-Za-z0-9_+]*\)(%rip)/\1@GOTPCREL(%rip)/' \
 		    -e 's/.hidden	OPENSSL_cpuid_setup/.globl	OPENSSL_cpuid_setup/' \
 		    -e 's/call	OPENSSL_cpuid_setup/call	PIC_PLT(OPENSSL_cpuid_setup)/') \

Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/aesni-sha1-x86_64.S
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/aesni-sha1-x86_64.S:1.3 src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/aesni-sha1-x86_64.S:1.4
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/aesni-sha1-x86_64.S:1.3	Sat Aug  4 11:03:34 2012
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/aesni-sha1-x86_64.S	Sat May 16 19:08:37 2015
@@ -9,6 +9,11 @@ aesni_cbc_sha1_enc:
 
 	movl	OPENSSL_ia32cap_P+0@GOTPCREL(%rip),%r10d
 	movl	OPENSSL_ia32cap_P+4@GOTPCREL(%rip),%r11d
+	andl	$268435456,%r11d
+	andl	$1073741824,%r10d
+	orl	%r11d,%r10d
+	cmpl	$1342177280,%r10d
+	je	aesni_cbc_sha1_enc_avx
 	jmp	aesni_cbc_sha1_enc_ssse3
 	.byte	0xf3,0xc3
 .size	aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
@@ -1385,6 +1390,1343 @@ aesni_cbc_sha1_enc_ssse3:
 .Lepilogue_ssse3:
 	.byte	0xf3,0xc3
 .size	aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+.type	aesni_cbc_sha1_enc_avx,@function
+.align	16
+aesni_cbc_sha1_enc_avx:
+	movq	8(%rsp),%r10
+
+
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	leaq	-104(%rsp),%rsp
+
+
+	vzeroall
+	movq	%rdi,%r12
+	movq	%rsi,%r13
+	movq	%rdx,%r14
+	movq	%rcx,%r15
+	vmovdqu	(%r8),%xmm11
+	movq	%r8,88(%rsp)
+	shlq	$6,%r14
+	subq	%r12,%r13
+	movl	240(%r15),%r8d
+	addq	$112,%r15
+	addq	%r10,%r14
+
+	leaq	K_XX_XX(%rip),%r11
+	movl	0(%r9),%eax
+	movl	4(%r9),%ebx
+	movl	8(%r9),%ecx
+	movl	12(%r9),%edx
+	movl	%ebx,%esi
+	movl	16(%r9),%ebp
+
+	vmovdqa	64(%r11),%xmm6
+	vmovdqa	0(%r11),%xmm9
+	vmovdqu	0(%r10),%xmm0
+	vmovdqu	16(%r10),%xmm1
+	vmovdqu	32(%r10),%xmm2
+	vmovdqu	48(%r10),%xmm3
+	vpshufb	%xmm6,%xmm0,%xmm0
+	addq	$64,%r10
+	vpshufb	%xmm6,%xmm1,%xmm1
+	vpshufb	%xmm6,%xmm2,%xmm2
+	vpshufb	%xmm6,%xmm3,%xmm3
+	vpaddd	%xmm9,%xmm0,%xmm4
+	vpaddd	%xmm9,%xmm1,%xmm5
+	vpaddd	%xmm9,%xmm2,%xmm6
+	vmovdqa	%xmm4,0(%rsp)
+	vmovdqa	%xmm5,16(%rsp)
+	vmovdqa	%xmm6,32(%rsp)
+	vmovups	-112(%r15),%xmm13
+	vmovups	16-112(%r15),%xmm14
+	jmp	.Loop_avx
+.align	16
+.Loop_avx:
+	addl	0(%rsp),%ebp
+	vmovups	0(%r12),%xmm12
+	vxorps	%xmm13,%xmm12,%xmm12
+	vxorps	%xmm12,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	-80(%r15),%xmm15
+	xorl	%edx,%ecx
+	vpalignr	$8,%xmm0,%xmm1,%xmm4
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	vpaddd	%xmm3,%xmm9,%xmm9
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	vpsrldq	$4,%xmm3,%xmm8
+	xorl	%edx,%esi
+	addl	%eax,%ebp
+	vpxor	%xmm0,%xmm4,%xmm4
+	shrdl	$2,%ebx,%ebx
+	addl	%esi,%ebp
+	vpxor	%xmm2,%xmm8,%xmm8
+	addl	4(%rsp),%edx
+	xorl	%ecx,%ebx
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	vpxor	%xmm8,%xmm4,%xmm4
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	vmovdqa	%xmm9,48(%rsp)
+	xorl	%ecx,%edi
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	-64(%r15),%xmm14
+	addl	%ebp,%edx
+	vpsrld	$31,%xmm4,%xmm8
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	8(%rsp),%ecx
+	xorl	%ebx,%eax
+	vpslldq	$12,%xmm4,%xmm10
+	vpaddd	%xmm4,%xmm4,%xmm4
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm4,%xmm4
+	xorl	%ebx,%esi
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm4,%xmm4
+	addl	12(%rsp),%ebx
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	-48(%r15),%xmm15
+	vpxor	%xmm10,%xmm4,%xmm4
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	vmovdqa	0(%r11),%xmm10
+	xorl	%eax,%edi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%edi,%ebx
+	addl	16(%rsp),%eax
+	xorl	%ebp,%edx
+	vpalignr	$8,%xmm1,%xmm2,%xmm5
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	vpaddd	%xmm4,%xmm10,%xmm10
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	vpsrldq	$4,%xmm4,%xmm9
+	xorl	%ebp,%esi
+	addl	%ebx,%eax
+	vpxor	%xmm1,%xmm5,%xmm5
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	vpxor	%xmm3,%xmm9,%xmm9
+	addl	20(%rsp),%ebp
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	-32(%r15),%xmm14
+	xorl	%edx,%ecx
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	vpxor	%xmm9,%xmm5,%xmm5
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	vmovdqa	%xmm10,0(%rsp)
+	xorl	%edx,%edi
+	addl	%eax,%ebp
+	vpsrld	$31,%xmm5,%xmm9
+	shrdl	$7,%ebx,%ebx
+	addl	%edi,%ebp
+	addl	24(%rsp),%edx
+	xorl	%ecx,%ebx
+	vpslldq	$12,%xmm5,%xmm8
+	vpaddd	%xmm5,%xmm5,%xmm5
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	vpsrld	$30,%xmm8,%xmm10
+	vpor	%xmm9,%xmm5,%xmm5
+	xorl	%ecx,%esi
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	-16(%r15),%xmm15
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	vpslld	$2,%xmm8,%xmm8
+	vpxor	%xmm10,%xmm5,%xmm5
+	addl	28(%rsp),%ecx
+	xorl	%ebx,%eax
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	vpxor	%xmm8,%xmm5,%xmm5
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	vmovdqa	16(%r11),%xmm8
+	xorl	%ebx,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%edi,%ecx
+	addl	32(%rsp),%ebx
+	xorl	%eax,%ebp
+	vpalignr	$8,%xmm2,%xmm3,%xmm6
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	0(%r15),%xmm14
+	vpaddd	%xmm5,%xmm8,%xmm8
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	vpsrldq	$4,%xmm5,%xmm10
+	xorl	%eax,%esi
+	addl	%ecx,%ebx
+	vpxor	%xmm2,%xmm6,%xmm6
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	vpxor	%xmm4,%xmm10,%xmm10
+	addl	36(%rsp),%eax
+	xorl	%ebp,%edx
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	vpxor	%xmm10,%xmm6,%xmm6
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	vmovdqa	%xmm8,16(%rsp)
+	xorl	%ebp,%edi
+	addl	%ebx,%eax
+	vpsrld	$31,%xmm6,%xmm10
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	addl	40(%rsp),%ebp
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	16(%r15),%xmm15
+	xorl	%edx,%ecx
+	vpslldq	$12,%xmm6,%xmm9
+	vpaddd	%xmm6,%xmm6,%xmm6
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	vpsrld	$30,%xmm9,%xmm8
+	vpor	%xmm10,%xmm6,%xmm6
+	xorl	%edx,%esi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%esi,%ebp
+	vpslld	$2,%xmm9,%xmm9
+	vpxor	%xmm8,%xmm6,%xmm6
+	addl	44(%rsp),%edx
+	xorl	%ecx,%ebx
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	vpxor	%xmm9,%xmm6,%xmm6
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	vmovdqa	16(%r11),%xmm9
+	xorl	%ecx,%edi
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	32(%r15),%xmm14
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	48(%rsp),%ecx
+	xorl	%ebx,%eax
+	vpalignr	$8,%xmm3,%xmm4,%xmm7
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	vpaddd	%xmm6,%xmm9,%xmm9
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	vpsrldq	$4,%xmm6,%xmm8
+	xorl	%ebx,%esi
+	addl	%edx,%ecx
+	vpxor	%xmm3,%xmm7,%xmm7
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	vpxor	%xmm5,%xmm8,%xmm8
+	addl	52(%rsp),%ebx
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	48(%r15),%xmm15
+	vpxor	%xmm8,%xmm7,%xmm7
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	vmovdqa	%xmm9,32(%rsp)
+	xorl	%eax,%edi
+	addl	%ecx,%ebx
+	vpsrld	$31,%xmm7,%xmm8
+	shrdl	$7,%edx,%edx
+	addl	%edi,%ebx
+	addl	56(%rsp),%eax
+	xorl	%ebp,%edx
+	vpslldq	$12,%xmm7,%xmm10
+	vpaddd	%xmm7,%xmm7,%xmm7
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm7,%xmm7
+	xorl	%ebp,%esi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm7,%xmm7
+	addl	60(%rsp),%ebp
+	cmpl	$11,%r8d
+	jb	.Lvaesenclast1
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	64(%r15),%xmm14
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	80(%r15),%xmm15
+	je	.Lvaesenclast1
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	96(%r15),%xmm14
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	112(%r15),%xmm15
+.Lvaesenclast1:
+	vaesenclast	%xmm15,%xmm11,%xmm11
+	vmovups	16-112(%r15),%xmm14
+	xorl	%edx,%ecx
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	vpxor	%xmm10,%xmm7,%xmm7
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	vmovdqa	16(%r11),%xmm10
+	xorl	%edx,%edi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%edi,%ebp
+	vpalignr	$8,%xmm6,%xmm7,%xmm9
+	vpxor	%xmm4,%xmm0,%xmm0
+	addl	0(%rsp),%edx
+	xorl	%ecx,%ebx
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	vpxor	%xmm1,%xmm0,%xmm0
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	vmovdqa	%xmm10,%xmm8
+	vpaddd	%xmm7,%xmm10,%xmm10
+	xorl	%ecx,%esi
+	vmovups	16(%r12),%xmm12
+	vxorps	%xmm13,%xmm12,%xmm12
+	vmovups	%xmm11,0(%r13,%r12,1)
+	vxorps	%xmm12,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	-80(%r15),%xmm15
+	addl	%ebp,%edx
+	vpxor	%xmm9,%xmm0,%xmm0
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	addl	4(%rsp),%ecx
+	xorl	%ebx,%eax
+	vpsrld	$30,%xmm0,%xmm9
+	vmovdqa	%xmm10,48(%rsp)
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	vpslld	$2,%xmm0,%xmm0
+	xorl	%ebx,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%edi,%ecx
+	addl	8(%rsp),%ebx
+	xorl	%eax,%ebp
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	-64(%r15),%xmm14
+	vpor	%xmm9,%xmm0,%xmm0
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	vmovdqa	%xmm0,%xmm10
+	xorl	%eax,%esi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	addl	12(%rsp),%eax
+	xorl	%ebp,%edx
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	xorl	%ebp,%edi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	vpalignr	$8,%xmm7,%xmm0,%xmm10
+	vpxor	%xmm5,%xmm1,%xmm1
+	addl	16(%rsp),%ebp
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	-48(%r15),%xmm15
+	xorl	%edx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	vpxor	%xmm2,%xmm1,%xmm1
+	xorl	%ecx,%esi
+	addl	%eax,%ebp
+	vmovdqa	%xmm8,%xmm9
+	vpaddd	%xmm0,%xmm8,%xmm8
+	shrdl	$7,%ebx,%ebx
+	addl	%esi,%ebp
+	vpxor	%xmm10,%xmm1,%xmm1
+	addl	20(%rsp),%edx
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	vpsrld	$30,%xmm1,%xmm10
+	vmovdqa	%xmm8,0(%rsp)
+	xorl	%ebx,%edi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpslld	$2,%xmm1,%xmm1
+	addl	24(%rsp),%ecx
+	xorl	%ebx,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	xorl	%eax,%esi
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	-32(%r15),%xmm14
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	vpor	%xmm10,%xmm1,%xmm1
+	addl	28(%rsp),%ebx
+	xorl	%eax,%edi
+	vmovdqa	%xmm1,%xmm8
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	xorl	%ebp,%edi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%edi,%ebx
+	vpalignr	$8,%xmm0,%xmm1,%xmm8
+	vpxor	%xmm6,%xmm2,%xmm2
+	addl	32(%rsp),%eax
+	xorl	%ebp,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	vpxor	%xmm3,%xmm2,%xmm2
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	vmovdqa	32(%r11),%xmm10
+	vpaddd	%xmm1,%xmm9,%xmm9
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	vpxor	%xmm8,%xmm2,%xmm2
+	addl	36(%rsp),%ebp
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	-16(%r15),%xmm15
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	vpsrld	$30,%xmm2,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
+	xorl	%ecx,%edi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%edi,%ebp
+	vpslld	$2,%xmm2,%xmm2
+	addl	40(%rsp),%edx
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	xorl	%ebx,%esi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	vpor	%xmm8,%xmm2,%xmm2
+	addl	44(%rsp),%ecx
+	xorl	%ebx,%edi
+	vmovdqa	%xmm2,%xmm9
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	xorl	%eax,%edi
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	0(%r15),%xmm14
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%edi,%ecx
+	vpalignr	$8,%xmm1,%xmm2,%xmm9
+	vpxor	%xmm7,%xmm3,%xmm3
+	addl	48(%rsp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	vpxor	%xmm4,%xmm3,%xmm3
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	vmovdqa	%xmm10,%xmm8
+	vpaddd	%xmm2,%xmm10,%xmm10
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	vpxor	%xmm9,%xmm3,%xmm3
+	addl	52(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	vpsrld	$30,%xmm3,%xmm9
+	vmovdqa	%xmm10,32(%rsp)
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	vpslld	$2,%xmm3,%xmm3
+	addl	56(%rsp),%ebp
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	16(%r15),%xmm15
+	xorl	%edx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	xorl	%ecx,%esi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%esi,%ebp
+	vpor	%xmm9,%xmm3,%xmm3
+	addl	60(%rsp),%edx
+	xorl	%ecx,%edi
+	vmovdqa	%xmm3,%xmm10
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	xorl	%ebx,%edi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpalignr	$8,%xmm2,%xmm3,%xmm10
+	vpxor	%xmm0,%xmm4,%xmm4
+	addl	0(%rsp),%ecx
+	xorl	%ebx,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%eax,%esi
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	32(%r15),%xmm14
+	addl	%edx,%ecx
+	vmovdqa	%xmm8,%xmm9
+	vpaddd	%xmm3,%xmm8,%xmm8
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	vpxor	%xmm10,%xmm4,%xmm4
+	addl	4(%rsp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	vpsrld	$30,%xmm4,%xmm10
+	vmovdqa	%xmm8,48(%rsp)
+	xorl	%ebp,%edi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%edi,%ebx
+	vpslld	$2,%xmm4,%xmm4
+	addl	8(%rsp),%eax
+	xorl	%ebp,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	vpor	%xmm10,%xmm4,%xmm4
+	addl	12(%rsp),%ebp
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	48(%r15),%xmm15
+	xorl	%edx,%edi
+	vmovdqa	%xmm4,%xmm8
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	xorl	%ecx,%edi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%edi,%ebp
+	vpalignr	$8,%xmm3,%xmm4,%xmm8
+	vpxor	%xmm1,%xmm5,%xmm5
+	addl	16(%rsp),%edx
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	vpxor	%xmm6,%xmm5,%xmm5
+	xorl	%ebx,%esi
+	addl	%ebp,%edx
+	vmovdqa	%xmm9,%xmm10
+	vpaddd	%xmm4,%xmm9,%xmm9
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	vpxor	%xmm8,%xmm5,%xmm5
+	addl	20(%rsp),%ecx
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	vpsrld	$30,%xmm5,%xmm8
+	vmovdqa	%xmm9,0(%rsp)
+	xorl	%eax,%edi
+	cmpl	$11,%r8d
+	jb	.Lvaesenclast2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	64(%r15),%xmm14
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	80(%r15),%xmm15
+	je	.Lvaesenclast2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	96(%r15),%xmm14
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	112(%r15),%xmm15
+.Lvaesenclast2:
+	vaesenclast	%xmm15,%xmm11,%xmm11
+	vmovups	16-112(%r15),%xmm14
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%edi,%ecx
+	vpslld	$2,%xmm5,%xmm5
+	addl	24(%rsp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	vpor	%xmm8,%xmm5,%xmm5
+	addl	28(%rsp),%eax
+	xorl	%ebp,%edi
+	vmovdqa	%xmm5,%xmm9
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	vpalignr	$8,%xmm4,%xmm5,%xmm9
+	vpxor	%xmm2,%xmm6,%xmm6
+	movl	%ecx,%edi
+	vmovups	32(%r12),%xmm12
+	vxorps	%xmm13,%xmm12,%xmm12
+	vmovups	%xmm11,16(%r13,%r12,1)
+	vxorps	%xmm12,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	-80(%r15),%xmm15
+	xorl	%edx,%ecx
+	addl	32(%rsp),%ebp
+	andl	%edx,%edi
+	vpxor	%xmm7,%xmm6,%xmm6
+	andl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	vmovdqa	%xmm10,%xmm8
+	vpaddd	%xmm5,%xmm10,%xmm10
+	addl	%edi,%ebp
+	movl	%eax,%edi
+	vpxor	%xmm9,%xmm6,%xmm6
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	xorl	%edx,%ecx
+	addl	%eax,%ebp
+	vpsrld	$30,%xmm6,%xmm9
+	vmovdqa	%xmm10,16(%rsp)
+	movl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	36(%rsp),%edx
+	andl	%ecx,%esi
+	vpslld	$2,%xmm6,%xmm6
+	andl	%ebx,%edi
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	-64(%r15),%xmm14
+	addl	%edi,%edx
+	xorl	%ecx,%ebx
+	addl	%ebp,%edx
+	vpor	%xmm9,%xmm6,%xmm6
+	movl	%eax,%edi
+	xorl	%ebx,%eax
+	vmovdqa	%xmm6,%xmm10
+	addl	40(%rsp),%ecx
+	andl	%ebx,%edi
+	andl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	movl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	44(%rsp),%ebx
+	andl	%eax,%esi
+	andl	%ebp,%edi
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	-48(%r15),%xmm15
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%eax,%ebp
+	addl	%ecx,%ebx
+	vpalignr	$8,%xmm5,%xmm6,%xmm10
+	vpxor	%xmm3,%xmm7,%xmm7
+	movl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	48(%rsp),%eax
+	andl	%ebp,%edi
+	vpxor	%xmm0,%xmm7,%xmm7
+	andl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	vmovdqa	48(%r11),%xmm9
+	vpaddd	%xmm6,%xmm8,%xmm8
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	vpxor	%xmm10,%xmm7,%xmm7
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%ebp,%edx
+	addl	%ebx,%eax
+	vpsrld	$30,%xmm7,%xmm10
+	vmovdqa	%xmm8,32(%rsp)
+	movl	%ecx,%esi
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	-32(%r15),%xmm14
+	xorl	%edx,%ecx
+	addl	52(%rsp),%ebp
+	andl	%edx,%esi
+	vpslld	$2,%xmm7,%xmm7
+	andl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
+	addl	%esi,%ebp
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%edx,%ecx
+	addl	%eax,%ebp
+	vpor	%xmm10,%xmm7,%xmm7
+	movl	%ebx,%edi
+	xorl	%ecx,%ebx
+	vmovdqa	%xmm7,%xmm8
+	addl	56(%rsp),%edx
+	andl	%ecx,%edi
+	andl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	-16(%r15),%xmm15
+	addl	%esi,%edx
+	xorl	%ecx,%ebx
+	addl	%ebp,%edx
+	movl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	60(%rsp),%ecx
+	andl	%ebx,%esi
+	andl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	vpalignr	$8,%xmm6,%xmm7,%xmm8
+	vpxor	%xmm4,%xmm0,%xmm0
+	movl	%ebp,%edi
+	xorl	%eax,%ebp
+	addl	0(%rsp),%ebx
+	andl	%eax,%edi
+	vpxor	%xmm1,%xmm0,%xmm0
+	andl	%ebp,%esi
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	0(%r15),%xmm14
+	shrdl	$7,%edx,%edx
+	vmovdqa	%xmm9,%xmm10
+	vpaddd	%xmm7,%xmm9,%xmm9
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	vpxor	%xmm8,%xmm0,%xmm0
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%eax,%ebp
+	addl	%ecx,%ebx
+	vpsrld	$30,%xmm0,%xmm8
+	vmovdqa	%xmm9,48(%rsp)
+	movl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	4(%rsp),%eax
+	andl	%ebp,%esi
+	vpslld	$2,%xmm0,%xmm0
+	andl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%ebp,%edx
+	addl	%ebx,%eax
+	vpor	%xmm8,%xmm0,%xmm0
+	movl	%ecx,%edi
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	16(%r15),%xmm15
+	xorl	%edx,%ecx
+	vmovdqa	%xmm0,%xmm9
+	addl	8(%rsp),%ebp
+	andl	%edx,%edi
+	andl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%edi,%ebp
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	xorl	%edx,%ecx
+	addl	%eax,%ebp
+	movl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	12(%rsp),%edx
+	andl	%ecx,%esi
+	andl	%ebx,%edi
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	32(%r15),%xmm14
+	addl	%edi,%edx
+	xorl	%ecx,%ebx
+	addl	%ebp,%edx
+	vpalignr	$8,%xmm7,%xmm0,%xmm9
+	vpxor	%xmm5,%xmm1,%xmm1
+	movl	%eax,%edi
+	xorl	%ebx,%eax
+	addl	16(%rsp),%ecx
+	andl	%ebx,%edi
+	vpxor	%xmm2,%xmm1,%xmm1
+	andl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	vmovdqa	%xmm10,%xmm8
+	vpaddd	%xmm0,%xmm10,%xmm10
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	vpxor	%xmm9,%xmm1,%xmm1
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	vpsrld	$30,%xmm1,%xmm9
+	vmovdqa	%xmm10,0(%rsp)
+	movl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	20(%rsp),%ebx
+	andl	%eax,%esi
+	vpslld	$2,%xmm1,%xmm1
+	andl	%ebp,%edi
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	48(%r15),%xmm15
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%eax,%ebp
+	addl	%ecx,%ebx
+	vpor	%xmm9,%xmm1,%xmm1
+	movl	%edx,%edi
+	xorl	%ebp,%edx
+	vmovdqa	%xmm1,%xmm10
+	addl	24(%rsp),%eax
+	andl	%ebp,%edi
+	andl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%ebp,%edx
+	addl	%ebx,%eax
+	movl	%ecx,%esi
+	cmpl	$11,%r8d
+	jb	.Lvaesenclast3
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	64(%r15),%xmm14
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	80(%r15),%xmm15
+	je	.Lvaesenclast3
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	96(%r15),%xmm14
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	112(%r15),%xmm15
+.Lvaesenclast3:
+	vaesenclast	%xmm15,%xmm11,%xmm11
+	vmovups	16-112(%r15),%xmm14
+	xorl	%edx,%ecx
+	addl	28(%rsp),%ebp
+	andl	%edx,%esi
+	andl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
+	addl	%esi,%ebp
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%edx,%ecx
+	addl	%eax,%ebp
+	vpalignr	$8,%xmm0,%xmm1,%xmm10
+	vpxor	%xmm6,%xmm2,%xmm2
+	movl	%ebx,%edi
+	xorl	%ecx,%ebx
+	addl	32(%rsp),%edx
+	andl	%ecx,%edi
+	vpxor	%xmm3,%xmm2,%xmm2
+	andl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	vmovdqa	%xmm8,%xmm9
+	vpaddd	%xmm1,%xmm8,%xmm8
+	addl	%edi,%edx
+	movl	%ebp,%edi
+	vpxor	%xmm10,%xmm2,%xmm2
+	shldl	$5,%ebp,%ebp
+	vmovups	48(%r12),%xmm12
+	vxorps	%xmm13,%xmm12,%xmm12
+	vmovups	%xmm11,32(%r13,%r12,1)
+	vxorps	%xmm12,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	-80(%r15),%xmm15
+	addl	%esi,%edx
+	xorl	%ecx,%ebx
+	addl	%ebp,%edx
+	vpsrld	$30,%xmm2,%xmm10
+	vmovdqa	%xmm8,16(%rsp)
+	movl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	36(%rsp),%ecx
+	andl	%ebx,%esi
+	vpslld	$2,%xmm2,%xmm2
+	andl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	vpor	%xmm10,%xmm2,%xmm2
+	movl	%ebp,%edi
+	xorl	%eax,%ebp
+	vmovdqa	%xmm2,%xmm8
+	addl	40(%rsp),%ebx
+	andl	%eax,%edi
+	andl	%ebp,%esi
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	-64(%r15),%xmm14
+	shrdl	$7,%edx,%edx
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%eax,%ebp
+	addl	%ecx,%ebx
+	movl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	44(%rsp),%eax
+	andl	%ebp,%esi
+	andl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%ebp,%edx
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm1,%xmm2,%xmm8
+	vpxor	%xmm7,%xmm3,%xmm3
+	addl	48(%rsp),%ebp
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	-48(%r15),%xmm15
+	xorl	%edx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	vpxor	%xmm4,%xmm3,%xmm3
+	xorl	%ecx,%esi
+	addl	%eax,%ebp
+	vmovdqa	%xmm9,%xmm10
+	vpaddd	%xmm2,%xmm9,%xmm9
+	shrdl	$7,%ebx,%ebx
+	addl	%esi,%ebp
+	vpxor	%xmm8,%xmm3,%xmm3
+	addl	52(%rsp),%edx
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	vpsrld	$30,%xmm3,%xmm8
+	vmovdqa	%xmm9,32(%rsp)
+	xorl	%ebx,%edi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpslld	$2,%xmm3,%xmm3
+	addl	56(%rsp),%ecx
+	xorl	%ebx,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	xorl	%eax,%esi
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	-32(%r15),%xmm14
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	vpor	%xmm8,%xmm3,%xmm3
+	addl	60(%rsp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	xorl	%ebp,%edi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%edi,%ebx
+	addl	0(%rsp),%eax
+	vpaddd	%xmm3,%xmm10,%xmm10
+	xorl	%ebp,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%esi
+	movdqa	%xmm10,48(%rsp)
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	addl	4(%rsp),%ebp
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	-16(%r15),%xmm15
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	xorl	%ecx,%edi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%edi,%ebp
+	addl	8(%rsp),%edx
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	xorl	%ebx,%esi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	addl	12(%rsp),%ecx
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	xorl	%eax,%edi
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	0(%r15),%xmm14
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%edi,%ecx
+	cmpq	%r14,%r10
+	je	.Ldone_avx
+	vmovdqa	64(%r11),%xmm6
+	vmovdqa	0(%r11),%xmm9
+	vmovdqu	0(%r10),%xmm0
+	vmovdqu	16(%r10),%xmm1
+	vmovdqu	32(%r10),%xmm2
+	vmovdqu	48(%r10),%xmm3
+	vpshufb	%xmm6,%xmm0,%xmm0
+	addq	$64,%r10
+	addl	16(%rsp),%ebx
+	xorl	%eax,%esi
+	vpshufb	%xmm6,%xmm1,%xmm1
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	vpaddd	%xmm9,%xmm0,%xmm4
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	vmovdqa	%xmm4,0(%rsp)
+	addl	20(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	addl	24(%rsp),%ebp
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	16(%r15),%xmm15
+	xorl	%edx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	xorl	%ecx,%esi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%esi,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	xorl	%ebx,%edi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	32(%rsp),%ecx
+	xorl	%ebx,%esi
+	vpshufb	%xmm6,%xmm2,%xmm2
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	vpaddd	%xmm9,%xmm1,%xmm5
+	xorl	%eax,%esi
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	32(%r15),%xmm14
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	vmovdqa	%xmm5,16(%rsp)
+	addl	36(%rsp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	xorl	%ebp,%edi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%edi,%ebx
+	addl	40(%rsp),%eax
+	xorl	%ebp,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	addl	44(%rsp),%ebp
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	48(%r15),%xmm15
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	xorl	%ecx,%edi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%edi,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ecx,%esi
+	vpshufb	%xmm6,%xmm3,%xmm3
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	vpaddd	%xmm9,%xmm2,%xmm6
+	xorl	%ebx,%esi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	vmovdqa	%xmm6,32(%rsp)
+	addl	52(%rsp),%ecx
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	xorl	%eax,%edi
+	cmpl	$11,%r8d
+	jb	.Lvaesenclast4
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	64(%r15),%xmm14
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	80(%r15),%xmm15
+	je	.Lvaesenclast4
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	96(%r15),%xmm14
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	112(%r15),%xmm15
+.Lvaesenclast4:
+	vaesenclast	%xmm15,%xmm11,%xmm11
+	vmovups	16-112(%r15),%xmm14
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%edi,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	addl	60(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	vmovups	%xmm11,48(%r13,%r12,1)
+	leaq	64(%r12),%r12
+
+	addl	0(%r9),%eax
+	addl	4(%r9),%esi
+	addl	8(%r9),%ecx
+	addl	12(%r9),%edx
+	movl	%eax,0(%r9)
+	addl	16(%r9),%ebp
+	movl	%esi,4(%r9)
+	movl	%esi,%ebx
+	movl	%ecx,8(%r9)
+	movl	%edx,12(%r9)
+	movl	%ebp,16(%r9)
+	jmp	.Loop_avx
+
+.align	16
+.Ldone_avx:
+	addl	16(%rsp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	addl	20(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	addl	24(%rsp),%ebp
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	16(%r15),%xmm15
+	xorl	%edx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	xorl	%ecx,%esi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%esi,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	xorl	%ebx,%edi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	32(%rsp),%ecx
+	xorl	%ebx,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	xorl	%eax,%esi
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	32(%r15),%xmm14
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	addl	36(%rsp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	xorl	%ebp,%edi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%edi,%ebx
+	addl	40(%rsp),%eax
+	xorl	%ebp,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	addl	44(%rsp),%ebp
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	48(%r15),%xmm15
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	xorl	%ecx,%edi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%edi,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	xorl	%ebx,%esi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	addl	52(%rsp),%ecx
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	xorl	%eax,%edi
+	cmpl	$11,%r8d
+	jb	.Lvaesenclast5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	64(%r15),%xmm14
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	80(%r15),%xmm15
+	je	.Lvaesenclast5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vmovups	96(%r15),%xmm14
+	vaesenc	%xmm14,%xmm11,%xmm11
+	vmovups	112(%r15),%xmm15
+.Lvaesenclast5:
+	vaesenclast	%xmm15,%xmm11,%xmm11
+	vmovups	16-112(%r15),%xmm14
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%edi,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	addl	60(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	vmovups	%xmm11,48(%r13,%r12,1)
+	movq	88(%rsp),%r8
+
+	addl	0(%r9),%eax
+	addl	4(%r9),%esi
+	addl	8(%r9),%ecx
+	movl	%eax,0(%r9)
+	addl	12(%r9),%edx
+	movl	%esi,4(%r9)
+	addl	16(%r9),%ebp
+	movl	%ecx,8(%r9)
+	movl	%edx,12(%r9)
+	movl	%ebp,16(%r9)
+	vmovups	%xmm11,(%r8)
+	vzeroall
+	leaq	104(%rsp),%rsi
+	movq	0(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lepilogue_avx:
+	.byte	0xf3,0xc3
+.size	aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
 .align	64
 K_XX_XX:
 .long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	

Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S:1.5 src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S:1.6
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S:1.5	Sat Aug  4 11:03:35 2012
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S	Sat May 16 19:08:37 2015
@@ -10,6 +10,11 @@ sha1_block_data_order:
 	movl	OPENSSL_ia32cap_P+4@GOTPCREL(%rip),%r8d
 	testl	$512,%r8d
 	jz	.Lialu
+	andl	$268435456,%r8d
+	andl	$1073741824,%r9d
+	orl	%r9d,%r8d
+	cmpl	$1342177280,%r8d
+	je	_avx_shortcut
 	jmp	_ssse3_shortcut
 
 .align	16
@@ -2476,6 +2481,1157 @@ _ssse3_shortcut:
 .Lepilogue_ssse3:
 	.byte	0xf3,0xc3
 .size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+.type	sha1_block_data_order_avx,@function	/* AVX-encoded block routine; SHA-1 per the symbol name. Args arrive in rdi, rsi, rdx (copied below) */
+.align	16
+sha1_block_data_order_avx:
+_avx_shortcut:	/* second entry label; target of the "je _avx_shortcut" dispatch */
+	pushq	%rbx	/* save rbx, rbp, r12; reloaded from the stack in the epilogue */
+	pushq	%rbp
+	pushq	%r12
+	leaq	-64(%rsp),%rsp	/* reserve 64 bytes of scratch at 0..63(%rsp) */
+	movq	%rdi,%r8	/* r8 = arg 1: five 32-bit state words, read here and written back per block */
+	movq	%rsi,%r9	/* r9 = arg 2: input cursor, advanced by 64 per block */
+	movq	%rdx,%r10	/* r10 = arg 3: number of 64-byte blocks */
+	vzeroupper	/* zero the upper YMM halves before the 128-bit VEX code */
+
+	shlq	$6,%r10	/* r10 = count * 64 */
+	addq	%r9,%r10	/* r10 = end-of-input pointer, compared against r9 later */
+	leaq	K_XX_XX(%rip),%r11	/* r11 = base of the constant table */
+
+	movl	0(%r8),%eax	/* load the five state words into eax, ebx, ecx, edx, ebp */
+	movl	4(%r8),%ebx
+	movl	8(%r8),%ecx
+	movl	12(%r8),%edx
+	movl	%ebx,%esi	/* esi starts as a copy of state word 1 */
+	movl	16(%r8),%ebp
+
+	vmovdqa	64(%r11),%xmm6	/* xmm6 = vpshufb control mask read from table offset 64 */
+	vmovdqa	0(%r11),%xmm9	/* xmm9 = first constant row of the table */
+	vmovdqu	0(%r9),%xmm0	/* unaligned load of the first 64-byte block into xmm0..xmm3 */
+	vmovdqu	16(%r9),%xmm1
+	vmovdqu	32(%r9),%xmm2
+	vmovdqu	48(%r9),%xmm3
+	vpshufb	%xmm6,%xmm0,%xmm0	/* reorder bytes through the xmm6 mask; presumably an endian swap - confirm against the table */
+	addq	$64,%r9	/* input cursor now points at the next block */
+	vpshufb	%xmm6,%xmm1,%xmm1
+	vpshufb	%xmm6,%xmm2,%xmm2
+	vpshufb	%xmm6,%xmm3,%xmm3
+	vpaddd	%xmm9,%xmm0,%xmm4	/* add the constant row to the first three input vectors */
+	vpaddd	%xmm9,%xmm1,%xmm5
+	vpaddd	%xmm9,%xmm2,%xmm6
+	vmovdqa	%xmm4,0(%rsp)	/* park the sums in the scratch area; the scalar code reads them with addl N(%rsp) */
+	vmovdqa	%xmm5,16(%rsp)
+	vmovdqa	%xmm6,32(%rsp)
+	jmp	.Loop_avx	/* jump over the alignment padding */
+.align	16
+.Loop_avx:	/* one pass per 64-byte block; vector and scalar instructions are interleaved */
+	addl	0(%rsp),%ebp
+	xorl	%edx,%ecx
+	vpalignr	$8,%xmm0,%xmm1,%xmm4
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax	/* shld with identical source and destination is a rotate left by 5 */
+	vpaddd	%xmm3,%xmm9,%xmm9
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	vpsrldq	$4,%xmm3,%xmm8
+	xorl	%edx,%esi
+	addl	%eax,%ebp
+	vpxor	%xmm0,%xmm4,%xmm4
+	shrdl	$2,%ebx,%ebx	/* shrd with identical source and destination is a rotate right */
+	addl	%esi,%ebp
+	vpxor	%xmm2,%xmm8,%xmm8
+	addl	4(%rsp),%edx
+	xorl	%ecx,%ebx
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	vpxor	%xmm8,%xmm4,%xmm4
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	vmovdqa	%xmm9,48(%rsp)
+	xorl	%ecx,%edi
+	addl	%ebp,%edx
+	vpsrld	$31,%xmm4,%xmm8
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	8(%rsp),%ecx
+	xorl	%ebx,%eax
+	vpslldq	$12,%xmm4,%xmm10
+	vpaddd	%xmm4,%xmm4,%xmm4
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm4,%xmm4
+	xorl	%ebx,%esi
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm4,%xmm4
+	addl	12(%rsp),%ebx
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	vpxor	%xmm10,%xmm4,%xmm4
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	vmovdqa	0(%r11),%xmm10
+	xorl	%eax,%edi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%edi,%ebx
+	addl	16(%rsp),%eax
+	xorl	%ebp,%edx
+	vpalignr	$8,%xmm1,%xmm2,%xmm5
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	vpaddd	%xmm4,%xmm10,%xmm10
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	vpsrldq	$4,%xmm4,%xmm9
+	xorl	%ebp,%esi
+	addl	%ebx,%eax
+	vpxor	%xmm1,%xmm5,%xmm5
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	vpxor	%xmm3,%xmm9,%xmm9
+	addl	20(%rsp),%ebp
+	xorl	%edx,%ecx
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	vpxor	%xmm9,%xmm5,%xmm5
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	vmovdqa	%xmm10,0(%rsp)
+	xorl	%edx,%edi
+	addl	%eax,%ebp
+	vpsrld	$31,%xmm5,%xmm9
+	shrdl	$7,%ebx,%ebx
+	addl	%edi,%ebp
+	addl	24(%rsp),%edx
+	xorl	%ecx,%ebx
+	vpslldq	$12,%xmm5,%xmm8
+	vpaddd	%xmm5,%xmm5,%xmm5
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	vpsrld	$30,%xmm8,%xmm10
+	vpor	%xmm9,%xmm5,%xmm5
+	xorl	%ecx,%esi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	vpslld	$2,%xmm8,%xmm8
+	vpxor	%xmm10,%xmm5,%xmm5
+	addl	28(%rsp),%ecx
+	xorl	%ebx,%eax
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	vpxor	%xmm8,%xmm5,%xmm5
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	vmovdqa	16(%r11),%xmm8
+	xorl	%ebx,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%edi,%ecx
+	addl	32(%rsp),%ebx
+	xorl	%eax,%ebp
+	vpalignr	$8,%xmm2,%xmm3,%xmm6
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	vpaddd	%xmm5,%xmm8,%xmm8
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	vpsrldq	$4,%xmm5,%xmm10
+	xorl	%eax,%esi
+	addl	%ecx,%ebx
+	vpxor	%xmm2,%xmm6,%xmm6
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	vpxor	%xmm4,%xmm10,%xmm10
+	addl	36(%rsp),%eax
+	xorl	%ebp,%edx
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	vpxor	%xmm10,%xmm6,%xmm6
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	vmovdqa	%xmm8,16(%rsp)
+	xorl	%ebp,%edi
+	addl	%ebx,%eax
+	vpsrld	$31,%xmm6,%xmm10
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	addl	40(%rsp),%ebp
+	xorl	%edx,%ecx
+	vpslldq	$12,%xmm6,%xmm9
+	vpaddd	%xmm6,%xmm6,%xmm6
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	vpsrld	$30,%xmm9,%xmm8
+	vpor	%xmm10,%xmm6,%xmm6
+	xorl	%edx,%esi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%esi,%ebp
+	vpslld	$2,%xmm9,%xmm9
+	vpxor	%xmm8,%xmm6,%xmm6
+	addl	44(%rsp),%edx
+	xorl	%ecx,%ebx
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	vpxor	%xmm9,%xmm6,%xmm6
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	vmovdqa	16(%r11),%xmm9
+	xorl	%ecx,%edi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	48(%rsp),%ecx
+	xorl	%ebx,%eax
+	vpalignr	$8,%xmm3,%xmm4,%xmm7
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	vpaddd	%xmm6,%xmm9,%xmm9
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	vpsrldq	$4,%xmm6,%xmm8
+	xorl	%ebx,%esi
+	addl	%edx,%ecx
+	vpxor	%xmm3,%xmm7,%xmm7
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	vpxor	%xmm5,%xmm8,%xmm8
+	addl	52(%rsp),%ebx
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	vpxor	%xmm8,%xmm7,%xmm7
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	vmovdqa	%xmm9,32(%rsp)
+	xorl	%eax,%edi
+	addl	%ecx,%ebx
+	vpsrld	$31,%xmm7,%xmm8
+	shrdl	$7,%edx,%edx
+	addl	%edi,%ebx
+	addl	56(%rsp),%eax
+	xorl	%ebp,%edx
+	vpslldq	$12,%xmm7,%xmm10
+	vpaddd	%xmm7,%xmm7,%xmm7
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm7,%xmm7
+	xorl	%ebp,%esi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm7,%xmm7
+	addl	60(%rsp),%ebp
+	xorl	%edx,%ecx
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	vpxor	%xmm10,%xmm7,%xmm7
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	vmovdqa	16(%r11),%xmm10
+	xorl	%edx,%edi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%edi,%ebp
+	vpalignr	$8,%xmm6,%xmm7,%xmm9
+	vpxor	%xmm4,%xmm0,%xmm0
+	addl	0(%rsp),%edx
+	xorl	%ecx,%ebx
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	vpxor	%xmm1,%xmm0,%xmm0
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	vmovdqa	%xmm10,%xmm8
+	vpaddd	%xmm7,%xmm10,%xmm10
+	xorl	%ecx,%esi
+	addl	%ebp,%edx
+	vpxor	%xmm9,%xmm0,%xmm0
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	addl	4(%rsp),%ecx
+	xorl	%ebx,%eax
+	vpsrld	$30,%xmm0,%xmm9
+	vmovdqa	%xmm10,48(%rsp)
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	vpslld	$2,%xmm0,%xmm0
+	xorl	%ebx,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%edi,%ecx
+	addl	8(%rsp),%ebx
+	xorl	%eax,%ebp
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	vpor	%xmm9,%xmm0,%xmm0
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	vmovdqa	%xmm0,%xmm10
+	xorl	%eax,%esi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	addl	12(%rsp),%eax
+	xorl	%ebp,%edx
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	xorl	%ebp,%edi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	vpalignr	$8,%xmm7,%xmm0,%xmm10
+	vpxor	%xmm5,%xmm1,%xmm1
+	addl	16(%rsp),%ebp
+	xorl	%edx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	vpxor	%xmm2,%xmm1,%xmm1
+	xorl	%ecx,%esi
+	addl	%eax,%ebp
+	vmovdqa	%xmm8,%xmm9
+	vpaddd	%xmm0,%xmm8,%xmm8
+	shrdl	$7,%ebx,%ebx
+	addl	%esi,%ebp
+	vpxor	%xmm10,%xmm1,%xmm1
+	addl	20(%rsp),%edx
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	vpsrld	$30,%xmm1,%xmm10
+	vmovdqa	%xmm8,0(%rsp)
+	xorl	%ebx,%edi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpslld	$2,%xmm1,%xmm1
+	addl	24(%rsp),%ecx
+	xorl	%ebx,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	vpor	%xmm10,%xmm1,%xmm1
+	addl	28(%rsp),%ebx
+	xorl	%eax,%edi
+	vmovdqa	%xmm1,%xmm8
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	xorl	%ebp,%edi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%edi,%ebx
+	vpalignr	$8,%xmm0,%xmm1,%xmm8
+	vpxor	%xmm6,%xmm2,%xmm2
+	addl	32(%rsp),%eax
+	xorl	%ebp,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	vpxor	%xmm3,%xmm2,%xmm2
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	vmovdqa	32(%r11),%xmm10
+	vpaddd	%xmm1,%xmm9,%xmm9
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	vpxor	%xmm8,%xmm2,%xmm2
+	addl	36(%rsp),%ebp
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	vpsrld	$30,%xmm2,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
+	xorl	%ecx,%edi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%edi,%ebp
+	vpslld	$2,%xmm2,%xmm2
+	addl	40(%rsp),%edx
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	xorl	%ebx,%esi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	vpor	%xmm8,%xmm2,%xmm2
+	addl	44(%rsp),%ecx
+	xorl	%ebx,%edi
+	vmovdqa	%xmm2,%xmm9
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%edi,%ecx
+	vpalignr	$8,%xmm1,%xmm2,%xmm9
+	vpxor	%xmm7,%xmm3,%xmm3
+	addl	48(%rsp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	vpxor	%xmm4,%xmm3,%xmm3
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	vmovdqa	%xmm10,%xmm8
+	vpaddd	%xmm2,%xmm10,%xmm10
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	vpxor	%xmm9,%xmm3,%xmm3
+	addl	52(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	vpsrld	$30,%xmm3,%xmm9
+	vmovdqa	%xmm10,32(%rsp)
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	vpslld	$2,%xmm3,%xmm3
+	addl	56(%rsp),%ebp
+	xorl	%edx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	xorl	%ecx,%esi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%esi,%ebp
+	vpor	%xmm9,%xmm3,%xmm3
+	addl	60(%rsp),%edx
+	xorl	%ecx,%edi
+	vmovdqa	%xmm3,%xmm10
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	xorl	%ebx,%edi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpalignr	$8,%xmm2,%xmm3,%xmm10
+	vpxor	%xmm0,%xmm4,%xmm4
+	addl	0(%rsp),%ecx
+	xorl	%ebx,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	vmovdqa	%xmm8,%xmm9
+	vpaddd	%xmm3,%xmm8,%xmm8
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	vpxor	%xmm10,%xmm4,%xmm4
+	addl	4(%rsp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	vpsrld	$30,%xmm4,%xmm10
+	vmovdqa	%xmm8,48(%rsp)
+	xorl	%ebp,%edi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%edi,%ebx
+	vpslld	$2,%xmm4,%xmm4
+	addl	8(%rsp),%eax
+	xorl	%ebp,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	vpor	%xmm10,%xmm4,%xmm4
+	addl	12(%rsp),%ebp
+	xorl	%edx,%edi
+	vmovdqa	%xmm4,%xmm8
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	xorl	%ecx,%edi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%edi,%ebp
+	vpalignr	$8,%xmm3,%xmm4,%xmm8
+	vpxor	%xmm1,%xmm5,%xmm5
+	addl	16(%rsp),%edx
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	vpxor	%xmm6,%xmm5,%xmm5
+	xorl	%ebx,%esi
+	addl	%ebp,%edx
+	vmovdqa	%xmm9,%xmm10
+	vpaddd	%xmm4,%xmm9,%xmm9
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	vpxor	%xmm8,%xmm5,%xmm5
+	addl	20(%rsp),%ecx
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	vpsrld	$30,%xmm5,%xmm8
+	vmovdqa	%xmm9,0(%rsp)
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%edi,%ecx
+	vpslld	$2,%xmm5,%xmm5
+	addl	24(%rsp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	vpor	%xmm8,%xmm5,%xmm5
+	addl	28(%rsp),%eax
+	xorl	%ebp,%edi
+	vmovdqa	%xmm5,%xmm9
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	vpalignr	$8,%xmm4,%xmm5,%xmm9
+	vpxor	%xmm2,%xmm6,%xmm6
+	movl	%ecx,%edi
+	xorl	%edx,%ecx
+	addl	32(%rsp),%ebp
+	andl	%edx,%edi
+	vpxor	%xmm7,%xmm6,%xmm6
+	andl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	vmovdqa	%xmm10,%xmm8
+	vpaddd	%xmm5,%xmm10,%xmm10
+	addl	%edi,%ebp
+	movl	%eax,%edi
+	vpxor	%xmm9,%xmm6,%xmm6
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	xorl	%edx,%ecx
+	addl	%eax,%ebp
+	vpsrld	$30,%xmm6,%xmm9
+	vmovdqa	%xmm10,16(%rsp)
+	movl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	36(%rsp),%edx
+	andl	%ecx,%esi
+	vpslld	$2,%xmm6,%xmm6
+	andl	%ebx,%edi
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%ecx,%ebx
+	addl	%ebp,%edx
+	vpor	%xmm9,%xmm6,%xmm6
+	movl	%eax,%edi
+	xorl	%ebx,%eax
+	vmovdqa	%xmm6,%xmm10
+	addl	40(%rsp),%ecx
+	andl	%ebx,%edi
+	andl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	movl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	44(%rsp),%ebx
+	andl	%eax,%esi
+	andl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%eax,%ebp
+	addl	%ecx,%ebx
+	vpalignr	$8,%xmm5,%xmm6,%xmm10
+	vpxor	%xmm3,%xmm7,%xmm7
+	movl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	48(%rsp),%eax
+	andl	%ebp,%edi
+	vpxor	%xmm0,%xmm7,%xmm7
+	andl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	vmovdqa	48(%r11),%xmm9
+	vpaddd	%xmm6,%xmm8,%xmm8
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	vpxor	%xmm10,%xmm7,%xmm7
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%ebp,%edx
+	addl	%ebx,%eax
+	vpsrld	$30,%xmm7,%xmm10
+	vmovdqa	%xmm8,32(%rsp)
+	movl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	52(%rsp),%ebp
+	andl	%edx,%esi
+	vpslld	$2,%xmm7,%xmm7
+	andl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
+	addl	%esi,%ebp
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%edx,%ecx
+	addl	%eax,%ebp
+	vpor	%xmm10,%xmm7,%xmm7
+	movl	%ebx,%edi
+	xorl	%ecx,%ebx
+	vmovdqa	%xmm7,%xmm8
+	addl	56(%rsp),%edx
+	andl	%ecx,%edi
+	andl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	xorl	%ecx,%ebx
+	addl	%ebp,%edx
+	movl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	60(%rsp),%ecx
+	andl	%ebx,%esi
+	andl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	vpalignr	$8,%xmm6,%xmm7,%xmm8
+	vpxor	%xmm4,%xmm0,%xmm0
+	movl	%ebp,%edi
+	xorl	%eax,%ebp
+	addl	0(%rsp),%ebx
+	andl	%eax,%edi
+	vpxor	%xmm1,%xmm0,%xmm0
+	andl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	vmovdqa	%xmm9,%xmm10
+	vpaddd	%xmm7,%xmm9,%xmm9
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	vpxor	%xmm8,%xmm0,%xmm0
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%eax,%ebp
+	addl	%ecx,%ebx
+	vpsrld	$30,%xmm0,%xmm8
+	vmovdqa	%xmm9,48(%rsp)
+	movl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	4(%rsp),%eax
+	andl	%ebp,%esi
+	vpslld	$2,%xmm0,%xmm0
+	andl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%ebp,%edx
+	addl	%ebx,%eax
+	vpor	%xmm8,%xmm0,%xmm0
+	movl	%ecx,%edi
+	xorl	%edx,%ecx
+	vmovdqa	%xmm0,%xmm9
+	addl	8(%rsp),%ebp
+	andl	%edx,%edi
+	andl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%edi,%ebp
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	xorl	%edx,%ecx
+	addl	%eax,%ebp
+	movl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	12(%rsp),%edx
+	andl	%ecx,%esi
+	andl	%ebx,%edi
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%ecx,%ebx
+	addl	%ebp,%edx
+	vpalignr	$8,%xmm7,%xmm0,%xmm9
+	vpxor	%xmm5,%xmm1,%xmm1
+	movl	%eax,%edi
+	xorl	%ebx,%eax
+	addl	16(%rsp),%ecx
+	andl	%ebx,%edi
+	vpxor	%xmm2,%xmm1,%xmm1
+	andl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	vmovdqa	%xmm10,%xmm8
+	vpaddd	%xmm0,%xmm10,%xmm10
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	vpxor	%xmm9,%xmm1,%xmm1
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	vpsrld	$30,%xmm1,%xmm9
+	vmovdqa	%xmm10,0(%rsp)
+	movl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	20(%rsp),%ebx
+	andl	%eax,%esi
+	vpslld	$2,%xmm1,%xmm1
+	andl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%eax,%ebp
+	addl	%ecx,%ebx
+	vpor	%xmm9,%xmm1,%xmm1
+	movl	%edx,%edi
+	xorl	%ebp,%edx
+	vmovdqa	%xmm1,%xmm10
+	addl	24(%rsp),%eax
+	andl	%ebp,%edi
+	andl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%ebp,%edx
+	addl	%ebx,%eax
+	movl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	28(%rsp),%ebp
+	andl	%edx,%esi
+	andl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
+	addl	%esi,%ebp
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%edx,%ecx
+	addl	%eax,%ebp
+	vpalignr	$8,%xmm0,%xmm1,%xmm10
+	vpxor	%xmm6,%xmm2,%xmm2
+	movl	%ebx,%edi
+	xorl	%ecx,%ebx
+	addl	32(%rsp),%edx
+	andl	%ecx,%edi
+	vpxor	%xmm3,%xmm2,%xmm2
+	andl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	vmovdqa	%xmm8,%xmm9
+	vpaddd	%xmm1,%xmm8,%xmm8
+	addl	%edi,%edx
+	movl	%ebp,%edi
+	vpxor	%xmm10,%xmm2,%xmm2
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	xorl	%ecx,%ebx
+	addl	%ebp,%edx
+	vpsrld	$30,%xmm2,%xmm10
+	vmovdqa	%xmm8,16(%rsp)
+	movl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	36(%rsp),%ecx
+	andl	%ebx,%esi
+	vpslld	$2,%xmm2,%xmm2
+	andl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	vpor	%xmm10,%xmm2,%xmm2
+	movl	%ebp,%edi
+	xorl	%eax,%ebp
+	vmovdqa	%xmm2,%xmm8
+	addl	40(%rsp),%ebx
+	andl	%eax,%edi
+	andl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%eax,%ebp
+	addl	%ecx,%ebx
+	movl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	44(%rsp),%eax
+	andl	%ebp,%esi
+	andl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%ebp,%edx
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm1,%xmm2,%xmm8
+	vpxor	%xmm7,%xmm3,%xmm3
+	addl	48(%rsp),%ebp
+	xorl	%edx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	vpxor	%xmm4,%xmm3,%xmm3
+	xorl	%ecx,%esi
+	addl	%eax,%ebp
+	vmovdqa	%xmm9,%xmm10
+	vpaddd	%xmm2,%xmm9,%xmm9
+	shrdl	$7,%ebx,%ebx
+	addl	%esi,%ebp
+	vpxor	%xmm8,%xmm3,%xmm3
+	addl	52(%rsp),%edx
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	vpsrld	$30,%xmm3,%xmm8
+	vmovdqa	%xmm9,32(%rsp)
+	xorl	%ebx,%edi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpslld	$2,%xmm3,%xmm3
+	addl	56(%rsp),%ecx
+	xorl	%ebx,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	vpor	%xmm8,%xmm3,%xmm3
+	addl	60(%rsp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	xorl	%ebp,%edi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%edi,%ebx
+	addl	0(%rsp),%eax
+	vpaddd	%xmm3,%xmm10,%xmm10
+	xorl	%ebp,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%esi
+	vmovdqa	%xmm10,48(%rsp)	/* VEX-encoded store, like every other write to the scratch area */
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	addl	4(%rsp),%ebp
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	xorl	%ecx,%edi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%edi,%ebp
+	addl	8(%rsp),%edx
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	xorl	%ebx,%esi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	addl	12(%rsp),%ecx
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%edi,%ecx
+	cmpq	%r10,%r9	/* has the input cursor reached the end pointer? */
+	je	.Ldone_avx	/* yes: finish this block without fetching more input */
+	vmovdqa	64(%r11),%xmm6	/* no: reload the shuffle mask and first constant row ... */
+	vmovdqa	0(%r11),%xmm9
+	vmovdqu	0(%r9),%xmm0	/* ... and fetch the next 64-byte block while the scalar code continues */
+	vmovdqu	16(%r9),%xmm1
+	vmovdqu	32(%r9),%xmm2
+	vmovdqu	48(%r9),%xmm3
+	vpshufb	%xmm6,%xmm0,%xmm0
+	addq	$64,%r9	/* advance the input cursor */
+	addl	16(%rsp),%ebx
+	xorl	%eax,%esi
+	vpshufb	%xmm6,%xmm1,%xmm1
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	vpaddd	%xmm9,%xmm0,%xmm4
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	vmovdqa	%xmm4,0(%rsp)	/* scratch slot 0 is refilled for the next iteration */
+	addl	20(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	addl	24(%rsp),%ebp
+	xorl	%edx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	xorl	%ecx,%esi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%esi,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	xorl	%ebx,%edi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	32(%rsp),%ecx
+	xorl	%ebx,%esi
+	vpshufb	%xmm6,%xmm2,%xmm2
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	vpaddd	%xmm9,%xmm1,%xmm5
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	vmovdqa	%xmm5,16(%rsp)
+	addl	36(%rsp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	xorl	%ebp,%edi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%edi,%ebx
+	addl	40(%rsp),%eax
+	xorl	%ebp,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	addl	44(%rsp),%ebp
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	xorl	%ecx,%edi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%edi,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ecx,%esi
+	vpshufb	%xmm6,%xmm3,%xmm3
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	vpaddd	%xmm9,%xmm2,%xmm6
+	xorl	%ebx,%esi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	vmovdqa	%xmm6,32(%rsp)
+	addl	52(%rsp),%ecx
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%edi,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	addl	60(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	addl	0(%r8),%eax	/* add the previous state at (%r8) into the working registers ... */
+	addl	4(%r8),%esi
+	addl	8(%r8),%ecx
+	addl	12(%r8),%edx
+	movl	%eax,0(%r8)	/* ... and store the updated state back */
+	addl	16(%r8),%ebp
+	movl	%esi,4(%r8)
+	movl	%esi,%ebx	/* ebx and esi both hold state word 1 on loop entry, as after the prologue */
+	movl	%ecx,8(%r8)
+	movl	%edx,12(%r8)
+	movl	%ebp,16(%r8)
+	jmp	.Loop_avx
+
+.align	16
+.Ldone_avx:	/* last block: scalar instructions consuming 16..60(%rsp), with no further vector loads */
+	addl	16(%rsp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	addl	20(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	addl	24(%rsp),%ebp
+	xorl	%edx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	xorl	%ecx,%esi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%esi,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	xorl	%ebx,%edi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	32(%rsp),%ecx
+	xorl	%ebx,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%esi,%ecx
+	addl	36(%rsp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	xorl	%ebp,%edi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%edi,%ebx
+	addl	40(%rsp),%eax
+	xorl	%ebp,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%esi,%eax
+	addl	44(%rsp),%ebp
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	xorl	%ecx,%edi
+	addl	%eax,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%edi,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	xorl	%ebx,%esi
+	addl	%ebp,%edx
+	shrdl	$7,%eax,%eax
+	addl	%esi,%edx
+	addl	52(%rsp),%ecx
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%ebp,%ebp
+	addl	%edi,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	shrdl	$7,%edx,%edx
+	addl	%esi,%ebx
+	addl	60(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%edi,%eax
+	vzeroupper	/* zero the upper YMM halves again before returning to the caller */
+
+	addl	0(%r8),%eax	/* add the previous state into the working registers and store the result */
+	addl	4(%r8),%esi
+	addl	8(%r8),%ecx
+	movl	%eax,0(%r8)
+	addl	12(%r8),%edx
+	movl	%esi,4(%r8)
+	addl	16(%r8),%ebp
+	movl	%ecx,8(%r8)
+	movl	%edx,12(%r8)
+	movl	%ebp,16(%r8)
+	leaq	64(%rsp),%rsi	/* rsi = address of the saved registers, just above the 64-byte scratch */
+	movq	0(%rsi),%r12	/* reload r12, rbp, rbx in reverse push order */
+	movq	8(%rsi),%rbp
+	movq	16(%rsi),%rbx
+	leaq	24(%rsi),%rsp	/* release scratch and save area; rsp is back at its entry value */
+.Lepilogue_avx:
+	.byte	0xf3,0xc3	/* bytes F3 C3: the two-byte "rep ret" return */
+.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
 .align	64
 K_XX_XX:
 .long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	

Reply via email to