Module Name: src
Committed By: joerg
Date: Sat May 16 19:08:37 UTC 2015
Modified Files:
src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64: Makefile
aesni-sha1-x86_64.S sha1-x86_64.S
Log Message:
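Explicitly pass CC down to the Perl generator scripts. When building with
clang, force the external assembler, because some of the Perl scripts run
"$CC -Wa,-v" to probe the assembler version before deciding whether to emit
AVX code. Regenerate the assembly files to pick up AVX support.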
To generate a diff of this commit:
cvs rdiff -u -r1.6 -r1.7 \
src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile
cvs rdiff -u -r1.3 -r1.4 \
src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/aesni-sha1-x86_64.S
cvs rdiff -u -r1.5 -r1.6 \
src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile:1.6 src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile:1.7
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile:1.6 Sat Aug 4 11:03:34 2012
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/Makefile Sat May 16 19:08:37 2015
@@ -1,14 +1,18 @@
-# $NetBSD: Makefile,v 1.6 2012/08/04 11:03:34 christos Exp $
+# $NetBSD: Makefile,v 1.7 2015/05/16 19:08:37 joerg Exp $
.include "bsd.own.mk"
CRYPTODIST=${NETBSDSRCDIR}/crypto
.include "${NETBSDSRCDIR}/crypto/Makefile.openssl"
+.if make(regen) && ${HAVE_LLVM:U} == "yes"
+CC+= -fno-integrated-as
+.endif
+
regen:
for i in $$(find ${OPENSSLSRC} -name \*${MACHINE_ARCH}.pl) \
${OPENSSLSRC}/crypto/${MACHINE_ARCH}cpuid.pl ; do \
- (echo "#include <machine/asm.h>"; perl $$i elf | sed \
+ (echo "#include <machine/asm.h>"; CC=${CC:Q} perl $$i elf | sed \
-e 's/\(OPENSSL[A-Za-z0-9_+]*\)(%rip)/\1@GOTPCREL(%rip)/' \
-e 's/.hidden OPENSSL_cpuid_setup/.globl OPENSSL_cpuid_setup/' \
-e 's/call OPENSSL_cpuid_setup/call PIC_PLT(OPENSSL_cpuid_setup)/') \
Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/aesni-sha1-x86_64.S
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/aesni-sha1-x86_64.S:1.3 src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/aesni-sha1-x86_64.S:1.4
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/aesni-sha1-x86_64.S:1.3 Sat Aug 4 11:03:34 2012
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/aesni-sha1-x86_64.S Sat May 16 19:08:37 2015
@@ -9,6 +9,11 @@ aesni_cbc_sha1_enc:
movl OPENSSL_ia32cap_P+0@GOTPCREL(%rip),%r10d
movl OPENSSL_ia32cap_P+4@GOTPCREL(%rip),%r11d
+ andl $268435456,%r11d
+ andl $1073741824,%r10d
+ orl %r11d,%r10d
+ cmpl $1342177280,%r10d
+ je aesni_cbc_sha1_enc_avx
jmp aesni_cbc_sha1_enc_ssse3
.byte 0xf3,0xc3
.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
@@ -1385,6 +1390,1343 @@ aesni_cbc_sha1_enc_ssse3:
.Lepilogue_ssse3:
.byte 0xf3,0xc3
.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+.type aesni_cbc_sha1_enc_avx,@function
+.align 16
+aesni_cbc_sha1_enc_avx:
+ movq 8(%rsp),%r10
+
+
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -104(%rsp),%rsp
+
+
+ vzeroall
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+ vmovdqu (%r8),%xmm11
+ movq %r8,88(%rsp)
+ shlq $6,%r14
+ subq %r12,%r13
+ movl 240(%r15),%r8d
+ addq $112,%r15
+ addq %r10,%r14
+
+ leaq K_XX_XX(%rip),%r11
+ movl 0(%r9),%eax
+ movl 4(%r9),%ebx
+ movl 8(%r9),%ecx
+ movl 12(%r9),%edx
+ movl %ebx,%esi
+ movl 16(%r9),%ebp
+
+ vmovdqa 64(%r11),%xmm6
+ vmovdqa 0(%r11),%xmm9
+ vmovdqu 0(%r10),%xmm0
+ vmovdqu 16(%r10),%xmm1
+ vmovdqu 32(%r10),%xmm2
+ vmovdqu 48(%r10),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r10
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm9,%xmm0,%xmm4
+ vpaddd %xmm9,%xmm1,%xmm5
+ vpaddd %xmm9,%xmm2,%xmm6
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ vmovups -112(%r15),%xmm13
+ vmovups 16-112(%r15),%xmm14
+ jmp .Loop_avx
+.align 16
+.Loop_avx:
+ addl 0(%rsp),%ebp
+ vmovups 0(%r12),%xmm12
+ vxorps %xmm13,%xmm12,%xmm12
+ vxorps %xmm12,%xmm11,%xmm11
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups -80(%r15),%xmm15
+ xorl %edx,%ecx
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpaddd %xmm3,%xmm9,%xmm9
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ vpsrldq $4,%xmm3,%xmm8
+ xorl %edx,%esi
+ addl %eax,%ebp
+ vpxor %xmm0,%xmm4,%xmm4
+ shrdl $2,%ebx,%ebx
+ addl %esi,%ebp
+ vpxor %xmm2,%xmm8,%xmm8
+ addl 4(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm8,%xmm4,%xmm4
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ vmovdqa %xmm9,48(%rsp)
+ xorl %ecx,%edi
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups -64(%r15),%xmm14
+ addl %ebp,%edx
+ vpsrld $31,%xmm4,%xmm8
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 8(%rsp),%ecx
+ xorl %ebx,%eax
+ vpslldq $12,%xmm4,%xmm10
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm4,%xmm4
+ xorl %ebx,%esi
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm4,%xmm4
+ addl 12(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups -48(%r15),%xmm15
+ vpxor %xmm10,%xmm4,%xmm4
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ vmovdqa 0(%r11),%xmm10
+ xorl %eax,%edi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %edi,%ebx
+ addl 16(%rsp),%eax
+ xorl %ebp,%edx
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ vpaddd %xmm4,%xmm10,%xmm10
+ andl %edx,%esi
+ xorl %ebp,%edx
+ vpsrldq $4,%xmm4,%xmm9
+ xorl %ebp,%esi
+ addl %ebx,%eax
+ vpxor %xmm1,%xmm5,%xmm5
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ vpxor %xmm3,%xmm9,%xmm9
+ addl 20(%rsp),%ebp
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups -32(%r15),%xmm14
+ xorl %edx,%ecx
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpxor %xmm9,%xmm5,%xmm5
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ vmovdqa %xmm10,0(%rsp)
+ xorl %edx,%edi
+ addl %eax,%ebp
+ vpsrld $31,%xmm5,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %edi,%ebp
+ addl 24(%rsp),%edx
+ xorl %ecx,%ebx
+ vpslldq $12,%xmm5,%xmm8
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ vpsrld $30,%xmm8,%xmm10
+ vpor %xmm9,%xmm5,%xmm5
+ xorl %ecx,%esi
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups -16(%r15),%xmm15
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ vpslld $2,%xmm8,%xmm8
+ vpxor %xmm10,%xmm5,%xmm5
+ addl 28(%rsp),%ecx
+ xorl %ebx,%eax
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpxor %xmm8,%xmm5,%xmm5
+ andl %eax,%edi
+ xorl %ebx,%eax
+ vmovdqa 16(%r11),%xmm8
+ xorl %ebx,%edi
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %edi,%ecx
+ addl 32(%rsp),%ebx
+ xorl %eax,%ebp
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 0(%r15),%xmm14
+ vpaddd %xmm5,%xmm8,%xmm8
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ vpsrldq $4,%xmm5,%xmm10
+ xorl %eax,%esi
+ addl %ecx,%ebx
+ vpxor %xmm2,%xmm6,%xmm6
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ vpxor %xmm4,%xmm10,%xmm10
+ addl 36(%rsp),%eax
+ xorl %ebp,%edx
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpxor %xmm10,%xmm6,%xmm6
+ andl %edx,%edi
+ xorl %ebp,%edx
+ vmovdqa %xmm8,16(%rsp)
+ xorl %ebp,%edi
+ addl %ebx,%eax
+ vpsrld $31,%xmm6,%xmm10
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ addl 40(%rsp),%ebp
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 16(%r15),%xmm15
+ xorl %edx,%ecx
+ vpslldq $12,%xmm6,%xmm9
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ vpsrld $30,%xmm9,%xmm8
+ vpor %xmm10,%xmm6,%xmm6
+ xorl %edx,%esi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %esi,%ebp
+ vpslld $2,%xmm9,%xmm9
+ vpxor %xmm8,%xmm6,%xmm6
+ addl 44(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm9,%xmm6,%xmm6
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ vmovdqa 16(%r11),%xmm9
+ xorl %ecx,%edi
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 32(%r15),%xmm14
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 48(%rsp),%ecx
+ xorl %ebx,%eax
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpaddd %xmm6,%xmm9,%xmm9
+ andl %eax,%esi
+ xorl %ebx,%eax
+ vpsrldq $4,%xmm6,%xmm8
+ xorl %ebx,%esi
+ addl %edx,%ecx
+ vpxor %xmm3,%xmm7,%xmm7
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ vpxor %xmm5,%xmm8,%xmm8
+ addl 52(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 48(%r15),%xmm15
+ vpxor %xmm8,%xmm7,%xmm7
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ vmovdqa %xmm9,32(%rsp)
+ xorl %eax,%edi
+ addl %ecx,%ebx
+ vpsrld $31,%xmm7,%xmm8
+ shrdl $7,%edx,%edx
+ addl %edi,%ebx
+ addl 56(%rsp),%eax
+ xorl %ebp,%edx
+ vpslldq $12,%xmm7,%xmm10
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm7,%xmm7
+ xorl %ebp,%esi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm7,%xmm7
+ addl 60(%rsp),%ebp
+ cmpl $11,%r8d
+ jb .Lvaesenclast1
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 64(%r15),%xmm14
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 80(%r15),%xmm15
+ je .Lvaesenclast1
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 96(%r15),%xmm14
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 112(%r15),%xmm15
+.Lvaesenclast1:
+ vaesenclast %xmm15,%xmm11,%xmm11
+ vmovups 16-112(%r15),%xmm14
+ xorl %edx,%ecx
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpxor %xmm10,%xmm7,%xmm7
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ vmovdqa 16(%r11),%xmm10
+ xorl %edx,%edi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %edi,%ebp
+ vpalignr $8,%xmm6,%xmm7,%xmm9
+ vpxor %xmm4,%xmm0,%xmm0
+ addl 0(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm1,%xmm0,%xmm0
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ vmovdqa %xmm10,%xmm8
+ vpaddd %xmm7,%xmm10,%xmm10
+ xorl %ecx,%esi
+ vmovups 16(%r12),%xmm12
+ vxorps %xmm13,%xmm12,%xmm12
+ vmovups %xmm11,0(%r13,%r12,1)
+ vxorps %xmm12,%xmm11,%xmm11
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups -80(%r15),%xmm15
+ addl %ebp,%edx
+ vpxor %xmm9,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ addl 4(%rsp),%ecx
+ xorl %ebx,%eax
+ vpsrld $30,%xmm0,%xmm9
+ vmovdqa %xmm10,48(%rsp)
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ vpslld $2,%xmm0,%xmm0
+ xorl %ebx,%edi
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %edi,%ecx
+ addl 8(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups -64(%r15),%xmm14
+ vpor %xmm9,%xmm0,%xmm0
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ vmovdqa %xmm0,%xmm10
+ xorl %eax,%esi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edx
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ xorl %ebp,%edi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm10
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ebp
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups -48(%r15),%xmm15
+ xorl %edx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ vmovdqa %xmm8,%xmm9
+ vpaddd %xmm0,%xmm8,%xmm8
+ shrdl $7,%ebx,%ebx
+ addl %esi,%ebp
+ vpxor %xmm10,%xmm1,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm1,%xmm10
+ vmovdqa %xmm8,0(%rsp)
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %ebx,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ xorl %eax,%esi
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups -32(%r15),%xmm14
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ vpor %xmm10,%xmm1,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %eax,%edi
+ vmovdqa %xmm1,%xmm8
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %edi,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vmovdqa 32(%r11),%xmm10
+ vpaddd %xmm1,%xmm9,%xmm9
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ vpxor %xmm8,%xmm2,%xmm2
+ addl 36(%rsp),%ebp
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups -16(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %edi,%ebp
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ vpor %xmm8,%xmm2,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %ebx,%edi
+ vmovdqa %xmm2,%xmm9
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ xorl %eax,%edi
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 0(%r15),%xmm14
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %edi,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm9
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ vmovdqa %xmm10,%xmm8
+ vpaddd %xmm2,%xmm10,%xmm10
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ vpxor %xmm9,%xmm3,%xmm3
+ addl 52(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm9
+ vmovdqa %xmm10,32(%rsp)
+ xorl %edx,%edi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ebp
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 16(%r15),%xmm15
+ xorl %edx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %esi,%ebp
+ vpor %xmm9,%xmm3,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ecx,%edi
+ vmovdqa %xmm3,%xmm10
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm10
+ vpxor %xmm0,%xmm4,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %ebx,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %eax,%esi
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 32(%r15),%xmm14
+ addl %edx,%ecx
+ vmovdqa %xmm8,%xmm9
+ vpaddd %xmm3,%xmm8,%xmm8
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ vpxor %xmm10,%xmm4,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm10
+ vmovdqa %xmm8,48(%rsp)
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %edi,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ vpor %xmm10,%xmm4,%xmm4
+ addl 12(%rsp),%ebp
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 48(%r15),%xmm15
+ xorl %edx,%edi
+ vmovdqa %xmm4,%xmm8
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %edi,%ebp
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm6,%xmm5,%xmm5
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ vmovdqa %xmm9,%xmm10
+ vpaddd %xmm4,%xmm9,%xmm9
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ vpxor %xmm8,%xmm5,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ xorl %eax,%edi
+ cmpl $11,%r8d
+ jb .Lvaesenclast2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 64(%r15),%xmm14
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 80(%r15),%xmm15
+ je .Lvaesenclast2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 96(%r15),%xmm14
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 112(%r15),%xmm15
+.Lvaesenclast2:
+ vaesenclast %xmm15,%xmm11,%xmm11
+ vmovups 16-112(%r15),%xmm14
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %edi,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ vpor %xmm8,%xmm5,%xmm5
+ addl 28(%rsp),%eax
+ xorl %ebp,%edi
+ vmovdqa %xmm5,%xmm9
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ movl %ecx,%edi
+ vmovups 32(%r12),%xmm12
+ vxorps %xmm13,%xmm12,%xmm12
+ vmovups %xmm11,16(%r13,%r12,1)
+ vxorps %xmm12,%xmm11,%xmm11
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups -80(%r15),%xmm15
+ xorl %edx,%ecx
+ addl 32(%rsp),%ebp
+ andl %edx,%edi
+ vpxor %xmm7,%xmm6,%xmm6
+ andl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ vmovdqa %xmm10,%xmm8
+ vpaddd %xmm5,%xmm10,%xmm10
+ addl %edi,%ebp
+ movl %eax,%edi
+ vpxor %xmm9,%xmm6,%xmm6
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ vpsrld $30,%xmm6,%xmm9
+ vmovdqa %xmm10,16(%rsp)
+ movl %ebx,%esi
+ xorl %ecx,%ebx
+ addl 36(%rsp),%edx
+ andl %ecx,%esi
+ vpslld $2,%xmm6,%xmm6
+ andl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups -64(%r15),%xmm14
+ addl %edi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ vpor %xmm9,%xmm6,%xmm6
+ movl %eax,%edi
+ xorl %ebx,%eax
+ vmovdqa %xmm6,%xmm10
+ addl 40(%rsp),%ecx
+ andl %ebx,%edi
+ andl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edi,%ecx
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movl %ebp,%esi
+ xorl %eax,%ebp
+ addl 44(%rsp),%ebx
+ andl %eax,%esi
+ andl %ebp,%edi
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups -48(%r15),%xmm15
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ movl %edx,%edi
+ xorl %ebp,%edx
+ addl 48(%rsp),%eax
+ andl %ebp,%edi
+ vpxor %xmm0,%xmm7,%xmm7
+ andl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ vmovdqa 48(%r11),%xmm9
+ vpaddd %xmm6,%xmm8,%xmm8
+ addl %edi,%eax
+ movl %ebx,%edi
+ vpxor %xmm10,%xmm7,%xmm7
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ vpsrld $30,%xmm7,%xmm10
+ vmovdqa %xmm8,32(%rsp)
+ movl %ecx,%esi
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups -32(%r15),%xmm14
+ xorl %edx,%ecx
+ addl 52(%rsp),%ebp
+ andl %edx,%esi
+ vpslld $2,%xmm7,%xmm7
+ andl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %esi,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ vpor %xmm10,%xmm7,%xmm7
+ movl %ebx,%edi
+ xorl %ecx,%ebx
+ vmovdqa %xmm7,%xmm8
+ addl 56(%rsp),%edx
+ andl %ecx,%edi
+ andl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups -16(%r15),%xmm15
+ addl %esi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movl %eax,%esi
+ xorl %ebx,%eax
+ addl 60(%rsp),%ecx
+ andl %ebx,%esi
+ andl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ movl %ebp,%edi
+ xorl %eax,%ebp
+ addl 0(%rsp),%ebx
+ andl %eax,%edi
+ vpxor %xmm1,%xmm0,%xmm0
+ andl %ebp,%esi
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 0(%r15),%xmm14
+ shrdl $7,%edx,%edx
+ vmovdqa %xmm9,%xmm10
+ vpaddd %xmm7,%xmm9,%xmm9
+ addl %edi,%ebx
+ movl %ecx,%edi
+ vpxor %xmm8,%xmm0,%xmm0
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ xorl %ebp,%edx
+ addl 4(%rsp),%eax
+ andl %ebp,%esi
+ vpslld $2,%xmm0,%xmm0
+ andl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ vpor %xmm8,%xmm0,%xmm0
+ movl %ecx,%edi
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 16(%r15),%xmm15
+ xorl %edx,%ecx
+ vmovdqa %xmm0,%xmm9
+ addl 8(%rsp),%ebp
+ andl %edx,%edi
+ andl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %edi,%ebp
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movl %ebx,%esi
+ xorl %ecx,%ebx
+ addl 12(%rsp),%edx
+ andl %ecx,%esi
+ andl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 32(%r15),%xmm14
+ addl %edi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm9
+ vpxor %xmm5,%xmm1,%xmm1
+ movl %eax,%edi
+ xorl %ebx,%eax
+ addl 16(%rsp),%ecx
+ andl %ebx,%edi
+ vpxor %xmm2,%xmm1,%xmm1
+ andl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ vmovdqa %xmm10,%xmm8
+ vpaddd %xmm0,%xmm10,%xmm10
+ addl %edi,%ecx
+ movl %edx,%edi
+ vpxor %xmm9,%xmm1,%xmm1
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ vpsrld $30,%xmm1,%xmm9
+ vmovdqa %xmm10,0(%rsp)
+ movl %ebp,%esi
+ xorl %eax,%ebp
+ addl 20(%rsp),%ebx
+ andl %eax,%esi
+ vpslld $2,%xmm1,%xmm1
+ andl %ebp,%edi
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 48(%r15),%xmm15
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ vpor %xmm9,%xmm1,%xmm1
+ movl %edx,%edi
+ xorl %ebp,%edx
+ vmovdqa %xmm1,%xmm10
+ addl 24(%rsp),%eax
+ andl %ebp,%edi
+ andl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ movl %ecx,%esi
+ cmpl $11,%r8d
+ jb .Lvaesenclast3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 64(%r15),%xmm14
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 80(%r15),%xmm15
+ je .Lvaesenclast3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 96(%r15),%xmm14
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 112(%r15),%xmm15
+.Lvaesenclast3:
+ vaesenclast %xmm15,%xmm11,%xmm11
+ vmovups 16-112(%r15),%xmm14
+ xorl %edx,%ecx
+ addl 28(%rsp),%ebp
+ andl %edx,%esi
+ andl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %esi,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ vpalignr $8,%xmm0,%xmm1,%xmm10
+ vpxor %xmm6,%xmm2,%xmm2
+ movl %ebx,%edi
+ xorl %ecx,%ebx
+ addl 32(%rsp),%edx
+ andl %ecx,%edi
+ vpxor %xmm3,%xmm2,%xmm2
+ andl %ebx,%esi
+ shrdl $7,%eax,%eax
+ vmovdqa %xmm8,%xmm9
+ vpaddd %xmm1,%xmm8,%xmm8
+ addl %edi,%edx
+ movl %ebp,%edi
+ vpxor %xmm10,%xmm2,%xmm2
+ shldl $5,%ebp,%ebp
+ vmovups 48(%r12),%xmm12
+ vxorps %xmm13,%xmm12,%xmm12
+ vmovups %xmm11,32(%r13,%r12,1)
+ vxorps %xmm12,%xmm11,%xmm11
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups -80(%r15),%xmm15
+ addl %esi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ vpsrld $30,%xmm2,%xmm10
+ vmovdqa %xmm8,16(%rsp)
+ movl %eax,%esi
+ xorl %ebx,%eax
+ addl 36(%rsp),%ecx
+ andl %ebx,%esi
+ vpslld $2,%xmm2,%xmm2
+ andl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ vpor %xmm10,%xmm2,%xmm2
+ movl %ebp,%edi
+ xorl %eax,%ebp
+ vmovdqa %xmm2,%xmm8
+ addl 40(%rsp),%ebx
+ andl %eax,%edi
+ andl %ebp,%esi
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups -64(%r15),%xmm14
+ shrdl $7,%edx,%edx
+ addl %edi,%ebx
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movl %edx,%esi
+ xorl %ebp,%edx
+ addl 44(%rsp),%eax
+ andl %ebp,%esi
+ andl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebp
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups -48(%r15),%xmm15
+ xorl %edx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ vmovdqa %xmm9,%xmm10
+ vpaddd %xmm2,%xmm9,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %esi,%ebp
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %ebx,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ xorl %eax,%esi
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups -32(%r15),%xmm14
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %edi,%ebx
+ addl 0(%rsp),%eax
+ vpaddd %xmm3,%xmm10,%xmm10
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ xorl %edx,%esi
+ movdqa %xmm10,48(%rsp)
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ addl 4(%rsp),%ebp
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups -16(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %edi,%ebp
+ addl 8(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ addl 12(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ xorl %eax,%edi
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 0(%r15),%xmm14
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %edi,%ecx
+ cmpq %r14,%r10
+ je .Ldone_avx
+ vmovdqa 64(%r11),%xmm6
+ vmovdqa 0(%r11),%xmm9
+ vmovdqu 0(%r10),%xmm0
+ vmovdqu 16(%r10),%xmm1
+ vmovdqu 32(%r10),%xmm2
+ vmovdqu 48(%r10),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r10
+ addl 16(%rsp),%ebx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm9,%xmm0,%xmm4
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ vmovdqa %xmm4,0(%rsp)
+ addl 20(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ addl 24(%rsp),%ebp
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 16(%r15),%xmm15
+ xorl %edx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %esi,%ebp
+ addl 28(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%rsp),%ecx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpaddd %xmm9,%xmm1,%xmm5
+ xorl %eax,%esi
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 32(%r15),%xmm14
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ vmovdqa %xmm5,16(%rsp)
+ addl 36(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %edi,%ebx
+ addl 40(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ addl 44(%rsp),%ebp
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 48(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %edi,%ebp
+ addl 48(%rsp),%edx
+ xorl %ecx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm9,%xmm2,%xmm6
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ vmovdqa %xmm6,32(%rsp)
+ addl 52(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ xorl %eax,%edi
+ cmpl $11,%r8d
+ jb .Lvaesenclast4
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 64(%r15),%xmm14
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 80(%r15),%xmm15
+ je .Lvaesenclast4
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 96(%r15),%xmm14
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 112(%r15),%xmm15
+.Lvaesenclast4:
+ vaesenclast %xmm15,%xmm11,%xmm11
+ vmovups 16-112(%r15),%xmm14
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %edi,%ecx
+ addl 56(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ addl 60(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ vmovups %xmm11,48(%r13,%r12,1)
+ leaq 64(%r12),%r12
+
+ addl 0(%r9),%eax
+ addl 4(%r9),%esi
+ addl 8(%r9),%ecx
+ addl 12(%r9),%edx
+ movl %eax,0(%r9)
+ addl 16(%r9),%ebp
+ movl %esi,4(%r9)
+ movl %esi,%ebx
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+ movl %ebp,16(%r9)
+ jmp .Loop_avx
+
+.align 16
+.Ldone_avx:
+ addl 16(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ addl 20(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ addl 24(%rsp),%ebp
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 16(%r15),%xmm15
+ xorl %edx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %esi,%ebp
+ addl 28(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%rsp),%ecx
+ xorl %ebx,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ xorl %eax,%esi
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 32(%r15),%xmm14
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ addl 36(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %edi,%ebx
+ addl 40(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ addl 44(%rsp),%ebp
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 48(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %edi,%ebp
+ addl 48(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ addl 52(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ xorl %eax,%edi
+ cmpl $11,%r8d
+ jb .Lvaesenclast5
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 64(%r15),%xmm14
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 80(%r15),%xmm15
+ je .Lvaesenclast5
+ vaesenc %xmm15,%xmm11,%xmm11
+ vmovups 96(%r15),%xmm14
+ vaesenc %xmm14,%xmm11,%xmm11
+ vmovups 112(%r15),%xmm15
+.Lvaesenclast5:
+ vaesenclast %xmm15,%xmm11,%xmm11
+ vmovups 16-112(%r15),%xmm14
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %edi,%ecx
+ addl 56(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ addl 60(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ vmovups %xmm11,48(%r13,%r12,1)
+ movq 88(%rsp),%r8
+
+ addl 0(%r9),%eax
+ addl 4(%r9),%esi
+ addl 8(%r9),%ecx
+ movl %eax,0(%r9)
+ addl 12(%r9),%edx
+ movl %esi,4(%r9)
+ addl 16(%r9),%ebp
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+ movl %ebp,16(%r9)
+ vmovups %xmm11,(%r8)
+ vzeroall
+ leaq 104(%rsp),%rsi
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
.align 64
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S:1.5 src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S:1.6
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S:1.5 Sat Aug 4 11:03:35 2012
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/x86_64/sha1-x86_64.S Sat May 16 19:08:37 2015
@@ -10,6 +10,11 @@ sha1_block_data_order:
movl OPENSSL_ia32cap_P+4@GOTPCREL(%rip),%r8d
testl $512,%r8d
jz .Lialu
+ andl $268435456,%r8d
+ andl $1073741824,%r9d
+ orl %r9d,%r8d
+ cmpl $1342177280,%r8d
+ je _avx_shortcut
jmp _ssse3_shortcut
.align 16
@@ -2476,6 +2481,1157 @@ _ssse3_shortcut:
.Lepilogue_ssse3:
.byte 0xf3,0xc3
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+.type sha1_block_data_order_avx,@function /* AVX SHA-1 block routine: rdi = five-word state, rsi = input, rdx = count of 64-byte blocks */
+.align 16
+sha1_block_data_order_avx:
+_avx_shortcut: /* target of the "je _avx_shortcut" dispatch in sha1_block_data_order */
+ pushq %rbx /* callee-saved registers, restored in the epilogue */
+ pushq %rbp
+ pushq %r12 /* r12 is not referenced again until restored; presumably pushed to keep rsp 16-byte aligned - confirm */
+ leaq -64(%rsp),%rsp /* 64 bytes of scratch: four 16-byte slots at 0..48(%rsp) */
+ movq %rdi,%r8 /* r8 = state pointer */
+ movq %rsi,%r9 /* r9 = input cursor */
+ movq %rdx,%r10 /* r10 = block count */
+ vzeroupper
+
+ shlq $6,%r10 /* count * 64 bytes */
+ addq %r9,%r10 /* r10 = end-of-input pointer; NOTE(review): no guard for a zero count here - confirm callers never pass 0 */
+ leaq K_XX_XX(%rip),%r11 /* r11 = base of the constant table */
+
+ movl 0(%r8),%eax /* working variables a..e = state[0..4] */
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl %ebx,%esi /* esi = copy of b, consumed by the first round */
+ movl 16(%r8),%ebp
+
+ vmovdqa 64(%r11),%xmm6 /* vpshufb control vector (presumably the byte-swap mask; table contents are outside this view) */
+ vmovdqa 0(%r11),%xmm9 /* first constant vector of K_XX_XX */
+ vmovdqu 0(%r9),%xmm0 /* unaligned loads of the first 64-byte block */
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9 /* cursor now points at the following block */
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm9,%xmm0,%xmm4 /* words 0..3 + K */
+ vpaddd %xmm9,%xmm1,%xmm5 /* words 4..7 + K */
+ vpaddd %xmm9,%xmm2,%xmm6 /* words 8..11 + K (xmm6 is free: last shuffle is done) */
+ vmovdqa %xmm4,0(%rsp) /* aligned stores: rely on rsp being 16-byte aligned at this point */
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ jmp .Loop_avx
+.align 16
+.Loop_avx: /* one 64-byte block per pass; on entry a..e = eax,ebx,ecx,edx,ebp, esi = copy of b, xmm0-xmm3 = shuffled input, 0..47(%rsp) = words 0..11 + K, xmm9 = K */
+ addl 0(%rsp),%ebp /* rounds 0-19: e += W+K + rol(a,5) + F, with F = (b & (c ^ d)) ^ d */
+ xorl %edx,%ecx
+ vpalignr $8,%xmm0,%xmm1,%xmm4 /* start of the vector update producing the next four words in xmm4, interleaved with the scalar rounds */
+ movl %eax,%edi /* keep an unrotated copy of a; it is b in the next round */
+ shldl $5,%eax,%eax /* rol(a,5) in place */
+ vpaddd %xmm3,%xmm9,%xmm9 /* words 12..15 + K */
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ vpsrldq $4,%xmm3,%xmm8
+ xorl %edx,%esi
+ addl %eax,%ebp
+ vpxor %xmm0,%xmm4,%xmm4
+ shrdl $2,%ebx,%ebx /* b = ror(b,2) */
+ addl %esi,%ebp
+ vpxor %xmm2,%xmm8,%xmm8
+ addl 4(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm8,%xmm4,%xmm4
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ vmovdqa %xmm9,48(%rsp) /* stash words 12..15 + K in the fourth stack slot */
+ xorl %ecx,%edi
+ addl %ebp,%edx
+ vpsrld $31,%xmm4,%xmm8
+ shrdl $7,%eax,%eax /* ror 7 cancels the in-place rol 5 above: net ror 2 */
+ addl %edi,%edx
+ addl 8(%rsp),%ecx
+ xorl %ebx,%eax
+ vpslldq $12,%xmm4,%xmm10
+ vpaddd %xmm4,%xmm4,%xmm4 /* each lane << 1 */
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm4,%xmm4 /* ... | (lane >> 31): rotate-left by 1 */
+ xorl %ebx,%esi
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm4,%xmm4
+ addl 12(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpxor %xmm10,%xmm4,%xmm4 /* xmm4 now holds the next four schedule words */
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ vmovdqa 0(%r11),%xmm10 /* reload the first constant vector */
+ xorl %eax,%edi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %edi,%ebx
+ addl 16(%rsp),%eax
+ xorl %ebp,%edx
+ vpalignr $8,%xmm1,%xmm2,%xmm5 /* same update pattern again, producing xmm5 */
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ vpaddd %xmm4,%xmm10,%xmm10 /* xmm4 + K, stored to 0(%rsp) below */
+ andl %edx,%esi
+ xorl %ebp,%edx
+ vpsrldq $4,%xmm4,%xmm9
+ xorl %ebp,%esi
+ addl %ebx,%eax
+ vpxor %xmm1,%xmm5,%xmm5
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ vpxor %xmm3,%xmm9,%xmm9
+ addl 20(%rsp),%ebp
+ xorl %edx,%ecx
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpxor %xmm9,%xmm5,%xmm5
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ vmovdqa %xmm10,0(%rsp)
+ xorl %edx,%edi
+ addl %eax,%ebp
+ vpsrld $31,%xmm5,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %edi,%ebp
+ addl 24(%rsp),%edx
+ xorl %ecx,%ebx
+ vpslldq $12,%xmm5,%xmm8
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ vpsrld $30,%xmm8,%xmm10
+ vpor %xmm9,%xmm5,%xmm5
+ xorl %ecx,%esi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ vpslld $2,%xmm8,%xmm8
+ vpxor %xmm10,%xmm5,%xmm5
+ addl 28(%rsp),%ecx
+ xorl %ebx,%eax
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpxor %xmm8,%xmm5,%xmm5
+ andl %eax,%edi
+ xorl %ebx,%eax
+ vmovdqa 16(%r11),%xmm8 /* constant vector at K_XX_XX+16 */
+ xorl %ebx,%edi
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %edi,%ecx
+ addl 32(%rsp),%ebx
+ xorl %eax,%ebp
+ vpalignr $8,%xmm2,%xmm3,%xmm6 /* same update pattern, producing xmm6 */
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm5,%xmm8,%xmm8
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ vpsrldq $4,%xmm5,%xmm10
+ xorl %eax,%esi
+ addl %ecx,%ebx
+ vpxor %xmm2,%xmm6,%xmm6
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ vpxor %xmm4,%xmm10,%xmm10
+ addl 36(%rsp),%eax
+ xorl %ebp,%edx
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpxor %xmm10,%xmm6,%xmm6
+ andl %edx,%edi
+ xorl %ebp,%edx
+ vmovdqa %xmm8,16(%rsp)
+ xorl %ebp,%edi
+ addl %ebx,%eax
+ vpsrld $31,%xmm6,%xmm10
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ addl 40(%rsp),%ebp
+ xorl %edx,%ecx
+ vpslldq $12,%xmm6,%xmm9
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ vpsrld $30,%xmm9,%xmm8
+ vpor %xmm10,%xmm6,%xmm6
+ xorl %edx,%esi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %esi,%ebp
+ vpslld $2,%xmm9,%xmm9
+ vpxor %xmm8,%xmm6,%xmm6
+ addl 44(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm9,%xmm6,%xmm6
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ vmovdqa 16(%r11),%xmm9
+ xorl %ecx,%edi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 48(%rsp),%ecx
+ xorl %ebx,%eax
+ vpalignr $8,%xmm3,%xmm4,%xmm7 /* same update pattern, producing xmm7 */
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpaddd %xmm6,%xmm9,%xmm9
+ andl %eax,%esi
+ xorl %ebx,%eax
+ vpsrldq $4,%xmm6,%xmm8
+ xorl %ebx,%esi
+ addl %edx,%ecx
+ vpxor %xmm3,%xmm7,%xmm7
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ vpxor %xmm5,%xmm8,%xmm8
+ addl 52(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpxor %xmm8,%xmm7,%xmm7
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ vmovdqa %xmm9,32(%rsp)
+ xorl %eax,%edi
+ addl %ecx,%ebx
+ vpsrld $31,%xmm7,%xmm8
+ shrdl $7,%edx,%edx
+ addl %edi,%ebx
+ addl 56(%rsp),%eax
+ xorl %ebp,%edx
+ vpslldq $12,%xmm7,%xmm10
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm7,%xmm7
+ xorl %ebp,%esi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm7,%xmm7
+ addl 60(%rsp),%ebp
+ xorl %edx,%ecx
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpxor %xmm10,%xmm7,%xmm7
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ vmovdqa 16(%r11),%xmm10
+ xorl %edx,%edi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %edi,%ebp
+ vpalignr $8,%xmm6,%xmm7,%xmm9 /* from here on the update is: xmm0 = rol(xmm0 ^ xmm4 ^ xmm1 ^ xmm9, 2), rotating through xmm0..xmm7 */
+ vpxor %xmm4,%xmm0,%xmm0
+ addl 0(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm1,%xmm0,%xmm0
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ vmovdqa %xmm10,%xmm8 /* keep a copy of the constant vector for the following groups */
+ vpaddd %xmm7,%xmm10,%xmm10
+ xorl %ecx,%esi
+ addl %ebp,%edx
+ vpxor %xmm9,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ addl 4(%rsp),%ecx
+ xorl %ebx,%eax
+ vpsrld $30,%xmm0,%xmm9
+ vmovdqa %xmm10,48(%rsp)
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ vpslld $2,%xmm0,%xmm0
+ xorl %ebx,%edi
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %edi,%ecx
+ addl 8(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpor %xmm9,%xmm0,%xmm0 /* completes the rotate-left by 2 */
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ vmovdqa %xmm0,%xmm10 /* redundant copy: xmm10 is overwritten by the next vpalignr before it is read */
+ xorl %eax,%esi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edx
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ xorl %ebp,%edi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm10
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ebp /* rounds 20-39: F = b ^ c ^ d */
+ xorl %edx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ vmovdqa %xmm8,%xmm9
+ vpaddd %xmm0,%xmm8,%xmm8
+ shrdl $7,%ebx,%ebx
+ addl %esi,%ebp
+ vpxor %xmm10,%xmm1,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm1,%xmm10
+ vmovdqa %xmm8,0(%rsp)
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %ebx,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ xorl %eax,%esi
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ vpor %xmm10,%xmm1,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %eax,%edi
+ vmovdqa %xmm1,%xmm8
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %edi,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vmovdqa 32(%r11),%xmm10 /* constant vector at K_XX_XX+32, used by later groups */
+ vpaddd %xmm1,%xmm9,%xmm9
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ vpxor %xmm8,%xmm2,%xmm2
+ addl 36(%rsp),%ebp
+ xorl %edx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %edi,%ebp
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ vpor %xmm8,%xmm2,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %ebx,%edi
+ vmovdqa %xmm2,%xmm9
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %edi,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm9
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ vmovdqa %xmm10,%xmm8
+ vpaddd %xmm2,%xmm10,%xmm10
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ vpxor %xmm9,%xmm3,%xmm3
+ addl 52(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm9
+ vmovdqa %xmm10,32(%rsp)
+ xorl %edx,%edi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ebp
+ xorl %edx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %esi,%ebp
+ vpor %xmm9,%xmm3,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ecx,%edi
+ vmovdqa %xmm3,%xmm10
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm10
+ vpxor %xmm0,%xmm4,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %ebx,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %eax,%esi
+ addl %edx,%ecx
+ vmovdqa %xmm8,%xmm9
+ vpaddd %xmm3,%xmm8,%xmm8
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ vpxor %xmm10,%xmm4,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm10
+ vmovdqa %xmm8,48(%rsp)
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %edi,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ vpor %xmm10,%xmm4,%xmm4
+ addl 12(%rsp),%ebp
+ xorl %edx,%edi
+ vmovdqa %xmm4,%xmm8
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %edi,%ebp
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm6,%xmm5,%xmm5
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ vmovdqa %xmm9,%xmm10
+ vpaddd %xmm4,%xmm9,%xmm9
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ vpxor %xmm8,%xmm5,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %edi,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ vpor %xmm8,%xmm5,%xmm5
+ addl 28(%rsp),%eax
+ xorl %ebp,%edi
+ vmovdqa %xmm5,%xmm9
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ movl %ecx,%edi /* rounds 40-59: F = (c & d) + (b & (c ^ d)), the two terms are added to e separately */
+ xorl %edx,%ecx
+ addl 32(%rsp),%ebp
+ andl %edx,%edi
+ vpxor %xmm7,%xmm6,%xmm6
+ andl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ vmovdqa %xmm10,%xmm8
+ vpaddd %xmm5,%xmm10,%xmm10
+ addl %edi,%ebp
+ movl %eax,%edi
+ vpxor %xmm9,%xmm6,%xmm6
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ vpsrld $30,%xmm6,%xmm9
+ vmovdqa %xmm10,16(%rsp)
+ movl %ebx,%esi
+ xorl %ecx,%ebx
+ addl 36(%rsp),%edx
+ andl %ecx,%esi
+ vpslld $2,%xmm6,%xmm6
+ andl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ vpor %xmm9,%xmm6,%xmm6
+ movl %eax,%edi
+ xorl %ebx,%eax
+ vmovdqa %xmm6,%xmm10
+ addl 40(%rsp),%ecx
+ andl %ebx,%edi
+ andl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edi,%ecx
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movl %ebp,%esi
+ xorl %eax,%ebp
+ addl 44(%rsp),%ebx
+ andl %eax,%esi
+ andl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ movl %edx,%edi
+ xorl %ebp,%edx
+ addl 48(%rsp),%eax
+ andl %ebp,%edi
+ vpxor %xmm0,%xmm7,%xmm7
+ andl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ vmovdqa 48(%r11),%xmm9 /* constant vector at K_XX_XX+48, used by later groups */
+ vpaddd %xmm6,%xmm8,%xmm8
+ addl %edi,%eax
+ movl %ebx,%edi
+ vpxor %xmm10,%xmm7,%xmm7
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ vpsrld $30,%xmm7,%xmm10
+ vmovdqa %xmm8,32(%rsp)
+ movl %ecx,%esi
+ xorl %edx,%ecx
+ addl 52(%rsp),%ebp
+ andl %edx,%esi
+ vpslld $2,%xmm7,%xmm7
+ andl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %esi,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ vpor %xmm10,%xmm7,%xmm7
+ movl %ebx,%edi
+ xorl %ecx,%ebx
+ vmovdqa %xmm7,%xmm8
+ addl 56(%rsp),%edx
+ andl %ecx,%edi
+ andl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movl %eax,%esi
+ xorl %ebx,%eax
+ addl 60(%rsp),%ecx
+ andl %ebx,%esi
+ andl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ movl %ebp,%edi
+ xorl %eax,%ebp
+ addl 0(%rsp),%ebx
+ andl %eax,%edi
+ vpxor %xmm1,%xmm0,%xmm0
+ andl %ebp,%esi
+ shrdl $7,%edx,%edx
+ vmovdqa %xmm9,%xmm10
+ vpaddd %xmm7,%xmm9,%xmm9
+ addl %edi,%ebx
+ movl %ecx,%edi
+ vpxor %xmm8,%xmm0,%xmm0
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ xorl %ebp,%edx
+ addl 4(%rsp),%eax
+ andl %ebp,%esi
+ vpslld $2,%xmm0,%xmm0
+ andl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ vpor %xmm8,%xmm0,%xmm0
+ movl %ecx,%edi
+ xorl %edx,%ecx
+ vmovdqa %xmm0,%xmm9
+ addl 8(%rsp),%ebp
+ andl %edx,%edi
+ andl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %edi,%ebp
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movl %ebx,%esi
+ xorl %ecx,%ebx
+ addl 12(%rsp),%edx
+ andl %ecx,%esi
+ andl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm9
+ vpxor %xmm5,%xmm1,%xmm1
+ movl %eax,%edi
+ xorl %ebx,%eax
+ addl 16(%rsp),%ecx
+ andl %ebx,%edi
+ vpxor %xmm2,%xmm1,%xmm1
+ andl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ vmovdqa %xmm10,%xmm8
+ vpaddd %xmm0,%xmm10,%xmm10
+ addl %edi,%ecx
+ movl %edx,%edi
+ vpxor %xmm9,%xmm1,%xmm1
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ vpsrld $30,%xmm1,%xmm9
+ vmovdqa %xmm10,0(%rsp)
+ movl %ebp,%esi
+ xorl %eax,%ebp
+ addl 20(%rsp),%ebx
+ andl %eax,%esi
+ vpslld $2,%xmm1,%xmm1
+ andl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ vpor %xmm9,%xmm1,%xmm1
+ movl %edx,%edi
+ xorl %ebp,%edx
+ vmovdqa %xmm1,%xmm10
+ addl 24(%rsp),%eax
+ andl %ebp,%edi
+ andl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ movl %ecx,%esi
+ xorl %edx,%ecx
+ addl 28(%rsp),%ebp
+ andl %edx,%esi
+ andl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %esi,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ vpalignr $8,%xmm0,%xmm1,%xmm10
+ vpxor %xmm6,%xmm2,%xmm2
+ movl %ebx,%edi
+ xorl %ecx,%ebx
+ addl 32(%rsp),%edx
+ andl %ecx,%edi
+ vpxor %xmm3,%xmm2,%xmm2
+ andl %ebx,%esi
+ shrdl $7,%eax,%eax
+ vmovdqa %xmm8,%xmm9
+ vpaddd %xmm1,%xmm8,%xmm8
+ addl %edi,%edx
+ movl %ebp,%edi
+ vpxor %xmm10,%xmm2,%xmm2
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ vpsrld $30,%xmm2,%xmm10
+ vmovdqa %xmm8,16(%rsp)
+ movl %eax,%esi
+ xorl %ebx,%eax
+ addl 36(%rsp),%ecx
+ andl %ebx,%esi
+ vpslld $2,%xmm2,%xmm2
+ andl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ vpor %xmm10,%xmm2,%xmm2
+ movl %ebp,%edi
+ xorl %eax,%ebp
+ vmovdqa %xmm2,%xmm8
+ addl 40(%rsp),%ebx
+ andl %eax,%edi
+ andl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %edi,%ebx
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movl %edx,%esi
+ xorl %ebp,%edx
+ addl 44(%rsp),%eax
+ andl %ebp,%esi
+ andl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebp /* rounds 60-79: F = b ^ c ^ d */
+ xorl %edx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ vmovdqa %xmm9,%xmm10
+ vpaddd %xmm2,%xmm9,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %esi,%ebp
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %ebx,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ xorl %eax,%esi
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ vpor %xmm8,%xmm3,%xmm3 /* last schedule vector of this block */
+ addl 60(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %edi,%ebx
+ addl 0(%rsp),%eax
+ vpaddd %xmm3,%xmm10,%xmm10 /* final W+K vector, consumed by the last four rounds */
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ xorl %edx,%esi
+ vmovdqa %xmm10,48(%rsp) /* VEX-encoded store, consistent with every other vector instruction in this AVX path */
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ addl 4(%rsp),%ebp
+ xorl %edx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %edi,%ebp
+ addl 8(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ addl 12(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %edi,%ecx
+ cmpq %r10,%r9 /* input cursor reached the end pointer? */
+ je .Ldone_avx /* yes: run the remaining 12 rounds there without touching more input */
+ vmovdqa 64(%r11),%xmm6 /* otherwise load and shuffle the next block while the remaining 12 rounds run */
+ vmovdqa 0(%r11),%xmm9
+ vmovdqu 0(%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9
+ addl 16(%rsp),%ebx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm9,%xmm0,%xmm4
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ vmovdqa %xmm4,0(%rsp) /* slot 0 was last read four rounds ago, so it can take the next block's words 0..3 + K */
+ addl 20(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ addl 24(%rsp),%ebp
+ xorl %edx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %esi,%ebp
+ addl 28(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%rsp),%ecx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpaddd %xmm9,%xmm1,%xmm5
+ xorl %eax,%esi
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ vmovdqa %xmm5,16(%rsp) /* slot 1 refilled only after its last read at 28(%rsp) */
+ addl 36(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %edi,%ebx
+ addl 40(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ addl 44(%rsp),%ebp
+ xorl %edx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %edi,%ebp
+ addl 48(%rsp),%edx
+ xorl %ecx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm9,%xmm2,%xmm6
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ vmovdqa %xmm6,32(%rsp) /* slot 2 refilled only after its last read at 44(%rsp) */
+ addl 52(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %edi,%ecx
+ addl 56(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ addl 60(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi /* esi = unrotated copy of the value that becomes state word 1 */
+ shldl $5,%ebx,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ addl 0(%r8),%eax /* fold the previous state into the working variables (eax, esi, ecx, edx, ebp) and store the result */
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ addl 12(%r8),%edx
+ movl %eax,0(%r8)
+ addl 16(%r8),%ebp
+ movl %esi,4(%r8)
+ movl %esi,%ebx /* re-establish the loop-entry convention: ebx = b, esi = copy of b */
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ jmp .Loop_avx
+
+.align 16
+.Ldone_avx: /* reached from .Loop_avx once the input cursor equals the end pointer: the same 12 remaining rounds (F = b ^ c ^ d), but with no further input loads */
+ addl 16(%rsp),%ebx /* e += W+K from the stack slot */
+ xorl %eax,%esi
+ movl %ecx,%edi /* unrotated copy of a for the next round */
+ shldl $5,%ecx,%ecx /* rol(a,5) in place */
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx /* ror 7 after the earlier in-place rol 5: net ror 2 */
+ addl %esi,%ebx
+ addl 20(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ addl 24(%rsp),%ebp
+ xorl %edx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %esi,%ebp
+ addl 28(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%rsp),%ecx
+ xorl %ebx,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ xorl %eax,%esi
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %esi,%ecx
+ addl 36(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %edi,%ebx
+ addl 40(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %esi,%eax
+ addl 44(%rsp),%ebp
+ xorl %edx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %edi,%ebp
+ addl 48(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ shrdl $7,%eax,%eax
+ addl %esi,%edx
+ addl 52(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%ebp,%ebp
+ addl %edi,%ecx
+ addl 56(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ shrdl $7,%edx,%edx
+ addl %esi,%ebx
+ addl 60(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi /* esi = unrotated copy of the value stored as state word 1 below */
+ shldl $5,%ebx,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ shrdl $7,%ecx,%ecx
+ addl %edi,%eax
+ vzeroupper /* no vector instructions follow; clear the upper YMM halves before returning */
+
+ addl 0(%r8),%eax /* fold the previous state into the working variables (eax, esi, ecx, edx, ebp) and store the final state */
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ movl %eax,0(%r8)
+ addl 12(%r8),%edx
+ movl %esi,4(%r8)
+ addl 16(%r8),%ebp
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ leaq 64(%rsp),%rsi /* rsi = address of the saved registers, just above the 64-byte scratch area */
+ movq 0(%rsi),%r12 /* restore in reverse push order: r12, rbp, rbx */
+ movq 8(%rsi),%rbp
+ movq 16(%rsi),%rbx
+ leaq 24(%rsi),%rsp /* drop scratch area and the three saved registers in one step */
+.Lepilogue_avx:
+ .byte 0xf3,0xc3 /* encoded "rep ret" */
+.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
.align 64
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999