Module Name: src Committed By: joerg Date: Sat May 16 17:32:54 UTC 2015
Modified Files: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386: Makefile bn-586.S ghash-x86.S sha1-586.S sha512-586.S x86cpuid.S Added Files: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386: modes.inc Log Message: Optimize i386 support in libcrypto: - Enable optional SSE2 assembler versions. Regenerate. - Hook up assembler version of GCM. To generate a diff of this commit: cvs rdiff -u -r1.6 -r1.7 \ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile cvs rdiff -u -r1.5 -r1.6 \ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S cvs rdiff -u -r1.2 -r1.3 \ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/ghash-x86.S \ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha512-586.S cvs rdiff -u -r0 -r1.1 \ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/modes.inc cvs rdiff -u -r1.3 -r1.4 \ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha1-586.S cvs rdiff -u -r1.8 -r1.9 \ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/x86cpuid.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile:1.6 src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile:1.7 --- src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile:1.6 Mon Jul 30 10:25:24 2012 +++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile Sat May 16 17:32:54 2015 @@ -1,4 +1,4 @@ -# $NetBSD: Makefile,v 1.6 2012/07/30 10:25:24 christos Exp $ +# $NetBSD: Makefile,v 1.7 2015/05/16 17:32:54 joerg Exp $ .include "bsd.own.mk" @@ -9,7 +9,7 @@ regen: for i in $$(find ${OPENSSLSRC} -name \*86.pl) \ ${OPENSSLSRC}/crypto/x86cpuid.pl; do \ perl -I${OPENSSLSRC}/crypto/perlasm \ - -I${OPENSSLSRC}/crypto/bn/asm $$i elf -fPIC \ + -I${OPENSSLSRC}/crypto/bn/asm $$i elf -fPIC -DOPENSSL_IA32_SSE2 \ | sed -e 's,^\.file.*$$,#include <machine/asm.h>,' \ -e 's/ call OPENSSL_cpuid_setup/ PIC_PROLOGUE! call PIC_PLT(OPENSSL_cpuid_setup)! PIC_EPILOGUE/' | tr '!' '\n' \ > $$(basename $$i .pl).S; \ Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S:1.5 src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S:1.6 --- src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S:1.5 Fri Jul 27 19:34:14 2012 +++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S Sat May 16 17:32:54 2015 @@ -5,6 +5,103 @@ .align 16 bn_mul_add_words: .L_bn_mul_add_words_begin: + call .L000PIC_me_up +.L000PIC_me_up: + popl %eax + leal _GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%eax),%eax + movl OPENSSL_ia32cap_P@GOT(%eax),%eax + btl $26,(%eax) + jnc .L001maw_non_sse2 + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx + movd 16(%esp),%mm0 + pxor %mm1,%mm1 + jmp .L002maw_sse2_entry +.align 16 +.L003maw_sse2_unrolled: + movd (%eax),%mm3 + paddq %mm3,%mm1 + movd (%edx),%mm2 + pmuludq %mm0,%mm2 + movd 4(%edx),%mm4 + pmuludq %mm0,%mm4 + movd 8(%edx),%mm6 + pmuludq %mm0,%mm6 + movd 12(%edx),%mm7 + pmuludq %mm0,%mm7 + paddq %mm2,%mm1 + movd 4(%eax),%mm3 + paddq %mm4,%mm3 + movd 8(%eax),%mm5 + paddq %mm6,%mm5 + movd 12(%eax),%mm4 + paddq %mm4,%mm7 + movd %mm1,(%eax) + movd 16(%edx),%mm2 + pmuludq %mm0,%mm2 + psrlq $32,%mm1 + movd 20(%edx),%mm4 + pmuludq %mm0,%mm4 + paddq %mm3,%mm1 + movd 24(%edx),%mm6 + pmuludq %mm0,%mm6 + movd %mm1,4(%eax) + psrlq $32,%mm1 + movd 28(%edx),%mm3 + addl $32,%edx + pmuludq %mm0,%mm3 + paddq %mm5,%mm1 + movd 16(%eax),%mm5 + paddq %mm5,%mm2 + movd %mm1,8(%eax) + psrlq $32,%mm1 + paddq %mm7,%mm1 + movd 20(%eax),%mm5 + paddq %mm5,%mm4 + movd %mm1,12(%eax) + psrlq $32,%mm1 + paddq %mm2,%mm1 + movd 24(%eax),%mm5 + paddq %mm5,%mm6 + movd %mm1,16(%eax) + psrlq $32,%mm1 + paddq %mm4,%mm1 + movd 28(%eax),%mm5 + paddq %mm5,%mm3 + movd %mm1,20(%eax) + psrlq $32,%mm1 + paddq %mm6,%mm1 + movd %mm1,24(%eax) + psrlq $32,%mm1 + paddq %mm3,%mm1 + movd %mm1,28(%eax) + leal 32(%eax),%eax + psrlq $32,%mm1 + subl $8,%ecx + jz .L004maw_sse2_exit +.L002maw_sse2_entry: + testl $4294967288,%ecx + jnz .L003maw_sse2_unrolled +.align 4 +.L005maw_sse2_loop: + movd (%edx),%mm2 + movd (%eax),%mm3 + pmuludq %mm0,%mm2 + leal 4(%edx),%edx + paddq %mm3,%mm1 + paddq %mm2,%mm1 + movd %mm1,(%eax) + subl $1,%ecx + psrlq $32,%mm1 + leal 4(%eax),%eax + jnz .L005maw_sse2_loop +.L004maw_sse2_exit: + movd %mm1,%eax + emms + ret +.align 16 +.L001maw_non_sse2: pushl %ebp pushl %ebx pushl %esi @@ -17,9 +114,9 @@ bn_mul_add_words: andl $4294967288,%ecx movl 32(%esp),%ebp pushl %ecx - jz .L000maw_finish + jz .L006maw_finish .align 16 -.L001maw_loop: +.L007maw_loop: movl (%ebx),%eax mull %ebp @@ -96,13 +193,13 @@ bn_mul_add_words: subl $8,%ecx leal 32(%ebx),%ebx leal 32(%edi),%edi - jnz .L001maw_loop -.L000maw_finish: + jnz .L007maw_loop +.L006maw_finish: movl 32(%esp),%ecx andl $7,%ecx - jnz .L002maw_finish2 - jmp .L003maw_end -.L002maw_finish2: + jnz .L008maw_finish2 + jmp .L009maw_end +.L008maw_finish2: movl (%ebx),%eax mull %ebp @@ -113,7 +210,7 @@ bn_mul_add_words: decl %ecx movl %eax,(%edi) movl %edx,%esi - jz .L003maw_end + jz .L009maw_end movl 4(%ebx),%eax mull %ebp @@ -124,7 +221,7 @@ bn_mul_add_words: decl %ecx movl %eax,4(%edi) movl %edx,%esi - jz .L003maw_end + jz .L009maw_end movl 8(%ebx),%eax mull %ebp @@ -135,7 +232,7 @@ bn_mul_add_words: decl %ecx movl %eax,8(%edi) movl %edx,%esi - jz .L003maw_end + jz .L009maw_end movl 12(%ebx),%eax mull %ebp @@ -146,7 +243,7 @@ bn_mul_add_words: decl %ecx movl %eax,12(%edi) movl %edx,%esi - jz .L003maw_end + jz .L009maw_end movl 16(%ebx),%eax mull %ebp @@ -157,7 +254,7 @@ bn_mul_add_words: decl %ecx movl %eax,16(%edi) movl %edx,%esi - jz .L003maw_end + jz .L009maw_end movl 20(%ebx),%eax mull %ebp @@ -168,7 +265,7 @@ bn_mul_add_words: decl %ecx movl %eax,20(%edi) movl %edx,%esi - jz .L003maw_end + jz .L009maw_end movl 24(%ebx),%eax mull %ebp @@ -178,7 +275,7 @@ bn_mul_add_words: adcl $0,%edx movl %eax,24(%edi) movl %edx,%esi -.L003maw_end: +.L009maw_end: movl %esi,%eax popl %ecx popl %edi @@ -192,6 +289,34 @@ bn_mul_add_words: .align 16 bn_mul_words: .L_bn_mul_words_begin: + call .L010PIC_me_up +.L010PIC_me_up: + popl %eax + leal _GLOBAL_OFFSET_TABLE_+[.-.L010PIC_me_up](%eax),%eax + movl OPENSSL_ia32cap_P@GOT(%eax),%eax + btl $26,(%eax) + jnc .L011mw_non_sse2 + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx + movd 16(%esp),%mm0 + pxor %mm1,%mm1 +.align 16 +.L012mw_sse2_loop: + movd (%edx),%mm2 + pmuludq %mm0,%mm2 + leal 4(%edx),%edx + paddq %mm2,%mm1 + movd %mm1,(%eax) + subl $1,%ecx + psrlq $32,%mm1 + leal 4(%eax),%eax + jnz .L012mw_sse2_loop + movd %mm1,%eax + emms + ret +.align 16 +.L011mw_non_sse2: pushl %ebp pushl %ebx pushl %esi @@ -203,8 +328,8 @@ bn_mul_words: movl 28(%esp),%ebp movl 32(%esp),%ecx andl $4294967288,%ebp - jz .L004mw_finish -.L005mw_loop: + jz .L013mw_finish +.L014mw_loop: movl (%ebx),%eax mull %ecx @@ -265,14 +390,14 @@ bn_mul_words: addl $32,%ebx addl $32,%edi subl $8,%ebp - jz .L004mw_finish - jmp .L005mw_loop -.L004mw_finish: + jz .L013mw_finish + jmp .L014mw_loop +.L013mw_finish: movl 28(%esp),%ebp andl $7,%ebp - jnz .L006mw_finish2 - jmp .L007mw_end -.L006mw_finish2: + jnz .L015mw_finish2 + jmp .L016mw_end +.L015mw_finish2: movl (%ebx),%eax mull %ecx @@ -281,7 +406,7 @@ bn_mul_words: movl %eax,(%edi) movl %edx,%esi decl %ebp - jz .L007mw_end + jz .L016mw_end movl 4(%ebx),%eax mull %ecx @@ -290,7 +415,7 @@ bn_mul_words: movl %eax,4(%edi) movl %edx,%esi decl %ebp - jz .L007mw_end + jz .L016mw_end movl 8(%ebx),%eax mull %ecx @@ -299,7 +424,7 @@ bn_mul_words: movl %eax,8(%edi) movl %edx,%esi decl %ebp - jz .L007mw_end + jz .L016mw_end movl 12(%ebx),%eax mull %ecx @@ -308,7 +433,7 @@ bn_mul_words: movl %eax,12(%edi) movl %edx,%esi decl %ebp - jz .L007mw_end + jz .L016mw_end movl 16(%ebx),%eax mull %ecx @@ -317,7 +442,7 @@ bn_mul_words: movl %eax,16(%edi) movl %edx,%esi decl %ebp - jz .L007mw_end + jz .L016mw_end movl 20(%ebx),%eax mull %ecx @@ -326,7 +451,7 @@ bn_mul_words: movl %eax,20(%edi) movl %edx,%esi decl %ebp - jz .L007mw_end + jz .L016mw_end movl 24(%ebx),%eax mull %ecx @@ -334,7 +459,7 @@ bn_mul_words: adcl $0,%edx movl %eax,24(%edi) movl %edx,%esi -.L007mw_end: +.L016mw_end: movl %esi,%eax popl %edi popl %esi @@ -347,6 +472,29 @@ bn_mul_words: .align 16 bn_sqr_words: .L_bn_sqr_words_begin: + call .L017PIC_me_up +.L017PIC_me_up: + popl %eax + leal _GLOBAL_OFFSET_TABLE_+[.-.L017PIC_me_up](%eax),%eax + movl OPENSSL_ia32cap_P@GOT(%eax),%eax + btl $26,(%eax) + jnc .L018sqr_non_sse2 + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx +.align 16 +.L019sqr_sse2_loop: + movd (%edx),%mm0 + pmuludq %mm0,%mm0 + leal 4(%edx),%edx + movq %mm0,(%eax) + subl $1,%ecx + leal 8(%eax),%eax + jnz .L019sqr_sse2_loop + emms + ret +.align 16 +.L018sqr_non_sse2: pushl %ebp pushl %ebx pushl %esi @@ -356,8 +504,8 @@ bn_sqr_words: movl 24(%esp),%edi movl 28(%esp),%ebx andl $4294967288,%ebx - jz .L008sw_finish -.L009sw_loop: + jz .L020sw_finish +.L021sw_loop: movl (%edi),%eax mull %eax @@ -402,59 +550,59 @@ bn_sqr_words: addl $32,%edi addl $64,%esi subl $8,%ebx - jnz .L009sw_loop -.L008sw_finish: + jnz .L021sw_loop +.L020sw_finish: movl 28(%esp),%ebx andl $7,%ebx - jz .L010sw_end + jz .L022sw_end movl (%edi),%eax mull %eax movl %eax,(%esi) decl %ebx movl %edx,4(%esi) - jz .L010sw_end + jz .L022sw_end movl 4(%edi),%eax mull %eax movl %eax,8(%esi) decl %ebx movl %edx,12(%esi) - jz .L010sw_end + jz .L022sw_end movl 8(%edi),%eax mull %eax movl %eax,16(%esi) decl %ebx movl %edx,20(%esi) - jz .L010sw_end + jz .L022sw_end movl 12(%edi),%eax mull %eax movl %eax,24(%esi) decl %ebx movl %edx,28(%esi) - jz .L010sw_end + jz .L022sw_end movl 16(%edi),%eax mull %eax movl %eax,32(%esi) decl %ebx movl %edx,36(%esi) - jz .L010sw_end + jz .L022sw_end movl 20(%edi),%eax mull %eax movl %eax,40(%esi) decl %ebx movl %edx,44(%esi) - jz .L010sw_end + jz .L022sw_end movl 24(%edi),%eax mull %eax movl %eax,48(%esi) movl %edx,52(%esi) -.L010sw_end: +.L022sw_end: popl %edi popl %esi popl %ebx @@ -488,8 +636,8 @@ bn_add_words: movl 32(%esp),%ebp xorl %eax,%eax andl $4294967288,%ebp - jz .L011aw_finish -.L012aw_loop: + jz .L023aw_finish +.L024aw_loop: movl (%esi),%ecx movl (%edi),%edx @@ -567,11 +715,11 @@ bn_add_words: addl $32,%edi addl $32,%ebx subl $8,%ebp - jnz .L012aw_loop -.L011aw_finish: + jnz .L024aw_loop +.L023aw_finish: movl 32(%esp),%ebp andl $7,%ebp - jz .L013aw_end + jz .L025aw_end movl (%esi),%ecx movl (%edi),%edx @@ -582,7 +730,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,(%ebx) - jz .L013aw_end + jz .L025aw_end movl 4(%esi),%ecx movl 4(%edi),%edx @@ -593,7 +741,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,4(%ebx) - jz .L013aw_end + jz .L025aw_end movl 8(%esi),%ecx movl 8(%edi),%edx @@ -604,7 +752,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,8(%ebx) - jz .L013aw_end + jz .L025aw_end movl 12(%esi),%ecx movl 12(%edi),%edx @@ -615,7 +763,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,12(%ebx) - jz .L013aw_end + jz .L025aw_end movl 16(%esi),%ecx movl 16(%edi),%edx @@ -626,7 +774,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,16(%ebx) - jz .L013aw_end + jz .L025aw_end movl 20(%esi),%ecx movl 20(%edi),%edx @@ -637,7 +785,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,20(%ebx) - jz .L013aw_end + jz .L025aw_end movl 24(%esi),%ecx movl 24(%edi),%edx @@ -647,7 +795,7 @@ bn_add_words: addl %edx,%ecx adcl $0,%eax movl %ecx,24(%ebx) -.L013aw_end: +.L025aw_end: popl %edi popl %esi popl %ebx @@ -670,8 +818,8 @@ bn_sub_words: movl 32(%esp),%ebp xorl %eax,%eax andl $4294967288,%ebp - jz .L014aw_finish -.L015aw_loop: + jz .L026aw_finish +.L027aw_loop: movl (%esi),%ecx movl (%edi),%edx @@ -749,11 +897,11 @@ bn_sub_words: addl $32,%edi addl $32,%ebx subl $8,%ebp - jnz .L015aw_loop -.L014aw_finish: + jnz .L027aw_loop +.L026aw_finish: movl 32(%esp),%ebp andl $7,%ebp - jz .L016aw_end + jz .L028aw_end movl (%esi),%ecx movl (%edi),%edx @@ -764,7 +912,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,(%ebx) - jz .L016aw_end + jz .L028aw_end movl 4(%esi),%ecx movl 4(%edi),%edx @@ -775,7 +923,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,4(%ebx) - jz .L016aw_end + jz .L028aw_end movl 8(%esi),%ecx movl 8(%edi),%edx @@ -786,7 +934,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,8(%ebx) - jz .L016aw_end + jz .L028aw_end movl 12(%esi),%ecx movl 12(%edi),%edx @@ -797,7 +945,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,12(%ebx) - jz .L016aw_end + jz .L028aw_end movl 16(%esi),%ecx movl 16(%edi),%edx @@ -808,7 +956,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,16(%ebx) - jz .L016aw_end + jz .L028aw_end movl 20(%esi),%ecx movl 20(%edi),%edx @@ -819,7 +967,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,20(%ebx) - jz .L016aw_end + jz .L028aw_end movl 24(%esi),%ecx movl 24(%edi),%edx @@ -829,7 +977,7 @@ bn_sub_words: subl %edx,%ecx adcl $0,%eax movl %ecx,24(%ebx) -.L016aw_end: +.L028aw_end: popl %edi popl %esi popl %ebx @@ -852,8 +1000,8 @@ bn_sub_part_words: movl 32(%esp),%ebp xorl %eax,%eax andl $4294967288,%ebp - jz .L017aw_finish -.L018aw_loop: + jz .L029aw_finish +.L030aw_loop: movl (%esi),%ecx movl (%edi),%edx @@ -931,11 +1079,11 @@ bn_sub_part_words: addl $32,%edi addl $32,%ebx subl $8,%ebp - jnz .L018aw_loop -.L017aw_finish: + jnz .L030aw_loop +.L029aw_finish: movl 32(%esp),%ebp andl $7,%ebp - jz .L019aw_end + jz .L031aw_end movl (%esi),%ecx movl (%edi),%edx @@ -949,7 +1097,7 @@ bn_sub_part_words: addl $4,%edi addl $4,%ebx decl %ebp - jz .L019aw_end + jz .L031aw_end movl (%esi),%ecx movl (%edi),%edx @@ -963,7 +1111,7 @@ bn_sub_part_words: addl $4,%edi addl $4,%ebx decl %ebp - jz .L019aw_end + jz .L031aw_end movl (%esi),%ecx movl (%edi),%edx @@ -977,7 +1125,7 @@ bn_sub_part_words: addl $4,%edi addl $4,%ebx decl %ebp - jz .L019aw_end + jz .L031aw_end movl (%esi),%ecx movl (%edi),%edx @@ -991,7 +1139,7 @@ bn_sub_part_words: addl $4,%edi addl $4,%ebx decl %ebp - jz .L019aw_end + jz .L031aw_end movl (%esi),%ecx movl (%edi),%edx @@ -1005,7 +1153,7 @@ bn_sub_part_words: addl $4,%edi addl $4,%ebx decl %ebp - jz .L019aw_end + jz .L031aw_end movl (%esi),%ecx movl (%edi),%edx @@ -1019,7 +1167,7 @@ bn_sub_part_words: addl $4,%edi addl $4,%ebx decl %ebp - jz .L019aw_end + jz .L031aw_end movl (%esi),%ecx movl (%edi),%edx @@ -1032,20 +1180,20 @@ bn_sub_part_words: addl $4,%esi addl $4,%edi addl $4,%ebx -.L019aw_end: +.L031aw_end: cmpl $0,36(%esp) - je .L020pw_end + je .L032pw_end movl 36(%esp),%ebp cmpl $0,%ebp - je .L020pw_end - jge .L021pw_pos + je .L032pw_end + jge .L033pw_pos movl $0,%edx subl %ebp,%edx movl %edx,%ebp andl $4294967288,%ebp - jz .L022pw_neg_finish -.L023pw_neg_loop: + jz .L034pw_neg_finish +.L035pw_neg_loop: movl $0,%ecx movl (%edi),%edx @@ -1122,13 +1270,13 @@ bn_sub_part_words: addl $32,%edi addl $32,%ebx subl $8,%ebp - jnz .L023pw_neg_loop -.L022pw_neg_finish: + jnz .L035pw_neg_loop +.L034pw_neg_finish: movl 36(%esp),%edx movl $0,%ebp subl %edx,%ebp andl $7,%ebp - jz .L020pw_end + jz .L032pw_end movl $0,%ecx movl (%edi),%edx @@ -1139,7 +1287,7 @@ bn_sub_part_words: adcl $0,%eax decl %ebp movl %ecx,(%ebx) - jz .L020pw_end + jz .L032pw_end movl $0,%ecx movl 4(%edi),%edx @@ -1150,7 +1298,7 @@ bn_sub_part_words: adcl $0,%eax decl %ebp movl %ecx,4(%ebx) - jz .L020pw_end + jz .L032pw_end movl $0,%ecx movl 8(%edi),%edx @@ -1161,7 +1309,7 @@ bn_sub_part_words: adcl $0,%eax decl %ebp movl %ecx,8(%ebx) - jz .L020pw_end + jz .L032pw_end movl $0,%ecx movl 12(%edi),%edx @@ -1172,7 +1320,7 @@ bn_sub_part_words: adcl $0,%eax decl %ebp movl %ecx,12(%ebx) - jz .L020pw_end + jz .L032pw_end movl $0,%ecx movl 16(%edi),%edx @@ -1183,7 +1331,7 @@ bn_sub_part_words: adcl $0,%eax decl %ebp movl %ecx,16(%ebx) - jz .L020pw_end + jz .L032pw_end movl $0,%ecx movl 20(%edi),%edx @@ -1194,7 +1342,7 @@ bn_sub_part_words: adcl $0,%eax decl %ebp movl %ecx,20(%ebx) - jz .L020pw_end + jz .L032pw_end movl $0,%ecx movl 24(%edi),%edx @@ -1204,181 +1352,182 @@ bn_sub_part_words: subl %edx,%ecx adcl $0,%eax movl %ecx,24(%ebx) - jmp .L020pw_end -.L021pw_pos: + jmp .L032pw_end +.L033pw_pos: andl $4294967288,%ebp - jz .L024pw_pos_finish -.L025pw_pos_loop: + jz .L036pw_pos_finish +.L037pw_pos_loop: movl (%esi),%ecx subl %eax,%ecx movl %ecx,(%ebx) - jnc .L026pw_nc0 + jnc .L038pw_nc0 movl 4(%esi),%ecx subl %eax,%ecx movl %ecx,4(%ebx) - jnc .L027pw_nc1 + jnc .L039pw_nc1 movl 8(%esi),%ecx subl %eax,%ecx movl %ecx,8(%ebx) - jnc .L028pw_nc2 + jnc .L040pw_nc2 movl 12(%esi),%ecx subl %eax,%ecx movl %ecx,12(%ebx) - jnc .L029pw_nc3 + jnc .L041pw_nc3 movl 16(%esi),%ecx subl %eax,%ecx movl %ecx,16(%ebx) - jnc .L030pw_nc4 + jnc .L042pw_nc4 movl 20(%esi),%ecx subl %eax,%ecx movl %ecx,20(%ebx) - jnc .L031pw_nc5 + jnc .L043pw_nc5 movl 24(%esi),%ecx subl %eax,%ecx movl %ecx,24(%ebx) - jnc .L032pw_nc6 + jnc .L044pw_nc6 movl 28(%esi),%ecx subl %eax,%ecx movl %ecx,28(%ebx) - jnc .L033pw_nc7 + jnc .L045pw_nc7 addl $32,%esi addl $32,%ebx subl $8,%ebp - jnz .L025pw_pos_loop -.L024pw_pos_finish: + jnz .L037pw_pos_loop +.L036pw_pos_finish: movl 36(%esp),%ebp andl $7,%ebp - jz .L020pw_end + jz .L032pw_end movl (%esi),%ecx subl %eax,%ecx movl %ecx,(%ebx) - jnc .L034pw_tail_nc0 + jnc .L046pw_tail_nc0 decl %ebp - jz .L020pw_end + jz .L032pw_end movl 4(%esi),%ecx subl %eax,%ecx movl %ecx,4(%ebx) - jnc .L035pw_tail_nc1 + jnc .L047pw_tail_nc1 decl %ebp - jz .L020pw_end + jz .L032pw_end movl 8(%esi),%ecx subl %eax,%ecx movl %ecx,8(%ebx) - jnc .L036pw_tail_nc2 + jnc .L048pw_tail_nc2 decl %ebp - jz .L020pw_end + jz .L032pw_end movl 12(%esi),%ecx subl %eax,%ecx movl %ecx,12(%ebx) - jnc .L037pw_tail_nc3 + jnc .L049pw_tail_nc3 decl %ebp - jz .L020pw_end + jz .L032pw_end movl 16(%esi),%ecx subl %eax,%ecx movl %ecx,16(%ebx) - jnc .L038pw_tail_nc4 + jnc .L050pw_tail_nc4 decl %ebp - jz .L020pw_end + jz .L032pw_end movl 20(%esi),%ecx subl %eax,%ecx movl %ecx,20(%ebx) - jnc .L039pw_tail_nc5 + jnc .L051pw_tail_nc5 decl %ebp - jz .L020pw_end + jz .L032pw_end movl 24(%esi),%ecx subl %eax,%ecx movl %ecx,24(%ebx) - jnc .L040pw_tail_nc6 + jnc .L052pw_tail_nc6 movl $1,%eax - jmp .L020pw_end -.L041pw_nc_loop: + jmp .L032pw_end +.L053pw_nc_loop: movl (%esi),%ecx movl %ecx,(%ebx) -.L026pw_nc0: +.L038pw_nc0: movl 4(%esi),%ecx movl %ecx,4(%ebx) -.L027pw_nc1: +.L039pw_nc1: movl 8(%esi),%ecx movl %ecx,8(%ebx) -.L028pw_nc2: +.L040pw_nc2: movl 12(%esi),%ecx movl %ecx,12(%ebx) -.L029pw_nc3: +.L041pw_nc3: movl 16(%esi),%ecx movl %ecx,16(%ebx) -.L030pw_nc4: +.L042pw_nc4: movl 20(%esi),%ecx movl %ecx,20(%ebx) -.L031pw_nc5: +.L043pw_nc5: movl 24(%esi),%ecx movl %ecx,24(%ebx) -.L032pw_nc6: +.L044pw_nc6: movl 28(%esi),%ecx movl %ecx,28(%ebx) -.L033pw_nc7: +.L045pw_nc7: addl $32,%esi addl $32,%ebx subl $8,%ebp - jnz .L041pw_nc_loop + jnz .L053pw_nc_loop movl 36(%esp),%ebp andl $7,%ebp - jz .L042pw_nc_end + jz .L054pw_nc_end movl (%esi),%ecx movl %ecx,(%ebx) -.L034pw_tail_nc0: +.L046pw_tail_nc0: decl %ebp - jz .L042pw_nc_end + jz .L054pw_nc_end movl 4(%esi),%ecx movl %ecx,4(%ebx) -.L035pw_tail_nc1: +.L047pw_tail_nc1: decl %ebp - jz .L042pw_nc_end + jz .L054pw_nc_end movl 8(%esi),%ecx movl %ecx,8(%ebx) -.L036pw_tail_nc2: +.L048pw_tail_nc2: decl %ebp - jz .L042pw_nc_end + jz .L054pw_nc_end movl 12(%esi),%ecx movl %ecx,12(%ebx) -.L037pw_tail_nc3: +.L049pw_tail_nc3: decl %ebp - jz .L042pw_nc_end + jz .L054pw_nc_end movl 16(%esi),%ecx movl %ecx,16(%ebx) -.L038pw_tail_nc4: +.L050pw_tail_nc4: decl %ebp - jz .L042pw_nc_end + jz .L054pw_nc_end movl 20(%esi),%ecx movl %ecx,20(%ebx) -.L039pw_tail_nc5: +.L051pw_tail_nc5: decl %ebp - jz .L042pw_nc_end + jz .L054pw_nc_end movl 24(%esi),%ecx movl %ecx,24(%ebx) -.L040pw_tail_nc6: -.L042pw_nc_end: +.L052pw_tail_nc6: +.L054pw_nc_end: movl $0,%eax -.L020pw_end: +.L032pw_end: popl %edi popl %esi popl %ebx popl %ebp ret .size bn_sub_part_words,.-.L_bn_sub_part_words_begin +.comm OPENSSL_ia32cap_P,8,4 Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/ghash-x86.S diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/ghash-x86.S:1.2 src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/ghash-x86.S:1.3 --- src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/ghash-x86.S:1.2 Fri Jul 27 19:34:14 2012 +++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/ghash-x86.S Sat May 16 17:32:54 2015 @@ -203,418 +203,94 @@ gcm_ghash_4bit_x86: popl %ebp ret .size gcm_ghash_4bit_x86,.-.L_gcm_ghash_4bit_x86_begin -.type _mmx_gmult_4bit_inner,@function +.globl gcm_gmult_4bit_mmx +.type gcm_gmult_4bit_mmx,@function .align 16 -_mmx_gmult_4bit_inner: +gcm_gmult_4bit_mmx: +.L_gcm_gmult_4bit_mmx_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + call .L005pic_point +.L005pic_point: + popl %eax + leal .Lrem_4bit-.L005pic_point(%eax),%eax + movzbl 15(%edi),%ebx xorl %ecx,%ecx movl %ebx,%edx movb %dl,%cl + movl $14,%ebp shlb $4,%cl andl $240,%edx movq 8(%esi,%ecx,1),%mm0 movq (%esi,%ecx,1),%mm1 - movd %mm0,%ebp - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%edx,1),%mm0 - movb 14(%edi),%cl - psllq $60,%mm2 - andl $15,%ebp - pxor (%esi,%edx,1),%mm1 - movl %ecx,%edx movd %mm0,%ebx - pxor %mm2,%mm0 - shlb $4,%cl + jmp .L006mmx_loop +.align 16 +.L006mmx_loop: psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%ecx,1),%mm0 - psllq $60,%mm2 - andl $240,%edx - pxor (%eax,%ebp,8),%mm1 andl $15,%ebx - pxor (%esi,%ecx,1),%mm1 - movd %mm0,%ebp - pxor %mm2,%mm0 - psrlq $4,%mm0 movq %mm1,%mm2 psrlq $4,%mm1 pxor 8(%esi,%edx,1),%mm0 - movb 13(%edi),%cl + movb (%edi,%ebp,1),%cl psllq $60,%mm2 pxor (%eax,%ebx,8),%mm1 - andl $15,%ebp - pxor (%esi,%edx,1),%mm1 - movl %ecx,%edx + decl %ebp movd %mm0,%ebx - pxor %mm2,%mm0 - shlb $4,%cl - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%ecx,1),%mm0 - psllq $60,%mm2 - andl $240,%edx - pxor (%eax,%ebp,8),%mm1 - andl $15,%ebx - pxor (%esi,%ecx,1),%mm1 - movd %mm0,%ebp - pxor %mm2,%mm0 - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%edx,1),%mm0 - movb 12(%edi),%cl - psllq $60,%mm2 - pxor (%eax,%ebx,8),%mm1 - andl $15,%ebp pxor (%esi,%edx,1),%mm1 movl %ecx,%edx - movd %mm0,%ebx pxor %mm2,%mm0 + js .L007mmx_break shlb $4,%cl - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%ecx,1),%mm0 - psllq $60,%mm2 - andl $240,%edx - pxor (%eax,%ebp,8),%mm1 andl $15,%ebx - pxor (%esi,%ecx,1),%mm1 - movd %mm0,%ebp - pxor %mm2,%mm0 psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%edx,1),%mm0 - movb 11(%edi),%cl - psllq $60,%mm2 - pxor (%eax,%ebx,8),%mm1 - andl $15,%ebp - pxor (%esi,%edx,1),%mm1 - movl %ecx,%edx - movd %mm0,%ebx - pxor %mm2,%mm0 - shlb $4,%cl - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%ecx,1),%mm0 - psllq $60,%mm2 andl $240,%edx - pxor (%eax,%ebp,8),%mm1 - andl $15,%ebx - pxor (%esi,%ecx,1),%mm1 - movd %mm0,%ebp - pxor %mm2,%mm0 - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%edx,1),%mm0 - movb 10(%edi),%cl - psllq $60,%mm2 - pxor (%eax,%ebx,8),%mm1 - andl $15,%ebp - pxor (%esi,%edx,1),%mm1 - movl %ecx,%edx - movd %mm0,%ebx - pxor %mm2,%mm0 - shlb $4,%cl - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%ecx,1),%mm0 - psllq $60,%mm2 - andl $240,%edx - pxor (%eax,%ebp,8),%mm1 - andl $15,%ebx - pxor (%esi,%ecx,1),%mm1 - movd %mm0,%ebp - pxor %mm2,%mm0 - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%edx,1),%mm0 - movb 9(%edi),%cl - psllq $60,%mm2 - pxor (%eax,%ebx,8),%mm1 - andl $15,%ebp - pxor (%esi,%edx,1),%mm1 - movl %ecx,%edx - movd %mm0,%ebx - pxor %mm2,%mm0 - shlb $4,%cl - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%ecx,1),%mm0 - psllq $60,%mm2 - andl $240,%edx - pxor (%eax,%ebp,8),%mm1 - andl $15,%ebx - pxor (%esi,%ecx,1),%mm1 - movd %mm0,%ebp - pxor %mm2,%mm0 - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%edx,1),%mm0 - movb 8(%edi),%cl - psllq $60,%mm2 - pxor (%eax,%ebx,8),%mm1 - andl $15,%ebp - pxor (%esi,%edx,1),%mm1 - movl %ecx,%edx - movd %mm0,%ebx - pxor %mm2,%mm0 - shlb $4,%cl - psrlq $4,%mm0 movq %mm1,%mm2 psrlq $4,%mm1 pxor 8(%esi,%ecx,1),%mm0 psllq $60,%mm2 - andl $240,%edx - pxor (%eax,%ebp,8),%mm1 - andl $15,%ebx - pxor (%esi,%ecx,1),%mm1 - movd %mm0,%ebp - pxor %mm2,%mm0 - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%edx,1),%mm0 - movb 7(%edi),%cl - psllq $60,%mm2 pxor (%eax,%ebx,8),%mm1 - andl $15,%ebp - pxor (%esi,%edx,1),%mm1 - movl %ecx,%edx movd %mm0,%ebx - pxor %mm2,%mm0 - shlb $4,%cl - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%ecx,1),%mm0 - psllq $60,%mm2 - andl $240,%edx - pxor (%eax,%ebp,8),%mm1 - andl $15,%ebx pxor (%esi,%ecx,1),%mm1 - movd %mm0,%ebp - pxor %mm2,%mm0 - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%edx,1),%mm0 - movb 6(%edi),%cl - psllq $60,%mm2 - pxor (%eax,%ebx,8),%mm1 - andl $15,%ebp - pxor (%esi,%edx,1),%mm1 - movl %ecx,%edx - movd %mm0,%ebx - pxor %mm2,%mm0 - shlb $4,%cl - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%ecx,1),%mm0 - psllq $60,%mm2 - andl $240,%edx - pxor (%eax,%ebp,8),%mm1 - andl $15,%ebx - pxor (%esi,%ecx,1),%mm1 - movd %mm0,%ebp - pxor %mm2,%mm0 - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%edx,1),%mm0 - movb 5(%edi),%cl - psllq $60,%mm2 - pxor (%eax,%ebx,8),%mm1 - andl $15,%ebp - pxor (%esi,%edx,1),%mm1 - movl %ecx,%edx - movd %mm0,%ebx pxor %mm2,%mm0 + jmp .L006mmx_loop +.align 16 +.L007mmx_break: shlb $4,%cl - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%ecx,1),%mm0 - psllq $60,%mm2 - andl $240,%edx - pxor (%eax,%ebp,8),%mm1 andl $15,%ebx - pxor (%esi,%ecx,1),%mm1 - movd %mm0,%ebp - pxor %mm2,%mm0 - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%edx,1),%mm0 - movb 4(%edi),%cl - psllq $60,%mm2 - pxor (%eax,%ebx,8),%mm1 - andl $15,%ebp - pxor (%esi,%edx,1),%mm1 - movl %ecx,%edx - movd %mm0,%ebx - pxor %mm2,%mm0 - shlb $4,%cl psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%ecx,1),%mm0 - psllq $60,%mm2 andl $240,%edx - pxor (%eax,%ebp,8),%mm1 - andl $15,%ebx - pxor (%esi,%ecx,1),%mm1 - movd %mm0,%ebp - pxor %mm2,%mm0 - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%edx,1),%mm0 - movb 3(%edi),%cl - psllq $60,%mm2 - pxor (%eax,%ebx,8),%mm1 - andl $15,%ebp - pxor (%esi,%edx,1),%mm1 - movl %ecx,%edx - movd %mm0,%ebx - pxor %mm2,%mm0 - shlb $4,%cl - psrlq $4,%mm0 movq %mm1,%mm2 psrlq $4,%mm1 pxor 8(%esi,%ecx,1),%mm0 psllq $60,%mm2 - andl $240,%edx - pxor (%eax,%ebp,8),%mm1 - andl $15,%ebx - pxor (%esi,%ecx,1),%mm1 - movd %mm0,%ebp - pxor %mm2,%mm0 - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%edx,1),%mm0 - movb 2(%edi),%cl - psllq $60,%mm2 pxor (%eax,%ebx,8),%mm1 - andl $15,%ebp - pxor (%esi,%edx,1),%mm1 - movl %ecx,%edx movd %mm0,%ebx - pxor %mm2,%mm0 - shlb $4,%cl - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%ecx,1),%mm0 - psllq $60,%mm2 - andl $240,%edx - pxor (%eax,%ebp,8),%mm1 - andl $15,%ebx pxor (%esi,%ecx,1),%mm1 - movd %mm0,%ebp - pxor %mm2,%mm0 - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%edx,1),%mm0 - movb 1(%edi),%cl - psllq $60,%mm2 - pxor (%eax,%ebx,8),%mm1 - andl $15,%ebp - pxor (%esi,%edx,1),%mm1 - movl %ecx,%edx - movd %mm0,%ebx pxor %mm2,%mm0 - shlb $4,%cl psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%ecx,1),%mm0 - psllq $60,%mm2 - andl $240,%edx - pxor (%eax,%ebp,8),%mm1 andl $15,%ebx - pxor (%esi,%ecx,1),%mm1 - movd %mm0,%ebp - pxor %mm2,%mm0 - psrlq $4,%mm0 movq %mm1,%mm2 psrlq $4,%mm1 pxor 8(%esi,%edx,1),%mm0 - movb (%edi),%cl psllq $60,%mm2 pxor (%eax,%ebx,8),%mm1 - andl $15,%ebp - pxor (%esi,%edx,1),%mm1 - movl %ecx,%edx movd %mm0,%ebx - pxor %mm2,%mm0 - shlb $4,%cl - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%ecx,1),%mm0 - psllq $60,%mm2 - andl $240,%edx - pxor (%eax,%ebp,8),%mm1 - andl $15,%ebx - pxor (%esi,%ecx,1),%mm1 - movd %mm0,%ebp - pxor %mm2,%mm0 - psrlq $4,%mm0 - movq %mm1,%mm2 - psrlq $4,%mm1 - pxor 8(%esi,%edx,1),%mm0 - psllq $60,%mm2 - pxor (%eax,%ebx,8),%mm1 - andl $15,%ebp pxor (%esi,%edx,1),%mm1 - movd %mm0,%ebx pxor %mm2,%mm0 - movl 4(%eax,%ebp,8),%edi psrlq $32,%mm0 movd %mm1,%edx psrlq $32,%mm1 movd %mm0,%ecx movd %mm1,%ebp - shll $4,%edi bswap %ebx bswap %edx bswap %ecx - xorl %edi,%ebp bswap %ebp - ret -.size _mmx_gmult_4bit_inner,.-_mmx_gmult_4bit_inner -.globl gcm_gmult_4bit_mmx -.type gcm_gmult_4bit_mmx,@function -.align 16 -gcm_gmult_4bit_mmx: -.L_gcm_gmult_4bit_mmx_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%edi - movl 24(%esp),%esi - call .L005pic_point -.L005pic_point: - popl %eax - leal .Lrem_4bit-.L005pic_point(%eax),%eax - movzbl 15(%edi),%ebx - call _mmx_gmult_4bit_inner - movl 20(%esp),%edi emms movl %ebx,12(%edi) movl %edx,4(%edi) @@ -635,61 +311,926 @@ gcm_ghash_4bit_mmx: pushl %ebx pushl %esi pushl %edi - movl 20(%esp),%ebp - movl 24(%esp),%esi - movl 28(%esp),%edi - movl 32(%esp),%ecx - call .L006pic_point -.L006pic_point: - popl %eax - leal .Lrem_4bit-.L006pic_point(%eax),%eax - addl %edi,%ecx - movl %ecx,32(%esp) - subl $20,%esp - movl 12(%ebp),%ebx - movl 4(%ebp),%edx - movl 8(%ebp),%ecx - movl (%ebp),%ebp - jmp .L007mmx_outer_loop + movl 20(%esp),%eax + movl 24(%esp),%ebx + movl 28(%esp),%ecx + movl 32(%esp),%edx + movl %esp,%ebp + call .L008pic_point +.L008pic_point: + popl %esi + leal .Lrem_8bit-.L008pic_point(%esi),%esi + subl $544,%esp + andl $-64,%esp + subl $16,%esp + addl %ecx,%edx + movl %eax,544(%esp) + movl %edx,552(%esp) + movl %ebp,556(%esp) + addl $128,%ebx + leal 144(%esp),%edi + leal 400(%esp),%ebp + movl -120(%ebx),%edx + movq -120(%ebx),%mm0 + movq -128(%ebx),%mm3 + shll $4,%edx + movb %dl,(%esp) + movl -104(%ebx),%edx + movq -104(%ebx),%mm2 + movq -112(%ebx),%mm5 + movq %mm0,-128(%edi) + psrlq $4,%mm0 + movq %mm3,(%edi) + movq %mm3,%mm7 + psrlq $4,%mm3 + shll $4,%edx + movb %dl,1(%esp) + movl -88(%ebx),%edx + movq -88(%ebx),%mm1 + psllq $60,%mm7 + movq -96(%ebx),%mm4 + por %mm7,%mm0 + movq %mm2,-120(%edi) + psrlq $4,%mm2 + movq %mm5,8(%edi) + movq %mm5,%mm6 + movq %mm0,-128(%ebp) + psrlq $4,%mm5 + movq %mm3,(%ebp) + shll $4,%edx + movb %dl,2(%esp) + movl -72(%ebx),%edx + movq -72(%ebx),%mm0 + psllq $60,%mm6 + movq -80(%ebx),%mm3 + por %mm6,%mm2 + movq %mm1,-112(%edi) + psrlq $4,%mm1 + movq %mm4,16(%edi) + movq %mm4,%mm7 + movq %mm2,-120(%ebp) + psrlq $4,%mm4 + movq %mm5,8(%ebp) + shll $4,%edx + movb %dl,3(%esp) + movl -56(%ebx),%edx + movq -56(%ebx),%mm2 + psllq $60,%mm7 + movq -64(%ebx),%mm5 + por %mm7,%mm1 + movq %mm0,-104(%edi) + psrlq $4,%mm0 + movq %mm3,24(%edi) + movq %mm3,%mm6 + movq %mm1,-112(%ebp) + psrlq $4,%mm3 + movq %mm4,16(%ebp) + shll $4,%edx + movb %dl,4(%esp) + movl -40(%ebx),%edx + movq -40(%ebx),%mm1 + psllq $60,%mm6 + movq -48(%ebx),%mm4 + por %mm6,%mm0 + movq %mm2,-96(%edi) + psrlq $4,%mm2 + movq %mm5,32(%edi) + movq %mm5,%mm7 + movq %mm0,-104(%ebp) + psrlq $4,%mm5 + movq %mm3,24(%ebp) + shll $4,%edx + movb %dl,5(%esp) + movl -24(%ebx),%edx + movq -24(%ebx),%mm0 + psllq $60,%mm7 + movq -32(%ebx),%mm3 + por %mm7,%mm2 + movq %mm1,-88(%edi) + psrlq $4,%mm1 + movq %mm4,40(%edi) + movq %mm4,%mm6 + movq %mm2,-96(%ebp) + psrlq $4,%mm4 + movq %mm5,32(%ebp) + shll $4,%edx + movb %dl,6(%esp) + movl -8(%ebx),%edx + movq -8(%ebx),%mm2 + psllq $60,%mm6 + movq -16(%ebx),%mm5 + por %mm6,%mm1 + movq %mm0,-80(%edi) + psrlq $4,%mm0 + movq %mm3,48(%edi) + movq %mm3,%mm7 + movq %mm1,-88(%ebp) + psrlq $4,%mm3 + movq %mm4,40(%ebp) + shll $4,%edx + movb %dl,7(%esp) + movl 8(%ebx),%edx + movq 8(%ebx),%mm1 + psllq $60,%mm7 + movq (%ebx),%mm4 + por %mm7,%mm0 + movq %mm2,-72(%edi) + psrlq $4,%mm2 + movq %mm5,56(%edi) + movq %mm5,%mm6 + movq %mm0,-80(%ebp) + psrlq $4,%mm5 + movq %mm3,48(%ebp) + shll $4,%edx + movb %dl,8(%esp) + movl 24(%ebx),%edx + movq 24(%ebx),%mm0 + psllq $60,%mm6 + movq 16(%ebx),%mm3 + por %mm6,%mm2 + movq %mm1,-64(%edi) + psrlq $4,%mm1 + movq %mm4,64(%edi) + movq %mm4,%mm7 + movq %mm2,-72(%ebp) + psrlq $4,%mm4 + movq %mm5,56(%ebp) + shll $4,%edx + movb %dl,9(%esp) + movl 40(%ebx),%edx + movq 40(%ebx),%mm2 + psllq $60,%mm7 + movq 32(%ebx),%mm5 + por %mm7,%mm1 + movq %mm0,-56(%edi) + psrlq $4,%mm0 + movq %mm3,72(%edi) + movq %mm3,%mm6 + movq %mm1,-64(%ebp) + psrlq $4,%mm3 + movq %mm4,64(%ebp) + shll $4,%edx + movb %dl,10(%esp) + movl 56(%ebx),%edx + movq 56(%ebx),%mm1 + psllq $60,%mm6 + movq 48(%ebx),%mm4 + por %mm6,%mm0 + movq %mm2,-48(%edi) + psrlq $4,%mm2 + movq %mm5,80(%edi) + movq %mm5,%mm7 + movq %mm0,-56(%ebp) + psrlq $4,%mm5 + movq %mm3,72(%ebp) + shll $4,%edx + movb %dl,11(%esp) + movl 72(%ebx),%edx + movq 72(%ebx),%mm0 + psllq $60,%mm7 + movq 64(%ebx),%mm3 + por %mm7,%mm2 + movq %mm1,-40(%edi) + psrlq $4,%mm1 + movq %mm4,88(%edi) + movq %mm4,%mm6 + movq %mm2,-48(%ebp) + psrlq $4,%mm4 + movq %mm5,80(%ebp) + shll $4,%edx + movb %dl,12(%esp) + movl 88(%ebx),%edx + movq 88(%ebx),%mm2 + psllq $60,%mm6 + movq 80(%ebx),%mm5 + por %mm6,%mm1 + movq %mm0,-32(%edi) + psrlq $4,%mm0 + movq %mm3,96(%edi) + movq %mm3,%mm7 + movq %mm1,-40(%ebp) + psrlq $4,%mm3 + movq %mm4,88(%ebp) + shll $4,%edx + movb %dl,13(%esp) + movl 104(%ebx),%edx + movq 104(%ebx),%mm1 + psllq $60,%mm7 + movq 96(%ebx),%mm4 + por %mm7,%mm0 + movq %mm2,-24(%edi) + psrlq $4,%mm2 + movq %mm5,104(%edi) + movq %mm5,%mm6 + movq %mm0,-32(%ebp) + psrlq $4,%mm5 + movq %mm3,96(%ebp) + shll $4,%edx + movb %dl,14(%esp) + movl 120(%ebx),%edx + movq 120(%ebx),%mm0 + psllq $60,%mm6 + movq 112(%ebx),%mm3 + por %mm6,%mm2 + movq %mm1,-16(%edi) + psrlq $4,%mm1 + movq %mm4,112(%edi) + movq %mm4,%mm7 + movq %mm2,-24(%ebp) + psrlq $4,%mm4 + movq %mm5,104(%ebp) + shll $4,%edx + movb %dl,15(%esp) + psllq $60,%mm7 + por %mm7,%mm1 + movq %mm0,-8(%edi) + psrlq $4,%mm0 + movq %mm3,120(%edi) + movq %mm3,%mm6 + movq %mm1,-16(%ebp) + psrlq $4,%mm3 + movq %mm4,112(%ebp) + psllq $60,%mm6 + por %mm6,%mm0 + movq %mm0,-8(%ebp) + movq %mm3,120(%ebp) + movq (%eax),%mm6 + movl 8(%eax),%ebx + movl 12(%eax),%edx .align 16 -.L007mmx_outer_loop: - xorl 12(%edi),%ebx - xorl 4(%edi),%edx - xorl 8(%edi),%ecx - xorl (%edi),%ebp - movl %edi,48(%esp) - movl %ebx,12(%esp) - movl %edx,4(%esp) - movl %ecx,8(%esp) - movl %ebp,(%esp) - movl %esp,%edi - shrl $24,%ebx - call _mmx_gmult_4bit_inner - movl 48(%esp),%edi - leal 16(%edi),%edi - cmpl 52(%esp),%edi - jb .L007mmx_outer_loop - movl 40(%esp),%edi +.L009outer: + xorl 12(%ecx),%edx + xorl 8(%ecx),%ebx + pxor (%ecx),%mm6 + leal 16(%ecx),%ecx + movl %ebx,536(%esp) + movq %mm6,528(%esp) + movl %ecx,548(%esp) + xorl %eax,%eax + roll $8,%edx + movb %dl,%al + movl %eax,%ebp + andb $15,%al + shrl $4,%ebp + pxor %mm0,%mm0 + roll $8,%edx + pxor %mm1,%mm1 + pxor %mm2,%mm2 + movq 16(%esp,%eax,8),%mm7 + movq 144(%esp,%eax,8),%mm6 + movb %dl,%al + movd %mm7,%ebx + psrlq $8,%mm7 + movq %mm6,%mm3 + movl %eax,%edi + psrlq $8,%mm6 + pxor 272(%esp,%ebp,8),%mm7 + andb $15,%al + psllq $56,%mm3 + shrl $4,%edi + pxor 16(%esp,%eax,8),%mm7 + roll $8,%edx + pxor 144(%esp,%eax,8),%mm6 + pxor %mm3,%mm7 + pxor 400(%esp,%ebp,8),%mm6 + xorb (%esp,%ebp,1),%bl + movb %dl,%al + movd %mm7,%ecx + movzbl %bl,%ebx + psrlq $8,%mm7 + movq %mm6,%mm3 + movl %eax,%ebp + psrlq $8,%mm6 + pxor 272(%esp,%edi,8),%mm7 + andb $15,%al + psllq $56,%mm3 + shrl $4,%ebp + pinsrw $2,(%esi,%ebx,2),%mm2 + pxor 16(%esp,%eax,8),%mm7 + roll $8,%edx + pxor 144(%esp,%eax,8),%mm6 + pxor %mm3,%mm7 + pxor 400(%esp,%edi,8),%mm6 + xorb (%esp,%edi,1),%cl + movb %dl,%al + movl 536(%esp),%edx + movd %mm7,%ebx + movzbl %cl,%ecx + psrlq $8,%mm7 + movq %mm6,%mm3 + movl %eax,%edi + psrlq $8,%mm6 + pxor 272(%esp,%ebp,8),%mm7 + andb $15,%al + psllq $56,%mm3 + pxor %mm2,%mm6 + shrl $4,%edi + pinsrw $2,(%esi,%ecx,2),%mm1 + pxor 16(%esp,%eax,8),%mm7 + roll $8,%edx + pxor 144(%esp,%eax,8),%mm6 + pxor %mm3,%mm7 + pxor 400(%esp,%ebp,8),%mm6 + xorb (%esp,%ebp,1),%bl + movb %dl,%al + movd %mm7,%ecx + movzbl %bl,%ebx + psrlq $8,%mm7 + movq %mm6,%mm3 + movl %eax,%ebp + psrlq $8,%mm6 + pxor 272(%esp,%edi,8),%mm7 + andb $15,%al + psllq $56,%mm3 + pxor %mm1,%mm6 + shrl $4,%ebp + pinsrw $2,(%esi,%ebx,2),%mm0 + pxor 16(%esp,%eax,8),%mm7 + roll $8,%edx + pxor 144(%esp,%eax,8),%mm6 + pxor %mm3,%mm7 + pxor 400(%esp,%edi,8),%mm6 + xorb (%esp,%edi,1),%cl + movb %dl,%al + movd %mm7,%ebx + movzbl %cl,%ecx + psrlq $8,%mm7 + movq %mm6,%mm3 + movl %eax,%edi + psrlq $8,%mm6 + pxor 272(%esp,%ebp,8),%mm7 + andb $15,%al + psllq $56,%mm3 + pxor %mm0,%mm6 + shrl $4,%edi + pinsrw $2,(%esi,%ecx,2),%mm2 + pxor 16(%esp,%eax,8),%mm7 + roll $8,%edx + pxor 144(%esp,%eax,8),%mm6 + pxor %mm3,%mm7 + pxor 400(%esp,%ebp,8),%mm6 + xorb (%esp,%ebp,1),%bl + movb %dl,%al + movd %mm7,%ecx + movzbl %bl,%ebx + psrlq $8,%mm7 + movq %mm6,%mm3 + movl %eax,%ebp + psrlq $8,%mm6 + pxor 272(%esp,%edi,8),%mm7 + andb $15,%al + psllq $56,%mm3 + pxor %mm2,%mm6 + shrl $4,%ebp + pinsrw $2,(%esi,%ebx,2),%mm1 + pxor 16(%esp,%eax,8),%mm7 + roll $8,%edx + pxor 144(%esp,%eax,8),%mm6 + pxor %mm3,%mm7 + pxor 400(%esp,%edi,8),%mm6 + xorb (%esp,%edi,1),%cl + movb %dl,%al + movl 532(%esp),%edx + movd %mm7,%ebx + movzbl %cl,%ecx + psrlq $8,%mm7 + movq %mm6,%mm3 + movl %eax,%edi + psrlq $8,%mm6 + pxor 272(%esp,%ebp,8),%mm7 + andb $15,%al + psllq $56,%mm3 + pxor %mm1,%mm6 + shrl $4,%edi + pinsrw $2,(%esi,%ecx,2),%mm0 + pxor 16(%esp,%eax,8),%mm7 + roll $8,%edx + pxor 144(%esp,%eax,8),%mm6 + pxor %mm3,%mm7 + pxor 400(%esp,%ebp,8),%mm6 + xorb (%esp,%ebp,1),%bl + movb %dl,%al + movd %mm7,%ecx + movzbl %bl,%ebx + psrlq $8,%mm7 + movq %mm6,%mm3 + movl %eax,%ebp + psrlq $8,%mm6 + pxor 272(%esp,%edi,8),%mm7 + andb $15,%al + psllq $56,%mm3 + pxor %mm0,%mm6 + shrl $4,%ebp + pinsrw $2,(%esi,%ebx,2),%mm2 + pxor 16(%esp,%eax,8),%mm7 + roll $8,%edx + pxor 144(%esp,%eax,8),%mm6 + pxor %mm3,%mm7 + pxor 400(%esp,%edi,8),%mm6 + xorb (%esp,%edi,1),%cl + movb %dl,%al + movd %mm7,%ebx + movzbl %cl,%ecx + psrlq $8,%mm7 + movq %mm6,%mm3 + movl %eax,%edi + psrlq $8,%mm6 + pxor 272(%esp,%ebp,8),%mm7 + andb $15,%al + psllq $56,%mm3 + pxor %mm2,%mm6 + shrl $4,%edi + pinsrw $2,(%esi,%ecx,2),%mm1 + pxor 16(%esp,%eax,8),%mm7 + roll $8,%edx + pxor 144(%esp,%eax,8),%mm6 + pxor %mm3,%mm7 + pxor 400(%esp,%ebp,8),%mm6 + xorb (%esp,%ebp,1),%bl + movb %dl,%al + movd %mm7,%ecx + movzbl %bl,%ebx + psrlq $8,%mm7 + movq %mm6,%mm3 + movl %eax,%ebp + psrlq $8,%mm6 + pxor 272(%esp,%edi,8),%mm7 + andb $15,%al + psllq $56,%mm3 + pxor %mm1,%mm6 + shrl $4,%ebp + pinsrw $2,(%esi,%ebx,2),%mm0 + pxor 16(%esp,%eax,8),%mm7 + roll $8,%edx + pxor 144(%esp,%eax,8),%mm6 + pxor %mm3,%mm7 + pxor 400(%esp,%edi,8),%mm6 + xorb (%esp,%edi,1),%cl + movb %dl,%al + movl 528(%esp),%edx + movd %mm7,%ebx + movzbl %cl,%ecx + psrlq $8,%mm7 + movq %mm6,%mm3 + movl %eax,%edi + psrlq $8,%mm6 + pxor 272(%esp,%ebp,8),%mm7 + andb $15,%al + psllq $56,%mm3 + pxor %mm0,%mm6 + shrl $4,%edi + pinsrw $2,(%esi,%ecx,2),%mm2 + pxor 16(%esp,%eax,8),%mm7 + roll $8,%edx + pxor 144(%esp,%eax,8),%mm6 + pxor %mm3,%mm7 + pxor 400(%esp,%ebp,8),%mm6 + xorb (%esp,%ebp,1),%bl + movb %dl,%al + movd %mm7,%ecx + movzbl %bl,%ebx + psrlq $8,%mm7 + movq %mm6,%mm3 + movl %eax,%ebp + psrlq $8,%mm6 + pxor 272(%esp,%edi,8),%mm7 + andb $15,%al + psllq $56,%mm3 + pxor %mm2,%mm6 + shrl $4,%ebp + pinsrw $2,(%esi,%ebx,2),%mm1 + pxor 16(%esp,%eax,8),%mm7 + roll $8,%edx + pxor 144(%esp,%eax,8),%mm6 + pxor %mm3,%mm7 + pxor 400(%esp,%edi,8),%mm6 + xorb (%esp,%edi,1),%cl + movb %dl,%al + movd %mm7,%ebx + movzbl %cl,%ecx + psrlq $8,%mm7 + movq %mm6,%mm3 + movl %eax,%edi + psrlq $8,%mm6 + pxor 272(%esp,%ebp,8),%mm7 + andb $15,%al + psllq $56,%mm3 + pxor %mm1,%mm6 + shrl $4,%edi + pinsrw $2,(%esi,%ecx,2),%mm0 + pxor 16(%esp,%eax,8),%mm7 + roll $8,%edx + pxor 144(%esp,%eax,8),%mm6 + pxor %mm3,%mm7 + pxor 400(%esp,%ebp,8),%mm6 + xorb (%esp,%ebp,1),%bl + movb %dl,%al + movd %mm7,%ecx + movzbl %bl,%ebx + psrlq $8,%mm7 + movq %mm6,%mm3 + movl %eax,%ebp + psrlq $8,%mm6 + pxor 272(%esp,%edi,8),%mm7 + andb $15,%al + psllq $56,%mm3 + pxor %mm0,%mm6 + shrl $4,%ebp + pinsrw $2,(%esi,%ebx,2),%mm2 + pxor 16(%esp,%eax,8),%mm7 + roll $8,%edx + pxor 144(%esp,%eax,8),%mm6 + pxor %mm3,%mm7 + pxor 400(%esp,%edi,8),%mm6 + xorb (%esp,%edi,1),%cl + movb %dl,%al + movl 524(%esp),%edx + movd %mm7,%ebx + movzbl %cl,%ecx + psrlq $8,%mm7 + movq %mm6,%mm3 + movl %eax,%edi + psrlq $8,%mm6 + pxor 272(%esp,%ebp,8),%mm7 + andb $15,%al + psllq $56,%mm3 + pxor %mm2,%mm6 + shrl $4,%edi + pinsrw $2,(%esi,%ecx,2),%mm1 + pxor 16(%esp,%eax,8),%mm7 + pxor 144(%esp,%eax,8),%mm6 + xorb (%esp,%ebp,1),%bl + pxor %mm3,%mm7 + pxor 400(%esp,%ebp,8),%mm6 + movzbl %bl,%ebx + pxor %mm2,%mm2 + psllq $4,%mm1 + movd %mm7,%ecx + psrlq $4,%mm7 + movq %mm6,%mm3 + psrlq $4,%mm6 + shll $4,%ecx + pxor 16(%esp,%edi,8),%mm7 + psllq $60,%mm3 + movzbl %cl,%ecx + pxor %mm3,%mm7 + pxor 144(%esp,%edi,8),%mm6 + pinsrw $2,(%esi,%ebx,2),%mm0 + pxor %mm1,%mm6 + movd %mm7,%edx + pinsrw $3,(%esi,%ecx,2),%mm2 + psllq $12,%mm0 + pxor %mm0,%mm6 + psrlq $32,%mm7 + pxor %mm2,%mm6 + movl 548(%esp),%ecx + movd %mm7,%ebx + movq %mm6,%mm3 + psllw $8,%mm6 + psrlw $8,%mm3 + por %mm3,%mm6 + bswap %edx + pshufw $27,%mm6,%mm6 + bswap %ebx + cmpl 552(%esp),%ecx + jne .L009outer + movl 544(%esp),%eax + movl %edx,12(%eax) + movl %ebx,8(%eax) + movq %mm6,(%eax) + movl 556(%esp),%esp emms - movl %ebx,12(%edi) - movl %edx,4(%edi) - movl %ecx,8(%edi) - movl %ebp,(%edi) - addl $20,%esp popl %edi popl %esi popl %ebx popl %ebp ret .size gcm_ghash_4bit_mmx,.-.L_gcm_ghash_4bit_mmx_begin +.globl gcm_init_clmul +.type gcm_init_clmul,@function +.align 16 +gcm_init_clmul: +.L_gcm_init_clmul_begin: + movl 4(%esp),%edx + movl 8(%esp),%eax + call .L010pic +.L010pic: + popl %ecx + leal .Lbswap-.L010pic(%ecx),%ecx + movdqu (%eax),%xmm2 + pshufd $78,%xmm2,%xmm2 + pshufd $255,%xmm2,%xmm4 + movdqa %xmm2,%xmm3 + psllq $1,%xmm2 + pxor %xmm5,%xmm5 + psrlq $63,%xmm3 + pcmpgtd %xmm4,%xmm5 + pslldq $8,%xmm3 + por %xmm3,%xmm2 + pand 16(%ecx),%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm2,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $5,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm4 + pslldq $8,%xmm0 + psrldq $8,%xmm4 + pxor %xmm3,%xmm0 + pxor %xmm4,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm0 + movdqu %xmm2,(%edx) + movdqu %xmm0,16(%edx) + ret +.size gcm_init_clmul,.-.L_gcm_init_clmul_begin +.globl gcm_gmult_clmul +.type gcm_gmult_clmul,@function +.align 16 +gcm_gmult_clmul: +.L_gcm_gmult_clmul_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + call .L011pic +.L011pic: + popl %ecx + leal .Lbswap-.L011pic(%ecx),%ecx + movdqu (%eax),%xmm0 + movdqa (%ecx),%xmm5 + movups (%edx),%xmm2 +.byte 102,15,56,0,197 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $5,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm4 + pslldq $8,%xmm0 + psrldq $8,%xmm4 + pxor %xmm3,%xmm0 + pxor %xmm4,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,197 + movdqu %xmm0,(%eax) + ret +.size gcm_gmult_clmul,.-.L_gcm_gmult_clmul_begin +.globl gcm_ghash_clmul +.type gcm_ghash_clmul,@function +.align 16 +gcm_ghash_clmul: +.L_gcm_ghash_clmul_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%eax + movl 24(%esp),%edx + movl 28(%esp),%esi + movl 32(%esp),%ebx + call .L012pic +.L012pic: + popl %ecx + leal .Lbswap-.L012pic(%ecx),%ecx + movdqu (%eax),%xmm0 + movdqa (%ecx),%xmm5 + movdqu (%edx),%xmm2 +.byte 102,15,56,0,197 + subl $16,%ebx + jz .L013odd_tail + movdqu (%esi),%xmm3 + movdqu 16(%esi),%xmm6 +.byte 102,15,56,0,221 +.byte 102,15,56,0,245 + pxor %xmm3,%xmm0 + movdqa %xmm6,%xmm7 + pshufd $78,%xmm6,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm6,%xmm3 + pxor %xmm2,%xmm4 +.byte 102,15,58,68,242,0 +.byte 102,15,58,68,250,17 +.byte 102,15,58,68,220,0 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm7 + pxor %xmm4,%xmm6 + movups 16(%edx),%xmm2 + leal 32(%esi),%esi + subl $32,%ebx + jbe .L014even_tail +.L015mod_loop: + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqu (%esi),%xmm3 + movups (%edx),%xmm2 + pxor %xmm6,%xmm0 + pxor %xmm7,%xmm1 + movdqu 16(%esi),%xmm6 +.byte 102,15,56,0,221 +.byte 102,15,56,0,245 + movdqa %xmm6,%xmm5 + movdqa %xmm6,%xmm7 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $5,%xmm0 + pxor %xmm3,%xmm0 +.byte 102,15,58,68,242,0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm4 + pslldq $8,%xmm0 + psrldq $8,%xmm4 + pxor %xmm3,%xmm0 + pshufd $78,%xmm5,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm5,%xmm3 + pshufd $78,%xmm2,%xmm5 + pxor %xmm2,%xmm5 +.byte 102,15,58,68,250,17 + movdqa %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm0 +.byte 102,15,58,68,221,0 + movups 16(%edx),%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm3 + movdqa %xmm3,%xmm5 + psrldq $8,%xmm3 + pslldq $8,%xmm5 + pxor %xmm3,%xmm7 + pxor %xmm5,%xmm6 + movdqa (%ecx),%xmm5 + leal 32(%esi),%esi + subl $32,%ebx + ja .L015mod_loop +.L014even_tail: + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm6,%xmm0 + pxor %xmm7,%xmm1 + movdqa %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $5,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm4 + pslldq $8,%xmm0 + psrldq $8,%xmm4 + pxor %xmm3,%xmm0 + pxor %xmm4,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm0 + testl %ebx,%ebx + jnz .L016done + movups (%edx),%xmm2 +.L013odd_tail: + movdqu (%esi),%xmm3 +.byte 102,15,56,0,221 + pxor %xmm3,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $5,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm4 + pslldq $8,%xmm0 + psrldq $8,%xmm4 + pxor %xmm3,%xmm0 + pxor %xmm4,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm0 +.L016done: +.byte 102,15,56,0,197 + movdqu %xmm0,(%eax) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size gcm_ghash_clmul,.-.L_gcm_ghash_clmul_begin +.align 64 +.Lbswap: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 .align 64 .Lrem_4bit: -.long 0,0,0,29491200,0,58982400,0,38141952 -.long 0,117964800,0,113901568,0,76283904,0,88997888 -.long 0,235929600,0,265420800,0,227803136,0,206962688 -.long 0,152567808,0,148504576,0,177995776,0,190709760 +.long 0,0,0,471859200,0,943718400,0,610271232 +.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208 +.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008 +.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160 .align 64 -.L008rem_8bit: +.Lrem_8bit: .value 0,450,900,582,1800,1738,1164,1358 .value 3600,4050,3476,3158,2328,2266,2716,2910 .value 7200,7650,8100,7782,6952,6890,6316,6510 Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha512-586.S diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha512-586.S:1.2 src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha512-586.S:1.3 --- src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha512-586.S:1.2 Fri Jul 27 19:34:15 2012 +++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha512-586.S Sat May 16 17:32:54 2015 @@ -25,6 +25,278 @@ sha512_block_data_order: movl %edi,4(%esp) movl %eax,8(%esp) movl %ebx,12(%esp) + leal _GLOBAL_OFFSET_TABLE_+[.-.L001K512](%ebp),%edx + movl OPENSSL_ia32cap_P@GOT(%edx),%edx + btl $26,(%edx) + jnc .L002loop_x86 + movq (%esi),%mm0 + movq 8(%esi),%mm1 + movq 16(%esi),%mm2 + movq 24(%esi),%mm3 + movq 32(%esi),%mm4 + movq 40(%esi),%mm5 + movq 48(%esi),%mm6 + movq 56(%esi),%mm7 + subl $80,%esp +.align 16 +.L003loop_sse2: + movq %mm1,8(%esp) + movq %mm2,16(%esp) + movq %mm3,24(%esp) + movq %mm5,40(%esp) + movq %mm6,48(%esp) + movq %mm7,56(%esp) + movl (%edi),%ecx + movl 4(%edi),%edx + addl $8,%edi + bswap %ecx + bswap %edx + movl %ecx,76(%esp) + movl %edx,72(%esp) +.align 16 +.L00400_14_sse2: + movl (%edi),%eax + movl 4(%edi),%ebx + addl $8,%edi + bswap %eax + bswap %ebx + movl %eax,68(%esp) + movl %ebx,64(%esp) + movq 40(%esp),%mm5 + movq 48(%esp),%mm6 + movq 56(%esp),%mm7 + movq %mm4,%mm1 + movq %mm4,%mm2 + psrlq $14,%mm1 + movq %mm4,32(%esp) + psllq $23,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm2,%mm3 + psllq $23,%mm2 + pxor %mm1,%mm3 + psrlq $23,%mm1 + pxor %mm2,%mm3 + psllq $4,%mm2 + pxor %mm1,%mm3 + paddq (%ebp),%mm7 + pxor %mm2,%mm3 + pxor %mm6,%mm5 + movq 8(%esp),%mm1 + pand %mm4,%mm5 + movq 16(%esp),%mm2 + pxor %mm6,%mm5 + movq 24(%esp),%mm4 + paddq %mm5,%mm3 + movq %mm0,(%esp) + paddq %mm7,%mm3 + movq %mm0,%mm5 + movq %mm0,%mm6 + paddq 72(%esp),%mm3 + psrlq $28,%mm5 + paddq %mm3,%mm4 + psllq $25,%mm6 + movq %mm5,%mm7 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + psrlq $5,%mm5 + pxor %mm6,%mm7 + psllq $6,%mm6 + pxor %mm5,%mm7 + subl $8,%esp + pxor %mm6,%mm7 + movq %mm0,%mm5 + por %mm2,%mm0 + pand %mm2,%mm5 + pand %mm1,%mm0 + por %mm0,%mm5 + paddq %mm5,%mm7 + movq %mm3,%mm0 + movb (%ebp),%dl + paddq %mm7,%mm0 + addl $8,%ebp + cmpb $53,%dl + jne .L00400_14_sse2 + movq 40(%esp),%mm5 + movq 48(%esp),%mm6 + movq 56(%esp),%mm7 + movq %mm4,%mm1 + movq %mm4,%mm2 + psrlq $14,%mm1 + movq %mm4,32(%esp) + psllq $23,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm2,%mm3 + psllq $23,%mm2 + pxor %mm1,%mm3 + psrlq $23,%mm1 + pxor %mm2,%mm3 + psllq $4,%mm2 + pxor %mm1,%mm3 + paddq (%ebp),%mm7 + pxor %mm2,%mm3 + pxor %mm6,%mm5 + movq 8(%esp),%mm1 + pand %mm4,%mm5 + movq 16(%esp),%mm2 + pxor %mm6,%mm5 + movq 24(%esp),%mm4 + paddq %mm5,%mm3 + movq %mm0,(%esp) + paddq %mm7,%mm3 + movq %mm0,%mm5 + movq %mm0,%mm6 + paddq 72(%esp),%mm3 + psrlq $28,%mm5 + paddq %mm3,%mm4 + psllq $25,%mm6 + movq %mm5,%mm7 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + psrlq $5,%mm5 + pxor %mm6,%mm7 + psllq $6,%mm6 + pxor %mm5,%mm7 + subl $8,%esp + pxor %mm6,%mm7 + movq %mm0,%mm5 + por %mm2,%mm0 + movq 88(%esp),%mm6 + pand %mm2,%mm5 + pand %mm1,%mm0 + movq 192(%esp),%mm2 + por %mm0,%mm5 + paddq %mm5,%mm7 + movq %mm3,%mm0 + movb (%ebp),%dl + paddq %mm7,%mm0 + addl $8,%ebp +.align 16 +.L00516_79_sse2: + movq %mm2,%mm1 + psrlq $1,%mm2 + movq %mm6,%mm7 + psrlq $6,%mm6 + movq %mm2,%mm3 + psrlq $6,%mm2 + movq %mm6,%mm5 + psrlq $13,%mm6 + pxor %mm2,%mm3 + psrlq $1,%mm2 + pxor %mm6,%mm5 + psrlq $42,%mm6 + pxor %mm2,%mm3 + movq 200(%esp),%mm2 + psllq $56,%mm1 + pxor %mm6,%mm5 + psllq $3,%mm7 + pxor %mm1,%mm3 + paddq 128(%esp),%mm2 + psllq $7,%mm1 + pxor %mm7,%mm5 + psllq $42,%mm7 + pxor %mm1,%mm3 + pxor %mm7,%mm5 + paddq %mm5,%mm3 + paddq %mm2,%mm3 + movq %mm3,72(%esp) + movq 40(%esp),%mm5 + movq 48(%esp),%mm6 + movq 56(%esp),%mm7 + movq %mm4,%mm1 + movq %mm4,%mm2 + psrlq $14,%mm1 + movq %mm4,32(%esp) + psllq $23,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm2,%mm3 + psllq $23,%mm2 + pxor %mm1,%mm3 + psrlq $23,%mm1 + pxor %mm2,%mm3 + psllq $4,%mm2 + pxor %mm1,%mm3 + paddq (%ebp),%mm7 + pxor %mm2,%mm3 + pxor %mm6,%mm5 + movq 8(%esp),%mm1 + pand %mm4,%mm5 + movq 16(%esp),%mm2 + pxor %mm6,%mm5 + movq 24(%esp),%mm4 + paddq %mm5,%mm3 + movq %mm0,(%esp) + paddq %mm7,%mm3 + movq %mm0,%mm5 + movq %mm0,%mm6 + paddq 72(%esp),%mm3 + psrlq $28,%mm5 + paddq %mm3,%mm4 + psllq $25,%mm6 + movq %mm5,%mm7 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + psrlq $5,%mm5 + pxor %mm6,%mm7 + psllq $6,%mm6 + pxor %mm5,%mm7 + subl $8,%esp + pxor %mm6,%mm7 + movq %mm0,%mm5 + por %mm2,%mm0 + movq 88(%esp),%mm6 + pand %mm2,%mm5 + pand %mm1,%mm0 + movq 192(%esp),%mm2 + por %mm0,%mm5 + paddq %mm5,%mm7 + movq %mm3,%mm0 + movb (%ebp),%dl + paddq %mm7,%mm0 + addl $8,%ebp + cmpb $23,%dl + jne .L00516_79_sse2 + movq 8(%esp),%mm1 + movq 16(%esp),%mm2 + movq 24(%esp),%mm3 + movq 40(%esp),%mm5 + movq 48(%esp),%mm6 + movq 56(%esp),%mm7 + paddq (%esi),%mm0 + paddq 8(%esi),%mm1 + paddq 16(%esi),%mm2 + paddq 24(%esi),%mm3 + paddq 32(%esi),%mm4 + paddq 40(%esi),%mm5 + paddq 48(%esi),%mm6 + paddq 56(%esi),%mm7 + movq %mm0,(%esi) + movq %mm1,8(%esi) + movq %mm2,16(%esi) + movq %mm3,24(%esi) + movq %mm4,32(%esi) + movq %mm5,40(%esi) + movq %mm6,48(%esi) + movq %mm7,56(%esi) + addl $640,%esp + subl $640,%ebp + cmpl 88(%esp),%edi + jb .L003loop_sse2 + emms + movl 92(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret .align 16 .L002loop_x86: movl (%edi),%eax @@ -130,7 +402,7 @@ sha512_block_data_order: movl $16,%ecx .long 2784229001 .align 16 -.L00300_15_x86: +.L00600_15_x86: movl 40(%esp),%ecx movl 44(%esp),%edx movl %ecx,%esi @@ -237,9 +509,9 @@ sha512_block_data_order: subl $8,%esp leal 8(%ebp),%ebp cmpb $148,%dl - jne .L00300_15_x86 + jne .L00600_15_x86 .align 16 -.L00416_79_x86: +.L00716_79_x86: movl 312(%esp),%ecx movl 316(%esp),%edx movl %ecx,%esi @@ -412,7 +684,7 @@ sha512_block_data_order: subl $8,%esp leal 8(%ebp),%ebp cmpb $23,%dl - jne .L00416_79_x86 + jne .L00716_79_x86 movl 840(%esp),%esi movl 844(%esp),%edi movl (%esi),%eax @@ -561,3 +833,4 @@ sha512_block_data_order: .byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 +.comm OPENSSL_ia32cap_P,8,4 Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha1-586.S diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha1-586.S:1.3 src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha1-586.S:1.4 --- src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha1-586.S:1.3 Fri Jul 27 19:34:15 2012 +++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha1-586.S Sat May 16 17:32:54 2015 @@ -9,6 +9,21 @@ sha1_block_data_order: pushl %ebx pushl %esi pushl %edi + call .L000pic_point +.L000pic_point: + popl %ebp + leal _GLOBAL_OFFSET_TABLE_+[.-.L000pic_point](%ebp),%esi + movl OPENSSL_ia32cap_P@GOT(%esi),%esi + leal .LK_XX_XX-.L000pic_point(%ebp),%ebp + movl (%esi),%eax + movl 4(%esi),%edx + testl $512,%edx + jz .L001x86 + testl $16777216,%eax + jz .L001x86 + jmp .Lssse3_shortcut +.align 16 +.L001x86: movl 20(%esp),%ebp movl 24(%esp),%esi movl 28(%esp),%eax @@ -17,9 +32,9 @@ sha1_block_data_order: addl %esi,%eax movl %eax,104(%esp) movl 16(%ebp),%edi - jmp .L000loop + jmp .L002loop .align 16 -.L000loop: +.L002loop: movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx @@ -1366,7 +1381,7 @@ sha1_block_data_order: movl %ebx,12(%ebp) movl %edx,%esi movl %ecx,16(%ebp) - jb .L000loop + jb .L002loop addl $76,%esp popl %edi popl %esi @@ -1374,7 +1389,1251 @@ sha1_block_data_order: popl %ebp ret .size sha1_block_data_order,.-.L_sha1_block_data_order_begin +.type _sha1_block_data_order_ssse3,@function +.align 16 +_sha1_block_data_order_ssse3: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call .L003pic_point +.L003pic_point: + popl %ebp + leal .LK_XX_XX-.L003pic_point(%ebp),%ebp +.Lssse3_shortcut: + movdqa (%ebp),%xmm7 + movdqa 16(%ebp),%xmm0 + movdqa 32(%ebp),%xmm1 + movdqa 48(%ebp),%xmm2 + movdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp + movl 28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + movdqa %xmm0,112(%esp) + movdqa %xmm1,128(%esp) + movdqa %xmm2,144(%esp) + shll $6,%edx + movdqa %xmm7,160(%esp) + addl %ebp,%edx + movdqa %xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + movdqu -64(%ebp),%xmm0 + movdqu -48(%ebp),%xmm1 + movdqu -32(%ebp),%xmm2 + movdqu -16(%ebp),%xmm3 +.byte 102,15,56,0,198 +.byte 102,15,56,0,206 +.byte 102,15,56,0,214 + movdqa %xmm7,96(%esp) +.byte 102,15,56,0,222 + paddd %xmm7,%xmm0 + paddd %xmm7,%xmm1 + paddd %xmm7,%xmm2 + movdqa %xmm0,(%esp) + psubd %xmm7,%xmm0 + movdqa %xmm1,16(%esp) + psubd %xmm7,%xmm1 + movdqa %xmm2,32(%esp) + psubd %xmm7,%xmm2 + movdqa %xmm1,%xmm4 + jmp .L004loop +.align 16 +.L004loop: + addl (%esp),%edi + xorl %edx,%ecx +.byte 102,15,58,15,224,8 + movdqa %xmm3,%xmm6 + movl %eax,%ebp + roll $5,%eax + paddd %xmm3,%xmm7 + movdqa %xmm0,64(%esp) + andl %ecx,%esi + xorl %edx,%ecx + psrldq $4,%xmm6 + xorl %edx,%esi + addl %eax,%edi + pxor %xmm0,%xmm4 + rorl $2,%ebx + addl %esi,%edi + pxor %xmm2,%xmm6 + addl 4(%esp),%edx + xorl %ecx,%ebx + movl %edi,%esi + roll $5,%edi + pxor %xmm6,%xmm4 + andl %ebx,%ebp + xorl %ecx,%ebx + movdqa %xmm7,48(%esp) + xorl %ecx,%ebp + addl %edi,%edx + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm6 + rorl $7,%eax + addl %ebp,%edx + addl 8(%esp),%ecx + xorl %ebx,%eax + pslldq $12,%xmm0 + paddd %xmm4,%xmm4 + movl %edx,%ebp + roll $5,%edx + andl %eax,%esi + xorl %ebx,%eax + psrld $31,%xmm6 + xorl %ebx,%esi + addl %edx,%ecx + movdqa %xmm0,%xmm7 + rorl $7,%edi + addl %esi,%ecx + psrld $30,%xmm0 + por %xmm6,%xmm4 + addl 12(%esp),%ebx + xorl %eax,%edi + movl %ecx,%esi + roll $5,%ecx + pslld $2,%xmm7 + pxor %xmm0,%xmm4 + andl %edi,%ebp + xorl %eax,%edi + movdqa 96(%esp),%xmm0 + xorl %eax,%ebp + addl %ecx,%ebx + pxor %xmm7,%xmm4 + movdqa %xmm2,%xmm5 + rorl $7,%edx + addl %ebp,%ebx + addl 16(%esp),%eax + xorl %edi,%edx +.byte 102,15,58,15,233,8 + movdqa %xmm4,%xmm7 + movl %ebx,%ebp + roll $5,%ebx + paddd %xmm4,%xmm0 + movdqa %xmm1,80(%esp) + andl %edx,%esi + xorl %edi,%edx + psrldq $4,%xmm7 + xorl %edi,%esi + addl %ebx,%eax + pxor %xmm1,%xmm5 + rorl $7,%ecx + addl %esi,%eax + pxor %xmm3,%xmm7 + addl 20(%esp),%edi + xorl %edx,%ecx + movl %eax,%esi + roll $5,%eax + pxor %xmm7,%xmm5 + andl %ecx,%ebp + xorl %edx,%ecx + movdqa %xmm0,(%esp) + xorl %edx,%ebp + addl %eax,%edi + movdqa %xmm5,%xmm1 + movdqa %xmm5,%xmm7 + rorl $7,%ebx + addl %ebp,%edi + addl 24(%esp),%edx + xorl %ecx,%ebx + pslldq $12,%xmm1 + paddd %xmm5,%xmm5 + movl %edi,%ebp + roll $5,%edi + andl %ebx,%esi + xorl %ecx,%ebx + psrld $31,%xmm7 + xorl %ecx,%esi + addl %edi,%edx + movdqa %xmm1,%xmm0 + rorl $7,%eax + addl %esi,%edx + psrld $30,%xmm1 + por %xmm7,%xmm5 + addl 28(%esp),%ecx + xorl %ebx,%eax + movl %edx,%esi + roll $5,%edx + pslld $2,%xmm0 + pxor %xmm1,%xmm5 + andl %eax,%ebp + xorl %ebx,%eax + movdqa 112(%esp),%xmm1 + xorl %ebx,%ebp + addl %edx,%ecx + pxor %xmm0,%xmm5 + movdqa %xmm3,%xmm6 + rorl $7,%edi + addl %ebp,%ecx + addl 32(%esp),%ebx + xorl %eax,%edi +.byte 102,15,58,15,242,8 + movdqa %xmm5,%xmm0 + movl %ecx,%ebp + roll $5,%ecx + paddd %xmm5,%xmm1 + movdqa %xmm2,96(%esp) + andl %edi,%esi + xorl %eax,%edi + psrldq $4,%xmm0 + xorl %eax,%esi + addl %ecx,%ebx + pxor %xmm2,%xmm6 + rorl $7,%edx + addl %esi,%ebx + pxor %xmm4,%xmm0 + addl 36(%esp),%eax + xorl %edi,%edx + movl %ebx,%esi + roll $5,%ebx + pxor %xmm0,%xmm6 + andl %edx,%ebp + xorl %edi,%edx + movdqa %xmm1,16(%esp) + xorl %edi,%ebp + addl %ebx,%eax + movdqa %xmm6,%xmm2 + movdqa %xmm6,%xmm0 + rorl $7,%ecx + addl %ebp,%eax + addl 40(%esp),%edi + xorl %edx,%ecx + pslldq $12,%xmm2 + paddd %xmm6,%xmm6 + movl %eax,%ebp + roll $5,%eax + andl %ecx,%esi + xorl %edx,%ecx + psrld $31,%xmm0 + xorl %edx,%esi + addl %eax,%edi + movdqa %xmm2,%xmm1 + rorl $7,%ebx + addl %esi,%edi + psrld $30,%xmm2 + por %xmm0,%xmm6 + addl 44(%esp),%edx + xorl %ecx,%ebx + movdqa 64(%esp),%xmm0 + movl %edi,%esi + roll $5,%edi + pslld $2,%xmm1 + pxor %xmm2,%xmm6 + andl %ebx,%ebp + xorl %ecx,%ebx + movdqa 112(%esp),%xmm2 + xorl %ecx,%ebp + addl %edi,%edx + pxor %xmm1,%xmm6 + movdqa %xmm4,%xmm7 + rorl $7,%eax + addl %ebp,%edx + addl 48(%esp),%ecx + xorl %ebx,%eax +.byte 102,15,58,15,251,8 + movdqa %xmm6,%xmm1 + movl %edx,%ebp + roll $5,%edx + paddd %xmm6,%xmm2 + movdqa %xmm3,64(%esp) + andl %eax,%esi + xorl %ebx,%eax + psrldq $4,%xmm1 + xorl %ebx,%esi + addl %edx,%ecx + pxor %xmm3,%xmm7 + rorl $7,%edi + addl %esi,%ecx + pxor %xmm5,%xmm1 + addl 52(%esp),%ebx + xorl %eax,%edi + movl %ecx,%esi + roll $5,%ecx + pxor %xmm1,%xmm7 + andl %edi,%ebp + xorl %eax,%edi + movdqa %xmm2,32(%esp) + xorl %eax,%ebp + addl %ecx,%ebx + movdqa %xmm7,%xmm3 + movdqa %xmm7,%xmm1 + rorl $7,%edx + addl %ebp,%ebx + addl 56(%esp),%eax + xorl %edi,%edx + pslldq $12,%xmm3 + paddd %xmm7,%xmm7 + movl %ebx,%ebp + roll $5,%ebx + andl %edx,%esi + xorl %edi,%edx + psrld $31,%xmm1 + xorl %edi,%esi + addl %ebx,%eax + movdqa %xmm3,%xmm2 + rorl $7,%ecx + addl %esi,%eax + psrld $30,%xmm3 + por %xmm1,%xmm7 + addl 60(%esp),%edi + xorl %edx,%ecx + movdqa 80(%esp),%xmm1 + movl %eax,%esi + roll $5,%eax + pslld $2,%xmm2 + pxor %xmm3,%xmm7 + andl %ecx,%ebp + xorl %edx,%ecx + movdqa 112(%esp),%xmm3 + xorl %edx,%ebp + addl %eax,%edi + pxor %xmm2,%xmm7 + rorl $7,%ebx + addl %ebp,%edi + movdqa %xmm7,%xmm2 + addl (%esp),%edx + pxor %xmm4,%xmm0 +.byte 102,15,58,15,214,8 + xorl %ecx,%ebx + movl %edi,%ebp + roll $5,%edi + pxor %xmm1,%xmm0 + movdqa %xmm4,80(%esp) + andl %ebx,%esi + xorl %ecx,%ebx + movdqa %xmm3,%xmm4 + paddd %xmm7,%xmm3 + xorl %ecx,%esi + addl %edi,%edx + pxor %xmm2,%xmm0 + rorl $7,%eax + addl %esi,%edx + addl 4(%esp),%ecx + xorl %ebx,%eax + movdqa %xmm0,%xmm2 + movdqa %xmm3,48(%esp) + movl %edx,%esi + roll $5,%edx + andl %eax,%ebp + xorl %ebx,%eax + pslld $2,%xmm0 + xorl %ebx,%ebp + addl %edx,%ecx + psrld $30,%xmm2 + rorl $7,%edi + addl %ebp,%ecx + addl 8(%esp),%ebx + xorl %eax,%edi + movl %ecx,%ebp + roll $5,%ecx + por %xmm2,%xmm0 + andl %edi,%esi + xorl %eax,%edi + movdqa 96(%esp),%xmm2 + xorl %eax,%esi + addl %ecx,%ebx + rorl $7,%edx + addl %esi,%ebx + addl 12(%esp),%eax + movdqa %xmm0,%xmm3 + xorl %edi,%edx + movl %ebx,%esi + roll $5,%ebx + andl %edx,%ebp + xorl %edi,%edx + xorl %edi,%ebp + addl %ebx,%eax + rorl $7,%ecx + addl %ebp,%eax + addl 16(%esp),%edi + pxor %xmm5,%xmm1 +.byte 102,15,58,15,223,8 + xorl %edx,%esi + movl %eax,%ebp + roll $5,%eax + pxor %xmm2,%xmm1 + movdqa %xmm5,96(%esp) + xorl %ecx,%esi + addl %eax,%edi + movdqa %xmm4,%xmm5 + paddd %xmm0,%xmm4 + rorl $7,%ebx + addl %esi,%edi + pxor %xmm3,%xmm1 + addl 20(%esp),%edx + xorl %ecx,%ebp + movl %edi,%esi + roll $5,%edi + movdqa %xmm1,%xmm3 + movdqa %xmm4,(%esp) + xorl %ebx,%ebp + addl %edi,%edx + rorl $7,%eax + addl %ebp,%edx + pslld $2,%xmm1 + addl 24(%esp),%ecx + xorl %ebx,%esi + psrld $30,%xmm3 + movl %edx,%ebp + roll $5,%edx + xorl %eax,%esi + addl %edx,%ecx + rorl $7,%edi + addl %esi,%ecx + por %xmm3,%xmm1 + addl 28(%esp),%ebx + xorl %eax,%ebp + movdqa 64(%esp),%xmm3 + movl %ecx,%esi + roll $5,%ecx + xorl %edi,%ebp + addl %ecx,%ebx + rorl $7,%edx + movdqa %xmm1,%xmm4 + addl %ebp,%ebx + addl 32(%esp),%eax + pxor %xmm6,%xmm2 +.byte 102,15,58,15,224,8 + xorl %edi,%esi + movl %ebx,%ebp + roll $5,%ebx + pxor %xmm3,%xmm2 + movdqa %xmm6,64(%esp) + xorl %edx,%esi + addl %ebx,%eax + movdqa 128(%esp),%xmm6 + paddd %xmm1,%xmm5 + rorl $7,%ecx + addl %esi,%eax + pxor %xmm4,%xmm2 + addl 36(%esp),%edi + xorl %edx,%ebp + movl %eax,%esi + roll $5,%eax + movdqa %xmm2,%xmm4 + movdqa %xmm5,16(%esp) + xorl %ecx,%ebp + addl %eax,%edi + rorl $7,%ebx + addl %ebp,%edi + pslld $2,%xmm2 + addl 40(%esp),%edx + xorl %ecx,%esi + psrld $30,%xmm4 + movl %edi,%ebp + roll $5,%edi + xorl %ebx,%esi + addl %edi,%edx + rorl $7,%eax + addl %esi,%edx + por %xmm4,%xmm2 + addl 44(%esp),%ecx + xorl %ebx,%ebp + movdqa 80(%esp),%xmm4 + movl %edx,%esi + roll $5,%edx + xorl %eax,%ebp + addl %edx,%ecx + rorl $7,%edi + movdqa %xmm2,%xmm5 + addl %ebp,%ecx + addl 48(%esp),%ebx + pxor %xmm7,%xmm3 +.byte 102,15,58,15,233,8 + xorl %eax,%esi + movl %ecx,%ebp + roll $5,%ecx + pxor %xmm4,%xmm3 + movdqa %xmm7,80(%esp) + xorl %edi,%esi + addl %ecx,%ebx + movdqa %xmm6,%xmm7 + paddd %xmm2,%xmm6 + rorl $7,%edx + addl %esi,%ebx + pxor %xmm5,%xmm3 + addl 52(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + roll $5,%ebx + movdqa %xmm3,%xmm5 + movdqa %xmm6,32(%esp) + xorl %edx,%ebp + addl %ebx,%eax + rorl $7,%ecx + addl %ebp,%eax + pslld $2,%xmm3 + addl 56(%esp),%edi + xorl %edx,%esi + psrld $30,%xmm5 + movl %eax,%ebp + roll $5,%eax + xorl %ecx,%esi + addl %eax,%edi + rorl $7,%ebx + addl %esi,%edi + por %xmm5,%xmm3 + addl 60(%esp),%edx + xorl %ecx,%ebp + movdqa 96(%esp),%xmm5 + movl %edi,%esi + roll $5,%edi + xorl %ebx,%ebp + addl %edi,%edx + rorl $7,%eax + movdqa %xmm3,%xmm6 + addl %ebp,%edx + addl (%esp),%ecx + pxor %xmm0,%xmm4 +.byte 102,15,58,15,242,8 + xorl %ebx,%esi + movl %edx,%ebp + roll $5,%edx + pxor %xmm5,%xmm4 + movdqa %xmm0,96(%esp) + xorl %eax,%esi + addl %edx,%ecx + movdqa %xmm7,%xmm0 + paddd %xmm3,%xmm7 + rorl $7,%edi + addl %esi,%ecx + pxor %xmm6,%xmm4 + addl 4(%esp),%ebx + xorl %eax,%ebp + movl %ecx,%esi + roll $5,%ecx + movdqa %xmm4,%xmm6 + movdqa %xmm7,48(%esp) + xorl %edi,%ebp + addl %ecx,%ebx + rorl $7,%edx + addl %ebp,%ebx + pslld $2,%xmm4 + addl 8(%esp),%eax + xorl %edi,%esi + psrld $30,%xmm6 + movl %ebx,%ebp + roll $5,%ebx + xorl %edx,%esi + addl %ebx,%eax + rorl $7,%ecx + addl %esi,%eax + por %xmm6,%xmm4 + addl 12(%esp),%edi + xorl %edx,%ebp + movdqa 64(%esp),%xmm6 + movl %eax,%esi + roll $5,%eax + xorl %ecx,%ebp + addl %eax,%edi + rorl $7,%ebx + movdqa %xmm4,%xmm7 + addl %ebp,%edi + addl 16(%esp),%edx + pxor %xmm1,%xmm5 +.byte 102,15,58,15,251,8 + xorl %ecx,%esi + movl %edi,%ebp + roll $5,%edi + pxor %xmm6,%xmm5 + movdqa %xmm1,64(%esp) + xorl %ebx,%esi + addl %edi,%edx + movdqa %xmm0,%xmm1 + paddd %xmm4,%xmm0 + rorl $7,%eax + addl %esi,%edx + pxor %xmm7,%xmm5 + addl 20(%esp),%ecx + xorl %ebx,%ebp + movl %edx,%esi + roll $5,%edx + movdqa %xmm5,%xmm7 + movdqa %xmm0,(%esp) + xorl %eax,%ebp + addl %edx,%ecx + rorl $7,%edi + addl %ebp,%ecx + pslld $2,%xmm5 + addl 24(%esp),%ebx + xorl %eax,%esi + psrld $30,%xmm7 + movl %ecx,%ebp + roll $5,%ecx + xorl %edi,%esi + addl %ecx,%ebx + rorl $7,%edx + addl %esi,%ebx + por %xmm7,%xmm5 + addl 28(%esp),%eax + xorl %edi,%ebp + movdqa 80(%esp),%xmm7 + movl %ebx,%esi + roll $5,%ebx + xorl %edx,%ebp + addl %ebx,%eax + rorl $7,%ecx + movdqa %xmm5,%xmm0 + addl %ebp,%eax + movl %ecx,%ebp + pxor %xmm2,%xmm6 +.byte 102,15,58,15,196,8 + xorl %edx,%ecx + addl 32(%esp),%edi + andl %edx,%ebp + pxor %xmm7,%xmm6 + movdqa %xmm2,80(%esp) + andl %ecx,%esi + rorl $7,%ebx + movdqa %xmm1,%xmm2 + paddd %xmm5,%xmm1 + addl %ebp,%edi + movl %eax,%ebp + pxor %xmm0,%xmm6 + roll $5,%eax + addl %esi,%edi + xorl %edx,%ecx + addl %eax,%edi + movdqa %xmm6,%xmm0 + movdqa %xmm1,16(%esp) + movl %ebx,%esi + xorl %ecx,%ebx + addl 36(%esp),%edx + andl %ecx,%esi + pslld $2,%xmm6 + andl %ebx,%ebp + rorl $7,%eax + psrld $30,%xmm0 + addl %esi,%edx + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ecx,%ebx + addl %edi,%edx + por %xmm0,%xmm6 + movl %eax,%ebp + xorl %ebx,%eax + movdqa 96(%esp),%xmm0 + addl 40(%esp),%ecx + andl %ebx,%ebp + andl %eax,%esi + rorl $7,%edi + addl %ebp,%ecx + movdqa %xmm6,%xmm1 + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %ebx,%eax + addl %edx,%ecx + movl %edi,%esi + xorl %eax,%edi + addl 44(%esp),%ebx + andl %eax,%esi + andl %edi,%ebp + rorl $7,%edx + addl %esi,%ebx + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %eax,%edi + addl %ecx,%ebx + movl %edx,%ebp + pxor %xmm3,%xmm7 +.byte 102,15,58,15,205,8 + xorl %edi,%edx + addl 48(%esp),%eax + andl %edi,%ebp + pxor %xmm0,%xmm7 + movdqa %xmm3,96(%esp) + andl %edx,%esi + rorl $7,%ecx + movdqa 144(%esp),%xmm3 + paddd %xmm6,%xmm2 + addl %ebp,%eax + movl %ebx,%ebp + pxor %xmm1,%xmm7 + roll $5,%ebx + addl %esi,%eax + xorl %edi,%edx + addl %ebx,%eax + movdqa %xmm7,%xmm1 + movdqa %xmm2,32(%esp) + movl %ecx,%esi + xorl %edx,%ecx + addl 52(%esp),%edi + andl %edx,%esi + pslld $2,%xmm7 + andl %ecx,%ebp + rorl $7,%ebx + psrld $30,%xmm1 + addl %esi,%edi + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %edx,%ecx + addl %eax,%edi + por %xmm1,%xmm7 + movl %ebx,%ebp + xorl %ecx,%ebx + movdqa 64(%esp),%xmm1 + addl 56(%esp),%edx + andl %ecx,%ebp + andl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + movdqa %xmm7,%xmm2 + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ecx,%ebx + addl %edi,%edx + movl %eax,%esi + xorl %ebx,%eax + addl 60(%esp),%ecx + andl %ebx,%esi + andl %eax,%ebp + rorl $7,%edi + addl %esi,%ecx + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %ebx,%eax + addl %edx,%ecx + movl %edi,%ebp + pxor %xmm4,%xmm0 +.byte 102,15,58,15,214,8 + xorl %eax,%edi + addl (%esp),%ebx + andl %eax,%ebp + pxor %xmm1,%xmm0 + movdqa %xmm4,64(%esp) + andl %edi,%esi + rorl $7,%edx + movdqa %xmm3,%xmm4 + paddd %xmm7,%xmm3 + addl %ebp,%ebx + movl %ecx,%ebp + pxor %xmm2,%xmm0 + roll $5,%ecx + addl %esi,%ebx + xorl %eax,%edi + addl %ecx,%ebx + movdqa %xmm0,%xmm2 + movdqa %xmm3,48(%esp) + movl %edx,%esi + xorl %edi,%edx + addl 4(%esp),%eax + andl %edi,%esi + pslld $2,%xmm0 + andl %edx,%ebp + rorl $7,%ecx + psrld $30,%xmm2 + addl %esi,%eax + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + xorl %edi,%edx + addl %ebx,%eax + por %xmm2,%xmm0 + movl %ecx,%ebp + xorl %edx,%ecx + movdqa 80(%esp),%xmm2 + addl 8(%esp),%edi + andl %edx,%ebp + andl %ecx,%esi + rorl $7,%ebx + addl %ebp,%edi + movdqa %xmm0,%xmm3 + movl %eax,%ebp + roll $5,%eax + addl %esi,%edi + xorl %edx,%ecx + addl %eax,%edi + movl %ebx,%esi + xorl %ecx,%ebx + addl 12(%esp),%edx + andl %ecx,%esi + andl %ebx,%ebp + rorl $7,%eax + addl %esi,%edx + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ecx,%ebx + addl %edi,%edx + movl %eax,%ebp + pxor %xmm5,%xmm1 +.byte 102,15,58,15,223,8 + xorl %ebx,%eax + addl 16(%esp),%ecx + andl %ebx,%ebp + pxor %xmm2,%xmm1 + movdqa %xmm5,80(%esp) + andl %eax,%esi + rorl $7,%edi + movdqa %xmm4,%xmm5 + paddd %xmm0,%xmm4 + addl %ebp,%ecx + movl %edx,%ebp + pxor %xmm3,%xmm1 + roll $5,%edx + addl %esi,%ecx + xorl %ebx,%eax + addl %edx,%ecx + movdqa %xmm1,%xmm3 + movdqa %xmm4,(%esp) + movl %edi,%esi + xorl %eax,%edi + addl 20(%esp),%ebx + andl %eax,%esi + pslld $2,%xmm1 + andl %edi,%ebp + rorl $7,%edx + psrld $30,%xmm3 + addl %esi,%ebx + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %eax,%edi + addl %ecx,%ebx + por %xmm3,%xmm1 + movl %edx,%ebp + xorl %edi,%edx + movdqa 96(%esp),%xmm3 + addl 24(%esp),%eax + andl %edi,%ebp + andl %edx,%esi + rorl $7,%ecx + addl %ebp,%eax + movdqa %xmm1,%xmm4 + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edi,%edx + addl %ebx,%eax + movl %ecx,%esi + xorl %edx,%ecx + addl 28(%esp),%edi + andl %edx,%esi + andl %ecx,%ebp + rorl $7,%ebx + addl %esi,%edi + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %edx,%ecx + addl %eax,%edi + movl %ebx,%ebp + pxor %xmm6,%xmm2 +.byte 102,15,58,15,224,8 + xorl %ecx,%ebx + addl 32(%esp),%edx + andl %ecx,%ebp + pxor %xmm3,%xmm2 + movdqa %xmm6,96(%esp) + andl %ebx,%esi + rorl $7,%eax + movdqa %xmm5,%xmm6 + paddd %xmm1,%xmm5 + addl %ebp,%edx + movl %edi,%ebp + pxor %xmm4,%xmm2 + roll $5,%edi + addl %esi,%edx + xorl %ecx,%ebx + addl %edi,%edx + movdqa %xmm2,%xmm4 + movdqa %xmm5,16(%esp) + movl %eax,%esi + xorl %ebx,%eax + addl 36(%esp),%ecx + andl %ebx,%esi + pslld $2,%xmm2 + andl %eax,%ebp + rorl $7,%edi + psrld $30,%xmm4 + addl %esi,%ecx + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %ebx,%eax + addl %edx,%ecx + por %xmm4,%xmm2 + movl %edi,%ebp + xorl %eax,%edi + movdqa 64(%esp),%xmm4 + addl 40(%esp),%ebx + andl %eax,%ebp + andl %edi,%esi + rorl $7,%edx + addl %ebp,%ebx + movdqa %xmm2,%xmm5 + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %eax,%edi + addl %ecx,%ebx + movl %edx,%esi + xorl %edi,%edx + addl 44(%esp),%eax + andl %edi,%esi + andl %edx,%ebp + rorl $7,%ecx + addl %esi,%eax + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + xorl %edi,%edx + addl %ebx,%eax + addl 48(%esp),%edi + pxor %xmm7,%xmm3 +.byte 102,15,58,15,233,8 + xorl %edx,%esi + movl %eax,%ebp + roll $5,%eax + pxor %xmm4,%xmm3 + movdqa %xmm7,64(%esp) + xorl %ecx,%esi + addl %eax,%edi + movdqa %xmm6,%xmm7 + paddd %xmm2,%xmm6 + rorl $7,%ebx + addl %esi,%edi + pxor %xmm5,%xmm3 + addl 52(%esp),%edx + xorl %ecx,%ebp + movl %edi,%esi + roll $5,%edi + movdqa %xmm3,%xmm5 + movdqa %xmm6,32(%esp) + xorl %ebx,%ebp + addl %edi,%edx + rorl $7,%eax + addl %ebp,%edx + pslld $2,%xmm3 + addl 56(%esp),%ecx + xorl %ebx,%esi + psrld $30,%xmm5 + movl %edx,%ebp + roll $5,%edx + xorl %eax,%esi + addl %edx,%ecx + rorl $7,%edi + addl %esi,%ecx + por %xmm5,%xmm3 + addl 60(%esp),%ebx + xorl %eax,%ebp + movl %ecx,%esi + roll $5,%ecx + xorl %edi,%ebp + addl %ecx,%ebx + rorl $7,%edx + addl %ebp,%ebx + addl (%esp),%eax + paddd %xmm3,%xmm7 + xorl %edi,%esi + movl %ebx,%ebp + roll $5,%ebx + xorl %edx,%esi + movdqa %xmm7,48(%esp) + addl %ebx,%eax + rorl $7,%ecx + addl %esi,%eax + addl 4(%esp),%edi + xorl %edx,%ebp + movl %eax,%esi + roll $5,%eax + xorl %ecx,%ebp + addl %eax,%edi + rorl $7,%ebx + addl %ebp,%edi + addl 8(%esp),%edx + xorl %ecx,%esi + movl %edi,%ebp + roll $5,%edi + xorl %ebx,%esi + addl %edi,%edx + rorl $7,%eax + addl %esi,%edx + addl 12(%esp),%ecx + xorl %ebx,%ebp + movl %edx,%esi + roll $5,%edx + xorl %eax,%ebp + addl %edx,%ecx + rorl $7,%edi + addl %ebp,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je .L005done + movdqa 160(%esp),%xmm7 + movdqa 176(%esp),%xmm6 + movdqu (%ebp),%xmm0 + movdqu 16(%ebp),%xmm1 + movdqu 32(%ebp),%xmm2 + movdqu 48(%ebp),%xmm3 + addl $64,%ebp +.byte 102,15,56,0,198 + movl %ebp,196(%esp) + movdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %eax,%esi +.byte 102,15,56,0,206 + movl %ecx,%ebp + roll $5,%ecx + paddd %xmm7,%xmm0 + xorl %edi,%esi + addl %ecx,%ebx + rorl $7,%edx + addl %esi,%ebx + movdqa %xmm0,(%esp) + addl 20(%esp),%eax + xorl %edi,%ebp + psubd %xmm7,%xmm0 + movl %ebx,%esi + roll $5,%ebx + xorl %edx,%ebp + addl %ebx,%eax + rorl $7,%ecx + addl %ebp,%eax + addl 24(%esp),%edi + xorl %edx,%esi + movl %eax,%ebp + roll $5,%eax + xorl %ecx,%esi + addl %eax,%edi + rorl $7,%ebx + addl %esi,%edi + addl 28(%esp),%edx + xorl %ecx,%ebp + movl %edi,%esi + roll $5,%edi + xorl %ebx,%ebp + addl %edi,%edx + rorl $7,%eax + addl %ebp,%edx + addl 32(%esp),%ecx + xorl %ebx,%esi +.byte 102,15,56,0,214 + movl %edx,%ebp + roll $5,%edx + paddd %xmm7,%xmm1 + xorl %eax,%esi + addl %edx,%ecx + rorl $7,%edi + addl %esi,%ecx + movdqa %xmm1,16(%esp) + addl 36(%esp),%ebx + xorl %eax,%ebp + psubd %xmm7,%xmm1 + movl %ecx,%esi + roll $5,%ecx + xorl %edi,%ebp + addl %ecx,%ebx + rorl $7,%edx + addl %ebp,%ebx + addl 40(%esp),%eax + xorl %edi,%esi + movl %ebx,%ebp + roll $5,%ebx + xorl %edx,%esi + addl %ebx,%eax + rorl $7,%ecx + addl %esi,%eax + addl 44(%esp),%edi + xorl %edx,%ebp + movl %eax,%esi + roll $5,%eax + xorl %ecx,%ebp + addl %eax,%edi + rorl $7,%ebx + addl %ebp,%edi + addl 48(%esp),%edx + xorl %ecx,%esi +.byte 102,15,56,0,222 + movl %edi,%ebp + roll $5,%edi + paddd %xmm7,%xmm2 + xorl %ebx,%esi + addl %edi,%edx + rorl $7,%eax + addl %esi,%edx + movdqa %xmm2,32(%esp) + addl 52(%esp),%ecx + xorl %ebx,%ebp + psubd %xmm7,%xmm2 + movl %edx,%esi + roll $5,%edx + xorl %eax,%ebp + addl %edx,%ecx + rorl $7,%edi + addl %ebp,%ecx + addl 56(%esp),%ebx + xorl %eax,%esi + movl %ecx,%ebp + roll $5,%ecx + xorl %edi,%esi + addl %ecx,%ebx + rorl $7,%edx + addl %esi,%ebx + addl 60(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + roll $5,%ebx + xorl %edx,%ebp + addl %ebx,%eax + rorl $7,%ecx + addl %ebp,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %esi,%ebx + movl %edx,12(%ebp) + movl %edi,16(%ebp) + movdqa %xmm1,%xmm4 + jmp .L004loop +.align 16 +.L005done: + addl 16(%esp),%ebx + xorl %eax,%esi + movl %ecx,%ebp + roll $5,%ecx + xorl %edi,%esi + addl %ecx,%ebx + rorl $7,%edx + addl %esi,%ebx + addl 20(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + roll $5,%ebx + xorl %edx,%ebp + addl %ebx,%eax + rorl $7,%ecx + addl %ebp,%eax + addl 24(%esp),%edi + xorl %edx,%esi + movl %eax,%ebp + roll $5,%eax + xorl %ecx,%esi + addl %eax,%edi + rorl $7,%ebx + addl %esi,%edi + addl 28(%esp),%edx + xorl %ecx,%ebp + movl %edi,%esi + roll $5,%edi + xorl %ebx,%ebp + addl %edi,%edx + rorl $7,%eax + addl %ebp,%edx + addl 32(%esp),%ecx + xorl %ebx,%esi + movl %edx,%ebp + roll $5,%edx + xorl %eax,%esi + addl %edx,%ecx + rorl $7,%edi + addl %esi,%ecx + addl 36(%esp),%ebx + xorl %eax,%ebp + movl %ecx,%esi + roll $5,%ecx + xorl %edi,%ebp + addl %ecx,%ebx + rorl $7,%edx + addl %ebp,%ebx + addl 40(%esp),%eax + xorl %edi,%esi + movl %ebx,%ebp + roll $5,%ebx + xorl %edx,%esi + addl %ebx,%eax + rorl $7,%ecx + addl %esi,%eax + addl 44(%esp),%edi + xorl %edx,%ebp + movl %eax,%esi + roll $5,%eax + xorl %ecx,%ebp + addl %eax,%edi + rorl $7,%ebx + addl %ebp,%edi + addl 48(%esp),%edx + xorl %ecx,%esi + movl %edi,%ebp + roll $5,%edi + xorl %ebx,%esi + addl %edi,%edx + rorl $7,%eax + addl %esi,%edx + addl 52(%esp),%ecx + xorl %ebx,%ebp + movl %edx,%esi + roll $5,%edx + xorl %eax,%ebp + addl %edx,%ecx + rorl $7,%edi + addl %ebp,%ecx + addl 56(%esp),%ebx + xorl %eax,%esi + movl %ecx,%ebp + roll $5,%ecx + xorl %edi,%esi + addl %ecx,%ebx + rorl $7,%edx + addl %esi,%ebx + addl 60(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + roll $5,%ebx + xorl %edx,%ebp + addl %ebx,%eax + rorl $7,%ecx + addl %ebp,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3 +.align 64 +.LK_XX_XX: +.long 1518500249,1518500249,1518500249,1518500249 +.long 1859775393,1859775393,1859775393,1859775393 +.long 2400959708,2400959708,2400959708,2400959708 +.long 3395469782,3395469782,3395469782,3395469782 +.long 66051,67438087,134810123,202182159 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 .byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82 .byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 .byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.comm OPENSSL_ia32cap_P,8,4 Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/x86cpuid.S diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/x86cpuid.S:1.8 src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/x86cpuid.S:1.9 --- src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/x86cpuid.S:1.8 Sat Jan 11 18:20:06 2014 +++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/x86cpuid.S Sat May 16 17:32:54 2015 @@ -226,6 +226,18 @@ OPENSSL_wipe_cpu: movl (%ecx),%ecx btl $1,(%ecx) jnc .L015no_x87 + andl $83886080,%ecx + cmpl $83886080,%ecx + jne .L016no_sse2 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 +.L016no_sse2: .long 4007259865,4007259865,4007259865,4007259865,2430851995 .L015no_x87: leal 4(%esp),%eax @@ -241,11 +253,11 @@ OPENSSL_atomic_add: pushl %ebx nop movl (%edx),%eax -.L016spin: +.L017spin: leal (%eax,%ecx,1),%ebx nop .long 447811568 - jne .L016spin + jne .L017spin movl %ebx,%eax popl %ebx ret @@ -286,32 +298,32 @@ OPENSSL_cleanse: movl 8(%esp),%ecx xorl %eax,%eax cmpl $7,%ecx - jae .L017lot + jae .L018lot cmpl $0,%ecx - je .L018ret -.L019little: + je .L019ret +.L020little: movb %al,(%edx) subl $1,%ecx leal 1(%edx),%edx - jnz .L019little -.L018ret: + jnz .L020little +.L019ret: ret .align 16 -.L017lot: +.L018lot: testl $3,%edx - jz .L020aligned + jz .L021aligned movb %al,(%edx) leal -1(%ecx),%ecx leal 1(%edx),%edx - jmp .L017lot -.L020aligned: + jmp .L018lot +.L021aligned: movl %eax,(%edx) leal -4(%ecx),%ecx testl $-4,%ecx leal 4(%edx),%edx - jnz .L020aligned + jnz .L021aligned cmpl $0,%ecx - jne .L019little + jne .L020little ret .size OPENSSL_cleanse,.-.L_OPENSSL_cleanse_begin .globl OPENSSL_ia32_rdrand @@ -320,11 +332,11 @@ OPENSSL_cleanse: OPENSSL_ia32_rdrand: .L_OPENSSL_ia32_rdrand_begin: movl $8,%ecx -.L021loop: +.L022loop: .byte 15,199,240 - jc .L022break - loop .L021loop -.L022break: + jc .L023break + loop .L022loop +.L023break: cmpl $0,%eax cmovel %ecx,%eax ret Added files: Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/modes.inc diff -u /dev/null src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/modes.inc:1.1 --- /dev/null Sat May 16 17:32:54 2015 +++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/modes.inc Sat May 16 17:32:54 2015 @@ -0,0 +1,4 @@ +.PATH.S: ${.PARSEDIR} +MODES_SRCS += ghash-x86.o +MODESCPPFLAGS = -DGHASH_ASM +.include "../../modes.inc"