The branch master has been updated
       via  e052083cc7620379b7119cdbe4def5ea5de65c18 (commit)
       via  5e32cfb2b6aec4d8d80083dabbd25bf89a482f21 (commit)
       via  fa62bc4661960a593a77d2c3f260173c3aa7333d (commit)
       via  49508b23ce929ad5c8381bdc4b397eb41fd06137 (commit)
       via  1c47e8836f4213251957254764886e82ac2563bc (commit)
       via  f17652e5f9198941ce761da2ccc6ce584fd90e81 (commit)
      from  26a556e778f167070037fee243d7e6b9800fdb7f (commit)
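Most of the commits below add DWARF call-frame-information (CFI) annotations to hand-coded prologues and epilogues, so that debuggers and profilers can unwind through the assembly at any instruction. As an illustrative sketch only (the function name is made up and is not part of any patch), the raw assembler directives behind such annotations look roughly like the following; as far as I can tell, the .cfi_push/.cfi_restore shorthands seen in the perlasm diffs are expanded by the perlasm translator into equivalent directives:

    # illustrative only -- not part of the patch
    .globl  example_func
    .type   example_func,@function
    .align  16
    example_func:
    .cfi_startproc                      # open the unwind (FDE) record
            push    %rbx
    .cfi_adjust_cfa_offset  8           # CFA is now 8 bytes further from %rsp
    .cfi_offset     %rbx,-16            # caller's %rbx saved at CFA-16
            push    %rbp
    .cfi_adjust_cfa_offset  8
    .cfi_offset     %rbp,-24            # caller's %rbp saved at CFA-24

            # ... function body ...

            mov     0(%rsp),%rbp
    .cfi_restore    %rbp                # caller's %rbp is live again
            mov     8(%rsp),%rbx
    .cfi_restore    %rbx
            lea     16(%rsp),%rsp
    .cfi_adjust_cfa_offset  -16         # stack back to its value at entry
            ret
    .cfi_endproc
    .size   example_func,.-example_func

Keeping each directive next to the push or restore it describes is what lets the unwind state stay correct even in the middle of a prologue or epilogue.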
- Log -----------------------------------------------------------------
commit e052083cc7620379b7119cdbe4def5ea5de65c18
Author: Andy Polyakov <ap...@openssl.org>
Date:   Sat Feb 25 18:37:24 2017 +0100

    poly1305/asm/poly1305-x86_64.pl: minor AVX512 optimization.

    Reviewed-by: Rich Salz <rs...@openssl.org>

commit 5e32cfb2b6aec4d8d80083dabbd25bf89a482f21
Author: Andy Polyakov <ap...@openssl.org>
Date:   Sat Feb 25 22:17:21 2017 +0100

    crypto/x86_64cpuid.pl: add CFI annotations.

    Reviewed-by: Rich Salz <rs...@openssl.org>

commit fa62bc4661960a593a77d2c3f260173c3aa7333d
Author: Andy Polyakov <ap...@openssl.org>
Date:   Sat Feb 25 22:16:57 2017 +0100

    whrlpool/asm/wp-x86_64.pl: add CFI annotations.

    Reviewed-by: Rich Salz <rs...@openssl.org>

commit 49508b23ce929ad5c8381bdc4b397eb41fd06137
Author: Andy Polyakov <ap...@openssl.org>
Date:   Sat Feb 25 22:16:38 2017 +0100

    camellia/asm/cmll-x86_64.pl: add CFI annotations.

    Reviewed-by: Rich Salz <rs...@openssl.org>

commit 1c47e8836f4213251957254764886e82ac2563bc
Author: Andy Polyakov <ap...@openssl.org>
Date:   Sat Feb 25 19:37:02 2017 +0100

    poly1305/asm/poly1305-x86_64.pl: add CFI annotations.

    Reviewed-by: Rich Salz <rs...@openssl.org>

commit f17652e5f9198941ce761da2ccc6ce584fd90e81
Author: Andy Polyakov <ap...@openssl.org>
Date:   Sat Feb 25 19:36:43 2017 +0100

    chacha/asm/chacha-x86_64.pl: add CFI annotations.

    Reviewed-by: Rich Salz <rs...@openssl.org>
-----------------------------------------------------------------------

Summary of changes:
 crypto/camellia/asm/cmll-x86_64.pl     |  57 ++++++++++++
 crypto/chacha/asm/chacha-x86_64.pl     |  41 +++++++++
 crypto/poly1305/asm/poly1305-x86_64.pl | 161 ++++++++++++++++++++++++---------
 crypto/whrlpool/asm/wp-x86_64.pl       |  18 ++++
 crypto/x86_64cpuid.pl                  |   4 +
 5 files changed, 239 insertions(+), 42 deletions(-)

diff --git a/crypto/camellia/asm/cmll-x86_64.pl b/crypto/camellia/asm/cmll-x86_64.pl
index da5ad7b..02c52c3 100644
--- a/crypto/camellia/asm/cmll-x86_64.pl
+++ b/crypto/camellia/asm/cmll-x86_64.pl
@@ -137,11 +137,17 @@ Camellia_EncryptBlock:
 .align	16
 .Lenc_rounds:
 Camellia_EncryptBlock_Rounds:
+.cfi_startproc
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 .Lenc_prologue:
 
 	#mov	%rsi,$inp		# put away arguments
@@ -173,13 +179,20 @@ Camellia_EncryptBlock_Rounds:
 	mov	@S[3],12($out)
 
 	mov	0(%rsp),%r15
+.cfi_restore	%r15
 	mov	8(%rsp),%r14
+.cfi_restore	%r14
 	mov	16(%rsp),%r13
+.cfi_restore	%r13
 	mov	24(%rsp),%rbp
+.cfi_restore	%rbp
 	mov	32(%rsp),%rbx
+.cfi_restore	%rbx
 	lea	40(%rsp),%rsp
+.cfi_adjust_cfa_offset	-40
 .Lenc_epilogue:
 	ret
+.cfi_endproc
 .size	Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
 
 .type	_x86_64_Camellia_encrypt,\@abi-omnipotent
@@ -247,11 +260,17 @@ Camellia_DecryptBlock:
 .align	16
 .Ldec_rounds:
 Camellia_DecryptBlock_Rounds:
+.cfi_startproc
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 .Ldec_prologue:
 
 	#mov	%rsi,$inp		# put away arguments
@@ -283,13 +302,20 @@ Camellia_DecryptBlock_Rounds:
 	mov	@S[3],12($out)
 
 	mov	0(%rsp),%r15
+.cfi_restore	%r15
 	mov	8(%rsp),%r14
+.cfi_restore	%r14
 	mov	16(%rsp),%r13
+.cfi_restore	%r13
 	mov	24(%rsp),%rbp
+.cfi_restore	%rbp
 	mov	32(%rsp),%rbx
+.cfi_restore	%rbx
 	lea	40(%rsp),%rsp
+.cfi_adjust_cfa_offset	-40
 .Ldec_epilogue:
 	ret
+.cfi_endproc
 .size	Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
 
 .type	_x86_64_Camellia_decrypt,\@abi-omnipotent
@@ -409,11 +435,17 @@ $code.=<<___;
 .type	Camellia_Ekeygen,\@function,3
.align 16 Camellia_Ekeygen: +.cfi_startproc push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lkey_prologue: mov %edi,${keyend}d # put away arguments, keyBitLength @@ -573,13 +605,20 @@ $code.=<<___; mov \$4,%eax .Ldone: mov 0(%rsp),%r15 +.cfi_restore %r15 mov 8(%rsp),%r14 +.cfi_restore %r14 mov 16(%rsp),%r13 +.cfi_restore %r13 mov 24(%rsp),%rbp +.cfi_restore %rbp mov 32(%rsp),%rbx +.cfi_restore %rbx lea 40(%rsp),%rsp +.cfi_adjust_cfa_offset -40 .Lkey_epilogue: ret +.cfi_endproc .size Camellia_Ekeygen,.-Camellia_Ekeygen ___ } @@ -637,17 +676,25 @@ $code.=<<___; .type Camellia_cbc_encrypt,\@function,6 .align 16 Camellia_cbc_encrypt: +.cfi_startproc cmp \$0,%rdx je .Lcbc_abort push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lcbc_prologue: mov %rsp,%rbp +.cfi_def_cfa_register %rbp sub \$64,%rsp and \$-64,%rsp @@ -668,6 +715,7 @@ Camellia_cbc_encrypt: mov %r8,$_ivp mov %rbp,$_rsp +.cfi_cfa_expression $_rsp,deref,+56 .Lcbc_body: lea .LCamellia_SBOX(%rip),$Tbl @@ -856,15 +904,24 @@ Camellia_cbc_encrypt: .align 16 .Lcbc_done: mov $_rsp,%rcx +.cfi_def_cfa %rcx,56 mov 0(%rcx),%r15 +.cfi_restore %r15 mov 8(%rcx),%r14 +.cfi_restore %r14 mov 16(%rcx),%r13 +.cfi_restore %r13 mov 24(%rcx),%r12 +.cfi_restore %r12 mov 32(%rcx),%rbp +.cfi_restore %rbp mov 40(%rcx),%rbx +.cfi_restore %rbx lea 48(%rcx),%rsp +.cfi_def_cfa %rsp,8 .Lcbc_abort: ret +.cfi_endproc .size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt .asciz "Camellia for x86_64 by <appro\@openssl.org>" diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl index 7fc1749..b59d96f 100755 --- a/crypto/chacha/asm/chacha-x86_64.pl +++ b/crypto/chacha/asm/chacha-x86_64.pl @@ -242,6 +242,7 @@ $code.=<<___; .type ChaCha20_ctr32,\@function,5 .align 64 ChaCha20_ctr32: +.cfi_startproc cmp \$0,$len je .Lno_data mov OPENSSL_ia32cap_P+4(%rip),%r10 @@ -255,12 +256,19 @@ $code.=<<___; jnz .LChaCha20_ssse3 push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 sub \$64+24,%rsp +.cfi_adjust_cfa_offset 64+24 .Lctr32_body: #movdqa .Lsigma(%rip),%xmm0 @@ -401,15 +409,24 @@ $code.=<<___; .Ldone: lea 64+24+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lno_data: ret +.cfi_endproc .size ChaCha20_ctr32,.-ChaCha20_ctr32 ___ @@ -448,8 +465,10 @@ $code.=<<___; .type ChaCha20_ssse3,\@function,5 .align 32 ChaCha20_ssse3: +.cfi_startproc .LChaCha20_ssse3: mov %rsp,%r9 # frame pointer +.cfi_def_cfa_register %r9 ___ $code.=<<___ if ($avx); test \$`1<<(43-32)`,%r10d @@ -565,8 +584,10 @@ $code.=<<___ if ($win64); ___ $code.=<<___; lea (%r9),%rsp +.cfi_def_cfa_register %rsp .Lssse3_epilogue: ret +.cfi_endproc .size ChaCha20_ssse3,.-ChaCha20_ssse3 ___ } @@ -708,8 +729,10 @@ $code.=<<___; .type ChaCha20_4x,\@function,5 .align 32 ChaCha20_4x: +.cfi_startproc .LChaCha20_4x: mov %rsp,%r9 # frame pointer +.cfi_def_cfa_register %r9 mov %r10,%r11 ___ $code.=<<___ if ($avx>1); @@ -1149,8 +1172,10 @@ $code.=<<___ if ($win64); ___ $code.=<<___; lea (%r9),%rsp +.cfi_def_cfa_register %rsp .L4x_epilogue: ret +.cfi_endproc .size 
ChaCha20_4x,.-ChaCha20_4x ___ } @@ -1237,8 +1262,10 @@ $code.=<<___; .type ChaCha20_4xop,\@function,5 .align 32 ChaCha20_4xop: +.cfi_startproc .LChaCha20_4xop: mov %rsp,%r9 # frame pointer +.cfi_def_cfa_register %r9 sub \$0x140+$xframe,%rsp ___ ################ stack layout @@ -1601,8 +1628,10 @@ $code.=<<___ if ($win64); ___ $code.=<<___; lea (%r9),%rsp +.cfi_def_cfa_register %rsp .L4xop_epilogue: ret +.cfi_endproc .size ChaCha20_4xop,.-ChaCha20_4xop ___ } @@ -1735,8 +1764,10 @@ $code.=<<___; .type ChaCha20_8x,\@function,5 .align 32 ChaCha20_8x: +.cfi_startproc .LChaCha20_8x: mov %rsp,%r9 # frame register +.cfi_def_cfa_register %r9 sub \$0x280+$xframe,%rsp and \$-32,%rsp ___ @@ -2242,8 +2273,10 @@ $code.=<<___ if ($win64); ___ $code.=<<___; lea (%r9),%rsp +.cfi_def_cfa_register %rsp .L8x_epilogue: ret +.cfi_endproc .size ChaCha20_8x,.-ChaCha20_8x ___ } @@ -2280,8 +2313,10 @@ $code.=<<___; .type ChaCha20_avx512,\@function,5 .align 32 ChaCha20_avx512: +.cfi_startproc .LChaCha20_avx512: mov %rsp,%r9 # frame pointer +.cfi_def_cfa_register %r9 cmp \$512,$len ja .LChaCha20_16x @@ -2461,8 +2496,10 @@ $code.=<<___ if ($win64); ___ $code.=<<___; lea (%r9),%rsp +.cfi_def_cfa_register %rsp .Lavx512_epilogue: ret +.cfi_endproc .size ChaCha20_avx512,.-ChaCha20_avx512 ___ } @@ -2544,8 +2581,10 @@ $code.=<<___; .type ChaCha20_16x,\@function,5 .align 32 ChaCha20_16x: +.cfi_startproc .LChaCha20_16x: mov %rsp,%r9 # frame register +.cfi_def_cfa_register %r9 sub \$64+$xframe,%rsp and \$-64,%rsp ___ @@ -2963,8 +3002,10 @@ $code.=<<___ if ($win64); ___ $code.=<<___; lea (%r9),%rsp +.cfi_def_cfa_register %rsp .L16x_epilogue: ret +.cfi_endproc .size ChaCha20_16x,.-ChaCha20_16x ___ } diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl index ff4efb3..a397019 100755 --- a/crypto/poly1305/asm/poly1305-x86_64.pl +++ b/crypto/poly1305/asm/poly1305-x86_64.pl @@ -210,16 +210,23 @@ $code.=<<___; .type poly1305_blocks,\@function,4 .align 32 poly1305_blocks: +.cfi_startproc .Lblocks: shr \$4,$len jz .Lno_data # too short push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lblocks_body: mov $len,%r15 # reassign $len @@ -255,15 +262,23 @@ $code.=<<___; mov $h2,16($ctx) mov 0(%rsp),%r15 +.cfi_restore %r15 mov 8(%rsp),%r14 +.cfi_restore %r14 mov 16(%rsp),%r13 +.cfi_restore %r13 mov 24(%rsp),%r12 +.cfi_restore %r12 mov 32(%rsp),%rbp +.cfi_restore %rbp mov 40(%rsp),%rbx +.cfi_restore %rbx lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lno_data: .Lblocks_epilogue: ret +.cfi_endproc .size poly1305_blocks,.-poly1305_blocks .type poly1305_emit,\@function,3 @@ -484,6 +499,7 @@ __poly1305_init_avx: .type poly1305_blocks_avx,\@function,4 .align 32 poly1305_blocks_avx: +.cfi_startproc mov 20($ctx),%r8d # is_base2_26 cmp \$128,$len jae .Lblocks_avx @@ -503,11 +519,17 @@ poly1305_blocks_avx: jz .Leven_avx push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lblocks_avx_body: mov $len,%r15 # reassign $len @@ -610,24 +632,39 @@ poly1305_blocks_avx: .align 16 .Ldone_avx: mov 0(%rsp),%r15 +.cfi_restore %r15 mov 8(%rsp),%r14 +.cfi_restore %r14 mov 16(%rsp),%r13 +.cfi_restore %r13 mov 24(%rsp),%r12 +.cfi_restore %r12 mov 32(%rsp),%rbp +.cfi_restore %rbp mov 40(%rsp),%rbx +.cfi_restore %rbx lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lno_data_avx: .Lblocks_avx_epilogue: ret +.cfi_endproc .align 32 
.Lbase2_64_avx: +.cfi_startproc push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lbase2_64_avx_body: mov $len,%r15 # reassign $len @@ -687,18 +724,27 @@ poly1305_blocks_avx: mov %r15,$len mov 0(%rsp),%r15 +.cfi_restore %r15 mov 8(%rsp),%r14 +.cfi_restore %r14 mov 16(%rsp),%r13 +.cfi_restore %r13 mov 24(%rsp),%r12 +.cfi_restore %r12 mov 32(%rsp),%rbp +.cfi_restore %rbp mov 40(%rsp),%rbx +.cfi_restore %rbx lea 48(%rsp),%rax lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lbase2_64_avx_epilogue: jmp .Ldo_avx +.cfi_endproc .align 32 .Leven_avx: +.cfi_startproc vmovd 4*0($ctx),$H0 # load hash value vmovd 4*1($ctx),$H1 vmovd 4*2($ctx),$H2 @@ -709,6 +755,7 @@ poly1305_blocks_avx: ___ $code.=<<___ if (!$win64); lea -0x58(%rsp),%r11 +.cfi_def_cfa %r11,0x60 sub \$0x178,%rsp ___ $code.=<<___ if ($win64); @@ -1301,10 +1348,12 @@ $code.=<<___ if ($win64); ___ $code.=<<___ if (!$win64); lea 0x58(%r11),%rsp +.cfi_def_cfa %rsp,8 ___ $code.=<<___; vzeroupper ret +.cfi_endproc .size poly1305_blocks_avx,.-poly1305_blocks_avx .type poly1305_emit_avx,\@function,3 @@ -1372,6 +1421,7 @@ $code.=<<___; .type poly1305_blocks_avx2,\@function,4 .align 32 poly1305_blocks_avx2: +.cfi_startproc mov 20($ctx),%r8d # is_base2_26 cmp \$128,$len jae .Lblocks_avx2 @@ -1391,11 +1441,17 @@ poly1305_blocks_avx2: jz .Leven_avx2 push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lblocks_avx2_body: mov $len,%r15 # reassign $len @@ -1504,24 +1560,39 @@ poly1305_blocks_avx2: .align 16 .Ldone_avx2: mov 0(%rsp),%r15 +.cfi_restore %r15 mov 8(%rsp),%r14 +.cfi_restore %r14 mov 16(%rsp),%r13 +.cfi_restore %r13 mov 24(%rsp),%r12 +.cfi_restore %r12 mov 32(%rsp),%rbp +.cfi_restore %rbp mov 40(%rsp),%rbx +.cfi_restore %rbx lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lno_data_avx2: .Lblocks_avx2_epilogue: ret +.cfi_endproc .align 32 .Lbase2_64_avx2: +.cfi_startproc push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lbase2_64_avx2_body: mov $len,%r15 # reassign $len @@ -1588,18 +1659,27 @@ poly1305_blocks_avx2: mov \$`(1<<31|1<<30|1<<16)`,%r11d mov 0(%rsp),%r15 +.cfi_restore %r15 mov 8(%rsp),%r14 +.cfi_restore %r14 mov 16(%rsp),%r13 +.cfi_restore %r13 mov 24(%rsp),%r12 +.cfi_restore %r12 mov 32(%rsp),%rbp +.cfi_restore %rbp mov 40(%rsp),%rbx +.cfi_restore %rbx lea 48(%rsp),%rax lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lbase2_64_avx2_epilogue: jmp .Ldo_avx2 +.cfi_endproc .align 32 .Leven_avx2: +.cfi_startproc mov OPENSSL_ia32cap_P+8(%rip),%r10d mov \$`(1<<31|1<<30|1<<16)`,%r11d vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 @@ -1620,6 +1700,7 @@ $code.=<<___ if ($avx>2); ___ $code.=<<___ if (!$win64); lea -8(%rsp),%r11 +.cfi_def_cfa %r11,16 sub \$0x128,%rsp ___ $code.=<<___ if ($win64); @@ -2008,10 +2089,12 @@ $code.=<<___ if ($win64); ___ $code.=<<___ if (!$win64); lea 8(%r11),%rsp +.cfi_def_cfa %rsp,8 ___ $code.=<<___; vzeroupper ret +.cfi_endproc .size poly1305_blocks_avx2,.-poly1305_blocks_avx2 ___ ####################################################################### @@ -2031,11 +2114,13 @@ $code.=<<___; .type poly1305_blocks_avx512,\@function,4 .align 32 poly1305_blocks_avx512: +.cfi_startproc .Lblocks_avx512: vzeroupper ___ $code.=<<___ if (!$win64); lea -8(%rsp),%r11 +.cfi_def_cfa %r11,16 sub \$0x128,%rsp ___ 
$code.=<<___ if ($win64); @@ -2044,13 +2129,13 @@ $code.=<<___ if ($win64); vmovdqa %xmm6,0x50(%r11) vmovdqa %xmm7,0x60(%r11) vmovdqa %xmm8,0x70(%r11) - vmovdqa %xmm9,0x80(%r11) - vmovdqa %xmm10,0x90(%r11) - vmovdqa %xmm11,0xa0(%r11) - vmovdqa %xmm12,0xb0(%r11) - vmovdqa %xmm13,0xc0(%r11) - vmovdqa %xmm14,0xd0(%r11) - vmovdqa %xmm15,0xe0(%r11) + vmovdqa32 %xmm9,0x80(%r11) + vmovdqa32 %xmm10,0x90(%r11) + vmovdqa32 %xmm11,0xa0(%r11) + vmovdqa32 %xmm12,0xb0(%r11) + vmovdqa32 %xmm13,0xc0(%r11) + vmovdqa32 %xmm14,0xd0(%r11) + vmovdqa32 %xmm15,0xe0(%r11) .Ldo_avx512_body: ___ $code.=<<___; @@ -2213,36 +2298,21 @@ $code.=<<___; # we could just flow along, hence the goal for $R0-$S4 is # 1858286838784888 ... - mov \$0b0110011001100110,%eax - mov \$0b1100110011001100,%r8d - mov \$0b0101010101010101,%r9d + vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: + mov \$0x7777,%eax kmovw %eax,%k1 - kmovw %r8d,%k2 - kmovw %r9d,%k3 - - vpbroadcastq %x#$D0,$M0 # 0808080808080808 - vpbroadcastq %x#$D1,$M1 - vpbroadcastq %x#$D2,$M2 - vpbroadcastq %x#$D3,$M3 - vpbroadcastq %x#$D4,$M4 - - vpexpandd $D0,${D0}{%k1} # 05060708 -> -05--06--07--08- - vpexpandd $D1,${D1}{%k1} - vpexpandd $D2,${D2}{%k1} - vpexpandd $D3,${D3}{%k1} - vpexpandd $D4,${D4}{%k1} - - vpexpandd $R0,${D0}{%k2} # -05--06--07--08- -> 145-246-347-448- - vpexpandd $R1,${D1}{%k2} - vpexpandd $R2,${D2}{%k2} - vpexpandd $R3,${D3}{%k2} - vpexpandd $R4,${D4}{%k2} - - vpblendmd $M0,$D0,${R0}{%k3} # 1858286838784888 - vpblendmd $M1,$D1,${R1}{%k3} - vpblendmd $M2,$D2,${R2}{%k3} - vpblendmd $M3,$D3,${R3}{%k3} - vpblendmd $M4,$D4,${R4}{%k3} + + vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- + vpermd $R1,$M0,$R1 + vpermd $R2,$M0,$R2 + vpermd $R3,$M0,$R3 + vpermd $R4,$M0,$R4 + + vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 + vpermd $D1,$M0,${R1}{%k1} + vpermd $D2,$M0,${R2}{%k1} + vpermd $D3,$M0,${R3}{%k1} + vpermd $D4,$M0,${R4}{%k1} vpslld \$2,$R1,$S1 # *5 vpslld \$2,$R2,$S2 @@ -2264,15 +2334,14 @@ $code.=<<___; vpsrlq \$40,$T4,$T4 # 4 vpandq $MASK,$T2,$T2 # 2 vpandq $MASK,$T0,$T0 # 0 - vpandq $MASK,$T1,$T1 # 1 - vpandq $MASK,$T3,$T3 # 3 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 #vporq $PADBIT,$T4,$T4 # padbit, yes, always vpaddq $H2,$T2,$H2 # accumulate input - mov \$0x0f,%eax sub \$192,$len jbe .Ltail_avx512 - jmp .Loop_avx512 + #jmp .Loop_avx512 .align 32 .Loop_avx512: @@ -2307,7 +2376,9 @@ $code.=<<___; vpmuludq $H2,$R1,$D3 # d3 = h2*r1 vpaddq $H0,$T0,$H0 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpandq $MASK,$T1,$T1 # 1 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T3,$T3 # 3 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 vporq $PADBIT,$T4,$T4 # padbit, yes, always vpmuludq $H2,$R0,$D2 # d2 = h2*r0 @@ -2415,8 +2486,8 @@ $code.=<<___; vpaddq $D3,$H4,$H4 # h3 -> h4 vpandq $MASK,$T0,$T0 # 0 - vpandq $MASK,$T1,$T1 # 1 - vpandq $MASK,$T3,$T3 # 3 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 #vporq $PADBIT,$T4,$T4 # padbit, yes, always sub \$128,$len @@ -2448,7 +2519,9 @@ $code.=<<___; vpmuludq $H2,$R1,$D3 # d3 = h2*r1 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T1,$T1 # 1 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vpandq $MASK,$T3,$T3 # 3 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 vporq $PADBIT,$T4,$T4 # padbit, yes, always vpaddq $H1,$T1,$H1 # accumulate input @@ -2622,9 +2695,11 @@ $code.=<<___ if ($win64); ___ $code.=<<___ if (!$win64); lea 8(%r11),%rsp +.cfi_def_cfa %rsp,8 ___ $code.=<<___; ret +.cfi_endproc .size poly1305_blocks_avx512,.-poly1305_blocks_avx512 ___ if ($avx>3) { @@ -2832,6 +2907,8 @@ $code.=<<___; 
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 .Lpermd_avx2: .long 2,2,2,3,2,0,2,1 +.Lpermd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 .L2_44_inp_permd: .long 0,1,1,2,2,3,7,7 diff --git a/crypto/whrlpool/asm/wp-x86_64.pl b/crypto/whrlpool/asm/wp-x86_64.pl index d0b7ecc..4a1261d 100644 --- a/crypto/whrlpool/asm/wp-x86_64.pl +++ b/crypto/whrlpool/asm/wp-x86_64.pl @@ -66,13 +66,21 @@ $code=<<___; .type $func,\@function,3 .align 16 $func: +.cfi_startproc mov %rsp,%rax +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 sub \$128+40,%rsp and \$-64,%rsp @@ -82,6 +90,7 @@ $func: mov %rsi,8(%r10) mov %rdx,16(%r10) mov %rax,32(%r10) # saved stack pointer +.cfi_cfa_expression %rsp+`128+32`,deref,+8 .Lprologue: mov %r10,%rbx @@ -205,15 +214,24 @@ $code.=<<___; jmp .Louterloop .Lalldone: mov 32(%rbx),%rsi # restore saved pointer +.cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: ret +.cfi_endproc .size $func,.-$func .align 64 diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl index 3082253..e08e1c4 100644 --- a/crypto/x86_64cpuid.pl +++ b/crypto/x86_64cpuid.pl @@ -63,7 +63,9 @@ OPENSSL_rdtsc: .type OPENSSL_ia32_cpuid,\@function,1 .align 16 OPENSSL_ia32_cpuid: +.cfi_startproc mov %rbx,%r8 # save %rbx +.cfi_register %rbx,%r8 xor %eax,%eax mov %eax,8(%rdi) # clear 3rd word @@ -194,8 +196,10 @@ OPENSSL_ia32_cpuid: shl \$32,%r9 mov %r10d,%eax mov %r8,%rbx # restore %rbx +.cfi_restore %rbx or %r9,%rax ret +.cfi_endproc .size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid .globl OPENSSL_cleanse _____ openssl-commits mailing list To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-commits