The branch master has been updated
       via  e052083cc7620379b7119cdbe4def5ea5de65c18 (commit)
       via  5e32cfb2b6aec4d8d80083dabbd25bf89a482f21 (commit)
       via  fa62bc4661960a593a77d2c3f260173c3aa7333d (commit)
       via  49508b23ce929ad5c8381bdc4b397eb41fd06137 (commit)
       via  1c47e8836f4213251957254764886e82ac2563bc (commit)
       via  f17652e5f9198941ce761da2ccc6ce584fd90e81 (commit)
      from  26a556e778f167070037fee243d7e6b9800fdb7f (commit)


- Log -----------------------------------------------------------------
commit e052083cc7620379b7119cdbe4def5ea5de65c18
Author: Andy Polyakov <ap...@openssl.org>
Date:   Sat Feb 25 18:37:24 2017 +0100

    poly1305/asm/poly1305-x86_64.pl: minor AVX512 optimization.
    
    Reviewed-by: Rich Salz <rs...@openssl.org>

commit 5e32cfb2b6aec4d8d80083dabbd25bf89a482f21
Author: Andy Polyakov <ap...@openssl.org>
Date:   Sat Feb 25 22:17:21 2017 +0100

    crypto/x86_64cpuid.pl: add CFI annotations.
    
    Reviewed-by: Rich Salz <rs...@openssl.org>

commit fa62bc4661960a593a77d2c3f260173c3aa7333d
Author: Andy Polyakov <ap...@openssl.org>
Date:   Sat Feb 25 22:16:57 2017 +0100

    whrlpool/asm/wp-x86_64.pl: add CFI annotations.
    
    Reviewed-by: Rich Salz <rs...@openssl.org>

commit 49508b23ce929ad5c8381bdc4b397eb41fd06137
Author: Andy Polyakov <ap...@openssl.org>
Date:   Sat Feb 25 22:16:38 2017 +0100

    camellia/asm/cmll-x86_64.pl: add CFI annotations.
    
    Reviewed-by: Rich Salz <rs...@openssl.org>

commit 1c47e8836f4213251957254764886e82ac2563bc
Author: Andy Polyakov <ap...@openssl.org>
Date:   Sat Feb 25 19:37:02 2017 +0100

    poly1305/asm/poly1305-x86_64.pl: add CFI annotations.
    
    Reviewed-by: Rich Salz <rs...@openssl.org>

commit f17652e5f9198941ce761da2ccc6ce584fd90e81
Author: Andy Polyakov <ap...@openssl.org>
Date:   Sat Feb 25 19:36:43 2017 +0100

    chacha/asm/chacha-x86_64.pl: add CFI annotations.
    
    Reviewed-by: Rich Salz <rs...@openssl.org>

-----------------------------------------------------------------------

Summary of changes:
 crypto/camellia/asm/cmll-x86_64.pl     |  57 ++++++++++++
 crypto/chacha/asm/chacha-x86_64.pl     |  41 +++++++++
 crypto/poly1305/asm/poly1305-x86_64.pl | 161 ++++++++++++++++++++++++---------
 crypto/whrlpool/asm/wp-x86_64.pl       |  18 ++++
 crypto/x86_64cpuid.pl                  |   4 +
 5 files changed, 239 insertions(+), 42 deletions(-)
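
A note on the recurring pattern in the five CFI commits: every
instruction that moves the stack pointer or saves a callee-saved
register gets a matching DWARF call-frame directive, so unwinders and
profilers can walk through these hand-coded routines. .cfi_push and
.cfi_cfa_expression are not native gas directives but perlasm
shorthands, presumably expanded by crypto/perlasm/x86_64-xlate.pl into
the standard forms. A minimal sketch in plain gas of what the
prologue/epilogue pattern amounts to, assuming .cfi_push expands to an
offset adjustment plus a save-slot rule (demo is a made-up function,
not from the diff):

	.text
	.globl	demo
	.type	demo,@function
demo:
.cfi_startproc				# open an FDE for this function
	push	%rbx
.cfi_adjust_cfa_offset	8		# CFA is now %rsp+16
.cfi_offset	%rbx,-16		# caller's %rbx lives at CFA-16
	xor	%eax,%eax		# ... function body ...
	mov	0(%rsp),%rbx		# reload without popping
.cfi_restore	%rbx			# %rbx holds the caller's value again
	lea	8(%rsp),%rsp
.cfi_adjust_cfa_offset	-8		# frame released, CFA back to %rsp+8
	ret
.cfi_endproc
.size	demo,.-demo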

diff --git a/crypto/camellia/asm/cmll-x86_64.pl b/crypto/camellia/asm/cmll-x86_64.pl
index da5ad7b..02c52c3 100644
--- a/crypto/camellia/asm/cmll-x86_64.pl
+++ b/crypto/camellia/asm/cmll-x86_64.pl
@@ -137,11 +137,17 @@ Camellia_EncryptBlock:
 .align 16
 .Lenc_rounds:
 Camellia_EncryptBlock_Rounds:
+.cfi_startproc
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lenc_prologue:
 
        #mov    %rsi,$inp               # put away arguments
@@ -173,13 +179,20 @@ Camellia_EncryptBlock_Rounds:
        mov     @S[3],12($out)
 
        mov     0(%rsp),%r15
+.cfi_restore   %r15
        mov     8(%rsp),%r14
+.cfi_restore   %r14
        mov     16(%rsp),%r13
+.cfi_restore   %r13
        mov     24(%rsp),%rbp
+.cfi_restore   %rbp
        mov     32(%rsp),%rbx
+.cfi_restore   %rbx
        lea     40(%rsp),%rsp
+.cfi_adjust_cfa_offset -40
 .Lenc_epilogue:
        ret
+.cfi_endproc
 .size  Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
 
 .type  _x86_64_Camellia_encrypt,\@abi-omnipotent
@@ -247,11 +260,17 @@ Camellia_DecryptBlock:
 .align 16
 .Ldec_rounds:
 Camellia_DecryptBlock_Rounds:
+.cfi_startproc
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Ldec_prologue:
 
        #mov    %rsi,$inp               # put away arguments
@@ -283,13 +302,20 @@ Camellia_DecryptBlock_Rounds:
        mov     @S[3],12($out)
 
        mov     0(%rsp),%r15
+.cfi_restore   %r15
        mov     8(%rsp),%r14
+.cfi_restore   %r14
        mov     16(%rsp),%r13
+.cfi_restore   %r13
        mov     24(%rsp),%rbp
+.cfi_restore   %rbp
        mov     32(%rsp),%rbx
+.cfi_restore   %rbx
        lea     40(%rsp),%rsp
+.cfi_adjust_cfa_offset -40
 .Ldec_epilogue:
        ret
+.cfi_endproc
 .size  Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
 
 .type  _x86_64_Camellia_decrypt,\@abi-omnipotent
@@ -409,11 +435,17 @@ $code.=<<___;
 .type  Camellia_Ekeygen,\@function,3
 .align 16
 Camellia_Ekeygen:
+.cfi_startproc
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lkey_prologue:
 
        mov     %edi,${keyend}d         # put away arguments, keyBitLength
@@ -573,13 +605,20 @@ $code.=<<___;
        mov     \$4,%eax
 .Ldone:
        mov     0(%rsp),%r15
+.cfi_restore   %r15
        mov     8(%rsp),%r14
+.cfi_restore   %r14
        mov     16(%rsp),%r13
+.cfi_restore   %r13
        mov     24(%rsp),%rbp
+.cfi_restore   %rbp
        mov     32(%rsp),%rbx
+.cfi_restore   %rbx
        lea     40(%rsp),%rsp
+.cfi_adjust_cfa_offset -40
 .Lkey_epilogue:
        ret
+.cfi_endproc
 .size  Camellia_Ekeygen,.-Camellia_Ekeygen
 ___
 }
@@ -637,17 +676,25 @@ $code.=<<___;
 .type  Camellia_cbc_encrypt,\@function,6
 .align 16
 Camellia_cbc_encrypt:
+.cfi_startproc
        cmp     \$0,%rdx
        je      .Lcbc_abort
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lcbc_prologue:
 
        mov     %rsp,%rbp
+.cfi_def_cfa_register  %rbp
        sub     \$64,%rsp
        and     \$-64,%rsp
 
@@ -668,6 +715,7 @@ Camellia_cbc_encrypt:
 
        mov     %r8,$_ivp
        mov     %rbp,$_rsp
+.cfi_cfa_expression    $_rsp,deref,+56
 
 .Lcbc_body:
        lea     .LCamellia_SBOX(%rip),$Tbl
@@ -856,15 +904,24 @@ Camellia_cbc_encrypt:
 .align 16
 .Lcbc_done:
        mov     $_rsp,%rcx
+.cfi_def_cfa   %rcx,56
        mov     0(%rcx),%r15
+.cfi_restore   %r15
        mov     8(%rcx),%r14
+.cfi_restore   %r14
        mov     16(%rcx),%r13
+.cfi_restore   %r13
        mov     24(%rcx),%r12
+.cfi_restore   %r12
        mov     32(%rcx),%rbp
+.cfi_restore   %rbp
        mov     40(%rcx),%rbx
+.cfi_restore   %rbx
        lea     48(%rcx),%rsp
+.cfi_def_cfa   %rsp,8
 .Lcbc_abort:
        ret
+.cfi_endproc
 .size  Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
 
 .asciz "Camellia for x86_64 by <appro\@openssl.org>"
diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl
index 7fc1749..b59d96f 100755
--- a/crypto/chacha/asm/chacha-x86_64.pl
+++ b/crypto/chacha/asm/chacha-x86_64.pl
@@ -242,6 +242,7 @@ $code.=<<___;
 .type  ChaCha20_ctr32,\@function,5
 .align 64
 ChaCha20_ctr32:
+.cfi_startproc
        cmp     \$0,$len
        je      .Lno_data
        mov     OPENSSL_ia32cap_P+4(%rip),%r10
@@ -255,12 +256,19 @@ $code.=<<___;
        jnz     .LChaCha20_ssse3
 
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
        sub     \$64+24,%rsp
+.cfi_adjust_cfa_offset 64+24
 .Lctr32_body:
 
        #movdqa .Lsigma(%rip),%xmm0
@@ -401,15 +409,24 @@ $code.=<<___;
 
 .Ldone:
        lea     64+24+48(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
        mov     -48(%rsi),%r15
+.cfi_restore   %r15
        mov     -40(%rsi),%r14
+.cfi_restore   %r14
        mov     -32(%rsi),%r13
+.cfi_restore   %r13
        mov     -24(%rsi),%r12
+.cfi_restore   %r12
        mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
        mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
        lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lno_data:
        ret
+.cfi_endproc
 .size  ChaCha20_ctr32,.-ChaCha20_ctr32
 ___
 
@@ -448,8 +465,10 @@ $code.=<<___;
 .type  ChaCha20_ssse3,\@function,5
 .align 32
 ChaCha20_ssse3:
+.cfi_startproc
 .LChaCha20_ssse3:
        mov     %rsp,%r9                # frame pointer
+.cfi_def_cfa_register  %r9
 ___
 $code.=<<___   if ($avx);
        test    \$`1<<(43-32)`,%r10d
@@ -565,8 +584,10 @@ $code.=<<___       if ($win64);
 ___
 $code.=<<___;
        lea     (%r9),%rsp
+.cfi_def_cfa_register  %rsp
 .Lssse3_epilogue:
        ret
+.cfi_endproc
 .size  ChaCha20_ssse3,.-ChaCha20_ssse3
 ___
 }
@@ -708,8 +729,10 @@ $code.=<<___;
 .type  ChaCha20_4x,\@function,5
 .align 32
 ChaCha20_4x:
+.cfi_startproc
 .LChaCha20_4x:
        mov             %rsp,%r9                # frame pointer
+.cfi_def_cfa_register  %r9
        mov             %r10,%r11
 ___
 $code.=<<___   if ($avx>1);
@@ -1149,8 +1172,10 @@ $code.=<<___     if ($win64);
 ___
 $code.=<<___;
        lea             (%r9),%rsp
+.cfi_def_cfa_register  %rsp
 .L4x_epilogue:
        ret
+.cfi_endproc
 .size  ChaCha20_4x,.-ChaCha20_4x
 ___
 }
@@ -1237,8 +1262,10 @@ $code.=<<___;
 .type  ChaCha20_4xop,\@function,5
 .align 32
 ChaCha20_4xop:
+.cfi_startproc
 .LChaCha20_4xop:
        mov             %rsp,%r9                # frame pointer
+.cfi_def_cfa_register  %r9
        sub             \$0x140+$xframe,%rsp
 ___
        ################ stack layout
@@ -1601,8 +1628,10 @@ $code.=<<___     if ($win64);
 ___
 $code.=<<___;
        lea             (%r9),%rsp
+.cfi_def_cfa_register  %rsp
 .L4xop_epilogue:
        ret
+.cfi_endproc
 .size  ChaCha20_4xop,.-ChaCha20_4xop
 ___
 }
@@ -1735,8 +1764,10 @@ $code.=<<___;
 .type  ChaCha20_8x,\@function,5
 .align 32
 ChaCha20_8x:
+.cfi_startproc
 .LChaCha20_8x:
        mov             %rsp,%r9                # frame register
+.cfi_def_cfa_register  %r9
        sub             \$0x280+$xframe,%rsp
        and             \$-32,%rsp
 ___
@@ -2242,8 +2273,10 @@ $code.=<<___     if ($win64);
 ___
 $code.=<<___;
        lea             (%r9),%rsp
+.cfi_def_cfa_register  %rsp
 .L8x_epilogue:
        ret
+.cfi_endproc
 .size  ChaCha20_8x,.-ChaCha20_8x
 ___
 }
@@ -2280,8 +2313,10 @@ $code.=<<___;
 .type  ChaCha20_avx512,\@function,5
 .align 32
 ChaCha20_avx512:
+.cfi_startproc
 .LChaCha20_avx512:
        mov     %rsp,%r9                # frame pointer
+.cfi_def_cfa_register  %r9
        cmp     \$512,$len
        ja      .LChaCha20_16x
 
@@ -2461,8 +2496,10 @@ $code.=<<___     if ($win64);
 ___
 $code.=<<___;
        lea     (%r9),%rsp
+.cfi_def_cfa_register  %rsp
 .Lavx512_epilogue:
        ret
+.cfi_endproc
 .size  ChaCha20_avx512,.-ChaCha20_avx512
 ___
 }
@@ -2544,8 +2581,10 @@ $code.=<<___;
 .type  ChaCha20_16x,\@function,5
 .align 32
 ChaCha20_16x:
+.cfi_startproc
 .LChaCha20_16x:
        mov             %rsp,%r9                # frame register
+.cfi_def_cfa_register  %r9
        sub             \$64+$xframe,%rsp
        and             \$-64,%rsp
 ___
@@ -2963,8 +3002,10 @@ $code.=<<___     if ($win64);
 ___
 $code.=<<___;
        lea             (%r9),%rsp
+.cfi_def_cfa_register  %rsp
 .L16x_epilogue:
        ret
+.cfi_endproc
 .size  ChaCha20_16x,.-ChaCha20_16x
 ___
 }
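
All the SIMD paths in chacha-x86_64.pl share one convention: %rsp is
copied to %r9 before the frame is carved out and aligned, the CFA is
retargeted to %r9 for the whole body, and the single lea (%r9),%rsp
exit flips it back. That keeps the CFI trivial across the \$-64
alignment, at the cost of keeping %r9 live throughout. The shape, in
plain assembler outside the perl here-doc and with a made-up frame
size:

	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9		# CFA = %r9+8 from here on
	sub	$0x140,%rsp
	and	$-64,%rsp		# alignment is invisible to the unwinder
	# ... body; %r9 must survive ...
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp		# back to the entry-state CFA
	ret
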
diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl
index ff4efb3..a397019 100755
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ b/crypto/poly1305/asm/poly1305-x86_64.pl
@@ -210,16 +210,23 @@ $code.=<<___;
 .type  poly1305_blocks,\@function,4
 .align 32
 poly1305_blocks:
+.cfi_startproc
 .Lblocks:
        shr     \$4,$len
        jz      .Lno_data               # too short
 
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lblocks_body:
 
        mov     $len,%r15               # reassign $len
@@ -255,15 +262,23 @@ $code.=<<___;
        mov     $h2,16($ctx)
 
        mov     0(%rsp),%r15
+.cfi_restore   %r15
        mov     8(%rsp),%r14
+.cfi_restore   %r14
        mov     16(%rsp),%r13
+.cfi_restore   %r13
        mov     24(%rsp),%r12
+.cfi_restore   %r12
        mov     32(%rsp),%rbp
+.cfi_restore   %rbp
        mov     40(%rsp),%rbx
+.cfi_restore   %rbx
        lea     48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
 .Lno_data:
 .Lblocks_epilogue:
        ret
+.cfi_endproc
 .size  poly1305_blocks,.-poly1305_blocks
 
 .type  poly1305_emit,\@function,3
@@ -484,6 +499,7 @@ __poly1305_init_avx:
 .type  poly1305_blocks_avx,\@function,4
 .align 32
 poly1305_blocks_avx:
+.cfi_startproc
        mov     20($ctx),%r8d           # is_base2_26
        cmp     \$128,$len
        jae     .Lblocks_avx
@@ -503,11 +519,17 @@ poly1305_blocks_avx:
        jz      .Leven_avx
 
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lblocks_avx_body:
 
        mov     $len,%r15               # reassign $len
@@ -610,24 +632,39 @@ poly1305_blocks_avx:
 .align 16
 .Ldone_avx:
        mov     0(%rsp),%r15
+.cfi_restore   %r15
        mov     8(%rsp),%r14
+.cfi_restore   %r14
        mov     16(%rsp),%r13
+.cfi_restore   %r13
        mov     24(%rsp),%r12
+.cfi_restore   %r12
        mov     32(%rsp),%rbp
+.cfi_restore   %rbp
        mov     40(%rsp),%rbx
+.cfi_restore   %rbx
        lea     48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
 .Lno_data_avx:
 .Lblocks_avx_epilogue:
        ret
+.cfi_endproc
 
 .align 32
 .Lbase2_64_avx:
+.cfi_startproc
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lbase2_64_avx_body:
 
        mov     $len,%r15               # reassign $len
@@ -687,18 +724,27 @@ poly1305_blocks_avx:
        mov     %r15,$len
 
        mov     0(%rsp),%r15
+.cfi_restore   %r15
        mov     8(%rsp),%r14
+.cfi_restore   %r14
        mov     16(%rsp),%r13
+.cfi_restore   %r13
        mov     24(%rsp),%r12
+.cfi_restore   %r12
        mov     32(%rsp),%rbp
+.cfi_restore   %rbp
        mov     40(%rsp),%rbx
+.cfi_restore   %rbx
        lea     48(%rsp),%rax
        lea     48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
 .Lbase2_64_avx_epilogue:
        jmp     .Ldo_avx
+.cfi_endproc
 
 .align 32
 .Leven_avx:
+.cfi_startproc
        vmovd           4*0($ctx),$H0           # load hash value
        vmovd           4*1($ctx),$H1
        vmovd           4*2($ctx),$H2
@@ -709,6 +755,7 @@ poly1305_blocks_avx:
 ___
 $code.=<<___   if (!$win64);
        lea             -0x58(%rsp),%r11
+.cfi_def_cfa           %r11,0x60
        sub             \$0x178,%rsp
 ___
 $code.=<<___   if ($win64);
@@ -1301,10 +1348,12 @@ $code.=<<___    if ($win64);
 ___
 $code.=<<___   if (!$win64);
        lea             0x58(%r11),%rsp
+.cfi_def_cfa           %rsp,8
 ___
 $code.=<<___;
        vzeroupper
        ret
+.cfi_endproc
 .size  poly1305_blocks_avx,.-poly1305_blocks_avx
 
 .type  poly1305_emit_avx,\@function,3
@@ -1372,6 +1421,7 @@ $code.=<<___;
 .type  poly1305_blocks_avx2,\@function,4
 .align 32
 poly1305_blocks_avx2:
+.cfi_startproc
        mov     20($ctx),%r8d           # is_base2_26
        cmp     \$128,$len
        jae     .Lblocks_avx2
@@ -1391,11 +1441,17 @@ poly1305_blocks_avx2:
        jz      .Leven_avx2
 
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lblocks_avx2_body:
 
        mov     $len,%r15               # reassign $len
@@ -1504,24 +1560,39 @@ poly1305_blocks_avx2:
 .align 16
 .Ldone_avx2:
        mov     0(%rsp),%r15
+.cfi_restore   %r15
        mov     8(%rsp),%r14
+.cfi_restore   %r14
        mov     16(%rsp),%r13
+.cfi_restore   %r13
        mov     24(%rsp),%r12
+.cfi_restore   %r12
        mov     32(%rsp),%rbp
+.cfi_restore   %rbp
        mov     40(%rsp),%rbx
+.cfi_restore   %rbx
        lea     48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
 .Lno_data_avx2:
 .Lblocks_avx2_epilogue:
        ret
+.cfi_endproc
 
 .align 32
 .Lbase2_64_avx2:
+.cfi_startproc
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lbase2_64_avx2_body:
 
        mov     $len,%r15               # reassign $len
@@ -1588,18 +1659,27 @@ poly1305_blocks_avx2:
        mov     \$`(1<<31|1<<30|1<<16)`,%r11d
 
        mov     0(%rsp),%r15
+.cfi_restore   %r15
        mov     8(%rsp),%r14
+.cfi_restore   %r14
        mov     16(%rsp),%r13
+.cfi_restore   %r13
        mov     24(%rsp),%r12
+.cfi_restore   %r12
        mov     32(%rsp),%rbp
+.cfi_restore   %rbp
        mov     40(%rsp),%rbx
+.cfi_restore   %rbx
        lea     48(%rsp),%rax
        lea     48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
 .Lbase2_64_avx2_epilogue:
        jmp     .Ldo_avx2
+.cfi_endproc
 
 .align 32
 .Leven_avx2:
+.cfi_startproc
        mov             OPENSSL_ia32cap_P+8(%rip),%r10d
        mov             \$`(1<<31|1<<30|1<<16)`,%r11d
        vmovd           4*0($ctx),%x#$H0        # load hash value base 2^26
@@ -1620,6 +1700,7 @@ $code.=<<___              if ($avx>2);
 ___
 $code.=<<___   if (!$win64);
        lea             -8(%rsp),%r11
+.cfi_def_cfa           %r11,16
        sub             \$0x128,%rsp
 ___
 $code.=<<___   if ($win64);
@@ -2008,10 +2089,12 @@ $code.=<<___    if ($win64);
 ___
 $code.=<<___   if (!$win64);
        lea             8(%r11),%rsp
+.cfi_def_cfa           %rsp,8
 ___
 $code.=<<___;
        vzeroupper
        ret
+.cfi_endproc
 .size  poly1305_blocks_avx2,.-poly1305_blocks_avx2
 ___
 #######################################################################
@@ -2031,11 +2114,13 @@ $code.=<<___;
 .type  poly1305_blocks_avx512,\@function,4
 .align 32
 poly1305_blocks_avx512:
+.cfi_startproc
 .Lblocks_avx512:
        vzeroupper
 ___
 $code.=<<___   if (!$win64);
        lea             -8(%rsp),%r11
+.cfi_def_cfa           %r11,16
        sub             \$0x128,%rsp
 ___
 $code.=<<___   if ($win64);
@@ -2044,13 +2129,13 @@ $code.=<<___    if ($win64);
        vmovdqa         %xmm6,0x50(%r11)
        vmovdqa         %xmm7,0x60(%r11)
        vmovdqa         %xmm8,0x70(%r11)
-       vmovdqa         %xmm9,0x80(%r11)
-       vmovdqa         %xmm10,0x90(%r11)
-       vmovdqa         %xmm11,0xa0(%r11)
-       vmovdqa         %xmm12,0xb0(%r11)
-       vmovdqa         %xmm13,0xc0(%r11)
-       vmovdqa         %xmm14,0xd0(%r11)
-       vmovdqa         %xmm15,0xe0(%r11)
+       vmovdqa32       %xmm9,0x80(%r11)
+       vmovdqa32       %xmm10,0x90(%r11)
+       vmovdqa32       %xmm11,0xa0(%r11)
+       vmovdqa32       %xmm12,0xb0(%r11)
+       vmovdqa32       %xmm13,0xc0(%r11)
+       vmovdqa32       %xmm14,0xd0(%r11)
+       vmovdqa32       %xmm15,0xe0(%r11)
 .Ldo_avx512_body:
 ___
 $code.=<<___;
@@ -2213,36 +2298,21 @@ $code.=<<___;
        # we could just flow along, hence the goal for $R0-$S4 is
        # 1858286838784888 ...
 
-       mov             \$0b0110011001100110,%eax
-       mov             \$0b1100110011001100,%r8d
-       mov             \$0b0101010101010101,%r9d
+       vmovdqa32       128(%rcx),$M0           # .Lpermd_avx512:
+       mov             \$0x7777,%eax
        kmovw           %eax,%k1
-       kmovw           %r8d,%k2
-       kmovw           %r9d,%k3
-
-       vpbroadcastq    %x#$D0,$M0      # 0808080808080808
-       vpbroadcastq    %x#$D1,$M1
-       vpbroadcastq    %x#$D2,$M2
-       vpbroadcastq    %x#$D3,$M3
-       vpbroadcastq    %x#$D4,$M4
-
-       vpexpandd       $D0,${D0}{%k1}  # 05060708 -> -05--06--07--08-
-       vpexpandd       $D1,${D1}{%k1}
-       vpexpandd       $D2,${D2}{%k1}
-       vpexpandd       $D3,${D3}{%k1}
-       vpexpandd       $D4,${D4}{%k1}
-
-       vpexpandd       $R0,${D0}{%k2}  # -05--06--07--08- -> 145-246-347-448-
-       vpexpandd       $R1,${D1}{%k2}
-       vpexpandd       $R2,${D2}{%k2}
-       vpexpandd       $R3,${D3}{%k2}
-       vpexpandd       $R4,${D4}{%k2}
-
-       vpblendmd       $M0,$D0,${R0}{%k3}      # 1858286838784888
-       vpblendmd       $M1,$D1,${R1}{%k3}
-       vpblendmd       $M2,$D2,${R2}{%k3}
-       vpblendmd       $M3,$D3,${R3}{%k3}
-       vpblendmd       $M4,$D4,${R4}{%k3}
+
+       vpermd          $R0,$M0,$R0             # 14243444 -> 1---2---3---4---
+       vpermd          $R1,$M0,$R1
+       vpermd          $R2,$M0,$R2
+       vpermd          $R3,$M0,$R3
+       vpermd          $R4,$M0,$R4
+
+       vpermd          $D0,$M0,${R0}{%k1}      # 05060708 -> 1858286838784888
+       vpermd          $D1,$M0,${R1}{%k1}
+       vpermd          $D2,$M0,${R2}{%k1}
+       vpermd          $D3,$M0,${R3}{%k1}
+       vpermd          $D4,$M0,${R4}{%k1}
 
        vpslld          \$2,$R1,$S1             # *5
        vpslld          \$2,$R2,$S2
@@ -2264,15 +2334,14 @@ $code.=<<___;
        vpsrlq          \$40,$T4,$T4            # 4
        vpandq          $MASK,$T2,$T2           # 2
        vpandq          $MASK,$T0,$T0           # 0
-       vpandq          $MASK,$T1,$T1           # 1
-       vpandq          $MASK,$T3,$T3           # 3
+       #vpandq         $MASK,$T1,$T1           # 1
+       #vpandq         $MASK,$T3,$T3           # 3
        #vporq          $PADBIT,$T4,$T4         # padbit, yes, always
 
        vpaddq          $H2,$T2,$H2             # accumulate input
-       mov             \$0x0f,%eax
        sub             \$192,$len
        jbe             .Ltail_avx512
-       jmp             .Loop_avx512
+       #jmp            .Loop_avx512
 
 .align 32
 .Loop_avx512:
@@ -2307,7 +2376,9 @@ $code.=<<___;
        vpmuludq        $H2,$R1,$D3             # d3 = h2*r1
         vpaddq         $H0,$T0,$H0
        vpmuludq        $H2,$R2,$D4             # d4 = h2*r2
+        vpandq         $MASK,$T1,$T1           # 1
        vpmuludq        $H2,$S3,$D0             # d0 = h2*s3
+        vpandq         $MASK,$T3,$T3           # 3
        vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
         vporq          $PADBIT,$T4,$T4         # padbit, yes, always
        vpmuludq        $H2,$R0,$D2             # d2 = h2*r0
@@ -2415,8 +2486,8 @@ $code.=<<___;
        vpaddq          $D3,$H4,$H4             # h3 -> h4
 
         vpandq         $MASK,$T0,$T0           # 0
-        vpandq         $MASK,$T1,$T1           # 1
-        vpandq         $MASK,$T3,$T3           # 3
+        #vpandq        $MASK,$T1,$T1           # 1
+        #vpandq        $MASK,$T3,$T3           # 3
         #vporq         $PADBIT,$T4,$T4         # padbit, yes, always
 
        sub             \$128,$len
@@ -2448,7 +2519,9 @@ $code.=<<___;
        vpmuludq        $H2,$R1,$D3             # d3 = h2*r1
        vpmuludq        $H2,$R2,$D4             # d4 = h2*r2
        vpmuludq        $H2,$S3,$D0             # d0 = h2*s3
+        vpandq         $MASK,$T1,$T1           # 1
        vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
+        vpandq         $MASK,$T3,$T3           # 3
        vpmuludq        $H2,$R0,$D2             # d2 = h2*r0
         vporq          $PADBIT,$T4,$T4         # padbit, yes, always
         vpaddq         $H1,$T1,$H1             # accumulate input
@@ -2622,9 +2695,11 @@ $code.=<<___     if ($win64);
 ___
 $code.=<<___   if (!$win64);
        lea             8(%r11),%rsp
+.cfi_def_cfa           %rsp,8
 ___
 $code.=<<___;
        ret
+.cfi_endproc
 .size  poly1305_blocks_avx512,.-poly1305_blocks_avx512
 ___
 if ($avx>3) {
@@ -2832,6 +2907,8 @@ $code.=<<___;
 .long  0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
 .Lpermd_avx2:
 .long  2,2,2,3,2,0,2,1
+.Lpermd_avx512:
+.long  0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
 
 .L2_44_inp_permd:
 .long  0,1,1,2,2,3,7,7
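
Beyond the CFI work, commit e052083 reshapes how the AVX512 path
interleaves the powers of r. The old sequence needed three opmask
registers and a vpbroadcastq/vpexpandd/vpblendmd chain per vector; the
new one loads a single permutation (.Lpermd_avx512) and reaches the
1858286838784888 layout with one plain vpermd plus one merge-masked
vpermd under %k1 = 0x7777. The vpandq lane-masking of $T1/$T3 is also
deferred from the load phase into the vpmuludq chains, presumably to
hide it behind the multiplies, and the Win64 spills of %xmm9-%xmm15
move to the EVEX-encoded vmovdqa32 form. A standalone sketch of the
merge-masking semantics, with made-up register assignments:

	mov	$0x7777,%eax
	kmovw	%eax,%k1		# per group of four dword lanes:
					# low three written, top one kept
	vpermd	%zmm1,%zmm0,%zmm2	# unconditional permute
	vpermd	%zmm3,%zmm0,%zmm2{%k1}	# merge: lanes with a clear %k1 bit
					# keep %zmm2's previous contents
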
diff --git a/crypto/whrlpool/asm/wp-x86_64.pl b/crypto/whrlpool/asm/wp-x86_64.pl
index d0b7ecc..4a1261d 100644
--- a/crypto/whrlpool/asm/wp-x86_64.pl
+++ b/crypto/whrlpool/asm/wp-x86_64.pl
@@ -66,13 +66,21 @@ $code=<<___;
 .type  $func,\@function,3
 .align 16
 $func:
+.cfi_startproc
        mov     %rsp,%rax
+.cfi_def_cfa_register  %rax
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 
        sub     \$128+40,%rsp
        and     \$-64,%rsp
@@ -82,6 +90,7 @@ $func:
        mov     %rsi,8(%r10)
        mov     %rdx,16(%r10)
        mov     %rax,32(%r10)           # saved stack pointer
+.cfi_cfa_expression    %rsp+`128+32`,deref,+8
 .Lprologue:
 
        mov     %r10,%rbx
@@ -205,15 +214,24 @@ $code.=<<___;
        jmp     .Louterloop
 .Lalldone:
        mov     32(%rbx),%rsi           # restore saved pointer
+.cfi_def_cfa   %rsi,8
        mov     -48(%rsi),%r15
+.cfi_restore   %r15
        mov     -40(%rsi),%r14
+.cfi_restore   %r14
        mov     -32(%rsi),%r13
+.cfi_restore   %r13
        mov     -24(%rsi),%r12
+.cfi_restore   %r12
        mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
        mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
        lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lepilogue:
        ret
+.cfi_endproc
 .size  $func,.-$func
 
 .align 64
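
wp-x86_64.pl uses a variant of the camellia trick: the CFA is moved to
%rax before any pushes (so the pushes themselves leave it untouched),
and once the entry %rsp reaches the 32(%r10) slot the CFA becomes an
expression off the aligned stack pointer: 128+32 bytes up, dereferenced,
plus 8 for the return address. The arithmetic, assuming entry %rsp = S
and that %r10 points 128 bytes above the aligned %rsp:

	# entry:  %rsp = S, CFA = S+8
	mov	%rsp,%rax		# %rax = S; CFA tracked via %rax
	# ... six pushes, then sub $128+40 and a 64-byte align ...
	mov	%rax,32(%r10)		# slot at %rsp+160 holds S
					# so CFA = *(%rsp+128+32) + 8
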
diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl
index 3082253..e08e1c4 100644
--- a/crypto/x86_64cpuid.pl
+++ b/crypto/x86_64cpuid.pl
@@ -63,7 +63,9 @@ OPENSSL_rdtsc:
 .type  OPENSSL_ia32_cpuid,\@function,1
 .align 16
 OPENSSL_ia32_cpuid:
+.cfi_startproc
        mov     %rbx,%r8                # save %rbx
+.cfi_register  %rbx,%r8
 
        xor     %eax,%eax
        mov     %eax,8(%rdi)            # clear 3rd word
@@ -194,8 +196,10 @@ OPENSSL_ia32_cpuid:
        shl     \$32,%r9
        mov     %r10d,%eax
        mov     %r8,%rbx                # restore %rbx
+.cfi_restore   %rbx
        or      %r9,%rax
        ret
+.cfi_endproc
 .size  OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
 
 .globl  OPENSSL_cleanse
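
OPENSSL_ia32_cpuid is the one routine in the series that never builds a
frame: the only callee-saved register it needs, %rbx, is parked in %r8
instead of on the stack. .cfi_register (a native gas directive) tells
the unwinder that the caller's %rbx currently lives in %r8, and
.cfi_restore cancels that rule once the value is moved back; the CFA
never changes. A self-contained sketch (demo_cpuid is made up):

	.globl	demo_cpuid
	.type	demo_cpuid,@function
demo_cpuid:
.cfi_startproc
	mov	%rbx,%r8		# save %rbx without touching %rsp
.cfi_register	%rbx,%r8		# caller's %rbx is now in %r8
	xor	%eax,%eax
	cpuid				# clobbers %ebx
	mov	%r8,%rbx		# put it back
.cfi_restore	%rbx			# default rule applies again
	ret
.cfi_endproc
.size	demo_cpuid,.-demo_cpuid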