The branch master has been updated
       via  76e624a003db22db2d99ece04a15e20fe44c1fbe (commit)
       via  a3b5684fc1d4f3aabdf68dcf6c577f6dd24d2b2d (commit)
       via  9d301cfea7181766b79ba31ed257d30fb84b1b0f (commit)
       via  e09b6216a5423555271509acf5112da5484ec15d (commit)
       via  53b33100769aa8801d6fd2caf155c7cb04d64dfc (commit)
      from  fbc9eeaaa32ba1416d6cb2794201f440bbaeb629 (commit)


- Log -----------------------------------------------------------------
commit 76e624a003db22db2d99ece04a15e20fe44c1fbe
Author: Andy Polyakov <ap...@openssl.org>
Date:   Wed Feb 8 10:12:28 2017 +0100

    bn/asm/x86_64*: add DWARF CFI directives.
    
    Reviewed-by: Rich Salz <rs...@openssl.org>

commit a3b5684fc1d4f3aabdf68dcf6c577f6dd24d2b2d
Author: Andy Polyakov <ap...@openssl.org>
Date:   Mon Feb 6 08:58:34 2017 +0100

    perlasm/x86_64-xlate.pl: recognize DWARF CFI directives.
    
    CFI directives annotate instructions that are significant for the
    stack unwinding procedure. In addition to the directives recognized
    by the GNU assembler, this module implements three synthetic ones:
    
    - .cfi_push annotates push instructions in the prologue and
      translates to .cfi_adjust_cfa_offset (if needed) and .cfi_offset,
      as sketched below;
    - .cfi_pop annotates pop instructions in the epilogue and translates
      to .cfi_adjust_cfa_offset (if needed) and .cfi_restore;
    - .cfi_cfa_expression encodes DW_CFA_def_cfa_expression and passes
      it to .cfi_escape as a byte vector;
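    
    For instance, the first push in a prologue, where the CFA is still
    %rsp+8, is annotated and expanded roughly as follows [an
    illustrative sketch of the translation]:
    
        push    %rbx
        .cfi_push       %rbx    # emits .cfi_adjust_cfa_offset 8
                                # and   .cfi_offset %rbx,-16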
    
    The CFA expression syntax is a made-up mix of DWARF operator
    suffixes [a subset of them] and references to registers with an
    optional bias. The following example describes the off-loaded
    original stack pointer at a specific offset from the current stack
    pointer:
    
        .cfi_cfa_expression     %rsp+40,deref,+8
    
    The final +8 reflects the fact that the CFA, the Canonical Frame
    Address, is a reference to the top of the caller's stack, and on
    x86_64 a call to a subroutine pushes an 8-byte return address.
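    
    For reference, the translator encodes the example above as the
    escape sequence sketched below: 0x0f is DW_CFA_def_cfa_expression,
    0x05 is the expression length, 0x77,0x28 encode
    DW_OP_breg7(%rsp)+40, 0x06 is DW_OP_deref, and 0x23,0x08 encode
    DW_OP_plus_uconst 8:
    
        .cfi_escape     0x0f,0x05,0x77,0x28,0x06,0x23,0x08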
    
    Triggered by request from Adam Langley.
    
    Reviewed-by: Rich Salz <rs...@openssl.org>

commit 9d301cfea7181766b79ba31ed257d30fb84b1b0f
Author: Andy Polyakov <ap...@openssl.org>
Date:   Sat Feb 4 13:26:15 2017 +0100

    perlasm/x86_64-xlate.pl: remove obsolete .picmeup synthetic directive.
    
    Reviewed-by: Rich Salz <rs...@openssl.org>

commit e09b6216a5423555271509acf5112da5484ec15d
Author: Andy Polyakov <ap...@openssl.org>
Date:   Sat Feb 4 13:24:42 2017 +0100

    perlasm/x86_64-xlate.pl: minor readability updates.
    
    Reviewed-by: Rich Salz <rs...@openssl.org>

commit 53b33100769aa8801d6fd2caf155c7cb04d64dfc
Author: Andy Polyakov <ap...@openssl.org>
Date:   Wed Feb 8 10:09:21 2017 +0100

    bn/asm/rsaz-avx2.pl: refine Win64 SE handler.
    
    Reviewed-by: Rich Salz <rs...@openssl.org>

-----------------------------------------------------------------------

Summary of changes:
 crypto/bn/asm/rsaz-avx2.pl     |  57 ++++++-
 crypto/bn/asm/rsaz-x86_64.pl   |  85 +++++++++++
 crypto/bn/asm/x86_64-gf2m.pl   |  18 +++
 crypto/bn/asm/x86_64-mont.pl   |  72 +++++++++
 crypto/bn/asm/x86_64-mont5.pl  | 111 +++++++++++++-
 crypto/perlasm/x86_64-xlate.pl | 327 +++++++++++++++++++++++++++++++++++------
 6 files changed, 614 insertions(+), 56 deletions(-)

diff --git a/crypto/bn/asm/rsaz-avx2.pl b/crypto/bn/asm/rsaz-avx2.pl
index e620285..f45c214 100755
--- a/crypto/bn/asm/rsaz-avx2.pl
+++ b/crypto/bn/asm/rsaz-avx2.pl
@@ -168,13 +168,21 @@ $code.=<<___;
 .type  rsaz_1024_sqr_avx2,\@function,5
 .align 64
 rsaz_1024_sqr_avx2:            # 702 cycles, 14% faster than rsaz_1024_mul_avx2
+.cfi_startproc
        lea     (%rsp), %rax
+.cfi_def_cfa_register  %rax
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
        vzeroupper
 ___
 $code.=<<___ if ($win64);
@@ -193,6 +201,7 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
        mov     %rax,%rbp
+.cfi_def_cfa_register  %rbp
        mov     %rdx, $np                       # reassigned argument
        sub     \$$FrameSize, %rsp
        mov     $np, $tmp
@@ -825,8 +834,10 @@ $code.=<<___;
 
        vzeroall
        mov     %rbp, %rax
+.cfi_def_cfa_register  %rax
 ___
 $code.=<<___ if ($win64);
+.Lsqr_1024_in_tail:
        movaps  -0xd8(%rax),%xmm6
        movaps  -0xc8(%rax),%xmm7
        movaps  -0xb8(%rax),%xmm8
@@ -840,14 +851,22 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
        mov     -48(%rax),%r15
+.cfi_restore   %r15
        mov     -40(%rax),%r14
+.cfi_restore   %r14
        mov     -32(%rax),%r13
+.cfi_restore   %r13
        mov     -24(%rax),%r12
+.cfi_restore   %r12
        mov     -16(%rax),%rbp
+.cfi_restore   %rbp
        mov     -8(%rax),%rbx
+.cfi_restore   %rbx
        lea     (%rax),%rsp             # restore %rsp
+.cfi_def_cfa_register  %rsp
 .Lsqr_1024_epilogue:
        ret
+.cfi_endproc
 .size  rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
 ___
 }
@@ -900,13 +919,21 @@ $code.=<<___;
 .type  rsaz_1024_mul_avx2,\@function,5
 .align 64
 rsaz_1024_mul_avx2:
+.cfi_startproc
        lea     (%rsp), %rax
+.cfi_def_cfa_register  %rax
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 ___
 $code.=<<___ if ($win64);
        vzeroupper
@@ -925,6 +952,7 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
        mov     %rax,%rbp
+.cfi_def_cfa_register  %rbp
        vzeroall
        mov     %rdx, $bp       # reassigned argument
        sub     \$64,%rsp
@@ -1458,8 +1486,10 @@ $code.=<<___;
        vzeroupper
 
        mov     %rbp, %rax
+.cfi_def_cfa_register  %rax
 ___
 $code.=<<___ if ($win64);
+.Lmul_1024_in_tail:
        movaps  -0xd8(%rax),%xmm6
        movaps  -0xc8(%rax),%xmm7
        movaps  -0xb8(%rax),%xmm8
@@ -1473,14 +1503,22 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
        mov     -48(%rax),%r15
+.cfi_restore   %r15
        mov     -40(%rax),%r14
+.cfi_restore   %r14
        mov     -32(%rax),%r13
+.cfi_restore   %r13
        mov     -24(%rax),%r12
+.cfi_restore   %r12
        mov     -16(%rax),%rbp
+.cfi_restore   %rbp
        mov     -8(%rax),%rbx
+.cfi_restore   %rbx
        lea     (%rax),%rsp             # restore %rsp
+.cfi_def_cfa_register  %rsp
 .Lmul_1024_epilogue:
        ret
+.cfi_endproc
 .size  rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
 ___
 }
@@ -1599,8 +1637,10 @@ rsaz_1024_scatter5_avx2:
 .type  rsaz_1024_gather5_avx2,\@abi-omnipotent
 .align 32
 rsaz_1024_gather5_avx2:
+.cfi_startproc
        vzeroupper
        mov     %rsp,%r11
+.cfi_def_cfa_register  %r11
 ___
 $code.=<<___ if ($win64);
        lea     -0x88(%rsp),%rax
@@ -1741,7 +1781,9 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
        lea     (%r11),%rsp
+.cfi_def_cfa_register  %rsp
        ret
+.cfi_endproc
 .LSEH_end_rsaz_1024_gather5:
 .size  rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
 ___
@@ -1815,14 +1857,17 @@ rsaz_se_handler:
        cmp     %r10,%rbx               # context->Rip<prologue label
        jb      .Lcommon_seh_tail
 
-       mov     152($context),%rax      # pull context->Rsp
-
        mov     4(%r11),%r10d           # HandlerData[1]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=epilogue label
        jae     .Lcommon_seh_tail
 
-       mov     160($context),%rax      # pull context->Rbp
+       mov     160($context),%rbp      # pull context->Rbp
+
+       mov     8(%r11),%r10d           # HandlerData[2]
+       lea     (%rsi,%r10),%r10        # "in tail" label
+       cmp     %r10,%rbx               # context->Rip>="in tail" label
+       cmovc   %rbp,%rax
 
        mov     -48(%rax),%r15
        mov     -40(%rax),%r14
@@ -1900,11 +1945,13 @@ rsaz_se_handler:
 .LSEH_info_rsaz_1024_sqr_avx2:
        .byte   9,0,0,0
        .rva    rsaz_se_handler
-       .rva    .Lsqr_1024_body,.Lsqr_1024_epilogue
+       .rva    .Lsqr_1024_body,.Lsqr_1024_epilogue,.Lsqr_1024_in_tail
+       .long   0
 .LSEH_info_rsaz_1024_mul_avx2:
        .byte   9,0,0,0
        .rva    rsaz_se_handler
-       .rva    .Lmul_1024_body,.Lmul_1024_epilogue
+       .rva    .Lmul_1024_body,.Lmul_1024_epilogue,.Lmul_1024_in_tail
+       .long   0
 .LSEH_info_rsaz_1024_gather5:
        .byte   0x01,0x36,0x17,0x0b
        .byte   0x36,0xf8,0x09,0x00     # vmovaps 0x90(rsp),xmm15
diff --git a/crypto/bn/asm/rsaz-x86_64.pl b/crypto/bn/asm/rsaz-x86_64.pl
index 7bcfafe..1b88026 100755
--- a/crypto/bn/asm/rsaz-x86_64.pl
+++ b/crypto/bn/asm/rsaz-x86_64.pl
@@ -138,14 +138,22 @@ $code.=<<___;
 .type  rsaz_512_sqr,\@function,5
 .align 32
 rsaz_512_sqr:                          # 25-29% faster than rsaz_512_mul
+.cfi_startproc
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 
        subq    \$128+24, %rsp
+.cfi_adjust_cfa_offset 128+24
 .Lsqr_body:
        movq    $mod, %rbp              # common argument
        movq    ($inp), %rdx
@@ -800,15 +808,24 @@ ___
 $code.=<<___;
 
        leaq    128+24+48(%rsp), %rax
+.cfi_def_cfa   %rax,8
        movq    -48(%rax), %r15
+.cfi_restore   %r15
        movq    -40(%rax), %r14
+.cfi_restore   %r14
        movq    -32(%rax), %r13
+.cfi_restore   %r13
        movq    -24(%rax), %r12
+.cfi_restore   %r12
        movq    -16(%rax), %rbp
+.cfi_restore   %rbp
        movq    -8(%rax), %rbx
+.cfi_restore   %rbx
        leaq    (%rax), %rsp
+.cfi_def_cfa_register  %rsp
 .Lsqr_epilogue:
        ret
+.cfi_endproc
 .size  rsaz_512_sqr,.-rsaz_512_sqr
 ___
 }
@@ -819,14 +836,22 @@ $code.=<<___;
 .type  rsaz_512_mul,\@function,5
 .align 32
 rsaz_512_mul:
+.cfi_startproc
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 
        subq    \$128+24, %rsp
+.cfi_adjust_cfa_offset 128+24
 .Lmul_body:
        movq    $out, %xmm0             # off-load arguments
        movq    $mod, %xmm1
@@ -896,15 +921,24 @@ $code.=<<___;
        call    __rsaz_512_subtract
 
        leaq    128+24+48(%rsp), %rax
+.cfi_def_cfa   %rax,8
        movq    -48(%rax), %r15
+.cfi_restore   %r15
        movq    -40(%rax), %r14
+.cfi_restore   %r14
        movq    -32(%rax), %r13
+.cfi_restore   %r13
        movq    -24(%rax), %r12
+.cfi_restore   %r12
        movq    -16(%rax), %rbp
+.cfi_restore   %rbp
        movq    -8(%rax), %rbx
+.cfi_restore   %rbx
        leaq    (%rax), %rsp
+.cfi_def_cfa_register  %rsp
 .Lmul_epilogue:
        ret
+.cfi_endproc
 .size  rsaz_512_mul,.-rsaz_512_mul
 ___
 }
@@ -915,14 +949,22 @@ $code.=<<___;
 .type  rsaz_512_mul_gather4,\@function,6
 .align 32
 rsaz_512_mul_gather4:
+.cfi_startproc
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 
        subq    \$`128+24+($win64?0xb0:0)`, %rsp
+.cfi_adjust_cfa_offset `128+24+($win64?0xb0:0)`
 ___
 $code.=<<___   if ($win64);
        movaps  %xmm6,0xa0(%rsp)
@@ -1348,15 +1390,24 @@ $code.=<<___    if ($win64);
        lea     0xb0(%rax),%rax
 ___
 $code.=<<___;
+.cfi_def_cfa   %rax,8
        movq    -48(%rax), %r15
+.cfi_restore   %r15
        movq    -40(%rax), %r14
+.cfi_restore   %r14
        movq    -32(%rax), %r13
+.cfi_restore   %r13
        movq    -24(%rax), %r12
+.cfi_restore   %r12
        movq    -16(%rax), %rbp
+.cfi_restore   %rbp
        movq    -8(%rax), %rbx
+.cfi_restore   %rbx
        leaq    (%rax), %rsp
+.cfi_def_cfa_register  %rsp
 .Lmul_gather4_epilogue:
        ret
+.cfi_endproc
 .size  rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
 ___
 }
@@ -1367,15 +1418,23 @@ $code.=<<___;
 .type  rsaz_512_mul_scatter4,\@function,6
 .align 32
 rsaz_512_mul_scatter4:
+.cfi_startproc
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 
        mov     $pwr, $pwr
        subq    \$128+24, %rsp
+.cfi_adjust_cfa_offset 128+24
 .Lmul_scatter4_body:
        leaq    ($tbl,$pwr,8), $tbl
        movq    $out, %xmm0             # off-load arguments
@@ -1458,15 +1517,24 @@ $code.=<<___;
        movq    %r15, 128*7($inp)
 
        leaq    128+24+48(%rsp), %rax
+.cfi_def_cfa   %rax,8
        movq    -48(%rax), %r15
+.cfi_restore   %r15
        movq    -40(%rax), %r14
+.cfi_restore   %r14
        movq    -32(%rax), %r13
+.cfi_restore   %r13
        movq    -24(%rax), %r12
+.cfi_restore   %r12
        movq    -16(%rax), %rbp
+.cfi_restore   %rbp
        movq    -8(%rax), %rbx
+.cfi_restore   %rbx
        leaq    (%rax), %rsp
+.cfi_def_cfa_register  %rsp
 .Lmul_scatter4_epilogue:
        ret
+.cfi_endproc
 .size  rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
 ___
 }
@@ -1477,14 +1545,22 @@ $code.=<<___;
 .type  rsaz_512_mul_by_one,\@function,4
 .align 32
 rsaz_512_mul_by_one:
+.cfi_startproc
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 
        subq    \$128+24, %rsp
+.cfi_adjust_cfa_offset 128+24
 .Lmul_by_one_body:
 ___
 $code.=<<___ if ($addx);
@@ -1539,15 +1615,24 @@ $code.=<<___;
        movq    %r15, 56($out)
 
        leaq    128+24+48(%rsp), %rax
+.cfi_def_cfa   %rax,8
        movq    -48(%rax), %r15
+.cfi_restore   %r15
        movq    -40(%rax), %r14
+.cfi_restore   %r14
        movq    -32(%rax), %r13
+.cfi_restore   %r13
        movq    -24(%rax), %r12
+.cfi_restore   %r12
        movq    -16(%rax), %rbp
+.cfi_restore   %rbp
        movq    -8(%rax), %rbx
+.cfi_restore   %rbx
        leaq    (%rax), %rsp
+.cfi_def_cfa_register  %rsp
 .Lmul_by_one_epilogue:
        ret
+.cfi_endproc
 .size  rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
 ___
 }
diff --git a/crypto/bn/asm/x86_64-gf2m.pl b/crypto/bn/asm/x86_64-gf2m.pl
index d237c1e..0181f52 100644
--- a/crypto/bn/asm/x86_64-gf2m.pl
+++ b/crypto/bn/asm/x86_64-gf2m.pl
@@ -54,7 +54,9 @@ $code.=<<___;
 .type  _mul_1x1,\@abi-omnipotent
 .align 16
 _mul_1x1:
+.cfi_startproc
        sub     \$128+8,%rsp
+.cfi_adjust_cfa_offset 128+8
        mov     \$-1,$a1
        lea     ($a,$a),$i0
        shr     \$3,$a1
@@ -160,8 +162,10 @@ $code.=<<___;
        xor     $i1,$hi
 
        add     \$128+8,%rsp
+.cfi_adjust_cfa_offset -128-8
        ret
 .Lend_mul_1x1:
+.cfi_endproc
 .size  _mul_1x1,.-_mul_1x1
 ___
 
@@ -174,6 +178,7 @@ $code.=<<___;
 .type  bn_GF2m_mul_2x2,\@abi-omnipotent
 .align 16
 bn_GF2m_mul_2x2:
+.cfi_startproc
        mov     %rsp,%rax
        mov     OPENSSL_ia32cap_P(%rip),%r10
        bt      \$33,%r10
@@ -211,6 +216,7 @@ $code.=<<___;
 .align 16
 .Lvanilla_mul_2x2:
        lea     -8*17(%rsp),%rsp
+.cfi_adjust_cfa_offset 8*17
 ___
 $code.=<<___ if ($win64);
        mov     `8*17+40`(%rsp),$b0
@@ -219,10 +225,15 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
        mov     %r14,8*10(%rsp)
+.cfi_rel_offset        %r14,8*10
        mov     %r13,8*11(%rsp)
+.cfi_rel_offset        %r13,8*11
        mov     %r12,8*12(%rsp)
+.cfi_rel_offset        %r12,8*12
        mov     %rbp,8*13(%rsp)
+.cfi_rel_offset        %rbp,8*13
        mov     %rbx,8*14(%rsp)
+.cfi_rel_offset        %rbx,8*14
 .Lbody_mul_2x2:
        mov     $rp,32(%rsp)            # save the arguments
        mov     $a1,40(%rsp)
@@ -270,10 +281,15 @@ $code.=<<___;
        mov     $lo,8(%rbp)
 
        mov     8*10(%rsp),%r14
+.cfi_restore   %r14
        mov     8*11(%rsp),%r13
+.cfi_restore   %r13
        mov     8*12(%rsp),%r12
+.cfi_restore   %r12
        mov     8*13(%rsp),%rbp
+.cfi_restore   %rbp
        mov     8*14(%rsp),%rbx
+.cfi_restore   %rbx
 ___
 $code.=<<___ if ($win64);
        mov     8*15(%rsp),%rdi
@@ -281,9 +297,11 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
        lea     8*17(%rsp),%rsp
+.cfi_adjust_cfa_offset -8*17
 .Lepilogue_mul_2x2:
        ret
 .Lend_mul_2x2:
+.cfi_endproc
 .size  bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
 .asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 .align 16
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 7b5e885..cf3daa2 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -104,8 +104,10 @@ $code=<<___;
 .type  bn_mul_mont,\@function,6
 .align 16
 bn_mul_mont:
+.cfi_startproc
        mov     ${num}d,${num}d
        mov     %rsp,%rax
+.cfi_def_cfa_register  %rax
        test    \$3,${num}d
        jnz     .Lmul_enter
        cmp     \$8,${num}d
@@ -124,11 +126,17 @@ $code.=<<___;
 .align 16
 .Lmul_enter:
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 
        neg     $num
        mov     %rsp,%r11
@@ -161,6 +169,7 @@ $code.=<<___;
 .Lmul_page_walk_done:
 
        mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.cfi_cfa_expression    %rsp+8,$num,8,mul,plus,deref,+8
 .Lmul_body:
        mov     $bp,%r12                # reassign $bp
 ___
@@ -331,16 +340,25 @@ $code.=<<___;
        jnz     .Lcopy
 
        mov     8(%rsp,$num,8),%rsi     # restore %rsp
+.cfi_def_cfa   %rsi,8
        mov     \$1,%rax
        mov     -48(%rsi),%r15
+.cfi_restore   %r15
        mov     -40(%rsi),%r14
+.cfi_restore   %r14
        mov     -32(%rsi),%r13
+.cfi_restore   %r13
        mov     -24(%rsi),%r12
+.cfi_restore   %r12
        mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
        mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
        lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lmul_epilogue:
        ret
+.cfi_endproc
 .size  bn_mul_mont,.-bn_mul_mont
 ___
 {{{
@@ -350,8 +368,10 @@ $code.=<<___;
 .type  bn_mul4x_mont,\@function,6
 .align 16
 bn_mul4x_mont:
+.cfi_startproc
        mov     ${num}d,${num}d
        mov     %rsp,%rax
+.cfi_def_cfa_register  %rax
 .Lmul4x_enter:
 ___
 $code.=<<___ if ($addx);
@@ -361,11 +381,17 @@ $code.=<<___ if ($addx);
 ___
 $code.=<<___;
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 
        neg     $num
        mov     %rsp,%r11
@@ -389,6 +415,7 @@ $code.=<<___;
 .Lmul4x_page_walk_done:
 
        mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.cfi_cfa_expression    %rsp+8,$num,8,mul,plus,deref,+8
 .Lmul4x_body:
        mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
        mov     %rdx,%r12               # reassign $bp
@@ -767,16 +794,25 @@ ___
 }
 $code.=<<___;
        mov     8(%rsp,$num,8),%rsi     # restore %rsp
+.cfi_def_cfa   %rsi, 8
        mov     \$1,%rax
        mov     -48(%rsi),%r15
+.cfi_restore   %r15
        mov     -40(%rsi),%r14
+.cfi_restore   %r14
        mov     -32(%rsi),%r13
+.cfi_restore   %r13
        mov     -24(%rsi),%r12
+.cfi_restore   %r12
        mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
        mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
        lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lmul4x_epilogue:
        ret
+.cfi_endproc
 .size  bn_mul4x_mont,.-bn_mul4x_mont
 ___
 }}}
@@ -804,14 +840,22 @@ $code.=<<___;
 .type  bn_sqr8x_mont,\@function,6
 .align 32
 bn_sqr8x_mont:
+.cfi_startproc
        mov     %rsp,%rax
+.cfi_def_cfa_register  %rax
 .Lsqr8x_enter:
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lsqr8x_prologue:
 
        mov     ${num}d,%r10d
@@ -867,6 +911,7 @@ bn_sqr8x_mont:
 
        mov     $n0,  32(%rsp)
        mov     %rax, 40(%rsp)          # save original %rsp
+.cfi_cfa_expression    %rsp+40,deref,+8
 .Lsqr8x_body:
 
        movq    $nptr, %xmm2            # save pointer to modulus
@@ -936,6 +981,7 @@ $code.=<<___;
        pxor    %xmm0,%xmm0
        pshufd  \$0,%xmm1,%xmm1
        mov     40(%rsp),%rsi           # restore %rsp
+.cfi_def_cfa   %rsi,8
        jmp     .Lsqr8x_cond_copy
 
 .align 32
@@ -965,14 +1011,22 @@ $code.=<<___;
 
        mov     \$1,%rax
        mov     -48(%rsi),%r15
+.cfi_restore   %r15
        mov     -40(%rsi),%r14
+.cfi_restore   %r14
        mov     -32(%rsi),%r13
+.cfi_restore   %r13
        mov     -24(%rsi),%r12
+.cfi_restore   %r12
        mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
        mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
        lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lsqr8x_epilogue:
        ret
+.cfi_endproc
 .size  bn_sqr8x_mont,.-bn_sqr8x_mont
 ___
 }}}
@@ -984,14 +1038,22 @@ $code.=<<___;
 .type  bn_mulx4x_mont,\@function,6
 .align 32
 bn_mulx4x_mont:
+.cfi_startproc
        mov     %rsp,%rax
+.cfi_def_cfa_register  %rax
 .Lmulx4x_enter:
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lmulx4x_prologue:
 
        shl     \$3,${num}d             # convert $num to bytes
@@ -1037,6 +1099,7 @@ bn_mulx4x_mont:
        mov     $n0, 24(%rsp)           # save *n0
        mov     $rp, 32(%rsp)           # save $rp
        mov     %rax,40(%rsp)           # save original %rsp
+.cfi_cfa_expression    %rsp+40,deref,+8
        mov     $num,48(%rsp)           # inner counter
        jmp     .Lmulx4x_body
 
@@ -1286,6 +1349,7 @@ $code.=<<___;
        pxor    %xmm0,%xmm0
        pshufd  \$0,%xmm1,%xmm1
        mov     40(%rsp),%rsi           # restore %rsp
+.cfi_def_cfa   %rsi,8
        jmp     .Lmulx4x_cond_copy
 
 .align 32
@@ -1315,14 +1379,22 @@ $code.=<<___;
 
        mov     \$1,%rax
        mov     -48(%rsi),%r15
+.cfi_restore   %r15
        mov     -40(%rsi),%r14
+.cfi_restore   %r14
        mov     -32(%rsi),%r13
+.cfi_restore   %r13
        mov     -24(%rsi),%r12
+.cfi_restore   %r12
        mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
        mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
        lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lmulx4x_epilogue:
        ret
+.cfi_endproc
 .size  bn_mulx4x_mont,.-bn_mulx4x_mont
 ___
 }}}
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 226f436..1bf9942 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -93,8 +93,10 @@ $code=<<___;
 .type  bn_mul_mont_gather5,\@function,6
 .align 64
 bn_mul_mont_gather5:
+.cfi_startproc
        mov     ${num}d,${num}d
        mov     %rsp,%rax
+.cfi_def_cfa_register  %rax
        test    \$7,${num}d
        jnz     .Lmul_enter
 ___
@@ -108,11 +110,17 @@ $code.=<<___;
 .Lmul_enter:
        movd    `($win64?56:8)`(%rsp),%xmm5     # load 7th argument
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 
        neg     $num
        mov     %rsp,%r11
@@ -145,6 +153,7 @@ $code.=<<___;
 
        lea     .Linc(%rip),%r10
        mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.cfi_cfa_expression    %rsp+8,$num,8,mul,plus,deref,+8
 .Lmul_body:
 
        lea     128($bp),%r12           # reassign $bp (+size optimization)
@@ -431,17 +440,26 @@ $code.=<<___;
        jnz     .Lcopy
 
        mov     8(%rsp,$num,8),%rsi     # restore %rsp
+.cfi_def_cfa   %rsi,8
        mov     \$1,%rax
 
        mov     -48(%rsi),%r15
+.cfi_restore   %r15
        mov     -40(%rsi),%r14
+.cfi_restore   %r14
        mov     -32(%rsi),%r13
+.cfi_restore   %r13
        mov     -24(%rsi),%r12
+.cfi_restore   %r12
        mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
        mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
        lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lmul_epilogue:
        ret
+.cfi_endproc
 .size  bn_mul_mont_gather5,.-bn_mul_mont_gather5
 ___
 {{{
@@ -451,8 +469,10 @@ $code.=<<___;
 .type  bn_mul4x_mont_gather5,\@function,6
 .align 32
 bn_mul4x_mont_gather5:
+.cfi_startproc
        .byte   0x67
        mov     %rsp,%rax
+.cfi_def_cfa_register  %rax
 .Lmul4x_enter:
 ___
 $code.=<<___ if ($addx);
@@ -462,11 +482,17 @@ $code.=<<___ if ($addx);
 ___
 $code.=<<___;
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lmul4x_prologue:
 
        .byte   0x67
@@ -522,22 +548,32 @@ $code.=<<___;
        neg     $num
 
        mov     %rax,40(%rsp)
+.cfi_cfa_expression    %rsp+40,deref,+8
 .Lmul4x_body:
 
        call    mul4x_internal
 
        mov     40(%rsp),%rsi           # restore %rsp
+.cfi_def_cfa   %rsi,8
        mov     \$1,%rax
 
        mov     -48(%rsi),%r15
+.cfi_restore   %r15
        mov     -40(%rsi),%r14
+.cfi_restore   %r14
        mov     -32(%rsi),%r13
+.cfi_restore   %r13
        mov     -24(%rsi),%r12
+.cfi_restore   %r12
        mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
        mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
        lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lmul4x_epilogue:
        ret
+.cfi_endproc
 .size  bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
 
 .type  mul4x_internal,\@abi-omnipotent
@@ -1061,7 +1097,9 @@ $code.=<<___;
 .type  bn_power5,\@function,6
 .align 32
 bn_power5:
+.cfi_startproc
        mov     %rsp,%rax
+.cfi_def_cfa_register  %rax
 ___
 $code.=<<___ if ($addx);
        mov     OPENSSL_ia32cap_P+8(%rip),%r11d
@@ -1071,11 +1109,17 @@ $code.=<<___ if ($addx);
 ___
 $code.=<<___;
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lpower5_prologue:
 
        shl     \$3,${num}d             # convert $num to bytes
@@ -1140,6 +1184,7 @@ $code.=<<___;
        #
        mov     $n0,  32(%rsp)
        mov     %rax, 40(%rsp)          # save original %rsp
+.cfi_cfa_expression    %rsp+40,deref,+8
 .Lpower5_body:
        movq    $rptr,%xmm1             # save $rptr, used in sqr8x
        movq    $nptr,%xmm2             # save $nptr
@@ -1166,16 +1211,25 @@ $code.=<<___;
        call    mul4x_internal
 
        mov     40(%rsp),%rsi           # restore %rsp
+.cfi_def_cfa   %rsi,8
        mov     \$1,%rax
        mov     -48(%rsi),%r15
+.cfi_restore   %r15
        mov     -40(%rsi),%r14
+.cfi_restore   %r14
        mov     -32(%rsi),%r13
+.cfi_restore   %r13
        mov     -24(%rsi),%r12
+.cfi_restore   %r12
        mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
        mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
        lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lpower5_epilogue:
        ret
+.cfi_endproc
 .size  bn_power5,.-bn_power5
 
 .globl bn_sqr8x_internal
@@ -2055,14 +2109,22 @@ bn_from_montgomery:
 .type  bn_from_mont8x,\@function,6
 .align 32
 bn_from_mont8x:
+.cfi_startproc
        .byte   0x67
        mov     %rsp,%rax
+.cfi_def_cfa_register  %rax
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lfrom_prologue:
 
        shl     \$3,${num}d             # convert $num to bytes
@@ -2127,6 +2189,7 @@ bn_from_mont8x:
        #
        mov     $n0,  32(%rsp)
        mov     %rax, 40(%rsp)          # save original %rsp
+.cfi_cfa_expression    %rsp+40,deref,+8
 .Lfrom_body:
        mov     $num,%r11
        lea     48(%rsp),%rax
@@ -2170,7 +2233,6 @@ $code.=<<___ if ($addx);
 
        pxor    %xmm0,%xmm0
        lea     48(%rsp),%rax
-       mov     40(%rsp),%rsi           # restore %rsp
        jmp     .Lfrom_mont_zero
 
 .align 32
@@ -2182,11 +2244,12 @@ $code.=<<___;
 
        pxor    %xmm0,%xmm0
        lea     48(%rsp),%rax
-       mov     40(%rsp),%rsi           # restore %rsp
        jmp     .Lfrom_mont_zero
 
 .align 32
 .Lfrom_mont_zero:
+       mov     40(%rsp),%rsi           # restore %rsp
+.cfi_def_cfa   %rsi,8
        movdqa  %xmm0,16*0(%rax)
        movdqa  %xmm0,16*1(%rax)
        movdqa  %xmm0,16*2(%rax)
@@ -2197,14 +2260,22 @@ $code.=<<___;
 
        mov     \$1,%rax
        mov     -48(%rsi),%r15
+.cfi_restore   %r15
        mov     -40(%rsi),%r14
+.cfi_restore   %r14
        mov     -32(%rsi),%r13
+.cfi_restore   %r13
        mov     -24(%rsi),%r12
+.cfi_restore   %r12
        mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
        mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
        lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lfrom_epilogue:
        ret
+.cfi_endproc
 .size  bn_from_mont8x,.-bn_from_mont8x
 ___
 }
@@ -2217,14 +2288,22 @@ $code.=<<___;
 .type  bn_mulx4x_mont_gather5,\@function,6
 .align 32
 bn_mulx4x_mont_gather5:
+.cfi_startproc
        mov     %rsp,%rax
+.cfi_def_cfa_register  %rax
 .Lmulx4x_enter:
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lmulx4x_prologue:
 
        shl     \$3,${num}d             # convert $num to bytes
@@ -2290,21 +2369,31 @@ bn_mulx4x_mont_gather5:
        #
        mov     $n0, 32(%rsp)           # save *n0
        mov     %rax,40(%rsp)           # save original %rsp
+.cfi_cfa_expression    %rsp+40,deref,+8
 .Lmulx4x_body:
        call    mulx4x_internal
 
        mov     40(%rsp),%rsi           # restore %rsp
+.cfi_def_cfa   %rsi,8
        mov     \$1,%rax
 
        mov     -48(%rsi),%r15
+.cfi_restore   %r15
        mov     -40(%rsi),%r14
+.cfi_restore   %r14
        mov     -32(%rsi),%r13
+.cfi_restore   %r13
        mov     -24(%rsi),%r12
+.cfi_restore   %r12
        mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
        mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
        lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lmulx4x_epilogue:
        ret
+.cfi_endproc
 .size  bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
 
 .type  mulx4x_internal,\@abi-omnipotent
@@ -2682,14 +2771,22 @@ $code.=<<___;
 .type  bn_powerx5,\@function,6
 .align 32
 bn_powerx5:
+.cfi_startproc
        mov     %rsp,%rax
+.cfi_def_cfa_register  %rax
 .Lpowerx5_enter:
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lpowerx5_prologue:
 
        shl     \$3,${num}d             # convert $num to bytes
@@ -2761,6 +2858,7 @@ bn_powerx5:
        movq    $bptr,%xmm4
        mov     $n0,  32(%rsp)
        mov     %rax, 40(%rsp)          # save original %rsp
+.cfi_cfa_expression    %rsp+40,deref,+8
 .Lpowerx5_body:
 
        call    __bn_sqrx8x_internal
@@ -2783,17 +2881,26 @@ bn_powerx5:
        call    mulx4x_internal
 
        mov     40(%rsp),%rsi           # restore %rsp
+.cfi_def_cfa   %rsi,8
        mov     \$1,%rax
 
        mov     -48(%rsi),%r15
+.cfi_restore   %r15
        mov     -40(%rsi),%r14
+.cfi_restore   %r14
        mov     -32(%rsi),%r13
+.cfi_restore   %r13
        mov     -24(%rsi),%r12
+.cfi_restore   %r12
        mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
        mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
        lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lpowerx5_epilogue:
        ret
+.cfi_endproc
 .size  bn_powerx5,.-bn_powerx5
 
 .globl bn_sqrx8x_internal
diff --git a/crypto/perlasm/x86_64-xlate.pl b/crypto/perlasm/x86_64-xlate.pl
index dd8afe7..09d293a 100755
--- a/crypto/perlasm/x86_64-xlate.pl
+++ b/crypto/perlasm/x86_64-xlate.pl
@@ -51,12 +51,7 @@
 # 7. Stick to explicit ip-relative addressing. If you have to use
 #    GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??.
 #    Both are recognized and translated to proper Win64 addressing
-#    modes. To support legacy code a synthetic directive, .picmeup,
-#    is implemented. It puts address of the *next* instruction into
-#    target register, e.g.:
-#
-#              .picmeup        %rax
-#              lea             .Label-.(%rax),%rax
+#    modes.
 #
 # 8. In order to provide for structured exception handling unified
 #    Win64 prologue copies %rsp value to %rax. For further details
@@ -100,7 +95,7 @@ elsif (!$gas)
     {  $nasm = $1 + $2*0.01; $PTR="";  }
     elsif (`ml64 2>&1` =~ m/Version ([0-9]+)\.([0-9]+)(\.([0-9]+))?/)
     {  $masm = $1 + $2*2**-16 + $4*2**-32;   }
-    die "no assembler found on %PATH" if (!($nasm || $masm));
+    die "no assembler found on %PATH%" if (!($nasm || $masm));
     $win64=1;
     $elf=0;
     $decor="\$L\$";
@@ -223,6 +218,13 @@ my %globals;
     }
 }
 { package ea;          # pick up effective addresses: expr(%reg,%reg,scale)
+
+    my %szmap = (      b=>"BYTE$PTR",    w=>"WORD$PTR",
+                       l=>"DWORD$PTR",   d=>"DWORD$PTR",
+                       q=>"QWORD$PTR",   o=>"OWORD$PTR",
+                       x=>"XMMWORD$PTR", y=>"YMMWORD$PTR",
+                       z=>"ZMMWORD$PTR" ) if (!$gas);
+
     sub re {
        my      ($class, $line, $opcode) = @_;
        my      $self = {};
@@ -235,7 +237,7 @@ my %globals;
            $self->{label} = $2;
            ($self->{base},$self->{index},$self->{scale})=split(/,/,$3);
            $self->{scale} = 1 if (!defined($self->{scale}));
-           $self->{pred} = $4;
+           $self->{opmask} = $4;
            $ret = $self;
            $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
 
@@ -276,6 +278,8 @@ my %globals;
            $self->{label} =~ s/\b([0-9]+)\b/$1>>0/eg;
        }
 
+       # if base register is %rbp or %r13, see if it's possible to
+       # flip base and index registers [for better performance]
        if (!$self->{label} && $self->{index} && $self->{scale}==1 &&
            $self->{base} =~ /(rbp|r13)/) {
                $self->{base} = $self->{index}; $self->{index} = $1;
@@ -289,17 +293,12 @@ my %globals;
                                        $self->{asterisk},$self->{label},
                                        $self->{base}?"%$self->{base}":"",
                                        $self->{index},$self->{scale},
-                                       $self->{pred};
+                                       $self->{opmask};
            } else {
                sprintf "%s%s(%%%s)%s", $self->{asterisk},$self->{label},
-                                       $self->{base},$self->{pred};
+                                       $self->{base},$self->{opmask};
            }
        } else {
-           my %szmap = (       b=>"BYTE$PTR",  w=>"WORD$PTR",
-                       l=>"DWORD$PTR", d=>"DWORD$PTR",
-                       q=>"QWORD$PTR", o=>"OWORD$PTR",
-                       x=>"XMMWORD$PTR", y=>"YMMWORD$PTR", z=>"ZMMWORD$PTR" );
-
            $self->{label} =~ s/\./\$/g;
            $self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/ig;
            $self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/);
@@ -311,20 +310,20 @@ my %globals;
            ($mnemonic =~ /^vpbroadcast([qdwb])$/)      && ($sz=$1)  ||
            ($mnemonic =~ /^v(?!perm)[a-z]+[fi]128$/)   && ($sz="x");
 
-           $self->{pred}  =~ s/%(k[0-7])/$1/;
+           $self->{opmask}  =~ s/%(k[0-7])/$1/;
 
            if (defined($self->{index})) {
                sprintf "%s[%s%s*%d%s]%s",$szmap{$sz},
                                        $self->{label}?"$self->{label}+":"",
                                        $self->{index},$self->{scale},
                                        $self->{base}?"+$self->{base}":"",
-                                       $self->{pred};
+                                       $self->{opmask};
            } elsif ($self->{base} eq "rip") {
                sprintf "%s[%s]",$szmap{$sz},$self->{label};
            } else {
                sprintf "%s[%s%s]%s",   $szmap{$sz},
                                        $self->{label}?"$self->{label}+":"",
-                                       $self->{base},$self->{pred};
+                                       $self->{base},$self->{opmask};
            }
        }
     }
@@ -340,7 +339,7 @@ my %globals;
            bless $self,$class;
            $self->{asterisk} = $1;
            $self->{value} = $2;
-           $self->{pred} = $3;
+           $self->{opmask} = $3;
            $opcode->size($self->size());
            $ret = $self;
            $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
@@ -366,9 +365,9 @@ my %globals;
        my $self = shift;
        if ($gas)       { sprintf "%s%%%s%s",   $self->{asterisk},
                                                $self->{value},
-                                               $self->{pred}; }
-       else            { $self->{pred} =~ s/%(k[0-7])/$1/;
-                         $self->{value}.$self->{pred}; }
+                                               $self->{opmask}; }
+       else            { $self->{opmask} =~ s/%(k[0-7])/$1/;
+                         $self->{value}.$self->{opmask}; }
     }
 }
 { package label;       # pick up labels, which end with :
@@ -392,9 +391,8 @@ my %globals;
 
        if ($gas) {
            my $func = ($globals{$self->{value}} or $self->{value}) . ":";
-           if ($win64  &&
-                       $current_function->{name} eq $self->{value} &&
-                       $current_function->{abi} eq "svr4") {
+           if ($win64  && $current_function->{name} eq $self->{value}
+                       && $current_function->{abi} eq "svr4") {
                $func .= "\n";
                $func .= "      movq    %rdi,8(%rsp)\n";
                $func .= "      movq    %rsi,16(%rsp)\n";
@@ -467,21 +465,249 @@ my %globals;
        }
     }
 }
+{ package cfi_directive;
+    # CFI directives annotate instructions that are significant for
+    # the stack unwinding procedure compliant with the DWARF
+    # specification, see http://dwarfstd.org/. Besides the
+    # platform-specific filtering function naturally expected of this
+    # script, this module adds three auxiliary synthetic directives
+    # not recognized by the [GNU] assembler:
+    #
+    # - .cfi_push to annotate push instructions in the prologue, which
+    #   translates to .cfi_adjust_cfa_offset (if needed) and
+    #   .cfi_offset;
+    # - .cfi_pop to annotate pop instructions in the epilogue, which
+    #   translates to .cfi_adjust_cfa_offset (if needed) and
+    #   .cfi_restore;
+    # - [and most notably] .cfi_cfa_expression, which encodes
+    #   DW_CFA_def_cfa_expression and passes it to .cfi_escape as a
+    #   byte vector;
+    #
+    # CFA expressions were introduced in DWARF specification version
+    # 3 and describe how to deduce the CFA, the Canonical Frame
+    # Address. They come in handy when the stack frame is variable and
+    # no register can be spared for a [previous] frame pointer. The
+    # suggested directive syntax is a made-up mix of DWARF operator
+    # suffixes [a subset of them] and references to registers with an
+    # optional bias. The following example describes the off-loaded
+    # *original* stack pointer at a specific offset from the *current*
+    # stack pointer:
+    #
+    #   .cfi_cfa_expression     %rsp+40,deref,+8
+    #
+    # The final +8 reflects the fact that the CFA is defined as a
+    # reference to the top of the caller's stack, and on x86_64 a call
+    # to a subroutine pushes an 8-byte return address. In other words,
+    # the original stack pointer upon entry to a subroutine is 8 bytes
+    # off from the CFA.
+
+    # Below constants are taken from "DWARF Expressions" section of the
+    # DWARF specification, section is numbered 7.7 in versions 3 and 4.
+    my %DW_OP_simple = (       # no-arg operators, mapped directly
+       deref   => 0x06,        dup     => 0x12,
+       drop    => 0x13,        over    => 0x14,
+       pick    => 0x15,        swap    => 0x16,
+       rot     => 0x17,        xderef  => 0x18,
+
+       abs     => 0x19,        and     => 0x1a,
+       div     => 0x1b,        minus   => 0x1c,
+       mod     => 0x1d,        mul     => 0x1e,
+       neg     => 0x1f,        not     => 0x20,
+       or      => 0x21,        plus    => 0x22,
+       shl     => 0x24,        shr     => 0x25,
+       shra    => 0x26,        xor     => 0x27,
+       );
+
+    my %DW_OP_complex = (      # used in specific subroutines
+       constu          => 0x10,        # uleb128
+       consts          => 0x11,        # sleb128
+       plus_uconst     => 0x23,        # uleb128
+       lit0            => 0x30,        # add 0-31 to opcode
+       reg0            => 0x50,        # add 0-31 to opcode
+       breg0           => 0x70,        # add 0-31 to opcode, sleb128
+       regx            => 0x90,        # uleb128
+       fbreg           => 0x91,        # sleb128
+       bregx           => 0x92,        # uleb128, sleb128
+       piece           => 0x93,        # uleb128
+       );
+
+    # The following constants are defined in the x86_64 ABI supplement,
+    # available for example at
+    # https://www.uclibc.org/docs/psABI-x86_64.pdf, see section 3.7
+    # "Stack Unwind Algorithm".
+    my %DW_reg_idx = (
+       "%rax"=>0,  "%rdx"=>1,  "%rcx"=>2,  "%rbx"=>3,
+       "%rsi"=>4,  "%rdi"=>5,  "%rbp"=>6,  "%rsp"=>7,
+       "%r8" =>8,  "%r9" =>9,  "%r10"=>10, "%r11"=>11,
+       "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15
+       );
+
+    my ($cfa_reg, $cfa_rsp);
+
+    # [us]leb128 format is a variable-length integer representation in
+    # base 128 (seven bits per byte), with the most significant bit of
+    # each byte being 0 denoting the *last*, most significant digit.
+    # See "Variable Length Data" in the DWARF specification, numbered
+    # 7.6 at least in versions 3 and 4.
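+    #
+    # As an illustrative sketch of the encoding: uleb128(300) yields
+    # the bytes (0xac,0x02), i.e. 0x2c with the continuation bit set
+    # followed by 0x02, while sleb128(-8) is the single byte (0x78),
+    # since the remaining bits all equal its sign bit.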
+    sub sleb128 {
+       use integer;    # get right shift extend sign
+
+       my $val = shift;
+       my $sign = ($val < 0) ? -1 : 0;
+       my @ret = ();
+
+       while(1) {
+           push @ret, $val&0x7f;
+
+           # see if the remaining bits are all equal to the most
+           # significant bit of the current digit; if so, it's the
+           # last digit...
+           last if (($val>>6) == $sign);
+
+           @ret[-1] |= 0x80;
+           $val >>= 7;
+       }
+
+       return @ret;
+    }
+    sub uleb128 {
+       my $val = shift;
+       my @ret = ();
+
+       while(1) {
+           push @ret, $val&0x7f;
+
+           # see if it's the last significant digit...
+           last if (($val >>= 7) == 0);
+
+           @ret[-1] |= 0x80;
+       }
+
+       return @ret;
+    }
+    sub const {
+       my $val = shift;
+
+       if ($val >= 0 && $val < 32) {
+            return ($DW_OP_complex{lit0}+$val);
+       }
+       return ($DW_OP_complex{consts}, sleb128($val));
+    }
+    sub reg {
+       my $val = shift;
+
+       return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/);
+
+       my $reg = $DW_reg_idx{$1};
+       my $off = eval ("0 $2 $3");
+
+       return (($DW_OP_complex{breg0} + $reg), sleb128($off));
+       # Yes, we use DW_OP_bregX+0 to push the register's value and
+       # not DW_OP_regX, because the latter would additionally require
+       # DW_OP_piece, which would be a waste under the circumstances.
+       # If you have to use DW_OP_regX, use "regx:N"...
+    }
+    sub cfa_expression {
+       my $line = shift;
+       my @ret;
+
+       foreach my $token (split(/,\s*/,$line)) {
+           if ($token =~ /^%r/) {
+               push @ret,reg($token);
+           } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) {
+               my $i = 1*eval($2);
+               push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i));
+           } elsif (my $i = 1*eval($token) or $token eq "0") {
+               if ($token =~ /^\+/) {
+                   push @ret,$DW_OP_complex{plus_uconst},uleb128($i);
+               } else {
+                   push @ret,const($i);
+               }
+           } else {
+               push @ret,$DW_OP_simple{$token};
+           }
+       }
+
+       # Finally we return DW_CFA_def_cfa_expression, 15, followed by
+       # the length of the expression and of course the expression
+       # itself.
+       return (15,scalar(@ret),@ret);
+    }
+    sub re {
+       my      ($class, $line) = @_;
+       my      $self = {};
+       my      $ret;
+
+       if ($$line =~ s/^\s*\.cfi_(\w+)\s+//) {
+           bless $self,$class;
+           $ret = $self;
+           undef $self->{value};
+           my $dir = $1;
+
+           SWITCH: for ($dir) {
+           # What is $cfa_rsp? Effectively it's the difference between
+           # the %rsp value and the current CFA, the Canonical Frame
+           # Address, which is why it starts at -8. Recall that the
+           # CFA is the top of the caller's stack...
+           /startproc/ && do { ($cfa_reg, $cfa_rsp) = ("%rsp", -8); last; };
+           /endproc/   && do { ($cfa_reg, $cfa_rsp) = ("%rsp",  0); last; };
+           /def_cfa_register/
+                       && do { $cfa_reg = $$line; last; };
+           /def_cfa_offset/
+                       && do { $cfa_rsp = -1*eval($$line) if ($cfa_reg eq "%rsp");
+                               last;
+                             };
+           /adjust_cfa_offset/
+                       && do { $cfa_rsp -= 1*eval($$line) if ($cfa_reg eq "%rsp");
+                               last;
+                             };
+           /def_cfa/   && do { if ($$line =~ /(%r\w+)\s*,\s*(.+)/) {
+                                   $cfa_reg = $1;
+                                   $cfa_rsp = -1*eval($2) if ($cfa_reg eq "%rsp");
+                               }
+                               last;
+                             };
+           /push/      && do { $dir = undef;
+                               $cfa_rsp -= 8;
+                               if ($cfa_reg eq "%rsp") {
+                                   $self->{value} = ".cfi_adjust_cfa_offset\t8\n";
+                               }
+                               $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp";
+                               last;
+                             };
+           /pop/       && do { $dir = undef;
+                               $cfa_rsp += 8;
+                               if ($cfa_reg eq "%rsp") {
+                                   $self->{value} = ".cfi_adjust_cfa_offset\t-8\n";
+                               }
+                               $self->{value} .= ".cfi_restore\t$$line";
+                               last;
+                             };
+           /cfa_expression/
+                       && do { $dir = undef;
+                               $self->{value} = ".cfi_escape\t" .
+                                       join(",", map(sprintf("0x%02x", $_),
+                                                     cfa_expression($$line)));
+                               last;
+                             };
+           }
+
+           $self->{value} = ".cfi_$dir\t$$line" if ($dir);
+
+           $$line = "";
+       }
+
+       return $ret;
+    }
+    sub out {
+       my $self = shift;
+       return ($elf ? $self->{value} : undef);
+    }
+}
 { package directive;   # pick up directives, which start with .
     sub re {
        my      ($class, $line) = @_;
        my      $self = {};
        my      $ret;
        my      $dir;
-       my      %opcode =       # lea 2f-1f(%rip),%dst; 1: nop; 2:
-               (       "%rax"=>0x01058d48,     "%rcx"=>0x010d8d48,
-                       "%rdx"=>0x01158d48,     "%rbx"=>0x011d8d48,
-                       "%rsp"=>0x01258d48,     "%rbp"=>0x012d8d48,
-                       "%rsi"=>0x01358d48,     "%rdi"=>0x013d8d48,
-                       "%r8" =>0x01058d4c,     "%r9" =>0x010d8d4c,
-                       "%r10"=>0x01158d4c,     "%r11"=>0x011d8d4c,
-                       "%r12"=>0x01258d4c,     "%r13"=>0x012d8d4c,
-                       "%r14"=>0x01358d4c,     "%r15"=>0x013d8d4c      );
+
+       # chain-call to cfi_directive
+       $ret = cfi_directive->re($line) and return $ret;
 
        if ($$line =~ /^\s*(\.\w+)/) {
            bless $self,$class;
@@ -491,12 +717,6 @@ my %globals;
            $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
 
            SWITCH: for ($dir) {
-               /\.picmeup/ && do { if ($$line =~ /(%r[\w]+)/i) {
-                                       $dir="\t.long";
-                                   $$line=sprintf "0x%x,0x90000000",$opcode{$1};
-                                   }
-                                   last;
-                                 };
                /\.global|\.globl|\.extern/
                            && do { $globals{$$line} = $prefix . $$line;
                                    $$line = $globals{$$line} if ($prefix);
@@ -701,15 +921,6 @@ my %globals;
     }
 }
 
-sub rex {
- my $opcode=shift;
- my ($dst,$src,$rex)=@_;
-
-   $rex|=0x04 if($dst>=8);
-   $rex|=0x01 if($src>=8);
-   push @$opcode,($rex|0x40) if ($rex);
-}
-
 # Upon initial x86_64 introduction SSE>2 extensions were not introduced
 # yet. In order not to be bothered by tracing exact assembler versions,
 # but at the same time to provide a bare security minimum of AES-NI, we
@@ -720,6 +931,15 @@ sub rex {
 my %regrm = (  "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3,
                "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7      );
 
+sub rex {
+ my $opcode=shift;
+ my ($dst,$src,$rex)=@_;
+
+   $rex|=0x04 if($dst>=8);
+   $rex|=0x01 if($src>=8);
+   push @$opcode,($rex|0x40) if ($rex);
+}
+
 my $movq = sub {       # elderly gas can't handle inter-register movq
   my $arg = shift;
   my @opcode=(0x66);
@@ -843,6 +1063,10 @@ my $rdseed = sub {
     }
 };
 
+# Not all AVX-capable assemblers recognize the AMD XOP extension. Since
+# we use only two such instructions, we hand-code them in order to be
+# excused from chasing assembler versions...
+
 sub rxb {
  my $opcode=shift;
  my ($dst,$src1,$src2,$rxb)=@_;
@@ -882,10 +1106,15 @@ my $vprotq = sub {
     }
 };
 
+# Intel Control-flow Enforcement Technology extension. All functions and
+# indirect branch targets will have to start with this instruction...
+
 my $endbranch = sub {
     (0xf3,0x0f,0x1e,0xfa);
 };
 
+########################################################################
+
 if ($nasm) {
     print <<___;
 default        rel
_____
openssl-commits mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-commits
