The branch OpenSSL_1_1_1-stable has been updated
       via  580b8db8b4f1290ec879bfd0bb772012695ac370 (commit)
       via  08fb832377cd90c08a2d233b3230b95a9b9f6e24 (commit)
       via  46ac489a1369f6d938adda356accab83acf2987a (commit)
       via  419102400a2811582a7a3d4a4e317d72e5ce0a8f (commit)
      from  7a4d39f0d176f0d17f2de15672e1869b22f3e1d8 (commit)


- Log -----------------------------------------------------------------
commit 580b8db8b4f1290ec879bfd0bb772012695ac370
Author: Bernd Edlinger <bernd.edlin...@hotmail.de>
Date:   Thu Dec 5 01:20:14 2019 +0100

    Add a CHANGES entry for CVE-2019-1551
    
    Reviewed-by: Paul Dale <paul.d...@oracle.com>
    (Merged from https://github.com/openssl/openssl/pull/10575)

commit 08fb832377cd90c08a2d233b3230b95a9b9f6e24
Author: Bernd Edlinger <bernd.edlin...@hotmail.de>
Date:   Wed Dec 4 22:38:19 2019 +0100

    Add a test case for rsaz_512_sqr overflow handling
    
    [extended tests]
    
    Reviewed-by: Paul Dale <paul.d...@oracle.com>
    (Merged from https://github.com/openssl/openssl/pull/10575)

commit 46ac489a1369f6d938adda356accab83acf2987a
Author: Bernd Edlinger <bernd.edlin...@hotmail.de>
Date:   Wed Dec 4 12:57:41 2019 +0100

    Improve the overflow handling in rsaz_512_sqr
    
    There is always a carry in %rcx or %rbx, in the range 0..2,
    left over from the previous stage; it is added to the result
    of the 64-bit square. However, the low nibble of any square
    can only be 0, 1, 4 or 9.
    
    Therefore one "adcq $0, %rdx" can be removed.
    Likewise, in the ADX code we can remove one
    "adcx %rbp, $out", since %rbp is always 0 and the carry is
    also zero, so that instruction is a no-op.
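
    To see why, note that a square modulo 16 can only be 0, 1, 4 or 9, so the
    low 64-bit limb of any square is at most 0xFFFF...FFF9, and adding a carry
    of at most 2 to it can never wrap. A minimal standalone C check of this
    claim (illustrative only, not part of the patch):

        #include <assert.h>
        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
            uint64_t x, nib, low_limb_max, carry_max;

            /* The low nibble of x*x depends only on x mod 16. */
            for (x = 0; x < 16; x++) {
                nib = (x * x) & 0xF;
                assert(nib == 0 || nib == 1 || nib == 4 || nib == 9);
            }

            /* Worst case: the low limb of a square ends in ...F9. */
            low_limb_max = UINT64_MAX - 6;      /* 0xFFFF...FFF9 */
            carry_max = 2;
            assert(low_limb_max + carry_max > low_limb_max);  /* no 64-bit wrap */

            printf("low nibble of a square is always 0, 1, 4 or 9\n");
            return 0;
        }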
    
    Reviewed-by: Paul Dale <paul.d...@oracle.com>
    (Merged from https://github.com/openssl/openssl/pull/10575)

commit 419102400a2811582a7a3d4a4e317d72e5ce0a8f
Author: Andy Polyakov <ap...@openssl.org>
Date:   Wed Dec 4 12:48:21 2019 +0100

    Fix an overflow bug in rsaz_512_sqr
    
    There is an overflow bug in the x86_64 Montgomery squaring procedure used in
    exponentiation with 512-bit moduli. No EC algorithms are affected. Analysis
    suggests that attacks against 2-prime RSA1024, 3-prime RSA1536, and DSA1024 as a
    result of this defect would be very difficult to perform and are not believed
    likely. Attacks against DH512 are considered just feasible. However, for an
    attack the target would have to re-use the DH512 private key, which is not
    recommended anyway. Also applications directly using the low level API
    BN_mod_exp may be affected if they use BN_FLG_CONSTTIME.
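
    As a sketch of what such direct use of the low-level API looks like (the
    decimal values here are placeholders, not real parameters; on x86_64 the
    affected rsaz code path is only reached with 512-bit moduli such as those
    in the bntest.c vectors added below):

        #include <openssl/bn.h>
        #include <openssl/crypto.h>
        #include <stdio.h>

        int main(void)
        {
            BN_CTX *ctx = BN_CTX_new();
            BIGNUM *result = BN_new();
            BIGNUM *base = NULL, *exponent = NULL, *modulo = NULL;
            char *s = NULL;
            int ok = 0;

            if (ctx == NULL || result == NULL
                    || !BN_dec2bn(&base, "2")
                    || !BN_dec2bn(&exponent, "65537")
                    || !BN_dec2bn(&modulo, "1000003"))
                goto end;

            /* Request the constant-time code path, as described above. */
            BN_set_flags(base, BN_FLG_CONSTTIME);
            BN_set_flags(exponent, BN_FLG_CONSTTIME);
            BN_set_flags(modulo, BN_FLG_CONSTTIME);

            ok = BN_mod_exp(result, base, exponent, modulo, ctx);
            if (ok && (s = BN_bn2dec(result)) != NULL)
                printf("result = %s\n", s);   /* base^exponent mod modulo */

         end:
            OPENSSL_free(s);
            BN_free(result);
            BN_free(base);
            BN_free(exponent);
            BN_free(modulo);
            BN_CTX_free(ctx);
            return ok ? 0 : 1;
        }

    Such a program links against libcrypto (e.g. cc sketch.c -lcrypto).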
    
    CVE-2019-1551
    
    Reviewed-by: Paul Dale <paul.d...@oracle.com>
    Reviewed-by: Bernd Edlinger <bernd.edlin...@hotmail.de>
    (Merged from https://github.com/openssl/openssl/pull/10575)

-----------------------------------------------------------------------

Summary of changes:
 CHANGES                      |  12 ++
 crypto/bn/asm/rsaz-x86_64.pl | 383 ++++++++++++++++++++++---------------------
 test/bntest.c                | 284 ++++++++++++++++++++++++++++++++
 3 files changed, 494 insertions(+), 185 deletions(-)

diff --git a/CHANGES b/CHANGES
index 58e98dd391..42382fd031 100644
--- a/CHANGES
+++ b/CHANGES
@@ -9,6 +9,18 @@
 
  Changes between 1.1.1d and 1.1.1e [xx XXX xxxx]
 
+  *) Fixed an overflow bug in the x86_64 Montgomery squaring procedure
+     used in exponentiation with 512-bit moduli. No EC algorithms are
+     affected. Analysis suggests that attacks against 2-prime RSA1024,
+     3-prime RSA1536, and DSA1024 as a result of this defect would be very
+     difficult to perform and are not believed likely. Attacks against DH512
+     are considered just feasible. However, for an attack the target would
+     have to re-use the DH512 private key, which is not recommended anyway.
+     Also applications directly using the low level API BN_mod_exp may be
+     affected if they use BN_FLG_CONSTTIME.
+     (CVE-2019-1551)
+     [Andy Polyakov]
+
   *) Added a new method to gather entropy on VMS, based on SYS$GET_ENTROPY.
      The presence of this system service is determined at run-time.
      [Richard Levitte]
diff --git a/crypto/bn/asm/rsaz-x86_64.pl b/crypto/bn/asm/rsaz-x86_64.pl
index b1797b649f..f4d9c9b129 100755
--- a/crypto/bn/asm/rsaz-x86_64.pl
+++ b/crypto/bn/asm/rsaz-x86_64.pl
@@ -116,7 +116,7 @@ rsaz_512_sqr:                               # 25-29% faster than rsaz_512_mul
        subq    \$128+24, %rsp
 .cfi_adjust_cfa_offset 128+24
 .Lsqr_body:
-       movq    $mod, %rbp              # common argument
+       movq    $mod, %xmm1             # common off-load
        movq    ($inp), %rdx
        movq    8($inp), %rax
        movq    $n0, 128(%rsp)
@@ -134,7 +134,8 @@ $code.=<<___;
 .Loop_sqr:
        movl    $times,128+8(%rsp)
 #first iteration
-       movq    %rdx, %rbx
+       movq    %rdx, %rbx              # 0($inp)
+       mov     %rax, %rbp              # 8($inp)
        mulq    %rdx
        movq    %rax, %r8
        movq    16($inp), %rax
@@ -173,31 +174,29 @@ $code.=<<___;
        mulq    %rbx
        addq    %rax, %r14
        movq    %rbx, %rax
-       movq    %rdx, %r15
-       adcq    \$0, %r15
+       adcq    \$0, %rdx
 
-       addq    %r8, %r8                #shlq   \$1, %r8
-       movq    %r9, %rcx
-       adcq    %r9, %r9                #shld   \$1, %r8, %r9
+       xorq    %rcx,%rcx               # rcx:r8 = r8 << 1
+       addq    %r8, %r8
+        movq   %rdx, %r15
+       adcq    \$0, %rcx
 
        mulq    %rax
-       movq    %rax, (%rsp)
-       addq    %rdx, %r8
-       adcq    \$0, %r9
+       addq    %r8, %rdx
+       adcq    \$0, %rcx
 
-       movq    %r8, 8(%rsp)
-       shrq    \$63, %rcx
+       movq    %rax, (%rsp)
+       movq    %rdx, 8(%rsp)
 
 #second iteration
-       movq    8($inp), %r8
        movq    16($inp), %rax
-       mulq    %r8
+       mulq    %rbp
        addq    %rax, %r10
        movq    24($inp), %rax
        movq    %rdx, %rbx
        adcq    \$0, %rbx
 
-       mulq    %r8
+       mulq    %rbp
        addq    %rax, %r11
        movq    32($inp), %rax
        adcq    \$0, %rdx
@@ -205,7 +204,7 @@ $code.=<<___;
        movq    %rdx, %rbx
        adcq    \$0, %rbx
 
-       mulq    %r8
+       mulq    %rbp
        addq    %rax, %r12
        movq    40($inp), %rax
        adcq    \$0, %rdx
@@ -213,7 +212,7 @@ $code.=<<___;
        movq    %rdx, %rbx
        adcq    \$0, %rbx
 
-       mulq    %r8
+       mulq    %rbp
        addq    %rax, %r13
        movq    48($inp), %rax
        adcq    \$0, %rdx
@@ -221,7 +220,7 @@ $code.=<<___;
        movq    %rdx, %rbx
        adcq    \$0, %rbx
 
-       mulq    %r8
+       mulq    %rbp
        addq    %rax, %r14
        movq    56($inp), %rax
        adcq    \$0, %rdx
@@ -229,39 +228,39 @@ $code.=<<___;
        movq    %rdx, %rbx
        adcq    \$0, %rbx
 
-       mulq    %r8
+       mulq    %rbp
        addq    %rax, %r15
-       movq    %r8, %rax
+       movq    %rbp, %rax
        adcq    \$0, %rdx
        addq    %rbx, %r15
-       movq    %rdx, %r8
-       movq    %r10, %rdx
-       adcq    \$0, %r8
+       adcq    \$0, %rdx
 
-       add     %rdx, %rdx
-       lea     (%rcx,%r10,2), %r10     #shld   \$1, %rcx, %r10
-       movq    %r11, %rbx
-       adcq    %r11, %r11              #shld   \$1, %r10, %r11
+       xorq    %rbx, %rbx              # rbx:r10:r9 = r10:r9 << 1
+       addq    %r9, %r9
+        movq   %rdx, %r8
+       adcq    %r10, %r10
+       adcq    \$0, %rbx
 
        mulq    %rax
+       # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
+       addq    %rcx, %rax
+        movq   16($inp), %rbp
        addq    %rax, %r9
+        movq   24($inp), %rax
        adcq    %rdx, %r10
-       adcq    \$0, %r11
+       adcq    \$0, %rbx
 
        movq    %r9, 16(%rsp)
        movq    %r10, 24(%rsp)
-       shrq    \$63, %rbx
 
 #third iteration
-       movq    16($inp), %r9
-       movq    24($inp), %rax
-       mulq    %r9
+       mulq    %rbp
        addq    %rax, %r12
        movq    32($inp), %rax
        movq    %rdx, %rcx
        adcq    \$0, %rcx
 
-       mulq    %r9
+       mulq    %rbp
        addq    %rax, %r13
        movq    40($inp), %rax
        adcq    \$0, %rdx
@@ -269,7 +268,7 @@ $code.=<<___;
        movq    %rdx, %rcx
        adcq    \$0, %rcx
 
-       mulq    %r9
+       mulq    %rbp
        addq    %rax, %r14
        movq    48($inp), %rax
        adcq    \$0, %rdx
@@ -277,9 +276,7 @@ $code.=<<___;
        movq    %rdx, %rcx
        adcq    \$0, %rcx
 
-       mulq    %r9
-        movq   %r12, %r10
-        lea    (%rbx,%r12,2), %r12     #shld   \$1, %rbx, %r12
+       mulq    %rbp
        addq    %rax, %r15
        movq    56($inp), %rax
        adcq    \$0, %rdx
@@ -287,36 +284,40 @@ $code.=<<___;
        movq    %rdx, %rcx
        adcq    \$0, %rcx
 
-       mulq    %r9
-        shrq   \$63, %r10
+       mulq    %rbp
        addq    %rax, %r8
-       movq    %r9, %rax
+       movq    %rbp, %rax
        adcq    \$0, %rdx
        addq    %rcx, %r8
-       movq    %rdx, %r9
-       adcq    \$0, %r9
+       adcq    \$0, %rdx
 
-       movq    %r13, %rcx
-       leaq    (%r10,%r13,2), %r13     #shld   \$1, %r12, %r13
+       xorq    %rcx, %rcx              # rcx:r12:r11 = r12:r11 << 1
+       addq    %r11, %r11
+        movq   %rdx, %r9
+       adcq    %r12, %r12
+       adcq    \$0, %rcx
 
        mulq    %rax
+       # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
+       addq    %rbx, %rax
+        movq   24($inp), %r10
        addq    %rax, %r11
+        movq   32($inp), %rax
        adcq    %rdx, %r12
-       adcq    \$0, %r13
+       adcq    \$0, %rcx
 
        movq    %r11, 32(%rsp)
        movq    %r12, 40(%rsp)
-       shrq    \$63, %rcx
 
 #fourth iteration
-       movq    24($inp), %r10
-       movq    32($inp), %rax
+       mov     %rax, %r11              # 32($inp)
        mulq    %r10
        addq    %rax, %r14
        movq    40($inp), %rax
        movq    %rdx, %rbx
        adcq    \$0, %rbx
 
+       mov     %rax, %r12              # 40($inp)
        mulq    %r10
        addq    %rax, %r15
        movq    48($inp), %rax
@@ -325,9 +326,8 @@ $code.=<<___;
        movq    %rdx, %rbx
        adcq    \$0, %rbx
 
+       mov     %rax, %rbp              # 48($inp)
        mulq    %r10
-        movq   %r14, %r12
-        leaq   (%rcx,%r14,2), %r14     #shld   \$1, %rcx, %r14
        addq    %rax, %r8
        movq    56($inp), %rax
        adcq    \$0, %rdx
@@ -336,32 +336,33 @@ $code.=<<___;
        adcq    \$0, %rbx
 
        mulq    %r10
-        shrq   \$63, %r12
        addq    %rax, %r9
        movq    %r10, %rax
        adcq    \$0, %rdx
        addq    %rbx, %r9
-       movq    %rdx, %r10
-       adcq    \$0, %r10
+       adcq    \$0, %rdx
 
-       movq    %r15, %rbx
-       leaq    (%r12,%r15,2),%r15      #shld   \$1, %r14, %r15
+       xorq    %rbx, %rbx              # rbx:r13:r14 = r13:r14 << 1
+       addq    %r13, %r13
+        movq   %rdx, %r10
+       adcq    %r14, %r14
+       adcq    \$0, %rbx
 
        mulq    %rax
+       # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
+       addq    %rcx, %rax
        addq    %rax, %r13
+        movq   %r12, %rax              # 40($inp)
        adcq    %rdx, %r14
-       adcq    \$0, %r15
+       adcq    \$0, %rbx
 
        movq    %r13, 48(%rsp)
        movq    %r14, 56(%rsp)
-       shrq    \$63, %rbx
 
 #fifth iteration
-       movq    32($inp), %r11
-       movq    40($inp), %rax
        mulq    %r11
        addq    %rax, %r8
-       movq    48($inp), %rax
+       movq    %rbp, %rax              # 48($inp)
        movq    %rdx, %rcx
        adcq    \$0, %rcx
 
@@ -369,97 +370,99 @@ $code.=<<___;
        addq    %rax, %r9
        movq    56($inp), %rax
        adcq    \$0, %rdx
-        movq   %r8, %r12
-        leaq   (%rbx,%r8,2), %r8       #shld   \$1, %rbx, %r8
        addq    %rcx, %r9
        movq    %rdx, %rcx
        adcq    \$0, %rcx
 
+       mov     %rax, %r14              # 56($inp)
        mulq    %r11
-        shrq   \$63, %r12
        addq    %rax, %r10
        movq    %r11, %rax
        adcq    \$0, %rdx
        addq    %rcx, %r10
-       movq    %rdx, %r11
-       adcq    \$0, %r11
+       adcq    \$0, %rdx
 
-       movq    %r9, %rcx
-       leaq    (%r12,%r9,2), %r9       #shld   \$1, %r8, %r9
+       xorq    %rcx, %rcx              # rcx:r8:r15 = r8:r15 << 1
+       addq    %r15, %r15
+        movq   %rdx, %r11
+       adcq    %r8, %r8
+       adcq    \$0, %rcx
 
        mulq    %rax
+       # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
+       addq    %rbx, %rax
        addq    %rax, %r15
+        movq   %rbp, %rax              # 48($inp)
        adcq    %rdx, %r8
-       adcq    \$0, %r9
+       adcq    \$0, %rcx
 
        movq    %r15, 64(%rsp)
        movq    %r8, 72(%rsp)
-       shrq    \$63, %rcx
 
 #sixth iteration
-       movq    40($inp), %r12
-       movq    48($inp), %rax
        mulq    %r12
        addq    %rax, %r10
-       movq    56($inp), %rax
+       movq    %r14, %rax              # 56($inp)
        movq    %rdx, %rbx
        adcq    \$0, %rbx
 
        mulq    %r12
        addq    %rax, %r11
        movq    %r12, %rax
-        movq   %r10, %r15
-        leaq   (%rcx,%r10,2), %r10     #shld   \$1, %rcx, %r10
        adcq    \$0, %rdx
-        shrq   \$63, %r15
        addq    %rbx, %r11
-       movq    %rdx, %r12
-       adcq    \$0, %r12
+       adcq    \$0, %rdx
 
-       movq    %r11, %rbx
-       leaq    (%r15,%r11,2), %r11     #shld   \$1, %r10, %r11
+       xorq    %rbx, %rbx              # rbx:r10:r9 = r10:r9 << 1
+       addq    %r9, %r9
+        movq   %rdx, %r12
+       adcq    %r10, %r10
+       adcq    \$0, %rbx
 
        mulq    %rax
+       # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
+       addq    %rcx, %rax
        addq    %rax, %r9
+        movq   %r14, %rax              # 56($inp)
        adcq    %rdx, %r10
-       adcq    \$0, %r11
+       adcq    \$0, %rbx
 
        movq    %r9, 80(%rsp)
        movq    %r10, 88(%rsp)
 
 #seventh iteration
-       movq    48($inp), %r13
-       movq    56($inp), %rax
-       mulq    %r13
+       mulq    %rbp
        addq    %rax, %r12
-       movq    %r13, %rax
-       movq    %rdx, %r13
-       adcq    \$0, %r13
+       movq    %rbp, %rax
+       adcq    \$0, %rdx
 
-       xorq    %r14, %r14
-       shlq    \$1, %rbx
-       adcq    %r12, %r12              #shld   \$1, %rbx, %r12
-       adcq    %r13, %r13              #shld   \$1, %r12, %r13
-       adcq    %r14, %r14              #shld   \$1, %r13, %r14
+       xorq    %rcx, %rcx              # rcx:r12:r11 = r12:r11 << 1
+       addq    %r11, %r11
+        movq   %rdx, %r13
+       adcq    %r12, %r12
+       adcq    \$0, %rcx
 
        mulq    %rax
+       # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
+       addq    %rbx, %rax
        addq    %rax, %r11
+        movq   %r14, %rax              # 56($inp)
        adcq    %rdx, %r12
-       adcq    \$0, %r13
+       adcq    \$0, %rcx
 
        movq    %r11, 96(%rsp)
        movq    %r12, 104(%rsp)
 
 #eighth iteration
-       movq    56($inp), %rax
-       mulq    %rax
-       addq    %rax, %r13
-       adcq    \$0, %rdx
-
-       addq    %rdx, %r14
+       xorq    %rbx, %rbx              # rbx:r13 = r13 << 1
+       addq    %r13, %r13
+       adcq    \$0, %rbx
 
-       movq    %r13, 112(%rsp)
-       movq    %r14, 120(%rsp)
+       mulq    %rax
+       # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
+       addq    %rcx, %rax
+       addq    %r13, %rax
+       adcq    %rbx, %rdx
 
        movq    (%rsp), %r8
        movq    8(%rsp), %r9
@@ -469,6 +472,10 @@ $code.=<<___;
        movq    40(%rsp), %r13
        movq    48(%rsp), %r14
        movq    56(%rsp), %r15
+       movq    %xmm1, %rbp
+
+       movq    %rax, 112(%rsp)
+       movq    %rdx, 120(%rsp)
 
        call    __rsaz_512_reduce
 
@@ -500,9 +507,9 @@ $code.=<<___;
 .Loop_sqrx:
        movl    $times,128+8(%rsp)
        movq    $out, %xmm0             # off-load
-       movq    %rbp, %xmm1             # off-load
 #first iteration
        mulx    %rax, %r8, %r9
+       mov     %rax, %rbx
 
        mulx    16($inp), %rcx, %r10
        xor     %rbp, %rbp              # cf=0, of=0
@@ -510,40 +517,39 @@ $code.=<<___;
        mulx    24($inp), %rax, %r11
        adcx    %rcx, %r9
 
-       mulx    32($inp), %rcx, %r12
+       .byte   0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00    # mulx  32($inp), %rcx, %r12
        adcx    %rax, %r10
 
-       mulx    40($inp), %rax, %r13
+       .byte   0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00    # mulx  40($inp), %rax, %r13
        adcx    %rcx, %r11
 
-       .byte   0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00    # mulx  48($inp), %rcx, %r14
+       mulx    48($inp), %rcx, %r14
        adcx    %rax, %r12
        adcx    %rcx, %r13
 
-       .byte   0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00    # mulx  56($inp), %rax, %r15
+       mulx    56($inp), %rax, %r15
        adcx    %rax, %r14
        adcx    %rbp, %r15              # %rbp is 0
 
-       mov     %r9, %rcx
-       shld    \$1, %r8, %r9
-       shl     \$1, %r8
-
-       xor     %ebp, %ebp
-       mulx    %rdx, %rax, %rdx
-       adcx    %rdx, %r8
-        mov    8($inp), %rdx
-       adcx    %rbp, %r9
+       mulx    %rdx, %rax, $out
+        mov    %rbx, %rdx              # 8($inp)
+       xor     %rcx, %rcx
+       adox    %r8, %r8
+       adcx    $out, %r8
+       adox    %rbp, %rcx
+       adcx    %rbp, %rcx
 
        mov     %rax, (%rsp)
        mov     %r8, 8(%rsp)
 
 #second iteration
-       mulx    16($inp), %rax, %rbx
+       .byte   0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00    # mulx  16($inp), %rax, %rbx
        adox    %rax, %r10
        adcx    %rbx, %r11
 
-       .byte   0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00    # mulx  24($inp), $out, %r8
+       mulx    24($inp), $out, %r8
        adox    $out, %r11
+       .byte   0x66
        adcx    %r8, %r12
 
        mulx    32($inp), %rax, %rbx
@@ -561,24 +567,25 @@ $code.=<<___;
        .byte   0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00    # mulx  56($inp), $out, %r8
        adox    $out, %r15
        adcx    %rbp, %r8
+        mulx   %rdx, %rax, $out
        adox    %rbp, %r8
+        .byte  0x48,0x8b,0x96,0x10,0x00,0x00,0x00              # mov   16($inp), %rdx
 
-       mov     %r11, %rbx
-       shld    \$1, %r10, %r11
-       shld    \$1, %rcx, %r10
-
-       xor     %ebp,%ebp
-       mulx    %rdx, %rax, %rcx
-        mov    16($inp), %rdx
+       xor     %rbx, %rbx
+        adox   %r9, %r9
+       # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
+       adcx    %rcx, %rax
+       adox    %r10, %r10
        adcx    %rax, %r9
-       adcx    %rcx, %r10
-       adcx    %rbp, %r11
+       adox    %rbp, %rbx
+       adcx    $out, %r10
+       adcx    %rbp, %rbx
 
        mov     %r9, 16(%rsp)
        .byte   0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00         # mov   %r10, 24(%rsp)
 
 #third iteration
-       .byte   0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00    # mulx  24($inp), $out, %r9
+       mulx    24($inp), $out, %r9
        adox    $out, %r12
        adcx    %r9, %r13
 
@@ -586,7 +593,7 @@ $code.=<<___;
        adox    %rax, %r13
        adcx    %rcx, %r14
 
-       mulx    40($inp), $out, %r9
+       .byte   0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00    # mulx  40($inp), $out, %r9
        adox    $out, %r14
        adcx    %r9, %r15
 
@@ -594,27 +601,28 @@ $code.=<<___;
        adox    %rax, %r15
        adcx    %rcx, %r8
 
-       .byte   0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00    # mulx  56($inp), $out, %r9
+       mulx    56($inp), $out, %r9
        adox    $out, %r8
        adcx    %rbp, %r9
+        mulx   %rdx, %rax, $out
        adox    %rbp, %r9
+        mov    24($inp), %rdx
 
-       mov     %r13, %rcx
-       shld    \$1, %r12, %r13
-       shld    \$1, %rbx, %r12
-
-       xor     %ebp, %ebp
-       mulx    %rdx, %rax, %rdx
+       xor     %rcx, %rcx
+        adox   %r11, %r11
+       # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
+       adcx    %rbx, %rax
+       adox    %r12, %r12
        adcx    %rax, %r11
-       adcx    %rdx, %r12
-        mov    24($inp), %rdx
-       adcx    %rbp, %r13
+       adox    %rbp, %rcx
+       adcx    $out, %r12
+       adcx    %rbp, %rcx
 
        mov     %r11, 32(%rsp)
-       .byte   0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00         # mov   %r12, 40(%rsp)
+       mov     %r12, 40(%rsp)
 
 #fourth iteration
-       .byte   0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00    # mulx  32($inp), %rax, %rbx
+       mulx    32($inp), %rax, %rbx
        adox    %rax, %r14
        adcx    %rbx, %r15
 
@@ -629,25 +637,25 @@ $code.=<<___;
        mulx    56($inp), $out, %r10
        adox    $out, %r9
        adcx    %rbp, %r10
+        mulx   %rdx, %rax, $out
        adox    %rbp, %r10
+        mov    32($inp), %rdx
 
-       .byte   0x66
-       mov     %r15, %rbx
-       shld    \$1, %r14, %r15
-       shld    \$1, %rcx, %r14
-
-       xor     %ebp, %ebp
-       mulx    %rdx, %rax, %rdx
+       xor     %rbx, %rbx
+        adox   %r13, %r13
+       # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
+       adcx    %rcx, %rax
+       adox    %r14, %r14
        adcx    %rax, %r13
-       adcx    %rdx, %r14
-        mov    32($inp), %rdx
-       adcx    %rbp, %r15
+       adox    %rbp, %rbx
+       adcx    $out, %r14
+       adcx    %rbp, %rbx
 
        mov     %r13, 48(%rsp)
        mov     %r14, 56(%rsp)
 
 #fifth iteration
-       .byte   0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00    # mulx  40($inp), $out, %r11
+       mulx    40($inp), $out, %r11
        adox    $out, %r8
        adcx    %r11, %r9
 
@@ -658,18 +666,19 @@ $code.=<<___;
        mulx    56($inp), $out, %r11
        adox    $out, %r10
        adcx    %rbp, %r11
+        mulx   %rdx, %rax, $out
+        mov    40($inp), %rdx
        adox    %rbp, %r11
 
-       mov     %r9, %rcx
-       shld    \$1, %r8, %r9
-       shld    \$1, %rbx, %r8
-
-       xor     %ebp, %ebp
-       mulx    %rdx, %rax, %rdx
+       xor     %rcx, %rcx
+        adox   %r15, %r15
+       # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
+       adcx    %rbx, %rax
+       adox    %r8, %r8
        adcx    %rax, %r15
-       adcx    %rdx, %r8
-        mov    40($inp), %rdx
-       adcx    %rbp, %r9
+       adox    %rbp, %rcx
+       adcx    $out, %r8
+       adcx    %rbp, %rcx
 
        mov     %r15, 64(%rsp)
        mov     %r8, 72(%rsp)
@@ -682,18 +691,19 @@ $code.=<<___;
        .byte   0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00    # mulx  56($inp), $out, %r12
        adox    $out, %r11
        adcx    %rbp, %r12
+        mulx   %rdx, %rax, $out
        adox    %rbp, %r12
+        mov    48($inp), %rdx
 
-       mov     %r11, %rbx
-       shld    \$1, %r10, %r11
-       shld    \$1, %rcx, %r10
-
-       xor     %ebp, %ebp
-       mulx    %rdx, %rax, %rdx
+       xor     %rbx, %rbx
+        adox   %r9, %r9
+       # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
+       adcx    %rcx, %rax
+       adox    %r10, %r10
        adcx    %rax, %r9
-       adcx    %rdx, %r10
-        mov    48($inp), %rdx
-       adcx    %rbp, %r11
+       adcx    $out, %r10
+       adox    %rbp, %rbx
+       adcx    %rbp, %rbx
 
        mov     %r9, 80(%rsp)
        mov     %r10, 88(%rsp)
@@ -703,31 +713,31 @@ $code.=<<___;
        adox    %rax, %r12
        adox    %rbp, %r13
 
-       xor     %r14, %r14
-       shld    \$1, %r13, %r14
-       shld    \$1, %r12, %r13
-       shld    \$1, %rbx, %r12
-
-       xor     %ebp, %ebp
-       mulx    %rdx, %rax, %rdx
-       adcx    %rax, %r11
-       adcx    %rdx, %r12
+       mulx    %rdx, %rax, $out
+       xor     %rcx, %rcx
         mov    56($inp), %rdx
-       adcx    %rbp, %r13
+        adox   %r11, %r11
+       # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
+       adcx    %rbx, %rax
+       adox    %r12, %r12
+       adcx    %rax, %r11
+       adox    %rbp, %rcx
+       adcx    $out, %r12
+       adcx    %rbp, %rcx
 
        .byte   0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00         # mov   %r11, 96(%rsp)
        .byte   0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00         # mov   %r12, 104(%rsp)
 
 #eighth iteration
        mulx    %rdx, %rax, %rdx
-       adox    %rax, %r13
-       adox    %rbp, %rdx
+       xor     %rbx, %rbx
+        adox   %r13, %r13
+       # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
+       adcx    %rcx, %rax
+       adox    %rbp, %rbx
+       adcx    %r13, %rax
+       adcx    %rdx, %rbx
 
-       .byte   0x66
-       add     %rdx, %r14
-
-       movq    %r13, 112(%rsp)
-       movq    %r14, 120(%rsp)
        movq    %xmm0, $out
        movq    %xmm1, %rbp
 
@@ -741,6 +751,9 @@ $code.=<<___;
        movq    48(%rsp), %r14
        movq    56(%rsp), %r15
 
+       movq    %rax, 112(%rsp)
+       movq    %rbx, 120(%rsp)
+
        call    __rsaz_512_reducex
 
        addq    64(%rsp), %r8
diff --git a/test/bntest.c b/test/bntest.c
index 9f63ec121c..97d08ac0be 100644
--- a/test/bntest.c
+++ b/test/bntest.c
@@ -2404,6 +2404,288 @@ static int test_gcd_prime(void)
     return st;
 }
 
+typedef struct mod_exp_test_st
+{
+  const char *base;
+  const char *exp;
+  const char *mod;
+  const char *res;
+} MOD_EXP_TEST;
+
+static const MOD_EXP_TEST ModExpTests[] = {
+   /* original test vectors for rsaz_512_sqr bug, by OSS-Fuzz */
+   {
+       "1166180238001879113042182292626169621106255558914000595999312084"
+       "4627946820899490684928760491249738643524880720584249698100907201"
+       "002086675047927600340800371",
+       "8000000000000000000000000000000000000000000000000000000000000000"
+       "0000000000000000000000000000000000000000000000000000000000000000"
+       "00000000",
+       "1340780792684523720980737645613191762604395855615117867483316354"
+       "3294276330515137663421134775482798690129946803802212663956180562"
+       "088664022929883876655300863",
+       "8243904058268085430037326628480645845409758077568738532059032482"
+       "8294114415890603594730158120426756266457928475330450251339773498"
+       "26758407619521544102068438"
+   },
+   {
+       "4974270041410803822078866696159586946995877618987010219312844726"
+       "0284386121835740784990869050050504348861513337232530490826340663"
+       "197278031692737429054",
+       "4974270041410803822078866696159586946995877428188754995041148539"
+       "1663243362592271353668158565195557417149981094324650322556843202"
+       "946445882670777892608",
+       "1340780716511420227215592830971452482815377482627251725537099028"
+       "4429769497230131760206012644403029349547320953206103351725462999"
+       "947509743623340557059752191",
+       "5296244594780707015616522701706118082963369547253192207884519362"
+       "1767869984947542695665420219028522815539559194793619684334900442"
+       "49304558011362360473525933"
+   },
+   /* test vectors for rsaz_512_sqr bug, with rcx/rbx=1 */
+   {   /* between first and second iteration */
+       "5148719036160389201525610950887605325980251964889646556085286545"
+       "3931548809178823413169359635978762036512397113080988070677858033"
+       "36463909753993540214027190",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042158",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042159",
+       "1"
+   },
+   {   /* between second and third iteration */
+       "8908340854353752577419678771330460827942371434853054158622636544"
+       "8151360109722890949471912566649465436296659601091730745087014189"
+       "2672764191218875181826063",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042158",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042159",
+       "1"
+   },
+   {   /* between third and fourth iteration */
+       "3427446396505596330634350984901719674479522569002785244080234738"
+       "4288743635435746136297299366444548736533053717416735379073185344"
+       "26985272974404612945608761",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042158",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042159",
+       "1"
+   },
+   {   /* between fourth and fifth iteration */
+       "3472743044917564564078857826111874560045331237315597383869652985"
+       "6919870028890895988478351133601517365908445058405433832718206902"
+       "4088133164805266956353542",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042158",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042159",
+       "1"
+   },
+   {   /* between fifth and sixth iteration */
+       "3608632990153469264412378349742339216742409743898601587274768025"
+       "0110772032985643555192767717344946174122842255204082586753499651"
+       "14483434992887431333675068",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042158",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042159",
+       "1"
+   },
+   {   /* between sixth and seventh iteration */
+       "8455374370234070242910508226941981520235709767260723212165264877"
+       "8689064388017521524568434328264431772644802567028663962962025746"
+       "9283458217850119569539086",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042158",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042159",
+       "1"
+   },
+   {   /* between seventh and eighth iteration */
+       "5155371529688532178421209781159131443543419764974688878527112131"
+       "7446518205609427412336183157918981038066636807317733319323257603"
+       "04416292040754017461076359",
+       "1005585594745694782468051874865438459560952436544429503329267108"
+       "2791323022555160232601405723625177570767523893639864538140315412"
+       "108959927459825236754563832",
+       "1005585594745694782468051874865438459560952436544429503329267108"
+       "2791323022555160232601405723625177570767523893639864538140315412"
+       "108959927459825236754563833",
+       "1"
+   },
+   /* test vectors for rsaz_512_sqr bug, with rcx/rbx=2 */
+   {   /* between first and second iteration */
+       "3155666506033786929967309937640790361084670559125912405342594979"
+       "4345142818528956285490897841406338022378565972533508820577760065"
+       "58494345853302083699912572",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042158",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042159",
+       "1"
+   },
+   {   /* between second and third iteration */
+       "3789819583801342198190405714582958759005991915505282362397087750"
+       "4213544724644823098843135685133927198668818185338794377239590049"
+       "41019388529192775771488319",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042158",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042159",
+       "1"
+   },
+   {   /* between third and fourth iteration */
+       "4695752552040706867080542538786056470322165281761525158189220280"
+       "4025547447667484759200742764246905647644662050122968912279199065"
+       "48065034299166336940507214",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042158",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042159",
+       "1"
+   },
+   {   /* between fourth and fifth iteration */
+       "2159140240970485794188159431017382878636879856244045329971239574"
+       "8919691133560661162828034323196457386059819832804593989740268964"
+       "74502911811812651475927076",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042158",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042159",
+       "1"
+   },
+   {   /* between fifth and sixth iteration */
+       "5239312332984325668414624633307915097111691815000872662334695514"
+       "5436533521392362443557163429336808208137221322444780490437871903"
+       "99972784701334569424519255",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042158",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042159",
+       "1"
+   },
+   {   /* between sixth and seventh iteration */
+       "1977953647322612860406858017869125467496941904523063466791308891"
+       "1172796739058531929470539758361774569875505293428856181093904091"
+       "33788264851714311303725089",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042158",
+       "6703903964971298549787012499102923063739682910296196688861780721"
+       "8608820150367734884009371490834517138450159290932430254268769414"
+       "05973284973216824503042159",
+       "1"
+   },
+   {   /* between seventh and eighth iteration */
+       "6456987954117763835533395796948878140715006860263624787492985786"
+       "8514630216966738305923915688821526449499763719943997120302368211"
+       "04813318117996225041943964",
+       "1340780792994259709957402499820584612747936582059239337772356144"
+       "3721764030073546976801874298166903427690031858186486050853753882"
+       "811946551499689575296532556",
+       "1340780792994259709957402499820584612747936582059239337772356144"
+       "3721764030073546976801874298166903427690031858186486050853753882"
+       "811946551499689575296532557",
+       "1"
+   }
+};
+
+static int test_mod_exp(int i)
+{
+    const MOD_EXP_TEST *test = &ModExpTests[i];
+    int res = 0;
+    BIGNUM *result = NULL;
+    BIGNUM *base = NULL, *exponent = NULL, *modulo = NULL;
+    char *s = NULL;
+
+    if (!TEST_ptr(result = BN_new())
+            || !TEST_true(BN_dec2bn(&base, test->base))
+            || !TEST_true(BN_dec2bn(&exponent, test->exp))
+            || !TEST_true(BN_dec2bn(&modulo, test->mod)))
+        goto err;
+
+    if (!TEST_int_eq(BN_mod_exp(result, base, exponent, modulo, ctx), 1))
+        goto err;
+
+    if (!TEST_ptr(s = BN_bn2dec(result)))
+        goto err;
+
+    if (!TEST_mem_eq(s, strlen(s), test->res, strlen(test->res)))
+        goto err;
+
+    res = 1;
+
+ err:
+    OPENSSL_free(s);
+    BN_free(result);
+    BN_free(base);
+    BN_free(exponent);
+    BN_free(modulo);
+    return res;
+}
+
+static int test_mod_exp_consttime(int i)
+{
+    const MOD_EXP_TEST *test = &ModExpTests[i];
+    int res = 0;
+    BIGNUM *result = NULL;
+    BIGNUM *base = NULL, *exponent = NULL, *modulo = NULL;
+    char *s = NULL;
+
+    if (!TEST_ptr(result = BN_new())
+            || !TEST_true(BN_dec2bn(&base, test->base))
+            || !TEST_true(BN_dec2bn(&exponent, test->exp))
+            || !TEST_true(BN_dec2bn(&modulo, test->mod)))
+        goto err;
+
+    BN_set_flags(base, BN_FLG_CONSTTIME);
+    BN_set_flags(exponent, BN_FLG_CONSTTIME);
+    BN_set_flags(modulo, BN_FLG_CONSTTIME);
+
+    if (!TEST_int_eq(BN_mod_exp(result, base, exponent, modulo, ctx), 1))
+        goto err;
+
+    if (!TEST_ptr(s = BN_bn2dec(result)))
+        goto err;
+
+    if (!TEST_mem_eq(s, strlen(s), test->res, strlen(test->res)))
+        goto err;
+
+    res = 1;
+
+ err:
+    OPENSSL_free(s);
+    BN_free(result);
+    BN_free(base);
+    BN_free(exponent);
+    BN_free(modulo);
+    return res;
+}
+
 static int file_test_run(STANZA *s)
 {
     static const FILETEST filetests[] = {
@@ -2508,6 +2790,8 @@ int setup_tests(void)
         ADD_ALL_TESTS(test_is_prime, (int)OSSL_NELEM(primes));
         ADD_ALL_TESTS(test_not_prime, (int)OSSL_NELEM(not_primes));
         ADD_TEST(test_gcd_prime);
+        ADD_ALL_TESTS(test_mod_exp, (int)OSSL_NELEM(ModExpTests));
+        ADD_ALL_TESTS(test_mod_exp_consttime, (int)OSSL_NELEM(ModExpTests));
     } else {
         ADD_ALL_TESTS(run_file_tests, n);
     }
