On Mon, Dec 1, 2014 at 3:23 PM, Andy Polyakov via RT <r...@openssl.org> wrote:
>>> (Affects 1.0.2 only.)
>>>
>>> In crypto/ec/asm/ecp_nistz256-x86_64.pl, __ecp_nistz256_sqr_montq,
>>> under "Now the reduction" there are a number of comments saying
>>> "doesn't overflow". Unfortunately, they aren't correct.
>>
>> Got math wrong:-( Attached is not only fixed version, but even faster
>> one.
>
> Please test attached one instead. Squaring didn't cover one case, and
> AD*X is optimized.

thanks! Was away last week and so didn't have a chance to try fixing this.

I'll patch that in and run the tests against it.


Cheers

AGL

>
>> On related note. It's possible to improve server-side DSA by ~5% by
>> switching [back] to scatter-gather. [Change from scatter-gather was
>> caused by concern about timing dependency, but I argue that concern is
>> not valid in most cases.] There also are x86 and ARM versions pending:
>>
>> #               with/without -DECP_NISTZ256_ASM
>> # Pentium       +66-168%
>> # PIII          +73-175%
>> # P4            +68-140%
>> # Core2         +90-215%
>> # Sandy Bridge  +105-265% (contemporary i[57]-* are all close to this)
>> # Atom          +66-160%
>> # Opteron       +54-112%
>> # Bulldozer     +99-240%
>> # VIA Nano      +93-300%
>>
>> #                       with/without -DECP_NISTZ256_ASM
>> # Cortex-A8             +53-173%
>> # Cortex-A9             +76-205%
>> # Cortex-A15            +100-316%
>> # Snapdragon S4         +66-187%
>>
>> No, bug in question is not there. Nor is the AD*X code path affected.
>>
>>
>
>
>
> diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl 
> b/crypto/ec/asm/ecp_nistz256-x86_64.pl
> index 4486a5e..56f6c2b 100755
> --- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
> +++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
> @@ -31,15 +31,15 @@
>  # Further optimization by <ap...@openssl.org>:
>  #
>  #              this/original
> -# Opteron      +8-33%
> -# Bulldozer    +10-30%
> -# P4           +14-38%
> -# Westmere     +8-23%
> -# Sandy Bridge +8-24%
> -# Ivy Bridge   +7-25%
> -# Haswell      +5-25%
> -# Atom         +10-32%
> -# VIA Nano     +37-130%
> +# Opteron      +10-43%
> +# Bulldozer    +14-43%
> +# P4           +18-50%
> +# Westmere     +12-36%
> +# Sandy Bridge +9-36%
> +# Ivy Bridge   +9-36%
> +# Haswell      +8-37%
> +# Atom         +15-50%
> +# VIA Nano     +43-160%
>  #
>  # Ranges denote minimum and maximum improvement coefficients depending
>  # on benchmark. Lower coefficients are for ECDSA sign, relatively
> @@ -550,28 +550,20 @@ __ecp_nistz256_mul_montq:
>         # and add the result to the acc.
>         # Due to the special form of p256 we do some optimizations
>         #
> -       # acc[0] x p256[0] = acc[0] x 2^64 - acc[0]
> -       # then we add acc[0] and get acc[0] x 2^64
> -
> -       mulq    $poly1
> -       xor     $t0, $t0
> -       add     $acc0, $acc1            # +=acc[0]*2^64
> -       adc     \$0, %rdx
> -       add     %rax, $acc1
> -       mov     $acc0, %rax
> -
> -       # acc[0] x p256[2] = 0
> -       adc     %rdx, $acc2
> -       adc     \$0, $t0
> +       # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
> +       # then we add acc[0] and get acc[0] x 2^96
>
> +       mov     $acc0, $t1
> +       shl     \$32, $acc0
>         mulq    $poly3
> -       xor     $acc0, $acc0
> -       add     $t0, $acc3
> -       adc     \$0, %rdx
> -       add     %rax, $acc3
> +       shr     \$32, $t1
> +       add     $acc0, $acc1            # +=acc[0]<<96
> +       adc     $t1, $acc2
> +       adc     %rax, $acc3
>          mov    8*1($b_ptr), %rax
>         adc     %rdx, $acc4
>         adc     \$0, $acc5
> +       xor     $acc0, $acc0
>
>         
> ########################################################################
>         # Multiply by b[1]
> @@ -608,23 +600,17 @@ __ecp_nistz256_mul_montq:
>
>         
> ########################################################################
>         # Second reduction step
> -       mulq    $poly1
> -       xor     $t0, $t0
> -       add     $acc1, $acc2
> -       adc     \$0, %rdx
> -       add     %rax, $acc2
> -       mov     $acc1, %rax
> -       adc     %rdx, $acc3
> -       adc     \$0, $t0
> -
> +       mov     $acc1, $t1
> +       shl     \$32, $acc1
>         mulq    $poly3
> -       xor     $acc1, $acc1
> -       add     $t0, $acc4
> -       adc     \$0, %rdx
> -       add     %rax, $acc4
> +       shr     \$32, $t1
> +       add     $acc1, $acc2
> +       adc     $t1, $acc3
> +       adc     %rax, $acc4
>          mov    8*2($b_ptr), %rax
>         adc     %rdx, $acc5
>         adc     \$0, $acc0
> +       xor     $acc1, $acc1
>
>         
> ########################################################################
>         # Multiply by b[2]
> @@ -661,23 +647,17 @@ __ecp_nistz256_mul_montq:
>
>         
> ########################################################################
>         # Third reduction step
> -       mulq    $poly1
> -       xor     $t0, $t0
> -       add     $acc2, $acc3
> -       adc     \$0, %rdx
> -       add     %rax, $acc3
> -       mov     $acc2, %rax
> -       adc     %rdx, $acc4
> -       adc     \$0, $t0
> -
> +       mov     $acc2, $t1
> +       shl     \$32, $acc2
>         mulq    $poly3
> -       xor     $acc2, $acc2
> -       add     $t0, $acc5
> -       adc     \$0, %rdx
> -       add     %rax, $acc5
> +       shr     \$32, $t1
> +       add     $acc2, $acc3
> +       adc     $t1, $acc4
> +       adc     %rax, $acc5
>          mov    8*3($b_ptr), %rax
>         adc     %rdx, $acc0
>         adc     \$0, $acc1
> +       xor     $acc2, $acc2
>
>         
> ########################################################################
>         # Multiply by b[3]
> @@ -714,20 +694,14 @@ __ecp_nistz256_mul_montq:
>
>         
> ########################################################################
>         # Final reduction step
> -       mulq    $poly1
> -       #xor    $t0, $t0
> -       add     $acc3, $acc4
> -       adc     \$0, %rdx
> -       add     %rax, $acc4
> -       mov     $acc3, %rax
> -       adc     %rdx, $acc5
> -       #adc    \$0, $t0                # doesn't overflow
> -
> +       mov     $acc3, $t1
> +       shl     \$32, $acc3
>         mulq    $poly3
> -       #add    $t0, $acc0
> -       #adc    \$0, %rdx
> +       shr     \$32, $t1
> +       add     $acc3, $acc4
> +       adc     $t1, $acc5
>          mov    $acc4, $t0
> -       add     %rax, $acc0
> +       adc     %rax, $acc0
>         adc     %rdx, $acc1
>          mov    $acc5, $t1
>         adc     \$0, $acc2
> @@ -897,89 +871,62 @@ __ecp_nistz256_sqr_montq:
>         ##########################################
>         # Now the reduction
>         # First iteration
> -       mulq    $a_ptr
> -       #xor    $t0, $t0
> -       add     $acc0, $acc1
> -       adc     \$0, %rdx
> -       add     %rax, $acc1
> -       mov     $acc0, %rax
> -       adc     %rdx, $acc2     # doesn't overflow
> -       #adc    \$0, $t0
> -
> +       mov     $acc0, $t0
> +       shl     \$32, $acc0
>         mulq    $t1
> -       xor     $acc0, $acc0
> -       #add    $t0, $acc3
> -       #adc    \$0, %rdx
> -       add     %rax, $acc3
> +       shr     \$32, $t0
> +       add     $acc0, $acc1            # +=acc[0]<<96
> +       adc     $t0, $acc2
> +       adc     %rax, $acc3
>          mov    $acc1, %rax
> -       adc     %rdx, $acc4
> -       adc     \$0, $acc0
> +       adc     \$0, %rdx
>
>         ##########################################
>         # Second iteration
> -       mulq    $a_ptr
> -       #xor    $t0, $t0
> -       add     $acc1, $acc2
> -       adc     \$0, %rdx
> -       add     %rax, $acc2
> -       mov     $acc1, %rax
> -       adc     %rdx, $acc3     # doesn't overflow
> -       #adc    \$0, $t0
> -
> +       mov     $acc1, $t0
> +       shl     \$32, $acc1
> +       mov     %rdx, $acc0
>         mulq    $t1
> -       xor     $acc1, $acc1
> -       #add    $t0, $acc4
> -       #adc    \$0, %rdx
> -       add     %rax, $acc4
> +       shr     \$32, $t0
> +       add     $acc1, $acc2
> +       adc     $t0, $acc3
> +       adc     %rax, $acc0
>          mov    $acc2, %rax
> -       adc     %rdx, $acc0
> -       adc     \$0, $acc1
> +       adc     \$0, %rdx
>
>         ##########################################
>         # Third iteration
> -       mulq    $a_ptr
> -       #xor    $t0, $t0
> -       add     $acc2, $acc3
> -       adc     \$0, %rdx
> -       add     %rax, $acc3
> -       mov     $acc2, %rax
> -       adc     %rdx, $acc4     # doesn't overflow
> -       #adc    \$0, $t0
> -
> +       mov     $acc2, $t0
> +       shl     \$32, $acc2
> +       mov     %rdx, $acc1
>         mulq    $t1
> -       xor     $acc2, $acc2
> -       #add    $t0, $acc0
> -       #adc    \$0, %rdx
> -       add     %rax, $acc0
> +       shr     \$32, $t0
> +       add     $acc2, $acc3
> +       adc     $t0, $acc0
> +       adc     %rax, $acc1
>          mov    $acc3, %rax
> -       adc     %rdx, $acc1
> -       adc     \$0, $acc2
> +       adc     \$0, %rdx
>
>         ###########################################
>         # Last iteration
> -       mulq    $a_ptr
> -       #xor    $t0, $t0
> -       add     $acc3, $acc4
> -       adc     \$0, %rdx
> -       add     %rax, $acc4
> -       mov     $acc3, %rax
> -       adc     %rdx, $acc0     # doesn't overflow
> -       #adc    \$0, $t0
> -
> +       mov     $acc3, $t0
> +       shl     \$32, $acc3
> +       mov     %rdx, $acc2
>         mulq    $t1
> +       shr     \$32, $t0
> +       add     $acc3, $acc0
> +       adc     $t0, $acc1
> +       adc     %rax, $acc2
> +       adc     \$0, %rdx
>         xor     $acc3, $acc3
> -       #add    $t0, $acc1
> -       #adc    \$0, %rdx
> -       add     %rax, $acc1
> -       adc     %rdx, $acc2
> -       adc     \$0, $acc3
>
>         ############################################
>         # Add the rest of the acc
> -       add     $acc0, $acc5
> +       add     $acc0, $acc4
> +       adc     $acc1, $acc5
>          mov    $acc4, $acc0
> -       adc     $acc1, $acc6
> -       adc     $acc2, $acc7
> +       adc     $acc2, $acc6
> +       adc     %rdx, $acc7
>          mov    $acc5, $acc1
>         adc     \$0, $acc3
>
> @@ -1028,18 +975,15 @@ __ecp_nistz256_mul_montx:
>
>         
> ########################################################################
>         # First reduction step
> -       xor     $acc0, $acc0            # $acc0=0,cf=0,of=0
> -       adox    $t1, $acc1
> -       adox    $t0, $acc2
> +       add     $t1, $acc1
> +       adc     $t0, $acc2
>
>         mulx    $poly3, $t0, $t1
>          mov    8*1($b_ptr), %rdx
> -       adox    $t0, $acc3
> -       adcx    $t1, $acc4
> -
> -       adox    $acc0, $acc4
> -       adcx    $acc0, $acc5            # cf=0
> -       adox    $acc0, $acc5            # of=0
> +       adc     $t0, $acc3
> +       adc     $t1, $acc4
> +       adc     \$0, $acc5
> +       xor     $acc0, $acc0            # $acc0=0,cf=0,of=0
>
>         
> ########################################################################
>         # Multiply by b[1]
> @@ -1068,18 +1012,15 @@ __ecp_nistz256_mul_montx:
>
>         
> ########################################################################
>         # Second reduction step
> -       xor     $acc1 ,$acc1            # $acc1=0,cf=0,of=0
> -       adox    $t0, $acc2
> -       adox    $t1, $acc3
> +       add     $t0, $acc2
> +       adc     $t1, $acc3
>
>         mulx    $poly3, $t0, $t1
>          mov    8*2($b_ptr), %rdx
> -       adox    $t0, $acc4
> -       adcx    $t1, $acc5
> -
> -       adox    $acc1, $acc5
> -       adcx    $acc1, $acc0            # cf=0
> -       adox    $acc1, $acc0            # of=0
> +       adc     $t0, $acc4
> +       adc     $t1, $acc5
> +       adc     \$0, $acc0
> +       xor     $acc1 ,$acc1            # $acc1=0,cf=0,of=0
>
>         
> ########################################################################
>         # Multiply by b[2]
> @@ -1108,18 +1049,15 @@ __ecp_nistz256_mul_montx:
>
>         
> ########################################################################
>         # Third reduction step
> -       xor     $acc2, $acc2            # $acc2=0,cf=0,of=0
> -       adox    $t0, $acc3
> -       adox    $t1, $acc4
> +       add     $t0, $acc3
> +       adc     $t1, $acc4
>
>         mulx    $poly3, $t0, $t1
>          mov    8*3($b_ptr), %rdx
> -       adox    $t0, $acc5
> -       adcx    $t1, $acc0
> -
> -       adox    $acc2, $acc0
> -       adcx    $acc2, $acc1            # cf=0
> -       adox    $acc2, $acc1            # of=0
> +       adc     $t0, $acc5
> +       adc     $t1, $acc0
> +       adc     \$0, $acc1
> +       xor     $acc2, $acc2            # $acc2=0,cf=0,of=0
>
>         
> ########################################################################
>         # Multiply by b[3]
> @@ -1148,25 +1086,21 @@ __ecp_nistz256_mul_montx:
>
>         
> ########################################################################
>         # Fourth reduction step
> -       xor     $acc3, $acc3            # $acc3=0,cf=0,of=0
> -       adox    $t0, $acc4
> -       adox    $t1, $acc5
> +       add     $t0, $acc4
> +       adc     $t1, $acc5
>
>         mulx    $poly3, $t0, $t1
>          mov    $acc4, $t2
>         mov     .Lpoly+8*1(%rip), $poly1
> -       adcx    $t0, $acc0
> -       adox    $t1, $acc1
> +       adc     $t0, $acc0
>          mov    $acc5, $t3
> -
> -       adcx    $acc3, $acc1
> -       adox    $acc3, $acc2
> +       adc     $t1, $acc1
>         adc     \$0, $acc2
> -        mov    $acc0, $t0
>
>         
> ########################################################################
>         # Branch-less conditional subtraction of P
>         xor     %eax, %eax
> +        mov    $acc0, $t0
>         sbb     \$-1, $acc4             # .Lpoly[0]
>         sbb     $poly1, $acc5           # .Lpoly[1]
>         sbb     \$0, $acc0              # .Lpoly[2]
> @@ -1247,52 +1181,44 @@ __ecp_nistz256_sqr_montx:
>          mov    .Lpoly+8*3(%rip), $t1
>
>         # reduction step 1
> -       xor     $acc0, $acc0
> -       adcx    $t0, $acc1
> -       adcx    $t4, $acc2
> +       add     $t0, $acc1
> +       adc     $t4, $acc2
>
> -       mulx    $t1, $t0, $t4
> +       mulx    $t1, $t0, $acc0
>          mov    $acc1, %rdx
> -       adcx    $t0, $acc3
> +       adc     $t0, $acc3
>          shlx   $a_ptr, $acc1, $t0
> -       adox    $t4, $acc0
> -        shrx   $a_ptr, $acc1, $t4
>         adc     \$0, $acc0
> +        shrx   $a_ptr, $acc1, $t4
>
>         # reduction step 2
> -       xor     $acc1, $acc1
> -       adcx    $t0, $acc2
> -       adcx    $t4, $acc3
> +       add     $t0, $acc2
> +       adc     $t4, $acc3
>
> -       mulx    $t1, $t0, $t4
> +       mulx    $t1, $t0, $acc1
>          mov    $acc2, %rdx
> -       adcx    $t0, $acc0
> +       adc     $t0, $acc0
>          shlx   $a_ptr, $acc2, $t0
> -       adox    $t4, $acc1
> -        shrx   $a_ptr, $acc2, $t4
>         adc     \$0, $acc1
> +        shrx   $a_ptr, $acc2, $t4
>
>         # reduction step 3
> -       xor     $acc2, $acc2
> -       adcx    $t0, $acc3
> -       adcx    $t4, $acc0
> +       add     $t0, $acc3
> +       adc     $t4, $acc0
>
> -       mulx    $t1, $t0, $t4
> +       mulx    $t1, $t0, $acc2
>          mov    $acc3, %rdx
> -       adcx    $t0, $acc1
> +       adc     $t0, $acc1
>          shlx   $a_ptr, $acc3, $t0
> -       adox    $t4, $acc2
> -        shrx   $a_ptr, $acc3, $t4
>         adc     \$0, $acc2
> +        shrx   $a_ptr, $acc3, $t4
>
>         # reduction step 4
> -       xor     $acc3, $acc3
> -       adcx    $t0, $acc0
> -       adcx    $t4, $acc1
> +       add     $t0, $acc0
> +       adc     $t4, $acc1
>
> -       mulx    $t1, $t0, $t4
> -       adcx    $t0, $acc2
> -       adox    $t4, $acc3
> +       mulx    $t1, $t0, $acc3
> +       adc     $t0, $acc2
>         adc     \$0, $acc3
>
>         xor     $t3, $t3                # cf=0
>


______________________________________________________________________
OpenSSL Project                                 http://www.openssl.org
Development Mailing List                       openssl-dev@openssl.org
Automated List Manager                           majord...@openssl.org

Reply via email to