On Mon, Dec 1, 2014 at 3:23 PM, Andy Polyakov via RT <r...@openssl.org> wrote:
>>> (Affects 1.0.2 only.)
>>>
>>> In crypto/ec/asm/ecp_nistz256-x86_64.pl, __ecp_nistz256_sqr_montq,
>>> under "Now the reduction" there are a number of comments saying
>>> "doesn't overflow". Unfortunately, they aren't correct.
>>
>> Got the math wrong :-( Attached is not only a fixed version, but an
>> even faster one.
>
> Please test the attached one instead. Squaring didn't cover one case,
> and AD*X is optimized.
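For reference, the reduction at issue exploits the special form of the
P-256 prime, p = 2^256 - 2^224 + 2^192 + 2^96 - 1. Below is a minimal
Python sketch of one word-level Montgomery reduction step (an
illustration under my own naming, not code from the patch; reduce_step
is a hypothetical helper). It checks the identity the fixed code relies
on, acc[0] * p256[0..1] + acc[0] == acc[0] << 96, which is why the patch
can replace the multiplications by p256[0] and p256[1] with a
shl/shr-by-32 pair:

# Minimal sketch (Python 3.8+) of one word-level Montgomery reduction
# step for P-256.  Illustration only; not OpenSSL code.

import random

MASK64 = (1 << 64) - 1
P256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
# Little-endian 64-bit limbs of P256:
#   p[0] = 2^64 - 1,  p[1] = 2^32 - 1,  p[2] = 0,  p[3] = 2^64 - 2^32 + 1

def reduce_step(acc):
    # Because p[0] = -1 mod 2^64, the Montgomery multiplier is simply
    # m = acc mod 2^64, and m*(p[0] + p[1]*2^64) + m == m << 96, so the
    # low 96 bits of m*P256 reduce to the patch's shl/shr-by-32 pair.
    m = acc & MASK64
    assert m * ((2**64 - 1) + (2**32 - 1) * 2**64) + m == m << 96
    acc += m * P256
    assert acc & MASK64 == 0        # the low limb cancels exactly
    return acc >> 64                # drop it and shift down one limb

# Four steps take a 512-bit square down to at most 257 bits; the final
# branch-less conditional subtraction of P256 is omitted here.
a = random.randrange(P256)
acc = a * a
for _ in range(4):
    acc = reduce_step(acc)
assert acc % P256 == a * a * pow(2, -256, P256) % P256   # Montgomery form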
Thanks! I was away last week and so didn't have a chance to try fixing
this. I'll patch that in and run the tests against it.

Cheers

AGL

>
>> On a related note: it's possible to improve server-side ECDSA by ~5% by
>> switching [back] to scatter-gather. [The change from scatter-gather was
>> prompted by concern about a timing dependency, but I argue that concern
>> is not valid in most cases.] There are also x86 and ARM versions pending:
>>
>> # with/without -DECP_NISTZ256_ASM
>> # Pentium       +66-168%
>> # PIII          +73-175%
>> # P4            +68-140%
>> # Core2         +90-215%
>> # Sandy Bridge  +105-265% (contemporary i[57]-* are all close to this)
>> # Atom          +66-160%
>> # Opteron       +54-112%
>> # Bulldozer     +99-240%
>> # VIA Nano      +93-300%
>>
>> # with/without -DECP_NISTZ256_ASM
>> # Cortex-A8     +53-173%
>> # Cortex-A9     +76-205%
>> # Cortex-A15    +100-316%
>> # Snapdragon S4 +66-187%
>>
>> No, the bug in question is not there. Nor is the AD*X code path affected.
>>
>>
>
>
>
> diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
> index 4486a5e..56f6c2b 100755
> --- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
> +++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
> @@ -31,15 +31,15 @@
> # Further optimization by <ap...@openssl.org>:
> #
> # this/original
> -# Opteron      +8-33%
> -# Bulldozer    +10-30%
> -# P4           +14-38%
> -# Westmere     +8-23%
> -# Sandy Bridge +8-24%
> -# Ivy Bridge   +7-25%
> -# Haswell      +5-25%
> -# Atom         +10-32%
> -# VIA Nano     +37-130%
> +# Opteron      +10-43%
> +# Bulldozer    +14-43%
> +# P4           +18-50%
> +# Westmere     +12-36%
> +# Sandy Bridge +9-36%
> +# Ivy Bridge   +9-36%
> +# Haswell      +8-37%
> +# Atom         +15-50%
> +# VIA Nano     +43-160%
> #
> # Ranges denote minimum and maximum improvement coefficients depending
> # on benchmark. Lower coefficients are for ECDSA sign, relatively
> @@ -550,28 +550,20 @@ __ecp_nistz256_mul_montq:
> # and add the result to the acc.
> # Due to the special form of p256 we do some optimizations
> #
> - # acc[0] x p256[0] = acc[0] x 2^64 - acc[0]
> - # then we add acc[0] and get acc[0] x 2^64
> -
> - mulq $poly1
> - xor $t0, $t0
> - add $acc0, $acc1 # +=acc[0]*2^64
> - adc \$0, %rdx
> - add %rax, $acc1
> - mov $acc0, %rax
> -
> - # acc[0] x p256[2] = 0
> - adc %rdx, $acc2
> - adc \$0, $t0
> + # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
> + # then we add acc[0] and get acc[0] x 2^96
>
> + mov $acc0, $t1
> + shl \$32, $acc0
> mulq $poly3
> - xor $acc0, $acc0
> - add $t0, $acc3
> - adc \$0, %rdx
> - add %rax, $acc3
> + shr \$32, $t1
> + add $acc0, $acc1 # +=acc[0]<<96
> + adc $t1, $acc2
> + adc %rax, $acc3
> mov 8*1($b_ptr), %rax
> adc %rdx, $acc4
> adc \$0, $acc5
> + xor $acc0, $acc0
>
>
> ########################################################################
> # Multiply by b[1]
> @@ -608,23 +600,17 @@ __ecp_nistz256_mul_montq:
>
> ########################################################################
> # Second reduction step
> - mulq $poly1
> - xor $t0, $t0
> - add $acc1, $acc2
> - adc \$0, %rdx
> - add %rax, $acc2
> - mov $acc1, %rax
> - adc %rdx, $acc3
> - adc \$0, $t0
> -
> + mov $acc1, $t1
> + shl \$32, $acc1
> mulq $poly3
> - xor $acc1, $acc1
> - add $t0, $acc4
> - adc \$0, %rdx
> - add %rax, $acc4
> + shr \$32, $t1
> + add $acc1, $acc2
> + adc $t1, $acc3
> + adc %rax, $acc4
> mov 8*2($b_ptr), %rax
> adc %rdx, $acc5
> adc \$0, $acc0
> + xor $acc1, $acc1
>
>
> ########################################################################
> # Multiply by b[2]
> @@ -661,23 +647,17 @@ __ecp_nistz256_mul_montq:
>
> ########################################################################
> # Third reduction step
> - mulq $poly1
> - xor $t0, $t0
> - add $acc2, $acc3
> - adc \$0, %rdx
> - add %rax, $acc3
> - mov $acc2, %rax
> - adc %rdx, $acc4
> - adc \$0, $t0
> -
> + mov $acc2, $t1
> + shl \$32, $acc2
> mulq $poly3
> - xor $acc2, $acc2
> - add $t0, $acc5
> - adc \$0, %rdx
> - add %rax, $acc5
> + shr \$32, $t1
> + add $acc2, $acc3
> + adc $t1, $acc4
> + adc %rax, $acc5
> mov 8*3($b_ptr), %rax
> adc %rdx, $acc0
> adc \$0, $acc1
> + xor $acc2, $acc2
>
>
> ########################################################################
> # Multiply by b[3]
> @@ -714,20 +694,14 @@ __ecp_nistz256_mul_montq:
>
> ########################################################################
> # Final reduction step
> - mulq $poly1
> - #xor $t0, $t0
> - add $acc3, $acc4
> - adc \$0, %rdx
> - add %rax, $acc4
> - mov $acc3, %rax
> - adc %rdx, $acc5
> - #adc \$0, $t0 # doesn't overflow
> -
> + mov $acc3, $t1
> + shl \$32, $acc3
> mulq $poly3
> - #add $t0, $acc0
> - #adc \$0, %rdx
> + shr \$32, $t1
> + add $acc3, $acc4
> + adc $t1, $acc5
> mov $acc4, $t0
> - add %rax, $acc0
> + adc %rax, $acc0
> adc %rdx, $acc1
> mov $acc5, $t1
> adc \$0, $acc2
> @@ -897,89 +871,62 @@ __ecp_nistz256_sqr_montq:
> ##########################################
> # Now the reduction
> # First iteration
> - mulq $a_ptr
> - #xor $t0, $t0
> - add $acc0, $acc1
> - adc \$0, %rdx
> - add %rax, $acc1
> - mov $acc0, %rax
> - adc %rdx, $acc2 # doesn't overflow
> - #adc \$0, $t0
> -
> + mov $acc0, $t0
> + shl \$32, $acc0
> mulq $t1
> - xor $acc0, $acc0
> - #add $t0, $acc3
> - #adc \$0, %rdx
> - add %rax, $acc3
> + shr \$32, $t0
> + add $acc0, $acc1 # +=acc[0]<<96
> + adc $t0, $acc2
> + adc %rax, $acc3
> mov $acc1, %rax
> - adc %rdx, $acc4
> - adc \$0, $acc0
> + adc \$0, %rdx
>
> ##########################################
> # Second iteration
> - mulq $a_ptr
> - #xor $t0, $t0
> - add $acc1, $acc2
> - adc \$0, %rdx
> - add %rax, $acc2
> - mov $acc1, %rax
> - adc %rdx, $acc3 # doesn't overflow
> - #adc \$0, $t0
> -
> + mov $acc1, $t0
> + shl \$32, $acc1
> + mov %rdx, $acc0
> mulq $t1
> - xor $acc1, $acc1
> - #add $t0, $acc4
> - #adc \$0, %rdx
> - add %rax, $acc4
> + shr \$32, $t0
> + add $acc1, $acc2
> + adc $t0, $acc3
> + adc %rax, $acc0
> mov $acc2, %rax
> - adc %rdx, $acc0
> - adc \$0, $acc1
> + adc \$0, %rdx
>
> ##########################################
> # Third iteration
> - mulq $a_ptr
> - #xor $t0, $t0
> - add $acc2, $acc3
> - adc \$0, %rdx
> - add %rax, $acc3
> - mov $acc2, %rax
> - adc %rdx, $acc4 # doesn't overflow
> - #adc \$0, $t0
> -
> + mov $acc2, $t0
> + shl \$32, $acc2
> + mov %rdx, $acc1
> mulq $t1
> - xor $acc2, $acc2
> - #add $t0, $acc0
> - #adc \$0, %rdx
> - add %rax, $acc0
> + shr \$32, $t0
> + add $acc2, $acc3
> + adc $t0, $acc0
> + adc %rax, $acc1
> mov $acc3, %rax
> - adc %rdx, $acc1
> - adc \$0, $acc2
> + adc \$0, %rdx
>
> ###########################################
> # Last iteration
> - mulq $a_ptr
> - #xor $t0, $t0
> - add $acc3, $acc4
> - adc \$0, %rdx
> - add %rax, $acc4
> - mov $acc3, %rax
> - adc %rdx, $acc0 # doesn't overflow
> - #adc \$0, $t0
> -
> + mov $acc3, $t0
> + shl \$32, $acc3
> + mov %rdx, $acc2
> mulq $t1
> + shr \$32, $t0
> + add $acc3, $acc0
> + adc $t0, $acc1
> + adc %rax, $acc2
> + adc \$0, %rdx
> xor $acc3, $acc3
> - #add $t0, $acc1
> - #adc \$0, %rdx
> - add %rax, $acc1
> - adc %rdx, $acc2
> - adc \$0, $acc3
>
> ############################################
> # Add the rest of the acc
> - add $acc0, $acc5
> + add $acc0, $acc4
> + adc $acc1, $acc5
> mov $acc4, $acc0
> - adc $acc1, $acc6
> - adc $acc2, $acc7
> + adc $acc2, $acc6
> + adc %rdx, $acc7
> mov $acc5, $acc1
> adc \$0, $acc3
>
> @@ -1028,18 +975,15 @@ __ecp_nistz256_mul_montx:
>
> ########################################################################
> # First reduction step
> - xor $acc0, $acc0 # $acc0=0,cf=0,of=0
> - adox $t1, $acc1
> - adox $t0, $acc2
> + add $t1, $acc1
> + adc $t0, $acc2
>
> mulx $poly3, $t0, $t1
> mov 8*1($b_ptr), %rdx
> - adox $t0, $acc3
> - adcx $t1, $acc4
> -
> - adox $acc0, $acc4
> - adcx $acc0, $acc5 # cf=0
> - adox $acc0, $acc5 # of=0
> + adc $t0, $acc3
> + adc $t1, $acc4
> + adc \$0, $acc5
> + xor $acc0, $acc0 # $acc0=0,cf=0,of=0
>
>
> ########################################################################
> # Multiply by b[1]
> @@ -1068,18 +1012,15 @@ __ecp_nistz256_mul_montx:
>
> ########################################################################
> # Second reduction step
> - xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
> - adox $t0, $acc2
> - adox $t1, $acc3
> + add $t0, $acc2
> + adc $t1, $acc3
>
> mulx $poly3, $t0, $t1
> mov 8*2($b_ptr), %rdx
> - adox $t0, $acc4
> - adcx $t1, $acc5
> -
> - adox $acc1, $acc5
> - adcx $acc1, $acc0 # cf=0
> - adox $acc1, $acc0 # of=0
> + adc $t0, $acc4
> + adc $t1, $acc5
> + adc \$0, $acc0
> + xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
>
>
> ########################################################################
> # Multiply by b[2]
> @@ -1108,18 +1049,15 @@ __ecp_nistz256_mul_montx:
>
> ########################################################################
> # Third reduction step
> - xor $acc2, $acc2 # $acc2=0,cf=0,of=0
> - adox $t0, $acc3
> - adox $t1, $acc4
> + add $t0, $acc3
> + adc $t1, $acc4
>
> mulx $poly3, $t0, $t1
> mov 8*3($b_ptr), %rdx
> - adox $t0, $acc5
> - adcx $t1, $acc0
> -
> - adox $acc2, $acc0
> - adcx $acc2, $acc1 # cf=0
> - adox $acc2, $acc1 # of=0
> + adc $t0, $acc5
> + adc $t1, $acc0
> + adc \$0, $acc1
> + xor $acc2, $acc2 # $acc2=0,cf=0,of=0
>
>
> ########################################################################
> # Multiply by b[3]
> @@ -1148,25 +1086,21 @@ __ecp_nistz256_mul_montx:
>
> ########################################################################
> # Fourth reduction step
> - xor $acc3, $acc3 # $acc3=0,cf=0,of=0
> - adox $t0, $acc4
> - adox $t1, $acc5
> + add $t0, $acc4
> + adc $t1, $acc5
>
> mulx $poly3, $t0, $t1
> mov $acc4, $t2
> mov .Lpoly+8*1(%rip), $poly1
> - adcx $t0, $acc0
> - adox $t1, $acc1
> + adc $t0, $acc0
> mov $acc5, $t3
> -
> - adcx $acc3, $acc1
> - adox $acc3, $acc2
> + adc $t1, $acc1
> adc \$0, $acc2
> - mov $acc0, $t0
>
>
> ########################################################################
> # Branch-less conditional subtraction of P
> xor %eax, %eax
> + mov $acc0, $t0
> sbb \$-1, $acc4 # .Lpoly[0]
> sbb $poly1, $acc5 # .Lpoly[1]
> sbb \$0, $acc0 # .Lpoly[2]
> @@ -1247,52 +1181,44 @@ __ecp_nistz256_sqr_montx:
> mov .Lpoly+8*3(%rip), $t1
>
> # reduction step 1
> - xor $acc0, $acc0
> - adcx $t0, $acc1
> - adcx $t4, $acc2
> + add $t0, $acc1
> + adc $t4, $acc2
>
> - mulx $t1, $t0, $t4
> + mulx $t1, $t0, $acc0
> mov $acc1, %rdx
> - adcx $t0, $acc3
> + adc $t0, $acc3
> shlx $a_ptr, $acc1, $t0
> - adox $t4, $acc0
> - shrx $a_ptr, $acc1, $t4
> adc \$0, $acc0
> + shrx $a_ptr, $acc1, $t4
>
> # reduction step 2
> - xor $acc1, $acc1
> - adcx $t0, $acc2
> - adcx $t4, $acc3
> + add $t0, $acc2
> + adc $t4, $acc3
>
> - mulx $t1, $t0, $t4
> + mulx $t1, $t0, $acc1
> mov $acc2, %rdx
> - adcx $t0, $acc0
> + adc $t0, $acc0
> shlx $a_ptr, $acc2, $t0
> - adox $t4, $acc1
> - shrx $a_ptr, $acc2, $t4
> adc \$0, $acc1
> + shrx $a_ptr, $acc2, $t4
>
> # reduction step 3
> - xor $acc2, $acc2
> - adcx $t0, $acc3
> - adcx $t4, $acc0
> + add $t0, $acc3
> + adc $t4, $acc0
>
> - mulx $t1, $t0, $t4
> + mulx $t1, $t0, $acc2
> mov $acc3, %rdx
> - adcx $t0, $acc1
> + adc $t0, $acc1
> shlx $a_ptr, $acc3, $t0
> - adox $t4, $acc2
> - shrx $a_ptr, $acc3, $t4
> adc \$0, $acc2
> + shrx $a_ptr, $acc3, $t4
>
> # reduction step 4
> - xor $acc3, $acc3
> - adcx $t0, $acc0
> - adcx $t4, $acc1
> + add $t0, $acc0
> + adc $t4, $acc1
>
> - mulx $t1, $t0, $t4
> - adcx $t0, $acc2
> - adox $t4, $acc3
> + mulx $t1, $t0, $acc3
> + adc $t0, $acc2
> adc \$0, $acc3
>
> xor $t3, $t3 # cf=0
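Both the mulq and mulx paths above end with the same "Branch-less
conditional subtraction of P". As a rough Python rendering of that idiom
(hypothetical code again; the real thing is the sbb chain plus cmov
selection seen in the hunk):

# Hypothetical sketch of the branch-less final subtraction.  After
# reduction the value is below 2*P256, so one subtraction suffices; the
# result is selected with a mask rather than a branch, keeping the
# sequence constant-time.

MASK256 = (1 << 256) - 1
P256 = 2**256 - 2**224 + 2**192 + 2**96 - 1

def cond_sub_p(acc):
    t = acc - P256
    borrow = (t >> 256) & 1    # 1 iff acc < P256 (the final sbb's carry)
    mask = -borrow             # all-ones when acc must be kept as-is
    return ((t & ~mask) | (acc & mask)) & MASK256

assert cond_sub_p(P256 + 5) == 5   # subtraction taken
assert cond_sub_p(5) == 5          # subtraction discarded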