>> There seems to be a bug in the AVX2 codepath in poly1305-x86.pl. I have not >> attempted to debug this, but I have attached a test file which produces >> different output in normal and AVX2 codepaths. Our existing poly1305 >> implementation agrees with the former. >> >> $ OPENSSL_ia32cap=0 ./poly1305_test >> PASS >> $ ./poly1305_test >> Poly1305 test failed. >> got: 2e65f0054e36505687d937ff5e8ed112 >> expected: 69d28f73dd09d39a92aa179da354b7ea > > While I keep looking into this further, please double-check the attached patch.
Attached is an alternative solution; please double-check it. -- Ticket here: http://rt.openssl.org/Ticket/Display.html?id=4346 Please log in as guest with password guest if prompted
commit 3902c696c5406d68b17cecf9a9effb0edcb84eb2 Author: Andy Polyakov <ap...@openssl.org> Date: Sun Feb 28 21:48:43 2016 +0100 poly1305/asm/poly1305-*.pl: flip horizontal add and reduction. Formally only 32-bit AVX2 code path needs this, but I choose to harmonize all vector code paths. RT#4346 diff --git a/crypto/poly1305/asm/poly1305-armv4.pl b/crypto/poly1305/asm/poly1305-armv4.pl index 65b79cf..ac202ad 100755 --- a/crypto/poly1305/asm/poly1305-armv4.pl +++ b/crypto/poly1305/asm/poly1305-armv4.pl @@ -1057,6 +1057,15 @@ poly1305_blocks_neon: .Lshort_tail: @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ horizontal addition + + vadd.i64 $D3#lo,$D3#lo,$D3#hi + vadd.i64 $D0#lo,$D0#lo,$D0#hi + vadd.i64 $D4#lo,$D4#lo,$D4#hi + vadd.i64 $D1#lo,$D1#lo,$D1#hi + vadd.i64 $D2#lo,$D2#lo,$D2#hi + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ @ lazy reduction, but without narrowing vshr.u64 $T0,$D3,#26 @@ -1086,15 +1095,6 @@ poly1305_blocks_neon: vadd.i64 $D1,$D1,$T0 @ h0 -> h1 vadd.i64 $D4,$D4,$T1 @ h3 -> h4 - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ horizontal addition - - vadd.i64 $D2#lo,$D2#lo,$D2#hi - vadd.i64 $D0#lo,$D0#lo,$D0#hi - vadd.i64 $D3#lo,$D3#lo,$D3#hi - vadd.i64 $D1#lo,$D1#lo,$D1#hi - vadd.i64 $D4#lo,$D4#lo,$D4#hi - cmp $len,#0 bne .Leven diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl index 79185d2..f1359fd 100755 --- a/crypto/poly1305/asm/poly1305-armv8.pl +++ b/crypto/poly1305/asm/poly1305-armv8.pl @@ -791,6 +791,19 @@ poly1305_blocks_neon: .Lshort_tail: //////////////////////////////////////////////////////////////// + // horizontal add + + addp $ACC3,$ACC3,$ACC3 + ldp d8,d9,[sp,#16] // meet ABI requirements + addp $ACC0,$ACC0,$ACC0 + ldp d10,d11,[sp,#32] + addp $ACC4,$ACC4,$ACC4 + ldp d12,d13,[sp,#48] + addp $ACC1,$ACC1,$ACC1 + ldp d14,d15,[sp,#64] + addp $ACC2,$ACC2,$ACC2 + + //////////////////////////////////////////////////////////////// // 
lazy reduction, but without narrowing ushr $T0.2d,$ACC3,#26 @@ -822,19 +835,6 @@ poly1305_blocks_neon: add $ACC4,$ACC4,$T1.2d // h3 -> h4 //////////////////////////////////////////////////////////////// - // horizontal add - - addp $ACC2,$ACC2,$ACC2 - ldp d8,d9,[sp,#16] // meet ABI requirements - addp $ACC0,$ACC0,$ACC0 - ldp d10,d11,[sp,#32] - addp $ACC1,$ACC1,$ACC1 - ldp d12,d13,[sp,#48] - addp $ACC3,$ACC3,$ACC3 - ldp d14,d15,[sp,#64] - addp $ACC4,$ACC4,$ACC4 - - //////////////////////////////////////////////////////////////// // write the result, can be partially reduced st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 diff --git a/crypto/poly1305/asm/poly1305-x86.pl b/crypto/poly1305/asm/poly1305-x86.pl index 7c1aee5..fb9fa2b 100755 --- a/crypto/poly1305/asm/poly1305-x86.pl +++ b/crypto/poly1305/asm/poly1305-x86.pl @@ -536,6 +536,8 @@ my $base = shift; $base = "esp" if (!defined($base)); },"edx"); sub lazy_reduction { +my $extra = shift; + ################################################################ # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein # and P. 
Schwabe @@ -543,6 +545,7 @@ sub lazy_reduction { &movdqa ($T0,$D3); &pand ($D3,$MASK); &psrlq ($T0,26); + &$extra () if (defined($extra)); &paddq ($T0,$D4); # h3 -> h4 &movdqa ($T1,$D0); &pand ($D0,$MASK); @@ -1091,21 +1094,21 @@ my $addr = shift; &set_label("short_tail"); - &lazy_reduction (); - ################################################################ # horizontal addition + &pshufd ($T1,$D4,0b01001110); + &pshufd ($T0,$D3,0b01001110); + &paddq ($D4,$T1); + &paddq ($D3,$T0); &pshufd ($T1,$D0,0b01001110); &pshufd ($T0,$D1,0b01001110); - &paddd ($D0,$T1); + &paddq ($D0,$T1); + &paddq ($D1,$T0); &pshufd ($T1,$D2,0b01001110); - &paddd ($D1,$T0); - &pshufd ($T0,$D3,0b01001110); - &paddd ($D2,$T1); - &pshufd ($T1,$D4,0b01001110); - &paddd ($D3,$T0); - &paddd ($D4,$T1); + #&paddq ($D2,$T1); + + &lazy_reduction (sub { &paddq ($D2,$T1) }); &set_label("done"); &movd (&DWP(-16*3+4*0,"edi"),$D0); # store hash value @@ -1113,8 +1116,8 @@ my $addr = shift; &movd (&DWP(-16*3+4*2,"edi"),$D2); &movd (&DWP(-16*3+4*3,"edi"),$D3); &movd (&DWP(-16*3+4*4,"edi"),$D4); -&set_label("nodata"); &mov ("esp","ebp"); +&set_label("nodata"); &function_end("_poly1305_blocks_sse2"); &align (32); @@ -1435,7 +1438,7 @@ sub X { my $reg=shift; $reg=~s/^ymm/xmm/; $reg; } &test ("eax","eax"); # is_base2_26? 
&jz (&label("enter_blocks")); -&set_label("enter_avx2",16); +&set_label("enter_avx2"); &vzeroupper (); &call (&label("pic_point")); @@ -1731,31 +1734,31 @@ sub vlazy_reduction { &vpmuladd (sub { my $i=shift; &QWP(4+32*$i-128,"edx"); }); - &vlazy_reduction(); - ################################################################ # horizontal addition + &vpsrldq ($T0,$D4,8); + &vpsrldq ($T1,$D3,8); + &vpaddq ($D4,$D4,$T0); &vpsrldq ($T0,$D0,8); + &vpaddq ($D3,$D3,$T1); &vpsrldq ($T1,$D1,8); &vpaddq ($D0,$D0,$T0); &vpsrldq ($T0,$D2,8); &vpaddq ($D1,$D1,$T1); - &vpsrldq ($T1,$D3,8); + &vpermq ($T1,$D4,2); # keep folding &vpaddq ($D2,$D2,$T0); - &vpsrldq ($T0,$D4,8); - &vpaddq ($D3,$D3,$T1); - &vpermq ($T1,$D0,2); # keep folding - &vpaddq ($D4,$D4,$T0); + &vpermq ($T0,$D3,2); + &vpaddq ($D4,$D4,$T1); + &vpermq ($T1,$D0,2); + &vpaddq ($D3,$D3,$T0); &vpermq ($T0,$D1,2); &vpaddq ($D0,$D0,$T1); &vpermq ($T1,$D2,2); &vpaddq ($D1,$D1,$T0); - &vpermq ($T0,$D3,2); &vpaddq ($D2,$D2,$T1); - &vpermq ($T1,$D4,2); - &vpaddq ($D3,$D3,$T0); - &vpaddq ($D4,$D4,$T1); + + &vlazy_reduction(); &cmp ("ecx",0); &je (&label("done")); @@ -1772,14 +1775,14 @@ sub vlazy_reduction { &jmp (&label("even")); &set_label("done",16); - &vmovd (&DWP(-16*3+4*0,"edi"),"xmm0"); # store hash value - &vmovd (&DWP(-16*3+4*1,"edi"),"xmm1"); - &vmovd (&DWP(-16*3+4*2,"edi"),"xmm2"); - &vmovd (&DWP(-16*3+4*3,"edi"),"xmm3"); - &vmovd (&DWP(-16*3+4*4,"edi"),"xmm4"); + &vmovd (&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value + &vmovd (&DWP(-16*3+4*1,"edi"),&X($D1)); + &vmovd (&DWP(-16*3+4*2,"edi"),&X($D2)); + &vmovd (&DWP(-16*3+4*3,"edi"),&X($D3)); + &vmovd (&DWP(-16*3+4*4,"edi"),&X($D4)); &vzeroupper (); -&set_label("nodata"); &mov ("esp","ebp"); +&set_label("nodata"); &function_end("_poly1305_blocks_avx2"); } &set_label("const_sse2",64); diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl index b827d24..2265664 100755 --- a/crypto/poly1305/asm/poly1305-x86_64.pl +++ 
b/crypto/poly1305/asm/poly1305-x86_64.pl @@ -1198,6 +1198,20 @@ $code.=<<___; .Lshort_tail_avx: ################################################################ + # horizontal addition + + vpsrldq \$8,$D4,$T4 + vpsrldq \$8,$D3,$T3 + vpsrldq \$8,$D1,$T1 + vpsrldq \$8,$D0,$T0 + vpsrldq \$8,$D2,$T2 + vpaddq $T3,$D3,$D3 + vpaddq $T4,$D4,$D4 + vpaddq $T0,$D0,$D0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$D2,$D2 + + ################################################################ # lazy reduction vpsrlq \$26,$D3,$H3 @@ -1231,25 +1245,11 @@ $code.=<<___; vpand $MASK,$D3,$D3 vpaddq $H3,$D4,$D4 # h3 -> h4 - ################################################################ - # horizontal addition - - vpsrldq \$8,$D2,$T2 - vpsrldq \$8,$D0,$T0 - vpsrldq \$8,$D1,$T1 - vpsrldq \$8,$D3,$T3 - vpsrldq \$8,$D4,$T4 - vpaddq $T2,$D2,$H2 - vpaddq $T0,$D0,$H0 - vpaddq $T1,$D1,$H1 - vpaddq $T3,$D3,$H3 - vpaddq $T4,$D4,$H4 - - vmovd $H0,`4*0-48-64`($ctx) # save partially reduced - vmovd $H1,`4*1-48-64`($ctx) - vmovd $H2,`4*2-48-64`($ctx) - vmovd $H3,`4*3-48-64`($ctx) - vmovd $H4,`4*4-48-64`($ctx) + vmovd $D0,`4*0-48-64`($ctx) # save partially reduced + vmovd $D1,`4*1-48-64`($ctx) + vmovd $D2,`4*2-48-64`($ctx) + vmovd $D3,`4*3-48-64`($ctx) + vmovd $D4,`4*4-48-64`($ctx) ___ $code.=<<___ if ($win64); vmovdqa 0x50(%r11),%xmm6 @@ -1888,6 +1888,31 @@ $code.=<<___; vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 ################################################################ + # horizontal addition + + vpsrldq \$8,$D1,$T1 + vpsrldq \$8,$H2,$T2 + vpsrldq \$8,$H3,$T3 + vpsrldq \$8,$H4,$T4 + vpsrldq \$8,$H0,$T0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$H2,$H2 + vpaddq $T3,$H3,$H3 + vpaddq $T4,$H4,$H4 + vpaddq $T0,$H0,$H0 + + vpermq \$0x2,$H3,$T3 + vpermq \$0x2,$H4,$T4 + vpermq \$0x2,$H0,$T0 + vpermq \$0x2,$D1,$T1 + vpermq \$0x2,$H2,$T2 + vpaddq $T3,$H3,$H3 + vpaddq $T4,$H4,$H4 + vpaddq $T0,$H0,$H0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$H2,$H2 + + ################################################################ # lazy reduction 
vpsrlq \$26,$H3,$D3 @@ -1921,31 +1946,6 @@ $code.=<<___; vpand $MASK,$H3,$H3 vpaddq $D3,$H4,$H4 # h3 -> h4 - ################################################################ - # horizontal addition - - vpsrldq \$8,$H2,$T2 - vpsrldq \$8,$H0,$T0 - vpsrldq \$8,$H1,$T1 - vpsrldq \$8,$H3,$T3 - vpsrldq \$8,$H4,$T4 - vpaddq $T2,$H2,$H2 - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - vpaddq $T3,$H3,$H3 - vpaddq $T4,$H4,$H4 - - vpermq \$0x2,$H2,$T2 - vpermq \$0x2,$H0,$T0 - vpermq \$0x2,$H1,$T1 - vpermq \$0x2,$H3,$T3 - vpermq \$0x2,$H4,$T4 - vpaddq $T2,$H2,$H2 - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - vpaddq $T3,$H3,$H3 - vpaddq $T4,$H4,$H4 - vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced vmovd %x#$H1,`4*1-48-64`($ctx) vmovd %x#$H2,`4*2-48-64`($ctx) diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c index 7c9f302..303822e 100644 --- a/crypto/poly1305/poly1305.c +++ b/crypto/poly1305/poly1305.c @@ -668,6 +668,20 @@ static const struct poly1305_test poly1305_tests[] = { "f248312e578d9d58f8b7bb4d19105431" }, /* + * AVX2 in poly1305-x86.pl failed this with 176+32 split + */ + { + "248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd" + "2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e8" + "74cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c" + "8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936a" + "ff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a37" + "09894e4eb0a4eedc4ae19468e66b81f2" + "71351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb", + "000102030405060708090a0b0c0d0e0f""00000000000000000000000000000000", + "bc939bc5281480fa99c6d68c258ec42f" + }, + /* * test vectors from Google */ { @@ -844,6 +858,23 @@ int main() printf("\n"); return 1; } + + for (half = 16; half < inlen; half += 16) { + Poly1305_Init(&poly1305, key); + Poly1305_Update(&poly1305, in, half); + Poly1305_Update(&poly1305, in+half, inlen-half); + Poly1305_Final(&poly1305, out); + + if (memcmp(out, 
expected, sizeof(expected)) != 0) { + printf("Poly1305 test #%d/%d failed.\n", i, half); + printf("got: "); + hexdump(out, sizeof(out)); + printf("\nexpected: "); + hexdump(expected, sizeof(expected)); + printf("\n"); + return 1; + } + } } free(in);
-- openssl-dev mailing list To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev