> I'm unclear on what EVP_CIPHER's interface guarantees are, but our EVP_AEAD
> APIs are documented to allow in/out buffers to alias as long as out is <= in.
> This matches what callers might expect from a naive implementation.
>
> Our AES-GCM EVP_AEADs, which share code with OpenSSL, have tended to match
> this pattern too. For ChaCha, of chacha-{x86,x86_64,armv4,armv8}.pl and the
> C implementation, all seem to satisfy this (though it's possible I don't have
> complete coverage) except for chacha-x86.pl. That one works if in == out,
> but not if out is slightly behind.
>
> We were able to reproduce problems when in = out + 1. The SSE3 code
> triggers if the input is at least 256 bytes and the non-SSE3 code if the
> input is at least 64 bytes. The non-SSE3 failure is because the words in a
> block are processed in a slightly funny order (0, 4, 8, 9, 12, 14, 1, 2, 3,
> 5, 6, 7, 10, 11, 13, 15). I haven't looked at the SSE3 case carefully, but
> I expect it's something similar.
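For concreteness, the aliasing contract described above can be exercised with a
small harness along these lines. This is only a sketch: chacha20_xor() is a
hypothetical one-shot wrapper that has to be pointed at (and linked against)
whichever ChaCha20 implementation is under test, and the real entry point's
name and nonce/counter layout may differ.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical one-shot wrapper around the implementation under test;
 * substitute the real entry point before building. */
void chacha20_xor(uint8_t *out, const uint8_t *in, size_t len,
                  const uint8_t key[32], const uint8_t nonce[12],
                  uint32_t counter);

/* Encrypt |len| bytes twice: once with distinct buffers, once with the
 * output starting one byte before the input (in = out + 1), and compare. */
static int check_alias(size_t len) {
    static uint8_t key[32], nonce[12];   /* all-zero key/nonce is fine here */
    uint8_t ref[512], buf[512 + 1], plain[512];

    memset(plain, 0x42, len);
    chacha20_xor(ref, plain, len, key, nonce, 0);     /* non-aliased */

    memcpy(buf + 1, plain, len);                      /* in  = buf + 1 */
    chacha20_xor(buf, buf + 1, len, key, nonce, 0);   /* out = in - 1  */

    if (memcmp(ref, buf, len) != 0) {
        printf("mismatch at len=%zu\n", len);
        return 0;
    }
    return 1;
}

int main(void) {
    int ok = 1;
    for (size_t len = 1; len <= 512; len++)
        ok &= check_alias(len);
    return ok ? 0 : 1;
}

A loop like this is the kind of check that surfaces the in = out + 1 failures
at 64 bytes (non-SSE3) and 256 bytes (SSE3) mentioned above.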
It's in 16-byte chunks numbered 0,4,8,12, 1,5,9,13, 2,6,...

> Could the blocks perhaps be processed in a more straightforward ordering,
> so that chacha-x86.pl behaves like the other implementations? (It's nice to
> avoid bugs that only trigger in one implementation.) Or is this order
> necessary for something?

It's the order that minimizes the number of memory references. But
double-check the attached patch.
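To see why the write order only matters once the buffers are offset, here is a
toy C model (not the real perlasm) that XORs a fake keystream into one 64-byte
block, varying only the order in which the 16 output words are written. With
out = in - 1, an output word written early overlaps the last byte of an input
word that the funny order has not yet read, so the result is corrupted; the
strictly ascending order never writes past what it has already read.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Word order used by the non-SSE3 chacha-x86.pl code path. */
static const int funny[16] = {0,4,8,9,12,14,1,2,3,5,6,7,10,11,13,15};

/* out word w = in word w XOR a fake keystream word; only the write order
 * of the 16 words varies between calls. */
static void xor_block(uint8_t *out, const uint8_t *in, const int order[16]) {
    for (int k = 0; k < 16; k++) {
        int w = order[k];
        uint32_t v, ks = 0x01010101u * (uint32_t)w;   /* fake keystream */
        memcpy(&v, in + 4*w, 4);                      /* read input word w  */
        v ^= ks;
        memcpy(out + 4*w, &v, 4);                     /* write output word w */
    }
}

int main(void) {
    static const int inorder[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
    uint8_t ref[64], buf[65], plain[64];
    for (int i = 0; i < 64; i++) plain[i] = (uint8_t)i;

    xor_block(ref, plain, inorder);          /* distinct buffers: ground truth */

    memcpy(buf + 1, plain, 64);              /* out = buf, in = buf + 1 */
    xor_block(buf, buf + 1, funny);
    printf("funny order, out = in - 1: %s\n",
           memcmp(ref, buf, 64) ? "CORRUPTED" : "ok");

    memcpy(buf + 1, plain, 64);
    xor_block(buf, buf + 1, inorder);
    printf("in order,    out = in - 1: %s\n",
           memcmp(ref, buf, 64) ? "CORRUPTED" : "ok");
    return 0;
}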
diff --git a/crypto/chacha/asm/chacha-x86.pl b/crypto/chacha/asm/chacha-x86.pl
index 850c917..986e7f7 100755
--- a/crypto/chacha/asm/chacha-x86.pl
+++ b/crypto/chacha/asm/chacha-x86.pl
@@ -19,13 +19,13 @@
 # P4            18.6/+84%
 # Core2         9.56/+89%       4.83
 # Westmere      9.50/+45%       3.35
-# Sandy Bridge  10.5/+47%       3.20
-# Haswell       8.15/+50%       2.83
-# Silvermont    17.4/+36%       8.35
+# Sandy Bridge  10.7/+47%       3.24
+# Haswell       8.22/+50%       2.89
+# Silvermont    17.8/+36%       8.53
 # Sledgehammer  10.2/+54%
-# Bulldozer     13.4/+50%       4.38(*)
+# Bulldozer     13.5/+50%       4.39(*)
 #
-# (*)   Bulldozer actually executes 4xXOP code path that delivers 3.55;
+# (*)   Bulldozer actually executes 4xXOP code path that delivers 3.50;
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
@@ -238,18 +238,20 @@ if ($xmm) {
        &xor    ($a, &DWP(4*0,$b));             # xor with input
        &xor    ($b_,&DWP(4*4,$b));
-       &mov    (&DWP(4*0,"esp"),$a);
+       &mov    (&DWP(4*0,"esp"),$a);           # off-load for later write
        &mov    ($a,&wparam(0));                # load output pointer
        &xor    ($c, &DWP(4*8,$b));
        &xor    ($c_,&DWP(4*9,$b));
        &xor    ($d, &DWP(4*12,$b));
        &xor    ($d_,&DWP(4*14,$b));
-       &mov    (&DWP(4*4,$a),$b_);             # write output
-       &mov    (&DWP(4*8,$a),$c);
-       &mov    (&DWP(4*9,$a),$c_);
-       &mov    (&DWP(4*12,$a),$d);
-       &mov    (&DWP(4*14,$a),$d_);
+       &mov    (&DWP(4*4,"esp"),$b_);
+       &mov    ($b_,&DWP(4*0,"esp"));
+       &mov    (&DWP(4*8,"esp"),$c);
+       &mov    (&DWP(4*9,"esp"),$c_);
+       &mov    (&DWP(4*12,"esp"),$d);
+       &mov    (&DWP(4*14,"esp"),$d_);
+       &mov    (&DWP(4*0,$a),$b_);             # write output in order
        &mov    ($b_,&DWP(4*1,"esp"));
        &mov    ($c, &DWP(4*2,"esp"));
        &mov    ($c_,&DWP(4*3,"esp"));
@@ -266,35 +268,45 @@ if ($xmm) {
        &xor    ($d, &DWP(4*5,$b));
        &xor    ($d_,&DWP(4*6,$b));
        &mov    (&DWP(4*1,$a),$b_);
+       &mov    ($b_,&DWP(4*4,"esp"));
        &mov    (&DWP(4*2,$a),$c);
        &mov    (&DWP(4*3,$a),$c_);
+       &mov    (&DWP(4*4,$a),$b_);
        &mov    (&DWP(4*5,$a),$d);
        &mov    (&DWP(4*6,$a),$d_);
-       &mov    ($b_,&DWP(4*7,"esp"));
-       &mov    ($c, &DWP(4*10,"esp"));
+       &mov    ($c,&DWP(4*7,"esp"));
+       &mov    ($d,&DWP(4*8,"esp"));
+       &mov    ($d_,&DWP(4*9,"esp"));
+       &add    ($c,&DWP(64+4*7,"esp"));
+       &mov    ($b_, &DWP(4*10,"esp"));
+       &xor    ($c,&DWP(4*7,$b));
        &mov    ($c_,&DWP(4*11,"esp"));
+       &mov    (&DWP(4*7,$a),$c);
+       &mov    (&DWP(4*8,$a),$d);
+       &mov    (&DWP(4*9,$a),$d_);
+
+       &add    ($b_, &DWP(64+4*10,"esp"));
+       &add    ($c_,&DWP(64+4*11,"esp"));
+       &xor    ($b_, &DWP(4*10,$b));
+       &xor    ($c_,&DWP(4*11,$b));
+       &mov    (&DWP(4*10,$a),$b_);
+       &mov    (&DWP(4*11,$a),$c_);
+
+       &mov    ($c,&DWP(4*12,"esp"));
+       &mov    ($c_,&DWP(4*14,"esp"));
        &mov    ($d, &DWP(4*13,"esp"));
        &mov    ($d_,&DWP(4*15,"esp"));
-       &add    ($b_,&DWP(64+4*7,"esp"));
-       &add    ($c, &DWP(64+4*10,"esp"));
-       &add    ($c_,&DWP(64+4*11,"esp"));
        &add    ($d, &DWP(64+4*13,"esp"));
        &add    ($d_,&DWP(64+4*15,"esp"));
-       &xor    ($b_,&DWP(4*7,$b));
-       &xor    ($c, &DWP(4*10,$b));
-       &xor    ($c_,&DWP(4*11,$b));
        &xor    ($d, &DWP(4*13,$b));
        &xor    ($d_,&DWP(4*15,$b));
        &lea    ($b,&DWP(4*16,$b));
-       &mov    (&DWP(4*7,$a),$b_);
-       &mov    ($b_,&DWP(4*0,"esp"));
-       &mov    (&DWP(4*10,$a),$c);
+       &mov    (&DWP(4*12,$a),$c);
        &mov    ($c,&wparam(2));                # len
-       &mov    (&DWP(4*11,$a),$c_);
        &mov    (&DWP(4*13,$a),$d);
+       &mov    (&DWP(4*14,$a),$c_);
        &mov    (&DWP(4*15,$a),$d_);
-       &mov    (&DWP(4*0,$a),$b_);
        &lea    ($a,&DWP(4*16,$a));
        &sub    ($c,64);
        &jnz    (&label("outer_loop"));
@@ -572,12 +584,12 @@ my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));  # previous
 my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
 
-       #&movdqa        ($xa0,&QWP(16*0-128,"ebx"));    # it's there
-       &movdqa         ($xa1,&QWP(16*1-128,"ebx"));
-       &movdqa         ($xa2,&QWP(16*2-128,"ebx"));
-       &movdqa         ($xa3,&QWP(16*3-128,"ebx"));
-
        for($i=0;$i<256;$i+=64) {
+           #&movdqa    ($xa0,&QWP($i+16*0-128,"ebx")); # it's there
+           &movdqa     ($xa1,&QWP($i+16*1-128,"ebx"));
+           &movdqa     ($xa2,&QWP($i+16*2-128,"ebx"));
+           &movdqa     ($xa3,&QWP($i+16*3-128,"ebx"));
+
            &paddd      ($xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material
            &paddd      ($xa1,&QWP($i+16*1-128,"ebp"));
            &paddd      ($xa2,&QWP($i+16*2-128,"ebp"));
@@ -598,25 +610,29 @@ my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));  # previous
            #($xa2,$xt2)=($xt2,$xa2);
-           &movdqu     ($xt0,&QWP(64*0-128,$inp));     # load input
-           &movdqu     ($xt1,&QWP(64*1-128,$inp));
-           &movdqu     ($xa2,&QWP(64*2-128,$inp));
-           &movdqu     ($xt3,&QWP(64*3-128,$inp));
-           &lea        ($inp,&QWP($i<192?16:(64*4-16*3),$inp));
-           &pxor       ($xt0,$xa0);
+           &movdqa     (&QWP($i+16*0-128,"ebx"),$xa0);
            &movdqa     ($xa0,&QWP($i+16*4-128,"ebx"))  if ($i<192);
-           &pxor       ($xt1,$xa1);
-           &movdqa     ($xa1,&QWP($i+16*5-128,"ebx"))  if ($i<192);
-           &pxor       ($xt2,$xa2);
-           &movdqa     ($xa2,&QWP($i+16*6-128,"ebx"))  if ($i<192);
-           &pxor       ($xt3,$xa3);
-           &movdqa     ($xa3,&QWP($i+16*7-128,"ebx"))  if ($i<192);
-           &movdqu     (&QWP(64*0-128,$out),$xt0);     # store output
-           &movdqu     (&QWP(64*1-128,$out),$xt1);
-           &movdqu     (&QWP(64*2-128,$out),$xt2);
-           &movdqu     (&QWP(64*3-128,$out),$xt3);
-           &lea        ($out,&QWP($i<192?16:(64*4-16*3),$out));
+           &movdqa     (&QWP($i+16*1-128,"ebx"),$xa1);
+           &movdqa     (&QWP($i+16*2-128,"ebx"),$xt2);
+           &movdqa     (&QWP($i+16*3-128,"ebx"),$xa3);
+       }
+       for($i=0;$i<256;$i+=64) {
+           my $j = 16*($i/64);
+           &movdqu     ($xa0,&QWP($i+16*0-128,$inp));  # load input
+           &movdqu     ($xa1,&QWP($i+16*1-128,$inp));
+           &movdqu     ($xa2,&QWP($i+16*2-128,$inp));
+           &movdqu     ($xa3,&QWP($i+16*3-128,$inp));
+           &pxor       ($xa0,&QWP($j+64*0-128,"ebx"));
+           &pxor       ($xa1,&QWP($j+64*1-128,"ebx"));
+           &pxor       ($xa2,&QWP($j+64*2-128,"ebx"));
+           &pxor       ($xa3,&QWP($j+64*3-128,"ebx"));
+           &movdqu     (&QWP($i+16*0-128,$out),$xa0);  # write output
+           &movdqu     (&QWP($i+16*1-128,$out),$xa1);
+           &movdqu     (&QWP($i+16*2-128,$out),$xa2);
+           &movdqu     (&QWP($i+16*3-128,$out),$xa3);
        }
+       &lea    ($inp,&DWP(256,$inp));
+       &lea    ($out,&DWP(256,$out));
        &sub    ($len,64*4);
        &jnc    (&label("outer_loop"));
@@ -967,12 +983,12 @@ my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));  # previous
 my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
 
-       #&vmovdqa       ($xa0,&QWP(16*0-128,"ebx"));    # it's there
-       &vmovdqa        ($xa1,&QWP(16*1-128,"ebx"));
-       &vmovdqa        ($xa2,&QWP(16*2-128,"ebx"));
-       &vmovdqa        ($xa3,&QWP(16*3-128,"ebx"));
-
        for($i=0;$i<256;$i+=64) {
+           #&vmovdqa   ($xa0,&QWP($i+16*0-128,"ebx")); # it's there
+           &vmovdqa    ($xa1,&QWP($i+16*1-128,"ebx"));
+           &vmovdqa    ($xa2,&QWP($i+16*2-128,"ebx"));
+           &vmovdqa    ($xa3,&QWP($i+16*3-128,"ebx"));
+
            &vpaddd     ($xa0,$xa0,&QWP($i+16*0-128,"ebp"));   # accumulate key material
            &vpaddd     ($xa1,$xa1,&QWP($i+16*1-128,"ebp"));
            &vpaddd     ($xa2,$xa2,&QWP($i+16*2-128,"ebp"));
@@ -987,21 +1003,33 @@ my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));  # previous
            &vpunpcklqdq ($xt3,$xa0,$xa2);              # "a2"
            &vpunpckhqdq ($xa3,$xa0,$xa2);              # "a3"
-           &vpxor      ($xt0,$xa1,&QWP(64*0-128,$inp));
-           &vpxor      ($xt1,$xt2,&QWP(64*1-128,$inp));
-           &vpxor      ($xt2,$xt3,&QWP(64*2-128,$inp));
-           &vpxor      ($xt3,$xa3,&QWP(64*3-128,$inp));
-           &lea        ($inp,&QWP($i<192?16:(64*4-16*3),$inp));
-           &vmovdqa    ($xa0,&QWP($i+16*4-128,"ebx"))  if ($i<192);
-           &vmovdqa    ($xa1,&QWP($i+16*5-128,"ebx"))  if ($i<192);
-           &vmovdqa    ($xa2,&QWP($i+16*6-128,"ebx"))  if ($i<192);
-           &vmovdqa    ($xa3,&QWP($i+16*7-128,"ebx"))  if ($i<192);
-           &vmovdqu    (&QWP(64*0-128,$out),$xt0);     # store output
-           &vmovdqu    (&QWP(64*1-128,$out),$xt1);
-           &vmovdqu    (&QWP(64*2-128,$out),$xt2);
-           &vmovdqu    (&QWP(64*3-128,$out),$xt3);
-           &lea        ($out,&QWP($i<192?16:(64*4-16*3),$out));
+           &vmovdqa    ($xa0,&QWP($i+16*4-128,"ebx"))  if ($i<192);
+           &vmovdqa    (&QWP($i+16*0-128,"ebx"),$xa1);
+           &vmovdqa    (&QWP($i+16*1-128,"ebx"),$xt2);
+           &vmovdqa    (&QWP($i+16*2-128,"ebx"),$xt3);
+           &vmovdqa    (&QWP($i+16*3-128,"ebx"),$xa3);
+       }
+       &vmovdqu        ($xa0,&QWP(16*0-128,$inp));     # load input
+       &vmovdqu        ($xa1,&QWP(16*1-128,$inp));
+       &vmovdqu        ($xa2,&QWP(16*2-128,$inp));
+       &vmovdqu        ($xa3,&QWP(16*3-128,$inp));
+       for($i=0;$i<256;$i+=64) {
+           my $j = 16*($i/64);
+           &vpxor      ($xt0,$xa0,&QWP($j+64*0-128,"ebx"));
+           &vmovdqu    ($xa0,&QWP($i+16*4-128,$inp))   if ($i<192);
+           &vpxor      ($xt1,$xa1,&QWP($j+64*1-128,"ebx"));
+           &vmovdqu    ($xa1,&QWP($i+16*5-128,$inp))   if ($i<192);
+           &vpxor      ($xt2,$xa2,&QWP($j+64*2-128,"ebx"));
+           &vmovdqu    ($xa2,&QWP($i+16*6-128,$inp))   if ($i<192);
+           &vpxor      ($xt3,$xa3,&QWP($j+64*3-128,"ebx"));
+           &vmovdqu    ($xa3,&QWP($i+16*7-128,$inp))   if ($i<192);
+           &vmovdqu    (&QWP($i+16*0-128,$out),$xt0);  # write output
+           &vmovdqu    (&QWP($i+16*1-128,$out),$xt1);
+           &vmovdqu    (&QWP($i+16*2-128,$out),$xt2);
+           &vmovdqu    (&QWP($i+16*3-128,$out),$xt3);
        }
+       &lea    ($inp,&DWP(256,$inp));
+       &lea    ($out,&DWP(256,$out));
        &sub    ($len,64*4);
        &jnc    (&label("outer_loop"));
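The shape of the fix, expressed in C rather than perlasm, is roughly the
following: finish the keystream for a block first (the patch parks it on the
stack or in the "ebx" scratch area), then read the input and write the output
strictly front to back, which is safe whenever out <= in. chacha_block() here
is a hypothetical stand-in for the real block function, not an actual OpenSSL
symbol.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical: produces 64 bytes of keystream for the given counter. */
void chacha_block(uint8_t ks[64], const uint8_t key[32],
                  const uint8_t nonce[12], uint32_t counter);

static void xor_in_order(uint8_t *out, const uint8_t *in, size_t len,
                         const uint8_t key[32], const uint8_t nonce[12],
                         uint32_t counter) {
    uint8_t ks[64];
    while (len > 0) {
        size_t todo = len < 64 ? len : 64;
        chacha_block(ks, key, nonce, counter++);   /* keystream first */
        for (size_t i = 0; i < todo; i++)
            out[i] = in[i] ^ ks[i];                /* strictly ascending */
        in += todo; out += todo; len -= todo;
    }
}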