On Thu, 23 Dec 2004, Andy Polyakov wrote:
> aes-586.pl module is committed to CVS now [see
> http://cvs.openssl.org/rlog?f=openssl/crypto/aes/asm/aes-586.pl]. Take
> "Special note about instruction choice" in commentary section for
> consideration even for AMD64. Merry Christmas to everybody:-) A.
hmmm... i seem to have done better by switching back to scaling :)
with the patch below i'm getting the following throughput improvements for
aes-128-cbc 8192B buffer:
patch delta
p4-2 + 3.8%
p4-3 +11%
p-m + 8.8%
k8 +12%
efficeon + 4.3%
the code is 229 bytes smaller in $small_footprint=1 ... i didn't look to
see how much smaller it is for the fully unrolled variety (i would assume
1145 bytes or so). unfortunately this space improvement is hidden by the
alignment pain caused by the placement of AES_Te and AES_Td :) i suggest
moving both those tables to the top of the module so that their 64 byte
alignment is taken care of once only.
here's an updated comparison versus the gladman code -- this is in
cycles/byte for 8192 byte buffer (smaller is better):
            openssl w/patch
            small   large   gladman
p4-2        31.7    26.1    27.3
p4-3        32.3    32.9    18.7
p-m         23.8    23.3    16.9
k8          21.8    21.5    18.1
efficeon    25.1    22.6    17.8
damn the p4 is a weird beast -- notice how the gladman code is better
everywhere except p4-2 ... and p4-3 gladman is nearly twice as good as the
openssl code. i'm a bit disappointed with efficeon but i know what the
problems are (efficeon lacks native bswap, so your "1%" estimation on the
bswaps is more painful for efficeon, and the loop could be rotated
differently). fixing that is a more significant effort -- so i figured
i'd checkpoint by sending you a patch now.
i made the following changes:
o shifts by 24 don't need to be followed by and $0xff:
shr $22,%esi                 -->  shr $24,%esi
and $0x3fc,%esi                   mov offs(%ebp,%esi,4),%esi
mov offs(%ebp,%esi,1),%esi
o movzbl is 3 bytes shorter than and $imm:
shr $14,%eax --> shr $16,%eax
and $0x3fc,%eax movzbl %al,%eax
mov offs(%ebp,%eax,1),%eax mov offs(%ebp,%eax,4),%eax
there's no perf degradation by making this change (in fact it
improves on p4-2, p-m and efficeon). for consistency i made
the same change to all the "and $0x3fc,%edi" ... unfortunately
movzbl isn't an option there, but there was no negative perf
impact anywhere with the change (and efficeon internally uses
movzbl for this case so i'm slightly biased).
o movzbl is 3 bytes shorter than and $imm (part 2):
movl mem,%reg --> movzbl mem,%reg
and $0xff,%reg
this occurs several times loading from tables -- it's a space
win and a perf win everywhere.
o used a "gladman trick" on %edx in encode because it was easy
enough -- during encode we finish with the low half of %edx
before we need the high half, so after finishing with the low
half i inserted a "shr $16,%edx" which lets us use movzbl %dl/%dh
to get the top two bytes (similarly for %ecx during decode).
i think i gave up a bit on p4-2 with this step, but i figured
it was worth it because it helped everywhere else and p4-2 has
been superseded by p4-3. plus this transform saves code space.
it's not easy to transform the other 3 registers in this way
without major surgery around loop edges ... which will have to
wait for another rainy day.
-dean
SUBMISSION TYPE: TSU
SUBMITTED BY: dean gaudet
SUBMITTED FOR: dean gaudet
POINT OF CONTACT: [EMAIL PROTECTED]
PHONE and/or FAX: (408) 919-3086
MANUFACTURER: openssl
PRODUCT NAME/MODEL #: 0.9.8-dev
ECCN: 5D002
Index: crypto/aes/asm/aes-586.pl
===================================================================
RCS file: /home/dean/openssl/Repository/openssl/crypto/aes/asm/aes-586.pl,v
retrieving revision 1.1
diff -u -r1.1 aes-586.pl
--- crypto/aes/asm/aes-586.pl 23 Dec 2004 21:32:34 -0000 1.1
+++ crypto/aes/asm/aes-586.pl 28 Dec 2004 02:31:39 -0000
@@ -4,6 +4,8 @@
# Written by Andy Polyakov <[EMAIL PROTECTED]> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
+#
+# Additional contributions by dean gaudet <[EMAIL PROTECTED]>.
# ====================================================================
#
# You might fail to appreciate this module performance from the first
@@ -15,15 +17,6 @@
# more than *twice* as fast! Yes, all this buzz about PIC means that
# [unlike other implementations] this module was explicitly designed
# to be safe to use even in shared library context...
-#
-# Special note about instruction choice. Do you recall RC4_INT code
-# performing poorly on P4? It might be the time to figure out why.
-# RC4_INT code implies effective address calculations in base+offset*4
-# form. Trouble is that it seems that offset scaling turned to be
-# critical path... At least eliminating scaling resulted in 2.8x RC4
-# performance improvement [as you might recall]. As AES code is hungry
-# for scaling too, I [try to] avoid the latter by favoring off-by-2
-# shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";
@@ -41,26 +34,30 @@
{ my ($i,$te,@s) = @_;
my $tmp,$out;
- if ($i==3) { $out=$s[0]; &mov ("edi",&DWP(12,"esp"));}
- else { $out="esi"; &mov ($out,$s[0]); }
- &shr ($out,24-2);
- &and ($out,0xFF<<2);
- &mov ($out,&DWP(1024*0,$te,$out));
-
- if ($i==3) { $tmp=$s[1]; }
- else { $tmp="edi"; &mov ($tmp,$s[1]); }
- &shr ($tmp,16-2);
- &and ($tmp,0xFF<<2);
- &xor ($out,&DWP(1024*1,$te,$tmp));
+ if ($i==3) { $out=$s[0]; &movz($out,&HB($s[0]));
+ &mov ("edi",&DWP(12,"esp")); }
+ else { $out="esi"; &mov ($out,$s[0]);
+ &shr ($out,24); }
+ &mov ($out,&DWP(1024*0,$te,$out,4));
+
+ if ($i==2) { $tmp="edi"; &movz ($tmp,&LB($s[1])); }
+ elsif ($i==3){ $tmp=$s[1];
+ &shr ($tmp,16);
+ &movz ($tmp,&LB($tmp)); }
+ else { $tmp="edi"; &mov ($tmp,$s[1]);
+ &shr ($tmp,16);
+ &and ($tmp,0xFF); }
+ &xor ($out,&DWP(1024*1,$te,$tmp,4));
if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(0,"esp")); }
else { $tmp="edi"; }
&movz ($tmp,&HB($s[2]));
+ if ($i==1) { &shr ($s[2],16); }
&xor ($out,&DWP(1024*2,$te,$tmp,4));
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); }
- else { $tmp="edi"; &mov ($tmp,$s[3]); }
- &and ($tmp,0xFF);
+ if ($i==3) { $tmp=$s[3]; &movz ($tmp,&LB($s[3]));
+ &mov ($s[2],&DWP(4,"esp")); }
+ else { $tmp="edi"; &movz ($tmp,&LB($s[3])); }
&xor ($out,&DWP(1024*3,$te,$tmp,4));
if ($i<2) { &mov (&DWP(4*$i,"esp"),$out); }
if ($i==3) { &mov ($s[3],"esi"); }
@@ -70,33 +67,36 @@
{ my ($i,$te,@s)=@_;
my $tmp,$out;
- if ($i==3) { $out=$s[0]; &mov ("edi",&DWP(12,"esp"));}
- else { $out="esi"; &mov ($out,$s[0]); }
- &shr ($out,24-2);
- &and ($out,0xFF<<2);
- &mov ($out,&DWP(0,$te,$out));
+ if ($i==3) { $out=$s[0]; &movz ($out,&HB($s[0]));
+ &mov ("edi",&DWP(12,"esp")); }
+ else { $out="esi"; &mov ($out,$s[0]);
+ &shr ($out,24); }
+ &mov ($out,&DWP(0,$te,$out,4));
&and ($out,0xff000000);
- if ($i==3) { $tmp=$s[1]; }
- else { $tmp="edi"; &mov ($tmp,$s[1]); }
- &shr ($tmp,16-2);
- &and ($tmp,0xFF<<2);
- &mov ($tmp,&DWP(0,$te,$tmp));
+ if ($i==2) { $tmp="edi"; &movz ($tmp,&LB($s[1])); }
+ elsif ($i==3){ $tmp=$s[1];
+ &shr ($tmp,16);
+ &movz ($tmp,&LB($tmp)); }
+ else { $tmp="edi"; &mov ($tmp,$s[1]);
+ &shr ($tmp,16);
+ &and ($tmp,0xFF); }
+ &mov ($tmp,&DWP(0,$te,$tmp,4));
&and ($tmp,0x00ff0000);
&xor ($out,$tmp);
if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(0,"esp")); }
else { $tmp="edi"; }
&movz ($tmp,&HB($s[2]));
+ if ($i==1) { &shr ($s[2],16); }
&mov ($tmp,&DWP(0,$te,$tmp,4));
&and ($tmp,0x0000ff00);
&xor ($out,$tmp);
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); }
- else { $tmp="edi"; &mov ($tmp,$s[3]); }
- &and ($tmp,0xFF);
- &mov ($tmp,&DWP(0,$te,$tmp,4));
- &and ($tmp,0x000000ff);
+ if ($i==3) { $tmp=$s[3]; &movz ($tmp,&LB($s[3]));
+ &mov ($s[2],&DWP(4,"esp")); }
+ else { $tmp="edi"; &movz ($tmp,&LB($s[3])); }
+ &movz ($tmp,&BP(0,$te,$tmp,4));
&xor ($out,$tmp);
if ($i<2) { &mov (&DWP(4*$i,"esp"),$out); }
if ($i==3) { &mov ($s[3],"esi"); }
@@ -565,26 +565,29 @@
{ my ($i,$td,@s) = @_;
my $tmp,$out;
- if ($i==3) { $out=$s[0]; &mov ("edi",&DWP(12,"esp"));}
- else { $out="esi"; &mov ($out,$s[0]); }
- &shr ($out,24-2);
- &and ($out,0xFF<<2);
- &mov ($out,&DWP(1024*0,$td,$out));
-
- if ($i==3) { $tmp=$s[1]; }
- else { $tmp="edi"; &mov ($tmp,$s[1]); }
- &shr ($tmp,16-2);
- &and ($tmp,0xFF<<2);
- &xor ($out,&DWP(1024*1,$td,$tmp));
+ if ($i==2) { $out="esi"; &movz ($out,&HB($s[0])); }
+ else {
+ if ($i==3){ $out=$s[0]; &mov ("edi",&DWP(12,"esp"));}
+ else { $out="esi"; &mov ($out,$s[0]); }
+ &shr ($out,24);
+ }
+ &mov ($out,&DWP(1024*0,$td,$out,4));
+
+ if ($i==3) { $tmp=$s[1]; &movz ($tmp,&LB($s[1])); }
+ else { $tmp="edi"; &mov ($tmp,$s[1]);
+ &shr ($tmp,16);
+ &and ($tmp,0xFF); }
+ &xor ($out,&DWP(1024*1,$td,$tmp,4));
if ($i==3) { $tmp=$s[2]; &mov ($s[1],"esi"); }
else { $tmp="edi"; }
&movz ($tmp,&HB($s[2]));
&xor ($out,&DWP(1024*2,$td,$tmp,4));
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); }
- else { $tmp="edi"; &mov ($tmp,$s[3]); }
- &and ($tmp,0xFF);
+ if ($i==3) { $tmp=$s[3]; &movz ($tmp,&LB($s[3]));
+ &mov ($s[2],&DWP(4,"esp")); }
+ else { $tmp="edi"; &movz ($tmp,&LB($s[3])); }
+ if ($i==1) { &shr ($s[3],16); }
&xor ($out,&DWP(1024*3,$td,$tmp,4));
if ($i<2) { &mov (&DWP(4*$i,"esp"),$out); }
if ($i==3) { &mov ($s[3],&DWP(0,"esp")); }
@@ -594,18 +597,20 @@
{ my ($i,$td,@s)=@_;
my $tmp,$out;
- if ($i==3) { $out=$s[0]; &mov ("edi",&DWP(12,"esp"));}
- else { $out="esi"; &mov ($out,$s[0]); }
- &shr ($out,24-2);
- &and ($out,0xFF<<2);
- &mov ($out,&DWP(0,$td,$out));
+ if ($i==2) { $out="esi"; &movz ($out,&HB($s[0])); }
+ else {
+ if ($i==3){ $out=$s[0]; &mov ("edi",&DWP(12,"esp"));}
+ else { $out="esi"; &mov ($out,$s[0]); }
+ &shr ($out,24);
+ }
+ &mov ($out,&DWP(0,$td,$out,4));
&and ($out,0xff000000);
- if ($i==3) { $tmp=$s[1]; }
- else { $tmp="edi"; &mov ($tmp,$s[1]); }
- &shr ($tmp,16-2);
- &and ($tmp,0xFF<<2);
- &mov ($tmp,&DWP(0,$td,$tmp));
+ if ($i==3) { $tmp=$s[1]; &movz ($tmp,&LB($s[1])); }
+ else { $tmp="edi"; &mov ($tmp,$s[1]);
+ &shr ($tmp,16);
+ &and ($tmp,0xFF); }
+ &mov ($tmp,&DWP(0,$td,$tmp,4));
&and ($tmp,0x00ff0000);
&xor ($out,$tmp);
@@ -616,11 +621,11 @@
&and ($tmp,0x0000ff00);
&xor ($out,$tmp);
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); }
- else { $tmp="edi"; &mov ($tmp,$s[3]); }
- &and ($tmp,0xFF);
- &mov ($tmp,&DWP(0,$td,$tmp,4));
- &and ($tmp,0x000000ff);
+ if ($i==3) { $tmp=$s[3]; &movz ($tmp,&LB($s[3]));
+ &mov ($s[2],&DWP(4,"esp")); }
+ else { $tmp="edi"; &movz ($tmp,&LB($s[3])); }
+ if ($i==1) { &shr ($s[3],16); }
+ &movz ($tmp,&BP(0,$td,$tmp,4));
&xor ($out,$tmp);
if ($i<2) { &mov (&DWP(4*$i,"esp"),$out); }
if ($i==3) { &mov ($s[3],&DWP(0,"esp")); }
______________________________________________________________________
OpenSSL Project http://www.openssl.org
Development Mailing List [email protected]
Automated List Manager [EMAIL PROTECTED]