This patch provides a faster implementation of bn_mul_add_words for the x86 assembly code.
For example, RSA 1024-bit signing throughput (sign/s) improves by about 4.7%.
The patch is based on code written by Sean Stanek.
Output from "openssl speed rsa" on an 800 MHz AMD Athlon:
Before patch:
sign verify sign/s verify/s
rsa 512 bits 0.0017s 0.0001s 594.6 7578.4
rsa 1024 bits 0.0076s 0.0004s 131.1 2731.0
rsa 2048 bits 0.0427s 0.0012s 23.4 843.2
rsa 4096 bits 0.2759s 0.0041s 3.6 242.2
After patch:
sign verify sign/s verify/s
rsa 512 bits 0.0016s 0.0001s 615.5 7936.6
rsa 1024 bits 0.0073s 0.0003s 137.3 2891.8
rsa 2048 bits 0.0405s 0.0011s 24.7 893.0
rsa 4096 bits 0.2590s 0.0039s 3.9 257.6
Index: crypto/bn/asm/bn-586.pl
===================================================================
RCS file: /home/paul/openssl/rsync/cvs/openssl/crypto/bn/asm/bn-586.pl,v
retrieving revision 1.5
diff -u -r1.5 bn-586.pl
--- crypto/bn/asm/bn-586.pl 2000/12/06 16:30:23 1.5
+++ crypto/bn/asm/bn-586.pl 2001/08/24 21:01:45
@@ -22,78 +22,111 @@
&function_begin($name,"");
&comment("");
- $Low="eax";
- $High="edx";
- $a="ebx";
- $w="ebp";
- $r="edi";
- $c="esi";
-
- &xor($c,$c); # clear carry
- &mov($r,&wparam(0)); #
-
- &mov("ecx",&wparam(2)); #
- &mov($a,&wparam(1)); #
-
- &and("ecx",0xfffffff8); # num / 8
- &mov($w,&wparam(3)); #
-
- &push("ecx"); # Up the stack for a tmp variable
-
- &jz(&label("maw_finish"));
-
- &set_label("maw_loop",0);
-
- &mov(&swtmp(0),"ecx"); #
-
- for ($i=0; $i<32; $i+=4)
- {
- &comment("Round $i");
-
- &mov("eax",&DWP($i,$a,"",0)); # *a
- &mul($w); # *a * w
- &add("eax",$c); # L(t)+= *r
- &mov($c,&DWP($i,$r,"",0)); # L(t)+= *r
- &adc("edx",0); # H(t)+=carry
- &add("eax",$c); # L(t)+=c
- &adc("edx",0); # H(t)+=carry
- &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
- &mov($c,"edx"); # c= H(t);
- }
-
- &comment("");
- &mov("ecx",&swtmp(0)); #
- &add($a,32);
- &add($r,32);
- &sub("ecx",8);
- &jnz(&label("maw_loop"));
-
- &set_label("maw_finish",0);
- &mov("ecx",&wparam(2)); # get num
- &and("ecx",7);
- &jnz(&label("maw_finish2")); # helps branch prediction
- &jmp(&label("maw_end"));
-
- &set_label("maw_finish2",1);
- for ($i=0; $i<7; $i++)
- {
- &comment("Tail Round $i");
- &mov("eax",&DWP($i*4,$a,"",0));# *a
- &mul($w); # *a * w
- &add("eax",$c); # L(t)+=c
- &mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r
- &adc("edx",0); # H(t)+=carry
- &add("eax",$c);
- &adc("edx",0); # H(t)+=carry
- &dec("ecx") if ($i != 7-1);
- &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
- &mov($c,"edx"); # c= H(t);
- &jz(&label("maw_end")) if ($i != 7-1);
- }
- &set_label("maw_end",0);
- &mov("eax",$c);
+ $r = 0;
+ $a = 1;
+ $num = 2;
+ $w = 3;
+
+ &mov("ebx",&wparam($r)); # r
+ &mov("ebp",&wparam($a)); # a
+ &mov("ecx",&wparam($num)); # num
+
+ &shr("ecx",3); # num / 8
+ &mov("edi",0); # clear carry
+ &jz(&label("bn_mul_add_words_part"));
+
+ &set_label("bn_mul_add_words_full_loop",0);
+ &comment("8 dwords per loop (2 dwords per round * 4 rounds per loop)");
+
+ &push("ecx");
+ &mov("ecx",&wparam($w)); # w
+
+ for ($k=0; $k<4; $k++)
+ {
+ &comment("Round $k");
+ $i = $k * 8;
+ &mov("eax",&DWP($i,"ebp","",0));
+ &mov("esi",&DWP($i+4,"ebp","",0));
+ &mul("ecx");
+ &add("eax","edi");
+ &mov("edi","edx");
+ &adc("edi",0);
+ &add(&DWP($i,"ebx","",0),"eax");
+ &mov("eax","esi");
+ &adc("edi",0);
+ &mul("ecx");
+ &add("eax","edi");
+ &mov("edi","edx");
+ &adc("edi",0);
+ &add(&DWP($i+4,"ebx","",0),"eax");
+ &adc("edi",0);
+ }
+
+ &pop("ecx");
+ &add("ebp",32);
+ &add("ebx",32);
+ &dec("ecx");
+ &jnz(&label("bn_mul_add_words_full_loop"));
+
+ &set_label("bn_mul_add_words_part",0);
+ &mov("esi",&wparam($num)); # num
+ &and("esi",7);
+ &mov("ecx",&wparam($w)); # w
+
+ &comment("do 0..7 more dwords");
+ &jmp(&DWP(&label("bn_mul_add_words_array"),"","esi",4));
+
+ &set_label("bn_mul_add_words_array",1);
+ &comment("array for quick jump for last 0..7 dwords");
+ &data_word(&label("bn_mul_add_words_end"));
+ for ($i=1; $i<=7; $i++)
+ {
+ &data_word(&label("bn_mul_add_words_part_$i"));
+ }
+
+ for ($j=7; $j>=1; $j--)
+ {
+ &set_label("bn_mul_add_words_part_$j",1);
+ for ($k=0; $k<($j>>1); $k++)
+ {
+ &comment("Part $j, Round $k");
+ $i = $k * 8;
+ &mov("eax",&DWP($i,"ebp","",0));
+ &mov("esi",&DWP($i+4,"ebp","",0));
+ &mul("ecx");
+ &add("eax","edi");
+ &mov("edi","edx");
+ &adc("edi",0);
+ &add(&DWP($i,"ebx","",0),"eax");
+ &mov("eax","esi");
+ &adc("edi",0);
+ &mul("ecx");
+ &add("eax","edi");
+ &mov("edi","edx");
+ &adc("edi",0);
+ &add(&DWP($i+4,"ebx","",0),"eax");
+ &adc("edi",0);
+ }
+
+ if (($j % 2) == 1)
+ {
+ &comment("Part $j, one more dword");
+ $i = $k * 8;
+ &mov("eax",&DWP($i,"ebp","",0));
+ &mul("ecx");
+ &add("eax","edi");
+ &mov("edi","edx");
+ &adc("edi",0);
+ &add(&DWP($i,"ebx","",0),"eax");
+ &adc("edi",0);
+ }
- &pop("ecx"); # clear variable from
+ &jmp(&label("bn_mul_add_words_end")) if ($j != 1);
+ }
+
+ &set_label("bn_mul_add_words_end",0);
+
+ &mov("eax","edi");
&function_end($name);
}
______________________________________________________________________
OpenSSL Project http://www.openssl.org
Development Mailing List [EMAIL PROTECTED]
Automated List Manager [EMAIL PROTECTED]