Improve sha1sum speed

Loïc Le Loarer Mon, 05 Sep 2011 15:32:21 -0700

Hi,
I saw in the todo list for coreutils that improving sha1sum speed was a
target. So I worked a bit on that.


By using the ideas from the sha1.c file from git sources (
http://git.kernel.org/?p=git/git.git;a=tree;f=block-sha1;hb=pu) which is
clearly faster than the one in gnulib, I have been able to aligned the speed
of Linus' git.

Here is what I did:
First create a test file:
$ dd if=/dev/zero of=big_zero count=1 seek=$(( 1000 * 1000000 - 1 )) bs=1
Test sha1sum on it:
$ \time sha1sum < big_zero
1dd775261d7abab0b66910acc1d827a2c3799eaf  -
3.49user 0.12system 0:03.61elapsed 99%CPU (0avgtext+0avgdata
2512maxresident)k
0inputs+0outputs (0major+202minor)pagefaults 0swaps
And Linus' sha1:
$ \time ../git-1.7.1/test-sha1 < big_zero
1dd775261d7abab0b66910acc1d827a2c3799eaf
2.54user 0.13system 0:02.70elapsed 98%CPU (0avgtext+0avgdata
2352maxresident)k
976inputs+0outputs (2major+191minor)pagefaults 0swaps

Here is my system:
$ gcc -v
Using built-in specs.
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu/Linaro
4.4.4-14ubuntu5' --with-bugurl=file:///usr/share/doc/gcc-4.4/README.Bugs
--enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr
--program-suffix=-4.4 --enable-shared --enable-multiarch
--enable-linker-build-id --with-system-zlib --libexecdir=/usr/lib
--without-included-gettext --enable-threads=posix
--with-gxx-include-dir=/usr/include/c++/4.4 --libdir=/usr/lib --enable-nls
--with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug
--enable-objc-gc --disable-werror --with-arch-32=i686 --with-tune=generic
--enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu
--target=x86_64-linux-gnu
Thread model: posix
gcc version 4.4.5 (Ubuntu/Linaro 4.4.4-14ubuntu5)

$ cat /proc/cpuinfo
processor    : 0
vendor_id    : GenuineIntel
cpu family    : 6
model        : 42
model name    : Genuine Intel(R) CPU 0 @ 2.50GHz
stepping    : 5
cpu MHz        : 800.000
cache size    : 3072 KB
physical id    : 0
siblings    : 4
core id        : 0
cpu cores    : 2
apicid        : 0
initial apicid    : 0
fpu        : yes
fpu_exception    : yes
cpuid level    : 13
wp        : yes
flags        : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov
pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
lm constant_tsc arch_perfmon pebs bts rep_good xtopology nonstop_tsc
aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
xtpr pdcm sse4_1 sse4_2 x2apic popcnt aes xsave avx lahf_lm ida arat pln pts
dts tpr_shadow vnmi flexpriority ept vpid
bogomips    : 4988.10
clflush size    : 64
cache_alignment    : 64
address sizes    : 36 bits physical, 48 bits virtual
power management:

You can see that it is working in 64 mode.

Checkout coreutils git lastest version and compile, here is original speed:
$ \time ./src/sha1sum < big_zero
1dd775261d7abab0b66910acc1d827a2c3799eaf  -
3.71user 0.20system 0:03.92elapsed 99%CPU (0avgtext+0avgdata
2528maxresident)k
0inputs+0outputs (0major+195minor)pagefaults 0swaps

Using -O3 to compile: (make CFLAGS="-g -O3") instead of default -O2:
$ \time ./src/sha1sum < big_zero
1dd775261d7abab0b66910acc1d827a2c3799eaf  -
3.47user 0.14system 0:03.62elapsed 99%CPU (0avgtext+0avgdata
2512maxresident)k
0inputs+0outputs (0major+195minor)pagefaults 0swaps

Just changing SWAP to ntohl does have a significant effect on my platform
(the bswap instruction is used). According to Linus comment, this should be
done only in platform having fast unalinged load:
$ diff -uN lib/sha1.c.orig lib/sha1.c
--- lib/sha1.c.orig    2011-09-05 23:31:23.128552012 +0200
+++ lib/sha1.c    2011-09-05 23:29:58.988552016 +0200
@@ -29,6 +29,7 @@
 #include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
+#include <arpa/inet.h>

 #if USE_UNLOCKED_IO
 # include "unlocked-io.h"
@@ -333,7 +334,7 @@
       int t;
       for (t = 0; t < 16; t++)
         {
-          x[t] = SWAP (*words);
+          x[t] = ntohl(*words);
           words++;
         }

$ \time ./src/sha1sum.v1 < big_zero
1dd775261d7abab0b66910acc1d827a2c3799eaf  -
3.28user 0.10system 0:03.39elapsed 99%CPU (0avgtext+0avgdata
2512maxresident)k
0inputs+0outputs (0major+194minor)pagefaults 0swaps

Then add a second patch to avoid the (,) construction and reorder the
assignement to x[] array in the main define:

--- lib/sha1.c.orig    2011-09-05 23:45:19.988552016 +0200
+++ lib/sha1.c    2011-09-05 23:47:57.188552016 +0200
@@ -317,112 +317,107 @@

 #define rol(x, n) (((x) << (n)) | ((uint32_t) (x) >> (32 - (n))))

-#define M(I) ( tm =   x[I&0x0f] ^ x[(I-14)&0x0f] \
-                    ^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f] \
-               , (x[I&0x0f] = rol(tm, 1)) )
+#define X(I) ntohl(words[I])
+#define M(I) rol(  x[I&0x0f] ^ x[(I-14)&0x0f] \
+                 ^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f], 1) \

-#define R(A,B,C,D,E,F,K,M)  do { E += rol( A, 5 )     \
+#define R(A,B,C,D,E,F,K,M,I) do { unsigned int TEMP = M(I); x[I&0x0f] =
TEMP;\
+                 E += rol( A, 5 )     \
                                       + F( B, C, D )  \
                                       + K             \
-                                      + M;            \
+                                      + TEMP;         \
                                  B = rol( B, 30 );    \
                                } while(0)

+
   while (words < endp)
     {
-      uint32_t tm;
-      int t;
-      for (t = 0; t < 16; t++)
-        {
-          x[t] = ntohl(*words);
-          words++;
-        }
-
-      R( a, b, c, d, e, F1, K1, x[ 0] );
-      R( e, a, b, c, d, F1, K1, x[ 1] );
-      R( d, e, a, b, c, F1, K1, x[ 2] );
-      R( c, d, e, a, b, F1, K1, x[ 3] );
-      R( b, c, d, e, a, F1, K1, x[ 4] );
-      R( a, b, c, d, e, F1, K1, x[ 5] );
-      R( e, a, b, c, d, F1, K1, x[ 6] );
-      R( d, e, a, b, c, F1, K1, x[ 7] );
-      R( c, d, e, a, b, F1, K1, x[ 8] );
-      R( b, c, d, e, a, F1, K1, x[ 9] );
-      R( a, b, c, d, e, F1, K1, x[10] );
-      R( e, a, b, c, d, F1, K1, x[11] );
-      R( d, e, a, b, c, F1, K1, x[12] );
-      R( c, d, e, a, b, F1, K1, x[13] );
-      R( b, c, d, e, a, F1, K1, x[14] );
-      R( a, b, c, d, e, F1, K1, x[15] );
-      R( e, a, b, c, d, F1, K1, M(16) );
-      R( d, e, a, b, c, F1, K1, M(17) );
-      R( c, d, e, a, b, F1, K1, M(18) );
-      R( b, c, d, e, a, F1, K1, M(19) );
-      R( a, b, c, d, e, F2, K2, M(20) );
-      R( e, a, b, c, d, F2, K2, M(21) );
-      R( d, e, a, b, c, F2, K2, M(22) );
-      R( c, d, e, a, b, F2, K2, M(23) );
-      R( b, c, d, e, a, F2, K2, M(24) );
-      R( a, b, c, d, e, F2, K2, M(25) );
-      R( e, a, b, c, d, F2, K2, M(26) );
-      R( d, e, a, b, c, F2, K2, M(27) );
-      R( c, d, e, a, b, F2, K2, M(28) );
-      R( b, c, d, e, a, F2, K2, M(29) );
-      R( a, b, c, d, e, F2, K2, M(30) );
-      R( e, a, b, c, d, F2, K2, M(31) );
-      R( d, e, a, b, c, F2, K2, M(32) );
-      R( c, d, e, a, b, F2, K2, M(33) );
-      R( b, c, d, e, a, F2, K2, M(34) );
-      R( a, b, c, d, e, F2, K2, M(35) );
-      R( e, a, b, c, d, F2, K2, M(36) );
-      R( d, e, a, b, c, F2, K2, M(37) );
-      R( c, d, e, a, b, F2, K2, M(38) );
-      R( b, c, d, e, a, F2, K2, M(39) );
-      R( a, b, c, d, e, F3, K3, M(40) );
-      R( e, a, b, c, d, F3, K3, M(41) );
-      R( d, e, a, b, c, F3, K3, M(42) );
-      R( c, d, e, a, b, F3, K3, M(43) );
-      R( b, c, d, e, a, F3, K3, M(44) );
-      R( a, b, c, d, e, F3, K3, M(45) );
-      R( e, a, b, c, d, F3, K3, M(46) );
-      R( d, e, a, b, c, F3, K3, M(47) );
-      R( c, d, e, a, b, F3, K3, M(48) );
-      R( b, c, d, e, a, F3, K3, M(49) );
-      R( a, b, c, d, e, F3, K3, M(50) );
-      R( e, a, b, c, d, F3, K3, M(51) );
-      R( d, e, a, b, c, F3, K3, M(52) );
-      R( c, d, e, a, b, F3, K3, M(53) );
-      R( b, c, d, e, a, F3, K3, M(54) );
-      R( a, b, c, d, e, F3, K3, M(55) );
-      R( e, a, b, c, d, F3, K3, M(56) );
-      R( d, e, a, b, c, F3, K3, M(57) );
-      R( c, d, e, a, b, F3, K3, M(58) );
-      R( b, c, d, e, a, F3, K3, M(59) );
-      R( a, b, c, d, e, F4, K4, M(60) );
-      R( e, a, b, c, d, F4, K4, M(61) );
-      R( d, e, a, b, c, F4, K4, M(62) );
-      R( c, d, e, a, b, F4, K4, M(63) );
-      R( b, c, d, e, a, F4, K4, M(64) );
-      R( a, b, c, d, e, F4, K4, M(65) );
-      R( e, a, b, c, d, F4, K4, M(66) );
-      R( d, e, a, b, c, F4, K4, M(67) );
-      R( c, d, e, a, b, F4, K4, M(68) );
-      R( b, c, d, e, a, F4, K4, M(69) );
-      R( a, b, c, d, e, F4, K4, M(70) );
-      R( e, a, b, c, d, F4, K4, M(71) );
-      R( d, e, a, b, c, F4, K4, M(72) );
-      R( c, d, e, a, b, F4, K4, M(73) );
-      R( b, c, d, e, a, F4, K4, M(74) );
-      R( a, b, c, d, e, F4, K4, M(75) );
-      R( e, a, b, c, d, F4, K4, M(76) );
-      R( d, e, a, b, c, F4, K4, M(77) );
-      R( c, d, e, a, b, F4, K4, M(78) );
-      R( b, c, d, e, a, F4, K4, M(79) );
+      R( a, b, c, d, e, F1, K1, X,  0);
+      R( e, a, b, c, d, F1, K1, X,  1);
+      R( d, e, a, b, c, F1, K1, X,  2);
+      R( c, d, e, a, b, F1, K1, X,  3);
+      R( b, c, d, e, a, F1, K1, X,  4);
+      R( a, b, c, d, e, F1, K1, X,  5);
+      R( e, a, b, c, d, F1, K1, X,  6);
+      R( d, e, a, b, c, F1, K1, X,  7);
+      R( c, d, e, a, b, F1, K1, X,  8);
+      R( b, c, d, e, a, F1, K1, X,  9);
+      R( a, b, c, d, e, F1, K1, X, 10);
+      R( e, a, b, c, d, F1, K1, X, 11);
+      R( d, e, a, b, c, F1, K1, X, 12);
+      R( c, d, e, a, b, F1, K1, X, 13);
+      R( b, c, d, e, a, F1, K1, X, 14);
+      R( a, b, c, d, e, F1, K1, X, 15);
+      R( e, a, b, c, d, F1, K1, M, 16);
+      R( d, e, a, b, c, F1, K1, M, 17);
+      R( c, d, e, a, b, F1, K1, M, 18);
+      R( b, c, d, e, a, F1, K1, M, 19);
+      R( a, b, c, d, e, F2, K2, M, 20);
+      R( e, a, b, c, d, F2, K2, M, 21);
+      R( d, e, a, b, c, F2, K2, M, 22);
+      R( c, d, e, a, b, F2, K2, M, 23);
+      R( b, c, d, e, a, F2, K2, M, 24);
+      R( a, b, c, d, e, F2, K2, M, 25);
+      R( e, a, b, c, d, F2, K2, M, 26);
+      R( d, e, a, b, c, F2, K2, M, 27);
+      R( c, d, e, a, b, F2, K2, M, 28);
+      R( b, c, d, e, a, F2, K2, M, 29);
+      R( a, b, c, d, e, F2, K2, M, 30);
+      R( e, a, b, c, d, F2, K2, M, 31);
+      R( d, e, a, b, c, F2, K2, M, 32);
+      R( c, d, e, a, b, F2, K2, M, 33);
+      R( b, c, d, e, a, F2, K2, M, 34);
+      R( a, b, c, d, e, F2, K2, M, 35);
+      R( e, a, b, c, d, F2, K2, M, 36);
+      R( d, e, a, b, c, F2, K2, M, 37);
+      R( c, d, e, a, b, F2, K2, M, 38);
+      R( b, c, d, e, a, F2, K2, M, 39);
+      R( a, b, c, d, e, F3, K3, M, 40);
+      R( e, a, b, c, d, F3, K3, M, 41);
+      R( d, e, a, b, c, F3, K3, M, 42);
+      R( c, d, e, a, b, F3, K3, M, 43);
+      R( b, c, d, e, a, F3, K3, M, 44);
+      R( a, b, c, d, e, F3, K3, M, 45);
+      R( e, a, b, c, d, F3, K3, M, 46);
+      R( d, e, a, b, c, F3, K3, M, 47);
+      R( c, d, e, a, b, F3, K3, M, 48);
+      R( b, c, d, e, a, F3, K3, M, 49);
+      R( a, b, c, d, e, F3, K3, M, 50);
+      R( e, a, b, c, d, F3, K3, M, 51);
+      R( d, e, a, b, c, F3, K3, M, 52);
+      R( c, d, e, a, b, F3, K3, M, 53);
+      R( b, c, d, e, a, F3, K3, M, 54);
+      R( a, b, c, d, e, F3, K3, M, 55);
+      R( e, a, b, c, d, F3, K3, M, 56);
+      R( d, e, a, b, c, F3, K3, M, 57);
+      R( c, d, e, a, b, F3, K3, M, 58);
+      R( b, c, d, e, a, F3, K3, M, 59);
+      R( a, b, c, d, e, F4, K4, M, 60);
+      R( e, a, b, c, d, F4, K4, M, 61);
+      R( d, e, a, b, c, F4, K4, M, 62);
+      R( c, d, e, a, b, F4, K4, M, 63);
+      R( b, c, d, e, a, F4, K4, M, 64);
+      R( a, b, c, d, e, F4, K4, M, 65);
+      R( e, a, b, c, d, F4, K4, M, 66);
+      R( d, e, a, b, c, F4, K4, M, 67);
+      R( c, d, e, a, b, F4, K4, M, 68);
+      R( b, c, d, e, a, F4, K4, M, 69);
+      R( a, b, c, d, e, F4, K4, M, 70);
+      R( e, a, b, c, d, F4, K4, M, 71);
+      R( d, e, a, b, c, F4, K4, M, 72);
+      R( c, d, e, a, b, F4, K4, M, 73);
+      R( b, c, d, e, a, F4, K4, M, 74);
+      R( a, b, c, d, e, F4, K4, M, 75);
+      R( e, a, b, c, d, F4, K4, M, 76);
+      R( d, e, a, b, c, F4, K4, M, 77);
+      R( c, d, e, a, b, F4, K4, M, 78);
+      R( b, c, d, e, a, F4, K4, M, 79);

       a = ctx->A += a;
       b = ctx->B += b;
       c = ctx->C += c;
       d = ctx->D += d;
       e = ctx->E += e;
+      words += 16;
     }
 }

I can't explain exactly how but this does have an effect:
$ \time ./src/sha1sum < big_zero
1dd775261d7abab0b66910acc1d827a2c3799eaf  -
3.06user 0.11system 0:03.17elapsed 99%CPU (0avgtext+0avgdata
2544maxresident)k
0inputs+0outputs (0major+197minor)pagefaults 0swaps

Then, the volatile addition which, according to Linus, is working great on
architecture with not enough registers to accomodate the whole x[] array by
preventing gcc from missing with the registers. This is a small patch:

--- lib/sha1.c.orig    2011-09-05 23:54:08.498552015 +0200
+++ lib/sha1.c    2011-09-05 23:54:11.408552015 +0200
@@ -321,7 +321,7 @@
 #define M(I) rol(  x[I&0x0f] ^ x[(I-14)&0x0f] \
                  ^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f], 1) \

-#define R(A,B,C,D,E,F,K,M,I) do { unsigned int TEMP = M(I); x[I&0x0f] =
TEMP;\
+#define R(A,B,C,D,E,F,K,M,I) do { unsigned int TEMP = M(I); *(volatile
unsigned int *)&x[I&0x0f] = TEMP;\
                  E += rol( A, 5 )     \
                                       + F( B, C, D )  \
                                       + K             \

This does have a very big impact on speed:
$ \time ./src/sha1sum < big_zero
1dd775261d7abab0b66910acc1d827a2c3799eaf  -
2.57user 0.11system 0:02.69elapsed 99%CPU (0avgtext+0avgdata
2544maxresident)k
0inputs+0outputs (0major+196minor)pagefaults 0swaps

Eventually, the last performance stuff in Linus' sha1 is the use of inline
asm for ror and rol instruction, but it seems it has no impact on
performance for me, so I don't include it here.

Clearly, those patches are not clean, but I can had the necessary stuff so
that ntohl and volatile write access are used when necessary by compilation
directives like it is done in Linus' sha1.c.

My question is: would you consider such a patch for inclusion? is it the
correct direction? Maybe, a better alternative is to just include block-sha1
from Linus directly as it seems that there is no license problems (
http://lists.gnu.org/archive/html/bug-coreutils/2009-08/msg00136.html).

Also note that similar work can be done for sha256sum and sha512sum, I could
work on this if it has a chance to be included.

And also, there is a sse version from intel
http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/?cid=sw:dslnews5149which
I have verified is even faster. I would be OK to do integration work
if it has a chance to be accepted in gnulib or coreutils.

Thanks in advance for your answers,
Best regards,
-- 
Loïc

Improve sha1sum speed

Reply via email to