Hi, I saw in the todo list for coreutils that improving sha1sum speed was a target. So I worked a bit on that.
By using the ideas from the sha1.c file from git sources ( http://git.kernel.org/?p=git/git.git;a=tree;f=block-sha1;hb=pu) which is clearly faster than the one in gnulib, I have been able to aligned the speed of Linus' git. Here is what I did: First create a test file: $ dd if=/dev/zero of=big_zero count=1 seek=$(( 1000 * 1000000 - 1 )) bs=1 Test sha1sum on it: $ \time sha1sum < big_zero 1dd775261d7abab0b66910acc1d827a2c3799eaf - 3.49user 0.12system 0:03.61elapsed 99%CPU (0avgtext+0avgdata 2512maxresident)k 0inputs+0outputs (0major+202minor)pagefaults 0swaps And Linus' sha1: $ \time ../git-1.7.1/test-sha1 < big_zero 1dd775261d7abab0b66910acc1d827a2c3799eaf 2.54user 0.13system 0:02.70elapsed 98%CPU (0avgtext+0avgdata 2352maxresident)k 976inputs+0outputs (2major+191minor)pagefaults 0swaps Here is my system: $ gcc -v Using built-in specs. Target: x86_64-linux-gnu Configured with: ../src/configure -v --with-pkgversion='Ubuntu/Linaro 4.4.4-14ubuntu5' --with-bugurl=file:///usr/share/doc/gcc-4.4/README.Bugs --enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-4.4 --enable-shared --enable-multiarch --enable-linker-build-id --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.4 --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-objc-gc --disable-werror --with-arch-32=i686 --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu Thread model: posix gcc version 4.4.5 (Ubuntu/Linaro 4.4.4-14ubuntu5) $ cat /proc/cpuinfo processor : 0 vendor_id : GenuineIntel cpu family : 6 model : 42 model name : Genuine Intel(R) CPU 0 @ 2.50GHz stepping : 5 cpu MHz : 800.000 cache size : 3072 KB physical id : 0 siblings : 4 core id : 0 cpu cores : 2 apicid : 0 initial apicid : 0 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 x2apic popcnt aes xsave avx lahf_lm ida arat pln pts dts tpr_shadow vnmi flexpriority ept vpid bogomips : 4988.10 clflush size : 64 cache_alignment : 64 address sizes : 36 bits physical, 48 bits virtual power management: You can see that it is working in 64 mode. Checkout coreutils git lastest version and compile, here is original speed: $ \time ./src/sha1sum < big_zero 1dd775261d7abab0b66910acc1d827a2c3799eaf - 3.71user 0.20system 0:03.92elapsed 99%CPU (0avgtext+0avgdata 2528maxresident)k 0inputs+0outputs (0major+195minor)pagefaults 0swaps Using -O3 to compile: (make CFLAGS="-g -O3") instead of default -O2: $ \time ./src/sha1sum < big_zero 1dd775261d7abab0b66910acc1d827a2c3799eaf - 3.47user 0.14system 0:03.62elapsed 99%CPU (0avgtext+0avgdata 2512maxresident)k 0inputs+0outputs (0major+195minor)pagefaults 0swaps Just changing SWAP to ntohl does have a significant effect on my platform (the bswap instruction is used). According to Linus comment, this should be done only in platform having fast unalinged load: $ diff -uN lib/sha1.c.orig lib/sha1.c --- lib/sha1.c.orig 2011-09-05 23:31:23.128552012 +0200 +++ lib/sha1.c 2011-09-05 23:29:58.988552016 +0200 @@ -29,6 +29,7 @@ #include <stddef.h> #include <stdlib.h> #include <string.h> +#include <arpa/inet.h> #if USE_UNLOCKED_IO # include "unlocked-io.h" @@ -333,7 +334,7 @@ int t; for (t = 0; t < 16; t++) { - x[t] = SWAP (*words); + x[t] = ntohl(*words); words++; } $ \time ./src/sha1sum.v1 < big_zero 1dd775261d7abab0b66910acc1d827a2c3799eaf - 3.28user 0.10system 0:03.39elapsed 99%CPU (0avgtext+0avgdata 2512maxresident)k 0inputs+0outputs (0major+194minor)pagefaults 0swaps Then add a second patch to avoid the (,) construction and reorder the assignement to x[] array in the main define: --- lib/sha1.c.orig 2011-09-05 23:45:19.988552016 +0200 +++ lib/sha1.c 2011-09-05 23:47:57.188552016 +0200 @@ -317,112 +317,107 @@ #define rol(x, n) (((x) << (n)) | ((uint32_t) (x) >> (32 - (n)))) -#define M(I) ( tm = x[I&0x0f] ^ x[(I-14)&0x0f] \ - ^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f] \ - , (x[I&0x0f] = rol(tm, 1)) ) +#define X(I) ntohl(words[I]) +#define M(I) rol( x[I&0x0f] ^ x[(I-14)&0x0f] \ + ^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f], 1) \ -#define R(A,B,C,D,E,F,K,M) do { E += rol( A, 5 ) \ +#define R(A,B,C,D,E,F,K,M,I) do { unsigned int TEMP = M(I); x[I&0x0f] = TEMP;\ + E += rol( A, 5 ) \ + F( B, C, D ) \ + K \ - + M; \ + + TEMP; \ B = rol( B, 30 ); \ } while(0) + while (words < endp) { - uint32_t tm; - int t; - for (t = 0; t < 16; t++) - { - x[t] = ntohl(*words); - words++; - } - - R( a, b, c, d, e, F1, K1, x[ 0] ); - R( e, a, b, c, d, F1, K1, x[ 1] ); - R( d, e, a, b, c, F1, K1, x[ 2] ); - R( c, d, e, a, b, F1, K1, x[ 3] ); - R( b, c, d, e, a, F1, K1, x[ 4] ); - R( a, b, c, d, e, F1, K1, x[ 5] ); - R( e, a, b, c, d, F1, K1, x[ 6] ); - R( d, e, a, b, c, F1, K1, x[ 7] ); - R( c, d, e, a, b, F1, K1, x[ 8] ); - R( b, c, d, e, a, F1, K1, x[ 9] ); - R( a, b, c, d, e, F1, K1, x[10] ); - R( e, a, b, c, d, F1, K1, x[11] ); - R( d, e, a, b, c, F1, K1, x[12] ); - R( c, d, e, a, b, F1, K1, x[13] ); - R( b, c, d, e, a, F1, K1, x[14] ); - R( a, b, c, d, e, F1, K1, x[15] ); - R( e, a, b, c, d, F1, K1, M(16) ); - R( d, e, a, b, c, F1, K1, M(17) ); - R( c, d, e, a, b, F1, K1, M(18) ); - R( b, c, d, e, a, F1, K1, M(19) ); - R( a, b, c, d, e, F2, K2, M(20) ); - R( e, a, b, c, d, F2, K2, M(21) ); - R( d, e, a, b, c, F2, K2, M(22) ); - R( c, d, e, a, b, F2, K2, M(23) ); - R( b, c, d, e, a, F2, K2, M(24) ); - R( a, b, c, d, e, F2, K2, M(25) ); - R( e, a, b, c, d, F2, K2, M(26) ); - R( d, e, a, b, c, F2, K2, M(27) ); - R( c, d, e, a, b, F2, K2, M(28) ); - R( b, c, d, e, a, F2, K2, M(29) ); - R( a, b, c, d, e, F2, K2, M(30) ); - R( e, a, b, c, d, F2, K2, M(31) ); - R( d, e, a, b, c, F2, K2, M(32) ); - R( c, d, e, a, b, F2, K2, M(33) ); - R( b, c, d, e, a, F2, K2, M(34) ); - R( a, b, c, d, e, F2, K2, M(35) ); - R( e, a, b, c, d, F2, K2, M(36) ); - R( d, e, a, b, c, F2, K2, M(37) ); - R( c, d, e, a, b, F2, K2, M(38) ); - R( b, c, d, e, a, F2, K2, M(39) ); - R( a, b, c, d, e, F3, K3, M(40) ); - R( e, a, b, c, d, F3, K3, M(41) ); - R( d, e, a, b, c, F3, K3, M(42) ); - R( c, d, e, a, b, F3, K3, M(43) ); - R( b, c, d, e, a, F3, K3, M(44) ); - R( a, b, c, d, e, F3, K3, M(45) ); - R( e, a, b, c, d, F3, K3, M(46) ); - R( d, e, a, b, c, F3, K3, M(47) ); - R( c, d, e, a, b, F3, K3, M(48) ); - R( b, c, d, e, a, F3, K3, M(49) ); - R( a, b, c, d, e, F3, K3, M(50) ); - R( e, a, b, c, d, F3, K3, M(51) ); - R( d, e, a, b, c, F3, K3, M(52) ); - R( c, d, e, a, b, F3, K3, M(53) ); - R( b, c, d, e, a, F3, K3, M(54) ); - R( a, b, c, d, e, F3, K3, M(55) ); - R( e, a, b, c, d, F3, K3, M(56) ); - R( d, e, a, b, c, F3, K3, M(57) ); - R( c, d, e, a, b, F3, K3, M(58) ); - R( b, c, d, e, a, F3, K3, M(59) ); - R( a, b, c, d, e, F4, K4, M(60) ); - R( e, a, b, c, d, F4, K4, M(61) ); - R( d, e, a, b, c, F4, K4, M(62) ); - R( c, d, e, a, b, F4, K4, M(63) ); - R( b, c, d, e, a, F4, K4, M(64) ); - R( a, b, c, d, e, F4, K4, M(65) ); - R( e, a, b, c, d, F4, K4, M(66) ); - R( d, e, a, b, c, F4, K4, M(67) ); - R( c, d, e, a, b, F4, K4, M(68) ); - R( b, c, d, e, a, F4, K4, M(69) ); - R( a, b, c, d, e, F4, K4, M(70) ); - R( e, a, b, c, d, F4, K4, M(71) ); - R( d, e, a, b, c, F4, K4, M(72) ); - R( c, d, e, a, b, F4, K4, M(73) ); - R( b, c, d, e, a, F4, K4, M(74) ); - R( a, b, c, d, e, F4, K4, M(75) ); - R( e, a, b, c, d, F4, K4, M(76) ); - R( d, e, a, b, c, F4, K4, M(77) ); - R( c, d, e, a, b, F4, K4, M(78) ); - R( b, c, d, e, a, F4, K4, M(79) ); + R( a, b, c, d, e, F1, K1, X, 0); + R( e, a, b, c, d, F1, K1, X, 1); + R( d, e, a, b, c, F1, K1, X, 2); + R( c, d, e, a, b, F1, K1, X, 3); + R( b, c, d, e, a, F1, K1, X, 4); + R( a, b, c, d, e, F1, K1, X, 5); + R( e, a, b, c, d, F1, K1, X, 6); + R( d, e, a, b, c, F1, K1, X, 7); + R( c, d, e, a, b, F1, K1, X, 8); + R( b, c, d, e, a, F1, K1, X, 9); + R( a, b, c, d, e, F1, K1, X, 10); + R( e, a, b, c, d, F1, K1, X, 11); + R( d, e, a, b, c, F1, K1, X, 12); + R( c, d, e, a, b, F1, K1, X, 13); + R( b, c, d, e, a, F1, K1, X, 14); + R( a, b, c, d, e, F1, K1, X, 15); + R( e, a, b, c, d, F1, K1, M, 16); + R( d, e, a, b, c, F1, K1, M, 17); + R( c, d, e, a, b, F1, K1, M, 18); + R( b, c, d, e, a, F1, K1, M, 19); + R( a, b, c, d, e, F2, K2, M, 20); + R( e, a, b, c, d, F2, K2, M, 21); + R( d, e, a, b, c, F2, K2, M, 22); + R( c, d, e, a, b, F2, K2, M, 23); + R( b, c, d, e, a, F2, K2, M, 24); + R( a, b, c, d, e, F2, K2, M, 25); + R( e, a, b, c, d, F2, K2, M, 26); + R( d, e, a, b, c, F2, K2, M, 27); + R( c, d, e, a, b, F2, K2, M, 28); + R( b, c, d, e, a, F2, K2, M, 29); + R( a, b, c, d, e, F2, K2, M, 30); + R( e, a, b, c, d, F2, K2, M, 31); + R( d, e, a, b, c, F2, K2, M, 32); + R( c, d, e, a, b, F2, K2, M, 33); + R( b, c, d, e, a, F2, K2, M, 34); + R( a, b, c, d, e, F2, K2, M, 35); + R( e, a, b, c, d, F2, K2, M, 36); + R( d, e, a, b, c, F2, K2, M, 37); + R( c, d, e, a, b, F2, K2, M, 38); + R( b, c, d, e, a, F2, K2, M, 39); + R( a, b, c, d, e, F3, K3, M, 40); + R( e, a, b, c, d, F3, K3, M, 41); + R( d, e, a, b, c, F3, K3, M, 42); + R( c, d, e, a, b, F3, K3, M, 43); + R( b, c, d, e, a, F3, K3, M, 44); + R( a, b, c, d, e, F3, K3, M, 45); + R( e, a, b, c, d, F3, K3, M, 46); + R( d, e, a, b, c, F3, K3, M, 47); + R( c, d, e, a, b, F3, K3, M, 48); + R( b, c, d, e, a, F3, K3, M, 49); + R( a, b, c, d, e, F3, K3, M, 50); + R( e, a, b, c, d, F3, K3, M, 51); + R( d, e, a, b, c, F3, K3, M, 52); + R( c, d, e, a, b, F3, K3, M, 53); + R( b, c, d, e, a, F3, K3, M, 54); + R( a, b, c, d, e, F3, K3, M, 55); + R( e, a, b, c, d, F3, K3, M, 56); + R( d, e, a, b, c, F3, K3, M, 57); + R( c, d, e, a, b, F3, K3, M, 58); + R( b, c, d, e, a, F3, K3, M, 59); + R( a, b, c, d, e, F4, K4, M, 60); + R( e, a, b, c, d, F4, K4, M, 61); + R( d, e, a, b, c, F4, K4, M, 62); + R( c, d, e, a, b, F4, K4, M, 63); + R( b, c, d, e, a, F4, K4, M, 64); + R( a, b, c, d, e, F4, K4, M, 65); + R( e, a, b, c, d, F4, K4, M, 66); + R( d, e, a, b, c, F4, K4, M, 67); + R( c, d, e, a, b, F4, K4, M, 68); + R( b, c, d, e, a, F4, K4, M, 69); + R( a, b, c, d, e, F4, K4, M, 70); + R( e, a, b, c, d, F4, K4, M, 71); + R( d, e, a, b, c, F4, K4, M, 72); + R( c, d, e, a, b, F4, K4, M, 73); + R( b, c, d, e, a, F4, K4, M, 74); + R( a, b, c, d, e, F4, K4, M, 75); + R( e, a, b, c, d, F4, K4, M, 76); + R( d, e, a, b, c, F4, K4, M, 77); + R( c, d, e, a, b, F4, K4, M, 78); + R( b, c, d, e, a, F4, K4, M, 79); a = ctx->A += a; b = ctx->B += b; c = ctx->C += c; d = ctx->D += d; e = ctx->E += e; + words += 16; } } I can't explain exactly how but this does have an effect: $ \time ./src/sha1sum < big_zero 1dd775261d7abab0b66910acc1d827a2c3799eaf - 3.06user 0.11system 0:03.17elapsed 99%CPU (0avgtext+0avgdata 2544maxresident)k 0inputs+0outputs (0major+197minor)pagefaults 0swaps Then, the volatile addition which, according to Linus, is working great on architecture with not enough registers to accomodate the whole x[] array by preventing gcc from missing with the registers. This is a small patch: --- lib/sha1.c.orig 2011-09-05 23:54:08.498552015 +0200 +++ lib/sha1.c 2011-09-05 23:54:11.408552015 +0200 @@ -321,7 +321,7 @@ #define M(I) rol( x[I&0x0f] ^ x[(I-14)&0x0f] \ ^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f], 1) \ -#define R(A,B,C,D,E,F,K,M,I) do { unsigned int TEMP = M(I); x[I&0x0f] = TEMP;\ +#define R(A,B,C,D,E,F,K,M,I) do { unsigned int TEMP = M(I); *(volatile unsigned int *)&x[I&0x0f] = TEMP;\ E += rol( A, 5 ) \ + F( B, C, D ) \ + K \ This does have a very big impact on speed: $ \time ./src/sha1sum < big_zero 1dd775261d7abab0b66910acc1d827a2c3799eaf - 2.57user 0.11system 0:02.69elapsed 99%CPU (0avgtext+0avgdata 2544maxresident)k 0inputs+0outputs (0major+196minor)pagefaults 0swaps Eventually, the last performance stuff in Linus' sha1 is the use of inline asm for ror and rol instruction, but it seems it has no impact on performance for me, so I don't include it here. Clearly, those patches are not clean, but I can had the necessary stuff so that ntohl and volatile write access are used when necessary by compilation directives like it is done in Linus' sha1.c. My question is: would you consider such a patch for inclusion? is it the correct direction? Maybe, a better alternative is to just include block-sha1 from Linus directly as it seems that there is no license problems ( http://lists.gnu.org/archive/html/bug-coreutils/2009-08/msg00136.html). Also note that similar work can be done for sha256sum and sha512sum, I could work on this if it has a chance to be included. And also, there is a sse version from intel http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/?cid=sw:dslnews5149which I have verified is even faster. I would be OK to do integration work if it has a chance to be accepted in gnulib or coreutils. Thanks in advance for your answers, Best regards, -- Loïc