Hi Pádraig,
Thank you for your answer.
2011/9/6 Pádraig Brady <[email protected]>
> A few general points.
> You essentially used Linus' code (albeit by
> very helpfully isolating the significant differences).
> It might be easier/required to just include it in gnulib?
> There are a few files in gnulib that are not copyright of the FSF,
> so would Nicolas and Linus need to assign copyright?
>
Yes, this is what I did. I don't think that including Linus' code is easier, as
the functions have a different prototype. Also, sha1, sha256 and sha512
share the same structure in gnulib, so changing one without changing the others
would be weird. But if you think it is required, I have no problem with
that.
By the way, I have done a test on sha512 and improved the time on
the same 1 GB zero file from 4.5 s to 3.9 s. Please find the patch attached. So
I think that, using the same techniques, we could improve the speed of all the SHA functions.
> For performance testing I've found gcc generates
> much more deterministic results with a -march
> as close to native as possible or otherwise
> the code is very susceptible to alignment issues etc.
> Your compiler supports -march=native.
> Note also gcc 4.6 has much better support for your sandy bridge CPU,
> either with -march=native or -march=corei7-avx
>
I tried using gcc-4.6.1 (I recompiled it on my Ubuntu 10.10) but I
couldn't see any difference. For me, any combination of gcc 4.4.5 or 4.6.1,
with or without -march=native, makes no difference; all the times are
within the measurement margin.
> As for the SSE version, I would also like to see that included,
> given the proportion of hardware supporting that these days.
> I previously noticed a coreutils SSE2 patch here:
> http://www.arctic.org/~dean/crypto/sha1.html
> Though we'd probably need some runtime SSE detection to include that.
>
OK, I could try to work on this. The real problem is testing that
compilation and SSE detection are done correctly on several platforms. I only
have access to a few x86 machines; what is the usual way to test more
platforms?
Best regards
--
Loïc
--- lib/sha512.c.orig 2011-09-06 15:24:17.320209997 +0200
+++ lib/sha512.c 2011-09-06 14:54:35.503382001 +0200
@@ -498,19 +498,23 @@
#define SS0(x) u64xor (u64rol (x, 36), u64xor (u64rol (x, 30), u64rol (x, 25)))
#define SS1(x) u64xor (u64rol(x, 50), u64xor (u64rol (x, 46), u64rol (x, 23)))
-#define M(I) (x[(I) & 15] \
- = u64plus (x[(I) & 15], \
- u64plus (S1 (x[((I) - 2) & 15]), \
- u64plus (x[((I) - 7) & 15], \
- S0 (x[((I) - 15) & 15])))))
+#include "endian.h"
+#define X(I) (be64toh(words[I]))
+#define M(I) (u64plus (x[(I) & 15], \
+ u64plus (S1 (x[((I) - 2) & 15]), \
+ u64plus (x[((I) - 7) & 15], \
+ S0 (x[((I) - 15) & 15])))))
-#define R(A, B, C, D, E, F, G, H, K, M) \
+#define R(A, B, C, D, E, F, G, H, M, I) \
do \
{ \
+ u64 temp = M(I); \
u64 t0 = u64plus (SS0 (A), F2 (A, B, C)); \
u64 t1 = \
u64plus (H, u64plus (SS1 (E), \
- u64plus (F1 (E, F, G), u64plus (K, M)))); \
+ u64plus (F1 (E, F, G), \
+ u64plus (K(I), temp)))); \
+ *(volatile u64 *)&x[(I) & 15] = temp; \
D = u64plus (D, t1); \
H = u64plus (t0, t1); \
} \
@@ -518,94 +522,86 @@
while (words < endp)
{
- int t;
- /* FIXME: see sha1.c for a better implementation. */
- for (t = 0; t < 16; t++)
- {
- x[t] = SWAP (*words);
- words++;
- }
-
- R( a, b, c, d, e, f, g, h, K( 0), x[ 0] );
- R( h, a, b, c, d, e, f, g, K( 1), x[ 1] );
- R( g, h, a, b, c, d, e, f, K( 2), x[ 2] );
- R( f, g, h, a, b, c, d, e, K( 3), x[ 3] );
- R( e, f, g, h, a, b, c, d, K( 4), x[ 4] );
- R( d, e, f, g, h, a, b, c, K( 5), x[ 5] );
- R( c, d, e, f, g, h, a, b, K( 6), x[ 6] );
- R( b, c, d, e, f, g, h, a, K( 7), x[ 7] );
- R( a, b, c, d, e, f, g, h, K( 8), x[ 8] );
- R( h, a, b, c, d, e, f, g, K( 9), x[ 9] );
- R( g, h, a, b, c, d, e, f, K(10), x[10] );
- R( f, g, h, a, b, c, d, e, K(11), x[11] );
- R( e, f, g, h, a, b, c, d, K(12), x[12] );
- R( d, e, f, g, h, a, b, c, K(13), x[13] );
- R( c, d, e, f, g, h, a, b, K(14), x[14] );
- R( b, c, d, e, f, g, h, a, K(15), x[15] );
- R( a, b, c, d, e, f, g, h, K(16), M(16) );
- R( h, a, b, c, d, e, f, g, K(17), M(17) );
- R( g, h, a, b, c, d, e, f, K(18), M(18) );
- R( f, g, h, a, b, c, d, e, K(19), M(19) );
- R( e, f, g, h, a, b, c, d, K(20), M(20) );
- R( d, e, f, g, h, a, b, c, K(21), M(21) );
- R( c, d, e, f, g, h, a, b, K(22), M(22) );
- R( b, c, d, e, f, g, h, a, K(23), M(23) );
- R( a, b, c, d, e, f, g, h, K(24), M(24) );
- R( h, a, b, c, d, e, f, g, K(25), M(25) );
- R( g, h, a, b, c, d, e, f, K(26), M(26) );
- R( f, g, h, a, b, c, d, e, K(27), M(27) );
- R( e, f, g, h, a, b, c, d, K(28), M(28) );
- R( d, e, f, g, h, a, b, c, K(29), M(29) );
- R( c, d, e, f, g, h, a, b, K(30), M(30) );
- R( b, c, d, e, f, g, h, a, K(31), M(31) );
- R( a, b, c, d, e, f, g, h, K(32), M(32) );
- R( h, a, b, c, d, e, f, g, K(33), M(33) );
- R( g, h, a, b, c, d, e, f, K(34), M(34) );
- R( f, g, h, a, b, c, d, e, K(35), M(35) );
- R( e, f, g, h, a, b, c, d, K(36), M(36) );
- R( d, e, f, g, h, a, b, c, K(37), M(37) );
- R( c, d, e, f, g, h, a, b, K(38), M(38) );
- R( b, c, d, e, f, g, h, a, K(39), M(39) );
- R( a, b, c, d, e, f, g, h, K(40), M(40) );
- R( h, a, b, c, d, e, f, g, K(41), M(41) );
- R( g, h, a, b, c, d, e, f, K(42), M(42) );
- R( f, g, h, a, b, c, d, e, K(43), M(43) );
- R( e, f, g, h, a, b, c, d, K(44), M(44) );
- R( d, e, f, g, h, a, b, c, K(45), M(45) );
- R( c, d, e, f, g, h, a, b, K(46), M(46) );
- R( b, c, d, e, f, g, h, a, K(47), M(47) );
- R( a, b, c, d, e, f, g, h, K(48), M(48) );
- R( h, a, b, c, d, e, f, g, K(49), M(49) );
- R( g, h, a, b, c, d, e, f, K(50), M(50) );
- R( f, g, h, a, b, c, d, e, K(51), M(51) );
- R( e, f, g, h, a, b, c, d, K(52), M(52) );
- R( d, e, f, g, h, a, b, c, K(53), M(53) );
- R( c, d, e, f, g, h, a, b, K(54), M(54) );
- R( b, c, d, e, f, g, h, a, K(55), M(55) );
- R( a, b, c, d, e, f, g, h, K(56), M(56) );
- R( h, a, b, c, d, e, f, g, K(57), M(57) );
- R( g, h, a, b, c, d, e, f, K(58), M(58) );
- R( f, g, h, a, b, c, d, e, K(59), M(59) );
- R( e, f, g, h, a, b, c, d, K(60), M(60) );
- R( d, e, f, g, h, a, b, c, K(61), M(61) );
- R( c, d, e, f, g, h, a, b, K(62), M(62) );
- R( b, c, d, e, f, g, h, a, K(63), M(63) );
- R( a, b, c, d, e, f, g, h, K(64), M(64) );
- R( h, a, b, c, d, e, f, g, K(65), M(65) );
- R( g, h, a, b, c, d, e, f, K(66), M(66) );
- R( f, g, h, a, b, c, d, e, K(67), M(67) );
- R( e, f, g, h, a, b, c, d, K(68), M(68) );
- R( d, e, f, g, h, a, b, c, K(69), M(69) );
- R( c, d, e, f, g, h, a, b, K(70), M(70) );
- R( b, c, d, e, f, g, h, a, K(71), M(71) );
- R( a, b, c, d, e, f, g, h, K(72), M(72) );
- R( h, a, b, c, d, e, f, g, K(73), M(73) );
- R( g, h, a, b, c, d, e, f, K(74), M(74) );
- R( f, g, h, a, b, c, d, e, K(75), M(75) );
- R( e, f, g, h, a, b, c, d, K(76), M(76) );
- R( d, e, f, g, h, a, b, c, K(77), M(77) );
- R( c, d, e, f, g, h, a, b, K(78), M(78) );
- R( b, c, d, e, f, g, h, a, K(79), M(79) );
+ R( a, b, c, d, e, f, g, h, X, 0 );
+ R( h, a, b, c, d, e, f, g, X, 1 );
+ R( g, h, a, b, c, d, e, f, X, 2 );
+ R( f, g, h, a, b, c, d, e, X, 3 );
+ R( e, f, g, h, a, b, c, d, X, 4 );
+ R( d, e, f, g, h, a, b, c, X, 5 );
+ R( c, d, e, f, g, h, a, b, X, 6 );
+ R( b, c, d, e, f, g, h, a, X, 7 );
+ R( a, b, c, d, e, f, g, h, X, 8 );
+ R( h, a, b, c, d, e, f, g, X, 9 );
+ R( g, h, a, b, c, d, e, f, X, 10 );
+ R( f, g, h, a, b, c, d, e, X, 11 );
+ R( e, f, g, h, a, b, c, d, X, 12 );
+ R( d, e, f, g, h, a, b, c, X, 13 );
+ R( c, d, e, f, g, h, a, b, X, 14 );
+ R( b, c, d, e, f, g, h, a, X, 15 );
+ R( a, b, c, d, e, f, g, h, M, 16 );
+ R( h, a, b, c, d, e, f, g, M, 17 );
+ R( g, h, a, b, c, d, e, f, M, 18 );
+ R( f, g, h, a, b, c, d, e, M, 19 );
+ R( e, f, g, h, a, b, c, d, M, 20 );
+ R( d, e, f, g, h, a, b, c, M, 21 );
+ R( c, d, e, f, g, h, a, b, M, 22 );
+ R( b, c, d, e, f, g, h, a, M, 23 );
+ R( a, b, c, d, e, f, g, h, M, 24 );
+ R( h, a, b, c, d, e, f, g, M, 25 );
+ R( g, h, a, b, c, d, e, f, M, 26 );
+ R( f, g, h, a, b, c, d, e, M, 27 );
+ R( e, f, g, h, a, b, c, d, M, 28 );
+ R( d, e, f, g, h, a, b, c, M, 29 );
+ R( c, d, e, f, g, h, a, b, M, 30 );
+ R( b, c, d, e, f, g, h, a, M, 31 );
+ R( a, b, c, d, e, f, g, h, M, 32 );
+ R( h, a, b, c, d, e, f, g, M, 33 );
+ R( g, h, a, b, c, d, e, f, M, 34 );
+ R( f, g, h, a, b, c, d, e, M, 35 );
+ R( e, f, g, h, a, b, c, d, M, 36 );
+ R( d, e, f, g, h, a, b, c, M, 37 );
+ R( c, d, e, f, g, h, a, b, M, 38 );
+ R( b, c, d, e, f, g, h, a, M, 39 );
+ R( a, b, c, d, e, f, g, h, M, 40 );
+ R( h, a, b, c, d, e, f, g, M, 41 );
+ R( g, h, a, b, c, d, e, f, M, 42 );
+ R( f, g, h, a, b, c, d, e, M, 43 );
+ R( e, f, g, h, a, b, c, d, M, 44 );
+ R( d, e, f, g, h, a, b, c, M, 45 );
+ R( c, d, e, f, g, h, a, b, M, 46 );
+ R( b, c, d, e, f, g, h, a, M, 47 );
+ R( a, b, c, d, e, f, g, h, M, 48 );
+ R( h, a, b, c, d, e, f, g, M, 49 );
+ R( g, h, a, b, c, d, e, f, M, 50 );
+ R( f, g, h, a, b, c, d, e, M, 51 );
+ R( e, f, g, h, a, b, c, d, M, 52 );
+ R( d, e, f, g, h, a, b, c, M, 53 );
+ R( c, d, e, f, g, h, a, b, M, 54 );
+ R( b, c, d, e, f, g, h, a, M, 55 );
+ R( a, b, c, d, e, f, g, h, M, 56 );
+ R( h, a, b, c, d, e, f, g, M, 57 );
+ R( g, h, a, b, c, d, e, f, M, 58 );
+ R( f, g, h, a, b, c, d, e, M, 59 );
+ R( e, f, g, h, a, b, c, d, M, 60 );
+ R( d, e, f, g, h, a, b, c, M, 61 );
+ R( c, d, e, f, g, h, a, b, M, 62 );
+ R( b, c, d, e, f, g, h, a, M, 63 );
+ R( a, b, c, d, e, f, g, h, M, 64 );
+ R( h, a, b, c, d, e, f, g, M, 65 );
+ R( g, h, a, b, c, d, e, f, M, 66 );
+ R( f, g, h, a, b, c, d, e, M, 67 );
+ R( e, f, g, h, a, b, c, d, M, 68 );
+ R( d, e, f, g, h, a, b, c, M, 69 );
+ R( c, d, e, f, g, h, a, b, M, 70 );
+ R( b, c, d, e, f, g, h, a, M, 71 );
+ R( a, b, c, d, e, f, g, h, M, 72 );
+ R( h, a, b, c, d, e, f, g, M, 73 );
+ R( g, h, a, b, c, d, e, f, M, 74 );
+ R( f, g, h, a, b, c, d, e, M, 75 );
+ R( e, f, g, h, a, b, c, d, M, 76 );
+ R( d, e, f, g, h, a, b, c, M, 77 );
+ R( c, d, e, f, g, h, a, b, M, 78 );
+ R( b, c, d, e, f, g, h, a, M, 79 );
a = ctx->state[0] = u64plus (ctx->state[0], a);
b = ctx->state[1] = u64plus (ctx->state[1], b);
@@ -615,5 +611,6 @@
f = ctx->state[5] = u64plus (ctx->state[5], f);
g = ctx->state[6] = u64plus (ctx->state[6], g);
h = ctx->state[7] = u64plus (ctx->state[7], h);
+ words += 16;
}
}