>> The names are the order they were written in.  "One" is the lib/sha1.c
>> code (547 bytes with -Os).  "Four" is a 5x unrolled C version (1106 bytes).
>
> I'd like to see your version four.

Here's the test driver wrapped around the earlier assembly code.
It's an ugly mess of copy & paste code, of course.

I suspect the code could be shrunk by allocating the W[] array locally,
thereby freeing up a register.  The sizes quoted above were measured with
-Os -fomit-frame-pointer.


/*
 * SHA transform algorithm, originally taken from code written by
 * Peter Gutmann, and placed in the public domain.
 */
#include <stdint.h>
#include <stdio.h>

/*
 * rol32 - rotate the 32-bit value x left by s bits.
 *
 * The operand is converted to uint32_t first, so wider or promoted
 * arguments still rotate within 32 bits.  Both shift counts are reduced
 * modulo 32, so s == 0 yields x unchanged instead of shifting by the full
 * width of the type (which is undefined).  x and s are each expanded
 * twice; do not pass expressions with side effects.
 */
#define rol32(x, s) \
        ((uint32_t)((uint32_t)(x) << ((s) & 31) | \
                    (uint32_t)(x) >> ((32 - (s)) & 31)))
/*
 * be32_to_cpu - convert a 32-bit word that was loaded from big-endian
 * storage into host byte order.
 *
 * @x: the word as loaded from memory
 *
 * On hosts the compiler reports as big-endian the value is returned
 * unchanged.  Otherwise (including when the compiler does not define
 * __BYTE_ORDER__) the four bytes are reversed.  The swap is written with
 * shifts and masks rather than a CPU-specific instruction so that it
 * builds on any target and in strict ISO C modes.
 */
static inline uint32_t __attribute__((const))
be32_to_cpu(unsigned x)
{
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
    __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        return (uint32_t)x;
#else
        uint32_t v = (uint32_t)x;

        return (v & 0x000000ffu) << 24 |
               (v & 0x0000ff00u) <<  8 |
               (v & 0x00ff0000u) >>  8 |
               (v & 0xff000000u) >> 24;
#endif
}


/*
 * The SHA f()-functions.
 *
 * Every argument is parenthesized so that callers may pass arbitrary
 * expressions (e.g. "a | b") without operator precedence changing the
 * result.  Arguments are expanded more than once; avoid side effects.
 */

/* x ? y : z -- each result bit comes from y where x is 1, from z where x is 0 */
#define f1(x,y,z)   ((z) ^ ((x) & ((y) ^ (z))))
/* XOR (parity) of the three inputs */
#define f2(x,y,z)   ((x) ^ (y) ^ (z))
/*
 * majority -- (x & y) and (z & (x ^ y)) can never have the same bit set,
 * so adding them gives the same result as OR-ing them.
 */
#define f3(x,y,z)   (((x) & (y)) + ((z) & ((x) ^ (y))))

/*
 * The SHA Mysterious Constants
 *
 * Declared through UINT32_C() so they are unsigned 32-bit quantities on
 * every target.  With a plain "L" suffix they would have type long, which
 * widens the round arithmetic to 64 bits where long is 64 bits wide.
 */

#define K1  UINT32_C(0x5A827999)        /* Rounds  0-19: sqrt(2) * 2^30 */
#define K2  UINT32_C(0x6ED9EBA1)        /* Rounds 20-39: sqrt(3) * 2^30 */
#define K3  UINT32_C(0x8F1BBCDC)        /* Rounds 40-59: sqrt(5) * 2^30 */
#define K4  UINT32_C(0xCA62C1D6)        /* Rounds 60-79: sqrt(10) * 2^30 */

/**
 * sha_transform - single block SHA1 transform
 *
 * @digest: 160 bit digest to update
 * @in:     512 bits (64 bytes) of data to hash; no alignment is required
 * @W:      80 words of workspace (see note)
 *
 * This function generates a SHA1 digest for a single 512-bit block.
 * Be warned, it does not handle padding and message digest, do not
 * confuse it with the full FIPS 180-1 digest algorithm for variable
 * length messages.
 *
 * Note: If the hash is security sensitive, the caller should be sure
 * to clear the workspace. This is left to the caller to avoid
 * unnecessary clears between chained hashing operations.
 */
void sha_transform(uint32_t digest[5], const char in[64], uint32_t W[80])
{
        const unsigned char *src = (const unsigned char *)in;
        uint32_t a, b, c, d, e, t;
        unsigned i;

        /*
         * Assemble the sixteen big-endian input words one byte at a time.
         * Reading the bytes individually avoids accessing the char buffer
         * through a uint32_t pointer, so the input may sit at any address
         * and the result does not depend on the host byte order.
         */
        for (i = 0; i < 16; i++, src += 4)
                W[i] = (uint32_t)src[0] << 24 | (uint32_t)src[1] << 16 |
                       (uint32_t)src[2] <<  8 | (uint32_t)src[3];

        /* Expand the 16 input words into the full 80-word schedule. */
        for (i = 0; i < 64; i++)
                W[i+16] = rol32(W[i+13] ^ W[i+8] ^ W[i+2] ^ W[i], 1);

        a = digest[0];
        b = digest[1];
        c = digest[2];
        d = digest[3];
        e = digest[4];

        /* Rounds 0-19: f1 (select) with K1. */
        for (i = 0; i < 20; i++) {
                t = f1(b, c, d) + K1 + rol32(a, 5) + e + W[i];
                e = d; d = c; c = rol32(b, 30); b = a; a = t;
        }

        /* Rounds 20-39: f2 (XOR) with K2. */
        for (; i < 40; i ++) {
                t = f2(b, c, d) + K2 + rol32(a, 5) + e + W[i];
                e = d; d = c; c = rol32(b, 30); b = a; a = t;
        }

        /* Rounds 40-59: f3 (majority) with K3. */
        for (; i < 60; i ++) {
                t = f3(b, c, d) + K3 + rol32(a, 5) + e + W[i];
                e = d; d = c; c = rol32(b, 30); b = a; a = t;
        }

        /* Rounds 60-79: f2 (XOR) again, with K4. */
        for (; i < 80; i ++) {
                t = f2(b, c, d) + K4 + rol32(a, 5) + e + W[i];
                e = d; d = c; c = rol32(b, 30); b = a; a = t;
        }

        /* Fold the working variables back into the running digest. */
        digest[0] += a;
        digest[1] += b;
        digest[2] += c;
        digest[3] += d;
        digest[4] += e;
}

/*
 * ROUND - one SHA-1 round performed in place on five working variables.
 *
 * Adds add + f(b,c,d) + rol32(a,5) into e and rotates b left by 30 bits.
 * Nothing is moved between variables; the caller instead permutes the
 * argument order from one invocation to the next, so after five
 * invocations every variable is back in its original role.
 *
 * f is the name of the round function macro, add is the schedule word
 * plus round constant.  add is parenthesized so any expression may be
 * passed; a, b and e must be modifiable lvalues.
 */
#define ROUND(a,b,c,d,e,f,add)  \
        ( e += (add) + f(b,c,d),  \
          b = rol32(b, 30),     \
          e += rol32(a, 5) )

/*
 * sha_transform4 - single block SHA1 transform, rounds unrolled five at a time
 *
 * @digest: 160 bit digest to update
 * @in:     512 bits (64 bytes) of data to hash; no alignment is required
 * @W:      80 words of workspace, overwritten with the message schedule
 *
 * Takes the same arguments as sha_transform().  Each loop body issues five
 * ROUND() invocations with the working variables permuted, so no values
 * have to be shuffled between rounds.
 */
void sha_transform4(uint32_t digest[5], const char in[64], uint32_t W[80])
{
        const unsigned char *src = (const unsigned char *)in;
        uint32_t a, b, c, d, e;
        unsigned i;

        /*
         * Assemble the sixteen big-endian input words one byte at a time.
         * Reading the bytes individually avoids accessing the char buffer
         * through a uint32_t pointer, so the input may sit at any address
         * and the result does not depend on the host byte order.
         */
        for (i = 0; i < 16; i++, src += 4)
                W[i] = (uint32_t)src[0] << 24 | (uint32_t)src[1] << 16 |
                       (uint32_t)src[2] <<  8 | (uint32_t)src[3];

        /* Expand to 80 schedule words; a is only a scratch variable here. */
        for (i = 0; i < 64; i++) {
                a = W[i+13] ^ W[i+8] ^ W[i+2] ^ W[i];
                W[i+16] = rol32(a, 1);
        }

        a = digest[0];
        b = digest[1];
        c = digest[2];
        d = digest[3];
        e = digest[4];

        /* Rounds 0-19: f1 with K1. */
        for (i = 0; i < 20; i += 5) {
                ROUND(a,b,c,d,e,f1,W[i  ]+K1);
                ROUND(e,a,b,c,d,f1,W[i+1]+K1);
                ROUND(d,e,a,b,c,f1,W[i+2]+K1);
                ROUND(c,d,e,a,b,f1,W[i+3]+K1);
                ROUND(b,c,d,e,a,f1,W[i+4]+K1);
        }

        /* Rounds 20-39: f2 with K2. */
        for (; i < 40; i += 5) {
                ROUND(a,b,c,d,e,f2,W[i  ]+K2);
                ROUND(e,a,b,c,d,f2,W[i+1]+K2);
                ROUND(d,e,a,b,c,f2,W[i+2]+K2);
                ROUND(c,d,e,a,b,f2,W[i+3]+K2);
                ROUND(b,c,d,e,a,f2,W[i+4]+K2);
        }
        /* Rounds 40-59: f3 with K3. */
        for (; i < 60; i += 5) {
                ROUND(a,b,c,d,e,f3,W[i  ]+K3);
                ROUND(e,a,b,c,d,f3,W[i+1]+K3);
                ROUND(d,e,a,b,c,f3,W[i+2]+K3);
                ROUND(c,d,e,a,b,f3,W[i+3]+K3);
                ROUND(b,c,d,e,a,f3,W[i+4]+K3);
        }
        /* Rounds 60-79: f2 again, with K4. */
        for (; i < 80; i += 5) {
                ROUND(a,b,c,d,e,f2,W[i  ]+K4);
                ROUND(e,a,b,c,d,f2,W[i+1]+K4);
                ROUND(d,e,a,b,c,f2,W[i+2]+K4);
                ROUND(c,d,e,a,b,f2,W[i+3]+K4);
                ROUND(b,c,d,e,a,f2,W[i+4]+K4);
        }

        /* Fold the working variables back into the running digest. */
        digest[0] += a;
        digest[1] += b;
        digest[2] += c;
        digest[3] += d;
        digest[4] += e;
}

/*
 * Transform variants that are declared here but not defined in this file.
 * Unlike sha_transform() and sha_transform4() they take no caller-supplied
 * workspace argument.
 */
void sha_transform2(uint32_t digest[5], const char in[64]);
void sha_transform3(uint32_t digest[5], const char in[64]);
void sha_transform5(uint32_t digest[5], const char in[64]);

/*
 * Also defined outside this file.
 * NOTE(review): presumably clears stack space left behind by the
 * transforms above -- confirm against its definition.
 */
void sha_stackwipe(void);

/*
 * sha_init - load the SHA-1 initial chaining value into a digest buffer.
 *
 * @buf: five-word digest state to initialise; all five words are overwritten
 */
void sha_init(uint32_t buf[5])
{
        static const uint32_t initial_state[5] = {
                0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0
        };
        unsigned k;

        for (k = 0; k < 5; k++)
                buf[k] = initial_state[k];
}

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#if 1
/*
 * sha_stackwipe2 - zero a 90-word scratch array in this function's own
 * stack frame.
 *
 * NOTE(review): presumably a C counterpart of the external sha_stackwipe(),
 * meant to overwrite stack space used by an earlier call; nothing in this
 * file calls it -- confirm the intent and the choice of 90 words.
 */
void sha_stackwipe2(void)
{
        uint32_t buf[90];

        memset(buf, 0, sizeof buf);
        /*
         * Force the compiler to do the memset: the empty asm receives the
         * buffer address and declares a "memory" clobber, so the compiler
         * must assume the zeroed contents are read and cannot discard the
         * stores as dead.  Passing the address alone does not say that the
         * memory behind it is accessed.
         */
        __asm__ __volatile__("" : : "r" (buf) : "memory");
}
#endif


#define TEST_SIZE (10*1024*1024)        /* bytes hashed in each timed run */

/* Print "<label>: " followed by the five digest words in hex (no newline). */
static void print_digest(const char *label, const uint32_t out[5])
{
        printf("%s: %08x %08x %08x %08x %08x", label,
                (unsigned)out[0], (unsigned)out[1], (unsigned)out[2],
                (unsigned)out[3], (unsigned)out[4]);
}

/* Microseconds elapsed between two gettimeofday() samples. */
static unsigned long elapsed_us(const struct timeval *start,
                                const struct timeval *stop)
{
        long long us = 1000000LL * (long long)(stop->tv_sec - start->tv_sec)
                     + (long long)(stop->tv_usec - start->tv_usec);

        return (unsigned long)us;
}

/*
 * Test driver.
 *
 * First hashes one fixed 64-byte block with each of the five transform
 * implementations and prints the resulting digests so they can be compared
 * by eye.  Then (when the #if below is enabled) fills a TEST_SIZE buffer
 * with generated data, runs every implementation over the whole buffer in
 * 64-byte steps, and prints the final digest with the elapsed wall-clock
 * time in microseconds.
 *
 * Returns 0 on success, 1 if the test buffer cannot be allocated.
 */
int main(void)
{
        uint32_t W[80];
        uint32_t out[5];
        char const text[64] = "Hello, world!\n";
        char *buf;
        uint32_t *p;
        unsigned i;
        struct timeval start, stop;

        sha_init(out);
        sha_transform(out, text, W);
        print_digest("  One", out);
        printf("\n");

        sha_init(out);
        sha_transform4(out, text, W);
        print_digest(" Four", out);
        printf("\n");

        sha_init(out);
        sha_transform2(out, text);
        print_digest("  Two", out);
        printf("\n");

        sha_init(out);
        sha_transform3(out, text);
        print_digest("Three", out);
        printf("\n");

        sha_init(out);
        sha_transform5(out, text);
        print_digest(" Five", out);
        printf("\n");

        sha_stackwipe();
#if 1

        /* Set up a large buffer full of stuff */
        buf = malloc(TEST_SIZE);
        if (buf == NULL) {
                perror("malloc");
                return 1;
        }
        p = (uint32_t *)buf;
        /*
         * Seed with the last 16 words left in W[] by the calls above, then
         * extend with the same recurrence the transforms use to expand
         * their message schedule.
         */
        memcpy(p, W+80-16, 16*sizeof *p);
        for (i = 0; i < TEST_SIZE/sizeof *p - 16; i++) {
                uint32_t a = p[i+13] ^ p[i+8] ^ p[i+2] ^ p[i];
                p[i+16] = rol32(a, 1);
        }

        /*
         * The timed loops call each transform directly rather than through
         * a shared function pointer, so the measurement does not include an
         * indirect call.
         */
        sha_init(out);
        gettimeofday(&start, 0);
        for (i = 0; i < TEST_SIZE; i += 64)
                sha_transform(out, buf+i, W);
        gettimeofday(&stop, 0);
        print_digest("  One", out);
        printf(" -- %lu us\n", elapsed_us(&start, &stop));

        sha_init(out);
        gettimeofday(&start, 0);
        for (i = 0; i < TEST_SIZE; i += 64)
                sha_transform4(out, buf+i, W);
        gettimeofday(&stop, 0);
        print_digest(" Four", out);
        printf(" -- %lu us\n", elapsed_us(&start, &stop));

        sha_init(out);
        gettimeofday(&start, 0);
        for (i = 0; i < TEST_SIZE; i += 64)
                sha_transform2(out, buf+i);
        gettimeofday(&stop, 0);
        print_digest("  Two", out);
        printf(" -- %lu us\n", elapsed_us(&start, &stop));

        sha_init(out);
        gettimeofday(&start, 0);
        for (i = 0; i < TEST_SIZE; i += 64)
                sha_transform3(out, buf+i);
        gettimeofday(&stop, 0);
        print_digest("Three", out);
        printf(" -- %lu us\n", elapsed_us(&start, &stop));

        sha_init(out);
        gettimeofday(&start, 0);
        for (i = 0; i < TEST_SIZE; i += 64)
                sha_transform5(out, buf+i);
        gettimeofday(&stop, 0);
        print_digest(" Five", out);
        printf(" -- %lu us\n", elapsed_us(&start, &stop));

        free(buf);

        sha_stackwipe();
#endif

        return 0;
}
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to