Hi all,
the attached patch adds SHA1 support for VIA PadLock engine.
There are several design decisions that I may need to explain:
The xsha1 instruction always finalizes the MD computation, i.e. it is
not possible to call the hardware in sha1_update() with the provided
input buffer. Instead it's necessary to accumulate all data from the
update() calls in a buffer and hash it only in final().
The immediate problem here is the buffer size. For obvious reasons we
can't let it grow indefinitely. Instead there is a fixed maximum size (8k
in my patch) after which the engine falls back to software SHA1, pushes
all accumulated data to its update() method and never touches the hardware
again for that context.
The context structure looks like this:
struct padlock_digest_data {
void *buf_start, *buf_alloc;
ssize_t used;
unsigned long order:8, bypass:1;
SHA_CTX fallback_ctx;
};
In padlock_sha_init() I allocate a buffer of the given size (8k as well)
whose first 16B-aligned address goes to buf_start. Having the input data
aligned allows PadLock to crunch it faster.
I did some experiments with having a statically allocated buffer in this
structure (to avoid malloc() for small datasets), but it actually made
things slower. malloc() appears to be fast enough.
And yes, some numbers :-) These are from VIA Esther 1.2GHz:
type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes
sha1-sw 4108.58k 12323.46k 28142.76k 41445.20k 48078.85k
sha1-padlock 3321.16k 12656.98k 44152.58k 116508.25k 224807.59k
Please comment or commit.
Thanks!
Michal Ludvig
--
* Personal homepage: http://www.logix.cz/michal
Index: openssl-0.9.8-O2/crypto/engine/eng_padlock.c
===================================================================
--- openssl-0.9.8-O2.orig/crypto/engine/eng_padlock.c
+++ openssl-0.9.8-O2/crypto/engine/eng_padlock.c
@@ -74,11 +74,22 @@
#ifndef OPENSSL_NO_AES
#include <openssl/aes.h>
#endif
+#ifndef OPENSSL_NO_SHA
+#include <openssl/sha.h>
+#endif
#include <openssl/rand.h>
#ifndef OPENSSL_NO_HW
#ifndef OPENSSL_NO_HW_PADLOCK
+/* PadLock RNG is disabled by default */
+#define PADLOCK_NO_RNG 1
+
+/* No ASM routines for SHA in MSC yet */
+#ifdef _MSC_VER
+#define OPENSSL_NO_SHA
+#endif
+
/* Attempt to have a single source for both 0.9.7 and 0.9.8 :-) */
#if (OPENSSL_VERSION_NUMBER >= 0x00908000L)
# ifndef OPENSSL_NO_DYNAMIC_ENGINE
@@ -134,20 +145,41 @@ static int padlock_available(void);
static int padlock_init(ENGINE *e);
/* RNG Stuff */
+#ifndef PADLOCK_NO_RNG
static RAND_METHOD padlock_rand;
+#endif
/* Cipher Stuff */
#ifndef OPENSSL_NO_AES
static int padlock_ciphers(ENGINE *e, const EVP_CIPHER **cipher, const int
**nids, int nid);
#endif
+/* Digest Stuff */
+#ifndef OPENSSL_NO_SHA
+static int padlock_digests(ENGINE *e, const EVP_MD **digest, const int **nids,
int nid);
+#endif
+
/* Engine names */
static const char *padlock_id = "padlock";
static char padlock_name[100];
/* Available features */
-static int padlock_use_ace = 0; /* Advanced Cryptography Engine */
-static int padlock_use_rng = 0; /* Random Number Generator */
+enum padlock_flags {			/* bit mask of the PadLock units found by padlock_available() */
+	PADLOCK_RNG	= 0x01,		/* Random Number Generator */
+	PADLOCK_ACE	= 0x02,		/* Advanced Cryptography Engine */
+	PADLOCK_ACE2	= 0x04,		/* presumably the second ACE revision - confirm against VIA docs */
+	PADLOCK_PHE	= 0x08,		/* unit that gates registration of the digests */
+	PADLOCK_PMM	= 0x10		/* presumably the Montgomery multiplier - confirm against VIA docs */
+};
+static enum padlock_flags padlock_flags;	/* file-local, like the two static flags it replaces */
+
+#define PADLOCK_HAVE_RNG	(padlock_flags & PADLOCK_RNG)
+#define PADLOCK_HAVE_ACE	(padlock_flags & (PADLOCK_ACE|PADLOCK_ACE2))	/* either ACE revision */
+#define PADLOCK_HAVE_ACE1	(padlock_flags & PADLOCK_ACE)
+#define PADLOCK_HAVE_ACE2	(padlock_flags & PADLOCK_ACE2)
+#define PADLOCK_HAVE_PHE	(padlock_flags & PADLOCK_PHE)
+#define PADLOCK_HAVE_PMM	(padlock_flags & PADLOCK_PMM)
+
#ifndef OPENSSL_NO_AES
static int padlock_aes_align_required = 1;
#endif
@@ -161,25 +193,30 @@ padlock_bind_helper(ENGINE *e)
/* Check available features */
padlock_available();
-#if 1 /* disable RNG for now, see commentary in vicinity of RNG code */
- padlock_use_rng=0;
-#endif
-
/* Generate a nice engine name with available features */
BIO_snprintf(padlock_name, sizeof(padlock_name),
- "VIA PadLock (%s, %s)",
- padlock_use_rng ? "RNG" : "no-RNG",
- padlock_use_ace ? "ACE" : "no-ACE");
+ "VIA PadLock: %s%s%s%s%s",
+ padlock_flags ? "" : "not supported",
+ PADLOCK_HAVE_RNG ? "RNG " : "",
+ PADLOCK_HAVE_ACE ? (PADLOCK_HAVE_ACE2 ? "ACE2 " : "ACE ") : "",
+ PADLOCK_HAVE_PHE ? "PHE " : "",
+ PADLOCK_HAVE_PMM ? "PMM " : "");
/* Register everything or return with an error */
if (!ENGINE_set_id(e, padlock_id) ||
!ENGINE_set_name(e, padlock_name) ||
- !ENGINE_set_init_function(e, padlock_init) ||
+ !ENGINE_set_init_function(e, padlock_init)
#ifndef OPENSSL_NO_AES
- (padlock_use_ace && !ENGINE_set_ciphers (e, padlock_ciphers)) ||
+ || (PADLOCK_HAVE_ACE && !ENGINE_set_ciphers (e, padlock_ciphers))
+#endif
+#ifndef OPENSSL_NO_SHA
+ || (PADLOCK_HAVE_PHE && !ENGINE_set_digests (e, padlock_digests))
+#endif
+#ifndef PADLOCK_NO_RNG
+ || (PADLOCK_HAVE_RNG && !ENGINE_set_RAND (e, &padlock_rand))
#endif
- (padlock_use_rng && !ENGINE_set_RAND (e, &padlock_rand))) {
+ ) {
return 0;
}
@@ -209,7 +246,7 @@ ENGINE_padlock(void)
static int
padlock_init(ENGINE *e)
{
- return (padlock_use_rng || padlock_use_ace);
+ return (padlock_flags); /* non-zero as soon as any feature bit is set in padlock_flags */
}
/* This stuff is needed if this ENGINE is being compiled into a self-contained
@@ -236,6 +273,17 @@ IMPLEMENT_DYNAMIC_BIND_FN (padlock_bind_
/* ===== Here comes the "real" engine ===== */
+#ifdef __GNUC__
+#define likely(x)	__builtin_expect(!!(x), 1)	/* branch-prediction hint: x is usually true */
+#define unlikely(x)	__builtin_expect(!!(x), 0)	/* branch-prediction hint: x is usually false */
+#else
+#define likely(x)	(x)
+#define unlikely(x)	(x)
+#endif
+
+/* 32-bit word for the bswapl helpers; "unsigned int" is what the old padlock_bswapl() operated on */
+typedef unsigned int uint32_t;
+
#ifndef OPENSSL_NO_AES
/* Some AES-related constants */
#define AES_BLOCK_SIZE 16
@@ -359,10 +407,22 @@ padlock_available(void)
: "+a"(eax), "=d"(edx) : : "ecx");
/* Fill up some flags */
- padlock_use_ace = ((edx & (0x3<<6)) == (0x3<<6));
- padlock_use_rng = ((edx & (0x3<<2)) == (0x3<<2));
+	padlock_flags |= (((edx & (0x3<<2)) == (0x3<<2)) ? PADLOCK_RNG : 0);	/* bits 2-3: exists + enabled */
+	padlock_flags |= (((edx & (0x3<<6)) == (0x3<<6)) ? PADLOCK_ACE : 0);	/* bits 6-7, as in the code replaced */
+	padlock_flags |= (((edx & (0x3<<8)) == (0x3<<8)) ? PADLOCK_ACE2 : 0);	/* bits 8-9 */
+	padlock_flags |= (((edx & (0x3<<10)) == (0x3<<10)) ? PADLOCK_PHE : 0);	/* bits 10-11 */
+	padlock_flags |= (((edx & (0x3<<12)) == (0x3<<12)) ? PADLOCK_PMM : 0);	/* bits 12-13 */
- return padlock_use_ace + padlock_use_rng;
+ return padlock_flags;
+}
+
+/* Reverse the byte order of "count" consecutive 32-bit words, in place. */
+static inline void
+padlock_htonl_block(uint32_t *data, size_t count)
+{
+	uint32_t *end;
+	for (end = data + count; data != end; data++)
+		asm volatile ("bswapl %0" : "+r"(*data));
}
#ifndef OPENSSL_NO_AES
@@ -371,12 +431,9 @@ static inline void
padlock_bswapl(AES_KEY *ks)
{
size_t i = sizeof(ks->rd_key)/sizeof(ks->rd_key[0]);
- unsigned int *key = ks->rd_key;
+ uint32_t *key = (uint32_t*) ks->rd_key;
- while (i--) {
- asm volatile ("bswapl %0" : "+r"(*key));
- key++;
- }
+ padlock_htonl_block(key, i);
}
#endif
@@ -1047,6 +1104,250 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx,
#endif /* OPENSSL_NO_AES */
+#ifndef OPENSSL_NO_SHA
+
+#define PADLOCK_SHA_INIT_ORD 13 /* log2 of the buffer size allocated in padlock_sha_init(): 1 << 13 = 8192 bytes */
+#define PADLOCK_SHA_MAX_ORD 13 /* log2 of the size beyond which update() switches to software SHA1: 8192 bytes */
+
+/* Uncomment to count update() calls and print the totals to stderr from final(): #define PADLOCK_SHA_STAT 1 */
+
+/* Per-context state of the PadLock SHA1 digest; DIGEST_DATA() finds it
+ * in the md_data member of the EVP_MD_CTX.
+ * update() accumulates the input in a heap buffer and final() hashes it
+ * in one go; once "bypass" is set, fallback_ctx (software SHA1) is used
+ * instead of the buffer.
+ * Every field that matters must be set in padlock_sha_init(): the
+ * structure is deliberately not zeroed there, because clearing
+ * fallback_ctx would be wasted work. */
+struct padlock_digest_data {
+ SHA_CTX fallback_ctx; /* software SHA1 state; initialised by padlock_sha_bypass() */
+ void *buf_start, *buf_alloc; /* buf_alloc: pointer returned by malloc(); buf_start: aligned start inside it */
+ ssize_t used; /* number of bytes stored at buf_start so far */
+ unsigned long order:8, bypass:1; /* order: log2 of the buffer capacity; bypass: 1 = software path active */
+#ifdef PADLOCK_SHA_STAT
+ size_t stat_count, stat_total; /* update() calls / bytes seen; NOTE(review): not set in padlock_sha_init() */
+#endif
+};
+
+#ifdef PADLOCK_SHA_STAT
+size_t all_count, all_total;
+#endif
+
+/* PadLock digest state kept in the md_data member of an EVP_MD_CTX. */
+#define DIGEST_DATA(ctx)	((struct padlock_digest_data *)((ctx)->md_data))
+/* Bytes still free in the buffer of "ddata" (capacity is 1 << order). */
+#define DDATA_FREE(ddata)	((size_t)(1L << (ddata)->order) - (ddata)->used)
+
+/*
+ * Switch a context to the software SHA1 implementation.
+ *
+ * Starts fallback_ctx, feeds it everything buffered so far, releases the
+ * buffer and sets the bypass flag.  Calling it on a context that is
+ * already in bypass mode does nothing.
+ */
+static void
+padlock_sha_bypass(struct padlock_digest_data *ddata)
+{
+	if (ddata->bypass)
+		return;
+
+	SHA1_Init(&ddata->fallback_ctx);
+	if (ddata->buf_start && ddata->used > 0)
+		SHA1_Update(&ddata->fallback_ctx, ddata->buf_start, ddata->used);
+
+	/* The buffer is not used again in bypass mode, so release it right
+	 * away, also when it was still empty (free(NULL) is harmless). */
+	free(ddata->buf_alloc);
+	ddata->buf_alloc = NULL;
+	ddata->buf_start = NULL;
+	ddata->used = 0;
+	ddata->bypass = 1;
+}
+
+/*
+ * Hash "count" bytes starting at "in" with one "rep xsha1" instruction
+ * and store the resulting five 32-bit words, byte-swapped, at "out".
+ * "out" does not have to be aligned.
+ */
+static inline void
+padlock_do_sha1(char *in, char *out, int count)
+{
+	/* We can't let the hardware store directly to *out as it
+	 * doesn't have to be aligned, so work in an aligned scratch
+	 * area and copy the few result bytes out at the end. */
+	char buf[128+16];
+	char *output = (char *)NEAREST_ALIGNED(buf);
+	int zero = 0;
+
+	/* SHA1 initial hash value */
+	((uint32_t*)output)[0] = 0x67452301;
+	((uint32_t*)output)[1] = 0xEFCDAB89;
+	((uint32_t*)output)[2] = 0x98BADCFE;
+	((uint32_t*)output)[3] = 0x10325476;
+	((uint32_t*)output)[4] = 0xC3D2E1F0;
+
+	/* The instruction reads *in and rewrites *output behind the
+	 * compiler's back, hence the "memory" clobber.  The count and
+	 * EAX registers are declared in/out so that the compiler does
+	 * not rely on them surviving - NOTE(review): confirm against
+	 * the PadLock documentation whether xsha1 modifies them. */
+	asm volatile (".byte 0xf3,0x0f,0xa6,0xc8"	/* rep xsha1 */
+		      : "+S"(in), "+D"(output), "+c"(count), "+a"(zero)
+		      :
+		      : "memory");
+
+	/* Swap in the aligned scratch area, then copy: this avoids
+	 * 32-bit accesses to a possibly unaligned *out. */
+	padlock_htonl_block((uint32_t*)output, 5);
+	memcpy(out, output, 5 * sizeof(uint32_t));
+}
+
+static int
+padlock_sha_init(EVP_MD_CTX *ctx)
+{
+ struct padlock_digest_data *ddata = DIGEST_DATA(ctx);
+
+ ddata->used = 0;
+ ddata->bypass = 0;
+
+ ddata->order = PADLOCK_SHA_INIT_ORD;
+ ddata->buf_alloc = malloc((1L << ddata->order) + 16);
+ ddata->buf_start = NEAREST_ALIGNED(ddata->buf_alloc);
+
+ return 1;
+}
+
+/*
+ * Digest update callback: append "length" bytes at "data" to the
+ * accumulation buffer.  In bypass mode the data goes directly to the
+ * software SHA1 context.  When the data would not fit into a buffer of
+ * 1 << PADLOCK_SHA_MAX_ORD bytes, when the buffer cannot be enlarged,
+ * or when there is no buffer at all, the context is switched to bypass
+ * mode first.  Always returns 1.
+ */
+static int
+padlock_sha_update(EVP_MD_CTX *ctx, const void *data, size_t length)
+{
+	struct padlock_digest_data *ddata = DIGEST_DATA(ctx);
+
+#ifdef PADLOCK_SHA_STAT
+	ddata->stat_count++;
+	ddata->stat_total += length;
+	all_count++;
+	all_total += length;
+#endif
+	if (unlikely(ddata->bypass)) {
+		SHA1_Update(&ddata->fallback_ctx, data, length);
+		return 1;
+	}
+	if (unlikely(!ddata->buf_start))
+		goto fallback;	/* nothing to store the data in */
+
+	if (unlikely(DDATA_FREE(ddata) < length)) {
+		const size_t max_size = (size_t)1 << PADLOCK_SHA_MAX_ORD;
+		unsigned char *new_buf, *new_start;
+		size_t need, new_size, old_off;
+		unsigned int order;
+
+		/* Written as a subtraction so that a huge "length"
+		 * cannot wrap the sum around. */
+		if (likely(length > max_size - (size_t)ddata->used))
+			goto fallback;	/* too much data to be stored */
+
+		/* Resize the allocated buffer to the next power of two
+		 * that holds everything. */
+		need = (size_t)ddata->used + length;
+		order = ddata->order;
+		do {
+			order++;
+		} while (((size_t)1 << order) < need);
+		new_size = (size_t)1 << order;
+
+		/* Remember where the data sits relative to the start of
+		 * the allocation; realloc() preserves that offset, but
+		 * not necessarily the alignment of the allocation. */
+		old_off = (size_t)((unsigned char *)ddata->buf_start -
+				   (unsigned char *)ddata->buf_alloc);
+
+		new_buf = realloc(ddata->buf_alloc, new_size + 16);
+		if (!new_buf)
+			goto fallback;	/* the old buffer is still valid */
+
+		new_start = (unsigned char *)NEAREST_ALIGNED(new_buf);
+		if (new_start != new_buf + old_off)
+			memmove(new_start, new_buf + old_off, ddata->used);
+
+		ddata->order = order;
+		ddata->buf_alloc = new_buf;
+		ddata->buf_start = new_start;
+	}
+
+	memcpy((unsigned char *)ddata->buf_start + ddata->used, data, length);
+	ddata->used += length;
+
+	return 1;
+
+fallback:
+	/* Hand everything buffered so far, then the new data, to SW SHA1. */
+	padlock_sha_bypass(ddata);
+	SHA1_Update(&ddata->fallback_ctx, data, length);
+	return 1;
+}
+
+/*
+ * Digest final callback: write the SHA1 digest to "md".
+ * In bypass mode the software context is finalised; otherwise the whole
+ * buffered message is hashed by the hardware in one go and the buffer
+ * is released.  Always returns 1.
+ */
+static int
+padlock_sha_final(EVP_MD_CTX *ctx, unsigned char *md)
+{
+	struct padlock_digest_data *ddata = DIGEST_DATA(ctx);
+
+#ifdef PADLOCK_SHA_STAT
+	fprintf(stderr, "PadLock CTX: cnt=%zu, tot=%zu, avg=%zu\n",
+		ddata->stat_count, ddata->stat_total,
+		ddata->stat_count ? (ddata->stat_total/ddata->stat_count) : 0);
+	fprintf(stderr, "PadLock ALL: cnt=%zu, tot=%zu, avg=%zu\n",
+		all_count, all_total, all_count ? (all_total/all_count) : 0);
+#endif
+
+	/* Without a buffer there is nothing the hardware could read:
+	 * let the software implementation produce the digest. */
+	if (!ddata->bypass && !ddata->buf_start)
+		padlock_sha_bypass(ddata);
+
+	if (ddata->bypass) {
+		SHA1_Final(md, &ddata->fallback_ctx);
+		return 1;
+	}
+
+	/* Pass the input buffer to PadLock microcode... */
+	padlock_do_sha1((char *)ddata->buf_start, (char *)md, ddata->used);
+	free(ddata->buf_alloc);
+	ddata->buf_start = NULL;
+	ddata->buf_alloc = NULL;
+	ddata->used = 0;
+
+	return 1;
+}
+
+/*
+ * Digest copy callback: make "to" an independent duplicate of "from",
+ * including a private copy of the buffered data.
+ * Returns 1 on success and 0 when the buffer cannot be allocated; in
+ * that case "to" owns no buffer.
+ */
+static int
+padlock_sha_copy(EVP_MD_CTX *to,const EVP_MD_CTX *from)
+{
+	struct padlock_digest_data *ddata_from = DIGEST_DATA(from);
+	struct padlock_digest_data *ddata_to = DIGEST_DATA(to);
+
+	memcpy(ddata_to, ddata_from, sizeof(struct padlock_digest_data));
+
+	/* The memcpy() also duplicated the buffer pointers.  "to" must
+	 * never keep them, or both contexts would free the same block. */
+	ddata_to->buf_alloc = NULL;
+	ddata_to->buf_start = NULL;
+
+	if (!ddata_from->buf_start)
+		return 1;	/* nothing buffered, e.g. bypass mode */
+
+	/* Same size as in padlock_sha_init(): capacity plus alignment slack. */
+	ddata_to->buf_alloc = malloc((1L << ddata_to->order) + 16);
+	if (!ddata_to->buf_alloc)
+		return 0;
+
+	ddata_to->buf_start = NEAREST_ALIGNED(ddata_to->buf_alloc);
+	memcpy(ddata_to->buf_start, ddata_from->buf_start, ddata_from->used);
+
+	return 1;
+}
+
+/*
+ * Digest cleanup callback: release the accumulation buffer, if any,
+ * and wipe the per-context state.  Always returns 1.
+ */
+static int
+padlock_sha_cleanup(EVP_MD_CTX *ctx)
+{
+	struct padlock_digest_data *state = DIGEST_DATA(ctx);
+
+	/* free(NULL) is a no-op, so no check is needed here */
+	free(state->buf_alloc);
+	memset(state, 0, sizeof(*state));
+
+	return 1;
+}
+
+static const EVP_MD padlock_sha1_md = { /* positional initializer; member meanings follow EVP_MD in evp.h (not shown) */
+ NID_sha1, /* the digest NID that padlock_digests() answers for */
+ NID_sha1WithRSAEncryption, /* presumably the NID of the matching signature algorithm */
+ SHA_DIGEST_LENGTH, /* presumably the digest size; padlock_do_sha1() writes 5 32-bit words */
+ 0, /* presumably the flags member - confirm against evp.h */
+ padlock_sha_init, /* allocates the accumulation buffer */
+ padlock_sha_update, /* buffers the input or forwards it to software SHA1 */
+ padlock_sha_final, /* produces the digest */
+ padlock_sha_copy, /* duplicates the buffered data */
+ padlock_sha_cleanup, /* frees the buffer and clears the state */
+ EVP_PKEY_RSA_method, /* macro defined outside this file - TODO confirm which members it fills in */
+ SHA_CBLOCK, /* presumably the SHA1 input block size */
+ sizeof(struct padlock_digest_data), /* size of the state that DIGEST_DATA() casts md_data to */
+};
+
+/* NIDs of the digests this engine provides; padlock_digests() hands
+ * the table out through a "const int **", so it can be read-only. */
+static const int padlock_digest_nids[] = {
+	NID_sha1,
+	/* NID_sha256 is not provided (yet) */
+};
+
+/* Number of entries in padlock_digest_nids[]. */
+static const int padlock_digest_nids_num =
+	sizeof(padlock_digest_nids)/sizeof(padlock_digest_nids[0]);
+
+/*
+ * Digest selector registered with ENGINE_set_digests().
+ * With digest == NULL it stores the table of supported NIDs in *nids
+ * and returns their number.  Otherwise it stores the EVP_MD for "nid"
+ * in *digest and returns 1, or stores NULL and returns 0 when the NID
+ * is not supported.
+ */
+static int
+padlock_digests (ENGINE *e, const EVP_MD **digest, const int **nids, int nid)
+{
+	/* Caller only wants to know what we support */
+	if (digest == NULL) {
+		*nids = padlock_digest_nids;
+		return padlock_digest_nids_num;
+	}
+
+	/* SHA1 is the only digest on offer */
+	if (nid == NID_sha1) {
+		*digest = &padlock_sha1_md;
+		return 1;
+	}
+
+	/* Sorry, we don't support this NID */
+	*digest = NULL;
+	return 0;
+}
+
+#endif /* OPENSSL_NO_SHA */
+
+#ifndef PADLOCK_NO_RNG
/* ===== Random Number Generator ===== */
/*
* This code is not engaged. The reason is that it does not comply
@@ -1102,6 +1403,7 @@ static RAND_METHOD padlock_rand = {
padlock_rand_bytes, /* pseudorand */
padlock_rand_status, /* rand status */
};
+#endif /* PADLOCK_NO_RNG */
#endif /* COMPILE_HW_PADLOCK */