On a SPARC-T4, with AES opcodes disabled (OPENSSL_sparcv9cap=0):

type             16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes
aes-128 cbc      75200.21k    83425.11k    86767.67k    87853.06k    88279.72k
aes-192 cbc      64906.68k    71059.56k    73902.42k    74532.52k    74855.77k
aes-256 cbc      56814.90k    61781.72k    63903.74k    64367.27k    64607.57k

And with them enabled:

type             16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes
aes-128 cbc     501882.74k   836726.87k   993102.76k  1020379.48k  1054083.75k
aes-192 cbc     435068.22k   707080.77k   837915.90k   864243.03k   889279.83k
aes-256 cbc     393746.28k   620463.13k   727483.31k   749580.97k   769029.46k

This system is a T4-2 so it's fun to show off some parallel benchmarks,
for example "openssl speed -multi 16 -evp aes-128-ecb" gives:

type             16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes
evp            7429568.93k 17815630.93k 28436597.93k 32033047.55k 35120630.44k

35GB/sec AES encryption, not too bad.

Currently CBC, ECB, CTR, OFB, and CFB modes are explicitly optimized.
Other modes will be optimized in the future.

Signed-off-by: David S. Miller <da...@davemloft.net>
---
 Configure                     |    2 +-
 crypto/aes/aes_sparccore.c    |   55 ++++
 crypto/aes/asm/aes-sparcv9.pl |  666 +++++++++++++++++++++++++++++++++++++++++
 crypto/evp/e_aes.c            |  400 +++++++++++++++++++++++++
 crypto/sparc_arch.h           |   19 ++
 5 files changed, 1141 insertions(+), 1 deletion(-)

diff --git a/Configure b/Configure
index 66b4ff8..217a552 100755
--- a/Configure
+++ b/Configure
@@ -130,7 +130,7 @@ my $x86_elf_asm="$x86_asm:elf";
 
 my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o 
x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o 
aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o 
sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o 
cmll_misc.o:ghash-x86_64.o:e_padlock-x86_64.o";
 my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o 
aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o 
rc4_skey.o:::::ghash-ia64.o::void";
-my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o 
sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_sparccore.o aes_cbc.o 
aes-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o 
sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
+my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o 
sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_sparccore.o 
aes-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o 
sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
 my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
 my $alpha_asm="alphacpuid.o:bn_asm.o 
alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
 my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o 
sha256-mips.o sha512-mips.o::::::::";
diff --git a/crypto/aes/aes_sparccore.c b/crypto/aes/aes_sparccore.c
index 2842cbc..658cc66 100644
--- a/crypto/aes/aes_sparccore.c
+++ b/crypto/aes/aes_sparccore.c
@@ -36,6 +36,7 @@
 #include <stdlib.h>
 #include <openssl/crypto.h>
 #include <openssl/aes.h>
+#include <openssl/modes.h>
 #include "aes_locl.h"
 
 #include "sparc_arch.h"
@@ -270,3 +271,57 @@ int AES_set_decrypt_key(const unsigned char *userKey, 
const int bits,
        }
        return 0;
 }
+
+void aes_sparc_hw_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                             size_t length, const AES_KEY *key,
+                             unsigned char *ivec, int enc);
+
+/*
+ * AES in CBC mode over len bytes from in to out, chaining through ivec.
+ * enc != 0 encrypts, enc == 0 decrypts.
+ *
+ * When OPENSSL_sparcv9cap_P has SPARCV9_AES set, the whole 16-byte blocks
+ * are handed to aes_sparc_hw_cbc_encrypt().  That routine is only called
+ * with 8-byte aligned buffers here: a misaligned out is replaced by a
+ * temporary heap buffer, and a misaligned in is first copied into the
+ * (aligned) output buffer and processed in place.
+ *
+ * Everything the hardware path does not consume (a trailing partial
+ * block, anything beyond the per-call cap, or the whole request when the
+ * capability bit is clear or the temporary buffer cannot be allocated)
+ * goes through the generic CRYPTO_cbc128_{en,de}crypt() code.
+ */
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                    size_t len, const AES_KEY *key,
+                    unsigned char *ivec, const int enc)
+{
+       const size_t bl = 16;
+       /*
+        * Upper bound for one hardware run.  The assembly loop tests only
+        * the 32-bit condition codes of its length counter, and the byte
+        * count must also stay representable as a positive int for the
+        * allocator, so never pass more than 1GiB at once.
+        */
+       const size_t hw_max = (size_t)1 << 30;
+       const void *aligned_in;
+       void *aligned_out;
+       /* size_t, not int: len may exceed INT_MAX on 64-bit builds. */
+       size_t aligned_len;
+
+       if (!(OPENSSL_sparcv9cap_P & SPARCV9_AES))
+               goto slow;
+
+       aligned_len = len & ~(bl - 1);
+       if (aligned_len > hw_max)
+               aligned_len = hw_max;
+       if (!aligned_len)
+               goto trailing;
+
+       aligned_out = out;
+       if ((unsigned long) out & 0x7) {
+               aligned_out = OPENSSL_malloc(aligned_len);
+               if (!aligned_out)
+                       goto slow;
+       }
+       aligned_in = in;
+       if ((unsigned long)in & 0x7) {
+               memcpy(aligned_out, in, aligned_len);
+               aligned_in = (const void *) aligned_out;
+       }
+
+       aes_sparc_hw_cbc_encrypt(aligned_in, aligned_out, aligned_len,
+                                key, ivec, enc);
+
+       if ((unsigned long)out & 0x7) {
+               memcpy(out, aligned_out, aligned_len);
+               /* The bounce buffer may hold plaintext; wipe before free. */
+               OPENSSL_cleanse(aligned_out, aligned_len);
+               OPENSSL_free(aligned_out);
+       }
+trailing:
+       len -= aligned_len;
+       if (len) {
+               out += aligned_len;
+               in += aligned_len;
+slow:
+               /* Entered via goto with in/out/len still untouched. */
+               if (enc)
+                       CRYPTO_cbc128_encrypt(in, out, len, key, ivec,
+                                             (block128_f)AES_encrypt);
+               else
+                       CRYPTO_cbc128_decrypt(in, out, len, key, ivec,
+                                             (block128_f)AES_decrypt);
+       }
+}
diff --git a/crypto/aes/asm/aes-sparcv9.pl b/crypto/aes/asm/aes-sparcv9.pl
index f022b7b..0d47bdb 100755
--- a/crypto/aes/asm/aes-sparcv9.pl
+++ b/crypto/aes/asm/aes-sparcv9.pl
@@ -1651,6 +1651,672 @@ aes_sparc_hw_expand_key:
 .type  aes_sparc_hw_expand_key,#function
 .size  aes_sparc_hw_expand_key,(.-aes_sparc_hw_expand_key)
 
+/*
+ * SETUP_KEY_AND_ROUNDS(KEY, ROUNDS, TMP1, TMP2)
+ *
+ * All arguments are register names written without the leading percent
+ * sign.  The macro tests bit 2 (0x4) of the address held in KEY:
+ *   - bit clear: ROUNDS is loaded from the 32-bit word at KEY + 240 and
+ *     KEY is left unchanged;
+ *   - bit set:   ROUNDS is loaded from the 32-bit word at the original
+ *     KEY address and KEY is advanced by 4.
+ * In both cases KEY has bit 2 clear afterwards.  TMP1 and TMP2 are
+ * clobbered and andcc modifies the integer condition codes.
+ * The entry points compare the loaded value against 12 to pick the
+ * 128/192/256-bit code path.
+ * NOTE(review): the two layouts are presumably produced by the key
+ * expansion routine; confirm against aes_sparc_hw_expand_key.
+ */
+#define SETUP_KEY_AND_ROUNDS(KEY, ROUNDS, TMP1, TMP2)  \\
+       andcc   %KEY, 0x4, %TMP1;       \\
+       mov     %KEY, %TMP2;            \\
+       add     %KEY, 240, %ROUNDS;     \\
+       movne   %icc, %TMP2, %ROUNDS;   \\
+       add     %KEY, %TMP1, %KEY;      \\
+       ld      [%ROUNDS], %ROUNDS;
+
+/*
+ * LOAD_ENCRYPT_KEY_{128,192,256}(KEY)
+ *
+ * Load the key schedule, starting at byte offset 0x10 from the register
+ * named by KEY, into consecutive even-numbered floating point registers
+ * in memory order:
+ *   _128: offsets 0x10..0xa8 -> %f8..%f46
+ *   _192: the above plus offsets 0xb0..0xc8 -> %f48..%f54
+ *   _256: the above plus offsets 0xd0..0xe8 -> %f56..%f62
+ * The first 16 bytes of the schedule (offsets 0x00 and 0x08) are not
+ * loaded here; the entry points fetch them into integer registers with
+ * ldx.
+ */
+#define LOAD_ENCRYPT_KEY_128(KEY)      \\
+       ldd     [%KEY + 0x10], %f8;     \\
+       ldd     [%KEY + 0x18], %f10;    \\
+       ldd     [%KEY + 0x20], %f12;    \\
+       ldd     [%KEY + 0x28], %f14;    \\
+       ldd     [%KEY + 0x30], %f16;    \\
+       ldd     [%KEY + 0x38], %f18;    \\
+       ldd     [%KEY + 0x40], %f20;    \\
+       ldd     [%KEY + 0x48], %f22;    \\
+       ldd     [%KEY + 0x50], %f24;    \\
+       ldd     [%KEY + 0x58], %f26;    \\
+       ldd     [%KEY + 0x60], %f28;    \\
+       ldd     [%KEY + 0x68], %f30;    \\
+       ldd     [%KEY + 0x70], %f32;    \\
+       ldd     [%KEY + 0x78], %f34;    \\
+       ldd     [%KEY + 0x80], %f36;    \\
+       ldd     [%KEY + 0x88], %f38;    \\
+       ldd     [%KEY + 0x90], %f40;    \\
+       ldd     [%KEY + 0x98], %f42;    \\
+       ldd     [%KEY + 0xa0], %f44;    \\
+       ldd     [%KEY + 0xa8], %f46;
+
+#define LOAD_ENCRYPT_KEY_192(KEY)      \\
+       LOAD_ENCRYPT_KEY_128(KEY)       \\
+       ldd     [%KEY + 0xb0], %f48;    \\
+       ldd     [%KEY + 0xb8], %f50;    \\
+       ldd     [%KEY + 0xc0], %f52;    \\
+       ldd     [%KEY + 0xc8], %f54;
+
+#define LOAD_ENCRYPT_KEY_256(KEY)      \\
+       LOAD_ENCRYPT_KEY_192(KEY)       \\
+       ldd     [%KEY + 0xd0], %f56;    \\
+       ldd     [%KEY + 0xd8], %f58;    \\
+       ldd     [%KEY + 0xe0], %f60;    \\
+       ldd     [%KEY + 0xe8], %f62;
+
+/*
+ * LOAD_DECRYPT_KEY_{128,192,256}(KEY)
+ *
+ * Same register range and memory range as the LOAD_ENCRYPT_KEY_*
+ * macros (%f8 upward, offsets 0x10 upward), but within every 16-byte
+ * pair the two doublewords are swapped: the higher offset goes to the
+ * lower-numbered register (0x18 -> %f8, 0x10 -> %f10, and so on).
+ *   _128 fills %f8..%f46, _192 adds %f48..%f54, _256 adds %f56..%f62.
+ * NOTE(review): the swap presumably matches the operand order expected
+ * by the AES_DROUND23/AES_DROUND01 macros defined elsewhere; confirm.
+ */
+#define LOAD_DECRYPT_KEY_128(KEY)      \\
+       ldd     [%KEY + 0x18], %f8;     \\
+       ldd     [%KEY + 0x10], %f10;    \\
+       ldd     [%KEY + 0x28], %f12;    \\
+       ldd     [%KEY + 0x20], %f14;    \\
+       ldd     [%KEY + 0x38], %f16;    \\
+       ldd     [%KEY + 0x30], %f18;    \\
+       ldd     [%KEY + 0x48], %f20;    \\
+       ldd     [%KEY + 0x40], %f22;    \\
+       ldd     [%KEY + 0x58], %f24;    \\
+       ldd     [%KEY + 0x50], %f26;    \\
+       ldd     [%KEY + 0x68], %f28;    \\
+       ldd     [%KEY + 0x60], %f30;    \\
+       ldd     [%KEY + 0x78], %f32;    \\
+       ldd     [%KEY + 0x70], %f34;    \\
+       ldd     [%KEY + 0x88], %f36;    \\
+       ldd     [%KEY + 0x80], %f38;    \\
+       ldd     [%KEY + 0x98], %f40;    \\
+       ldd     [%KEY + 0x90], %f42;    \\
+       ldd     [%KEY + 0xa8], %f44;    \\
+       ldd     [%KEY + 0xa0], %f46;
+
+#define LOAD_DECRYPT_KEY_192(KEY)      \\
+       LOAD_DECRYPT_KEY_128(KEY)       \\
+       ldd     [%KEY + 0xb8], %f48;    \\
+       ldd     [%KEY + 0xb0], %f50;    \\
+       ldd     [%KEY + 0xc8], %f52;    \\
+       ldd     [%KEY + 0xc0], %f54;
+
+#define LOAD_DECRYPT_KEY_256(KEY)      \\
+       LOAD_DECRYPT_KEY_192(KEY)       \\
+       ldd     [%KEY + 0xd8], %f56;    \\
+       ldd     [%KEY + 0xd0], %f58;    \\
+       ldd     [%KEY + 0xe8], %f60;    \\
+       ldd     [%KEY + 0xe0], %f62;
+
+/*
+ * Single-block round sequences for 192- and 256-bit keys.
+ *
+ * KEY_BASE is the number of the first floating point register holding
+ * the preloaded key schedule, I0/I1 are the register numbers of the
+ * 128-bit state and T0/T1 are scratch register numbers.  Each line
+ * advances KEY_BASE by 8 register numbers (four doublewords):
+ *   *_192: five *_TWO_ROUNDS steps plus one *_TWO_ROUNDS_LAST step;
+ *   *_256: six  *_TWO_ROUNDS steps plus one *_TWO_ROUNDS_LAST step.
+ * ENCRYPT_TWO_ROUNDS, DECRYPT_TWO_ROUNDS and their _LAST variants are
+ * defined outside this part of the file.
+ */
+#define ENCRYPT_192(KEY_BASE, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 40, I0, I1, T0, T1)
+
+#define ENCRYPT_256(KEY_BASE, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS(KEY_BASE + 40, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 48, I0, I1, T0, T1)
+
+#define DECRYPT_192(KEY_BASE, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS_LAST(KEY_BASE + 40, I0, I1, T0, T1)
+
+#define DECRYPT_256(KEY_BASE, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS(KEY_BASE + 40, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS_LAST(KEY_BASE + 48, I0, I1, T0, T1)
+
+/*
+ * Two encryption rounds applied to two independent blocks at once.
+ *
+ * (I0,I1) is the state of the first block, (I2,I3) the state of the
+ * second; T0..T3 are scratch.  All arguments are floating point
+ * register numbers.  The first round reads I* and writes T* using the
+ * key registers KEY_BASE+0 and KEY_BASE+2; the second round reads T*
+ * and writes back to I* using KEY_BASE+4 and KEY_BASE+6.  The
+ * instructions of the two blocks are interleaved.
+ * The _LAST_2 form differs only in using the _L variants of the round
+ * macros for the second round.
+ * AES_EROUND01/23 and their _L forms are defined outside this part of
+ * the file (NOTE(review): presumably sparc_arch.h; confirm).
+ */
+#define ENCRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       AES_EROUND01(KEY_BASE +  0, I0, I1, T0) \\
+       AES_EROUND23(KEY_BASE +  2, I0, I1, T1) \\
+       AES_EROUND01(KEY_BASE +  0, I2, I3, T2) \\
+       AES_EROUND23(KEY_BASE +  2, I2, I3, T3) \\
+       AES_EROUND01(KEY_BASE +  4, T0, T1, I0) \\
+       AES_EROUND23(KEY_BASE +  6, T0, T1, I1) \\
+       AES_EROUND01(KEY_BASE +  4, T2, T3, I2) \\
+       AES_EROUND23(KEY_BASE +  6, T2, T3, I3)
+
+#define ENCRYPT_TWO_ROUNDS_LAST_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       AES_EROUND01(KEY_BASE +  0, I0, I1, T0) \\
+       AES_EROUND23(KEY_BASE +  2, I0, I1, T1) \\
+       AES_EROUND01(KEY_BASE +  0, I2, I3, T2) \\
+       AES_EROUND23(KEY_BASE +  2, I2, I3, T3) \\
+       AES_EROUND01_L(KEY_BASE +  4, T0, T1, I0) \\
+       AES_EROUND23_L(KEY_BASE +  6, T0, T1, I1) \\
+       AES_EROUND01_L(KEY_BASE +  4, T2, T3, I2) \\
+       AES_EROUND23_L(KEY_BASE +  6, T2, T3, I3)
+
+/*
+ * Full two-block encryption round sequences for 128- and 192-bit keys.
+ * Each line consumes four key doublewords (KEY_BASE advances by 8
+ * register numbers): _128_2 runs four ENCRYPT_TWO_ROUNDS_2 steps and a
+ * final ENCRYPT_TWO_ROUNDS_LAST_2, _192_2 runs five plus the final one.
+ * T0..T3 are scratch registers distinct from the key registers.
+ */
+#define ENCRYPT_128_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       ENCRYPT_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       ENCRYPT_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       ENCRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3)
+
+#define ENCRYPT_192_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       ENCRYPT_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       ENCRYPT_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       ENCRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 40, I0, I1, I2, I3, T0, T1, T2, T3)
+
+/*
+ * Two-block encryption for 256-bit keys.
+ *
+ * ENCRYPT_256_TWO_ROUNDS_2 is ENCRYPT_TWO_ROUNDS_2 with the four scratch
+ * registers given as TMP_BASE+0, +2, +4 and +6.
+ *
+ * ENCRYPT_256_2 has no registers left over for scratch, so it borrows
+ * key registers and reloads them from memory:
+ *   - the first step uses KEY_BASE+48..54 as scratch, after which those
+ *     registers (%f56..%f62) are reloaded from offsets 0xd0..0xe8;
+ *   - all later steps use KEY_BASE+0..6 as scratch, and those registers
+ *     (%f8..%f14) are reloaded from offsets 0x10..0x28 at the end.
+ * The reloads are hard-coded to %o3 as the key pointer and to %f8/%f56
+ * as destinations, so the macro is only correct with KEY_BASE == 8 and
+ * with %o3 still pointing at the key schedule.  The last two rounds are
+ * written out by hand so that the reloads can be interleaved with them.
+ * IGN0..IGN3 are accepted for signature compatibility with the 128/192
+ * variants and are not used.
+ */
+#define ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, TMP_BASE) \\
+       ENCRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, \\
+                            TMP_BASE + 0, TMP_BASE + 2, TMP_BASE + 4, TMP_BASE 
+ 6)
+
+#define ENCRYPT_256_2(KEY_BASE, I0, I1, I2, I3, IGN0, IGN1, IGN2, IGN3) \\
+       ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, KEY_BASE + 48) 
\\
+       ldd     [%o3 + 0xd0], %f56; \\
+       ldd     [%o3 + 0xd8], %f58; \\
+       ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, KEY_BASE +  0) 
\\
+       ldd     [%o3 + 0xe0], %f60; \\
+       ldd     [%o3 + 0xe8], %f62; \\
+       ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, KEY_BASE +  0) 
\\
+       ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, KEY_BASE +  0) 
\\
+       ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, KEY_BASE +  0) 
\\
+       ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 40, I0, I1, I2, I3, KEY_BASE +  0) 
\\
+       AES_EROUND01(KEY_BASE +  48, I0, I1, KEY_BASE + 0) \\
+       AES_EROUND23(KEY_BASE +  50, I0, I1, KEY_BASE + 2) \\
+       AES_EROUND01(KEY_BASE +  48, I2, I3, KEY_BASE + 4) \\
+       AES_EROUND23(KEY_BASE +  50, I2, I3, KEY_BASE + 6) \\
+       AES_EROUND01_L(KEY_BASE +  52, KEY_BASE + 0, KEY_BASE + 2, I0) \\
+       AES_EROUND23_L(KEY_BASE +  54, KEY_BASE + 0, KEY_BASE + 2, I1) \\
+       ldd     [%o3 + 0x10], %f8; \\
+       ldd     [%o3 + 0x18], %f10; \\
+       AES_EROUND01_L(KEY_BASE +  52, KEY_BASE + 4, KEY_BASE + 6, I2) \\
+       AES_EROUND23_L(KEY_BASE +  54, KEY_BASE + 4, KEY_BASE + 6, I3) \\
+       ldd     [%o3 + 0x20], %f12; \\
+       ldd     [%o3 + 0x28], %f14;
+
+/*
+ * Two decryption rounds applied to two independent blocks at once.
+ *
+ * Same register roles as ENCRYPT_TWO_ROUNDS_2: (I0,I1) and (I2,I3) are
+ * the two block states, T0..T3 are scratch, all given as floating point
+ * register numbers.  Here the 23 half is issued first with key register
+ * KEY_BASE+0 (writing T1/T3) and the 01 half second with KEY_BASE+2
+ * (writing T0/T2); the second round uses KEY_BASE+4 and KEY_BASE+6 and
+ * writes the result back into I*.
+ * The _LAST_2 form differs only in using the _L variants of the round
+ * macros for the second round.
+ * AES_DROUND01/23 and their _L forms are defined outside this part of
+ * the file.
+ */
+#define DECRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       AES_DROUND23(KEY_BASE +  0, I0, I1, T1) \\
+       AES_DROUND01(KEY_BASE +  2, I0, I1, T0) \\
+       AES_DROUND23(KEY_BASE +  0, I2, I3, T3) \\
+       AES_DROUND01(KEY_BASE +  2, I2, I3, T2) \\
+       AES_DROUND23(KEY_BASE +  4, T0, T1, I1) \\
+       AES_DROUND01(KEY_BASE +  6, T0, T1, I0) \\
+       AES_DROUND23(KEY_BASE +  4, T2, T3, I3) \\
+       AES_DROUND01(KEY_BASE +  6, T2, T3, I2)
+
+#define DECRYPT_TWO_ROUNDS_LAST_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       AES_DROUND23(KEY_BASE +  0, I0, I1, T1) \\
+       AES_DROUND01(KEY_BASE +  2, I0, I1, T0) \\
+       AES_DROUND23(KEY_BASE +  0, I2, I3, T3) \\
+       AES_DROUND01(KEY_BASE +  2, I2, I3, T2) \\
+       AES_DROUND23_L(KEY_BASE +  4, T0, T1, I1) \\
+       AES_DROUND01_L(KEY_BASE +  6, T0, T1, I0) \\
+       AES_DROUND23_L(KEY_BASE +  4, T2, T3, I3) \\
+       AES_DROUND01_L(KEY_BASE +  6, T2, T3, I2)
+
+/*
+ * Full two-block decryption round sequences for 128- and 192-bit keys.
+ * Each line consumes four key doublewords (KEY_BASE advances by 8
+ * register numbers): _128_2 runs four DECRYPT_TWO_ROUNDS_2 steps and a
+ * final DECRYPT_TWO_ROUNDS_LAST_2, _192_2 runs five plus the final one.
+ * T0..T3 are scratch registers distinct from the key registers.
+ */
+#define DECRYPT_128_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       DECRYPT_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       DECRYPT_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       DECRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       DECRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       DECRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3)
+
+#define DECRYPT_192_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       DECRYPT_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       DECRYPT_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       DECRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       DECRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       DECRYPT_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3) \\
+       DECRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 40, I0, I1, I2, I3, T0, T1, T2, T3)
+
+/*
+ * Two-block decryption for 256-bit keys.
+ *
+ * DECRYPT_256_TWO_ROUNDS_2 is DECRYPT_TWO_ROUNDS_2 with the four scratch
+ * registers given as TMP_BASE+0, +2, +4 and +6.
+ *
+ * DECRYPT_256_2 mirrors ENCRYPT_256_2: the first step borrows
+ * KEY_BASE+48..54 as scratch and then reloads %f56..%f62, the later
+ * steps borrow KEY_BASE+0..6 and %f8..%f14 are reloaded at the end.
+ * The reloads use the swapped pair order of LOAD_DECRYPT_KEY_256
+ * (0xd8 -> %f56, 0xd0 -> %f58, ..., 0x18 -> %f8, 0x10 -> %f10, ...).
+ * They are hard-coded to %o3 as the key pointer and to %f8/%f56 as
+ * destinations, so the macro is only correct with KEY_BASE == 8 and
+ * with %o3 still pointing at the key schedule.
+ * IGN0..IGN3 are accepted for signature compatibility with the 128/192
+ * variants and are not used.
+ */
+#define DECRYPT_256_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, TMP_BASE) \\
+       DECRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, \\
+                            TMP_BASE + 0, TMP_BASE + 2, TMP_BASE + 4, TMP_BASE 
+ 6)
+
+#define DECRYPT_256_2(KEY_BASE, I0, I1, I2, I3, IGN0, IGN1, IGN2, IGN3) \\
+       DECRYPT_256_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, KEY_BASE + 48) 
\\
+       ldd     [%o3 + 0xd8], %f56; \\
+       ldd     [%o3 + 0xd0], %f58; \\
+       DECRYPT_256_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, KEY_BASE +  0) 
\\
+       ldd     [%o3 + 0xe8], %f60; \\
+       ldd     [%o3 + 0xe0], %f62; \\
+       DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, KEY_BASE +  0) 
\\
+       DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, KEY_BASE +  0) 
\\
+       DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, KEY_BASE +  0) 
\\
+       DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 40, I0, I1, I2, I3, KEY_BASE +  0) 
\\
+       AES_DROUND23(KEY_BASE +  48, I0, I1, KEY_BASE + 2) \\
+       AES_DROUND01(KEY_BASE +  50, I0, I1, KEY_BASE + 0) \\
+       AES_DROUND23(KEY_BASE +  48, I2, I3, KEY_BASE + 6) \\
+       AES_DROUND01(KEY_BASE +  50, I2, I3, KEY_BASE + 4) \\
+       AES_DROUND23_L(KEY_BASE +  52, KEY_BASE + 0, KEY_BASE + 2, I1) \\
+       AES_DROUND01_L(KEY_BASE +  54, KEY_BASE + 0, KEY_BASE + 2, I0) \\
+       ldd     [%o3 + 0x18], %f8; \\
+       ldd     [%o3 + 0x10], %f10; \\
+       AES_DROUND23_L(KEY_BASE +  52, KEY_BASE + 4, KEY_BASE + 6, I3) \\
+       AES_DROUND01_L(KEY_BASE +  54, KEY_BASE + 4, KEY_BASE + 6, I2) \\
+       ldd     [%o3 + 0x28], %f12; \\
+       ldd     [%o3 + 0x20], %f14;
+
+/*
+ * ECB_CRYPT_IMPL(KEYLEN, ED, KEY, SRC, DST, LEN)
+ *
+ * Body of one ECB code path; ends in retl, so expansions never fall
+ * through into each other.  KEYLEN is 128, 192 or 256 and ED is EN or
+ * DE; both are pasted into macro and label names.  KEY, SRC, DST and LEN
+ * are register names without the percent sign.
+ *
+ * On entry %g1 and %g2 must hold the first 16 bytes of the key schedule
+ * (the entry point loads them with ldx); the rest is loaded here into
+ * the floating point registers.
+ *
+ * LEN is biased by -0x10 up front.  If that yields zero exactly one
+ * block is pending and control jumps to the single-block tail at 10.
+ * Otherwise the loop handles two blocks (0x20 bytes) per iteration and
+ * afterwards: positive -> loop again, negative -> done (11), zero ->
+ * one block left, fall into 10.  LEN is therefore expected to be a
+ * non-zero multiple of 16; a LEN of zero is not detected and enters the
+ * two-block loop.  The branches test the 32-bit condition codes.
+ *
+ * Each input doubleword is XORed with %g1/%g2 in the integer unit and
+ * then moved to a floating point register by the MOVXTOD_* macros
+ * (defined outside this part of the file).  Clobbers %g3, %g5, %o4,
+ * %o5, %f0..%f6 and the scratch registers %f56..%f62.
+ */
+#define ECB_CRYPT_IMPL(KEYLEN, ED, KEY, SRC, DST, LEN) \\
+       LOAD_##ED##CRYPT_KEY_##KEYLEN##(KEY);           \\
+       subcc   %LEN, 0x10, %LEN;                       \\
+       be      10f;                                    \\
+        nop;                                           \\
+.Lecb_##ED##crypt_##KEYLEN##_loop:                     \\
+       ldx     [%SRC + 0x00], %g3;                     \\
+       ldx     [%SRC + 0x08], %g5;                     \\
+       ldx     [%SRC + 0x10], %o4;                     \\
+       ldx     [%SRC + 0x18], %o5;                     \\
+       xor     %g1, %g3, %g3;                          \\
+       xor     %g2, %g5, %g5;                          \\
+       MOVXTOD_G3_F4;                                  \\
+       MOVXTOD_G5_F6;                                  \\
+       xor     %g1, %o4, %g3;                          \\
+       xor     %g2, %o5, %g5;                          \\
+       MOVXTOD_G3_F0;                                  \\
+       MOVXTOD_G5_F2;                                  \\
+       ##ED##CRYPT_##KEYLEN##_2(8, 4, 6, 0, 2, 56, 58, 60, 62);\\
+       std     %f4, [%DST + 0x00];                     \\
+       std     %f6, [%DST + 0x08];                     \\
+       std     %f0, [%DST + 0x10];                     \\
+       std     %f2, [%DST + 0x18];                     \\
+       subcc   %LEN, 0x20, %LEN;                       \\
+       add     %SRC, 0x20, %SRC;                       \\
+       bgt,pt  %icc, .Lecb_##ED##crypt_##KEYLEN##_loop;\\
+        add    %DST, 0x20, %DST;                       \\
+       blt,pt  %icc, 11f;                              \\
+        nop;                                           \\
+10:    ldx     [%SRC + 0x00], %g3;                     \\
+       ldx     [%SRC + 0x08], %g5;                     \\
+       xor     %g1, %g3, %g3;                          \\
+       xor     %g2, %g5, %g5;                          \\
+       MOVXTOD_G3_F4;                                  \\
+       MOVXTOD_G5_F6;                                  \\
+       ##ED##CRYPT_##KEYLEN##(8, 4, 6, 0, 2);          \\
+       std     %f4, [%DST + 0x00];                     \\
+       std     %f6, [%DST + 0x08];                     \\
+11:    retl;                                           \\
+        nop;
+
+       /*
+        * aes_sparc_hw_ecb_encrypt: ECB entry point.  Dispatches on the
+        * enc flag and on the rounds word to one of six ECB_CRYPT_IMPL
+        * expansions, each of which returns with retl.
+        */
+       .align  32
+       .globl  aes_sparc_hw_ecb_encrypt
+aes_sparc_hw_ecb_encrypt:
+       /* %o0=in, %o1=out, %o2=len, %o3=KEY, %o4=enc */
+       /* Rounds word ends up in %g1; %g2 and %g3 are scratch here. */
+       SETUP_KEY_AND_ROUNDS(o3, g1, g2, g3)
+       /*
+        * enc == 0 selects decryption.  The cmp in the delay slot runs on
+        * both paths and sets the flags consumed by the bl/be pairs
+        * below: rounds < 12 -> 128, rounds == 12 -> 192, else 256.
+        */
+       cmp     %o4, 0
+       be      .Lecb_decrypt
+        cmp    %g1, 12
+
+       /* %g1/%g2 = first 16 bytes of the key schedule (ldx keeps flags). */
+       ldx     [%o3 + 0x00], %g1
+       bl      .Lecb_encrypt_128
+        ldx    [%o3 + 0x08], %g2
+       be      .Lecb_encrypt_192
+        nop
+
+       ECB_CRYPT_IMPL(256, EN, o3, o0, o1, o2)
+
+.Lecb_encrypt_192:
+       ECB_CRYPT_IMPL(192, EN, o3, o0, o1, o2)
+
+.Lecb_encrypt_128:
+       ECB_CRYPT_IMPL(128, EN, o3, o0, o1, o2)
+
+.Lecb_decrypt:
+       ldx     [%o3 + 0x00], %g1
+       bl      .Lecb_decrypt_128
+        ldx    [%o3 + 0x08], %g2
+       be      .Lecb_decrypt_192
+        nop
+
+       ECB_CRYPT_IMPL(256, DE, o3, o0, o1, o2)
+
+.Lecb_decrypt_192:
+       ECB_CRYPT_IMPL(192, DE, o3, o0, o1, o2)
+
+.Lecb_decrypt_128:
+       ECB_CRYPT_IMPL(128, DE, o3, o0, o1, o2)
+       .type   aes_sparc_hw_ecb_encrypt,#function
+       .size   aes_sparc_hw_ecb_encrypt,(.-aes_sparc_hw_ecb_encrypt)
+
+/*
+ * CBC_ENCRYPT_IMPL(KEYLEN, KEY, SRC, DST, IV, LEN)
+ *
+ * Body of one CBC encryption path, one block per iteration; ends in
+ * retl.  On entry %g1/%g2 hold the first 16 bytes of the key schedule
+ * and %f4..%f7 hold the 16-byte IV (both loaded by the entry point).
+ *
+ * Per block: the input is read with two ldx, XORed with %g1/%g2, moved
+ * to %f0/%f2 by the MOVXTOD_* macros (defined outside this part of the
+ * file) and XORed into the running value in %f4/%f6, which then goes
+ * through ENCRYPT_<KEYLEN> and is stored to DST.  SRC and DST are
+ * advanced before the stores, hence the negative store offsets; the
+ * second store sits in the branch delay slot.
+ *
+ * The loop ends only when LEN reaches exactly zero, so LEN must be a
+ * non-zero multiple of 16.  On exit the last output block is written
+ * back to IV with four 32-bit stores.  Clobbers %g3, %g5, %f0..%f7.
+ */
+#define CBC_ENCRYPT_IMPL(KEYLEN, KEY, SRC, DST, IV, LEN)\\
+       LOAD_ENCRYPT_KEY_##KEYLEN##(KEY);               \\
+.Lcbc_ENcrypt_##KEYLEN##_loop:                         \\
+       ldx     [%SRC + 0x00], %g3;                     \\
+       ldx     [%SRC + 0x08], %g5;                     \\
+       add     %SRC, 0x10, %SRC;                       \\
+       subcc   %LEN, 0x10, %LEN;                       \\
+       add     %DST, 0x10, %DST;                       \\
+       xor     %g1, %g3, %g3;                          \\
+       xor     %g2, %g5, %g5;                          \\
+       MOVXTOD_G3_F0;                                  \\
+       MOVXTOD_G5_F2;                                  \\
+       fxor    %f4, %f0, %f4;                          \\
+       fxor    %f6, %f2, %f6;                          \\
+       ENCRYPT_##KEYLEN##(8, 4, 6, 0, 2);              \\
+       std     %f4, [%DST - 0x10];                     \\
+       bne     .Lcbc_ENcrypt_##KEYLEN##_loop;          \\
+        std    %f6, [%DST - 0x08];                     \\
+       st      %f4, [%IV + 0x00];                      \\
+       st      %f5, [%IV + 0x04];                      \\
+       st      %f6, [%IV + 0x08];                      \\
+       retl;                                           \\
+        st     %f7, [%IV + 0x0c];
+
+/*
+ * CBC_DECRYPT_IMPL(KEYLEN, KEY, SRC, DST, IV, LEN)
+ *
+ * Body of one CBC decryption path, one block per iteration; ends in
+ * retl.  On entry %g1/%g2 hold the first 16 bytes of the key schedule.
+ *
+ * The chaining value lives in the integer registers %o3 (high 64 bits)
+ * and %o5 (low 64 bits), assembled from four 32-bit loads of IV.  These
+ * registers are hard-coded: after the key load the macro overwrites
+ * %o3 and %o5 regardless of what the caller passed there (the entry
+ * point passes the key pointer in %o3 and the enc flag in %o5, neither
+ * of which is needed past this point because the single-block
+ * DECRYPT_<KEYLEN> macros are invoked with register-resident keys).
+ *
+ * Per block: the input is read into %g3/%g5, XORed with %g1/%g2 and
+ * moved to %f4/%f6 for DECRYPT_<KEYLEN>.  The previous chaining value
+ * is moved to %f0/%f2 and XORed into the result.  XORing %g3/%g5 with
+ * %g1/%g2 a second time recovers the raw input block, which becomes the
+ * next chaining value in %o3/%o5.
+ *
+ * The loop ends only when LEN reaches exactly zero, so LEN must be a
+ * non-zero multiple of 16.  On exit %o3/%o5 are written back to IV as
+ * four 32-bit words.  Clobbers %g1, %g2, %g3, %g5, %o3, %o5, %f0..%f6.
+ */
+#define CBC_DECRYPT_IMPL(KEYLEN, KEY, SRC, DST, IV, LEN)\\
+       LOAD_DECRYPT_KEY_##KEYLEN##(KEY);               \\
+       ld      [%IV + 0x00], %o3;                      \\
+       ld      [%IV + 0x04], %g3;                      \\
+       sllx    %o3, 32, %o3;                           \\
+       or      %o3, %g3, %o3;                          \\
+       ld      [%IV + 0x08], %o5;                      \\
+       ld      [%IV + 0x0c], %g3;                      \\
+       sllx    %o5, 32, %o5;                           \\
+       or      %o5, %g3, %o5;                          \\
+.Lcbc_DEcrypt_##KEYLEN##_loop:                         \\
+       ldx     [%SRC + 0x00], %g3;                     \\
+       ldx     [%SRC + 0x08], %g5;                     \\
+       add     %SRC, 0x10, %SRC;                       \\
+       subcc   %LEN, 0x10, %LEN;                       \\
+       add     %DST, 0x10, %DST;                       \\
+       xor     %g1, %g3, %g3;                          \\
+       xor     %g2, %g5, %g5;                          \\
+       MOVXTOD_G3_F4;                                  \\
+       MOVXTOD_G5_F6;                                  \\
+       DECRYPT_##KEYLEN##(8, 4, 6, 0, 2);              \\
+       MOVXTOD_O3_F0;                                  \\
+       MOVXTOD_O5_F2;                                  \\
+       xor     %g1, %g3, %o3;                          \\
+       xor     %g2, %g5, %o5;                          \\
+       fxor    %f4, %f0, %f4;                          \\
+       fxor    %f6, %f2, %f6;                          \\
+       std     %f4, [%DST - 0x10];                     \\
+       bne,pt  %icc, .Lcbc_DEcrypt_##KEYLEN##_loop;    \\
+        std    %f6, [%DST - 0x08];                     \\
+       srlx    %o3, 32, %g1;                           \\
+       st      %g1, [%IV + 0x00];                      \\
+       srlx    %o5, 32, %g2;                           \\
+       st      %o3, [%IV + 0x04];                      \\
+       st      %g2, [%IV + 0x08];                      \\
+       retl;                                           \\
+        st     %o5, [%IV + 0x0c];
+
+       /*
+        * aes_sparc_hw_cbc_encrypt: CBC entry point.  Dispatches on the
+        * enc flag and on the rounds word to one of the CBC_ENCRYPT_IMPL
+        * or CBC_DECRYPT_IMPL expansions, each of which returns with
+        * retl.  The loops in those macros require len to be a non-zero
+        * multiple of 16 and read the input with 64-bit loads.
+        */
+       .align  32
+       .globl  aes_sparc_hw_cbc_encrypt
+aes_sparc_hw_cbc_encrypt:
+       /* %o0=in, %o1=out, %o2=len, %o3=KEY, %o4=IV, %o5=enc */
+       /* Rounds word ends up in %g3; %g1 and %g2 are scratch here. */
+       SETUP_KEY_AND_ROUNDS(o3, g3, g1, g2)
+       /* %g1/%g2 = first 16 bytes of the key schedule. */
+       ldx     [%o3 + 0x00], %g1
+       cmp     %o5, 0
+       ldx     [%o3 + 0x08], %g2
+       /*
+        * enc == 0 selects decryption.  The cmp in the delay slot runs on
+        * both paths and sets the flags consumed by the bl/be pairs
+        * below: rounds < 12 -> 128, rounds == 12 -> 192, else 256.
+        */
+       be      .Lcbc_decrypt
+        cmp    %g3, 12
+
+       /* Encryption keeps the IV in %f4..%f7 (loads keep the flags). */
+       ld      [%o4 + 0x00], %f4
+       ld      [%o4 + 0x04], %f5
+       ld      [%o4 + 0x08], %f6
+       bl      .Lcbc_encrypt_128
+        ld     [%o4 + 0x0c], %f7
+       be      .Lcbc_encrypt_192
+        nop
+
+       CBC_ENCRYPT_IMPL(256, o3, o0, o1, o4, o2)
+
+.Lcbc_encrypt_192:
+       CBC_ENCRYPT_IMPL(192, o3, o0, o1, o4, o2)
+
+.Lcbc_encrypt_128:
+       CBC_ENCRYPT_IMPL(128, o3, o0, o1, o4, o2)
+
+.Lcbc_decrypt:
+       /* Decryption loads the IV itself, inside CBC_DECRYPT_IMPL. */
+       bl      .Lcbc_decrypt_128
+        nop
+       be      .Lcbc_decrypt_192
+        nop
+
+       CBC_DECRYPT_IMPL(256, o3, o0, o1, o4, o2)
+
+.Lcbc_decrypt_192:
+       CBC_DECRYPT_IMPL(192, o3, o0, o1, o4, o2)
+
+.Lcbc_decrypt_128:
+       CBC_DECRYPT_IMPL(128, o3, o0, o1, o4, o2)
+       .type   aes_sparc_hw_cbc_encrypt,#function
+       .size   aes_sparc_hw_cbc_encrypt,(.-aes_sparc_hw_cbc_encrypt)
+
+/*
+ * CTR_KEY_FIXUP_<KEYLEN>(KEY)
+ *
+ * Used by CTR_ENCRYPT_IMPL between its two-block loop and its
+ * single-block tail.  That loop loads input data into %f56..%f62; for
+ * 256-bit keys those registers also hold the last four key doublewords,
+ * so they are reloaded here in encryption order.  The 128- and 192-bit
+ * forms expand to nothing.
+ */
+#define CTR_KEY_FIXUP_128(KEY)
+#define CTR_KEY_FIXUP_192(KEY)
+#define CTR_KEY_FIXUP_256(KEY) \\
+       ldd     [%KEY + 0xd0], %f56; \\
+       ldd     [%KEY + 0xd8], %f58; \\
+       ldd     [%KEY + 0xe0], %f60; \\
+       ldd     [%KEY + 0xe8], %f62;
+
+/*
+ * CTR_ENCRYPT_IMPL(KEYLEN, KEY, SRC, DST, IV, LEN)
+ *
+ * Body of one CTR path; ends in retl.  On entry %g1/%g2 hold the first
+ * 16 bytes of the key schedule.
+ *
+ * The 128-bit counter is assembled from four 32-bit loads of IV into
+ * %g3 (high 64 bits) and %g5 (low 64 bits).  It is incremented by
+ * adding 1 to %g5 and, when %g5 wrapped to zero (movrz), replacing %g3
+ * with %g3 + 1.
+ *
+ * Length handling follows ECB_CRYPT_IMPL: LEN is biased by -0x10, zero
+ * jumps straight to the single-block tail at 10, the loop handles two
+ * counter blocks per iteration, and afterwards a negative LEN means
+ * done (11) while zero means one block is left.  LEN is expected to be
+ * a non-zero multiple of 16.
+ *
+ * In the loop the two counter values are XORed with %g1/%g2, moved to
+ * %f0/%f2 and %f4/%f6 and encrypted together; the input is then loaded
+ * with ldd into %f56..%f62, XORed with the result and stored.  Because
+ * that overwrites %f56..%f62, CTR_KEY_FIXUP_<KEYLEN> runs before the
+ * single-block tail.  At 11 the updated counter is written back to IV
+ * as four 32-bit words.
+ * Clobbers %g1, %g2, %g3, %g5, %o5, %f0..%f6 and %f56..%f62.
+ */
+#define CTR_ENCRYPT_IMPL(KEYLEN, KEY, SRC, DST, IV, LEN)\\
+       LOAD_ENCRYPT_KEY_##KEYLEN##(KEY);               \\
+       ld      [%IV + 0x00], %g3;                      \\
+       ld      [%IV + 0x04], %o5;                      \\
+       sllx    %g3, 32, %g3;                           \\
+       or      %g3, %o5, %g3;                          \\
+       ld      [%IV + 0x08], %g5;                      \\
+       ld      [%IV + 0x0c], %o5;                      \\
+       subcc   %LEN, 0x10, %LEN;                       \\
+       sllx    %g5, 32, %g5;                           \\
+       be      10f;                                    \\
+        or     %g5, %o5, %g5;                          \\
+.Lctr_ENcrypt_##KEYLEN##_loop:                         \\
+       xor     %g1, %g3, %o5;                          \\
+       MOVXTOD_O5_F0;                                  \\
+       xor     %g2, %g5, %o5;                          \\
+       MOVXTOD_O5_F2;                                  \\
+       add     %g5, 1, %g5;                            \\
+       add     %g3, 1, %o5;                            \\
+       movrz   %g5, %o5, %g3;                          \\
+       xor     %g1, %g3, %o5;                          \\
+       MOVXTOD_O5_F4;                                  \\
+       xor     %g2, %g5, %o5;                          \\
+       MOVXTOD_O5_F6;                                  \\
+       add     %g5, 1, %g5;                            \\
+       add     %g3, 1, %o5;                            \\
+       movrz   %g5, %o5, %g3;                          \\
+       ENCRYPT_##KEYLEN##_2(8, 0, 2, 4, 6, 56, 58, 60, 62);\\
+       ldd     [%SRC + 0x00], %f56;                    \\
+       ldd     [%SRC + 0x08], %f58;                    \\
+       ldd     [%SRC + 0x10], %f60;                    \\
+       ldd     [%SRC + 0x18], %f62;                    \\
+       fxor    %f56, %f0, %f56;                        \\
+       fxor    %f58, %f2, %f58;                        \\
+       fxor    %f60, %f4, %f60;                        \\
+       fxor    %f62, %f6, %f62;                        \\
+       std     %f56, [%DST + 0x00];                    \\
+       std     %f58, [%DST + 0x08];                    \\
+       std     %f60, [%DST + 0x10];                    \\
+       std     %f62, [%DST + 0x18];                    \\
+       subcc   %LEN, 0x20, %LEN;                       \\
+       add     %SRC, 0x20, %SRC;                       \\
+       bgt,pt  %icc, .Lctr_ENcrypt_##KEYLEN##_loop;    \\
+        add    %DST, 0x20, %DST;                       \\
+       blt,pt  %icc, 11f;                              \\
+        nop;                                           \\
+       CTR_KEY_FIXUP_##KEYLEN##(KEY);                  \\
+10:    xor     %g1, %g3, %o5;                          \\
+       MOVXTOD_O5_F0;                                  \\
+       xor     %g2, %g5, %o5;                          \\
+       MOVXTOD_O5_F2;                                  \\
+       add     %g5, 1, %g5;                            \\
+       add     %g3, 1, %o5;                            \\
+       movrz   %g5, %o5, %g3;                          \\
+       ENCRYPT_##KEYLEN##(8, 0, 2, 4, 6);              \\
+       ldd     [%SRC + 0x00], %f4;                     \\
+       ldd     [%SRC + 0x08], %f6;                     \\
+       fxor    %f4, %f0, %f4;                          \\
+       fxor    %f6, %f2, %f6;                          \\
+       std     %f4, [%DST + 0x00];                     \\
+       std     %f6, [%DST + 0x08];                     \\
+11:    srlx    %g3, 32, %g1;                           \\
+       st      %g1, [%IV + 0x00];                      \\
+       srlx    %g5, 32, %g2;                           \\
+       st      %g3, [%IV + 0x04];                      \\
+       st      %g2, [%IV + 0x08];                      \\
+       retl;                                           \\
+        st     %g5, [%IV + 0x0c];
+
+       /*
+        * aes_sparc_hw_ctr_encrypt: CTR entry point.  There is no enc
+        * flag; the same code serves both directions.  Dispatches on the
+        * rounds word to one of three CTR_ENCRYPT_IMPL expansions, each
+        * of which returns with retl.
+        */
+       .align  32
+       .globl  aes_sparc_hw_ctr_encrypt
+aes_sparc_hw_ctr_encrypt:
+       /* %o0=in, %o1=out, %o2=len, %o3=KEY, %o4=IV */
+       /* Rounds word ends up in %g3; %g1 and %g2 are scratch here. */
+       SETUP_KEY_AND_ROUNDS(o3, g3, g1, g2)
+       /* rounds < 12 -> 128, rounds == 12 -> 192, else 256. */
+       cmp     %g3, 12
+       /* %g1/%g2 = first 16 bytes of the key schedule (ldx keeps flags). */
+       ldx     [%o3 + 0x00], %g1
+       bl      .Lctr_encrypt_128
+        ldx    [%o3 + 0x08], %g2
+       be      .Lctr_encrypt_192
+        nop
+
+       CTR_ENCRYPT_IMPL(256, o3, o0, o1, o4, o2)
+
+.Lctr_encrypt_192:
+       CTR_ENCRYPT_IMPL(192, o3, o0, o1, o4, o2)
+
+.Lctr_encrypt_128:
+       CTR_ENCRYPT_IMPL(128, o3, o0, o1, o4, o2)
+       .type   aes_sparc_hw_ctr_encrypt,#function
+       .size   aes_sparc_hw_ctr_encrypt,(.-aes_sparc_hw_ctr_encrypt)
+
+/*
+ * OFB_ENCRYPT_IMPL(KEYLEN, KEY, SRC, DST, IV, LEN)
+ *
+ * Body of one OFB path, one block per iteration; ends in retl.  On
+ * entry %g3/%g5 hold the first 16 bytes of the key schedule and
+ * %f4..%f7 hold the 16-byte IV (both loaded by the entry point).
+ *
+ * Per block: %g3/%g5 are moved to %f0/%f2 by the MOVXTOD_* macros
+ * (defined outside this part of the file) and XORed into %f4/%f6, which
+ * then goes through ENCRYPT_<KEYLEN>.  The input is loaded with ldd
+ * into %f0/%f2 and XORed with %f4/%f6; the result is written to
+ * %f0/%f2 and stored, so %f4/%f6 keep the cipher output as the feedback
+ * value for the next block.
+ *
+ * The loop ends only when LEN reaches exactly zero, so LEN must be a
+ * non-zero multiple of 16.  On exit %f4..%f7 are written back to IV
+ * with four 32-bit stores.  Clobbers %f0..%f7.
+ */
+#define OFB_ENCRYPT_IMPL(KEYLEN, KEY, SRC, DST, IV, LEN)\\
+       LOAD_ENCRYPT_KEY_##KEYLEN##(KEY);               \\
+.Lofb_ENcrypt_##KEYLEN##_loop:                         \\
+       MOVXTOD_G3_F0;                                  \\
+       MOVXTOD_G5_F2;                                  \\
+       fxor    %f4, %f0, %f4;                          \\
+       fxor    %f6, %f2, %f6;                          \\
+       ENCRYPT_##KEYLEN##(8, 4, 6, 0, 2);              \\
+       ldd     [%SRC + 0x00], %f0;                     \\
+       ldd     [%SRC + 0x08], %f2;                     \\
+       add     %SRC, 0x10, %SRC;                       \\
+       fxor    %f4, %f0, %f0;                          \\
+       fxor    %f6, %f2, %f2;                          \\
+       std     %f0, [%DST + 0x00];                     \\
+       std     %f2, [%DST + 0x08];                     \\
+       subcc   %LEN, 0x10, %LEN;                       \\
+       bne     .Lofb_ENcrypt_##KEYLEN##_loop;          \\
+        add    %DST, 0x10, %DST;                       \\
+       st      %f4, [%IV + 0x00];                      \\
+       st      %f5, [%IV + 0x04];                      \\
+       st      %f6, [%IV + 0x08];                      \\
+       retl;                                           \\
+        st     %f7, [%IV + 0x0c];
+
+       /*
+        * OFB-mode bulk entry point.  Loads the registers the
+        * OFB_ENCRYPT_IMPL bodies expect and jumps to the body matching
+        * the key length; each body returns to the caller by itself.
+        */
+       .align  32
+       .globl  aes_sparc_hw_ofb_encrypt
+aes_sparc_hw_ofb_encrypt:
+       /* %o0=in, %o1=out, %o2=len, %o3=KEY, %o4=IV */
+       SETUP_KEY_AND_ROUNDS(o3, g3, g1, g2)
+       /*
+        * %g3 (presumably the round count from SETUP_KEY_AND_ROUNDS --
+        * confirm) is compared first and then immediately reused for key
+        * data; none of the loads below change the condition codes, so
+        * bl/be still see the result of this cmp.
+        */
+       cmp     %g3, 12
+       /* first 16 bytes at KEY -> %g3/%g5 */
+       ldx     [%o3 + 0x00], %g3
+       ldx     [%o3 + 0x08], %g5
+       /* 16-byte IV -> %f4-%f7, last word loaded in the bl delay slot */
+       ld      [%o4 + 0x00], %f4
+       ld      [%o4 + 0x04], %f5
+       ld      [%o4 + 0x08], %f6
+       bl      .Lofb_encrypt_128
+        ld     [%o4 + 0x0c], %f7
+       be      .Lofb_encrypt_192
+        nop
+
+       OFB_ENCRYPT_IMPL(256, o3, o0, o1, o4, o2)
+
+.Lofb_encrypt_192:
+       OFB_ENCRYPT_IMPL(192, o3, o0, o1, o4, o2)
+
+.Lofb_encrypt_128:
+       OFB_ENCRYPT_IMPL(128, o3, o0, o1, o4, o2)
+       .type   aes_sparc_hw_ofb_encrypt,#function
+       .size   aes_sparc_hw_ofb_encrypt,(.-aes_sparc_hw_ofb_encrypt)
+
+/*
+ * CFB encryption body for one key length; processes LEN bytes in 16-byte
+ * steps.
+ *
+ * Register contract, as set up by aes_sparc_hw_cfb_encrypt:
+ *   %f4-%f7  current feedback block (initially the IV)
+ *   %g3/%g5  first 16 bytes read from KEY
+ *
+ * Per iteration: %g3/%g5 are moved to %f0/%f2 and xored into the feedback
+ * block, ENCRYPT_<KEYLEN> is run on %f4/%f6 (macro defined elsewhere;
+ * presumably leaves its result in %f4/%f6 -- confirm), and 16 bytes of
+ * SRC are xored INTO %f4/%f6 before being stored to DST.  The block just
+ * written to DST is therefore also the feedback for the next iteration
+ * and the value stored back to IV on exit.
+ *
+ * The loop exits only when LEN reaches exactly zero (bne after subcc), so
+ * LEN must be a non-zero multiple of 16 on entry.
+ */
+#define CFB_ENCRYPT_IMPL(KEYLEN, KEY, SRC, DST, IV, LEN)\\
+       LOAD_ENCRYPT_KEY_##KEYLEN##(KEY);               \\
+.Lcfb_ENcrypt_##KEYLEN##_loop:                         \\
+       MOVXTOD_G3_F0;                                  \\
+       MOVXTOD_G5_F2;                                  \\
+       fxor    %f4, %f0, %f4;                          \\
+       fxor    %f6, %f2, %f6;                          \\
+       ENCRYPT_##KEYLEN##(8, 4, 6, 0, 2);              \\
+       ldd     [%SRC + 0x00], %f0;                     \\
+       ldd     [%SRC + 0x08], %f2;                     \\
+       add     %SRC, 0x10, %SRC;                       \\
+       fxor    %f4, %f0, %f4;                          \\
+       fxor    %f6, %f2, %f6;                          \\
+       std     %f4, [%DST + 0x00];                     \\
+       std     %f6, [%DST + 0x08];                     \\
+       subcc   %LEN, 0x10, %LEN;                       \\
+       bne     .Lcfb_ENcrypt_##KEYLEN##_loop;          \\
+        add    %DST, 0x10, %DST;                       \\
+       st      %f4, [%IV + 0x00];                      \\
+       st      %f5, [%IV + 0x04];                      \\
+       st      %f6, [%IV + 0x08];                      \\
+       retl;                                           \\
+        st     %f7, [%IV + 0x0c];
+
+/*
+ * CFB decryption body for one key length; processes LEN bytes in 16-byte
+ * steps.  Note that it loads the ENCRYPT key schedule and runs
+ * ENCRYPT_<KEYLEN>, exactly like the encryption body.
+ *
+ * Register contract, as set up by aes_sparc_hw_cfb_encrypt:
+ *   %f4-%f7  current feedback block (initially the IV)
+ *   %g3/%g5  first 16 bytes read from KEY
+ *
+ * Per iteration: feedback xor %g3/%g5 is formed in %f0/%f2 and run
+ * through ENCRYPT_<KEYLEN> (macro defined elsewhere; presumably leaves
+ * its result in %f0/%f2, using %f4/%f6 as scratch -- confirm).  The next
+ * 16 bytes of SRC are then loaded into %f4/%f6, xored with that result
+ * and stored to DST.  Because SRC is loaded before DST is stored and is
+ * kept in %f4-%f7, the input block becomes the feedback for the next
+ * iteration and the value stored back to IV on exit.
+ *
+ * The loop exits only when LEN reaches exactly zero (bne after subcc), so
+ * LEN must be a non-zero multiple of 16 on entry.
+ */
+#define CFB_DECRYPT_IMPL(KEYLEN, KEY, SRC, DST, IV, LEN)\\
+       LOAD_ENCRYPT_KEY_##KEYLEN##(KEY);               \\
+.Lcfb_DEcrypt_##KEYLEN##_loop:                         \\
+       MOVXTOD_G3_F0;                                  \\
+       MOVXTOD_G5_F2;                                  \\
+       fxor    %f4, %f0, %f0;                          \\
+       fxor    %f6, %f2, %f2;                          \\
+       ENCRYPT_##KEYLEN##(8, 0, 2, 4, 6);              \\
+       ldd     [%SRC + 0x00], %f4;                     \\
+       ldd     [%SRC + 0x08], %f6;                     \\
+       add     %SRC, 0x10, %SRC;                       \\
+       fxor    %f4, %f0, %f0;                          \\
+       fxor    %f6, %f2, %f2;                          \\
+       std     %f0, [%DST + 0x00];                     \\
+       std     %f2, [%DST + 0x08];                     \\
+       subcc   %LEN, 0x10, %LEN;                       \\
+       bne     .Lcfb_DEcrypt_##KEYLEN##_loop;          \\
+        add    %DST, 0x10, %DST;                       \\
+       st      %f4, [%IV + 0x00];                      \\
+       st      %f5, [%IV + 0x04];                      \\
+       st      %f6, [%IV + 0x08];                      \\
+       retl;                                           \\
+        st     %f7, [%IV + 0x0c];
+
+       /*
+        * CFB-mode bulk entry point for both directions.  Loads the
+        * registers the CFB_*_IMPL bodies expect, then dispatches first on
+        * the enc flag and then on key length; each body returns to the
+        * caller by itself.
+        */
+       .align  32
+       .globl  aes_sparc_hw_cfb_encrypt
+aes_sparc_hw_cfb_encrypt:
+       /* %o0=in, %o1=out, %o2=len, %o3=KEY, %o4=IV, %o5=enc */
+       /*
+        * Unlike the CTR/OFB entry points the macro arguments here are
+        * (o3, g1, g2, g3), and it is %g1 that is compared with 12 below
+        * (presumably the round count -- confirm against the macro).
+        */
+       SETUP_KEY_AND_ROUNDS(o3, g1, g2, g3)
+       /* first 16 bytes at KEY -> %g3/%g5, 16-byte IV -> %f4-%f7 */
+       ldx     [%o3 + 0x00], %g3
+       ldx     [%o3 + 0x08], %g5
+       ld      [%o4 + 0x00], %f4
+       ld      [%o4 + 0x04], %f5
+       ld      [%o4 + 0x08], %f6
+       ld      [%o4 + 0x0c], %f7
+       /*
+        * enc == 0 selects decryption.  The key-length compare sits in the
+        * delay slot of that branch, so it executes on both paths and its
+        * condition codes feed the bl/be pairs at both dispatch sites.
+        */
+       cmp     %o5, 0
+       be      .Lcfb_decrypt
+        cmp    %g1, 12
+
+       /* below 12 -> 128-bit, equal -> 192-bit, otherwise 256-bit */
+       bl      .Lcfb_encrypt_128
+        nop
+       be      .Lcfb_encrypt_192
+        nop
+
+       CFB_ENCRYPT_IMPL(256, o3, o0, o1, o4, o2)
+
+.Lcfb_encrypt_192:
+       CFB_ENCRYPT_IMPL(192, o3, o0, o1, o4, o2)
+
+.Lcfb_encrypt_128:
+       CFB_ENCRYPT_IMPL(128, o3, o0, o1, o4, o2)
+
+.Lcfb_decrypt:
+       /* same key-length dispatch, decryption bodies */
+       bl      .Lcfb_decrypt_128
+        nop
+       be      .Lcfb_decrypt_192
+        nop
+
+       CFB_DECRYPT_IMPL(256, o3, o0, o1, o4, o2)
+
+.Lcfb_decrypt_192:
+       CFB_DECRYPT_IMPL(192, o3, o0, o1, o4, o2)
+
+.Lcfb_decrypt_128:
+       CFB_DECRYPT_IMPL(128, o3, o0, o1, o4, o2)
+       .type   aes_sparc_hw_cfb_encrypt,#function
+       .size   aes_sparc_hw_cfb_encrypt,(.-aes_sparc_hw_cfb_encrypt)
 ___
 
 # fmovs instructions substituting for FP nops were originally added
diff --git a/crypto/evp/e_aes.c b/crypto/evp/e_aes.c
index 5dccb2f..63b6edc 100644
--- a/crypto/evp/e_aes.c
+++ b/crypto/evp/e_aes.c
@@ -459,6 +459,404 @@ const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
 
 #else
 
+#if    defined(AES_ASM) && defined(__sparc__)
+
+#include "sparc_arch.h"
+
+/* Non-zero when the SPARCV9_AES bit is set in OPENSSL_sparcv9cap_P. */
+#define        SPARC_AES_CAPABLE       (OPENSSL_sparcv9cap_P & SPARCV9_AES)
+
+/*
+ * Bulk routines implemented in assembly (crypto/aes/asm/aes-sparcv9.pl).
+ * The wrappers in this file only ever call them with 8-byte-aligned
+ * in/out pointers and a length that is a whole number of 16-byte blocks;
+ * the ivec variants are passed ctx->iv and update it in place.
+ */
+void aes_sparc_hw_ecb_encrypt(const unsigned char *in, unsigned char *out,
+                             size_t length, const AES_KEY *key, int enc);
+void aes_sparc_hw_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                             size_t length, const AES_KEY *key,
+                             unsigned char *ivec, int enc);
+void aes_sparc_hw_ctr_encrypt(const unsigned char *in, unsigned char *out,
+                             size_t length, const void *key,
+                             unsigned char *ivec);
+void aes_sparc_hw_ofb_encrypt(const unsigned char *in, unsigned char *out,
+                             size_t length, const void *key,
+                             unsigned char *ivec);
+void aes_sparc_hw_cfb_encrypt(const unsigned char *in, unsigned char *out,
+                             size_t length, const void *key,
+                             unsigned char *ivec, int enc);
+
+/*
+ * CBC do_cipher handler backed by the T4 AES opcodes.
+ *
+ * The whole-block prefix of the request is handed to
+ * aes_sparc_hw_cbc_encrypt(), which is only ever given 8-byte-aligned
+ * buffers: a misaligned "out" is replaced by a temporary heap buffer, and
+ * a misaligned "in" is first copied into the (aligned) output buffer.
+ * Whatever is left -- a trailing partial block, or the entire request
+ * when no bounce buffer can be obtained -- goes through the generic
+ * CRYPTO_cbc128_encrypt()/CRYPTO_cbc128_decrypt() code.
+ *
+ * ctx  cipher context; cipher_data, iv and encrypt are read, iv is updated
+ * out  destination buffer, any alignment
+ * in   source buffer, any alignment
+ * len  number of bytes to process
+ *
+ * Always returns 1.
+ */
+static int aes_sparc_hw_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                                  const unsigned char *in, size_t len)
+{
+       size_t bl = AES_BLOCK_SIZE;
+       /* size_t, not int: an int truncates or sign-flips a large len */
+       size_t aligned_len = len & ~(bl - 1);
+       int bounce_out = ((unsigned long)out & 0x7) != 0;
+       const void *aligned_in = (const void *) in;
+       void *aligned_out = (void *) out;
+
+       if (aligned_len && bounce_out) {
+               /*
+                * OPENSSL_malloc() is assumed to take an int-sized byte
+                * count (as CRYPTO_malloc does in OpenSSL 1.0.x -- confirm
+                * for the target tree), so never ask it for more than
+                * INT_MAX bytes.
+                */
+               if (aligned_len > (size_t)(~0U >> 1))
+                       aligned_out = NULL;
+               else
+                       aligned_out = OPENSSL_malloc(aligned_len);
+               /* no bounce buffer: let the generic code do everything */
+               if (aligned_out == NULL)
+                       aligned_len = 0;
+       }
+
+       if (aligned_len) {
+               if ((unsigned long)in & 0x7) {
+                       memcpy(aligned_out, in, aligned_len);
+                       aligned_in = (const void *) aligned_out;
+               }
+
+               aes_sparc_hw_cbc_encrypt(aligned_in, aligned_out, aligned_len,
+                                        ctx->cipher_data, ctx->iv,
+                                        ctx->encrypt);
+
+               if (bounce_out) {
+                       memcpy(out, aligned_out, aligned_len);
+                       OPENSSL_free(aligned_out);
+               }
+
+               in += aligned_len;
+               out += aligned_len;
+               len -= aligned_len;
+       }
+
+       if (len) {
+               EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+               if (ctx->encrypt)
+                       CRYPTO_cbc128_encrypt(in, out, len, &dat->ks,
+                                             ctx->iv, dat->block);
+               else
+                       CRYPTO_cbc128_decrypt(in, out, len, &dat->ks,
+                                             ctx->iv, dat->block);
+       }
+
+       return 1;
+}
+
+static int aes_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+                         const unsigned char *in, size_t len);
+
+/*
+ * ECB do_cipher handler backed by the T4 AES opcodes.
+ *
+ * Requests shorter than one block are ignored; otherwise len is rounded
+ * down to a whole number of blocks and handed to
+ * aes_sparc_hw_ecb_encrypt() using 8-byte-aligned buffers (a misaligned
+ * "out" is bounced through a heap buffer, a misaligned "in" is copied
+ * into the aligned output buffer first).  If the bounce buffer cannot be
+ * allocated the generic aes_ecb_cipher() handles the request.
+ *
+ * Returns 1, or whatever aes_ecb_cipher() returns on the fallback path.
+ */
+static int aes_sparc_hw_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+                                  const unsigned char *in, size_t len)
+{
+       const size_t block = ctx->cipher->block_size;
+       const int out_misaligned = ((unsigned long)out & 0x7) != 0;
+       const int in_misaligned = ((unsigned long)in & 0x7) != 0;
+       const void *src = (const void *) in;
+       void *dst = (void *) out;
+
+       if (len < block)
+               return 1;
+
+       /* drop any trailing partial block */
+       len &= ~(block - 1);
+
+       if (out_misaligned) {
+               dst = OPENSSL_malloc(len);
+               if (dst == NULL)
+                       return aes_ecb_cipher(ctx, out, in, len);
+       }
+
+       if (in_misaligned) {
+               /* stage the input in the aligned output buffer */
+               memcpy(dst, in, len);
+               src = (const void *) dst;
+       }
+
+       aes_sparc_hw_ecb_encrypt(src, dst, len,
+                                ctx->cipher_data, ctx->encrypt);
+
+       if (out_misaligned) {
+               memcpy(out, dst, len);
+               OPENSSL_free(dst);
+       }
+
+       return 1;
+}
+
+/*
+ * CTR do_cipher handler backed by the T4 AES opcodes.
+ *
+ * Three phases:
+ *  1. while ctx->num is non-zero, consume the bytes at ctx->buf[ctx->num]
+ *     onwards, i.e. whatever an earlier call left in ctx->buf;
+ *  2. hand the whole-block part of what remains to
+ *     aes_sparc_hw_ctr_encrypt() using 8-byte-aligned buffers (a
+ *     misaligned "out" is bounced through a heap buffer, a misaligned
+ *     "in" is copied into the aligned output buffer first);
+ *  3. pass any tail -- or everything from phase 2 onwards when no bounce
+ *     buffer can be obtained -- to the generic CRYPTO_ctr128_encrypt().
+ *
+ * Always returns 1.
+ */
+static int aes_sparc_hw_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                                  const unsigned char *in, size_t len)
+{
+       const void *aligned_in;
+       void *aligned_out;
+       /* size_t, not int: an int truncates or sign-flips a large len */
+       size_t aligned_len;
+       int bounce_out;
+
+       while (ctx->num && len) {
+               *out++ = *in++ ^ ctx->buf[ctx->num];
+               ctx->num = (ctx->num + 1) % 16;
+               len--;
+       }
+
+       /* computed only now: phase 1 may have moved in/out */
+       aligned_len = len & ~(size_t)(16 - 1);
+       bounce_out = ((unsigned long)out & 0x7) != 0;
+       aligned_in = (const void *) in;
+       aligned_out = (void *) out;
+
+       if (aligned_len && bounce_out) {
+               /*
+                * OPENSSL_malloc() is assumed to take an int-sized byte
+                * count (as CRYPTO_malloc does in OpenSSL 1.0.x -- confirm
+                * for the target tree), so never ask it for more than
+                * INT_MAX bytes.
+                */
+               if (aligned_len > (size_t)(~0U >> 1))
+                       aligned_out = NULL;
+               else
+                       aligned_out = OPENSSL_malloc(aligned_len);
+               /* no bounce buffer: let the generic code do everything */
+               if (aligned_out == NULL)
+                       aligned_len = 0;
+       }
+
+       if (aligned_len) {
+               if ((unsigned long)in & 0x7) {
+                       memcpy(aligned_out, in, aligned_len);
+                       aligned_in = (const void *) aligned_out;
+               }
+
+               aes_sparc_hw_ctr_encrypt(aligned_in, aligned_out, aligned_len,
+                                        ctx->cipher_data, ctx->iv);
+
+               if (bounce_out) {
+                       memcpy(out, aligned_out, aligned_len);
+                       OPENSSL_free(aligned_out);
+               }
+
+               in += aligned_len;
+               out += aligned_len;
+               len -= aligned_len;
+       }
+
+       if (len) {
+               EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+               /* CRYPTO_ctr128_encrypt wants an unsigned int offset */
+               unsigned int num = ctx->num;
+
+               CRYPTO_ctr128_encrypt(in, out, len, &dat->ks, ctx->iv,
+                                     ctx->buf, &num, dat->block);
+               ctx->num = (size_t) num;
+       }
+
+       return 1;
+}
+
+/*
+ * OFB do_cipher handler backed by the T4 AES opcodes.
+ *
+ * Three phases:
+ *  1. while ctx->num is non-zero, xor the data with ctx->iv[ctx->num]
+ *     onwards, i.e. the part of the current feedback block an earlier
+ *     call did not use;
+ *  2. hand the whole-block part of what remains to
+ *     aes_sparc_hw_ofb_encrypt() using 8-byte-aligned buffers (a
+ *     misaligned "out" is bounced through a heap buffer, a misaligned
+ *     "in" is copied into the aligned output buffer first);
+ *  3. pass any tail -- or everything from phase 2 onwards when no bounce
+ *     buffer can be obtained -- to the generic CRYPTO_ofb128_encrypt().
+ *
+ * Always returns 1.
+ */
+static int aes_sparc_hw_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                                  const unsigned char *in, size_t len)
+{
+       const void *aligned_in;
+       void *aligned_out;
+       /* size_t, not int: an int truncates or sign-flips a large len */
+       size_t aligned_len;
+       int bounce_out;
+
+       while (ctx->num && len) {
+               *out++ = *in++ ^ ctx->iv[ctx->num];
+               ctx->num = (ctx->num + 1) % 16;
+               len--;
+       }
+
+       /* computed only now: phase 1 may have moved in/out */
+       aligned_len = len & ~(size_t)(16 - 1);
+       bounce_out = ((unsigned long)out & 0x7) != 0;
+       aligned_in = (const void *) in;
+       aligned_out = (void *) out;
+
+       if (aligned_len && bounce_out) {
+               /*
+                * OPENSSL_malloc() is assumed to take an int-sized byte
+                * count (as CRYPTO_malloc does in OpenSSL 1.0.x -- confirm
+                * for the target tree), so never ask it for more than
+                * INT_MAX bytes.
+                */
+               if (aligned_len > (size_t)(~0U >> 1))
+                       aligned_out = NULL;
+               else
+                       aligned_out = OPENSSL_malloc(aligned_len);
+               /* no bounce buffer: let the generic code do everything */
+               if (aligned_out == NULL)
+                       aligned_len = 0;
+       }
+
+       if (aligned_len) {
+               if ((unsigned long)in & 0x7) {
+                       memcpy(aligned_out, in, aligned_len);
+                       aligned_in = (const void *) aligned_out;
+               }
+
+               aes_sparc_hw_ofb_encrypt(aligned_in, aligned_out, aligned_len,
+                                        ctx->cipher_data, ctx->iv);
+
+               if (bounce_out) {
+                       memcpy(out, aligned_out, aligned_len);
+                       OPENSSL_free(aligned_out);
+               }
+
+               in += aligned_len;
+               out += aligned_len;
+               len -= aligned_len;
+       }
+
+       if (len) {
+               EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+               CRYPTO_ofb128_encrypt(in, out, len, &dat->ks, ctx->iv,
+                                     &ctx->num, dat->block);
+       }
+
+       return 1;
+}
+
+/*
+ * CFB128 do_cipher handler backed by the T4 AES opcodes.
+ *
+ * Three phases:
+ *  1. while ctx->num is non-zero, finish the feedback block an earlier
+ *     call left partially used.  Each data byte is xored with
+ *     ctx->iv[ctx->num] and the CIPHERTEXT byte is written back into that
+ *     iv slot in both directions (the produced byte when encrypting, the
+ *     consumed byte when decrypting), so that ctx->iv holds a complete
+ *     ciphertext block once ctx->num wraps to zero;
+ *  2. hand the whole-block part of what remains to
+ *     aes_sparc_hw_cfb_encrypt() using 8-byte-aligned buffers (a
+ *     misaligned "out" is bounced through a heap buffer, a misaligned
+ *     "in" is copied into the aligned output buffer first);
+ *  3. pass any tail -- or everything from phase 2 onwards when no bounce
+ *     buffer can be obtained -- to the generic CRYPTO_cfb128_encrypt().
+ *
+ * Always returns 1.
+ */
+static int aes_sparc_hw_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                                  const unsigned char *in, size_t len)
+{
+       const void *aligned_in;
+       void *aligned_out;
+       /* size_t, not int: an int truncates or sign-flips a large len */
+       size_t aligned_len;
+       int bounce_out;
+
+       while (ctx->num && len) {
+               /* read before write so in == out keeps working */
+               unsigned char in_byte = *in++;
+               unsigned char out_byte = in_byte ^ ctx->iv[ctx->num];
+
+               *out++ = out_byte;
+               /* feedback register always receives the ciphertext byte */
+               ctx->iv[ctx->num] = ctx->encrypt ? out_byte : in_byte;
+
+               ctx->num = (ctx->num + 1) % 16;
+               len--;
+       }
+
+       /* computed only now: phase 1 may have moved in/out */
+       aligned_len = len & ~(size_t)(16 - 1);
+       bounce_out = ((unsigned long)out & 0x7) != 0;
+       aligned_in = (const void *) in;
+       aligned_out = (void *) out;
+
+       if (aligned_len && bounce_out) {
+               /*
+                * OPENSSL_malloc() is assumed to take an int-sized byte
+                * count (as CRYPTO_malloc does in OpenSSL 1.0.x -- confirm
+                * for the target tree), so never ask it for more than
+                * INT_MAX bytes.
+                */
+               if (aligned_len > (size_t)(~0U >> 1))
+                       aligned_out = NULL;
+               else
+                       aligned_out = OPENSSL_malloc(aligned_len);
+               /* no bounce buffer: let the generic code do everything */
+               if (aligned_out == NULL)
+                       aligned_len = 0;
+       }
+
+       if (aligned_len) {
+               if ((unsigned long)in & 0x7) {
+                       memcpy(aligned_out, in, aligned_len);
+                       aligned_in = (const void *) aligned_out;
+               }
+
+               aes_sparc_hw_cfb_encrypt(aligned_in, aligned_out, aligned_len,
+                                        ctx->cipher_data, ctx->iv,
+                                        ctx->encrypt);
+
+               if (bounce_out) {
+                       memcpy(out, aligned_out, aligned_len);
+                       OPENSSL_free(aligned_out);
+               }
+
+               in += aligned_len;
+               out += aligned_len;
+               len -= aligned_len;
+       }
+
+       if (len) {
+               EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+               CRYPTO_cfb128_encrypt(in, out, len, &dat->ks, ctx->iv,
+                                     &ctx->num, ctx->encrypt, dat->block);
+       }
+
+       return 1;
+}
+
+/*
+ * Modes without a T4-specific routine: alias each aes_sparc_hw_* name to
+ * the generic function so the BLOCK_CIPHER_* table macros can refer to
+ * aes_sparc_hw_<mode>_cipher / aes_sparc_hw_<mode>_init_key for every
+ * mode.  Because of the preceding #define, each prototype below actually
+ * forward-declares the generic function.
+ */
+#define aes_sparc_hw_cfb8_cipher aes_cfb8_cipher
+static int aes_sparc_hw_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+                                   const unsigned char *in,size_t len);
+
+#define aes_sparc_hw_cfb1_cipher aes_cfb1_cipher
+static int aes_sparc_hw_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+                                   const unsigned char *in,size_t len);
+
+#define aes_sparc_hw_gcm_init_key aes_gcm_init_key
+static int aes_sparc_hw_gcm_init_key(EVP_CIPHER_CTX *ctx,
+                                    const unsigned char *key,
+                                    const unsigned char *iv, int enc);
+
+#define aes_sparc_hw_gcm_cipher aes_gcm_cipher
+static int aes_sparc_hw_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                                  const unsigned char *in, size_t len);
+
+#define aes_sparc_hw_xts_init_key aes_xts_init_key
+static int aes_sparc_hw_xts_init_key(EVP_CIPHER_CTX *ctx,
+                                    const unsigned char *key,
+                                    const unsigned char *iv, int enc);
+
+#define aes_sparc_hw_xts_cipher aes_xts_cipher
+static int aes_sparc_hw_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                                  const unsigned char *in, size_t len);
+
+#define aes_sparc_hw_ccm_init_key aes_ccm_init_key
+static int aes_sparc_hw_ccm_init_key(EVP_CIPHER_CTX *ctx,
+                                    const unsigned char *key,
+                                    const unsigned char *iv, int enc);
+
+#define aes_sparc_hw_ccm_cipher aes_ccm_cipher
+static int aes_sparc_hw_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                                  const unsigned char *in, size_t len);
+
+/*
+ * init_key handler for the T4-backed ECB/CBC/CFB/OFB/CTR tables.
+ *
+ * Expands "key" into ctx->cipher_data: a decryption schedule for ECB and
+ * CBC when enc is zero, an encryption schedule in every other case (the
+ * CFB/OFB/CTR routines in this file never use a decryption schedule).
+ * dat->block is set to the matching single-block function, used by the
+ * generic CRYPTO_*128 fallbacks, and dat->stream.cbc to
+ * aes_sparc_hw_cbc_encrypt for CBC mode, NULL otherwise.
+ *
+ * iv is not used here.
+ *
+ * Returns 1 on success; 0 after raising EVP_R_AES_KEY_SETUP_FAILED when
+ * the key setup routine reports a negative result.
+ */
+static int aes_sparc_hw_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                                const unsigned char *iv, int enc)
+{
+       EVP_AES_KEY *dat = (EVP_AES_KEY *) ctx->cipher_data;
+       int ret, mode;
+
+       mode = ctx->cipher->flags & EVP_CIPH_MODE;
+       if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+           && !enc) {
+               ret = AES_set_decrypt_key(key, ctx->key_len*8,
+                                         ctx->cipher_data);
+               dat->block      = (block128_f)AES_decrypt;
+       } else {
+               ret = AES_set_encrypt_key(key, ctx->key_len*8,
+                                         ctx->cipher_data);
+               dat->block      = (block128_f)AES_encrypt;
+       }
+
+       /* same choice in both directions, so make it once */
+       dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+                               (cbc128_f)aes_sparc_hw_cbc_encrypt :
+                               NULL;
+
+       if (ret < 0) {
+               EVPerr(EVP_F_AES_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED);
+               return 0;
+       }
+
+       return 1;
+}
+
+/*
+ * Emit two EVP_CIPHER tables for one keylen/mode pair -- the T4 one
+ * (aes_sparc_hw_<keylen>_<mode>, using aes_sparc_hw_init_key and
+ * aes_sparc_hw_<mode>_cipher) and the generic one (aes_<keylen>_<mode>)
+ * -- plus the public EVP_aes_<keylen>_<mode>() accessor, which evaluates
+ * SPARC_AES_CAPABLE on every call to pick between them.
+ */
+#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aes_sparc_hw_##keylen##_##mode = { \
+       nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+       flags|EVP_CIPH_##MODE##_MODE,   \
+       aes_sparc_hw_init_key,          \
+       aes_sparc_hw_##mode##_cipher,   \
+       NULL,                           \
+       sizeof(EVP_AES_KEY),            \
+       NULL,NULL,NULL,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+       nid##_##keylen##_##nmode,blocksize,     \
+       keylen/8,ivlen, \
+       flags|EVP_CIPH_##MODE##_MODE,   \
+       aes_init_key,                   \
+       aes_##mode##_cipher,            \
+       NULL,                           \
+       sizeof(EVP_AES_KEY),            \
+       NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return SPARC_AES_CAPABLE?&aes_sparc_hw_##keylen##_##mode: \
+                           &aes_##keylen##_##mode; }
+
+/*
+ * Same as BLOCK_CIPHER_generic but for modes with their own context type
+ * (EVP_AES_<MODE>_CTX), init_key, cleanup and ctrl handlers.  The key
+ * length entry is doubled for XTS.  Both tables share
+ * aes_<mode>_cleanup and aes_<mode>_ctrl; only init_key and the cipher
+ * routine go through the aes_sparc_hw_* names.  The accessor evaluates
+ * SPARC_AES_CAPABLE on every call to pick a table.
+ */
+#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aes_sparc_hw_##keylen##_##mode = { \
+       nid##_##keylen##_##mode,blocksize, \
+       (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+       flags|EVP_CIPH_##MODE##_MODE,   \
+       aes_sparc_hw_##mode##_init_key, \
+       aes_sparc_hw_##mode##_cipher,   \
+       aes_##mode##_cleanup,           \
+       sizeof(EVP_AES_##MODE##_CTX),   \
+       NULL,NULL,aes_##mode##_ctrl,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+       nid##_##keylen##_##mode,blocksize, \
+       (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+       flags|EVP_CIPH_##MODE##_MODE,   \
+       aes_##mode##_init_key,          \
+       aes_##mode##_cipher,            \
+       aes_##mode##_cleanup,           \
+       sizeof(EVP_AES_##MODE##_CTX),   \
+       NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return SPARC_AES_CAPABLE?&aes_sparc_hw_##keylen##_##mode: \
+                           &aes_##keylen##_##mode; }
+
+#else
+
 #define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) 
\
 static const EVP_CIPHER aes_##keylen##_##mode = { \
        nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
@@ -485,6 +883,8 @@ const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
 { return &aes_##keylen##_##mode; }
 #endif
 
+#endif
+
 #define BLOCK_CIPHER_generic_pack(nid,keylen,flags)            \
        
BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)
     \
        
BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)
      \
diff --git a/crypto/sparc_arch.h b/crypto/sparc_arch.h
index f478ce3..032d67c 100644
--- a/crypto/sparc_arch.h
+++ b/crypto/sparc_arch.h
@@ -61,6 +61,25 @@ extern int OPENSSL_sparcv9cap_P;
 #define AES_KEXPAND2(a,b,c)    \
        .word   (F3F(2, 0x36, 0x131)|RS1(a)|RS2(b)|RD(c));
 
+/*
+ * Hand-encoded "movxtod <int reg>, <fp reg>" instructions, emitted as raw
+ * words (presumably so the file assembles even when the assembler does
+ * not know the mnemonic -- confirm).  MOVXTOD_<SRC>_<DST> copies integer
+ * register SRC into floating-point register DST.
+ *
+ * Encoding: every word is 0x81b02300 with the destination register number
+ * in bits 29:25 (F0/F2/F4/F6 -> 0/2/4/6, hence the leading 0x81/0x85/
+ * 0x89/0x8d) and the source register number in bits 4:0
+ * (%g3 = 3, %g5 = 5, %o3 = 0x0b, %o5 = 0x0d).
+ */
+#define MOVXTOD_G3_F4          \
+       .word   0x89b02303;
+#define MOVXTOD_G5_F6          \
+       .word   0x8db02305;
+#define MOVXTOD_G3_F0          \
+       .word   0x81b02303;
+#define MOVXTOD_G5_F2          \
+       .word   0x85b02305;
+#define MOVXTOD_O3_F0          \
+       .word   0x81b0230b;
+#define MOVXTOD_O5_F0          \
+       .word   0x81b0230d;
+#define MOVXTOD_O5_F2          \
+       .word   0x85b0230d;
+#define MOVXTOD_O5_F4          \
+       .word   0x89b0230d;
+#define MOVXTOD_O5_F6          \
+       .word   0x8db0230d;
+
 #ifdef __PIC__
 #define SPARC_PIC_THUNK(reg)   \
        .align  32;             \
-- 
1.7.10.4

______________________________________________________________________
OpenSSL Project                                 http://www.openssl.org
Development Mailing List                       openssl-dev@openssl.org
Automated List Manager                           majord...@openssl.org

Reply via email to