The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.

To illustrate the performance gain here's a short summary of the tcrypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:

x86:                              i586       aes-ni   delta
256 bit, 8kB blocks, ECB:  125.94 MB/s  187.09 MB/s  +48.6%
256 bit, 8kB blocks, CBC:  118.04 MB/s  171.61 MB/s  +45.4%
320 bit, 8kB blocks, LRW:  128.20 MB/s  168.32 MB/s  +31.3%
512 bit, 8kB blocks, XTS:  118.35 MB/s  166.49 MB/s  +40.7%

Additionally, due to some minor optimizations, the 64-bit version also
gained up to about 20% in performance, as seen below:

x86-64:                      old impl.     new impl.  delta
256 bit, 8kB blocks, ECB:  152.25 MB/s  183.84 MB/s  +20.7%
256 bit, 8kB blocks, CBC:  144.12 MB/s  170.03 MB/s  +18.0%
320 bit, 8kB blocks, LRW:  159.13 MB/s  169.69 MB/s   +6.6%
512 bit, 8kB blocks, XTS:  144.27 MB/s  172.14 MB/s  +19.3%

Signed-off-by: Mathias Krause <mini...@googlemail.com>
---
v3 changes:
* fixed 32-bit implementation of aesni_ecb_enc (a hunk somehow moved to the end
  of another function)

Sorry for the noise. I should have reviewed the patch more carefully :/

v2 changes:
* hide almost all register names in macros so the same code base can be shared
  between x86 and x86_64
* unified Kconfig documentation again
* added alignment constraints for internal functions.
---
 arch/x86/crypto/aesni-intel_asm.S  |  149 ++++++++++++++++++++++++++++-------
 arch/x86/crypto/aesni-intel_glue.c |   22 ++++-
 crypto/Kconfig                     |    8 +-
 3 files changed, 141 insertions(+), 38 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756..48d6f7c 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -9,6 +9,9 @@
  *            Vinodh Gopal <vinodh.go...@intel.com>
  *            Kahraman Akdemir
  *
+ * Ported x86_64 version to x86:
+ *    Author: Mathias Krause <mini...@googlemail.com>
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -32,12 +35,16 @@
 #define IN     IN1
 #define KEY    %xmm2
 #define IV     %xmm3
+
 #define BSWAP_MASK %xmm10
 #define CTR    %xmm11
 #define INC    %xmm12
 
+#ifdef __x86_64__
+#define AREG   %rax
 #define KEYP   %rdi
 #define OUTP   %rsi
+#define UKEYP  OUTP
 #define INP    %rdx
 #define LEN    %rcx
 #define IVP    %r8
@@ -46,6 +53,18 @@
 #define TKEYP  T1
 #define T2     %r11
 #define TCTR_LOW T2
+#else
+#define AREG   %eax
+#define KEYP   %edi
+#define OUTP   AREG
+#define UKEYP  OUTP
+#define INP    %edx
+#define LEN    %esi
+#define IVP    %ebp
+#define KLEN   %ebx
+#define T1     %ecx
+#define TKEYP  T1
+#endif
 
 _key_expansion_128:
 _key_expansion_256a:
@@ -55,10 +74,11 @@ _key_expansion_256a:
        shufps $0b10001100, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        pxor %xmm1, %xmm0
-       movaps %xmm0, (%rcx)
-       add $0x10, %rcx
+       movaps %xmm0, (TKEYP)
+       add $0x10, TKEYP
        ret
 
+.align 4
 _key_expansion_192a:
        pshufd $0b01010101, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
@@ -76,12 +96,13 @@ _key_expansion_192a:
 
        movaps %xmm0, %xmm1
        shufps $0b01000100, %xmm0, %xmm6
-       movaps %xmm6, (%rcx)
+       movaps %xmm6, (TKEYP)
        shufps $0b01001110, %xmm2, %xmm1
-       movaps %xmm1, 16(%rcx)
-       add $0x20, %rcx
+       movaps %xmm1, 0x10(TKEYP)
+       add $0x20, TKEYP
        ret
 
+.align 4
 _key_expansion_192b:
        pshufd $0b01010101, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
@@ -96,10 +117,11 @@ _key_expansion_192b:
        pxor %xmm3, %xmm2
        pxor %xmm5, %xmm2
 
-       movaps %xmm0, (%rcx)
-       add $0x10, %rcx
+       movaps %xmm0, (TKEYP)
+       add $0x10, TKEYP
        ret
 
+.align 4
 _key_expansion_256b:
        pshufd $0b10101010, %xmm1, %xmm1
        shufps $0b00010000, %xmm2, %xmm4
@@ -107,8 +129,8 @@ _key_expansion_256b:
        shufps $0b10001100, %xmm2, %xmm4
        pxor %xmm4, %xmm2
        pxor %xmm1, %xmm2
-       movaps %xmm2, (%rcx)
-       add $0x10, %rcx
+       movaps %xmm2, (TKEYP)
+       add $0x10, TKEYP
        ret
 
 /*
@@ -116,17 +138,23 @@ _key_expansion_256b:
  *                   unsigned int key_len)
  */
 ENTRY(aesni_set_key)
-       movups (%rsi), %xmm0            # user key (first 16 bytes)
-       movaps %xmm0, (%rdi)
-       lea 0x10(%rdi), %rcx            # key addr
-       movl %edx, 480(%rdi)
+#ifndef __x86_64__
+       pushl KEYP
+       movl 8(%esp), KEYP              # ctx
+       movl 12(%esp), UKEYP            # in_key
+       movl 16(%esp), %edx             # key_len
+#endif
+       movups (UKEYP), %xmm0           # user key (first 16 bytes)
+       movaps %xmm0, (KEYP)
+       lea 0x10(KEYP), TKEYP           # key addr
+       movl %edx, 480(KEYP)
        pxor %xmm4, %xmm4               # xmm4 is assumed 0 in _key_expansion_x
        cmp $24, %dl
        jb .Lenc_key128
        je .Lenc_key192
-       movups 0x10(%rsi), %xmm2        # other user key
-       movaps %xmm2, (%rcx)
-       add $0x10, %rcx
+       movups 0x10(UKEYP), %xmm2       # other user key
+       movaps %xmm2, (TKEYP)
+       add $0x10, TKEYP
        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
        call _key_expansion_256a
        AESKEYGENASSIST 0x1 %xmm0 %xmm1
@@ -155,7 +183,7 @@ ENTRY(aesni_set_key)
        call _key_expansion_256a
        jmp .Ldec_key
 .Lenc_key192:
-       movq 0x10(%rsi), %xmm2          # other user key
+       movq 0x10(UKEYP), %xmm2         # other user key
        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
        call _key_expansion_192a
        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
@@ -195,33 +223,47 @@ ENTRY(aesni_set_key)
        AESKEYGENASSIST 0x36 %xmm0 %xmm1        # round 10
        call _key_expansion_128
 .Ldec_key:
-       sub $0x10, %rcx
-       movaps (%rdi), %xmm0
-       movaps (%rcx), %xmm1
-       movaps %xmm0, 240(%rcx)
-       movaps %xmm1, 240(%rdi)
-       add $0x10, %rdi
-       lea 240-16(%rcx), %rsi
+       sub $0x10, TKEYP
+       movaps (KEYP), %xmm0
+       movaps (TKEYP), %xmm1
+       movaps %xmm0, 240(TKEYP)
+       movaps %xmm1, 240(KEYP)
+       add $0x10, KEYP
+       lea 240-16(TKEYP), UKEYP
 .align 4
 .Ldec_key_loop:
-       movaps (%rdi), %xmm0
+       movaps (KEYP), %xmm0
        AESIMC %xmm0 %xmm1
-       movaps %xmm1, (%rsi)
-       add $0x10, %rdi
-       sub $0x10, %rsi
-       cmp %rcx, %rdi
+       movaps %xmm1, (UKEYP)
+       add $0x10, KEYP
+       sub $0x10, UKEYP
+       cmp TKEYP, KEYP
        jb .Ldec_key_loop
-       xor %rax, %rax
+       xor AREG, AREG
+#ifndef __x86_64__
+       popl KEYP
+#endif
        ret
 
 /*
  * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  */
 ENTRY(aesni_enc)
+#ifndef __x86_64__
+       pushl KEYP
+       pushl KLEN
+       movl 12(%esp), KEYP
+       movl 16(%esp), OUTP
+       movl 20(%esp), INP
+#endif
        movl 480(KEYP), KLEN            # key length
        movups (INP), STATE             # input
        call _aesni_enc1
        movups STATE, (OUTP)            # output
+#ifndef __x86_64__
+       popl KLEN
+       popl KEYP
+#endif
        ret
 
 /*
@@ -236,6 +278,7 @@ ENTRY(aesni_enc)
  *     KEY
  *     TKEYP (T1)
  */
+.align 4
 _aesni_enc1:
        movaps (KEYP), KEY              # key
        mov KEYP, TKEYP
@@ -298,6 +341,7 @@ _aesni_enc1:
  *     KEY
  *     TKEYP (T1)
  */
+.align 4
 _aesni_enc4:
        movaps (KEYP), KEY              # key
        mov KEYP, TKEYP
@@ -391,11 +435,22 @@ _aesni_enc4:
  * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  */
 ENTRY(aesni_dec)
+#ifndef __x86_64__
+       pushl KEYP
+       pushl KLEN
+       movl 12(%esp), KEYP
+       movl 16(%esp), OUTP
+       movl 20(%esp), INP
+#endif
        mov 480(KEYP), KLEN             # key length
        add $240, KEYP
        movups (INP), STATE             # input
        call _aesni_dec1
        movups STATE, (OUTP)            #output
+#ifndef __x86_64__
+       popl KLEN
+       popl KEYP
+#endif
        ret
 
 /*
@@ -410,6 +465,7 @@ ENTRY(aesni_dec)
  *     KEY
  *     TKEYP (T1)
  */
+.align 4
 _aesni_dec1:
        movaps (KEYP), KEY              # key
        mov KEYP, TKEYP
@@ -472,6 +528,7 @@ _aesni_dec1:
  *     KEY
  *     TKEYP (T1)
  */
+.align 4
 _aesni_dec4:
        movaps (KEYP), KEY              # key
        mov KEYP, TKEYP
@@ -566,6 +623,15 @@ _aesni_dec4:
  *                   size_t len)
  */
 ENTRY(aesni_ecb_enc)
+#ifndef __x86_64__
+       pushl LEN
+       pushl KEYP
+       pushl KLEN
+       movl 16(%esp), KEYP
+       movl 20(%esp), OUTP
+       movl 24(%esp), INP
+       movl 28(%esp), LEN
+#endif
        test LEN, LEN           # check length
        jz .Lecb_enc_ret
        mov 480(KEYP), KLEN
@@ -602,6 +668,11 @@ ENTRY(aesni_ecb_enc)
        cmp $16, LEN
        jge .Lecb_enc_loop1
 .Lecb_enc_ret:
+#ifndef __x86_64__
+       popl KLEN
+       popl KEYP
+       popl LEN
+#endif
        ret
 
 /*
@@ -609,6 +680,15 @@ ENTRY(aesni_ecb_enc)
  *                   size_t len);
  */
 ENTRY(aesni_ecb_dec)
+#ifndef __x86_64__
+       pushl LEN
+       pushl KEYP
+       pushl KLEN
+       movl 16(%esp), KEYP
+       movl 20(%esp), OUTP
+       movl 24(%esp), INP
+       movl 28(%esp), LEN
+#endif
        test LEN, LEN
        jz .Lecb_dec_ret
        mov 480(KEYP), KLEN
@@ -646,8 +726,14 @@ ENTRY(aesni_ecb_dec)
        cmp $16, LEN
        jge .Lecb_dec_loop1
 .Lecb_dec_ret:
+#ifndef __x86_64__
+       popl KLEN
+       popl KEYP
+       popl LEN
+#endif
        ret
 
+#ifdef __x86_64__
 /*
  * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
  *                   size_t len, u8 *iv)
@@ -744,6 +830,7 @@ ENTRY(aesni_cbc_dec)
  *     INC:    == 1, in little endian
  *     BSWAP_MASK == endian swapping mask
  */
+.align 4
 _aesni_inc_init:
        movaps .Lbswap_mask, BSWAP_MASK
        movaps IV, CTR
@@ -768,6 +855,7 @@ _aesni_inc_init:
  *     CTR:    == output IV, in little endian
  *     TCTR_LOW: == lower qword of CTR
  */
+.align 4
 _aesni_inc:
        paddq INC, CTR
        add $1, TCTR_LOW
@@ -839,3 +927,4 @@ ENTRY(aesni_ctr_enc)
        movups IV, (IVP)
 .Lctr_enc_just_ret:
        ret
+#endif
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc..d0f0e7b 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -55,12 +55,14 @@ asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
                              const u8 *in, unsigned int len);
 asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out,
                              const u8 *in, unsigned int len);
+#ifdef CONFIG_X86_64
 asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
                              const u8 *in, unsigned int len, u8 *iv);
 asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
                              const u8 *in, unsigned int len, u8 *iv);
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
                              const u8 *in, unsigned int len, u8 *iv);
+#endif
 
 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
 {
@@ -254,6 +256,7 @@ static struct crypto_alg blk_ecb_alg = {
        },
 };
 
+#ifdef CONFIG_X86_64
 static int cbc_encrypt(struct blkcipher_desc *desc,
                       struct scatterlist *dst, struct scatterlist *src,
                       unsigned int nbytes)
@@ -389,6 +392,7 @@ static struct crypto_alg blk_ctr_alg = {
                },
        },
 };
+#endif
 
 static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
                        unsigned int key_len)
@@ -500,6 +504,7 @@ static struct crypto_alg ablk_ecb_alg = {
        },
 };
 
+#ifdef CONFIG_X86_64
 static int ablk_cbc_init(struct crypto_tfm *tfm)
 {
        struct cryptd_ablkcipher *cryptd_tfm;
@@ -612,6 +617,7 @@ static struct crypto_alg ablk_rfc3686_ctr_alg = {
        },
 };
 #endif
+#endif
 
 #ifdef HAS_LRW
 static int ablk_lrw_init(struct crypto_tfm *tfm)
@@ -744,12 +750,13 @@ static int __init aesni_init(void)
                goto __aes_err;
        if ((err = crypto_register_alg(&blk_ecb_alg)))
                goto blk_ecb_err;
+       if ((err = crypto_register_alg(&ablk_ecb_alg)))
+               goto ablk_ecb_err;
+#ifdef CONFIG_X86_64
        if ((err = crypto_register_alg(&blk_cbc_alg)))
                goto blk_cbc_err;
        if ((err = crypto_register_alg(&blk_ctr_alg)))
                goto blk_ctr_err;
-       if ((err = crypto_register_alg(&ablk_ecb_alg)))
-               goto ablk_ecb_err;
        if ((err = crypto_register_alg(&ablk_cbc_alg)))
                goto ablk_cbc_err;
        if ((err = crypto_register_alg(&ablk_ctr_alg)))
@@ -758,6 +765,7 @@ static int __init aesni_init(void)
        if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
                goto ablk_rfc3686_ctr_err;
 #endif
+#endif
 #ifdef HAS_LRW
        if ((err = crypto_register_alg(&ablk_lrw_alg)))
                goto ablk_lrw_err;
@@ -784,6 +792,7 @@ ablk_pcbc_err:
        crypto_unregister_alg(&ablk_lrw_alg);
 ablk_lrw_err:
 #endif
+#ifdef CONFIG_X86_64
 #ifdef HAS_CTR
        crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
 ablk_rfc3686_ctr_err:
@@ -792,12 +801,13 @@ ablk_rfc3686_ctr_err:
 ablk_ctr_err:
        crypto_unregister_alg(&ablk_cbc_alg);
 ablk_cbc_err:
-       crypto_unregister_alg(&ablk_ecb_alg);
-ablk_ecb_err:
        crypto_unregister_alg(&blk_ctr_alg);
 blk_ctr_err:
        crypto_unregister_alg(&blk_cbc_alg);
 blk_cbc_err:
+#endif
+       crypto_unregister_alg(&ablk_ecb_alg);
+ablk_ecb_err:
        crypto_unregister_alg(&blk_ecb_alg);
 blk_ecb_err:
        crypto_unregister_alg(&__aesni_alg);
@@ -818,14 +828,16 @@ static void __exit aesni_exit(void)
 #ifdef HAS_LRW
        crypto_unregister_alg(&ablk_lrw_alg);
 #endif
+#ifdef CONFIG_X86_64
 #ifdef HAS_CTR
        crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
 #endif
        crypto_unregister_alg(&ablk_ctr_alg);
        crypto_unregister_alg(&ablk_cbc_alg);
-       crypto_unregister_alg(&ablk_ecb_alg);
        crypto_unregister_alg(&blk_ctr_alg);
        crypto_unregister_alg(&blk_cbc_alg);
+#endif
+       crypto_unregister_alg(&ablk_ecb_alg);
        crypto_unregister_alg(&blk_ecb_alg);
        crypto_unregister_alg(&__aesni_alg);
        crypto_unregister_alg(&aesni_alg);
diff --git a/crypto/Kconfig b/crypto/Kconfig
index e4bac29..459fd35 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -539,8 +539,9 @@ config CRYPTO_AES_X86_64
 
 config CRYPTO_AES_NI_INTEL
        tristate "AES cipher algorithms (AES-NI)"
-       depends on (X86 || UML_X86) && 64BIT
-       select CRYPTO_AES_X86_64
+       depends on (X86 || UML_X86)
+       select CRYPTO_AES_X86_64 if 64BIT
+       select CRYPTO_AES_586 if !64BIT
        select CRYPTO_CRYPTD
        select CRYPTO_ALGAPI
        select CRYPTO_FPU
@@ -565,7 +566,8 @@ config CRYPTO_AES_NI_INTEL
 
          In addition to AES cipher algorithm support, the
          acceleration for some popular block cipher mode is supported
-         too, including ECB, CBC, CTR, LRW, PCBC, XTS.
+         too, including ECB, LRW, PCBC, XTS. The 64 bit version has
+         additional acceleration for CBC and CTR.
 
 config CRYPTO_ANUBIS
        tristate "Anubis cipher algorithm"
-- 
1.5.6.5

--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to