This patch adds AVX optimized XTS-mode helper functions/macros and converts
serpent-avx to use the new facilities. Benefits are slightly improved speed
and reduced stack usage as use of temporary IV-array is avoided.
tcrypt results, with Intel i5-2450M:
enc dec
16B 1.00x 1.00x
64B 1.00x 1.00x
256B1.04x 1.06x
1024B 1.09x 1.09x
8192B 1.10x 1.09x
Signed-off-by: Jussi Kivilinna jussi.kivili...@iki.fi
---
arch/x86/crypto/glue_helper-asm-avx.S | 61 +
arch/x86/crypto/glue_helper.c | 97 +++
arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 45 -
arch/x86/crypto/serpent_avx_glue.c | 87 +---
arch/x86/include/asm/crypto/glue_helper.h | 24 +++
arch/x86/include/asm/crypto/serpent-avx.h |5 +
6 files changed, 273 insertions(+), 46 deletions(-)
diff --git a/arch/x86/crypto/glue_helper-asm-avx.S
b/arch/x86/crypto/glue_helper-asm-avx.S
index f7b6ea2..02ee230 100644
--- a/arch/x86/crypto/glue_helper-asm-avx.S
+++ b/arch/x86/crypto/glue_helper-asm-avx.S
@@ -1,7 +1,7 @@
/*
* Shared glue code for 128bit block ciphers, AVX assembler macros
*
- * Copyright (c) 2012 Jussi Kivilinna jussi.kivili...@mbnet.fi
+ * Copyright © 2012-2013 Jussi Kivilinna jussi.kivili...@iki.fi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -89,3 +89,62 @@
vpxor (6*16)(src), x6, x6; \
vpxor (7*16)(src), x7, x7; \
store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+ vpsrad $31, iv, tmp; \
+ vpaddq iv, iv, iv; \
+ vpshufd $0x13, tmp, tmp; \
+ vpand mask, tmp, tmp; \
+ vpxor tmp, iv, iv;
+
+#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \
+ t1, xts_gf128mul_and_shl1_mask) \
+ vmovdqa xts_gf128mul_and_shl1_mask, t0; \
+ \
+ /* load IV */ \
+ vmovdqu (iv), tiv; \
+ vpxor (0*16)(src), tiv, x0; \
+ vmovdqu tiv, (0*16)(dst); \
+ \
+ /* construct and store IVs, also xor with source */ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (1*16)(src), tiv, x1; \
+ vmovdqu tiv, (1*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (2*16)(src), tiv, x2; \
+ vmovdqu tiv, (2*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (3*16)(src), tiv, x3; \
+ vmovdqu tiv, (3*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (4*16)(src), tiv, x4; \
+ vmovdqu tiv, (4*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (5*16)(src), tiv, x5; \
+ vmovdqu tiv, (5*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (6*16)(src), tiv, x6; \
+ vmovdqu tiv, (6*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (7*16)(src), tiv, x7; \
+ vmovdqu tiv, (7*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vmovdqu tiv, (iv);
+
+#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vpxor (0*16)(dst), x0, x0; \
+ vpxor (1*16)(dst), x1, x1; \
+ vpxor (2*16)(dst), x2, x2; \
+ vpxor (3*16)(dst), x3, x3; \
+ vpxor (4*16)(dst), x4, x4; \
+ vpxor (5*16)(dst), x5, x5; \
+ vpxor (6*16)(dst), x6, x6; \
+ vpxor (7*16)(dst), x7, x7; \
+ store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 22ce4f6..432f1d76 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -1,7 +1,7 @@
/*
* Shared glue code for 128bit block ciphers
*
- * Copyright (c) 2012 Jussi Kivilinna jussi.kivili...@mbnet.fi
+ * Copyright © 2012-2013 Jussi Kivilinna jussi.kivili...@iki.fi
*
* CBC ECB parts based on code (crypto/cbc.c,ecb.c) by:
* Copyright (c) 2006 Herbert Xu herb...@gondor.apana.org.au
@@ -304,4 +304,99 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx
*gctx,
}
EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit);
+static unsigned int __glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
+ void *ctx,
+ struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ const unsigned int bsize = 128 / 8;
+ unsigned int nbytes = walk-nbytes;
+ u128 *src = (u128 *)walk-src.virt.addr;
+ u128 *dst = (u128 *)walk-dst.virt.addr;
+ unsigned int num_blocks, func_bytes;
+ unsigned int i;
+
+ /* Process multi-block batch */
+ for (i = 0; i gctx-num_funcs; i++) {
+ num_blocks = gctx-funcs[i].num_blocks;
+ func_bytes = bsize * num_blocks;
+
+ if (nbytes = func_bytes) {
+