Re: [PATCH 1/5] crypto: x86 - add more optimized XTS-mode for serpent-avx

2013-04-09 Thread Herbert Xu
On Mon, Apr 08, 2013 at 09:50:55PM +0300, Jussi Kivilinna wrote:
 This patch adds AVX-optimized XTS-mode helper functions/macros and converts
 serpent-avx to use the new facilities. The benefits are slightly improved
 speed and reduced stack usage, as the temporary IV array is no longer needed.
 
 tcrypt results, with Intel i5-2450M:
         enc     dec
 16B     1.00x   1.00x
 64B     1.00x   1.00x
 256B    1.04x   1.06x
 1024B   1.09x   1.09x
 8192B   1.10x   1.09x
 
 Signed-off-by: Jussi Kivilinna jussi.kivili...@iki.fi

All applied.  Thanks!
-- 
Email: Herbert Xu herb...@gondor.apana.org.au
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH 1/5] crypto: x86 - add more optimized XTS-mode for serpent-avx

2013-04-08 Thread Jussi Kivilinna
This patch adds AVX-optimized XTS-mode helper functions/macros and converts
serpent-avx to use the new facilities. The benefits are slightly improved
speed and reduced stack usage, as the temporary IV array is no longer needed.

tcrypt results, with Intel i5-2450M:
        enc     dec
16B     1.00x   1.00x
64B     1.00x   1.00x
256B    1.04x   1.06x
1024B   1.09x   1.09x
8192B   1.10x   1.09x

Signed-off-by: Jussi Kivilinna jussi.kivili...@iki.fi
---
 arch/x86/crypto/glue_helper-asm-avx.S   |   61 +
 arch/x86/crypto/glue_helper.c   |   97 +++
 arch/x86/crypto/serpent-avx-x86_64-asm_64.S |   45 -
 arch/x86/crypto/serpent_avx_glue.c  |   87 +---
 arch/x86/include/asm/crypto/glue_helper.h   |   24 +++
 arch/x86/include/asm/crypto/serpent-avx.h   |5 +
 6 files changed, 273 insertions(+), 46 deletions(-)

diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S
index f7b6ea2..02ee230 100644
--- a/arch/x86/crypto/glue_helper-asm-avx.S
+++ b/arch/x86/crypto/glue_helper-asm-avx.S
@@ -1,7 +1,7 @@
 /*
  * Shared glue code for 128bit block ciphers, AVX assembler macros
  *
- * Copyright (c) 2012 Jussi Kivilinna jussi.kivili...@mbnet.fi
+ * Copyright © 2012-2013 Jussi Kivilinna jussi.kivili...@iki.fi
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -89,3 +89,62 @@
vpxor (6*16)(src), x6, x6; \
vpxor (7*16)(src), x7, x7; \
store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+   vpsrad $31, iv, tmp; \
+   vpaddq iv, iv, iv; \
+   vpshufd $0x13, tmp, tmp; \
+   vpand mask, tmp, tmp; \
+   vpxor tmp, iv, iv;
+
+#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \
+ t1, xts_gf128mul_and_shl1_mask) \
+   vmovdqa xts_gf128mul_and_shl1_mask, t0; \
+   \
+   /* load IV */ \
+   vmovdqu (iv), tiv; \
+   vpxor (0*16)(src), tiv, x0; \
+   vmovdqu tiv, (0*16)(dst); \
+   \
+   /* construct and store IVs, also xor with source */ \
+   gf128mul_x_ble(tiv, t0, t1); \
+   vpxor (1*16)(src), tiv, x1; \
+   vmovdqu tiv, (1*16)(dst); \
+   \
+   gf128mul_x_ble(tiv, t0, t1); \
+   vpxor (2*16)(src), tiv, x2; \
+   vmovdqu tiv, (2*16)(dst); \
+   \
+   gf128mul_x_ble(tiv, t0, t1); \
+   vpxor (3*16)(src), tiv, x3; \
+   vmovdqu tiv, (3*16)(dst); \
+   \
+   gf128mul_x_ble(tiv, t0, t1); \
+   vpxor (4*16)(src), tiv, x4; \
+   vmovdqu tiv, (4*16)(dst); \
+   \
+   gf128mul_x_ble(tiv, t0, t1); \
+   vpxor (5*16)(src), tiv, x5; \
+   vmovdqu tiv, (5*16)(dst); \
+   \
+   gf128mul_x_ble(tiv, t0, t1); \
+   vpxor (6*16)(src), tiv, x6; \
+   vmovdqu tiv, (6*16)(dst); \
+   \
+   gf128mul_x_ble(tiv, t0, t1); \
+   vpxor (7*16)(src), tiv, x7; \
+   vmovdqu tiv, (7*16)(dst); \
+   \
+   gf128mul_x_ble(tiv, t0, t1); \
+   vmovdqu tiv, (iv);
+
+#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+   vpxor (0*16)(dst), x0, x0; \
+   vpxor (1*16)(dst), x1, x1; \
+   vpxor (2*16)(dst), x2, x2; \
+   vpxor (3*16)(dst), x3, x3; \
+   vpxor (4*16)(dst), x4, x4; \
+   vpxor (5*16)(dst), x5, x5; \
+   vpxor (6*16)(dst), x6, x6; \
+   vpxor (7*16)(dst), x7, x7; \
+   store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 22ce4f6..432f1d76 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -1,7 +1,7 @@
 /*
  * Shared glue code for 128bit block ciphers
  *
- * Copyright (c) 2012 Jussi Kivilinna jussi.kivili...@mbnet.fi
+ * Copyright © 2012-2013 Jussi Kivilinna jussi.kivili...@iki.fi
  *
 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
  *   Copyright (c) 2006 Herbert Xu herb...@gondor.apana.org.au
@@ -304,4 +304,99 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
 }
 EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit);
 
+static unsigned int __glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
+   void *ctx,
+   struct blkcipher_desc *desc,
+   struct blkcipher_walk *walk)
+{
+   const unsigned int bsize = 128 / 8;
+   unsigned int nbytes = walk->nbytes;
+   u128 *src = (u128 *)walk->src.virt.addr;
+   u128 *dst = (u128 *)walk->dst.virt.addr;
+   unsigned int num_blocks, func_bytes;
+   unsigned int i;
+
+   /* Process multi-block batch */
+   for (i = 0; i < gctx->num_funcs; i++) {
+   num_blocks = gctx->funcs[i].num_blocks;
+   func_bytes = bsize * num_blocks;
+
+   if (nbytes >= func_bytes) {
+