[PATCH -v3] crypto: Add PCLMULQDQ accelerated GHASH implementation

2009-09-14 Thread Huang Ying
Hi, Herbert,

The dependency on irq_fpu_usable has been merged into Linus' tree.

Best Regards,
Huang Ying
-->
PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
carry-less multiplication. More information about PCLMULQDQ can be
found at:

http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/

Because PCLMULQDQ changes XMM state, its usage must be enclosed within
kernel_fpu_begin/end, which can be used only in process context. The
acceleration is therefore implemented as a crypto_ahash; that is, requests
in soft IRQ context will be deferred to the cryptd kernel thread.

v3:
 - Revise GHASH implementation; performance increases by about 2x.

Signed-off-by: Huang Ying 
---
 arch/x86/crypto/Makefile   |3 
 arch/x86/crypto/ghash-clmulni-intel_asm.S  |  157 +
 arch/x86/crypto/ghash-clmulni-intel_glue.c |  331 +
 arch/x86/include/asm/cpufeature.h  |1 
 crypto/Kconfig |8 
 crypto/cryptd.c|7 
 include/crypto/cryptd.h|1 
 7 files changed, 508 insertions(+)
 create mode 100644 arch/x86/crypto/ghash-clmulni-intel_asm.S
 create mode 100644 arch/x86/crypto/ghash-clmulni-intel_glue.c

--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
+obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
 obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
 
@@ -24,3 +25,5 @@ twofish-x86_64-y := twofish-x86_64-asm_6
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
+
+ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -0,0 +1,157 @@
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains accelerated part of ghash
+ * implementation. More information about PCLMULQDQ can be found at:
+ *
+ * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
+ *
+ * Copyright (c) 2009 Intel Corp.
+ *   Author: Huang Ying 
+ *  Vinodh Gopal
+ *  Erdinc Ozturk
+ *  Deniz Karakoyunlu
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+.align 16
+.Lbswap_mask:
+   .octa 0x000102030405060708090a0b0c0d0e0f
+.Lpoly:
+   .octa 0xc2000000000000000000000000000001
+.Ltwo_one:
+   .octa 0x00000001000000000000000000000001
+
+#define DATA   %xmm0
+#define SHASH  %xmm1
+#define T1 %xmm2
+#define T2 %xmm3
+#define T3 %xmm4
+#define BSWAP  %xmm5
+#define IN1%xmm6
+
+.text
+
+/*
+ * __clmul_gf128mul_ble:   internal ABI
+ * input:
+ * DATA:   operand1
+ * SHASH:  operand2, hash_key << 1 mod poly
+ * output:
+ * DATA:   operand1 * operand2 mod poly
+ * changed:
+ * T1
+ * T2
+ * T3
+ */
+__clmul_gf128mul_ble:
+   movaps DATA, T1
+   pshufd $0b01001110, DATA, T2
+   pshufd $0b01001110, SHASH, T3
+   pxor DATA, T2
+   pxor SHASH, T3
+
+   # pclmulqdq $0x00, SHASH, DATA  # DATA = a0 * b0
+   .byte 0x66, 0x0f, 0x3a, 0x44, 0xc1, 0x00
+   # pclmulqdq $0x11, SHASH, T1# T1 = a1 * b1
+   .byte 0x66, 0x0f, 0x3a, 0x44, 0xd1, 0x11
+   # pclmulqdq $0x00, T3, T2   # T2 = (a1 + a0) * (b1 + b0)
+   .byte 0x66, 0x0f, 0x3a, 0x44, 0xdc, 0x00
+   pxor DATA, T2
+   pxor T1, T2 # T2 = a0 * b1 + a1 * b0
+
+   movaps T2, T3
+   pslldq $8, T3
+   psrldq $8, T2
+   pxor T3, DATA
+   pxor T2, T1 # <T1:DATA> is result of
+   # carry-less multiplication
+
+   # first phase of the reduction
+   movaps DATA, T3
+   psllq $1, T3
+   pxor DATA, T3
+   psllq $5, T3
+   pxor DATA, T3
+   psllq $57, T3
+   movaps T3, T2
+   pslldq $8, T2
+   psrldq $8, T3
+   pxor T2, DATA
+   pxor T3, T1
+
+   # second phase of the reduction
+   movaps DATA, T2
+   psrlq $5, T2
+   pxor DATA, T2
+   psrlq $1, T2
+   pxor DATA, T2
+   psrlq $1, T2
+   pxor T2, T1
+   pxor T1, DATA
+   ret
+
+/* void clmul_ghash_mul(char *dst, const be128 *shash) */
+ENTRY(clmul_ghash_mul)
+   movups (%rdi), DATA
+   movups (%rsi), SHASH
+   movaps .Lbswap_mask, BSWAP
+   pshufb BSWAP, DATA
+   call __clmul_gf128mul_ble
+  

Re: [PATCH]: fix repetition test for hardware RNG to be FIPS compliant (v2)

2009-09-14 Thread Neil Horman
Ok, version 2 of the patch, taking comments into account

To be FIPS compliant, RNGs need to perform a continuous test on their output.
Specifically, the requirement is that the first block of random data generated
by an RNG be saved to seed the comparison test, and never returned to the caller.
This patch augments the continuous test in the hardware RNG to enforce this
requirement, making the hardware RNG FIPS compliant (when operating in FIPS
mode).

Neil

Signed-off-by: Neil Horman 



 random.c |   28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)


diff --git a/drivers/char/random.c b/drivers/char/random.c
index d8a9255..36fb05e 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -399,6 +399,14 @@ module_param(debug, bool, 0644);
  * storing entropy in an entropy pool.
  *
  **/
+#define EXTRACT_SIZE 10
+
+#define REP_CHECK_BLOCK_COPIED 1
+
+struct repetition_check {
+   __u8 last_data[EXTRACT_SIZE];
+   __u8 flags;
+};
 
 struct entropy_store;
 struct entropy_store {
@@ -414,7 +422,7 @@ struct entropy_store {
unsigned add_ptr;
int entropy_count;
int input_rotate;
-   __u8 *last_data;
+   struct repetition_check rep;
 };
 
 static __u32 input_pool_data[INPUT_POOL_WORDS];
@@ -714,7 +722,6 @@ void add_disk_randomness(struct gendisk *disk)
 }
 #endif
 
-#define EXTRACT_SIZE 10
 
 /*
  *
@@ -855,19 +862,27 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf,
ssize_t ret = 0, i;
__u8 tmp[EXTRACT_SIZE];
unsigned long flags;
+   size_t saved_nbytes = nbytes;
 
+repeat_extract:
xfer_secondary_pool(r, nbytes);
nbytes = account(r, nbytes, min, reserved);
 
while (nbytes) {
extract_buf(r, tmp);
 
-   if (r->last_data) {
+   if (fips_enabled) {
spin_lock_irqsave(&r->lock, flags);
-   if (!memcmp(tmp, r->last_data, EXTRACT_SIZE))
+   if ((r->rep.flags & REP_CHECK_BLOCK_COPIED) &&
+   !memcmp(tmp, r->rep.last_data, EXTRACT_SIZE))
panic("Hardware RNG duplicated output!\n");
-   memcpy(r->last_data, tmp, EXTRACT_SIZE);
+   memcpy(r->rep.last_data, tmp, EXTRACT_SIZE);
spin_unlock_irqrestore(&r->lock, flags);
+   if (!(r->rep.flags & REP_CHECK_BLOCK_COPIED)) {
+   r->rep.flags |= REP_CHECK_BLOCK_COPIED;
+   nbytes = saved_nbytes;
+   goto repeat_extract;
+   }
}
i = min_t(int, nbytes, EXTRACT_SIZE);
memcpy(buf, tmp, i);
@@ -951,9 +966,6 @@ static void init_std_data(struct entropy_store *r)
now = ktime_get_real();
mix_pool_bytes(r, &now, sizeof(now));
mix_pool_bytes(r, utsname(), sizeof(*(utsname())));
-   /* Enable continuous test in fips mode */
-   if (fips_enabled)
-   r->last_data = kmalloc(EXTRACT_SIZE, GFP_KERNEL);
 }
 
 static int rand_initialize(void)
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html