[PATCH v3 2/6] crypto: x86/nhpoly1305 - add AVX2 accelerated NHPoly1305

2018-12-04 Thread Eric Biggers
From: Eric Biggers 

Add a 64-bit AVX2 implementation of NHPoly1305, an ε-almost-∆-universal
hash function used in the Adiantum encryption mode.  For now, only the
NH portion is actually AVX2-accelerated; the Poly1305 part is less
performance-critical so is just implemented in C.
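
For reference, the math performed per 16-byte message unit is roughly the
following (a minimal C sketch derived from the vpaddd/vpshufd/vpmuludq
sequence below; it mirrors the generic NHPoly1305 code, but the helper
name and loop structure here are illustrative):

	#include <linux/types.h>

	/*
	 * One 16-byte NH message unit: for each of the 4 passes, add four
	 * 32-bit message words to four key words (mod 2^32), multiply the
	 * resulting pairs 32x32 => 64, and accumulate into 64-bit sums.
	 * Pass p uses the key offset by 4*p words; for the next message
	 * unit, the key base itself advances by 4 words.
	 */
	static void nh_unit(const u32 *key, const u32 m[4], u64 sums[4])
	{
		int p;

		for (p = 0; p < 4; p++) {
			const u32 *k = key + 4 * p;

			sums[p] += (u64)(u32)(m[0] + k[0]) * (u32)(m[2] + k[2]);
			sums[p] += (u64)(u32)(m[1] + k[1]) * (u32)(m[3] + k[3]);
		}
	}

The AVX2 code below keeps two consecutive 16-byte strides in each 256-bit
ymm register, so a single vpaddd/vpmuludq sequence covers two strides at
once; the Poly1305 layer then hashes the resulting NH outputs.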

Signed-off-by: Eric Biggers 
---
 arch/x86/crypto/Makefile   |   3 +
 arch/x86/crypto/nh-avx2-x86_64.S   | 157 +
 arch/x86/crypto/nhpoly1305-avx2-glue.c |  77 
 crypto/Kconfig |   8 ++
 4 files changed, 245 insertions(+)
 create mode 100644 arch/x86/crypto/nh-avx2-x86_64.S
 create mode 100644 arch/x86/crypto/nhpoly1305-avx2-glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 2a6acb4de373..0b31b16f49d8 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
 obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
 
 obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
 
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
@@ -106,6 +107,8 @@ ifeq ($(avx2_supported),yes)
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
 
morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
+
+   nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o
 endif
 
 ifeq ($(avx512_supported),yes)
diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S
new file mode 100644
index 000000000000..f7946ea1b704
--- /dev/null
+++ b/arch/x86/crypto/nh-avx2-x86_64.S
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, x86_64 AVX2 accelerated
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers 
+ */
+
+#include 
+
+#define  PASS0_SUMS   %ymm0
+#define  PASS1_SUMS   %ymm1
+#define  PASS2_SUMS   %ymm2
+#define  PASS3_SUMS   %ymm3
+#define  K0           %ymm4
+#define  K0_XMM       %xmm4
+#define  K1           %ymm5
+#define  K1_XMM       %xmm5
+#define  K2           %ymm6
+#define  K2_XMM       %xmm6
+#define  K3           %ymm7
+#define  K3_XMM       %xmm7
+#define  T0           %ymm8
+#define  T1           %ymm9
+#define  T2           %ymm10
+#define  T2_XMM       %xmm10
+#define  T3           %ymm11
+#define  T3_XMM       %xmm11
+#define  T4           %ymm12
+#define  T5           %ymm13
+#define  T6           %ymm14
+#define  T7           %ymm15
+#define  KEY          %rdi
+#define  MESSAGE      %rsi
+#define  MESSAGE_LEN  %rdx
+#define  HASH         %rcx
+
+.macro _nh_2xstride  k0, k1, k2, k3
+
+   // Add message words to key words
+   vpaddd  \k0, T3, T0
+   vpaddd  \k1, T3, T1
+   vpaddd  \k2, T3, T2
+   vpaddd  \k3, T3, T3
+
+   // Multiply 32x32 => 64 and accumulate
+   vpshufd $0x10, T0, T4
+   vpshufd $0x32, T0, T0
+   vpshufd $0x10, T1, T5
+   vpshufd $0x32, T1, T1
+   vpshufd $0x10, T2, T6
+   vpshufd $0x32, T2, T2
+   vpshufd $0x10, T3, T7
+   vpshufd $0x32, T3, T3
+   vpmuludq  T4, T0, T0
+   vpmuludq  T5, T1, T1
+   vpmuludq  T6, T2, T2
+   vpmuludq  T7, T3, T3
+   vpaddq  T0, PASS0_SUMS, PASS0_SUMS
+   vpaddq  T1, PASS1_SUMS, PASS1_SUMS
+   vpaddq  T2, PASS2_SUMS, PASS2_SUMS
+   vpaddq  T3, PASS3_SUMS, PASS3_SUMS
+.endm
+
+/*
+ * void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+ * u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_avx2)
+
+   vmovdqu 0x00(KEY), K0
+   vmovdqu 0x10(KEY), K1
+   add $0x20, KEY
+   vpxor   PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
+   vpxor   PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
+   vpxor   PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
+   vpxor   PASS3_SUMS, PASS3_SUMS, PASS3_SUMS
+
+   sub $0x40, MESSAGE_LEN
+   jl  .Lloop4_done
+.Lloop4:
+   vmovdqu (MESSAGE), T3
+   vmovdqu 0x00(KEY), K2
+   vmovdqu 0x10(KEY), K3
+   _nh_2xstride  K0, K1, K2, K3
+
+   vmovdqu 0x20(MESSAGE), T3
+   vmovdqu 0x20(KEY), K0
+   vmovdqu 0x30(KEY), K1
+   _nh_2xstride  K2, K3, K0, K1
+
+   add $0x

[PATCH v3 1/6] crypto: x86/nhpoly1305 - add SSE2 accelerated NHPoly1305

2018-12-04 Thread Eric Biggers
From: Eric Biggers 

Add a 64-bit SSE2 implementation of NHPoly1305, an ε-almost-∆-universal
hash function used in the Adiantum encryption mode.  For now, only the
NH portion is actually SSE2-accelerated; the Poly1305 part is less
performance-critical so is just implemented in C.
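
The glue code is essentially a thin wrapper that runs the assembly inside
kernel_fpu_begin()/kernel_fpu_end() and falls back to the generic code
when the FPU is not usable.  A minimal sketch (assuming the
crypto_nhpoly1305_update()/crypto_nhpoly1305_update_helper() interface
from the generic NHPoly1305 module; function and constant names here are
illustrative, not a copy of the actual glue file):

	static void _nh_sse2(const u32 *key, const u8 *message,
			     size_t message_len, __le64 hash[NH_NUM_PASSES])
	{
		nh_sse2(key, message, message_len, (u8 *)hash);
	}

	static int nhpoly1305_sse2_update(struct shash_desc *desc,
					  const u8 *src, unsigned int srclen)
	{
		/* Small updates aren't worth an FPU section; also handle !irq_fpu_usable() */
		if (srclen < 64 || !irq_fpu_usable())
			return crypto_nhpoly1305_update(desc, src, srclen);

		do {
			/* Bound each FPU section so preemption isn't disabled for too long */
			unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);

			kernel_fpu_begin();
			crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2);
			kernel_fpu_end();
			src += n;
			srclen -= n;
		} while (srclen);
		return 0;
	}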

Signed-off-by: Eric Biggers 
---
 arch/x86/crypto/Makefile   |   4 +
 arch/x86/crypto/nh-sse2-x86_64.S   | 123 +
 arch/x86/crypto/nhpoly1305-sse2-glue.c |  76 +++
 crypto/Kconfig |   8 ++
 4 files changed, 211 insertions(+)
 create mode 100644 arch/x86/crypto/nh-sse2-x86_64.S
 create mode 100644 arch/x86/crypto/nhpoly1305-sse2-glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index ce4e43642984..2a6acb4de373 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -47,6 +47,8 @@ obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o
 obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
 obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
 
+obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
+
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
@@ -85,6 +87,8 @@ aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
 morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
 morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
 
+nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
+
 ifeq ($(avx_supported),yes)
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
camellia_aesni_avx_glue.o
diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S
new file mode 100644
index 000000000000..51f52d4ab4bb
--- /dev/null
+++ b/arch/x86/crypto/nh-sse2-x86_64.S
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers 
+ */
+
+#include 
+
+#define  PASS0_SUMS   %xmm0
+#define  PASS1_SUMS   %xmm1
+#define  PASS2_SUMS   %xmm2
+#define  PASS3_SUMS   %xmm3
+#define  K0           %xmm4
+#define  K1           %xmm5
+#define  K2           %xmm6
+#define  K3           %xmm7
+#define  T0           %xmm8
+#define  T1           %xmm9
+#define  T2           %xmm10
+#define  T3           %xmm11
+#define  T4           %xmm12
+#define  T5           %xmm13
+#define  T6           %xmm14
+#define  T7           %xmm15
+#define  KEY          %rdi
+#define  MESSAGE      %rsi
+#define  MESSAGE_LEN  %rdx
+#define  HASH         %rcx
+
+.macro _nh_stride  k0, k1, k2, k3, offset
+
+   // Load next message stride
+   movdqu  \offset(MESSAGE), T1
+
+   // Load next key stride
+   movdqu  \offset(KEY), \k3
+
+   // Add message words to key words
+   movdqa  T1, T2
+   movdqa  T1, T3
+   paddd   T1, \k0    // reuse k0 to avoid a move
+   paddd   \k1, T1
+   paddd   \k2, T2
+   paddd   \k3, T3
+
+   // Multiply 32x32 => 64 and accumulate
+   pshufd  $0x10, \k0, T4
+   pshufd  $0x32, \k0, \k0
+   pshufd  $0x10, T1, T5
+   pshufd  $0x32, T1, T1
+   pshufd  $0x10, T2, T6
+   pshufd  $0x32, T2, T2
+   pshufd  $0x10, T3, T7
+   pshufd  $0x32, T3, T3
+   pmuludq T4, \k0
+   pmuludq T5, T1
+   pmuludq T6, T2
+   pmuludq T7, T3
+   paddq   \k0, PASS0_SUMS
+   paddq   T1, PASS1_SUMS
+   paddq   T2, PASS2_SUMS
+   paddq   T3, PASS3_SUMS
+.endm
+
+/*
+ * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+ * u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_sse2)
+
+   movdqu  0x00(KEY), K0
+   movdqu  0x10(KEY), K1
+   movdqu  0x20(KEY), K2
+   add $0x30, KEY
+   pxor    PASS0_SUMS, PASS0_SUMS
+   pxor    PASS1_SUMS, PASS1_SUMS
+   pxor    PASS2_SUMS, PASS2_SUMS
+   pxor    PASS3_SUMS, PASS3_SUMS
+
+   sub $0x40, MESSAGE_LEN
+   jl  .Lloop4_done
+.Lloop4:
+   _nh_stride  K0, K1, K2, K3, 0x00
+   _nh_stride  K1, K2, K3, K0, 0x10
+   _nh_stride  K2, K3, K0, K1, 0x20
+   _nh_stride  K3, K0, K1, K2, 0x30
+   add $0x40, KEY
+   add $0x40, MES

[PATCH v3 4/6] crypto: x86/chacha20 - refactor to allow varying number of rounds

2018-12-04 Thread Eric Biggers
From: Eric Biggers 

In preparation for adding XChaCha12 support, rename/refactor the x86_64
SIMD implementations of ChaCha20 to support different numbers of rounds.
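
The user-visible effect of the refactoring is that the round count becomes
an explicit parameter instead of a hard-coded 10 double-rounds.  Roughly
(a sketch of the resulting interface; the real prototypes live in the
renamed chacha_glue.c):

	/* Assembly entry points gain an nrounds argument (passed in %r8d) */
	asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
					       unsigned int len, int nrounds);

	/* Callers thread the per-tfm round count down from the context */
	chacha_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
		      nbytes, ctx->nrounds);

Inside the double-round loops, the fixed "mov $10,%ecx" / "dec %ecx"
iteration count is replaced by "sub $2,%r8d", so the same code serves the
20-round and (later) 12-round variants.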

Reviewed-by: Martin Willi 
Signed-off-by: Eric Biggers 
---
 arch/x86/crypto/Makefile  |   8 +-
 ...a20-avx2-x86_64.S => chacha-avx2-x86_64.S} |  33 ++--
 ...12vl-x86_64.S => chacha-avx512vl-x86_64.S} |  35 ++--
 ...0-ssse3-x86_64.S => chacha-ssse3-x86_64.S} |  41 ++---
 .../crypto/{chacha20_glue.c => chacha_glue.c} | 150 +-
 5 files changed, 136 insertions(+), 131 deletions(-)
 rename arch/x86/crypto/{chacha20-avx2-x86_64.S => chacha-avx2-x86_64.S} (97%)
 rename arch/x86/crypto/{chacha20-avx512vl-x86_64.S => 
chacha-avx512vl-x86_64.S} (97%)
 rename arch/x86/crypto/{chacha20-ssse3-x86_64.S => chacha-ssse3-x86_64.S} (96%)
 rename arch/x86/crypto/{chacha20_glue.c => chacha_glue.c} (51%)

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 0b31b16f49d8..45734e1cf967 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
+obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
@@ -78,7 +78,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
+chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
 
 aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
@@ -103,7 +103,7 @@ endif
 
 ifeq ($(avx2_supported),yes)
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o 
camellia_aesni_avx2_glue.o
-   chacha20-x86_64-y += chacha20-avx2-x86_64.o
+   chacha-x86_64-y += chacha-avx2-x86_64.o
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
 
morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
@@ -112,7 +112,7 @@ ifeq ($(avx2_supported),yes)
 endif
 
 ifeq ($(avx512_supported),yes)
-   chacha20-x86_64-y += chacha20-avx512vl-x86_64.o
+   chacha-x86_64-y += chacha-avx512vl-x86_64.o
 endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S 
b/arch/x86/crypto/chacha-avx2-x86_64.S
similarity index 97%
rename from arch/x86/crypto/chacha20-avx2-x86_64.S
rename to arch/x86/crypto/chacha-avx2-x86_64.S
index b6ab082be657..32903fd450af 100644
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha-avx2-x86_64.S
@@ -1,5 +1,5 @@
 /*
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
+ * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
  *
  * Copyright (C) 2015 Martin Willi
  *
@@ -38,13 +38,14 @@ CTR4BL: .octa 0x0002
 
 .text
 
-ENTRY(chacha20_2block_xor_avx2)
+ENTRY(chacha_2block_xor_avx2)
# %rdi: Input state matrix, s
# %rsi: up to 2 data blocks output, o
# %rdx: up to 2 data blocks input, i
# %rcx: input/output length in bytes
+   # %r8d: nrounds
 
-   # This function encrypts two ChaCha20 blocks by loading the state
+   # This function encrypts two ChaCha blocks by loading the state
# matrix twice across four AVX registers. It performs matrix operations
# on four words in each matrix in parallel, but requires shuffling to
# rearrange the words after each round.
@@ -68,7 +69,6 @@ ENTRY(chacha20_2block_xor_avx2)
vmovdqa ROT16(%rip),%ymm5
 
mov %rcx,%rax
-   mov $10,%ecx
 
 .Ldoubleround:
 
@@ -138,7 +138,7 @@ ENTRY(chacha20_2block_xor_avx2)
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm3,%ymm3
 
-   dec %ecx
+   sub $2,%r8d
jnz .Ldoubleround
 
# o0 = i0 ^ (x0 + s0)
@@ -228,15 +228,16 @@ ENTRY(chacha20_2block_xor_avx2)
lea -8(%r10),%rsp
jmp .Ldone2
 
-ENDPROC(chacha20_2block_xor_avx2)
+ENDPROC(chacha_2block_xor_avx2)
 
-ENTRY(chacha20_4block_xor_avx2)
+ENTRY(chacha_4block_xor_avx2)
# %rdi: Input state matrix, s
# %rsi: up to 4 data blocks output, o
# %rdx: up to 4 data blocks input, i
# %rcx: input/output length in bytes
+   # %r8d: nrounds
 
-   # This function encrypts four ChaCh

[PATCH v3 5/6] crypto: x86/chacha - add XChaCha12 support

2018-12-04 Thread Eric Biggers
From: Eric Biggers 

Now that the x86_64 SIMD implementations of ChaCha20 and XChaCha20 have
been refactored to support varying the number of rounds, add support for
XChaCha12.  This is identical to XChaCha20 except for the number of
rounds, which is 12 instead of 20.  This can be used by Adiantum.
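
Since the SIMD code is already round-count agnostic, adding XChaCha12 is
mostly a matter of registering another skcipher algorithm whose setkey
stores 12 rounds in the context.  A sketch of how the setkey path fits
together (the helpers are assumed to follow the generic ChaCha code; the
exact definitions are not part of this diff):

	struct chacha_ctx {
		u32 key[CHACHA_KEY_SIZE / sizeof(u32)];
		int nrounds;
	};

	static int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key,
				 unsigned int keysize, int nrounds)
	{
		struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
		int i;

		if (keysize != CHACHA_KEY_SIZE)
			return -EINVAL;
		for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
			ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32));
		ctx->nrounds = nrounds;
		return 0;
	}

	int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
				   unsigned int keysize)
	{
		return chacha_setkey(tfm, key, keysize, 12);
	}

The encrypt/decrypt paths below are shared with XChaCha20; only the value
of ctx->nrounds differs.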

Reviewed-by: Martin Willi 
Signed-off-by: Eric Biggers 
---
 arch/x86/crypto/chacha_glue.c | 17 +
 crypto/Kconfig|  4 ++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
index 35fd02b50d27..d19c2908be90 100644
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -232,6 +232,21 @@ static struct skcipher_alg algs[] = {
.setkey = crypto_chacha20_setkey,
.encrypt= xchacha_simd,
.decrypt= xchacha_simd,
+   }, {
+   .base.cra_name  = "xchacha12",
+   .base.cra_driver_name   = "xchacha12-simd",
+   .base.cra_priority  = 300,
+   .base.cra_blocksize = 1,
+   .base.cra_ctxsize   = sizeof(struct chacha_ctx),
+   .base.cra_module= THIS_MODULE,
+
+   .min_keysize= CHACHA_KEY_SIZE,
+   .max_keysize= CHACHA_KEY_SIZE,
+   .ivsize = XCHACHA_IV_SIZE,
+   .chunksize  = CHACHA_BLOCK_SIZE,
+   .setkey = crypto_chacha12_setkey,
+   .encrypt= xchacha_simd,
+   .decrypt= xchacha_simd,
},
 };
 
@@ -268,3 +283,5 @@ MODULE_ALIAS_CRYPTO("chacha20");
 MODULE_ALIAS_CRYPTO("chacha20-simd");
 MODULE_ALIAS_CRYPTO("xchacha20");
 MODULE_ALIAS_CRYPTO("xchacha20-simd");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-simd");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index df466771e9bf..29865c599b04 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1473,8 +1473,8 @@ config CRYPTO_CHACHA20_X86_64
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
help
- SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20
- and XChaCha20 stream ciphers.
+ SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
+ XChaCha20, and XChaCha12 stream ciphers.
 
 config CRYPTO_SEED
tristate "SEED cipher algorithm"
-- 
2.19.2



[PATCH v3 6/6] crypto: x86/chacha - yield the FPU occasionally

2018-12-04 Thread Eric Biggers
From: Eric Biggers 

To improve responsiveness, yield the FPU (temporarily re-enabling
preemption) every 4 KiB encrypted/decrypted, rather than keeping
preemption disabled during the entire encryption/decryption operation.

Alternatively we could do this for every skcipher_walk step, but steps
may be small in some cases, and yielding the FPU is expensive on x86.

Suggested-by: Martin Willi 
Signed-off-by: Eric Biggers 
---
 arch/x86/crypto/chacha_glue.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
index d19c2908be90..9b1d3fac4943 100644
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -132,6 +132,7 @@ static int chacha_simd_stream_xor(struct skcipher_request 
*req,
 {
u32 *state, state_buf[16 + 2] __aligned(8);
struct skcipher_walk walk;
+   int next_yield = 4096; /* bytes until next FPU yield */
int err;
 
BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
@@ -144,12 +145,21 @@ static int chacha_simd_stream_xor(struct skcipher_request 
*req,
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
 
-   if (nbytes < walk.total)
+   if (nbytes < walk.total) {
nbytes = round_down(nbytes, walk.stride);
+   next_yield -= nbytes;
+   }
 
chacha_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
  nbytes, ctx->nrounds);
 
+   if (next_yield <= 0) {
+   /* temporarily allow preemption */
+   kernel_fpu_end();
+   kernel_fpu_begin();
+   next_yield = 4096;
+   }
+
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
 
-- 
2.19.2



[PATCH v3 3/6] crypto: x86/chacha20 - add XChaCha20 support

2018-12-04 Thread Eric Biggers
From: Eric Biggers 

Add an XChaCha20 implementation that is hooked up to the x86_64 SIMD
implementations of ChaCha20.  This can be used by Adiantum.

An SSSE3 implementation of single-block HChaCha20 is also added so that
XChaCha20 can use it rather than the generic implementation.  This
required refactoring the ChaCha permutation into its own function.
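
For context, XChaCha extends ChaCha's nonce to 192 bits by first deriving
a subkey with HChaCha20 from the key and the first 128 nonce bits, then
running ordinary ChaCha20 with that subkey and the remaining 64 nonce
bits.  A rough outline of the construction (names illustrative):

	/*
	 * XChaCha20(key, nonce[0..23], data):
	 *   subkey = HChaCha20(key, nonce[0..15])
	 *   iv     = block counter || nonce[16..23]
	 *   return ChaCha20(subkey, iv, data)
	 *
	 * HChaCha20 applies the ChaCha permutation once to the initial state
	 * and returns words 0..3 and 12..15 of the result, with no
	 * feedforward addition.
	 */

That is why the permutation is split out into chacha20_permute: the new
hchacha20_block_ssse3() and the existing chacha20_block_xor_ssse3() can
then both call it, as seen in the diff below.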

Signed-off-by: Eric Biggers 
---
 arch/x86/crypto/chacha20-ssse3-x86_64.S |  81 --
 arch/x86/crypto/chacha20_glue.c | 108 ++--
 crypto/Kconfig  |  12 +--
 3 files changed, 141 insertions(+), 60 deletions(-)

diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S 
b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index d8ac75bb448f..f6792789f875 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -10,6 +10,7 @@
  */
 
 #include 
+#include 
 
 .section   .rodata.cst16.ROT8, "aM", @progbits, 16
 .align 16
@@ -23,37 +24,24 @@ CTRINC: .octa 0x000300020001
 
 .text
 
-ENTRY(chacha20_block_xor_ssse3)
-   # %rdi: Input state matrix, s
-   # %rsi: up to 1 data block output, o
-   # %rdx: up to 1 data block input, i
-   # %rcx: input/output length in bytes
-
-   # This function encrypts one ChaCha20 block by loading the state matrix
-   # in four SSE registers. It performs matrix operation on four words in
-   # parallel, but requires shuffling to rearrange the words after each
-   # round. 8/16-bit word rotation is done with the slightly better
-   # performing SSSE3 byte shuffling, 7/12-bit word rotation uses
-   # traditional shift+OR.
-
-   # x0..3 = s0..3
-   movdqa  0x00(%rdi),%xmm0
-   movdqa  0x10(%rdi),%xmm1
-   movdqa  0x20(%rdi),%xmm2
-   movdqa  0x30(%rdi),%xmm3
-   movdqa  %xmm0,%xmm8
-   movdqa  %xmm1,%xmm9
-   movdqa  %xmm2,%xmm10
-   movdqa  %xmm3,%xmm11
+/*
+ * chacha20_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3.  This
+ * function performs matrix operations on four words in parallel, but requires
+ * shuffling to rearrange the words after each round.  8/16-bit word rotation 
is
+ * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
+ * rotation uses traditional shift+OR.
+ *
+ * Clobbers: %ecx, %xmm4-%xmm7
+ */
+chacha20_permute:
 
movdqa  ROT8(%rip),%xmm4
movdqa  ROT16(%rip),%xmm5
-
-   mov %rcx,%rax
mov $10,%ecx
 
 .Ldoubleround:
-
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
paddd   %xmm1,%xmm0
	pxor    %xmm0,%xmm3
@@ -123,6 +111,29 @@ ENTRY(chacha20_block_xor_ssse3)
dec %ecx
jnz .Ldoubleround
 
+   ret
+ENDPROC(chacha20_permute)
+
+ENTRY(chacha20_block_xor_ssse3)
+   # %rdi: Input state matrix, s
+   # %rsi: up to 1 data block output, o
+   # %rdx: up to 1 data block input, i
+   # %rcx: input/output length in bytes
+   FRAME_BEGIN
+
+   # x0..3 = s0..3
+   movdqa  0x00(%rdi),%xmm0
+   movdqa  0x10(%rdi),%xmm1
+   movdqa  0x20(%rdi),%xmm2
+   movdqa  0x30(%rdi),%xmm3
+   movdqa  %xmm0,%xmm8
+   movdqa  %xmm1,%xmm9
+   movdqa  %xmm2,%xmm10
+   movdqa  %xmm3,%xmm11
+
+   mov %rcx,%rax
+   call    chacha20_permute
+
# o0 = i0 ^ (x0 + s0)
paddd   %xmm8,%xmm0
cmp $0x10,%rax
@@ -156,6 +167,7 @@ ENTRY(chacha20_block_xor_ssse3)
movdqu  %xmm0,0x30(%rsi)
 
 .Ldone:
+   FRAME_END
ret
 
 .Lxorpart:
@@ -189,6 +201,25 @@ ENTRY(chacha20_block_xor_ssse3)
 
 ENDPROC(chacha20_block_xor_ssse3)
 
+ENTRY(hchacha20_block_ssse3)
+   # %rdi: Input state matrix, s
+   # %rsi: output (8 32-bit words)
+   FRAME_BEGIN
+
+   movdqa  0x00(%rdi),%xmm0
+   movdqa  0x10(%rdi),%xmm1
+   movdqa  0x20(%rdi),%xmm2
+   movdqa  0x30(%rdi),%xmm3
+
+   call    chacha20_permute
+
+   movdqu  %xmm0,0x00(%rsi)
+   movdqu  %xmm3,0x10(%rsi)
+
+   FRAME_END
+   ret
+ENDPROC(hchacha20_block_ssse3)
+
 ENTRY(chacha20_4block_xor_ssse3)
# %rdi: Input state matrix, s
# %rsi: up to 4 data blocks output, o
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 773d075a1483..70d388e4a3a2 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -23,6 +23,7 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, 
const u8 *src,
 unsigned int len);
 asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 

[PATCH v3 0/6] crypto: x86_64 optimized XChaCha and NHPoly1305 (for Adiantum)

2018-12-04 Thread Eric Biggers
Hello,

This series optimizes the Adiantum encryption mode for x86_64 by adding
SSE2 and AVX2 accelerated implementations of NHPoly1305, specifically
the NH part; and by modifying the existing x86_64 SSSE3/AVX2/AVX-512VL
implementation of ChaCha20 to support XChaCha20 and XChaCha12.

This greatly improves Adiantum performance on x86_64.  

For example, encrypting 4096-byte messages (single-threaded) on a
Skylake-based processor (Intel Xeon, supports AVX-512VL and AVX2):

                            Before      After
                            ------      -----
adiantum(xchacha12,aes)     348 MB/s    1493 MB/s
adiantum(xchacha20,aes)     266 MB/s    1261 MB/s

And on a Zen-based processor (Threadripper 1950X, supports AVX2):

                            Before      After
                            ------      -----
adiantum(xchacha12,aes)     505 MB/s    1292 MB/s
adiantum(xchacha20,aes)     387 MB/s    1037 MB/s

Decryption is almost exactly the same speed as encryption.

The biggest benefit comes from accelerating XChaCha.  Accelerating NH
gives a somewhat smaller, but still significant benefit.

Performance on 512-byte inputs is also improved, though that is much
slower in the first place.  When Adiantum is used with dm-crypt (or
cryptsetup), we recommend using a 4096-byte sector size.
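
For example, with LUKS2 and a recent cryptsetup (2.0 or later for
--sector-size), the recommended setup would look something like the
following (an illustrative invocation, not taken from the patches):

    cryptsetup luksFormat --type luks2 \
        --cipher xchacha12,aes-adiantum-plain64 \
        --key-size 256 --sector-size 4096 /dev/sdX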

For comparison, AES-256-XTS is 2710 MB/s on the Skylake CPU and
4140 MB/s on the Zen CPU.  However, AES has the benefit of direct AES-NI
hardware support whereas Adiantum is implemented entirely with
general-purpose instructions (scalar and SIMD).  Adiantum is also a
super-pseudorandom permutation over the entire sector, unlike XTS.

Note that XChaCha20 and XChaCha12 can be used for other purposes too.

Changed since v2:
  - Yield the FPU once per 4096 bytes rather than once per skcipher_walk
step.
  - Create full stack frame in hchacha_block_ssse3() and
chacha_block_xor_ssse3().

Changed since v1:
  - Rebase on top of latest cryptodev with the AVX-512VL accelerated
ChaCha20 from Martin Willi.

Eric Biggers (6):
  crypto: x86/nhpoly1305 - add SSE2 accelerated NHPoly1305
  crypto: x86/nhpoly1305 - add AVX2 accelerated NHPoly1305
  crypto: x86/chacha20 - add XChaCha20 support
  crypto: x86/chacha20 - refactor to allow varying number of rounds
  crypto: x86/chacha - add XChaCha12 support
  crypto: x86/chacha - yield the FPU occasionally

 arch/x86/crypto/Makefile  |  15 +-
 ...a20-avx2-x86_64.S => chacha-avx2-x86_64.S} |  33 +-
 ...12vl-x86_64.S => chacha-avx512vl-x86_64.S} |  35 +--
 ...0-ssse3-x86_64.S => chacha-ssse3-x86_64.S} | 104 +++---
 arch/x86/crypto/chacha20_glue.c   | 208 
 arch/x86/crypto/chacha_glue.c | 297 ++
 arch/x86/crypto/nh-avx2-x86_64.S  | 157 +
 arch/x86/crypto/nh-sse2-x86_64.S  | 123 
 arch/x86/crypto/nhpoly1305-avx2-glue.c|  77 +
 arch/x86/crypto/nhpoly1305-sse2-glue.c|  76 +
 crypto/Kconfig|  28 +-
 11 files changed, 861 insertions(+), 292 deletions(-)
 rename arch/x86/crypto/{chacha20-avx2-x86_64.S => chacha-avx2-x86_64.S} (97%)
 rename arch/x86/crypto/{chacha20-avx512vl-x86_64.S => 
chacha-avx512vl-x86_64.S} (97%)
 rename arch/x86/crypto/{chacha20-ssse3-x86_64.S => chacha-ssse3-x86_64.S} (92%)
 delete mode 100644 arch/x86/crypto/chacha20_glue.c
 create mode 100644 arch/x86/crypto/chacha_glue.c
 create mode 100644 arch/x86/crypto/nh-avx2-x86_64.S
 create mode 100644 arch/x86/crypto/nh-sse2-x86_64.S
 create mode 100644 arch/x86/crypto/nhpoly1305-avx2-glue.c
 create mode 100644 arch/x86/crypto/nhpoly1305-sse2-glue.c

-- 
2.19.2



Re: [PATCH v2 3/6] crypto: x86/chacha20 - limit the preemption-disabled section

2018-12-04 Thread Eric Biggers
On Mon, Dec 03, 2018 at 03:13:37PM +0100, Ard Biesheuvel wrote:
> On Sun, 2 Dec 2018 at 11:47, Martin Willi  wrote:
> >
> >
> > > To improve responsiveness, disable preemption for each step of the
> > > walk (which is at most PAGE_SIZE) rather than for the entire
> > > encryption/decryption operation.
> >
> > It seems that it is not that uncommon for IPsec to get small inputs
> > scattered over multiple blocks. Doing FPU context saving for each walk
> > step then can slow down things.
> >
> > An alternative approach could be to re-enable preemption not based on
> > the walk steps, but on the amount of bytes processed. This would
> > satisfy both users, I guess.
> >
> > In the long run we probably need a better approach for FPU context
> > saving, as this really hurts performance-wise. For IPsec we should find
> > a way to avoid the (multiple) per-packet FPU save/restores in softirq
> > context, but I guess this requires support from process context
> > switching.
> >
> 
> At Jason's Zinc talk at plumbers, this came up, and apparently someone
> is working on this, i.e., to ensure that on x86, the FPU restore only
> occurs lazily, when returning to userland rather than every time you
> call kernel_fpu_end() [like we do on arm64 as well]
> 
> Not sure what the ETA for that work is, though, nor did I get the name
> of the guy working on it.

Thanks for the suggestion; I'll replace this with a patch that re-enables
preemption every 4 KiB encrypted.  That also avoids having to do a
kernel_fpu_begin(), kernel_fpu_end() pair just for hchacha_block_ssse3().  But
yes, I'd definitely like repeated kernel_fpu_begin(), kernel_fpu_end() to not be
incredibly slow.  That would help in a lot of other places too.

- Eric


Re: [PATCH v2 4/6] crypto: x86/chacha20 - add XChaCha20 support

2018-12-04 Thread Eric Biggers
Hi Martin,

On Sat, Dec 01, 2018 at 05:40:40PM +0100, Martin Willi wrote:
> 
> > An SSSE3 implementation of single-block HChaCha20 is also added so
> > that XChaCha20 can use it rather than the generic
> > implementation.  This required refactoring the ChaCha permutation
> > into its own function. 
> 
> > [...]
> 
> > +ENTRY(chacha20_block_xor_ssse3)
> > +   # %rdi: Input state matrix, s
> > +   # %rsi: up to 1 data block output, o
> > +   # %rdx: up to 1 data block input, i
> > +   # %rcx: input/output length in bytes
> > +
> > +   # x0..3 = s0..3
> > +   movdqa  0x00(%rdi),%xmm0
> > +   movdqa  0x10(%rdi),%xmm1
> > +   movdqa  0x20(%rdi),%xmm2
> > +   movdqa  0x30(%rdi),%xmm3
> > +   movdqa  %xmm0,%xmm8
> > +   movdqa  %xmm1,%xmm9
> > +   movdqa  %xmm2,%xmm10
> > +   movdqa  %xmm3,%xmm11
> > +
> > +   mov %rcx,%rax
> > +   callchacha20_permute
> > +
> > # o0 = i0 ^ (x0 + s0)
> > paddd   %xmm8,%xmm0
> > cmp $0x10,%rax
> > @@ -189,6 +198,23 @@ ENTRY(chacha20_block_xor_ssse3)
> >  
> >  ENDPROC(chacha20_block_xor_ssse3)
> >  
> > +ENTRY(hchacha20_block_ssse3)
> > +   # %rdi: Input state matrix, s
> > +   # %rsi: output (8 32-bit words)
> > +
> > +   movdqa  0x00(%rdi),%xmm0
> > +   movdqa  0x10(%rdi),%xmm1
> > +   movdqa  0x20(%rdi),%xmm2
> > +   movdqa  0x30(%rdi),%xmm3
> > +
> > +   callchacha20_permute
> 
> AFAIK, the general convention is to create proper stack frames using
> FRAME_BEGIN/END for non leaf-functions. Should chacha20_permute()
> callers do so?
> 

Yes, I'll do that.  (Ard suggested similarly in the arm64 version too.)

- Eric


Re: [PATCH 2/3] dt-bindings: crypto: ccree: add dt bindings for ccree 703

2018-12-04 Thread Herbert Xu
On Tue, Dec 04, 2018 at 02:14:19PM -0600, Rob Herring wrote:
> On Thu, Nov 29, 2018 at 02:42:18PM +0800, Herbert Xu wrote:
> > On Tue, Nov 13, 2018 at 09:40:36AM +, Gilad Ben-Yossef wrote:
> > > Add device tree bindings associating Arm TrustZone CryptoCell 703 with the
> > > ccree driver.
> > > 
> > > Signed-off-by: Gilad Ben-Yossef 
> > > ---
> > >  Documentation/devicetree/bindings/crypto/arm-cryptocell.txt | 1 +
> > >  1 file changed, 1 insertion(+)
> > 
> > Which tree is this patch meant to go through?
> 
> You should take it as it is part of a series.

Thanks Rob!
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Using Advanced Vector eXtensions with hand-coded x64 algorithms (e.g /arch/x86/blowfish-x86_64-asm_64.S)

2018-12-04 Thread Shipof _
I was curious if it might make implementing F() faster to use
instructions that are meant to work with sets of data similar to what
would be processed


[PATCH] crypto: adiantum - propagate CRYPTO_ALG_ASYNC flag to instance

2018-12-04 Thread Eric Biggers
From: Eric Biggers 

If the stream cipher implementation is asynchronous, then the Adiantum
instance must be flagged as asynchronous as well.  Otherwise someone
asking for a synchronous algorithm can get an asynchronous algorithm.

There are no asynchronous xchacha12 or xchacha20 implementations yet
which makes this largely a theoretical issue, but it should be fixed.

Fixes: 059c2a4d8e16 ("crypto: adiantum - add Adiantum support")
Signed-off-by: Eric Biggers 
---
 crypto/adiantum.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/crypto/adiantum.c b/crypto/adiantum.c
index 2dfcf12fd4529..ca27e0dc2958c 100644
--- a/crypto/adiantum.c
+++ b/crypto/adiantum.c
@@ -590,6 +590,8 @@ static int adiantum_create(struct crypto_template *tmpl, 
struct rtattr **tb)
 hash_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
goto out_drop_hash;
 
+   inst->alg.base.cra_flags = streamcipher_alg->base.cra_flags &
+  CRYPTO_ALG_ASYNC;
inst->alg.base.cra_blocksize = BLOCKCIPHER_BLOCK_SIZE;
inst->alg.base.cra_ctxsize = sizeof(struct adiantum_tfm_ctx);
inst->alg.base.cra_alignmask = streamcipher_alg->base.cra_alignmask |
-- 
2.20.0.rc1.387.gf8505762e3-goog



Re: [PATCH] fscrypt: remove CRYPTO_CTR dependency

2018-12-04 Thread Eric Biggers
On Thu, Sep 06, 2018 at 12:43:41PM +0200, Ard Biesheuvel wrote:
> On 5 September 2018 at 21:24, Eric Biggers  wrote:
> > From: Eric Biggers 
> >
> > fscrypt doesn't use the CTR mode of operation for anything, so there's
> > no need to select CRYPTO_CTR.  It was added by commit 71dea01ea2ed
> > ("ext4 crypto: require CONFIG_CRYPTO_CTR if ext4 encryption is
> > enabled").  But, I've been unable to identify the arm64 crypto bug it
> > was supposedly working around.
> >
> > I suspect the issue was seen only on some old Android device kernel
> > (circa 3.10?).  So if the fix wasn't mistaken, the real bug is probably
> > already fixed.  Or maybe it was actually a bug in a non-upstream crypto
> > driver.
> >
> > So, remove the dependency.  If it turns out there's actually still a
> > bug, we'll fix it properly.
> >
> > Signed-off-by: Eric Biggers 
> 
> Acked-by: Ard Biesheuvel 
> 
> This may be related to
> 
> 11e3b725cfc2 crypto: arm64/aes-blk - honour iv_out requirement in CBC
> and CTR modes
> 
> given that the commit in question mentions CTS. How it actually works
> around the issue is unclear to me, though.
> 
> 
> 
> 
> > ---
> >  fs/crypto/Kconfig | 1 -
> >  1 file changed, 1 deletion(-)
> >
> > diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
> > index 02b7d91c92310..284b589b4774d 100644
> > --- a/fs/crypto/Kconfig
> > +++ b/fs/crypto/Kconfig
> > @@ -6,7 +6,6 @@ config FS_ENCRYPTION
> > select CRYPTO_ECB
> > select CRYPTO_XTS
> > select CRYPTO_CTS
> > -   select CRYPTO_CTR
> > select CRYPTO_SHA256
> > select KEYS
> > help
> > --
> > 2.19.0.rc2.392.g5ba43deb5a-goog
> >

Ping.  Ted, can you consider applying this to the fscrypt tree for 4.21?

Thanks,

- Eric


Re: [PATCH v8 00/14] Appended signatures support for IMA appraisal

2018-12-04 Thread Thiago Jung Bauermann


Hello James,

Thanks for your interest in these patches.

James Morris  writes:

> On Fri, 16 Nov 2018, Thiago Jung Bauermann wrote:
>
>> On the OpenPOWER platform, secure boot and trusted boot are being
>> implemented using IMA for taking measurements and verifying signatures.
>> Since the kernel image on Power servers is an ELF binary, kernels are
>> signed using the scripts/sign-file tool and thus use the same signature
>> format as signed kernel modules.
>>
>> This patch series adds support in IMA for verifying those signatures.
>
> Are you saying you use IMA to verify kernels during boot?  From a Linux
> bootloader?

Yes to both. OpenPOWER machines have a Linux kernel and initramfs
embedded in their firmware to use as a bootloader, via Petitboot. kexec
is used to load the OS and boot it.

>> It adds flexibility to OpenPOWER secure boot, because it allows it to boot
>> kernels with the signature appended to them as well as kernels where the
>> signature is stored in the IMA extended attribute.
>
> Just to clarify, with these patches, IMA will be able to verify the
> native form of signed kernel modules?

That wasn't my use case to develop the patches, but I just tested and it
works.

I just had to make a slight modification: there's a whitelist of IMA
hooks that are allowed to use the module signature format (in the
ima_hook_supports_modsig function), and I had to add MODULE_CHECK to it.
The next version of the patches will have this change.

The only difference is that IMA looks for a valid key in the IMA
keyring, while the CONFIG_MODULE_SIG code looks for the module signing
key in the builtin and secondary trusted keyrings.

> i.e. without xattrs at all, and
> this will work with existing signed modules?

No xattrs at all, and yes.

--
Thiago Jung Bauermann
IBM Linux Technology Center



[tip:core/rcu] crypto/pcrypt: Replace synchronize_rcu_bh() with synchronize_rcu()

2018-12-04 Thread tip-bot for Paul E. McKenney
Commit-ID:  a0076e1778c23de4a42d90fee4ecb4c21dbb5838
Gitweb: https://git.kernel.org/tip/a0076e1778c23de4a42d90fee4ecb4c21dbb5838
Author: Paul E. McKenney 
AuthorDate: Mon, 5 Nov 2018 16:57:40 -0800
Committer:  Paul E. McKenney 
CommitDate: Tue, 27 Nov 2018 09:18:59 -0800

crypto/pcrypt: Replace synchronize_rcu_bh() with synchronize_rcu()

Now that synchronize_rcu() waits for bh-disable regions of code as
well as RCU read-side critical sections, the synchronize_rcu_bh() in
pcrypt_cpumask_change_notify() can be replaced by synchronize_rcu().
This commit therefore makes this change.

Signed-off-by: Paul E. McKenney 
Cc: Steffen Klassert 
Cc: 
Acked-by: Herbert Xu 
---
 crypto/pcrypt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index f8ec3d4ba4a8..8eb3c4c9ff67 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -382,7 +382,7 @@ static int pcrypt_cpumask_change_notify(struct 
notifier_block *self,
 
cpumask_copy(new_mask->mask, cpumask->cbcpu);
rcu_assign_pointer(pcrypt->cb_cpumask, new_mask);
-   synchronize_rcu_bh();
+   synchronize_rcu();
 
free_cpumask_var(old_mask->mask);
kfree(old_mask);


Re: [PATCH v8 00/14] Appended signatures support for IMA appraisal

2018-12-04 Thread James Morris
On Fri, 16 Nov 2018, Thiago Jung Bauermann wrote:

> On the OpenPOWER platform, secure boot and trusted boot are being
> implemented using IMA for taking measurements and verifying signatures.
> Since the kernel image on Power servers is an ELF binary, kernels are
> signed using the scripts/sign-file tool and thus use the same signature
> format as signed kernel modules.
> 
> This patch series adds support in IMA for verifying those signatures.

Are you saying you use IMA to verify kernels during boot?  From a Linux 
bootloader?

> It adds flexibility to OpenPOWER secure boot, because it allows it to boot
> kernels with the signature appended to them as well as kernels where the
> signature is stored in the IMA extended attribute.

Just to clarify, with these patches, IMA will be able to verify the 
native form of signed kernel modules?  i.e. without xattrs at all, and 
this will work with existing signed modules?



-- 
James Morris




Re: [PATCH 2/3] dt-bindings: crypto: ccree: add dt bindings for ccree 703

2018-12-04 Thread Rob Herring
On Thu, Nov 29, 2018 at 02:42:18PM +0800, Herbert Xu wrote:
> On Tue, Nov 13, 2018 at 09:40:36AM +, Gilad Ben-Yossef wrote:
> > Add device tree bindings associating Arm TrustZone CryptoCell 703 with the
> > ccree driver.
> > 
> > Signed-off-by: Gilad Ben-Yossef 
> > ---
> >  Documentation/devicetree/bindings/crypto/arm-cryptocell.txt | 1 +
> >  1 file changed, 1 insertion(+)
> 
> Which tree is this patch meant to go through?

You should take it as it is part of a series.

Rob


Re: [PATCH 2/3] dt-bindings: crypto: ccree: add dt bindings for ccree 703

2018-12-04 Thread Rob Herring
On Tue, 13 Nov 2018 09:40:36 +, Gilad Ben-Yossef wrote:
> Add device tree bindings associating Arm TrustZone CryptoCell 703 with the
> ccree driver.
> 
> Signed-off-by: Gilad Ben-Yossef 
> ---
>  Documentation/devicetree/bindings/crypto/arm-cryptocell.txt | 1 +
>  1 file changed, 1 insertion(+)
> 

Reviewed-by: Rob Herring 


Re: [PATCH v2 2/4] crypto: arm64/chacha20 - add XChaCha20 support

2018-12-04 Thread Ard Biesheuvel
On Tue, 4 Dec 2018 at 04:56, Eric Biggers  wrote:
>
> From: Eric Biggers 
>
> Add an XChaCha20 implementation that is hooked up to the ARM64 NEON
> implementation of ChaCha20.  This can be used by Adiantum.
>
> A NEON implementation of single-block HChaCha20 is also added so that
> XChaCha20 can use it rather than the generic implementation.  This
> required refactoring the ChaCha20 permutation into its own function.
>
> Signed-off-by: Eric Biggers 

Reviewed-by: Ard Biesheuvel 

> ---
>  arch/arm64/crypto/Kconfig  |   2 +-
>  arch/arm64/crypto/chacha20-neon-core.S |  65 +++-
>  arch/arm64/crypto/chacha20-neon-glue.c | 101 +++--
>  3 files changed, 125 insertions(+), 43 deletions(-)
>
> diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
> index 3f5aeb786192..d54ddb8468ef 100644
> --- a/arch/arm64/crypto/Kconfig
> +++ b/arch/arm64/crypto/Kconfig
> @@ -101,7 +101,7 @@ config CRYPTO_AES_ARM64_NEON_BLK
> select CRYPTO_SIMD
>
>  config CRYPTO_CHACHA20_NEON
> -   tristate "NEON accelerated ChaCha20 symmetric cipher"
> +   tristate "ChaCha20 and XChaCha20 stream ciphers using NEON 
> instructions"
> depends on KERNEL_MODE_NEON
> select CRYPTO_BLKCIPHER
> select CRYPTO_CHACHA20
> diff --git a/arch/arm64/crypto/chacha20-neon-core.S 
> b/arch/arm64/crypto/chacha20-neon-core.S
> index 13c85e272c2a..0571e45a1a0a 100644
> --- a/arch/arm64/crypto/chacha20-neon-core.S
> +++ b/arch/arm64/crypto/chacha20-neon-core.S
> @@ -23,25 +23,20 @@
> .text
> .align  6
>
> -ENTRY(chacha20_block_xor_neon)
> -   // x0: Input state matrix, s
> -   // x1: 1 data block output, o
> -   // x2: 1 data block input, i
> -
> -   //
> -   // This function encrypts one ChaCha20 block by loading the state 
> matrix
> -   // in four NEON registers. It performs matrix operation on four words 
> in
> -   // parallel, but requires shuffling to rearrange the words after each
> -   // round.
> -   //
> -
> -   // x0..3 = s0..3
> -   adr x3, ROT8
> -   ld1 {v0.4s-v3.4s}, [x0]
> -   ld1 {v8.4s-v11.4s}, [x0]
> -   ld1 {v12.4s}, [x3]
> +/*
> + * chacha20_permute - permute one block
> + *
> + * Permute one 64-byte block where the state matrix is stored in the four 
> NEON
> + * registers v0-v3.  It performs matrix operations on four words in parallel,
> + * but requires shuffling to rearrange the words after each round.
> + *
> + * Clobbers: x3, x10, v4, v12
> + */
> +chacha20_permute:
>
> mov x3, #10
> +   adr x10, ROT8
> +   ld1 {v12.4s}, [x10]
>
>  .Ldoubleround:
> // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
> @@ -105,6 +100,23 @@ ENTRY(chacha20_block_xor_neon)
> subsx3, x3, #1
> b.ne.Ldoubleround
>
> +   ret
> +ENDPROC(chacha20_permute)
> +
> +ENTRY(chacha20_block_xor_neon)
> +   // x0: Input state matrix, s
> +   // x1: 1 data block output, o
> +   // x2: 1 data block input, i
> +
> +   stp x29, x30, [sp, #-16]!
> +   mov x29, sp
> +
> +   // x0..3 = s0..3
> +   ld1 {v0.4s-v3.4s}, [x0]
> +   ld1 {v8.4s-v11.4s}, [x0]
> +
> +   bl  chacha20_permute
> +
> ld1 {v4.16b-v7.16b}, [x2]
>
> // o0 = i0 ^ (x0 + s0)
> @@ -125,9 +137,28 @@ ENTRY(chacha20_block_xor_neon)
>
> st1 {v0.16b-v3.16b}, [x1]
>
> +   ldp x29, x30, [sp], #16
> ret
>  ENDPROC(chacha20_block_xor_neon)
>
> +ENTRY(hchacha20_block_neon)
> +   // x0: Input state matrix, s
> +   // x1: output (8 32-bit words)
> +
> +   stp x29, x30, [sp, #-16]!
> +   mov x29, sp
> +
> +   ld1 {v0.4s-v3.4s}, [x0]
> +
> +   bl  chacha20_permute
> +
> +   st1 {v0.16b}, [x1], #16
> +   st1 {v3.16b}, [x1]
> +
> +   ldp x29, x30, [sp], #16
> +   ret
> +ENDPROC(hchacha20_block_neon)
> +
> .align  6
>  ENTRY(chacha20_4block_xor_neon)
> // x0: Input state matrix, s
> diff --git a/arch/arm64/crypto/chacha20-neon-glue.c 
> b/arch/arm64/crypto/chacha20-neon-glue.c
> index 96e0cfb8c3f5..a5b9cbc0c4de 100644
> --- a/arch/arm64/crypto/chacha20-neon-glue.c
> +++ b/arch/arm64/crypto/chacha20-neon-glue.c
> @@ -30,6 +30,7 @@
>
>  asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
>  asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
> +asmlinkage void hchacha20_block_neon(const u32 *state, u32 *out);
>
>  static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
> unsigned int bytes)
> @@ -65,20 +66,16 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 
> *src,
> kernel_neon_end();
>  }
>
> -static int ch

[PATCH v2 0/3] crypto: arm64/chacha - performance improvements

2018-12-04 Thread Ard Biesheuvel
Improve the performance of NEON based ChaCha:

Patch #1 adds a block size of 1472 to the tcrypt test template so we have
something that reflects the VPN case.

Patch #2 improves performance for arbitrary length inputs: on deep pipelines,
throughput increases ~30% when running on inputs blocks whose size is drawn
randomly from the interval [64, 1024)

Patch #3 adopts the OpenSSL approach to use the ALU in parallel with the
SIMD unit to process a fifth block while the SIMD is operating on 4 blocks.

Performance on Cortex-A57:

BEFORE:
===
testing speed of async chacha20 (chacha20-neon) encryption
tcrypt: test 0 (256 bit key, 16 byte blocks): 2528223 operations in 1 seconds 
(40451568 bytes)
tcrypt: test 1 (256 bit key, 64 byte blocks): 2518155 operations in 1 seconds 
(161161920 bytes)
tcrypt: test 2 (256 bit key, 256 byte blocks): 1207948 operations in 1 seconds 
(309234688 bytes)
tcrypt: test 3 (256 bit key, 1024 byte blocks): 332194 operations in 1 seconds 
(340166656 bytes)
tcrypt: test 4 (256 bit key, 1472 byte blocks): 185659 operations in 1 seconds 
(273290048 bytes)
tcrypt: test 5 (256 bit key, 8192 byte blocks): 41829 operations in 1 seconds 
(342663168 bytes)

AFTER:
==
testing speed of async chacha20 (chacha20-neon) encryption
tcrypt: test 0 (256 bit key, 16 byte blocks): 2530018 operations in 1 seconds 
(40480288 bytes)
tcrypt: test 1 (256 bit key, 64 byte blocks): 2518270 operations in 1 seconds 
(161169280 bytes)
tcrypt: test 2 (256 bit key, 256 byte blocks): 1187760 operations in 1 seconds 
(304066560 bytes)
tcrypt: test 3 (256 bit key, 1024 byte blocks): 361652 operations in 1 seconds 
(370331648 bytes)
tcrypt: test 4 (256 bit key, 1472 byte blocks): 280971 operations in 1 seconds 
(413589312 bytes)
tcrypt: test 5 (256 bit key, 8192 byte blocks): 53654 operations in 1 seconds 
(439533568 bytes)

Zinc:
=
testing speed of async chacha20 (chacha20-software) encryption
tcrypt: test 0 (256 bit key, 16 byte blocks): 2510300 operations in 1 seconds 
(40164800 bytes)
tcrypt: test 1 (256 bit key, 64 byte blocks): 2663794 operations in 1 seconds 
(170482816 bytes)
tcrypt: test 2 (256 bit key, 256 byte blocks): 1237617 operations in 1 seconds 
(316829952 bytes)
tcrypt: test 3 (256 bit key, 1024 byte blocks): 364645 operations in 1 seconds 
(373396480 bytes)
tcrypt: test 4 (256 bit key, 1472 byte blocks): 251548 operations in 1 seconds 
(370278656 bytes)
tcrypt: test 5 (256 bit key, 8192 byte blocks): 47650 operations in 1 seconds 
(390348800 bytes)

Cc: Eric Biggers 
Cc: Martin Willi 

Ard Biesheuvel (3):
  crypto: tcrypt - add block size of 1472 to skcipher template
  crypto: arm64/chacha - optimize for arbitrary length inputs
  crypto: arm64/chacha - use combined SIMD/ALU routine for more speed

 arch/arm64/crypto/chacha-neon-core.S | 396 +++-
 arch/arm64/crypto/chacha-neon-glue.c |  59 ++-
 crypto/tcrypt.c  |   2 +-
 3 files changed, 404 insertions(+), 53 deletions(-)

-- 
2.19.2



[PATCH v2 1/3] crypto: tcrypt - add block size of 1472 to skcipher template

2018-12-04 Thread Ard Biesheuvel
In order to have better coverage of algorithms operating on block
sizes that are in the ballpark of a VPN  packet, add 1472 to the
block_sizes array.

Signed-off-by: Ard Biesheuvel 
---
 crypto/tcrypt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 0590a9204562..e7fb87e114a5 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -81,7 +81,7 @@ static char *check[] = {
NULL
 };
 
-static u32 block_sizes[] = { 16, 64, 256, 1024, 8192, 0 };
+static u32 block_sizes[] = { 16, 64, 256, 1024, 1472, 8192, 0 };
 static u32 aead_sizes[] = { 16, 64, 256, 512, 1024, 2048, 4096, 8192, 0 };
 
 #define XBUFSIZE 8
-- 
2.19.2



[PATCH v2 3/3] crypto: arm64/chacha - use combined SIMD/ALU routine for more speed

2018-12-04 Thread Ard Biesheuvel
To some degree, most known AArch64 micro-architectures appear to be
able to issue ALU instructions in parallel with SIMD instructions
without affecting the SIMD throughput. This means we can use the ALU
to process a fifth ChaCha block while the SIMD is processing four
blocks in parallel.

Signed-off-by: Ard Biesheuvel 
---
 arch/arm64/crypto/chacha-neon-core.S | 235 ++--
 arch/arm64/crypto/chacha-neon-glue.c |  39 ++--
 2 files changed, 239 insertions(+), 35 deletions(-)

diff --git a/arch/arm64/crypto/chacha-neon-core.S 
b/arch/arm64/crypto/chacha-neon-core.S
index 32086709e6b3..534e0a3fafa4 100644
--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -1,13 +1,13 @@
 /*
  * ChaCha/XChaCha NEON helper functions
  *
- * Copyright (C) 2016 Linaro, Ltd. 
+ * Copyright (C) 2016-2018 Linaro, Ltd. 
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  *
- * Based on:
+ * Originally based on:
  * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
  *
  * Copyright (C) 2015 Martin Willi
@@ -160,8 +160,27 @@ ENTRY(hchacha_block_neon)
ret x9
 ENDPROC(hchacha_block_neon)
 
   a0  .req    w12
   a1  .req    w13
   a2  .req    w14
   a3  .req    w15
   a4  .req    w16
   a5  .req    w17
   a6  .req    w19
   a7  .req    w20
   a8  .req    w21
   a9  .req    w22
   a10 .req    w23
   a11 .req    w24
   a12 .req    w25
   a13 .req    w26
   a14 .req    w27
   a15 .req    w28
+
.align  6
 ENTRY(chacha_4block_xor_neon)
+   frame_push  10
+
// x0: Input state matrix, s
// x1: 4 data blocks output, o
// x2: 4 data blocks input, i
@@ -181,6 +200,9 @@ ENTRY(chacha_4block_xor_neon)
// matrix by interleaving 32- and then 64-bit words, which allows us to
// do XOR in NEON registers.
//
+   // At the same time, a fifth block is encrypted in parallel using
+   // scalar registers
+   //
adr_l   x9, CTRINC  // ... and ROT8
ld1 {v30.4s-v31.4s}, [x9]
 
@@ -191,7 +213,24 @@ ENTRY(chacha_4block_xor_neon)
	ld4r    { v8.4s-v11.4s}, [x8], #16
	ld4r    {v12.4s-v15.4s}, [x8]
 
-   // x12 += counter values 0-3
+   mov a0, v0.s[0]
+   mov a1, v1.s[0]
+   mov a2, v2.s[0]
+   mov a3, v3.s[0]
+   mov a4, v4.s[0]
+   mov a5, v5.s[0]
+   mov a6, v6.s[0]
+   mov a7, v7.s[0]
+   mov a8, v8.s[0]
+   mov a9, v9.s[0]
+   mov a10, v10.s[0]
+   mov a11, v11.s[0]
+   mov a12, v12.s[0]
+   mov a13, v13.s[0]
+   mov a14, v14.s[0]
+   mov a15, v15.s[0]
+
+   // x12 += counter values 1-4
add v12.4s, v12.4s, v30.4s
 
 .Ldoubleround4:
@@ -200,33 +239,53 @@ ENTRY(chacha_4block_xor_neon)
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
add v0.4s, v0.4s, v4.4s
+ add   a0, a0, a4
add v1.4s, v1.4s, v5.4s
+ add   a1, a1, a5
add v2.4s, v2.4s, v6.4s
+ add   a2, a2, a6
add v3.4s, v3.4s, v7.4s
+ add   a3, a3, a7
 
eor v12.16b, v12.16b, v0.16b
+ eor   a12, a12, a0
eor v13.16b, v13.16b, v1.16b
+ eor   a13, a13, a1
eor v14.16b, v14.16b, v2.16b
+ eor   a14, a14, a2
eor v15.16b, v15.16b, v3.16b
+ eor   a15, a15, a3
 
rev32   v12.8h, v12.8h
+ ror   a12, a12, #16
rev32   v13.8h, v13.8h
+ ror   a13, a13, #16
rev32   v14.8h, v14.8h
+ ror   a14, a14, #16
rev32   v15.8h, v15.8h
+ ror   a15, a15, #16
 
// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
add v8.4s, v8.4s, v12.4s
+ add   a8, a8, a12
add v9.4s, v9.4s, v13.4s
+ add   a9, a9, a13
add v10.4s, v10.4s, v14.4s
+ add   a10, a10, a14
add v11.4s, v11.4s, v15.4s
+ add   a11

[PATCH v2 2/3] crypto: arm64/chacha - optimize for arbitrary length inputs

2018-12-04 Thread Ard Biesheuvel
Update the 4-way NEON ChaCha routine so it can handle input of any
length >64 bytes in its entirety, rather than having to call into
the 1-way routine and/or memcpy()s via temp buffers to handle the
tail of a ChaCha invocation that is not a multiple of 256 bytes.

On inputs that are a multiple of 256 bytes (and thus in tcrypt
benchmarks), performance drops by around 1% on Cortex-A57, while
performance for inputs drawn randomly from the range [64, 1024)
increases by around 30%.

Signed-off-by: Ard Biesheuvel 
---
 arch/arm64/crypto/chacha-neon-core.S | 183 ++--
 arch/arm64/crypto/chacha-neon-glue.c |  38 ++--
 2 files changed, 184 insertions(+), 37 deletions(-)

diff --git a/arch/arm64/crypto/chacha-neon-core.S 
b/arch/arm64/crypto/chacha-neon-core.S
index 75b4e06cee79..32086709e6b3 100644
--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -19,6 +19,8 @@
  */
 
 #include 
+#include 
+#include 
 
.text
.align  6
@@ -36,7 +38,7 @@
  */
 chacha_permute:
 
-   adr x10, ROT8
+   adr_l   x10, ROT8
ld1 {v12.4s}, [x10]
 
 .Ldoubleround:
@@ -164,6 +166,12 @@ ENTRY(chacha_4block_xor_neon)
// x1: 4 data blocks output, o
// x2: 4 data blocks input, i
// w3: nrounds
+   // x4: byte count
+
+   adr_l   x10, .Lpermute
+   and x5, x4, #63
+   add x10, x10, x5
+   add x11, x10, #64
 
//
// This function encrypts four consecutive ChaCha blocks by loading
@@ -173,15 +181,15 @@ ENTRY(chacha_4block_xor_neon)
// matrix by interleaving 32- and then 64-bit words, which allows us to
// do XOR in NEON registers.
//
-   adr x9, CTRINC  // ... and ROT8
+   adr_l   x9, CTRINC  // ... and ROT8
ld1 {v30.4s-v31.4s}, [x9]
 
// x0..15[0-3] = s0..3[0..3]
-   mov x4, x0
-   ld4r    { v0.4s- v3.4s}, [x4], #16
-   ld4r    { v4.4s- v7.4s}, [x4], #16
-   ld4r    { v8.4s-v11.4s}, [x4], #16
-   ld4r    {v12.4s-v15.4s}, [x4]
+   add x8, x0, #16
+   ld4r    { v0.4s- v3.4s}, [x0]
+   ld4r    { v4.4s- v7.4s}, [x8], #16
+   ld4r    { v8.4s-v11.4s}, [x8], #16
+   ld4r    {v12.4s-v15.4s}, [x8]
 
// x12 += counter values 0-3
add v12.4s, v12.4s, v30.4s
@@ -425,24 +433,47 @@ ENTRY(chacha_4block_xor_neon)
zip1v30.4s, v14.4s, v15.4s
zip2v31.4s, v14.4s, v15.4s
 
+   mov x3, #64
+   subsx5, x4, #64
+   add x6, x5, x2
+   cselx3, x3, xzr, ge
+   cselx2, x2, x6, ge
+
// interleave 64-bit words in state n, n+2
zip1v0.2d, v16.2d, v18.2d
zip2v4.2d, v16.2d, v18.2d
zip1v8.2d, v17.2d, v19.2d
zip2v12.2d, v17.2d, v19.2d
-   ld1 {v16.16b-v19.16b}, [x2], #64
+   ld1 {v16.16b-v19.16b}, [x2], x3
+
+   subsx6, x4, #128
+   ccmpx3, xzr, #4, lt
+   add x7, x6, x2
+   cselx3, x3, xzr, eq
+   cselx2, x2, x7, eq
 
zip1v1.2d, v20.2d, v22.2d
zip2v5.2d, v20.2d, v22.2d
zip1v9.2d, v21.2d, v23.2d
zip2v13.2d, v21.2d, v23.2d
-   ld1 {v20.16b-v23.16b}, [x2], #64
+   ld1 {v20.16b-v23.16b}, [x2], x3
+
+   subsx7, x4, #192
+   ccmpx3, xzr, #4, lt
+   add x8, x7, x2
+   cselx3, x3, xzr, eq
+   cselx2, x2, x8, eq
 
zip1v2.2d, v24.2d, v26.2d
zip2v6.2d, v24.2d, v26.2d
zip1v10.2d, v25.2d, v27.2d
zip2v14.2d, v25.2d, v27.2d
-   ld1 {v24.16b-v27.16b}, [x2], #64
+   ld1 {v24.16b-v27.16b}, [x2], x3
+
+   subsx8, x4, #256
+   ccmpx3, xzr, #4, lt
+   add x9, x8, x2
+   cselx2, x2, x9, eq
 
zip1v3.2d, v28.2d, v30.2d
zip2v7.2d, v28.2d, v30.2d
@@ -451,29 +482,155 @@ ENTRY(chacha_4block_xor_neon)
ld1 {v28.16b-v31.16b}, [x2]
 
// xor with corresponding input, write to output
+   tbnzx5, #63, 0f
eor v16.16b, v16.16b, v0.16b
eor v17.16b, v17.16b, v1.16b
eor v18.16b, v18.16b, v2.16b
eor v19.16b, v19.16b, v3.16b
+   st1 {v16.16b-v19.16b}, [x1], #64
+
+   tbnzx6, #63, 1f
eor v20.16b, v20.16b, v4.16b
eor v21.16b, v21.16b, v5.16b

[PATCH] crypto: cavium/nitrox - Enabled Mailbox support

2018-12-04 Thread Srikanth, Jampala
Enabled the PF->VF Mailbox support. Mailbox messages are interpreted
as {type, opcode, data}. Supported message types are REQ, ACK and NACK.
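
On the PF side, handling an incoming message then amounts to switching on
the type field of the union shown below.  A hypothetical sketch only (the
real handler lives in nitrox_mbx.c, and the actual type values and names
may differ):

	/* Illustrative type codes; the driver defines the real ones */
	enum { MBX_REQ = 1, MBX_ACK = 2, MBX_NACK = 3 };

	static void pf_handle_mbox_msg(struct nitrox_device *ndev, int vfno,
				       union mbox_msg msg)
	{
		switch (msg.type) {
		case MBX_REQ:	/* VF request: decode opcode/data, reply ACK or NACK */
			break;
		case MBX_ACK:	/* VF acknowledged a PF->VF message */
		case MBX_NACK:	/* VF rejected a PF->VF message */
			break;
		}
	}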

Signed-off-by: Srikanth Jampala 
---
 drivers/crypto/cavium/nitrox/Makefile |   3 +-
 drivers/crypto/cavium/nitrox/nitrox_csr.h |  12 +-
 drivers/crypto/cavium/nitrox/nitrox_debugfs.h |  22 ++
 drivers/crypto/cavium/nitrox/nitrox_dev.h |  61 +-
 drivers/crypto/cavium/nitrox/nitrox_hal.c | 114 +++---
 drivers/crypto/cavium/nitrox/nitrox_hal.h |   2 +
 drivers/crypto/cavium/nitrox/nitrox_isr.c |   8 +-
 drivers/crypto/cavium/nitrox/nitrox_main.c|   3 +-
 drivers/crypto/cavium/nitrox/nitrox_mbx.c | 204 ++
 drivers/crypto/cavium/nitrox/nitrox_mbx.h |   9 +
 drivers/crypto/cavium/nitrox/nitrox_sriov.c   |  57 -
 11 files changed, 441 insertions(+), 54 deletions(-)
 create mode 100644 drivers/crypto/cavium/nitrox/nitrox_debugfs.h
 create mode 100644 drivers/crypto/cavium/nitrox/nitrox_mbx.c
 create mode 100644 drivers/crypto/cavium/nitrox/nitrox_mbx.h

diff --git a/drivers/crypto/cavium/nitrox/Makefile 
b/drivers/crypto/cavium/nitrox/Makefile
index e12954791673..ad0546630ad8 100644
--- a/drivers/crypto/cavium/nitrox/Makefile
+++ b/drivers/crypto/cavium/nitrox/Makefile
@@ -6,7 +6,8 @@ n5pf-objs := nitrox_main.o \
nitrox_lib.o \
nitrox_hal.o \
nitrox_reqmgr.o \
-   nitrox_algs.o
+   nitrox_algs.o   \
+   nitrox_mbx.o
 
 n5pf-$(CONFIG_PCI_IOV) += nitrox_sriov.o
 n5pf-$(CONFIG_DEBUG_FS) += nitrox_debugfs.o
diff --git a/drivers/crypto/cavium/nitrox/nitrox_csr.h 
b/drivers/crypto/cavium/nitrox/nitrox_csr.h
index 1ad27b1a87c5..a2a452642b38 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_csr.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_csr.h
@@ -54,7 +54,13 @@
 #define NPS_STATS_PKT_DMA_WR_CNT   0x1000190
 
 /* NPS packet registers */
-#define NPS_PKT_INT0x1040018
+#define NPS_PKT_INT0x1040018
+#define NPS_PKT_MBOX_INT_LO0x1040020
+#define NPS_PKT_MBOX_INT_LO_ENA_W1C0x1040030
+#define NPS_PKT_MBOX_INT_LO_ENA_W1S0x1040038
+#define NPS_PKT_MBOX_INT_HI0x1040040
+#define NPS_PKT_MBOX_INT_HI_ENA_W1C0x1040050
+#define NPS_PKT_MBOX_INT_HI_ENA_W1S0x1040058
 #define NPS_PKT_IN_RERR_HI 0x1040108
 #define NPS_PKT_IN_RERR_HI_ENA_W1S 0x1040120
 #define NPS_PKT_IN_RERR_LO 0x1040128
@@ -74,6 +80,10 @@
 #define NPS_PKT_SLC_RERR_LO_ENA_W1S0x1040240
 #define NPS_PKT_SLC_ERR_TYPE   0x1040248
 #define NPS_PKT_SLC_ERR_TYPE_ENA_W1S   0x1040260
+/* Mailbox PF->VF PF Accessible Data registers */
+#define NPS_PKT_MBOX_PF_VF_PFDATAX(_i) (0x1040800 + ((_i) * 0x8))
+#define NPS_PKT_MBOX_VF_PF_PFDATAX(_i) (0x1040C00 + ((_i) * 0x8))
+
 #define NPS_PKT_SLC_CTLX(_i)   (0x1 + ((_i) * 0x4))
 #define NPS_PKT_SLC_CNTSX(_i)  (0x10008 + ((_i) * 0x4))
 #define NPS_PKT_SLC_INT_LEVELSX(_i)(0x10010 + ((_i) * 0x4))
diff --git a/drivers/crypto/cavium/nitrox/nitrox_debugfs.h 
b/drivers/crypto/cavium/nitrox/nitrox_debugfs.h
new file mode 100644
index 000000000000..7b701ea6227a
--- /dev/null
+++ b/drivers/crypto/cavium/nitrox/nitrox_debugfs.h
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __NITROX_DEBUGFS_H
+#define __NITROX_DEBUGFS_H
+
+#include "nitrox_dev.h"
+
+#ifdef CONFIG_DEBUG_FS
+int nitrox_debugfs_init(struct nitrox_device *ndev);
+void nitrox_debugfs_exit(struct nitrox_device *ndev);
+#else
+static inline int nitrox_debugfs_init(struct nitrox_device *ndev)
+{
+   return 0;
+}
+
+static inline int nitrox_sriov_debugfs_init(struct nitrox_device *ndev)
+{
+   return 0;
+}
+#endif /* !CONFIG_DEBUG_FS */
+
+#endif /* __NITROX_DEBUGFS_H */
diff --git a/drivers/crypto/cavium/nitrox/nitrox_dev.h 
b/drivers/crypto/cavium/nitrox/nitrox_dev.h
index 247df32f687c..0338877b828f 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_dev.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_dev.h
@@ -8,6 +8,8 @@
 #include 
 
 #define VERSION_LEN 32
+/* Maximum queues in PF mode */
+#define MAX_PF_QUEUES  64
 
 /**
  * struct nitrox_cmdq - NITROX command queue
@@ -103,13 +105,58 @@ struct nitrox_q_vector {
};
 };
 
+/**
+ * mbox_msg - Mailbox message data
+ * @type: message type
+ * @opcode: message opcode
+ * @data: message data
+ */
+union mbox_msg {
+   u64 value;
+   struct {
+   u64 type: 2;
+   u64 opcode: 6;
+   u64 data: 58;
+   };
+   struct {
+   u64 type: 2;
+   u64 opcode: 6;
+   u64 chipid: 8;
+   u64 vfid: 8;
+   } id;
+};
+
+/**
+ * nitrox_vfdev - NITROX VF device instance in PF
+ * @state: VF device state
+ * @vfno: VF number
+ * @nr_queues: number of queues enabled in VF
+ * @ring: ring to communicate with VF
+ * @msg: Mailbox message data from VF
+ * @mbx_resp: Mailbox counters
+ */
+struct nitrox_vfdev {
+   atomic_t state;
+  

Re: [PATCH 2/3] dt-bindings: crypto: ccree: add dt bindings for ccree 703

2018-12-04 Thread Gilad Ben-Yossef
On Thu, Nov 29, 2018 at 8:42 AM Herbert Xu  wrote:
>
> On Tue, Nov 13, 2018 at 09:40:36AM +, Gilad Ben-Yossef wrote:
> > Add device tree bindings associating Arm TrustZone CryptoCell 703 with the
> > ccree driver.
> >
> > Signed-off-by: Gilad Ben-Yossef 
> > ---
> >  Documentation/devicetree/bindings/crypto/arm-cryptocell.txt | 1 +
> >  1 file changed, 1 insertion(+)
>
> Which tree is this patch meant to go through?

I'm not sure the question was addressed to me, but if it was, either
going through the device tree or the crypto tree is fine by me.

Thanks,
Gilad