[PATCH v3 2/6] crypto: x86/nhpoly1305 - add AVX2 accelerated NHPoly1305
From: Eric Biggers Add a 64-bit AVX2 implementation of NHPoly1305, an ε-almost-∆-universal hash function used in the Adiantum encryption mode. For now, only the NH portion is actually AVX2-accelerated; the Poly1305 part is less performance-critical so is just implemented in C. Signed-off-by: Eric Biggers --- arch/x86/crypto/Makefile | 3 + arch/x86/crypto/nh-avx2-x86_64.S | 157 + arch/x86/crypto/nhpoly1305-avx2-glue.c | 77 crypto/Kconfig | 8 ++ 4 files changed, 245 insertions(+) create mode 100644 arch/x86/crypto/nh-avx2-x86_64.S create mode 100644 arch/x86/crypto/nhpoly1305-avx2-glue.c diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 2a6acb4de373..0b31b16f49d8 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o +obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o # These modules require assembler to support AVX. 
ifeq ($(avx_supported),yes) @@ -106,6 +107,8 @@ ifeq ($(avx2_supported),yes) serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o + + nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o endif ifeq ($(avx512_supported),yes) diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S new file mode 100644 index ..f7946ea1b704 --- /dev/null +++ b/arch/x86/crypto/nh-avx2-x86_64.S @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NH - ε-almost-universal hash function, x86_64 AVX2 accelerated + * + * Copyright 2018 Google LLC + * + * Author: Eric Biggers + */ + +#include + +#definePASS0_SUMS %ymm0 +#definePASS1_SUMS %ymm1 +#definePASS2_SUMS %ymm2 +#definePASS3_SUMS %ymm3 +#defineK0 %ymm4 +#defineK0_XMM %xmm4 +#defineK1 %ymm5 +#defineK1_XMM %xmm5 +#defineK2 %ymm6 +#defineK2_XMM %xmm6 +#defineK3 %ymm7 +#defineK3_XMM %xmm7 +#defineT0 %ymm8 +#defineT1 %ymm9 +#defineT2 %ymm10 +#defineT2_XMM %xmm10 +#defineT3 %ymm11 +#defineT3_XMM %xmm11 +#defineT4 %ymm12 +#defineT5 %ymm13 +#defineT6 %ymm14 +#defineT7 %ymm15 +#defineKEY %rdi +#defineMESSAGE %rsi +#defineMESSAGE_LEN %rdx +#defineHASH%rcx + +.macro _nh_2xstridek0, k1, k2, k3 + + // Add message words to key words + vpaddd \k0, T3, T0 + vpaddd \k1, T3, T1 + vpaddd \k2, T3, T2 + vpaddd \k3, T3, T3 + + // Multiply 32x32 => 64 and accumulate + vpshufd $0x10, T0, T4 + vpshufd $0x32, T0, T0 + vpshufd $0x10, T1, T5 + vpshufd $0x32, T1, T1 + vpshufd $0x10, T2, T6 + vpshufd $0x32, T2, T2 + vpshufd $0x10, T3, T7 + vpshufd $0x32, T3, T3 + vpmuludqT4, T0, T0 + vpmuludqT5, T1, T1 + vpmuludqT6, T2, T2 + vpmuludqT7, T3, T3 + vpaddq T0, PASS0_SUMS, PASS0_SUMS + vpaddq T1, PASS1_SUMS, PASS1_SUMS + vpaddq T2, PASS2_SUMS, PASS2_SUMS + vpaddq T3, PASS3_SUMS, PASS3_SUMS +.endm + +/* + * void nh_avx2(const u32 *key, const u8 *message, size_t message_len, + * u8 hash[NH_HASH_BYTES]) + * + * It's guaranteed that message_len % 16 == 0. 
+ */ +ENTRY(nh_avx2) + + vmovdqu 0x00(KEY), K0 + vmovdqu 0x10(KEY), K1 + add $0x20, KEY + vpxor PASS0_SUMS, PASS0_SUMS, PASS0_SUMS + vpxor PASS1_SUMS, PASS1_SUMS, PASS1_SUMS + vpxor PASS2_SUMS, PASS2_SUMS, PASS2_SUMS + vpxor PASS3_SUMS, PASS3_SUMS, PASS3_SUMS + + sub $0x40, MESSAGE_LEN + jl .Lloop4_done +.Lloop4: + vmovdqu (MESSAGE), T3 + vmovdqu 0x00(KEY), K2 + vmovdqu 0x10(KEY), K3 + _nh_2xstrideK0, K1, K2, K3 + + vmovdqu 0x20(MESSAGE), T3 + vmovdqu 0x20(KEY), K0 + vmovdqu 0x30(KEY), K1 + _nh_2xstrideK2, K3, K0, K1 + + add $0x
[PATCH v3 1/6] crypto: x86/nhpoly1305 - add SSE2 accelerated NHPoly1305
From: Eric Biggers Add a 64-bit SSE2 implementation of NHPoly1305, an ε-almost-∆-universal hash function used in the Adiantum encryption mode. For now, only the NH portion is actually SSE2-accelerated; the Poly1305 part is less performance-critical so is just implemented in C. Signed-off-by: Eric Biggers --- arch/x86/crypto/Makefile | 4 + arch/x86/crypto/nh-sse2-x86_64.S | 123 + arch/x86/crypto/nhpoly1305-sse2-glue.c | 76 +++ crypto/Kconfig | 8 ++ 4 files changed, 211 insertions(+) create mode 100644 arch/x86/crypto/nh-sse2-x86_64.S create mode 100644 arch/x86/crypto/nhpoly1305-sse2-glue.c diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index ce4e43642984..2a6acb4de373 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -47,6 +47,8 @@ obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o +obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o + # These modules require assembler to support AVX. 
ifeq ($(avx_supported),yes) obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \ @@ -85,6 +87,8 @@ aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o +nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o + ifeq ($(avx_supported),yes) camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ camellia_aesni_avx_glue.o diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S new file mode 100644 index ..51f52d4ab4bb --- /dev/null +++ b/arch/x86/crypto/nh-sse2-x86_64.S @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated + * + * Copyright 2018 Google LLC + * + * Author: Eric Biggers + */ + +#include + +#definePASS0_SUMS %xmm0 +#definePASS1_SUMS %xmm1 +#definePASS2_SUMS %xmm2 +#definePASS3_SUMS %xmm3 +#defineK0 %xmm4 +#defineK1 %xmm5 +#defineK2 %xmm6 +#defineK3 %xmm7 +#defineT0 %xmm8 +#defineT1 %xmm9 +#defineT2 %xmm10 +#defineT3 %xmm11 +#defineT4 %xmm12 +#defineT5 %xmm13 +#defineT6 %xmm14 +#defineT7 %xmm15 +#defineKEY %rdi +#defineMESSAGE %rsi +#defineMESSAGE_LEN %rdx +#defineHASH%rcx + +.macro _nh_stride k0, k1, k2, k3, offset + + // Load next message stride + movdqu \offset(MESSAGE), T1 + + // Load next key stride + movdqu \offset(KEY), \k3 + + // Add message words to key words + movdqa T1, T2 + movdqa T1, T3 + paddd T1, \k0// reuse k0 to avoid a move + paddd \k1, T1 + paddd \k2, T2 + paddd \k3, T3 + + // Multiply 32x32 => 64 and accumulate + pshufd $0x10, \k0, T4 + pshufd $0x32, \k0, \k0 + pshufd $0x10, T1, T5 + pshufd $0x32, T1, T1 + pshufd $0x10, T2, T6 + pshufd $0x32, T2, T2 + pshufd $0x10, T3, T7 + pshufd $0x32, T3, T3 + pmuludq T4, \k0 + pmuludq T5, T1 + pmuludq T6, T2 + pmuludq T7, T3 + paddq \k0, PASS0_SUMS + paddq T1, PASS1_SUMS + paddq T2, PASS2_SUMS + paddq T3, PASS3_SUMS +.endm + +/* + * void nh_sse2(const u32 *key, 
const u8 *message, size_t message_len, + * u8 hash[NH_HASH_BYTES]) + * + * It's guaranteed that message_len % 16 == 0. + */ +ENTRY(nh_sse2) + + movdqu 0x00(KEY), K0 + movdqu 0x10(KEY), K1 + movdqu 0x20(KEY), K2 + add $0x30, KEY + pxorPASS0_SUMS, PASS0_SUMS + pxorPASS1_SUMS, PASS1_SUMS + pxorPASS2_SUMS, PASS2_SUMS + pxorPASS3_SUMS, PASS3_SUMS + + sub $0x40, MESSAGE_LEN + jl .Lloop4_done +.Lloop4: + _nh_stride K0, K1, K2, K3, 0x00 + _nh_stride K1, K2, K3, K0, 0x10 + _nh_stride K2, K3, K0, K1, 0x20 + _nh_stride K3, K0, K1, K2, 0x30 + add $0x40, KEY + add $0x40, MES
[PATCH v3 4/6] crypto: x86/chacha20 - refactor to allow varying number of rounds
From: Eric Biggers In preparation for adding XChaCha12 support, rename/refactor the x86_64 SIMD implementations of ChaCha20 to support different numbers of rounds. Reviewed-by: Martin Willi Signed-off-by: Eric Biggers --- arch/x86/crypto/Makefile | 8 +- ...a20-avx2-x86_64.S => chacha-avx2-x86_64.S} | 33 ++-- ...12vl-x86_64.S => chacha-avx512vl-x86_64.S} | 35 ++-- ...0-ssse3-x86_64.S => chacha-ssse3-x86_64.S} | 41 ++--- .../crypto/{chacha20_glue.c => chacha_glue.c} | 150 +- 5 files changed, 136 insertions(+), 131 deletions(-) rename arch/x86/crypto/{chacha20-avx2-x86_64.S => chacha-avx2-x86_64.S} (97%) rename arch/x86/crypto/{chacha20-avx512vl-x86_64.S => chacha-avx512vl-x86_64.S} (97%) rename arch/x86/crypto/{chacha20-ssse3-x86_64.S => chacha-ssse3-x86_64.S} (96%) rename arch/x86/crypto/{chacha20_glue.c => chacha_glue.c} (51%) diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 0b31b16f49d8..45734e1cf967 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o -obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o +obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o @@ -78,7 +78,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o -chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o +chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o serpent-sse2-x86_64-y := 
serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o @@ -103,7 +103,7 @@ endif ifeq ($(avx2_supported),yes) camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o - chacha20-x86_64-y += chacha20-avx2-x86_64.o + chacha-x86_64-y += chacha-avx2-x86_64.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o @@ -112,7 +112,7 @@ ifeq ($(avx2_supported),yes) endif ifeq ($(avx512_supported),yes) - chacha20-x86_64-y += chacha20-avx512vl-x86_64.o + chacha-x86_64-y += chacha-avx512vl-x86_64.o endif aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S similarity index 97% rename from arch/x86/crypto/chacha20-avx2-x86_64.S rename to arch/x86/crypto/chacha-avx2-x86_64.S index b6ab082be657..32903fd450af 100644 --- a/arch/x86/crypto/chacha20-avx2-x86_64.S +++ b/arch/x86/crypto/chacha-avx2-x86_64.S @@ -1,5 +1,5 @@ /* - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions + * ChaCha 256-bit cipher algorithm, x64 AVX2 functions * * Copyright (C) 2015 Martin Willi * @@ -38,13 +38,14 @@ CTR4BL: .octa 0x0002 .text -ENTRY(chacha20_2block_xor_avx2) +ENTRY(chacha_2block_xor_avx2) # %rdi: Input state matrix, s # %rsi: up to 2 data blocks output, o # %rdx: up to 2 data blocks input, i # %rcx: input/output length in bytes + # %r8d: nrounds - # This function encrypts two ChaCha20 blocks by loading the state + # This function encrypts two ChaCha blocks by loading the state # matrix twice across four AVX registers. It performs matrix operations # on four words in each matrix in parallel, but requires shuffling to # rearrange the words after each round. 
@@ -68,7 +69,6 @@ ENTRY(chacha20_2block_xor_avx2) vmovdqa ROT16(%rip),%ymm5 mov %rcx,%rax - mov $10,%ecx .Ldoubleround: @@ -138,7 +138,7 @@ ENTRY(chacha20_2block_xor_avx2) # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) vpshufd $0x39,%ymm3,%ymm3 - dec %ecx + sub $2,%r8d jnz .Ldoubleround # o0 = i0 ^ (x0 + s0) @@ -228,15 +228,16 @@ ENTRY(chacha20_2block_xor_avx2) lea -8(%r10),%rsp jmp .Ldone2 -ENDPROC(chacha20_2block_xor_avx2) +ENDPROC(chacha_2block_xor_avx2) -ENTRY(chacha20_4block_xor_avx2) +ENTRY(chacha_4block_xor_avx2) # %rdi: Input state matrix, s # %rsi: up to 4 data blocks output, o # %rdx: up to 4 data blocks input, i # %rcx: input/output length in bytes + # %r8d: nrounds - # This function encrypts four ChaCh
[PATCH v3 5/6] crypto: x86/chacha - add XChaCha12 support
From: Eric Biggers Now that the x86_64 SIMD implementations of ChaCha20 and XChaCha20 have been refactored to support varying the number of rounds, add support for XChaCha12. This is identical to XChaCha20 except for the number of rounds, which is 12 instead of 20. This can be used by Adiantum. Reviewed-by: Martin Willi Signed-off-by: Eric Biggers --- arch/x86/crypto/chacha_glue.c | 17 + crypto/Kconfig| 4 ++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 35fd02b50d27..d19c2908be90 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -232,6 +232,21 @@ static struct skcipher_alg algs[] = { .setkey = crypto_chacha20_setkey, .encrypt= xchacha_simd, .decrypt= xchacha_simd, + }, { + .base.cra_name = "xchacha12", + .base.cra_driver_name = "xchacha12-simd", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module= THIS_MODULE, + + .min_keysize= CHACHA_KEY_SIZE, + .max_keysize= CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha12_setkey, + .encrypt= xchacha_simd, + .decrypt= xchacha_simd, }, }; @@ -268,3 +283,5 @@ MODULE_ALIAS_CRYPTO("chacha20"); MODULE_ALIAS_CRYPTO("chacha20-simd"); MODULE_ALIAS_CRYPTO("xchacha20"); MODULE_ALIAS_CRYPTO("xchacha20-simd"); +MODULE_ALIAS_CRYPTO("xchacha12"); +MODULE_ALIAS_CRYPTO("xchacha12-simd"); diff --git a/crypto/Kconfig b/crypto/Kconfig index df466771e9bf..29865c599b04 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1473,8 +1473,8 @@ config CRYPTO_CHACHA20_X86_64 select CRYPTO_BLKCIPHER select CRYPTO_CHACHA20 help - SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20 - and XChaCha20 stream ciphers. + SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20, + XChaCha20, and XChaCha12 stream ciphers. config CRYPTO_SEED tristate "SEED cipher algorithm" -- 2.19.2
[PATCH v3 6/6] crypto: x86/chacha - yield the FPU occasionally
From: Eric Biggers To improve responsiveness, yield the FPU (temporarily re-enabling preemption) every 4 KiB encrypted/decrypted, rather than keeping preemption disabled during the entire encryption/decryption operation. Alternatively we could do this for every skcipher_walk step, but steps may be small in some cases, and yielding the FPU is expensive on x86. Suggested-by: Martin Willi Signed-off-by: Eric Biggers --- arch/x86/crypto/chacha_glue.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index d19c2908be90..9b1d3fac4943 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -132,6 +132,7 @@ static int chacha_simd_stream_xor(struct skcipher_request *req, { u32 *state, state_buf[16 + 2] __aligned(8); struct skcipher_walk walk; + int next_yield = 4096; /* bytes until next FPU yield */ int err; BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); @@ -144,12 +145,21 @@ static int chacha_simd_stream_xor(struct skcipher_request *req, while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; - if (nbytes < walk.total) + if (nbytes < walk.total) { nbytes = round_down(nbytes, walk.stride); + next_yield -= nbytes; + } chacha_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, nbytes, ctx->nrounds); + if (next_yield <= 0) { + /* temporarily allow preemption */ + kernel_fpu_end(); + kernel_fpu_begin(); + next_yield = 4096; + } + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } -- 2.19.2
[PATCH v3 3/6] crypto: x86/chacha20 - add XChaCha20 support
From: Eric Biggers Add an XChaCha20 implementation that is hooked up to the x86_64 SIMD implementations of ChaCha20. This can be used by Adiantum. An SSSE3 implementation of single-block HChaCha20 is also added so that XChaCha20 can use it rather than the generic implementation. This required refactoring the ChaCha permutation into its own function. Signed-off-by: Eric Biggers --- arch/x86/crypto/chacha20-ssse3-x86_64.S | 81 -- arch/x86/crypto/chacha20_glue.c | 108 ++-- crypto/Kconfig | 12 +-- 3 files changed, 141 insertions(+), 60 deletions(-) diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S index d8ac75bb448f..f6792789f875 100644 --- a/arch/x86/crypto/chacha20-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -10,6 +10,7 @@ */ #include +#include .section .rodata.cst16.ROT8, "aM", @progbits, 16 .align 16 @@ -23,37 +24,24 @@ CTRINC: .octa 0x000300020001 .text -ENTRY(chacha20_block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 1 data block output, o - # %rdx: up to 1 data block input, i - # %rcx: input/output length in bytes - - # This function encrypts one ChaCha20 block by loading the state matrix - # in four SSE registers. It performs matrix operation on four words in - # parallel, but requires shuffling to rearrange the words after each - # round. 8/16-bit word rotation is done with the slightly better - # performing SSSE3 byte shuffling, 7/12-bit word rotation uses - # traditional shift+OR. - - # x0..3 = s0..3 - movdqa 0x00(%rdi),%xmm0 - movdqa 0x10(%rdi),%xmm1 - movdqa 0x20(%rdi),%xmm2 - movdqa 0x30(%rdi),%xmm3 - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - movdqa %xmm2,%xmm10 - movdqa %xmm3,%xmm11 +/* + * chacha20_permute - permute one block + * + * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This + * function performs matrix operations on four words in parallel, but requires + * shuffling to rearrange the words after each round. 
8/16-bit word rotation is + * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word + * rotation uses traditional shift+OR. + * + * Clobbers: %ecx, %xmm4-%xmm7 + */ +chacha20_permute: movdqa ROT8(%rip),%xmm4 movdqa ROT16(%rip),%xmm5 - - mov %rcx,%rax mov $10,%ecx .Ldoubleround: - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) paddd %xmm1,%xmm0 pxor%xmm0,%xmm3 @@ -123,6 +111,29 @@ ENTRY(chacha20_block_xor_ssse3) dec %ecx jnz .Ldoubleround + ret +ENDPROC(chacha20_permute) + +ENTRY(chacha20_block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: up to 1 data block output, o + # %rdx: up to 1 data block input, i + # %rcx: input/output length in bytes + FRAME_BEGIN + + # x0..3 = s0..3 + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + movdqa %xmm0,%xmm8 + movdqa %xmm1,%xmm9 + movdqa %xmm2,%xmm10 + movdqa %xmm3,%xmm11 + + mov %rcx,%rax + callchacha20_permute + # o0 = i0 ^ (x0 + s0) paddd %xmm8,%xmm0 cmp $0x10,%rax @@ -156,6 +167,7 @@ ENTRY(chacha20_block_xor_ssse3) movdqu %xmm0,0x30(%rsi) .Ldone: + FRAME_END ret .Lxorpart: @@ -189,6 +201,25 @@ ENTRY(chacha20_block_xor_ssse3) ENDPROC(chacha20_block_xor_ssse3) +ENTRY(hchacha20_block_ssse3) + # %rdi: Input state matrix, s + # %rsi: output (8 32-bit words) + FRAME_BEGIN + + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + + callchacha20_permute + + movdqu %xmm0,0x00(%rsi) + movdqu %xmm3,0x10(%rsi) + + FRAME_END + ret +ENDPROC(hchacha20_block_ssse3) + ENTRY(chacha20_4block_xor_ssse3) # %rdi: Input state matrix, s # %rsi: up to 4 data blocks output, o diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 773d075a1483..70d388e4a3a2 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -23,6 +23,7 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len); asmlinkage void chacha20_4block_xor_ssse3(u32 
*state, u8 *dst, const u8 *src,
[PATCH v3 0/6] crypto: x86_64 optimized XChaCha and NHPoly1305 (for Adiantum)
Hello, This series optimizes the Adiantum encryption mode for x86_64 by adding SSE2 and AVX2 accelerated implementations of NHPoly1305, specifically the NH part; and by modifying the existing x86_64 SSSE3/AVX2/AVX-512VL implementation of ChaCha20 to support XChaCha20 and XChaCha12. This greatly improves Adiantum performance on x86_64. For example, encrypting 4096-byte messages (single-threaded) on a Skylake-based processor (Intel Xeon, supports AVX-512VL and AVX2): BeforeAfter - adiantum(xchacha12,aes)348 MB/s 1493 MB/s adiantum(xchacha20,aes)266 MB/s 1261 MB/s And on a Zen-based processor (Threadripper 1950X, supports AVX2): BeforeAfter - adiantum(xchacha12,aes)505 MB/s 1292 MB/s adiantum(xchacha20,aes)387 MB/s 1037 MB/s Decryption is almost exactly the same speed as encryption. The biggest benefit comes from accelerating XChaCha. Accelerating NH gives a somewhat smaller, but still significant benefit. Performance on 512-byte inputs is also improved, though that is much slower in the first place. When Adiantum is used with dm-crypt (or cryptsetup), we recommend using a 4096-byte sector size. For comparison, AES-256-XTS is 2710 MB/s on the Skylake CPU and 4140 MB/s on the Zen CPU. However, AES has the benefit of direct AES-NI hardware support whereas Adiantum is implemented entirely with general-purpose instructions (scalar and SIMD). Adiantum is also a super-pseudorandom permutation over the entire sector, unlike XTS. Note that XChaCha20 and XChaCha12 can be used for other purposes too. Changed since v2: - Yield the FPU once per 4096 bytes rather than once per skcipher_walk step. - Create full stack frame in hchacha_block_ssse3() and chacha_block_xor_ssse3(). Changed since v1: - Rebase on top of latest cryptodev with the AVX-512VL accelerated ChaCha20 from Martin Willi. 
Eric Biggers (6): crypto: x86/nhpoly1305 - add SSE2 accelerated NHPoly1305 crypto: x86/nhpoly1305 - add AVX2 accelerated NHPoly1305 crypto: x86/chacha20 - add XChaCha20 support crypto: x86/chacha20 - refactor to allow varying number of rounds crypto: x86/chacha - add XChaCha12 support crypto: x86/chacha - yield the FPU occasionally arch/x86/crypto/Makefile | 15 +- ...a20-avx2-x86_64.S => chacha-avx2-x86_64.S} | 33 +- ...12vl-x86_64.S => chacha-avx512vl-x86_64.S} | 35 +-- ...0-ssse3-x86_64.S => chacha-ssse3-x86_64.S} | 104 +++--- arch/x86/crypto/chacha20_glue.c | 208 arch/x86/crypto/chacha_glue.c | 297 ++ arch/x86/crypto/nh-avx2-x86_64.S | 157 + arch/x86/crypto/nh-sse2-x86_64.S | 123 arch/x86/crypto/nhpoly1305-avx2-glue.c| 77 + arch/x86/crypto/nhpoly1305-sse2-glue.c| 76 + crypto/Kconfig| 28 +- 11 files changed, 861 insertions(+), 292 deletions(-) rename arch/x86/crypto/{chacha20-avx2-x86_64.S => chacha-avx2-x86_64.S} (97%) rename arch/x86/crypto/{chacha20-avx512vl-x86_64.S => chacha-avx512vl-x86_64.S} (97%) rename arch/x86/crypto/{chacha20-ssse3-x86_64.S => chacha-ssse3-x86_64.S} (92%) delete mode 100644 arch/x86/crypto/chacha20_glue.c create mode 100644 arch/x86/crypto/chacha_glue.c create mode 100644 arch/x86/crypto/nh-avx2-x86_64.S create mode 100644 arch/x86/crypto/nh-sse2-x86_64.S create mode 100644 arch/x86/crypto/nhpoly1305-avx2-glue.c create mode 100644 arch/x86/crypto/nhpoly1305-sse2-glue.c -- 2.19.2
Re: [PATCH v2 3/6] crypto: x86/chacha20 - limit the preemption-disabled section
On Mon, Dec 03, 2018 at 03:13:37PM +0100, Ard Biesheuvel wrote: > On Sun, 2 Dec 2018 at 11:47, Martin Willi wrote: > > > > > > > To improve responsiveness, disable preemption for each step of the > > > walk (which is at most PAGE_SIZE) rather than for the entire > > > encryption/decryption operation. > > > > It seems that it is not that uncommon for IPsec to get small inputs > > scattered over multiple blocks. Doing FPU context saving for each walk > > step then can slow down things. > > > > An alternative approach could be to re-enable preemption not based on > > the walk steps, but on the amount of bytes processed. This would > > satisfy both users, I guess. > > > > In the long run we probably need a better approach for FPU context > > saving, as this really hurts performance-wise. For IPsec we should find > > a way to avoid the (multiple) per-packet FPU save/restores in softirq > > context, but I guess this requires support from process context > > switching. > > > > At Jason's Zinc talk at plumbers, this came up, and apparently someone > is working on this, i.e., to ensure that on x86, the FPU restore only > occurs lazily, when returning to userland rather than every time you > call kernel_fpu_end() [like we do on arm64 as well] > > Not sure what the ETA for that work is, though, nor did I get the name > of the guy working on it. Thanks for the suggestion; I'll replace this with a patch that re-enables preemption every 4 KiB encrypted. That also avoids having to do a kernel_fpu_begin(), kernel_fpu_end() pair just for hchacha_block_ssse3(). But yes, I'd definitely like repeated kernel_fpu_begin(), kernel_fpu_end() to not be incredibly slow. That would help in a lot of other places too. - Eric
Re: [PATCH v2 4/6] crypto: x86/chacha20 - add XChaCha20 support
Hi Martin, On Sat, Dec 01, 2018 at 05:40:40PM +0100, Martin Willi wrote: > > > An SSSE3 implementation of single-block HChaCha20 is also added so > > that XChaCha20 can use it rather than the generic > > implementation. This required refactoring the ChaCha permutation > > into its own function. > > > [...] > > > +ENTRY(chacha20_block_xor_ssse3) > > + # %rdi: Input state matrix, s > > + # %rsi: up to 1 data block output, o > > + # %rdx: up to 1 data block input, i > > + # %rcx: input/output length in bytes > > + > > + # x0..3 = s0..3 > > + movdqa 0x00(%rdi),%xmm0 > > + movdqa 0x10(%rdi),%xmm1 > > + movdqa 0x20(%rdi),%xmm2 > > + movdqa 0x30(%rdi),%xmm3 > > + movdqa %xmm0,%xmm8 > > + movdqa %xmm1,%xmm9 > > + movdqa %xmm2,%xmm10 > > + movdqa %xmm3,%xmm11 > > + > > + mov %rcx,%rax > > + callchacha20_permute > > + > > # o0 = i0 ^ (x0 + s0) > > paddd %xmm8,%xmm0 > > cmp $0x10,%rax > > @@ -189,6 +198,23 @@ ENTRY(chacha20_block_xor_ssse3) > > > > ENDPROC(chacha20_block_xor_ssse3) > > > > +ENTRY(hchacha20_block_ssse3) > > + # %rdi: Input state matrix, s > > + # %rsi: output (8 32-bit words) > > + > > + movdqa 0x00(%rdi),%xmm0 > > + movdqa 0x10(%rdi),%xmm1 > > + movdqa 0x20(%rdi),%xmm2 > > + movdqa 0x30(%rdi),%xmm3 > > + > > + callchacha20_permute > > AFAIK, the general convention is to create proper stack frames using > FRAME_BEGIN/END for non leaf-functions. Should chacha20_permute() > callers do so? > Yes, I'll do that. (Ard suggested similarly in the arm64 version too.) - Eric
Re: [PATCH 2/3] dt-bindings: crypto: ccree: add dt bindings for ccree 703
On Tue, Dec 04, 2018 at 02:14:19PM -0600, Rob Herring wrote: > On Thu, Nov 29, 2018 at 02:42:18PM +0800, Herbert Xu wrote: > > On Tue, Nov 13, 2018 at 09:40:36AM +, Gilad Ben-Yossef wrote: > > > Add device tree bindings associating Arm TrustZone CryptoCell 703 with the > > > ccree driver. > > > > > > Signed-off-by: Gilad Ben-Yossef > > > --- > > > Documentation/devicetree/bindings/crypto/arm-cryptocell.txt | 1 + > > > 1 file changed, 1 insertion(+) > > > > Which tree is this patch meant to go through? > > You should take it as it is part of a series. Thanks Rob! -- Email: Herbert Xu Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Using Advanced Vector eXtensions with hand-coded x64 algorithms (e.g /arch/x86/blowfish-x86_64-asm_64.S)
I was curious if it might make implementing F() faster to use instructions that are meant to work with sets of data similar to what would be processed
[PATCH] crypto: adiantum - propagate CRYPTO_ALG_ASYNC flag to instance
From: Eric Biggers If the stream cipher implementation is asynchronous, then the Adiantum instance must be flagged as asynchronous as well. Otherwise someone asking for a synchronous algorithm can get an asynchronous algorithm. There are no asynchronous xchacha12 or xchacha20 implementations yet which makes this largely a theoretical issue, but it should be fixed. Fixes: 059c2a4d8e16 ("crypto: adiantum - add Adiantum support") Signed-off-by: Eric Biggers --- crypto/adiantum.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crypto/adiantum.c b/crypto/adiantum.c index 2dfcf12fd4529..ca27e0dc2958c 100644 --- a/crypto/adiantum.c +++ b/crypto/adiantum.c @@ -590,6 +590,8 @@ static int adiantum_create(struct crypto_template *tmpl, struct rtattr **tb) hash_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto out_drop_hash; + inst->alg.base.cra_flags = streamcipher_alg->base.cra_flags & + CRYPTO_ALG_ASYNC; inst->alg.base.cra_blocksize = BLOCKCIPHER_BLOCK_SIZE; inst->alg.base.cra_ctxsize = sizeof(struct adiantum_tfm_ctx); inst->alg.base.cra_alignmask = streamcipher_alg->base.cra_alignmask | -- 2.20.0.rc1.387.gf8505762e3-goog
Re: [PATCH] fscrypt: remove CRYPTO_CTR dependency
On Thu, Sep 06, 2018 at 12:43:41PM +0200, Ard Biesheuvel wrote: > On 5 September 2018 at 21:24, Eric Biggers wrote: > > From: Eric Biggers > > > > fscrypt doesn't use the CTR mode of operation for anything, so there's > > no need to select CRYPTO_CTR. It was added by commit 71dea01ea2ed > > ("ext4 crypto: require CONFIG_CRYPTO_CTR if ext4 encryption is > > enabled"). But, I've been unable to identify the arm64 crypto bug it > > was supposedly working around. > > > > I suspect the issue was seen only on some old Android device kernel > > (circa 3.10?). So if the fix wasn't mistaken, the real bug is probably > > already fixed. Or maybe it was actually a bug in a non-upstream crypto > > driver. > > > > So, remove the dependency. If it turns out there's actually still a > > bug, we'll fix it properly. > > > > Signed-off-by: Eric Biggers > > Acked-by: Ard Biesheuvel > > This may be related to > > 11e3b725cfc2 crypto: arm64/aes-blk - honour iv_out requirement in CBC > and CTR modes > > given that the commit in question mentions CTS. How it actually works > around the issue is unclear to me, though. > > > > > > --- > > fs/crypto/Kconfig | 1 - > > 1 file changed, 1 deletion(-) > > > > diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig > > index 02b7d91c92310..284b589b4774d 100644 > > --- a/fs/crypto/Kconfig > > +++ b/fs/crypto/Kconfig > > @@ -6,7 +6,6 @@ config FS_ENCRYPTION > > select CRYPTO_ECB > > select CRYPTO_XTS > > select CRYPTO_CTS > > - select CRYPTO_CTR > > select CRYPTO_SHA256 > > select KEYS > > help > > -- > > 2.19.0.rc2.392.g5ba43deb5a-goog > > Ping. Ted, can you consider applying this to the fscrypt tree for 4.21? Thanks, - Eric
Re: [PATCH v8 00/14] Appended signatures support for IMA appraisal
Hello James, Thanks for your interest in these patches. James Morris writes: > On Fri, 16 Nov 2018, Thiago Jung Bauermann wrote: > >> On the OpenPOWER platform, secure boot and trusted boot are being >> implemented using IMA for taking measurements and verifying signatures. >> Since the kernel image on Power servers is an ELF binary, kernels are >> signed using the scripts/sign-file tool and thus use the same signature >> format as signed kernel modules. >> >> This patch series adds support in IMA for verifying those signatures. > > Are you saying you use IMA to verify kernels during boot? From a Linux > bootloader? Yes to both. OpenPOWER machines have embedded in their firmware a Linux kernel and initramfs to use as bootloader, using Petitboot. kexec is used to load the OS and boot it. >> It adds flexibility to OpenPOWER secure boot, because it allows it to boot >> kernels with the signature appended to them as well as kernels where the >> signature is stored in the IMA extended attribute. > > Just to clarify, with these patches, IMA will be able to verify the > native form of signed kernel modules? That wasn't my use case to develop the patches, but I just tested and it works. I just had to make a slight modification: there's a whitelist of IMA hooks that are allowed to use the module signature format (in the ima_hook_supports_modsig function), and I had to add MODULE_CHECK to it. The next version of the patches will have this change. The only difference is that IMA looks for a valid key in the IMA keyring, while the CONFIG_MODULE_SIG code looks for the module signing key in the builtin and secondary trusted keyrings. > i.e. without xattrs at all, and > this will work with existing signed modules? No xattrs at all, and yes. -- Thiago Jung Bauermann IBM Linux Technology Center
[tip:core/rcu] crypto/pcrypt: Replace synchronize_rcu_bh() with synchronize_rcu()
Commit-ID: a0076e1778c23de4a42d90fee4ecb4c21dbb5838 Gitweb: https://git.kernel.org/tip/a0076e1778c23de4a42d90fee4ecb4c21dbb5838 Author: Paul E. McKenney AuthorDate: Mon, 5 Nov 2018 16:57:40 -0800 Committer: Paul E. McKenney CommitDate: Tue, 27 Nov 2018 09:18:59 -0800 crypto/pcrypt: Replace synchronize_rcu_bh() with synchronize_rcu() Now that synchronize_rcu() waits for bh-disable regions of code as well as RCU read-side critical sections, the synchronize_rcu_bh() in pcrypt_cpumask_change_notify() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Steffen Klassert Cc: Acked-by: Herbert Xu --- crypto/pcrypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c index f8ec3d4ba4a8..8eb3c4c9ff67 100644 --- a/crypto/pcrypt.c +++ b/crypto/pcrypt.c @@ -382,7 +382,7 @@ static int pcrypt_cpumask_change_notify(struct notifier_block *self, cpumask_copy(new_mask->mask, cpumask->cbcpu); rcu_assign_pointer(pcrypt->cb_cpumask, new_mask); - synchronize_rcu_bh(); + synchronize_rcu(); free_cpumask_var(old_mask->mask); kfree(old_mask);
Re: [PATCH v8 00/14] Appended signatures support for IMA appraisal
On Fri, 16 Nov 2018, Thiago Jung Bauermann wrote: > On the OpenPOWER platform, secure boot and trusted boot are being > implemented using IMA for taking measurements and verifying signatures. > Since the kernel image on Power servers is an ELF binary, kernels are > signed using the scripts/sign-file tool and thus use the same signature > format as signed kernel modules. > > This patch series adds support in IMA for verifying those signatures. Are you saying you use IMA to verify kernels during boot? From a Linux bootloader? > It adds flexibility to OpenPOWER secure boot, because it allows it to boot > kernels with the signature appended to them as well as kernels where the > signature is stored in the IMA extended attribute. Just to clarify, with these patches, IMA will be able to verify the native form of signed kernel modules? i.e. without xattrs at all, and this will work with existing signed modules? -- James Morris
Re: [PATCH 2/3] dt-bindings: crypto: ccree: add dt bindings for ccree 703
On Thu, Nov 29, 2018 at 02:42:18PM +0800, Herbert Xu wrote: > On Tue, Nov 13, 2018 at 09:40:36AM +0000, Gilad Ben-Yossef wrote: > > Add device tree bindings associating Arm TrustZone CryptoCell 703 with the > > ccree driver. > > > > Signed-off-by: Gilad Ben-Yossef > > --- > > Documentation/devicetree/bindings/crypto/arm-cryptocell.txt | 1 + > > 1 file changed, 1 insertion(+) > > Which tree is this patch meant to go through? You should take it as it is part of a series. Rob
Re: [PATCH 2/3] dt-bindings: crypto: ccree: add dt bindings for ccree 703
On Tue, 13 Nov 2018 09:40:36 +0000, Gilad Ben-Yossef wrote: > Add device tree bindings associating Arm TrustZone CryptoCell 703 with the > ccree driver. > > Signed-off-by: Gilad Ben-Yossef > --- > Documentation/devicetree/bindings/crypto/arm-cryptocell.txt | 1 + > 1 file changed, 1 insertion(+) > Reviewed-by: Rob Herring
Re: [PATCH v2 2/4] crypto: arm64/chacha20 - add XChaCha20 support
On Tue, 4 Dec 2018 at 04:56, Eric Biggers wrote: > > From: Eric Biggers > > Add an XChaCha20 implementation that is hooked up to the ARM64 NEON > implementation of ChaCha20. This can be used by Adiantum. > > A NEON implementation of single-block HChaCha20 is also added so that > XChaCha20 can use it rather than the generic implementation. This > required refactoring the ChaCha20 permutation into its own function. > > Signed-off-by: Eric Biggers Reviewed-by: Ard Biesheuvel > --- > arch/arm64/crypto/Kconfig | 2 +- > arch/arm64/crypto/chacha20-neon-core.S | 65 +++- > arch/arm64/crypto/chacha20-neon-glue.c | 101 +++-- > 3 files changed, 125 insertions(+), 43 deletions(-) > > diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig > index 3f5aeb786192..d54ddb8468ef 100644 > --- a/arch/arm64/crypto/Kconfig > +++ b/arch/arm64/crypto/Kconfig > @@ -101,7 +101,7 @@ config CRYPTO_AES_ARM64_NEON_BLK > select CRYPTO_SIMD > > config CRYPTO_CHACHA20_NEON > - tristate "NEON accelerated ChaCha20 symmetric cipher" > + tristate "ChaCha20 and XChaCha20 stream ciphers using NEON > instructions" > depends on KERNEL_MODE_NEON > select CRYPTO_BLKCIPHER > select CRYPTO_CHACHA20 > diff --git a/arch/arm64/crypto/chacha20-neon-core.S > b/arch/arm64/crypto/chacha20-neon-core.S > index 13c85e272c2a..0571e45a1a0a 100644 > --- a/arch/arm64/crypto/chacha20-neon-core.S > +++ b/arch/arm64/crypto/chacha20-neon-core.S > @@ -23,25 +23,20 @@ > .text > .align 6 > > -ENTRY(chacha20_block_xor_neon) > - // x0: Input state matrix, s > - // x1: 1 data block output, o > - // x2: 1 data block input, i > - > - // > - // This function encrypts one ChaCha20 block by loading the state > matrix > - // in four NEON registers. It performs matrix operation on four words > in > - // parallel, but requires shuffling to rearrange the words after each > - // round. 
> - // > - > - // x0..3 = s0..3 > - adr x3, ROT8 > - ld1 {v0.4s-v3.4s}, [x0] > - ld1 {v8.4s-v11.4s}, [x0] > - ld1 {v12.4s}, [x3] > +/* > + * chacha20_permute - permute one block > + * > + * Permute one 64-byte block where the state matrix is stored in the four > NEON > + * registers v0-v3. It performs matrix operations on four words in parallel, > + * but requires shuffling to rearrange the words after each round. > + * > + * Clobbers: x3, x10, v4, v12 > + */ > +chacha20_permute: > > mov x3, #10 > + adr x10, ROT8 > + ld1 {v12.4s}, [x10] > > .Ldoubleround: > // x0 += x1, x3 = rotl32(x3 ^ x0, 16) > @@ -105,6 +100,23 @@ ENTRY(chacha20_block_xor_neon) > subsx3, x3, #1 > b.ne.Ldoubleround > > + ret > +ENDPROC(chacha20_permute) > + > +ENTRY(chacha20_block_xor_neon) > + // x0: Input state matrix, s > + // x1: 1 data block output, o > + // x2: 1 data block input, i > + > + stp x29, x30, [sp, #-16]! > + mov x29, sp > + > + // x0..3 = s0..3 > + ld1 {v0.4s-v3.4s}, [x0] > + ld1 {v8.4s-v11.4s}, [x0] > + > + bl chacha20_permute > + > ld1 {v4.16b-v7.16b}, [x2] > > // o0 = i0 ^ (x0 + s0) > @@ -125,9 +137,28 @@ ENTRY(chacha20_block_xor_neon) > > st1 {v0.16b-v3.16b}, [x1] > > + ldp x29, x30, [sp], #16 > ret > ENDPROC(chacha20_block_xor_neon) > > +ENTRY(hchacha20_block_neon) > + // x0: Input state matrix, s > + // x1: output (8 32-bit words) > + > + stp x29, x30, [sp, #-16]! 
> + mov x29, sp > + > + ld1 {v0.4s-v3.4s}, [x0] > + > + bl chacha20_permute > + > + st1 {v0.16b}, [x1], #16 > + st1 {v3.16b}, [x1] > + > + ldp x29, x30, [sp], #16 > + ret > +ENDPROC(hchacha20_block_neon) > + > .align 6 > ENTRY(chacha20_4block_xor_neon) > // x0: Input state matrix, s > diff --git a/arch/arm64/crypto/chacha20-neon-glue.c > b/arch/arm64/crypto/chacha20-neon-glue.c > index 96e0cfb8c3f5..a5b9cbc0c4de 100644 > --- a/arch/arm64/crypto/chacha20-neon-glue.c > +++ b/arch/arm64/crypto/chacha20-neon-glue.c > @@ -30,6 +30,7 @@ > > asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); > asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); > +asmlinkage void hchacha20_block_neon(const u32 *state, u32 *out); > > static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, > unsigned int bytes) > @@ -65,20 +66,16 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 > *src, > kernel_neon_end(); > } > > -static int ch
[PATCH v2 0/3] crypto: arm64/chacha - performance improvements
Improve the performance of NEON based ChaCha: Patch #1 adds a block size of 1472 to the tcrypt test template so we have something that reflects the VPN case. Patch #2 improves performance for arbitrary length inputs: on deep pipelines, throughput increases ~30% when running on inputs blocks whose size is drawn randomly from the interval [64, 1024) Patch #3 adopts the OpenSSL approach to use the ALU in parallel with the SIMD unit to process a fifth block while the SIMD is operating on 4 blocks. Performance on Cortex-A57: BEFORE: === testing speed of async chacha20 (chacha20-neon) encryption tcrypt: test 0 (256 bit key, 16 byte blocks): 2528223 operations in 1 seconds (40451568 bytes) tcrypt: test 1 (256 bit key, 64 byte blocks): 2518155 operations in 1 seconds (161161920 bytes) tcrypt: test 2 (256 bit key, 256 byte blocks): 1207948 operations in 1 seconds (309234688 bytes) tcrypt: test 3 (256 bit key, 1024 byte blocks): 332194 operations in 1 seconds (340166656 bytes) tcrypt: test 4 (256 bit key, 1472 byte blocks): 185659 operations in 1 seconds (273290048 bytes) tcrypt: test 5 (256 bit key, 8192 byte blocks): 41829 operations in 1 seconds (342663168 bytes) AFTER: == testing speed of async chacha20 (chacha20-neon) encryption tcrypt: test 0 (256 bit key, 16 byte blocks): 2530018 operations in 1 seconds (40480288 bytes) tcrypt: test 1 (256 bit key, 64 byte blocks): 2518270 operations in 1 seconds (161169280 bytes) tcrypt: test 2 (256 bit key, 256 byte blocks): 1187760 operations in 1 seconds (304066560 bytes) tcrypt: test 3 (256 bit key, 1024 byte blocks): 361652 operations in 1 seconds (370331648 bytes) tcrypt: test 4 (256 bit key, 1472 byte blocks): 280971 operations in 1 seconds (413589312 bytes) tcrypt: test 5 (256 bit key, 8192 byte blocks): 53654 operations in 1 seconds (439533568 bytes) Zinc: = testing speed of async chacha20 (chacha20-software) encryption tcrypt: test 0 (256 bit key, 16 byte blocks): 2510300 operations in 1 seconds (40164800 bytes) tcrypt: 
test 1 (256 bit key, 64 byte blocks): 2663794 operations in 1 seconds (170482816 bytes) tcrypt: test 2 (256 bit key, 256 byte blocks): 1237617 operations in 1 seconds (316829952 bytes) tcrypt: test 3 (256 bit key, 1024 byte blocks): 364645 operations in 1 seconds (373396480 bytes) tcrypt: test 4 (256 bit key, 1472 byte blocks): 251548 operations in 1 seconds (370278656 bytes) tcrypt: test 5 (256 bit key, 8192 byte blocks): 47650 operations in 1 seconds (390348800 bytes) Cc: Eric Biggers Cc: Martin Willi Ard Biesheuvel (3): crypto: tcrypt - add block size of 1472 to skcipher template crypto: arm64/chacha - optimize for arbitrary length inputs crypto: arm64/chacha - use combined SIMD/ALU routine for more speed arch/arm64/crypto/chacha-neon-core.S | 396 +++- arch/arm64/crypto/chacha-neon-glue.c | 59 ++- crypto/tcrypt.c | 2 +- 3 files changed, 404 insertions(+), 53 deletions(-) -- 2.19.2
[PATCH v2 1/3] crypto: tcrypt - add block size of 1472 to skcipher template
In order to have better coverage of algorithms operating on block sizes that are in the ballpark of a VPN packet, add 1472 to the block_sizes array. Signed-off-by: Ard Biesheuvel --- crypto/tcrypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 0590a9204562..e7fb87e114a5 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -81,7 +81,7 @@ static char *check[] = { NULL }; -static u32 block_sizes[] = { 16, 64, 256, 1024, 8192, 0 }; +static u32 block_sizes[] = { 16, 64, 256, 1024, 1472, 8192, 0 }; static u32 aead_sizes[] = { 16, 64, 256, 512, 1024, 2048, 4096, 8192, 0 }; #define XBUFSIZE 8 -- 2.19.2
[PATCH v2 3/3] crypto: arm64/chacha - use combined SIMD/ALU routine for more speed
To some degree, most known AArch64 micro-architectures appear to be able to issue ALU instructions in parellel to SIMD instructions without affecting the SIMD throughput. This means we can use the ALU to process a fifth ChaCha block while the SIMD is processing four blocks in parallel. Signed-off-by: Ard Biesheuvel --- arch/arm64/crypto/chacha-neon-core.S | 235 ++-- arch/arm64/crypto/chacha-neon-glue.c | 39 ++-- 2 files changed, 239 insertions(+), 35 deletions(-) diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S index 32086709e6b3..534e0a3fafa4 100644 --- a/arch/arm64/crypto/chacha-neon-core.S +++ b/arch/arm64/crypto/chacha-neon-core.S @@ -1,13 +1,13 @@ /* * ChaCha/XChaCha NEON helper functions * - * Copyright (C) 2016 Linaro, Ltd. + * Copyright (C) 2016-2018 Linaro, Ltd. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * - * Based on: + * Originally based on: * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions * * Copyright (C) 2015 Martin Willi @@ -160,8 +160,27 @@ ENTRY(hchacha_block_neon) ret x9 ENDPROC(hchacha_block_neon) + a0 .reqw12 + a1 .reqw13 + a2 .reqw14 + a3 .reqw15 + a4 .reqw16 + a5 .reqw17 + a6 .reqw19 + a7 .reqw20 + a8 .reqw21 + a9 .reqw22 + a10 .reqw23 + a11 .reqw24 + a12 .reqw25 + a13 .reqw26 + a14 .reqw27 + a15 .reqw28 + .align 6 ENTRY(chacha_4block_xor_neon) + frame_push 10 + // x0: Input state matrix, s // x1: 4 data blocks output, o // x2: 4 data blocks input, i @@ -181,6 +200,9 @@ ENTRY(chacha_4block_xor_neon) // matrix by interleaving 32- and then 64-bit words, which allows us to // do XOR in NEON registers. // + // At the same time, a fifth block is encrypted in parallel using + // scalar registers + // adr_l x9, CTRINC // ... 
and ROT8 ld1 {v30.4s-v31.4s}, [x9] @@ -191,7 +213,24 @@ ENTRY(chacha_4block_xor_neon) ld4r{ v8.4s-v11.4s}, [x8], #16 ld4r{v12.4s-v15.4s}, [x8] - // x12 += counter values 0-3 + mov a0, v0.s[0] + mov a1, v1.s[0] + mov a2, v2.s[0] + mov a3, v3.s[0] + mov a4, v4.s[0] + mov a5, v5.s[0] + mov a6, v6.s[0] + mov a7, v7.s[0] + mov a8, v8.s[0] + mov a9, v9.s[0] + mov a10, v10.s[0] + mov a11, v11.s[0] + mov a12, v12.s[0] + mov a13, v13.s[0] + mov a14, v14.s[0] + mov a15, v15.s[0] + + // x12 += counter values 1-4 add v12.4s, v12.4s, v30.4s .Ldoubleround4: @@ -200,33 +239,53 @@ ENTRY(chacha_4block_xor_neon) // x2 += x6, x14 = rotl32(x14 ^ x2, 16) // x3 += x7, x15 = rotl32(x15 ^ x3, 16) add v0.4s, v0.4s, v4.4s + add a0, a0, a4 add v1.4s, v1.4s, v5.4s + add a1, a1, a5 add v2.4s, v2.4s, v6.4s + add a2, a2, a6 add v3.4s, v3.4s, v7.4s + add a3, a3, a7 eor v12.16b, v12.16b, v0.16b + eor a12, a12, a0 eor v13.16b, v13.16b, v1.16b + eor a13, a13, a1 eor v14.16b, v14.16b, v2.16b + eor a14, a14, a2 eor v15.16b, v15.16b, v3.16b + eor a15, a15, a3 rev32 v12.8h, v12.8h + ror a12, a12, #16 rev32 v13.8h, v13.8h + ror a13, a13, #16 rev32 v14.8h, v14.8h + ror a14, a14, #16 rev32 v15.8h, v15.8h + ror a15, a15, #16 // x8 += x12, x4 = rotl32(x4 ^ x8, 12) // x9 += x13, x5 = rotl32(x5 ^ x9, 12) // x10 += x14, x6 = rotl32(x6 ^ x10, 12) // x11 += x15, x7 = rotl32(x7 ^ x11, 12) add v8.4s, v8.4s, v12.4s + add a8, a8, a12 add v9.4s, v9.4s, v13.4s + add a9, a9, a13 add v10.4s, v10.4s, v14.4s + add a10, a10, a14 add v11.4s, v11.4s, v15.4s + add a11
[PATCH v2 2/3] crypto: arm64/chacha - optimize for arbitrary length inputs
Update the 4-way NEON ChaCha routine so it can handle input of any length >64 bytes in its entirety, rather than having to call into the 1-way routine and/or memcpy()s via temp buffers to handle the tail of a ChaCha invocation that is not a multiple of 256 bytes. On inputs that are a multiple of 256 bytes (and thus in tcrypt benchmarks), performance drops by around 1% on Cortex-A57, while performance for inputs drawn randomly from the range [64, 1024) increases by around 30%. Signed-off-by: Ard Biesheuvel --- arch/arm64/crypto/chacha-neon-core.S | 183 ++-- arch/arm64/crypto/chacha-neon-glue.c | 38 ++-- 2 files changed, 184 insertions(+), 37 deletions(-) diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S index 75b4e06cee79..32086709e6b3 100644 --- a/arch/arm64/crypto/chacha-neon-core.S +++ b/arch/arm64/crypto/chacha-neon-core.S @@ -19,6 +19,8 @@ */ #include +#include +#include .text .align 6 @@ -36,7 +38,7 @@ */ chacha_permute: - adr x10, ROT8 + adr_l x10, ROT8 ld1 {v12.4s}, [x10] .Ldoubleround: @@ -164,6 +166,12 @@ ENTRY(chacha_4block_xor_neon) // x1: 4 data blocks output, o // x2: 4 data blocks input, i // w3: nrounds + // x4: byte count + + adr_l x10, .Lpermute + and x5, x4, #63 + add x10, x10, x5 + add x11, x10, #64 // // This function encrypts four consecutive ChaCha blocks by loading @@ -173,15 +181,15 @@ ENTRY(chacha_4block_xor_neon) // matrix by interleaving 32- and then 64-bit words, which allows us to // do XOR in NEON registers. // - adr x9, CTRINC // ... and ROT8 + adr_l x9, CTRINC // ... 
and ROT8 ld1 {v30.4s-v31.4s}, [x9] // x0..15[0-3] = s0..3[0..3] - mov x4, x0 - ld4r{ v0.4s- v3.4s}, [x4], #16 - ld4r{ v4.4s- v7.4s}, [x4], #16 - ld4r{ v8.4s-v11.4s}, [x4], #16 - ld4r{v12.4s-v15.4s}, [x4] + add x8, x0, #16 + ld4r{ v0.4s- v3.4s}, [x0] + ld4r{ v4.4s- v7.4s}, [x8], #16 + ld4r{ v8.4s-v11.4s}, [x8], #16 + ld4r{v12.4s-v15.4s}, [x8] // x12 += counter values 0-3 add v12.4s, v12.4s, v30.4s @@ -425,24 +433,47 @@ ENTRY(chacha_4block_xor_neon) zip1v30.4s, v14.4s, v15.4s zip2v31.4s, v14.4s, v15.4s + mov x3, #64 + subsx5, x4, #64 + add x6, x5, x2 + cselx3, x3, xzr, ge + cselx2, x2, x6, ge + // interleave 64-bit words in state n, n+2 zip1v0.2d, v16.2d, v18.2d zip2v4.2d, v16.2d, v18.2d zip1v8.2d, v17.2d, v19.2d zip2v12.2d, v17.2d, v19.2d - ld1 {v16.16b-v19.16b}, [x2], #64 + ld1 {v16.16b-v19.16b}, [x2], x3 + + subsx6, x4, #128 + ccmpx3, xzr, #4, lt + add x7, x6, x2 + cselx3, x3, xzr, eq + cselx2, x2, x7, eq zip1v1.2d, v20.2d, v22.2d zip2v5.2d, v20.2d, v22.2d zip1v9.2d, v21.2d, v23.2d zip2v13.2d, v21.2d, v23.2d - ld1 {v20.16b-v23.16b}, [x2], #64 + ld1 {v20.16b-v23.16b}, [x2], x3 + + subsx7, x4, #192 + ccmpx3, xzr, #4, lt + add x8, x7, x2 + cselx3, x3, xzr, eq + cselx2, x2, x8, eq zip1v2.2d, v24.2d, v26.2d zip2v6.2d, v24.2d, v26.2d zip1v10.2d, v25.2d, v27.2d zip2v14.2d, v25.2d, v27.2d - ld1 {v24.16b-v27.16b}, [x2], #64 + ld1 {v24.16b-v27.16b}, [x2], x3 + + subsx8, x4, #256 + ccmpx3, xzr, #4, lt + add x9, x8, x2 + cselx2, x2, x9, eq zip1v3.2d, v28.2d, v30.2d zip2v7.2d, v28.2d, v30.2d @@ -451,29 +482,155 @@ ENTRY(chacha_4block_xor_neon) ld1 {v28.16b-v31.16b}, [x2] // xor with corresponding input, write to output + tbnzx5, #63, 0f eor v16.16b, v16.16b, v0.16b eor v17.16b, v17.16b, v1.16b eor v18.16b, v18.16b, v2.16b eor v19.16b, v19.16b, v3.16b + st1 {v16.16b-v19.16b}, [x1], #64 + + tbnzx6, #63, 1f eor v20.16b, v20.16b, v4.16b eor v21.16b, v21.16b, v5.16b
[PATCH] crypto: cavium/nitrox - Enabled Mailbox support
Enabled the PF->VF Mailbox support. Mailbox message are interpreted as {type, opcode, data}. Supported message types are REQ, ACK and NACK. Signed-off-by: Srikanth Jampala --- drivers/crypto/cavium/nitrox/Makefile | 3 +- drivers/crypto/cavium/nitrox/nitrox_csr.h | 12 +- drivers/crypto/cavium/nitrox/nitrox_debugfs.h | 22 ++ drivers/crypto/cavium/nitrox/nitrox_dev.h | 61 +- drivers/crypto/cavium/nitrox/nitrox_hal.c | 114 +++--- drivers/crypto/cavium/nitrox/nitrox_hal.h | 2 + drivers/crypto/cavium/nitrox/nitrox_isr.c | 8 +- drivers/crypto/cavium/nitrox/nitrox_main.c| 3 +- drivers/crypto/cavium/nitrox/nitrox_mbx.c | 204 ++ drivers/crypto/cavium/nitrox/nitrox_mbx.h | 9 + drivers/crypto/cavium/nitrox/nitrox_sriov.c | 57 - 11 files changed, 441 insertions(+), 54 deletions(-) create mode 100644 drivers/crypto/cavium/nitrox/nitrox_debugfs.h create mode 100644 drivers/crypto/cavium/nitrox/nitrox_mbx.c create mode 100644 drivers/crypto/cavium/nitrox/nitrox_mbx.h diff --git a/drivers/crypto/cavium/nitrox/Makefile b/drivers/crypto/cavium/nitrox/Makefile index e12954791673..ad0546630ad8 100644 --- a/drivers/crypto/cavium/nitrox/Makefile +++ b/drivers/crypto/cavium/nitrox/Makefile @@ -6,7 +6,8 @@ n5pf-objs := nitrox_main.o \ nitrox_lib.o \ nitrox_hal.o \ nitrox_reqmgr.o \ - nitrox_algs.o + nitrox_algs.o \ + nitrox_mbx.o n5pf-$(CONFIG_PCI_IOV) += nitrox_sriov.o n5pf-$(CONFIG_DEBUG_FS) += nitrox_debugfs.o diff --git a/drivers/crypto/cavium/nitrox/nitrox_csr.h b/drivers/crypto/cavium/nitrox/nitrox_csr.h index 1ad27b1a87c5..a2a452642b38 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_csr.h +++ b/drivers/crypto/cavium/nitrox/nitrox_csr.h @@ -54,7 +54,13 @@ #define NPS_STATS_PKT_DMA_WR_CNT 0x1000190 /* NPS packet registers */ -#define NPS_PKT_INT0x1040018 +#define NPS_PKT_INT0x1040018 +#define NPS_PKT_MBOX_INT_LO0x1040020 +#define NPS_PKT_MBOX_INT_LO_ENA_W1C0x1040030 +#define NPS_PKT_MBOX_INT_LO_ENA_W1S0x1040038 +#define NPS_PKT_MBOX_INT_HI0x1040040 +#define 
NPS_PKT_MBOX_INT_HI_ENA_W1C0x1040050 +#define NPS_PKT_MBOX_INT_HI_ENA_W1S0x1040058 #define NPS_PKT_IN_RERR_HI 0x1040108 #define NPS_PKT_IN_RERR_HI_ENA_W1S 0x1040120 #define NPS_PKT_IN_RERR_LO 0x1040128 @@ -74,6 +80,10 @@ #define NPS_PKT_SLC_RERR_LO_ENA_W1S0x1040240 #define NPS_PKT_SLC_ERR_TYPE 0x1040248 #define NPS_PKT_SLC_ERR_TYPE_ENA_W1S 0x1040260 +/* Mailbox PF->VF PF Accessible Data registers */ +#define NPS_PKT_MBOX_PF_VF_PFDATAX(_i) (0x1040800 + ((_i) * 0x8)) +#define NPS_PKT_MBOX_VF_PF_PFDATAX(_i) (0x1040C00 + ((_i) * 0x8)) + #define NPS_PKT_SLC_CTLX(_i) (0x1 + ((_i) * 0x4)) #define NPS_PKT_SLC_CNTSX(_i) (0x10008 + ((_i) * 0x4)) #define NPS_PKT_SLC_INT_LEVELSX(_i)(0x10010 + ((_i) * 0x4)) diff --git a/drivers/crypto/cavium/nitrox/nitrox_debugfs.h b/drivers/crypto/cavium/nitrox/nitrox_debugfs.h new file mode 100644 index ..7b701ea6227a --- /dev/null +++ b/drivers/crypto/cavium/nitrox/nitrox_debugfs.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __NITROX_DEBUGFS_H +#define __NITROX_DEBUGFS_H + +#include "nitrox_dev.h" + +#ifdef CONFIG_DEBUG_FS +int nitrox_debugfs_init(struct nitrox_device *ndev); +void nitrox_debugfs_exit(struct nitrox_device *ndev); +#else +static inline int nitrox_debugfs_init(struct nitrox_device *ndev) +{ + return 0; +} + +static inline int nitrox_sriov_debugfs_init(struct nitrox_device *ndev) +{ + return 0; +} +#endif /* !CONFIG_DEBUG_FS */ + +#endif /* __NITROX_DEBUGFS_H */ diff --git a/drivers/crypto/cavium/nitrox/nitrox_dev.h b/drivers/crypto/cavium/nitrox/nitrox_dev.h index 247df32f687c..0338877b828f 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_dev.h +++ b/drivers/crypto/cavium/nitrox/nitrox_dev.h @@ -8,6 +8,8 @@ #include #define VERSION_LEN 32 +/* Maximum queues in PF mode */ +#define MAX_PF_QUEUES 64 /** * struct nitrox_cmdq - NITROX command queue @@ -103,13 +105,58 @@ struct nitrox_q_vector { }; }; +/** + * mbox_msg - Mailbox message data + * @type: message type + * @opcode: message opcode + * @data: message 
data + */ +union mbox_msg { + u64 value; + struct { + u64 type: 2; + u64 opcode: 6; + u64 data: 58; + }; + struct { + u64 type: 2; + u64 opcode: 6; + u64 chipid: 8; + u64 vfid: 8; + } id; +}; + +/** + * nitrox_vfdev - NITROX VF device instance in PF + * @state: VF device state + * @vfno: VF number + * @nr_queues: number of queues enabled in VF + * @ring: ring to communicate with VF + * @msg: Mailbox message data from VF + * @mbx_resp: Mailbox counters + */ +struct nitrox_vfdev { + atomic_t state; +
Re: [PATCH 2/3] dt-bindings: crypto: ccree: add dt bindings for ccree 703
On Thu, Nov 29, 2018 at 8:42 AM Herbert Xu wrote: > > On Tue, Nov 13, 2018 at 09:40:36AM +0000, Gilad Ben-Yossef wrote: > > Add device tree bindings associating Arm TrustZone CryptoCell 703 with the > > ccree driver. > > > > Signed-off-by: Gilad Ben-Yossef > > --- > > Documentation/devicetree/bindings/crypto/arm-cryptocell.txt | 1 + > > 1 file changed, 1 insertion(+) > > Which tree is this patch meant to go through? I'm not sure the question was addressed to me but if it was - either going through the device tree or the crypto tree is fine by me. Thanks, Gilad