Re: [PATCH v2 0/3] crypto: X25519 support for ppc64le
Thanks Herbert.

On 5/31/24 5:20 AM, Herbert Xu wrote:
On Thu, May 16, 2024 at 11:19:54AM -0400, Danny Tsen wrote:
> This patch series provides X25519 support for ppc64le with a new module
> curve25519-ppc64le. The implementation is based on the CRYPTOGAMS perl
> output from x25519-ppc64.pl (see https://github.com/dot-asm/cryptogams/),
> modified, and with 4 supporting functions added. This series has passed
> the selftest, verified by running modprobe curve25519-ppc64le.
>
> Danny Tsen (3):
>   X25519 low-level primitives for ppc64le.
>   X25519 core functions for ppc64le
>   Update Kconfig and Makefile for ppc64le x25519.
>
>  arch/powerpc/crypto/Kconfig                   |  11 +
>  arch/powerpc/crypto/Makefile                  |   2 +
>  arch/powerpc/crypto/curve25519-ppc64le-core.c | 299
>  arch/powerpc/crypto/curve25519-ppc64le_asm.S  | 671 ++
>  4 files changed, 983 insertions(+)
>  create mode 100644 arch/powerpc/crypto/curve25519-ppc64le-core.c
>  create mode 100644 arch/powerpc/crypto/curve25519-ppc64le_asm.S
>
> --
> 2.31.1

All applied. Thanks.
[PATCH v2 3/3] crypto: Update Kconfig and Makefile for ppc64le x25519.
Defined CRYPTO_CURVE25519_PPC64 to support X25519 for ppc64le. Added new module curve25519-ppc64le for X25519.

Signed-off-by: Danny Tsen
---
 arch/powerpc/crypto/Kconfig  | 11 +++
 arch/powerpc/crypto/Makefile |  2 ++
 2 files changed, 13 insertions(+)

diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 1e201b7ae2fc..09ebcbdfb34f 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -2,6 +2,17 @@
 menu "Accelerated Cryptographic Algorithms for CPU (powerpc)"

+config CRYPTO_CURVE25519_PPC64
+	tristate "Public key crypto: Curve25519 (PowerPC64)"
+	depends on PPC64 && CPU_LITTLE_ENDIAN
+	select CRYPTO_LIB_CURVE25519_GENERIC
+	select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+	help
+	  Curve25519 algorithm
+
+	  Architecture: PowerPC64
+	  - Little-endian
+
 config CRYPTO_CRC32C_VPMSUM
 	tristate "CRC32c"
 	depends on PPC64 && ALTIVEC

diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index fca0e9739869..59808592f0a1 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -17,6 +17,7 @@
 obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
 obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
 obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
 obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o
+obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o

 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
 md5-ppc-y := md5-asm.o md5-glue.o
@@ -29,6 +30,7 @@
 aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o
 chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
 poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
 vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o
+curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o

 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
 override flavour := linux-ppc64le

--
2.31.1
[PATCH v2 2/3] crypto: X25519 core functions for ppc64le
X25519 core functions to handle scalar multiplication for ppc64le.

Signed-off-by: Danny Tsen
---
 arch/powerpc/crypto/curve25519-ppc64le-core.c | 299 ++
 1 file changed, 299 insertions(+)
 create mode 100644 arch/powerpc/crypto/curve25519-ppc64le-core.c

diff --git a/arch/powerpc/crypto/curve25519-ppc64le-core.c b/arch/powerpc/crypto/curve25519-ppc64le-core.c
new file mode 100644
index ..4e3e44ea4484
--- /dev/null
+++ b/arch/powerpc/crypto/curve25519-ppc64le-core.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2024- IBM Corp.
+ *
+ * X25519 scalar multiplication with 51 bits limbs for PPC64le.
+ *   Based on RFC7748 and AArch64 optimized implementation for X25519
+ *     - Algorithm 1 Scalar multiplication of a variable point
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+typedef uint64_t fe51[5];
+
+asmlinkage void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
+asmlinkage void x25519_fe51_sqr(fe51 h, const fe51 f);
+asmlinkage void x25519_fe51_mul121666(fe51 h, fe51 f);
+asmlinkage void x25519_fe51_sqr_times(fe51 h, const fe51 f, int n);
+asmlinkage void x25519_fe51_frombytes(fe51 h, const uint8_t *s);
+asmlinkage void x25519_fe51_tobytes(uint8_t *s, const fe51 h);
+asmlinkage void x25519_cswap(fe51 p, fe51 q, unsigned int bit);
+
+#define fmul x25519_fe51_mul
+#define fsqr x25519_fe51_sqr
+#define fmul121666 x25519_fe51_mul121666
+#define fe51_tobytes x25519_fe51_tobytes
+
+static void fadd(fe51 h, const fe51 f, const fe51 g)
+{
+	h[0] = f[0] + g[0];
+	h[1] = f[1] + g[1];
+	h[2] = f[2] + g[2];
+	h[3] = f[3] + g[3];
+	h[4] = f[4] + g[4];
+}
+
+/*
+ * Prime = 2 ** 255 - 19, 255 bits
+ *    (0x7fffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffed)
+ *
+ * Prime in 5 51-bit limbs
+ */
+static fe51 prime51 = { 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff};
+
+static void fsub(fe51 h, const fe51 f, const fe51 g)
+{
+	h[0] = (f[0] + ((prime51[0] * 2))) - g[0];
+	h[1] = (f[1] + ((prime51[1] * 2))) - g[1];
+	h[2] = (f[2] + ((prime51[2] * 2))) - g[2];
+	h[3] = (f[3] + ((prime51[3] * 2))) - g[3];
+	h[4] = (f[4] + ((prime51[4] * 2))) - g[4];
+}
+
+static void fe51_frombytes(fe51 h, const uint8_t *s)
+{
+	/*
+	 * Make sure 64-bit aligned.
+	 */
+	unsigned char sbuf[32+8];
+	unsigned char *sb = PTR_ALIGN((void *)sbuf, 8);
+
+	memcpy(sb, s, 32);
+	x25519_fe51_frombytes(h, sb);
+}
+
+static void finv(fe51 o, const fe51 i)
+{
+	fe51 a0, b, c, t00;
+
+	fsqr(a0, i);
+	x25519_fe51_sqr_times(t00, a0, 2);
+
+	fmul(b, t00, i);
+	fmul(a0, b, a0);
+
+	fsqr(t00, a0);
+
+	fmul(b, t00, b);
+	x25519_fe51_sqr_times(t00, b, 5);
+
+	fmul(b, t00, b);
+	x25519_fe51_sqr_times(t00, b, 10);
+
+	fmul(c, t00, b);
+	x25519_fe51_sqr_times(t00, c, 20);
+
+	fmul(t00, t00, c);
+	x25519_fe51_sqr_times(t00, t00, 10);
+
+	fmul(b, t00, b);
+	x25519_fe51_sqr_times(t00, b, 50);
+
+	fmul(c, t00, b);
+	x25519_fe51_sqr_times(t00, c, 100);
+
+	fmul(t00, t00, c);
+	x25519_fe51_sqr_times(t00, t00, 50);
+
+	fmul(t00, t00, b);
+	x25519_fe51_sqr_times(t00, t00, 5);
+
+	fmul(o, t00, a0);
+}
+
+static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32],
+			    const uint8_t point[32])
+{
+	fe51 x1, x2, z2, x3, z3;
+	uint8_t s[32];
+	unsigned int swap = 0;
+	int i;
+
+	memcpy(s, scalar, 32);
+	s[0]  &= 0xf8;
+	s[31] &= 0x7f;
+	s[31] |= 0x40;
+	fe51_frombytes(x1, point);
+
+	z2[0] = z2[1] = z2[2] = z2[3] = z2[4] = 0;
+	x3[0] = x1[0];
+	x3[1] = x1[1];
+	x3[2] = x1[2];
+	x3[3] = x1[3];
+	x3[4] = x1[4];
+
+	x2[0] = z3[0] = 1;
+	x2[1] = z3[1] = 0;
+	x2[2] = z3[2] = 0;
+	x2[3] = z3[3] = 0;
+	x2[4] = z3[4] = 0;
+
+	for (i = 254; i >= 0; --i) {
+		unsigned int k_t = 1 & (s[i / 8] >> (i & 7));
+		fe51 a, b, c, d, e;
+		fe51 da, cb, aa, bb;
+		fe51 dacb_p, dacb_m;
+
+		swap ^= k_t;
+		x25519_cswap(x2, x3, swap);
+		x25519_cswap(z2, z3, swap);
+		swap = k_t;
+
+		fsub(b, x2, z2);	// B = x_2 - z_2
+		fadd(a, x2, z2);	// A = x_2 + z_2
+		fsub(d, x3, z3);	// D = x_3 - z_3
+		fadd(c, x3, z3);	// C = x_3 + z_3
+
+		fsqr(bb, b);		// BB = B^2
+		fsqr(aa, a);		// AA = A^2
+		fmul(da, d, a);		// DA = D * A
+		fmul(cb, c, b);		// CB = C * B
+
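(A note for readers of the truncated ladder body above: each loop iteration is the standard RFC 7748 Montgomery ladder step. In the notation of the code's own comments, a full iteration computes

    A  = x_2 + z_2        AA = A^2
    B  = x_2 - z_2        BB = B^2
    E  = AA - BB
    C  = x_3 + z_3        D  = x_3 - z_3
    DA = D * A            CB = C * B
    x_3 = (DA + CB)^2
    z_3 = x_1 * (DA - CB)^2
    x_2 = AA * BB
    z_2 = E * (AA + a24 * E),   with a24 = 121666 (hence fmul121666)

and the final projective-to-affine conversion uses finv, which raises z to p - 2 = 2^255 - 21 per Fermat's little theorem, via the square-and-multiply chain of fsqr/x25519_fe51_sqr_times/fmul calls shown above.)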
[PATCH v2 0/3] crypto: X25519 support for ppc64le
This patch series provides X25519 support for ppc64le with a new module curve25519-ppc64le. The implementation is based on the CRYPTOGAMS perl output from x25519-ppc64.pl (see https://github.com/dot-asm/cryptogams/), modified, and with 4 supporting functions added. This series has passed the selftest, verified by running modprobe curve25519-ppc64le.

Danny Tsen (3):
  X25519 low-level primitives for ppc64le.
  X25519 core functions for ppc64le
  Update Kconfig and Makefile for ppc64le x25519.

 arch/powerpc/crypto/Kconfig                   |  11 +
 arch/powerpc/crypto/Makefile                  |   2 +
 arch/powerpc/crypto/curve25519-ppc64le-core.c | 299
 arch/powerpc/crypto/curve25519-ppc64le_asm.S  | 671 ++
 4 files changed, 983 insertions(+)
 create mode 100644 arch/powerpc/crypto/curve25519-ppc64le-core.c
 create mode 100644 arch/powerpc/crypto/curve25519-ppc64le_asm.S

--
2.31.1
[PATCH v2 1/3] crypto: X25519 low-level primitives for ppc64le.
Used the perl output of x25519-ppc64.pl from CRYPTOGAMS (see https://github.com/dot-asm/cryptogams/) and added four supporting functions: x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes and x25519_cswap.

Signed-off-by: Danny Tsen
---
 arch/powerpc/crypto/curve25519-ppc64le_asm.S | 671 +++
 1 file changed, 671 insertions(+)
 create mode 100644 arch/powerpc/crypto/curve25519-ppc64le_asm.S

diff --git a/arch/powerpc/crypto/curve25519-ppc64le_asm.S b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
new file mode 100644
index ..06c1febe24b9
--- /dev/null
+++ b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
@@ -0,0 +1,671 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# This code is taken from CRYPTOGAMs[1] and is included here using the option
+# in the license to distribute the code under the GPL. Therefore this program
+# is free software; you can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2 as published by the Free Software
+# Foundation.
+#
+# [1] https://github.com/dot-asm/cryptogams/
+
+# Copyright (c) 2006-2017, CRYPTOGAMS by
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#	* Redistributions of source code must retain copyright notices,
+#	  this list of conditions and the following disclaimer.
+#
+#	* Redistributions in binary form must reproduce the above
+#	  copyright notice, this list of conditions and the following
+#	  disclaimer in the documentation and/or other materials
+#	  provided with the distribution.
+#
+#	* Neither the name of the CRYPTOGAMS nor the names of its
+#	  copyright holder and contributors may be used to endorse or
+#	  promote products derived from this software without specific
+#	  prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#
+# Written by Andy Polyakov for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+#
+
+#
+#
+# Written and Modified by Danny Tsen
+# - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes
+#   and x25519_cswap
+#
+# Copyright 2024- IBM Corp.
+#
+# X25519 lower-level primitives for PPC64.
+#
+
+#include
+
+.text
+
+.align 5
+SYM_FUNC_START(x25519_fe51_mul)
+
+	stdu	1,-144(1)
+	std	21,56(1)
+	std	22,64(1)
+	std	23,72(1)
+	std	24,80(1)
+	std	25,88(1)
+	std	26,96(1)
+	std	27,104(1)
+	std	28,112(1)
+	std	29,120(1)
+	std	30,128(1)
+	std	31,136(1)
+
+	ld	6,0(5)
+	ld	7,0(4)
+	ld	8,8(4)
+	ld	9,16(4)
+	ld	10,24(4)
+	ld	11,32(4)
+
+	mulld	22,7,6
+	mulhdu	23,7,6
+
+	mulld	24,8,6
+	mulhdu	25,8,6
+
+	mulld	30,11,6
+	mulhdu	31,11,6
+	ld	4,8(5)
+	mulli	11,11,19
+
+	mulld	26,9,6
+	mulhdu	27,9,6
+
+	mulld	28,10,6
+	mulhdu	29,10,6
+	mulld	12,11,4
+	mulhdu	21,11,4
+	addc	22,22,12
+	adde	23,23,21
+
+	mulld	12,7,4
+	mulhdu	21,7,4
+	addc	24,24,12
+	adde	25,25,21
+
+	mulld	12,10,4
+	mulhdu	21,10,4
+	ld	6,16(5)
+	mulli	10,10,19
+	addc	30,30,12
+	adde	31,31,21
+
+	mulld	12,8,4
+	mulhdu	21,8,4
+	addc	26,26,12
+	adde	27,27,21
+
+	mulld	12,9,4
+	mulhdu	21,9,4
+
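As an aside on the multiply above: the mulli reg,reg,19 instructions implement the pseudo-Mersenne reduction for p = 2^255 - 19 (since 2^255 is congruent to 19 mod p, partial products that overflow the fifth 51-bit limb are folded back in multiplied by 19). Below is a minimal, illustrative C model of what x25519_fe51_mul computes - not the kernel code; the name fe51_mul_model is invented here, and it assumes compiler support for unsigned __int128 and loosely reduced (below 2^52) input limbs:

    #include <stdint.h>

    typedef uint64_t fe51[5];
    typedef unsigned __int128 u128;

    /* Illustrative model of h = f * g mod (2^255 - 19), 51-bit limbs.
     * Terms whose limb indices sum past the top limb wrap around as *19,
     * because 2^255 = 19 (mod p).  Output is loosely reduced. */
    static void fe51_mul_model(fe51 h, const fe51 f, const fe51 g)
    {
            const uint64_t mask = (1ULL << 51) - 1;
            uint64_t f1_19 = f[1] * 19, f2_19 = f[2] * 19;
            uint64_t f3_19 = f[3] * 19, f4_19 = f[4] * 19;
            u128 t[5];
            uint64_t c;
            int i;

            t[0] = (u128)f[0]*g[0] + (u128)f4_19*g[1] + (u128)f3_19*g[2] +
                   (u128)f2_19*g[3] + (u128)f1_19*g[4];
            t[1] = (u128)f[0]*g[1] + (u128)f[1]*g[0] + (u128)f4_19*g[2] +
                   (u128)f3_19*g[3] + (u128)f2_19*g[4];
            t[2] = (u128)f[0]*g[2] + (u128)f[1]*g[1] + (u128)f[2]*g[0] +
                   (u128)f4_19*g[3] + (u128)f3_19*g[4];
            t[3] = (u128)f[0]*g[3] + (u128)f[1]*g[2] + (u128)f[2]*g[1] +
                   (u128)f[3]*g[0] + (u128)f4_19*g[4];
            t[4] = (u128)f[0]*g[4] + (u128)f[1]*g[3] + (u128)f[2]*g[2] +
                   (u128)f[3]*g[1] + (u128)f[4]*g[0];

            for (i = 0; i < 4; i++) {           /* ripple carries upward */
                    c = (uint64_t)(t[i] >> 51);
                    t[i] &= mask;
                    t[i + 1] += c;
            }
            c = (uint64_t)(t[4] >> 51);         /* top carry wraps as *19 */
            t[4] &= mask;
            t[0] += (u128)c * 19;
            c = (uint64_t)(t[0] >> 51);
            t[0] &= mask;
            t[1] += c;

            for (i = 0; i < 5; i++)
                    h[i] = (uint64_t)t[i];
    }

The assembly performs the same schoolbook multiply with mulld/mulhdu 128-bit partial products and addc/adde carry chains instead of __int128 arithmetic.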
Re: [PATCH 1/3] crypto: X25519 low-level primitives for ppc64le.
Hi Andy,

I learned something here. Will fix this. Thanks.

-Danny

On 5/16/24 3:38 AM, Andy Polyakov wrote:
Hi,

>> +.abiversion 2
>
> I'd prefer that was left to the compiler flags.

The problem is that it's the compiler that is responsible for providing this directive in the intermediate .s prior to invoking the assembler, and there is no assembler flag to pass through with -Wa. If the concern is ABI neutrality, then the solution would rather be to guard the directive with:

    #if (_CALL_ELF-0) == 2
    ...
    #endif

One can also make a case for:

    #ifdef _CALL_ELF
    .abiversion _CALL_ELF
    #endif

Cheers.
Re: [PATCH 1/3] crypto: X25519 low-level primitives for ppc64le.
On 5/15/24 11:53 PM, Michael Ellerman wrote:
Hi Danny,

Danny Tsen writes:
> Use the perl output of x25519-ppc64.pl from CRYPTOGAMs and added three
> supporting functions, x25519_fe51_sqr_times, x25519_fe51_frombytes and
> x25519_fe51_tobytes.

For other algorithms we have checked-in the perl script and generated the code at runtime. Is there a reason you've done it differently this time?

Hi Michael,

It's easier for me to read and use plain assembly that is not mixed with perl, and it's easier for me to debug and test. Also, I copied some code and made some modifications.

> Signed-off-by: Danny Tsen
> ---
>  arch/powerpc/crypto/curve25519-ppc64le_asm.S | 648 +++
>  1 file changed, 648 insertions(+)
>  create mode 100644 arch/powerpc/crypto/curve25519-ppc64le_asm.S
>
> diff --git a/arch/powerpc/crypto/curve25519-ppc64le_asm.S b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
> new file mode 100644
> index ..8a018104838a
> --- /dev/null
> +++ b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
> @@ -0,0 +1,648 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +#
> +# Copyright 2024- IBM Corp. All Rights Reserved.

I'm not a lawyer, but AFAIK "All Rights Reserved" is not required and can be confusing - because we are not reserving all rights, we are granting some rights under the GPL. I also think the IBM copyright should be down below where your modifications are described.

Will change that.

> +# This code is taken from CRYPTOGAMs[1] and is included here using the option
> +# in the license to distribute the code under the GPL. Therefore this program
> +# is free software; you can redistribute it and/or modify it under the terms of
> +# the GNU General Public License version 2 as published by the Free Software
> +# Foundation.
> +#
> +# [1] https://www.openssl.org/~appro/cryptogams/
>
> +# Copyright (c) 2006-2017, CRYPTOGAMS by
> +# All rights reserved.
> +#
> +# Redistribution and use in source and binary forms, with or without
> +# modification, are permitted provided that the following conditions
> +# are met:
> +#
> +#	* Redistributions of source code must retain copyright notices,
> +#	  this list of conditions and the following disclaimer.
> +#
> +#	* Redistributions in binary form must reproduce the above
> +#	  copyright notice, this list of conditions and the following
> +#	  disclaimer in the documentation and/or other materials
> +#	  provided with the distribution.
> +#
> +#	* Neither the name of the CRYPTOGAMS nor the names of its
> +#	  copyright holder and contributors may be used to endorse or
> +#	  promote products derived from this software without specific
> +#	  prior written permission.
> +#
> +# ALTERNATIVELY, provided that this notice is retained in full, this
> +# product may be distributed under the terms of the GNU General Public
> +# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
> +# those given above.
> +#
> +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
> +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>
> +#
> +# Written by Andy Polyakov for the OpenSSL
> +# project. The module is, however, dual licensed under OpenSSL and
> +# CRYPTOGAMS licenses depending on where you obtain it. For further
> +# details see https://www.openssl.org/~appro/cryptogams/.
> +#
>
> +#
> +#
> +# Written and Modified by Danny Tsen
> +# - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes

ie. here.

> +# X25519 lower-level primitives for PPC64.
> +#
>
> +#include
>
> +.machine "any"

Please don't add new .machine directives unless they are required.

> +.abiversion 2

I'd prefer that was left to the compiler flags.

Ok. Thanks.

-Danny

cheers
Re: [PATCH 2/3] crypto: X25519 core functions for ppc64le
Hi Andy,

Thanks for the info. I should be able to do it. I was hoping an assembly guru like you could show me some tricks here, if there are any. :) Thanks.

-Danny

On 5/15/24 8:33 AM, Andy Polyakov wrote:
>> +static void cswap(fe51 p, fe51 q, unsigned int bit)
>> +{
>> +	u64 t, i;
>> +	u64 c = 0 - (u64) bit;
>> +
>> +	for (i = 0; i < 5; ++i) {
>> +		t = c & (p[i] ^ q[i]);
>> +		p[i] ^= t;
>> +		q[i] ^= t;
>> +	}
>> +}
>
> The "c" in cswap stands for "constant-time," and the problem is that
> contemporary compilers have exhibited the ability to produce
> non-constant-time machine code as a result of compiling the above kind
> of technique. The outcome is platform-specific, and ironically some of
> the PPC code generators were observed to generate the "most"
> non-constant-time code - "most" in the sense that the execution-time
> variations would be easiest to catch.

Just to substantiate the point, consider https://godbolt.org/z/faYnEcPT7, and note the conditional branch in the middle of the loop, which flies in the face of constant-time-ness. In case you object to the 'bit &= 1' on line 7 in the C code: indeed, if you comment it out, the generated code will be fine. But the point is that the compiler is capable of, and was in fact observed to, figure out that the caller passes either one or zero and generate the machine code shown in the assembly window. In other words, 'bit &= 1' is just a reflection of what the caller does.

> ... the permanent solution is to do it in assembly. I can put together
> something...

Though you should be able to do this just as well :-) So should I, or would you?

Cheers.
Re: [PATCH 2/3] crypto: X25519 core functions for ppc64le
Hi Andy,

Points taken. And much appreciated for the help. Thanks.

-Danny

On 5/15/24 3:29 AM, Andy Polyakov wrote:
Hi,

> +static void cswap(fe51 p, fe51 q, unsigned int bit)
> +{
> +	u64 t, i;
> +	u64 c = 0 - (u64) bit;
> +
> +	for (i = 0; i < 5; ++i) {
> +		t = c & (p[i] ^ q[i]);
> +		p[i] ^= t;
> +		q[i] ^= t;
> +	}
> +}

The "c" in cswap stands for "constant-time," and the problem is that contemporary compilers have exhibited the ability to produce non-constant-time machine code as a result of compiling the above kind of technique. The outcome is platform-specific, and ironically some of the PPC code generators were observed to generate the "most" non-constant-time code - "most" in the sense that the execution-time variations would be easiest to catch.

One way to work around the problem, at least for the time being, is to add

    asm volatile("" : "+r"(c));

after you calculate 'c'. But there is no guarantee that the next compiler version won't see through it, hence the permanent solution is to do it in assembly. I can put together something...

Cheers.
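For reference, a minimal sketch of the interim workaround Andy describes - the asm volatile barrier is the only change relative to the cswap shown above. This is illustrative only, not what was ultimately merged (v2 moved the swap into assembly as x25519_cswap):

    /* cswap with an optimization barrier on the mask: the compiler can no
     * longer prove that 'c' is only ever 0 or ~0, so it cannot turn the
     * masked swap back into a data-dependent branch. */
    static void cswap(fe51 p, fe51 q, unsigned int bit)
    {
            u64 t, i;
            u64 c = 0 - (u64) bit;

            asm volatile("" : "+r"(c));     /* hide c's provenance from the optimizer */

            for (i = 0; i < 5; ++i) {
                    t = c & (p[i] ^ q[i]);
                    p[i] ^= t;
                    q[i] ^= t;
            }
    }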
Re: [PATCH 1/3] crypto: X25519 low-level primitives for ppc64le.
See inline.

On 5/15/24 4:06 AM, Andy Polyakov wrote:
Hi,

> +SYM_FUNC_START(x25519_fe51_sqr_times)
> ...
> +.Lsqr_times_loop:
> ...
> +
> +	std	9,16(3)
> +	std	10,24(3)
> +	std	11,32(3)
> +	std	7,0(3)
> +	std	8,8(3)
> +	bdnz	.Lsqr_times_loop

I see no reason why the stores can't be moved outside the loop in question.

Yeah. I'll fix it.

> +SYM_FUNC_START(x25519_fe51_frombytes)
> +.align 5
> +
> +	li	12, -1
> +	srdi	12, 12, 13	# 0x7ffffffffffff
>
> +	ld	5, 0(4)
> +	ld	6, 8(4)
> +	ld	7, 16(4)
> +	ld	8, 24(4)

Is there an actual guarantee that the byte input is 64-bit aligned? While it is true that the processor is obliged to handle misaligned loads and stores by the ISA specification, them being inefficient doesn't go against it. Most notably, the inefficiency is likely to show at page boundaries. What I'm trying to say is that it would be more appropriate to avoid the unaligned loads (and stores).

Good point. Maybe I can handle it by making the input 64-bit aligned. Thanks.

Cheers.
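For completeness, the bounce-buffer approach that addresses this - and which the v2 fe51_frombytes wrapper adopts via PTR_ALIGN - looks like:

    /* Copy the caller's 32 bytes into a buffer known to be 8-byte aligned,
     * so the assembly can use aligned ld instructions.  Minimal sketch,
     * mirroring the v2 fe51_frombytes wrapper. */
    static void fe51_frombytes(fe51 h, const uint8_t *s)
    {
            unsigned char sbuf[32 + 8];
            unsigned char *sb = PTR_ALIGN((void *)sbuf, 8);

            memcpy(sb, s, 32);
            x25519_fe51_frombytes(h, sb);   /* asm may now assume alignment */
    }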
Re: [PATCH 1/3] crypto: X25519 low-level primitives for ppc64le.
Thank you Andy. Will fix this.

On 5/15/24 3:11 AM, Andy Polyakov wrote:
Hi,

A couple of remarks inline.

> +# [1] https://www.openssl.org/~appro/cryptogams/

https://github.com/dot-asm/cryptogams/ is arguably a better reference.

> +SYM_FUNC_START(x25519_fe51_mul)
> +.align 5

The goal is to align the label, not the first instruction after the directive. It's not a problem in this spot - the beginning of the module, that is - but further below it's likely to inject redundant nops between the label and the meaningful code. Since the directive in question is not position-sensitive, one can resolve this by changing the order of the directive and the SYM_FUNC_START macro.

Cheers.
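Concretely, that means emitting the alignment directive before the macro that defines the label, which is what the v2 series does:

    .align 5
    SYM_FUNC_START(x25519_fe51_mul)
        ...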
[PATCH 3/3] crypto: Update Kconfig and Makefile for ppc64le x25519.
Defined CRYPTO_CURVE25519_PPC64 to support X25519 for ppc64le. Added new module curve25519-ppc64le for X25519.

Signed-off-by: Danny Tsen
---
 arch/powerpc/crypto/Kconfig  | 11 +++
 arch/powerpc/crypto/Makefile |  2 ++
 2 files changed, 13 insertions(+)

diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 1e201b7ae2fc..09ebcbdfb34f 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -2,6 +2,17 @@
 menu "Accelerated Cryptographic Algorithms for CPU (powerpc)"

+config CRYPTO_CURVE25519_PPC64
+	tristate "Public key crypto: Curve25519 (PowerPC64)"
+	depends on PPC64 && CPU_LITTLE_ENDIAN
+	select CRYPTO_LIB_CURVE25519_GENERIC
+	select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+	help
+	  Curve25519 algorithm
+
+	  Architecture: PowerPC64
+	  - Little-endian
+
 config CRYPTO_CRC32C_VPMSUM
 	tristate "CRC32c"
 	depends on PPC64 && ALTIVEC

diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index fca0e9739869..59808592f0a1 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -17,6 +17,7 @@
 obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
 obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
 obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
 obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o
+obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o

 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
 md5-ppc-y := md5-asm.o md5-glue.o
@@ -29,6 +30,7 @@
 aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o
 chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
 poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
 vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o
+curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o

 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
 override flavour := linux-ppc64le

--
2.31.1
[PATCH 2/3] crypto: X25519 core functions for ppc64le
X25519 core functions to handle scalar multiplication for ppc64le.

Signed-off-by: Danny Tsen
---
 arch/powerpc/crypto/curve25519-ppc64le-core.c | 299 ++
 1 file changed, 299 insertions(+)
 create mode 100644 arch/powerpc/crypto/curve25519-ppc64le-core.c

diff --git a/arch/powerpc/crypto/curve25519-ppc64le-core.c b/arch/powerpc/crypto/curve25519-ppc64le-core.c
new file mode 100644
index ..6a8b5efc40ce
--- /dev/null
+++ b/arch/powerpc/crypto/curve25519-ppc64le-core.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2024- IBM Corp. All rights reserved.
+ *
+ * X25519 scalar multiplication with 51 bits limbs for PPC64le.
+ *   Based on RFC7748 and AArch64 optimized implementation for X25519
+ *     - Algorithm 1 Scalar multiplication of a variable point
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+typedef uint64_t fe51[5];
+
+asmlinkage void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
+asmlinkage void x25519_fe51_sqr(fe51 h, const fe51 f);
+asmlinkage void x25519_fe51_mul121666(fe51 h, fe51 f);
+asmlinkage void x25519_fe51_sqr_times(fe51 h, const fe51 f, int n);
+asmlinkage void x25519_fe51_frombytes(fe51 h, const uint8_t *s);
+asmlinkage void x25519_fe51_tobytes(uint8_t *s, const fe51 h);
+
+#define fmul x25519_fe51_mul
+#define fsqr x25519_fe51_sqr
+#define fmul121666 x25519_fe51_mul121666
+#define fe51_tobytes x25519_fe51_tobytes
+#define fe51_frombytes x25519_fe51_frombytes
+
+static void cswap(fe51 p, fe51 q, unsigned int bit)
+{
+	u64 t, i;
+	u64 c = 0 - (u64) bit;
+
+	for (i = 0; i < 5; ++i) {
+		t = c & (p[i] ^ q[i]);
+		p[i] ^= t;
+		q[i] ^= t;
+	}
+}
+
+static void fadd(fe51 h, const fe51 f, const fe51 g)
+{
+	h[0] = f[0] + g[0];
+	h[1] = f[1] + g[1];
+	h[2] = f[2] + g[2];
+	h[3] = f[3] + g[3];
+	h[4] = f[4] + g[4];
+}
+
+/*
+ * Prime = 2 ** 255 - 19, 255 bits
+ *    (0x7fffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffed)
+ *
+ * Prime in 5 51-bit limbs
+ */
+static fe51 prime51 = { 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff};
+
+static void fsub(fe51 h, const fe51 f, const fe51 g)
+{
+	h[0] = (f[0] + ((prime51[0] * 2))) - g[0];
+	h[1] = (f[1] + ((prime51[1] * 2))) - g[1];
+	h[2] = (f[2] + ((prime51[2] * 2))) - g[2];
+	h[3] = (f[3] + ((prime51[3] * 2))) - g[3];
+	h[4] = (f[4] + ((prime51[4] * 2))) - g[4];
+}
+
+static void finv(fe51 o, const fe51 i)
+{
+	fe51 a0, b, c, t00;
+
+	fsqr(a0, i);
+	x25519_fe51_sqr_times(t00, a0, 2);
+
+	fmul(b, t00, i);
+	fmul(a0, b, a0);
+
+	fsqr(t00, a0);
+
+	fmul(b, t00, b);
+	x25519_fe51_sqr_times(t00, b, 5);
+
+	fmul(b, t00, b);
+	x25519_fe51_sqr_times(t00, b, 10);
+
+	fmul(c, t00, b);
+	x25519_fe51_sqr_times(t00, c, 20);
+
+	fmul(t00, t00, c);
+	x25519_fe51_sqr_times(t00, t00, 10);
+
+	fmul(b, t00, b);
+	x25519_fe51_sqr_times(t00, b, 50);
+
+	fmul(c, t00, b);
+	x25519_fe51_sqr_times(t00, c, 100);
+
+	fmul(t00, t00, c);
+	x25519_fe51_sqr_times(t00, t00, 50);
+
+	fmul(t00, t00, b);
+	x25519_fe51_sqr_times(t00, t00, 5);
+
+	fmul(o, t00, a0);
+}
+
+static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32],
+			    const uint8_t point[32])
+{
+	fe51 x1, x2, z2, x3, z3;
+	uint8_t s[32];
+	unsigned int swap = 0;
+	int i;
+
+	memcpy(s, scalar, 32);
+	s[0]  &= 0xf8;
+	s[31] &= 0x7f;
+	s[31] |= 0x40;
+	fe51_frombytes(x1, point);
+
+	z2[0] = z2[1] = z2[2] = z2[3] = z2[4] = 0;
+	x3[0] = x1[0];
+	x3[1] = x1[1];
+	x3[2] = x1[2];
+	x3[3] = x1[3];
+	x3[4] = x1[4];
+
+	x2[0] = z3[0] = 1;
+	x2[1] = z3[1] = 0;
+	x2[2] = z3[2] = 0;
+	x2[3] = z3[3] = 0;
+	x2[4] = z3[4] = 0;
+
+	for (i = 254; i >= 0; --i) {
+		unsigned int k_t = 1 & (s[i / 8] >> (i & 7));
+		fe51 a, b, c, d, e;
+		fe51 da, cb, aa, bb;
+		fe51 dacb_p, dacb_m;
+
+		swap ^= k_t;
+		cswap(x2, x3, swap);
+		cswap(z2, z3, swap);
+		swap = k_t;
+
+		fsub(b, x2, z2);	// B = x_2 - z_2
+		fadd(a, x2, z2);	// A = x_2 + z_2
+		fsub(d, x3, z3);	// D = x_3 - z_3
+		fadd(c, x3, z3);	// C = x_3 + z_3
+
+		fsqr(bb, b);		// BB = B^2
+		fsqr(aa, a);		// AA = A^2
+		fmul(da, d, a);		// DA = D * A
+		fmul(cb, c, b);		// CB = C * B
+
+		fsub(e, aa, bb
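(One detail worth calling out in the field arithmetic above: fsub adds twice the prime before subtracting, i.e. it computes f + 2p - g limb by limb. Since

    f - g  =  f + 2p - g  (mod p)

the result stays congruent to the true difference, while every unsigned 64-bit limb stays non-negative: each limb of 2 * prime51 is at least 2^52 - 38, which dominates g's limbs assuming the assembly primitives keep them only slightly above 2^51. This avoids borrows without any branches.)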
[PATCH 0/3] crypto: X25519 support for ppc64le
This patch series provides X25519 support for ppc64le with a new module curve25519-ppc64le. The implementation is based on the CRYPTOGAMS perl output from x25519-ppc64.pl, modified, and with 3 supporting functions added. This series has passed the selftest, verified by running modprobe curve25519-ppc64le.

Danny Tsen (3):
  X25519 low-level primitives for ppc64le.
  X25519 core functions to handle scalar multiplication for ppc64le.
  Update Kconfig and Makefile.

 arch/powerpc/crypto/Kconfig                   |  11 +
 arch/powerpc/crypto/Makefile                  |   2 +
 arch/powerpc/crypto/curve25519-ppc64le-core.c | 299
 arch/powerpc/crypto/curve25519-ppc64le_asm.S  | 648 ++
 4 files changed, 960 insertions(+)
 create mode 100644 arch/powerpc/crypto/curve25519-ppc64le-core.c
 create mode 100644 arch/powerpc/crypto/curve25519-ppc64le_asm.S

--
2.31.1
[PATCH 1/3] crypto: X25519 low-level primitives for ppc64le.
Used the perl output of x25519-ppc64.pl from CRYPTOGAMS and added three supporting functions: x25519_fe51_sqr_times, x25519_fe51_frombytes and x25519_fe51_tobytes.

Signed-off-by: Danny Tsen
---
 arch/powerpc/crypto/curve25519-ppc64le_asm.S | 648 +++
 1 file changed, 648 insertions(+)
 create mode 100644 arch/powerpc/crypto/curve25519-ppc64le_asm.S

diff --git a/arch/powerpc/crypto/curve25519-ppc64le_asm.S b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
new file mode 100644
index ..8a018104838a
--- /dev/null
+++ b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
@@ -0,0 +1,648 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Copyright 2024- IBM Corp. All Rights Reserved.
+#
+# This code is taken from CRYPTOGAMs[1] and is included here using the option
+# in the license to distribute the code under the GPL. Therefore this program
+# is free software; you can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2 as published by the Free Software
+# Foundation.
+#
+# [1] https://www.openssl.org/~appro/cryptogams/
+
+# Copyright (c) 2006-2017, CRYPTOGAMS by
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#	* Redistributions of source code must retain copyright notices,
+#	  this list of conditions and the following disclaimer.
+#
+#	* Redistributions in binary form must reproduce the above
+#	  copyright notice, this list of conditions and the following
+#	  disclaimer in the documentation and/or other materials
+#	  provided with the distribution.
+#
+#	* Neither the name of the CRYPTOGAMS nor the names of its
+#	  copyright holder and contributors may be used to endorse or
+#	  promote products derived from this software without specific
+#	  prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#
+# Written by Andy Polyakov for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+#
+
+#
+#
+# Written and Modified by Danny Tsen
+# - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes
+#
+# X25519 lower-level primitives for PPC64.
+#
+
+#include
+
+.machine "any"
+.abiversion 2
+.text
+
+SYM_FUNC_START(x25519_fe51_mul)
+.align 5
+
+	stdu	1,-144(1)
+	std	21,56(1)
+	std	22,64(1)
+	std	23,72(1)
+	std	24,80(1)
+	std	25,88(1)
+	std	26,96(1)
+	std	27,104(1)
+	std	28,112(1)
+	std	29,120(1)
+	std	30,128(1)
+	std	31,136(1)
+
+	ld	6,0(5)
+	ld	7,0(4)
+	ld	8,8(4)
+	ld	9,16(4)
+	ld	10,24(4)
+	ld	11,32(4)
+
+	mulld	22,7,6
+	mulhdu	23,7,6
+
+	mulld	24,8,6
+	mulhdu	25,8,6
+
+	mulld	30,11,6
+	mulhdu	31,11,6
+	ld	4,8(5)
+	mulli	11,11,19
+
+	mulld	26,9,6
+	mulhdu	27,9,6
+
+	mulld	28,10,6
+	mulhdu	29,10,6
+	mulld	12,11,4
+	mulhdu	21,11,4
+	addc	22,22,12
+	adde	23,23,21
+
+	mulld	12,7,4
+	mulhdu	21,7,4
+	addc	24,24,12
+	adde	25,25,21
+
+	mulld	12,10,4
+	mulhdu	21,10,4
+	ld	6,16(5)
+	mulli	10,10,19
+	addc	30,30,12
+	adde	31,31,21
+
+	mulld	12,8,4
+	mulhdu	21,8,4
+	addc	26,26,12
+	adde	27,27,21
+
+	mulld	12,9,4
+	mulhdu	21,9,4
+
Re: [PATCH] crypto: vmx: Move ppc vmx directory to arch/powerpc/crypto.
Thanks Herbert.

-Danny

On 1/26/24 4:58 PM, Herbert Xu wrote:
On Tue, Jan 02, 2024 at 03:58:56PM -0500, Danny Tsen wrote:
> Relocate all crypto files in the vmx driver to the arch/powerpc/crypto
> directory and remove the vmx directory.
>
> drivers/crypto/vmx/aes.c rename to arch/powerpc/crypto/aes.c
> drivers/crypto/vmx/aes_cbc.c rename to arch/powerpc/crypto/aes_cbc.c
> drivers/crypto/vmx/aes_ctr.c rename to arch/powerpc/crypto/aes_ctr.c
> drivers/crypto/vmx/aes_xts.c rename to arch/powerpc/crypto/aes_xts.c
> drivers/crypto/vmx/aesp8-ppc.h rename to arch/powerpc/crypto/aesp8-ppc.h
> drivers/crypto/vmx/aesp8-ppc.pl rename to arch/powerpc/crypto/aesp8-ppc.pl
> drivers/crypto/vmx/ghash.c rename to arch/powerpc/crypto/ghash.c
> drivers/crypto/vmx/ghashp8-ppc.pl rename to arch/powerpc/crypto/ghashp8-ppc.pl
> drivers/crypto/vmx/vmx.c rename to arch/powerpc/crypto/vmx.c
>
> deleted files:
> drivers/crypto/vmx/Makefile
> drivers/crypto/vmx/Kconfig
> drivers/crypto/vmx/ppc-xlate.pl
>
> This patch has been tested and has passed the selftest. The patch was
> also tested with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.
>
> Signed-off-by: Danny Tsen
> ---
>  arch/powerpc/crypto/Kconfig                               |  20 ++
>  arch/powerpc/crypto/Makefile                              |  20 +-
>  {drivers/crypto/vmx => arch/powerpc/crypto}/aes.c         |   0
>  {drivers/crypto/vmx => arch/powerpc/crypto}/aes_cbc.c     |   0
>  {drivers/crypto/vmx => arch/powerpc/crypto}/aes_ctr.c     |   0
>  {drivers/crypto/vmx => arch/powerpc/crypto}/aes_xts.c     |   0
>  {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.h   |   0
>  {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.pl  |   0
>  {drivers/crypto/vmx => arch/powerpc/crypto}/ghash.c       |   0
>  {drivers/crypto/vmx => arch/powerpc/crypto}/ghashp8-ppc.pl |  0
>  {drivers/crypto/vmx => arch/powerpc/crypto}/vmx.c         |   0
>  drivers/crypto/Kconfig                                    |  14 +-
>  drivers/crypto/Makefile                                   |   2 +-
>  drivers/crypto/vmx/.gitignore                             |   3 -
>  drivers/crypto/vmx/Kconfig                                |  14 --
>  drivers/crypto/vmx/Makefile                               |  23 --
>  drivers/crypto/vmx/ppc-xlate.pl                           | 231 --
>  17 files changed, 46 insertions(+), 281 deletions(-)
>  rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes.c (100%)
>  rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_cbc.c (100%)
>  rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_ctr.c (100%)
>  rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_xts.c (100%)
>  rename {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.h (100%)
>  rename {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.pl (100%)
>  rename {drivers/crypto/vmx => arch/powerpc/crypto}/ghash.c (100%)
>  rename {drivers/crypto/vmx => arch/powerpc/crypto}/ghashp8-ppc.pl (100%)
>  rename {drivers/crypto/vmx => arch/powerpc/crypto}/vmx.c (100%)
>  delete mode 100644 drivers/crypto/vmx/.gitignore
>  delete mode 100644 drivers/crypto/vmx/Kconfig
>  delete mode 100644 drivers/crypto/vmx/Makefile
>  delete mode 100644 drivers/crypto/vmx/ppc-xlate.pl

Patch applied. Thanks.
[PATCH] crypto: vmx: Move ppc vmx directory to arch/powerpc/crypto.
Relocate all crypto files in the vmx driver to the arch/powerpc/crypto directory and remove the vmx directory.

drivers/crypto/vmx/aes.c rename to arch/powerpc/crypto/aes.c
drivers/crypto/vmx/aes_cbc.c rename to arch/powerpc/crypto/aes_cbc.c
drivers/crypto/vmx/aes_ctr.c rename to arch/powerpc/crypto/aes_ctr.c
drivers/crypto/vmx/aes_xts.c rename to arch/powerpc/crypto/aes_xts.c
drivers/crypto/vmx/aesp8-ppc.h rename to arch/powerpc/crypto/aesp8-ppc.h
drivers/crypto/vmx/aesp8-ppc.pl rename to arch/powerpc/crypto/aesp8-ppc.pl
drivers/crypto/vmx/ghash.c rename to arch/powerpc/crypto/ghash.c
drivers/crypto/vmx/ghashp8-ppc.pl rename to arch/powerpc/crypto/ghashp8-ppc.pl
drivers/crypto/vmx/vmx.c rename to arch/powerpc/crypto/vmx.c

deleted files:
drivers/crypto/vmx/Makefile
drivers/crypto/vmx/Kconfig
drivers/crypto/vmx/ppc-xlate.pl

This patch has been tested and has passed the selftest. The patch was also tested with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.

Signed-off-by: Danny Tsen
---
 arch/powerpc/crypto/Kconfig                               |  20 ++
 arch/powerpc/crypto/Makefile                              |  20 +-
 {drivers/crypto/vmx => arch/powerpc/crypto}/aes.c         |   0
 {drivers/crypto/vmx => arch/powerpc/crypto}/aes_cbc.c     |   0
 {drivers/crypto/vmx => arch/powerpc/crypto}/aes_ctr.c     |   0
 {drivers/crypto/vmx => arch/powerpc/crypto}/aes_xts.c     |   0
 {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.h   |   0
 {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.pl  |   0
 {drivers/crypto/vmx => arch/powerpc/crypto}/ghash.c       |   0
 {drivers/crypto/vmx => arch/powerpc/crypto}/ghashp8-ppc.pl |  0
 {drivers/crypto/vmx => arch/powerpc/crypto}/vmx.c         |   0
 drivers/crypto/Kconfig                                    |  14 +-
 drivers/crypto/Makefile                                   |   2 +-
 drivers/crypto/vmx/.gitignore                             |   3 -
 drivers/crypto/vmx/Kconfig                                |  14 --
 drivers/crypto/vmx/Makefile                               |  23 --
 drivers/crypto/vmx/ppc-xlate.pl                           | 231 --
 17 files changed, 46 insertions(+), 281 deletions(-)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes.c (100%)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_cbc.c (100%)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_ctr.c (100%)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_xts.c (100%)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.h (100%)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.pl (100%)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/ghash.c (100%)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/ghashp8-ppc.pl (100%)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/vmx.c (100%)
 delete mode 100644 drivers/crypto/vmx/.gitignore
 delete mode 100644 drivers/crypto/vmx/Kconfig
 delete mode 100644 drivers/crypto/vmx/Makefile
 delete mode 100644 drivers/crypto/vmx/ppc-xlate.pl

diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 6fc2248ca561..1e201b7ae2fc 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -137,4 +137,24 @@ config CRYPTO_POLY1305_P10
 	  - Power10 or later
 	  - Little-endian

+config CRYPTO_DEV_VMX
+	bool "Support for VMX cryptographic acceleration instructions"
+	depends on PPC64 && VSX
+	help
+	  Support for VMX cryptographic acceleration instructions.
+
+config CRYPTO_DEV_VMX_ENCRYPT
+	tristate "Encryption acceleration support on P8 CPU"
+	depends on CRYPTO_DEV_VMX
+	select CRYPTO_AES
+	select CRYPTO_CBC
+	select CRYPTO_CTR
+	select CRYPTO_GHASH
+	select CRYPTO_XTS
+	default m
+	help
+	  Support for VMX cryptographic acceleration instructions on Power8 CPU.
+	  This module supports acceleration for AES and GHASH in hardware. If you
+	  choose 'M' here, this module will be called vmx-crypto.
+
 endmenu

diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index ebdac1b9eb9a..fca0e9739869 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -16,6 +16,7 @@
 obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o
 obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
 obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
 obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
+obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o

 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
 md5-ppc-y := md5-asm.o md5-glue.o
@@ -27,14 +28,29 @@
 crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o
 aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o
 chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
 poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
+vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o
+
+ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
+override flavour := linux-ppc64le
+e
Re: [PATCH v2] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.
Still waiting for the CCLA to send to OpenSSL. Thanks.

-Danny

On 9/15/23 8:29 AM, Michael Ellerman wrote:
Danny Tsen writes:
> Improve AES/XTS performance of 6-way unrolling for PowerPC up to 17%
> with tcrypt. This is done by using one instruction, vpermxor, to
> replace xor and vsldoi.
>
> The same changes were applied to OpenSSL code and a pull request was
> submitted.
> https://github.com/openssl/openssl/pull/21812

Still unmerged as of today.

cheers
Re: [PATCH v2] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.
Thanks Herbert.

-Danny

On 9/15/23 5:41 AM, Herbert Xu wrote:
On Wed, Aug 30, 2023 at 09:49:11AM -0400, Danny Tsen wrote:
> Improve AES/XTS performance of 6-way unrolling for PowerPC up to 17%
> with tcrypt. This is done by using one instruction, vpermxor, to
> replace xor and vsldoi.
>
> The same changes were applied to OpenSSL code and a pull request was
> submitted.
>
> This patch has been tested with the kernel crypto module tcrypt.ko and
> has passed the selftest. The patch was also tested with
> CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.
>
> Signed-off-by: Danny Tsen
> ---
>  drivers/crypto/vmx/aesp8-ppc.pl | 141 +---
>  1 file changed, 92 insertions(+), 49 deletions(-)

Patch applied. Thanks.
Re: [PATCH] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.
Hi Michael, I just submitted the v2 patch. Thanks. -Danny On 8/29/23 11:37 PM, Michael Ellerman wrote: Danny Tsen writes: Improve AES/XTS performance of 6-way unrolling for PowerPC up to 17% with tcrypt. This is done by using one instruction, vpermxor, to replace xor and vsldoi. This patch has been tested with the kernel crypto module tcrypt.ko and has passed the selftest. The patch is also tested with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled. Signed-off-by: Danny Tsen --- drivers/crypto/vmx/aesp8-ppc.pl | 141 +--- 1 file changed, 92 insertions(+), 49 deletions(-) That's CRYPTOGAMS code, and is so far largely unchanged from the original. I see you've sent the same change to openssl, but it's not merged yet. Please document that in the change log, we want to keep the code in sync as much as possible, and document any divergences. cheers diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl index 50a0a18f35da..f729589d792e 100644 --- a/drivers/crypto/vmx/aesp8-ppc.pl +++ b/drivers/crypto/vmx/aesp8-ppc.pl @@ -132,11 +132,12 @@ rcon: .long 0x1b00, 0x1b00, 0x1b00, 0x1b00 ?rev .long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev .long 0,0,0,0 ?asis +.long 0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe Lconsts: mflrr0 bcl 20,31,\$+4 mflr$ptr #v "distance between . and rcon - addi$ptr,$ptr,-0x48 + addi$ptr,$ptr,-0x58 mtlrr0 blr .long 0 @@ -2495,6 +2496,17 @@ _aesp8_xts_encrypt6x: li $x70,0x70 mtspr 256,r0 + xxlor 2, 32+$eighty7, 32+$eighty7 + vsldoi $eighty7,$tmp,$eighty7,1# 0x010101..87 + xxlor 1, 32+$eighty7, 32+$eighty7 + + # Load XOR Lconsts. + mr $x70, r6 + bl Lconsts + lxvw4x 0, $x40, r6 # load XOR contents + mr r6, $x70 + li $x70,0x70 + subi$rounds,$rounds,3 # -4 in total lvx $rndkey0,$x00,$key1 # load key schedule @@ -2537,69 +2549,77 @@ Load_xts_enc_key: ?vperm v31,v31,$twk5,$keyperm lvx v25,$x10,$key_ # pre-load round[2] + # Switch to use the following codes with 0x010101..87 to generate tweak. + # eighty7 = 0x010101..87 + # vsrab tmp, tweak, seven # next tweak value, right shift 7 bits + # vand tmp, tmp, eighty7 # last byte with carry + # vaddubm tweak, tweak, tweak # left shift 1 bit (x2) + # xxlor vsx, 0, 0 + # vpermxor tweak, tweak, tmp, vsx + vperm $in0,$inout,$inptail,$inpperm subi $inp,$inp,31# undo "caller" vxor$twk0,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vand$tmp,$tmp,$eighty7 vxor $out0,$in0,$twk0 - vxor$tweak,$tweak,$tmp + xxlor 32+$in1, 0, 0 + vpermxor$tweak, $tweak, $tmp, $in1 lvx_u $in1,$x10,$inp vxor$twk1,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in1,$in1,$in1,$leperm vand$tmp,$tmp,$eighty7 vxor $out1,$in1,$twk1 - vxor$tweak,$tweak,$tmp + xxlor 32+$in2, 0, 0 + vpermxor$tweak, $tweak, $tmp, $in2 lvx_u $in2,$x20,$inp andi. 
$taillen,$len,15 vxor$twk2,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in2,$in2,$in2,$leperm vand$tmp,$tmp,$eighty7 vxor $out2,$in2,$twk2 - vxor$tweak,$tweak,$tmp + xxlor 32+$in3, 0, 0 + vpermxor$tweak, $tweak, $tmp, $in3 lvx_u $in3,$x30,$inp sub$len,$len,$taillen vxor$twk3,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in3,$in3,$in3,$leperm vand$tmp,$tmp,$eighty7 vxor $out3,$in3,$twk3 - vxor$tweak,$tweak,$tmp + xxlor 32+$in4, 0, 0 + vpermxor$tweak, $tweak, $tmp, $in4 lvx_u $in4,$x40,$inp subi $len,$len,0x60 vxor
[PATCH v2] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.
Improve AES/XTS performance of 6-way unrolling for PowerPC up to 17% with tcrypt. This is done by using one instruction, vpermxor, to replace xor and vsldoi.

The same changes were applied to OpenSSL code and a pull request was submitted.

This patch has been tested with the kernel crypto module tcrypt.ko and has passed the selftest. The patch was also tested with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.

Signed-off-by: Danny Tsen
---
 drivers/crypto/vmx/aesp8-ppc.pl | 141 +---
 1 file changed, 92 insertions(+), 49 deletions(-)

diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl
index 50a0a18f35da..f729589d792e 100644
--- a/drivers/crypto/vmx/aesp8-ppc.pl
+++ b/drivers/crypto/vmx/aesp8-ppc.pl
@@ -132,11 +132,12 @@ rcon:
 .long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
 .long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
 .long	0,0,0,0						?asis
+.long	0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
 Lconsts:
 	mflr	r0
 	bcl	20,31,\$+4
 	mflr	$ptr	 #vvvvv "distance between . and rcon
-	addi	$ptr,$ptr,-0x48
+	addi	$ptr,$ptr,-0x58
 	mtlr	r0
 	blr
 	.long	0
@@ -2495,6 +2496,17 @@ _aesp8_xts_encrypt6x:
 	li	$x70,0x70
 	mtspr	256,r0

+	xxlor	2, 32+$eighty7, 32+$eighty7
+	vsldoi	$eighty7,$tmp,$eighty7,1	# 0x010101..87
+	xxlor	1, 32+$eighty7, 32+$eighty7
+
+	# Load XOR Lconsts.
+	mr	$x70, r6
+	bl	Lconsts
+	lxvw4x	0, $x40, r6	# load XOR contents
+	mr	r6, $x70
+	li	$x70,0x70
+
 	subi	$rounds,$rounds,3	# -4 in total

 	lvx	$rndkey0,$x00,$key1	# load key schedule
@@ -2537,69 +2549,77 @@ Load_xts_enc_key:
 	?vperm	v31,v31,$twk5,$keyperm
 	lvx	v25,$x10,$key_	# pre-load round[2]

+	# Switch to use the following codes with 0x010101..87 to generate tweak.
+	#	eighty7 = 0x010101..87
+	#	vsrab		tmp, tweak, seven	# next tweak value, right shift 7 bits
+	#	vand		tmp, tmp, eighty7	# last byte with carry
+	#	vaddubm		tweak, tweak, tweak	# left shift 1 bit (x2)
+	#	xxlor		vsx, 0, 0
+	#	vpermxor	tweak, tweak, tmp, vsx
+
 	 vperm	$in0,$inout,$inptail,$inpperm
 	 subi	$inp,$inp,31	# undo "caller"
 	vxor	$twk0,$tweak,$rndkey0
 	vsrab	$tmp,$tweak,$seven	# next tweak value
 	vaddubm	$tweak,$tweak,$tweak
-	vsldoi	$tmp,$tmp,$tmp,15
 	vand	$tmp,$tmp,$eighty7
 	 vxor	$out0,$in0,$twk0
-	vxor	$tweak,$tweak,$tmp
+	xxlor	32+$in1, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in1

 	lvx_u	$in1,$x10,$inp
 	vxor	$twk1,$tweak,$rndkey0
 	vsrab	$tmp,$tweak,$seven	# next tweak value
 	vaddubm	$tweak,$tweak,$tweak
-	vsldoi	$tmp,$tmp,$tmp,15
 	le?vperm	$in1,$in1,$in1,$leperm
 	vand	$tmp,$tmp,$eighty7
 	 vxor	$out1,$in1,$twk1
-	vxor	$tweak,$tweak,$tmp
+	xxlor	32+$in2, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in2

 	lvx_u	$in2,$x20,$inp
 	andi.	$taillen,$len,15
 	vxor	$twk2,$tweak,$rndkey0
 	vsrab	$tmp,$tweak,$seven	# next tweak value
 	vaddubm	$tweak,$tweak,$tweak
-	vsldoi	$tmp,$tmp,$tmp,15
 	le?vperm	$in2,$in2,$in2,$leperm
 	vand	$tmp,$tmp,$eighty7
 	 vxor	$out2,$in2,$twk2
-	vxor	$tweak,$tweak,$tmp
+	xxlor	32+$in3, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in3

 	lvx_u	$in3,$x30,$inp
 	sub	$len,$len,$taillen
 	vxor	$twk3,$tweak,$rndkey0
 	vsrab	$tmp,$tweak,$seven	# next tweak value
 	vaddubm	$tweak,$tweak,$tweak
-	vsldoi	$tmp,$tmp,$tmp,15
 	le?vperm	$in3,$in3,$in3,$leperm
 	vand	$tmp,$tmp,$eighty7
 	 vxor	$out3,$in3,$twk3
-	vxor	$tweak,$tweak,$tmp
+	xxlor	32+$in4, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in4

 	lvx_u	$in4,$x40,$inp
 	subi	$len,$len,0x60
 	vxor	$twk4,$tweak,$rndkey0
 	vsrab	$tmp,$tweak,$seven	# next tweak value
 	vaddubm	$tweak,$tweak,$tweak
-	vsldoi	$tmp,$tmp,$tmp,15
 	le?vperm	$in4,$in4,$in4,$leperm
 	vand
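For context on what this instruction sequence computes: the vsrab/vand/vaddubm steps (with either the old vsldoi+vxor pair or the new vpermxor) implement the standard XTS tweak update - multiplication by x in GF(2^128) with reduction polynomial x^128 + x^7 + x^2 + x + 1, i.e. a 128-bit left shift with a conditional 0x87 fold-back. A byte-wise C model of that update (illustrative only; xts_tweak_double is an invented name, and the little-endian tweak layout matches what the vaddubm/vsrab pair assumes):

    #include <stdint.h>

    /* T' = T * x in GF(2^128): shift the 128-bit tweak left by one bit and,
     * if a bit fell off the top, xor 0x87 into the low byte. */
    static void xts_tweak_double(uint8_t t[16])
    {
            unsigned int carry = 0, i;

            for (i = 0; i < 16; i++) {      /* t[0] is least significant */
                    unsigned int b = t[i];
                    t[i] = (uint8_t)((b << 1) | carry);
                    carry = b >> 7;         /* carry into the next byte */
            }
            if (carry)
                    t[0] ^= 0x87;           /* reduce modulo the polynomial */
    }

The vpermxor form fuses the byte rotation of the carry mask and the final xor into one instruction, which is where the measured speedup comes from.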
[PATCH] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.
Improve AES/XTS performance of 6-way unrolling for PowerPC up to 17% with tcrypt. This is done by using one instruction, vpermxor, to replace xor and vsldoi. This patch has been tested with the kernel crypto module tcrypt.ko and has passed the selftest. The patch is also tested with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled. Signed-off-by: Danny Tsen --- drivers/crypto/vmx/aesp8-ppc.pl | 141 +--- 1 file changed, 92 insertions(+), 49 deletions(-) diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl index 50a0a18f35da..f729589d792e 100644 --- a/drivers/crypto/vmx/aesp8-ppc.pl +++ b/drivers/crypto/vmx/aesp8-ppc.pl @@ -132,11 +132,12 @@ rcon: .long 0x1b00, 0x1b00, 0x1b00, 0x1b00 ?rev .long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev .long 0,0,0,0 ?asis +.long 0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe Lconsts: mflrr0 bcl 20,31,\$+4 mflr$ptr #v "distance between . and rcon - addi$ptr,$ptr,-0x48 + addi$ptr,$ptr,-0x58 mtlrr0 blr .long 0 @@ -2495,6 +2496,17 @@ _aesp8_xts_encrypt6x: li $x70,0x70 mtspr 256,r0 + xxlor 2, 32+$eighty7, 32+$eighty7 + vsldoi $eighty7,$tmp,$eighty7,1# 0x010101..87 + xxlor 1, 32+$eighty7, 32+$eighty7 + + # Load XOR Lconsts. + mr $x70, r6 + bl Lconsts + lxvw4x 0, $x40, r6 # load XOR contents + mr r6, $x70 + li $x70,0x70 + subi$rounds,$rounds,3 # -4 in total lvx $rndkey0,$x00,$key1 # load key schedule @@ -2537,69 +2549,77 @@ Load_xts_enc_key: ?vperm v31,v31,$twk5,$keyperm lvx v25,$x10,$key_ # pre-load round[2] + # Switch to use the following codes with 0x010101..87 to generate tweak. + # eighty7 = 0x010101..87 + # vsrab tmp, tweak, seven # next tweak value, right shift 7 bits + # vand tmp, tmp, eighty7 # last byte with carry + # vaddubm tweak, tweak, tweak # left shift 1 bit (x2) + # xxlor vsx, 0, 0 + # vpermxor tweak, tweak, tmp, vsx + vperm $in0,$inout,$inptail,$inpperm subi $inp,$inp,31# undo "caller" vxor$twk0,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vand$tmp,$tmp,$eighty7 vxor $out0,$in0,$twk0 - vxor$tweak,$tweak,$tmp + xxlor 32+$in1, 0, 0 + vpermxor$tweak, $tweak, $tmp, $in1 lvx_u $in1,$x10,$inp vxor$twk1,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in1,$in1,$in1,$leperm vand$tmp,$tmp,$eighty7 vxor $out1,$in1,$twk1 - vxor$tweak,$tweak,$tmp + xxlor 32+$in2, 0, 0 + vpermxor$tweak, $tweak, $tmp, $in2 lvx_u $in2,$x20,$inp andi. $taillen,$len,15 vxor$twk2,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in2,$in2,$in2,$leperm vand$tmp,$tmp,$eighty7 vxor $out2,$in2,$twk2 - vxor$tweak,$tweak,$tmp + xxlor 32+$in3, 0, 0 + vpermxor$tweak, $tweak, $tmp, $in3 lvx_u $in3,$x30,$inp sub$len,$len,$taillen vxor$twk3,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in3,$in3,$in3,$leperm vand$tmp,$tmp,$eighty7 vxor $out3,$in3,$twk3 - vxor$tweak,$tweak,$tmp + xxlor 32+$in4, 0, 0 + vpermxor$tweak, $tweak, $tmp, $in4 lvx_u $in4,$x40,$inp subi $len,$len,0x60 vxor$twk4,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in4,$in4,$in4,$leperm vand$tmp,$tmp,$eighty7 vxor $out4,$in4,$twk4 - vxor
RE: [PATCH v2 0/5] crypto: Accelerated Chacha20/Poly1305 implementation
Thanks.

-Danny

From: Herbert Xu
Sent: Friday, July 14, 2023 4:49 PM
To: Danny Tsen
Cc: linux-cry...@vger.kernel.org; lei...@debian.org; na...@linux.ibm.com; ap...@cryptogams.org; linux-ker...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; m...@ellerman.id.au; ltc...@linux.vnet.ibm.com; Danny Tsen
Subject: [EXTERNAL] Re: [PATCH v2 0/5] crypto: Accelerated Chacha20/Poly1305 implementation

On Wed, Apr 26, 2023 at 03:11:42PM -0400, Danny Tsen wrote:
> This patch series provides an accelerated/optimized Chacha20 and
> Poly1305 implementation for Power10 or later CPUs (ppc64le). This
> module implements the algorithm specified in RFC7539. The
> implementation provides 3.5X better performance than the baseline for
> Chacha20 and Poly1305 individually, and a 1.5X improvement for the
> combined Chacha20/Poly1305 operation.
>
> This patch has been tested with the kernel crypto module tcrypt.ko and
> has passed the selftest. The patch was also tested with
> CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.
>
> Danny Tsen (5):
>   An optimized Chacha20 implementation with 8-way unrolling for ppc64le.
>   Glue code for optimized Chacha20 implementation for ppc64le.
>   An optimized Poly1305 implementation with 4-way unrolling for ppc64le.
>   Glue code for optimized Poly1305 implementation for ppc64le.
>   Update Kconfig and Makefile.
>
>  arch/powerpc/crypto/Kconfig             |   26 +
>  arch/powerpc/crypto/Makefile            |    4 +
>  arch/powerpc/crypto/chacha-p10-glue.c   |  221 +
>  arch/powerpc/crypto/chacha-p10le-8x.S   |  842 ++
>  arch/powerpc/crypto/poly1305-p10-glue.c |  186
>  arch/powerpc/crypto/poly1305-p10le_64.S | 1075 +++
>  6 files changed, 2354 insertions(+)
>  create mode 100644 arch/powerpc/crypto/chacha-p10-glue.c
>  create mode 100644 arch/powerpc/crypto/chacha-p10le-8x.S
>  create mode 100644 arch/powerpc/crypto/poly1305-p10-glue.c
>  create mode 100644 arch/powerpc/crypto/poly1305-p10le_64.S
>
> --
> 2.31.1

All applied. Thanks.
--
Email: Herbert Xu
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
[PATCH v2 5/5] Update Kconfig and Makefile.
Defined CRYPTO_CHACHA20_P10 and CRYPTO POLY1305_P10 in Kconfig to support optimized implementation for Power10 and later CPU. Added new module driver chacha-p10-crypto and poly1305-p10-crypto. Signed-off-by: Danny Tsen --- arch/powerpc/crypto/Kconfig | 26 ++ arch/powerpc/crypto/Makefile | 4 2 files changed, 30 insertions(+) diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index 7113f9355165..f74d9dd6574b 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -111,4 +111,30 @@ config CRYPTO_AES_GCM_P10 Support for cryptographic acceleration instructions on Power10 or later CPU. This module supports stitched acceleration for AES/GCM. +config CRYPTO_CHACHA20_P10 + tristate "Ciphers: ChaCha20, XChacha20, XChacha12 (P10 or later)" + depends on PPC64 && CPU_LITTLE_ENDIAN + select CRYPTO_SKCIPHER + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + help + Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12 + stream cipher algorithms + + Architecture: PowerPC64 + - Power10 or later + - Little-endian + +config CRYPTO_POLY1305_P10 + tristate "Hash functions: Poly1305 (P10 or later)" + depends on PPC64 && CPU_LITTLE_ENDIAN + select CRYPTO_HASH + select CRYPTO_LIB_POLY1305_GENERIC + help + Poly1305 authenticator algorithm (RFC7539) + + Architecture: PowerPC64 + - Power10 or later + - Little-endian + endmenu diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index 05c7486f42c5..cd5282eff451 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -14,6 +14,8 @@ obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o +obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o +obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o md5-ppc-y := md5-asm.o md5-glue.o @@ -23,6 +25,8 @@ sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp8-ppc.o aesp8-ppc.o +chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o +poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o quiet_cmd_perl = PERL$@ cmd_perl = $(PERL) $< $(if $(CONFIG_CPU_LITTLE_ENDIAN), linux-ppc64le, linux-ppc64) > $@ -- 2.31.1
[PATCH v2 2/5] Glue code for optimized Chacha20 implementation for ppc64le.
Signed-off-by: Danny Tsen --- arch/powerpc/crypto/chacha-p10-glue.c | 221 ++ 1 file changed, 221 insertions(+) create mode 100644 arch/powerpc/crypto/chacha-p10-glue.c diff --git a/arch/powerpc/crypto/chacha-p10-glue.c b/arch/powerpc/crypto/chacha-p10-glue.c new file mode 100644 index ..74fb86b0d209 --- /dev/null +++ b/arch/powerpc/crypto/chacha-p10-glue.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * PowerPC P10 (ppc64le) accelerated ChaCha and XChaCha stream ciphers, + * including ChaCha20 (RFC7539) + * + * Copyright 2023- IBM Corp. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage void chacha_p10le_8x(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); + +static void vsx_begin(void) +{ + preempt_disable(); + enable_kernel_vsx(); +} + +static void vsx_end(void) +{ + disable_kernel_vsx(); + preempt_enable(); +} + +static void chacha_p10_do_8x(u32 *state, u8 *dst, const u8 *src, +unsigned int bytes, int nrounds) +{ + unsigned int l = bytes & ~0x0FF; + + if (l > 0) { + chacha_p10le_8x(state, dst, src, l, nrounds); + bytes -= l; + src += l; + dst += l; + state[12] += l / CHACHA_BLOCK_SIZE; + } + + if (bytes > 0) + chacha_crypt_generic(state, dst, src, bytes, nrounds); +} + +void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) +{ + hchacha_block_generic(state, stream, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) +{ + chacha_init_generic(state, key, iv); +} +EXPORT_SYMBOL(chacha_init_arch); + +void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, + int nrounds) +{ + if (!static_branch_likely(_p10) || bytes <= CHACHA_BLOCK_SIZE || + !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + vsx_begin(); + chacha_p10_do_8x(state, dst, src, todo, nrounds); + vsx_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +static int chacha_p10_stream_xor(struct skcipher_request *req, +const struct chacha_ctx *ctx, const u8 *iv) +{ + struct skcipher_walk walk; + u32 state[16]; + int err; + + err = skcipher_walk_virt(, req, false); + if (err) + return err; + + chacha_init_generic(state, ctx->key, iv); + + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = rounddown(nbytes, walk.stride); + + if (!crypto_simd_usable()) { + chacha_crypt_generic(state, walk.dst.virt.addr, +walk.src.virt.addr, nbytes, +ctx->nrounds); + } else { + vsx_begin(); + chacha_p10_do_8x(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, ctx->nrounds); + vsx_end(); + } + err = skcipher_walk_done(, walk.nbytes - nbytes); + if (err) + break; + } + + return err; +} + +static int chacha_p10(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + + return chacha_p10_stream_xor(req, ctx, req->iv); +} + +static int xchacha_p10(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 state[16]; + u8 real_iv[16]; + + chacha_init_generic(state, ctx->key, req->iv); + hchacha_block_arch(state, subctx.key, ctx->nrounds); 
+ subctx.nrounds = ctx->nrounds; + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + return chacha_p10_stream_xor(req, &subctx, real_iv); +} + +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-p10", + .base.cra_priority = 300, + .base.cra_blocksize =
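The xchacha_p10() tail above is the standard XChaCha construction, but it is hard to read in the flattened diff, so here is the same sequence re-typeset with comments. This is a reading aid only; the IV-layout notes are my interpretation, based on the equivalent glue code on other architectures, not text quoted from the patch:

	/* 32-byte XChaCha IV as consumed above:
	 *   iv[0..15]  feed HChaCha while deriving the one-off subkey
	 *   iv[16..23] become the inner nonce words (state[14..15])
	 *   iv[24..31] become the initial 64-bit block counter (state[12..13])
	 */
	chacha_init_generic(state, ctx->key, req->iv);  /* key + iv[0..15] */
	hchacha_block_arch(state, subctx.key, ctx->nrounds);
	subctx.nrounds = ctx->nrounds;

	memcpy(&real_iv[0], req->iv + 24, 8);   /* counter words first */
	memcpy(&real_iv[8], req->iv + 16, 8);   /* then the leftover nonce */
	return chacha_p10_stream_xor(req, &subctx, real_iv);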
[PATCH v2 1/5] An optimized Chacha20 implementation with 8-way unrolling for ppc64le.
Improve overall performance of chacha20 encrypt and decrypt operations for Power10 or later CPU. Signed-off-by: Danny Tsen --- arch/powerpc/crypto/chacha-p10le-8x.S | 842 ++ 1 file changed, 842 insertions(+) create mode 100644 arch/powerpc/crypto/chacha-p10le-8x.S diff --git a/arch/powerpc/crypto/chacha-p10le-8x.S b/arch/powerpc/crypto/chacha-p10le-8x.S new file mode 100644 index ..17bedb66b822 --- /dev/null +++ b/arch/powerpc/crypto/chacha-p10le-8x.S @@ -0,0 +1,842 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +# +# Accelerated chacha20 implementation for ppc64le. +# +# Copyright 2023- IBM Corp. All rights reserved +# +#=== +# Written by Danny Tsen +# +# chacha_p10le_8x(u32 *state, byte *dst, const byte *src, +# size_t len, int nrounds); +# +# do rounds, 8 quarter rounds +# 1. a += b; d ^= a; d <<<= 16; +# 2. c += d; b ^= c; b <<<= 12; +# 3. a += b; d ^= a; d <<<= 8; +# 4. c += d; b ^= c; b <<<= 7 +# +# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16 +# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12 +# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8 +# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7 +# +# 4 blocks (a b c d) +# +# a0 b0 c0 d0 +# a1 b1 c1 d1 +# ... +# a4 b4 c4 d4 +# ... +# a8 b8 c8 d8 +# ... +# a12 b12 c12 d12 +# a13 ... +# a14 ... +# a15 b15 c15 d15 +# +# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) +# Diagnal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) +# + +#include +#include +#include +#include + +.machine "any" +.text + +.macro SAVE_GPR GPR OFFSET FRAME + std \GPR,\OFFSET(\FRAME) +.endm + +.macro SAVE_VRS VRS OFFSET FRAME + li 16, \OFFSET + stvx\VRS, 16, \FRAME +.endm + +.macro SAVE_VSX VSX OFFSET FRAME + li 16, \OFFSET + stxvx \VSX, 16, \FRAME +.endm + +.macro RESTORE_GPR GPR OFFSET FRAME + ld \GPR,\OFFSET(\FRAME) +.endm + +.macro RESTORE_VRS VRS OFFSET FRAME + li 16, \OFFSET + lvx \VRS, 16, \FRAME +.endm + +.macro RESTORE_VSX VSX OFFSET FRAME + li 16, \OFFSET + lxvx\VSX, 16, \FRAME +.endm + +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-752(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + addi9, 1, 256 + SAVE_VRS 20, 0, 9 + SAVE_VRS 21, 16, 9 + SAVE_VRS 22, 32, 9 + SAVE_VRS 23, 48, 9 + SAVE_VRS 24, 64, 9 + SAVE_VRS 25, 80, 9 + SAVE_VRS 26, 96, 9 + SAVE_VRS 27, 112, 9 + SAVE_VRS 28, 128, 9 + SAVE_VRS 29, 144, 9 + SAVE_VRS 30, 160, 9 + SAVE_VRS 31, 176, 9 + + SAVE_VSX 14, 192, 9 + SAVE_VSX 15, 208, 9 + SAVE_VSX 16, 224, 9 + SAVE_VSX 17, 240, 9 + SAVE_VSX 18, 256, 9 + SAVE_VSX 19, 272, 9 + SAVE_VSX 20, 288, 9 + SAVE_VSX 21, 304, 9 + SAVE_VSX 22, 320, 9 + SAVE_VSX 23, 336, 9 + SAVE_VSX 24, 352, 9 + SAVE_VSX 25, 368, 9 + SAVE_VSX 26, 384, 9 + SAVE_VSX 27, 400, 9 + SAVE_VSX 28, 416, 9 + SAVE_VSX 29, 432, 9 + SAVE_VSX 30, 448, 9 + SAVE_VSX 31, 464, 9 +.endm # SAVE_REGS + +.macro RESTORE_REGS + addi9, 1, 256 + RESTORE_VRS 20, 0, 9 + RESTORE_VRS 21, 16, 9 + RESTORE_VRS 22, 32, 9 + RESTORE_VRS 23, 48, 9 + RESTORE_VRS 24, 64, 9 + RESTORE_VRS 25, 80, 9 + RESTORE_VRS 26, 96, 9 + RESTORE_VRS 27, 112, 9 + RESTORE_VRS 28, 128, 9 + RESTORE_VRS 29, 144, 9 + 
RESTORE_VRS 30, 160, 9 + RESTORE_VRS 31, 176, 9 + + RESTORE_VSX 14, 192, 9 + RESTORE_VSX 15, 208, 9 + RESTORE_VSX 16, 224, 9 + RESTORE_VSX 17, 240, 9 + RESTORE_VSX 18, 256, 9 + RESTORE_VSX 19, 272, 9 + RESTORE_VSX 20, 288, 9 + RESTORE_VSX 21, 304, 9 + RESTORE_VSX 22, 320, 9 + RESTORE_VSX 23, 336, 9 + RESTORE_VSX 24, 352, 9 + RESTORE_VSX 25, 368, 9 + RESTORE_VSX 26, 384, 9 + RESTORE_VSX 27, 400, 9 + RESTORE_VSX 28, 416, 9 + RESTORE_VSX 29, 432, 9 + RESTORE_VSX 30, 448, 9 + RESTORE_VSX 31, 464, 9 + + RESTORE_GPR 14, 112, 1 + RESTORE_GPR 15, 120, 1 + RESTORE_GPR 16, 128, 1 + RESTORE
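The header comment in the .S file above compresses the whole algorithm into four lines. As a standalone C sketch (userspace, illustrative only; the assembly actually computes eight states in parallel with VSX registers), the quarter round and the column/diagonal pattern it describes are:

	#include <stdint.h>

	static inline uint32_t rotl32(uint32_t v, int n)
	{
		return (v << n) | (v >> (32 - n));
	}

	/* One ChaCha quarter round: steps 1-4 from the header comment. */
	static void quarter_round(uint32_t x[16], int a, int b, int c, int d)
	{
		x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 16);
		x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 12);
		x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 8);
		x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 7);
	}

	/* One double round: four column quarter rounds, then four diagonal
	 * ones; nrounds = 20 means ten of these double rounds. */
	static void double_round(uint32_t x[16])
	{
		quarter_round(x, 0, 4,  8, 12);
		quarter_round(x, 1, 5,  9, 13);
		quarter_round(x, 2, 6, 10, 14);
		quarter_round(x, 3, 7, 11, 15);
		quarter_round(x, 0, 5, 10, 15);
		quarter_round(x, 1, 6, 11, 12);
		quarter_round(x, 2, 7,  8, 13);
		quarter_round(x, 3, 4,  9, 14);
	}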
[PATCH v2 4/5] Glue code for optimized Poly1305 implementation for ppc64le.
Signed-off-by: Danny Tsen --- arch/powerpc/crypto/poly1305-p10-glue.c | 186 1 file changed, 186 insertions(+) create mode 100644 arch/powerpc/crypto/poly1305-p10-glue.c diff --git a/arch/powerpc/crypto/poly1305-p10-glue.c b/arch/powerpc/crypto/poly1305-p10-glue.c new file mode 100644 index ..95dd708573ee --- /dev/null +++ b/arch/powerpc/crypto/poly1305-p10-glue.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Poly1305 authenticator algorithm, RFC7539. + * + * Copyright 2023- IBM Corp. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage void poly1305_p10le_4blocks(void *h, const u8 *m, u32 mlen); +asmlinkage void poly1305_64s(void *h, const u8 *m, u32 mlen, int highbit); +asmlinkage void poly1305_emit_64(void *h, void *s, u8 *dst); + +static void vsx_begin(void) +{ + preempt_disable(); + enable_kernel_vsx(); +} + +static void vsx_end(void) +{ + disable_kernel_vsx(); + preempt_enable(); +} + +static int crypto_poly1305_p10_init(struct shash_desc *desc) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + poly1305_core_init(>h); + dctx->buflen = 0; + dctx->rset = 0; + dctx->sset = false; + + return 0; +} + +static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, + const u8 *inp, unsigned int len) +{ + unsigned int acc = 0; + + if (unlikely(!dctx->sset)) { + if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) { + struct poly1305_core_key *key = >core_r; + + key->key.r64[0] = get_unaligned_le64([0]); + key->key.r64[1] = get_unaligned_le64([8]); + inp += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + acc += POLY1305_BLOCK_SIZE; + dctx->rset = 1; + } + if (len >= POLY1305_BLOCK_SIZE) { + dctx->s[0] = get_unaligned_le32([0]); + dctx->s[1] = get_unaligned_le32([4]); + dctx->s[2] = get_unaligned_le32([8]); + dctx->s[3] = get_unaligned_le32([12]); + acc += POLY1305_BLOCK_SIZE; + dctx->sset = true; + } + } + return acc; +} + +static int crypto_poly1305_p10_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + unsigned int bytes, used; + + if (unlikely(dctx->buflen)) { + bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); + memcpy(dctx->buf + dctx->buflen, src, bytes); + src += bytes; + srclen -= bytes; + dctx->buflen += bytes; + + if (dctx->buflen == POLY1305_BLOCK_SIZE) { + if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, + POLY1305_BLOCK_SIZE))) { + vsx_begin(); + poly1305_64s(>h, dctx->buf, + POLY1305_BLOCK_SIZE, 1); + vsx_end(); + } + dctx->buflen = 0; + } + } + + if (likely(srclen >= POLY1305_BLOCK_SIZE)) { + bytes = round_down(srclen, POLY1305_BLOCK_SIZE); + used = crypto_poly1305_setdctxkey(dctx, src, bytes); + if (likely(used)) { + srclen -= used; + src += used; + } + if (crypto_simd_usable() && (srclen >= POLY1305_BLOCK_SIZE*4)) { + vsx_begin(); + poly1305_p10le_4blocks(>h, src, srclen); + vsx_end(); + src += srclen - (srclen % (POLY1305_BLOCK_SIZE * 4)); + srclen %= POLY1305_BLOCK_SIZE * 4; + } + while (srclen >= POLY1305_BLOCK_SIZE) { + vsx_begin(); + poly1305_64s(>h, src, POLY1305_BLOCK_SIZE, 1); + vsx_end(); + srclen -= POLY1305_BLOCK_SIZE; + src += POLY1305_BLOCK_SIZE; + } + } + + if (unlikely(srclen)) { + dctx->buflen = srclen; + memcpy(dctx->buf, src, srclen); + } + + return 0; +} + +static int crypto_poly1305_p10_final(struct shash_desc *desc, u8 *dst) +{ + stru
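One point worth spelling out from the glue code above: the kernel's Poly1305 shash has no setkey operation; the 32-byte (r, s) key is taken from the front of the data stream, which is exactly what crypto_poly1305_setdctxkey() peels off. A hedged usage sketch (the name "poly1305" resolves to whichever registered driver has the highest priority; the helper name here is mine):

	#include <crypto/hash.h>
	#include <linux/err.h>

	static int poly1305_digest_example(const u8 key[32], const u8 *msg,
					   unsigned int len, u8 mac[16])
	{
		struct crypto_shash *tfm;
		int err;

		tfm = crypto_alloc_shash("poly1305", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		{
			SHASH_DESC_ON_STACK(desc, tfm);

			desc->tfm = tfm;
			err = crypto_shash_init(desc);
			/* No setkey: the 32-byte (r, s) key is simply the
			 * first 32 bytes of the stream, consumed by
			 * crypto_poly1305_setdctxkey() above. */
			if (!err)
				err = crypto_shash_update(desc, key, 32);
			if (!err)
				err = crypto_shash_update(desc, msg, len);
			if (!err)
				err = crypto_shash_final(desc, mac);
		}

		crypto_free_shash(tfm);
		return err;
	}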
[PATCH v2 0/5] crypto: Accelerated Chacha20/Poly1305 implementation
This patch series provides an accelerated/optimized Chacha20 and Poly1305 implementation for Power10 or later CPUs (ppc64le). This module implements the algorithms specified in RFC7539. The implementation provides 3.5X better performance than the baseline for Chacha20 and Poly1305 individually, and a 1.5X improvement for the combined Chacha20/Poly1305 operation. This patch has been tested with the kernel crypto module tcrypt.ko and has passed the selftest. The patch is also tested with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled. Danny Tsen (5): An optimized Chacha20 implementation with 8-way unrolling for ppc64le. Glue code for optimized Chacha20 implementation for ppc64le. An optimized Poly1305 implementation with 4-way unrolling for ppc64le. Glue code for optimized Poly1305 implementation for ppc64le. Update Kconfig and Makefile. arch/powerpc/crypto/Kconfig | 26 + arch/powerpc/crypto/Makefile | 4 + arch/powerpc/crypto/chacha-p10-glue.c | 221 + arch/powerpc/crypto/chacha-p10le-8x.S | 842 ++ arch/powerpc/crypto/poly1305-p10-glue.c | 186 arch/powerpc/crypto/poly1305-p10le_64.S | 1075 +++ 6 files changed, 2354 insertions(+) create mode 100644 arch/powerpc/crypto/chacha-p10-glue.c create mode 100644 arch/powerpc/crypto/chacha-p10le-8x.S create mode 100644 arch/powerpc/crypto/poly1305-p10-glue.c create mode 100644 arch/powerpc/crypto/poly1305-p10le_64.S -- 2.31.1
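For anyone wanting to exercise the new drivers outside of tcrypt, a minimal hedged sketch of driving the registered skcipher through the kernel crypto API follows; the function name is mine, and "chacha20" simply picks the highest-priority implementation, which is chacha20-p10 (priority 300) once this series is loaded on P10:

	#include <crypto/skcipher.h>
	#include <linux/crypto.h>
	#include <linux/scatterlist.h>

	static int chacha20_encrypt_example(const u8 key[32], u8 iv[16],
					    u8 *buf, unsigned int len)
	{
		struct crypto_skcipher *tfm;
		struct skcipher_request *req;
		struct scatterlist sg;
		DECLARE_CRYPTO_WAIT(wait);
		int err;

		tfm = crypto_alloc_skcipher("chacha20", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		err = crypto_skcipher_setkey(tfm, key, 32);
		if (err)
			goto out_tfm;

		req = skcipher_request_alloc(tfm, GFP_KERNEL);
		if (!req) {
			err = -ENOMEM;
			goto out_tfm;
		}

		sg_init_one(&sg, buf, len);	/* encrypt in place */
		skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
					      crypto_req_done, &wait);
		skcipher_request_set_crypt(req, &sg, &sg, len, iv);
		err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

		skcipher_request_free(req);
	out_tfm:
		crypto_free_skcipher(tfm);
		return err;
	}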
[PATCH v2 3/5] An optimized Poly1305 implementation with 4-way unrolling for ppc64le.
Improve overall performance of Poly1305 for Power10 or later CPU. Signed-off-by: Danny Tsen --- arch/powerpc/crypto/poly1305-p10le_64.S | 1075 +++ 1 file changed, 1075 insertions(+) create mode 100644 arch/powerpc/crypto/poly1305-p10le_64.S diff --git a/arch/powerpc/crypto/poly1305-p10le_64.S b/arch/powerpc/crypto/poly1305-p10le_64.S new file mode 100644 index ..a3c1987f1ecd --- /dev/null +++ b/arch/powerpc/crypto/poly1305-p10le_64.S @@ -0,0 +1,1075 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +# +# Accelerated poly1305 implementation for ppc64le. +# +# Copyright 2023- IBM Corp. All rights reserved +# +#=== +# Written by Danny Tsen +# +# Poly1305 - this version mainly using vector/VSX/Scalar +# - 26 bits limbs +# - Handle multiple 64 byte blcok. +# +# Block size 16 bytes +# key = (r, s) +# clamp r &= 0x0FFC0FFC 0x0FFC0FFF +# p = 2^130 - 5 +# a += m +# a = (r + a) % p +# a += s +# +# Improve performance by breaking down polynominal to the sum of products with +# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r +# +# 07/22/21 - this revison based on the above sum of products. Setup r^4, r^3, r^2, r and s3, s2, s1, s0 +# to 9 vectors for multiplications. +# +# setup r^4, r^3, r^2, r vectors +#vs[r^1, r^3, r^2, r^4] +#vs0 = [r0,.] +#vs1 = [r1,.] +#vs2 = [r2,.] +#vs3 = [r3,.] +#vs4 = [r4,.] +#vs5 = [r1*5,...] +#vs6 = [r2*5,...] +#vs7 = [r2*5,...] +#vs8 = [r4*5,...] +# +# Each word in a vector consists a member of a "r/s" in [a * r/s]. +# +# r0, r4*5, r3*5, r2*5, r1*5; +# r1, r0, r4*5, r3*5, r2*5; +# r2, r1, r0, r4*5, r3*5; +# r3, r2, r1, r0, r4*5; +# r4, r3, r2, r1, r0 ; +# +# +# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) +# k = 32 bytes key +# r3 = k (r, s) +# r4 = mlen +# r5 = m +# +#include +#include +#include +#include + +.machine "any" + +.text + +.macro SAVE_GPR GPR OFFSET FRAME + std \GPR,\OFFSET(\FRAME) +.endm + +.macro SAVE_VRS VRS OFFSET FRAME + li 16, \OFFSET + stvx\VRS, 16, \FRAME +.endm + +.macro SAVE_VSX VSX OFFSET FRAME + li 16, \OFFSET + stxvx \VSX, 16, \FRAME +.endm + +.macro RESTORE_GPR GPR OFFSET FRAME + ld \GPR,\OFFSET(\FRAME) +.endm + +.macro RESTORE_VRS VRS OFFSET FRAME + li 16, \OFFSET + lvx \VRS, 16, \FRAME +.endm + +.macro RESTORE_VSX VSX OFFSET FRAME + li 16, \OFFSET + lxvx\VSX, 16, \FRAME +.endm + +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-752(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + addi9, 1, 256 + SAVE_VRS 20, 0, 9 + SAVE_VRS 21, 16, 9 + SAVE_VRS 22, 32, 9 + SAVE_VRS 23, 48, 9 + SAVE_VRS 24, 64, 9 + SAVE_VRS 25, 80, 9 + SAVE_VRS 26, 96, 9 + SAVE_VRS 27, 112, 9 + SAVE_VRS 28, 128, 9 + SAVE_VRS 29, 144, 9 + SAVE_VRS 30, 160, 9 + SAVE_VRS 31, 176, 9 + + SAVE_VSX 14, 192, 9 + SAVE_VSX 15, 208, 9 + SAVE_VSX 16, 224, 9 + SAVE_VSX 17, 240, 9 + SAVE_VSX 18, 256, 9 + SAVE_VSX 19, 272, 9 + SAVE_VSX 20, 288, 9 + SAVE_VSX 21, 304, 9 + SAVE_VSX 22, 320, 9 + SAVE_VSX 23, 336, 9 + SAVE_VSX 24, 352, 9 + SAVE_VSX 25, 368, 9 + SAVE_VSX 26, 384, 9 + SAVE_VSX 27, 400, 9 + SAVE_VSX 28, 416, 9 + SAVE_VSX 29, 432, 9 + SAVE_VSX 30, 448, 9 + SAVE_VSX 31, 464, 9 +.endm # SAVE_REGS + +.macro RESTORE_REGS + addi9, 1, 256 + RESTORE_VRS 20, 0, 9 + RESTORE_VRS 21, 16, 9 + RESTORE_VRS 22, 32, 9 + RESTORE_VRS 23, 
48, 9 + RESTORE_VRS 24, 64, 9 + RESTORE_VRS 25, 80, 9 + RESTORE_VRS 26, 96, 9 + RESTORE_VRS 27, 112, 9 + RESTORE_VRS 28, 128, 9 + RESTORE_VRS 29, 144, 9 + RESTORE_VRS 30, 160, 9 + RESTORE_VRS 31, 176, 9 + + RESTORE_VSX 14, 192, 9 + RESTORE_VSX 15, 208, 9 + RESTORE_VSX 16, 224, 9 + RESTORE_VSX 17, 240, 9 + RESTORE_VSX 18, 256, 9 + RESTORE_VSX 19, 272, 9 + RESTORE_VSX 20, 288, 9 + RESTORE_VSX 21, 304, 9 + RESTORE_VSX 22, 320, 9 + RESTORE_VSX 23, 336, 9 + RESTORE_VSX 24, 352, 9 + RESTORE_VSX 25, 368, 9 + RESTORE_VSX 26, 384, 9 + RESTORE_VSX 27, 400, 9 + RESTORE_VSX 28, 416,
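Since the header comment in this file states the sum-of-products identity without derivation, it is worth spelling out. The 4-way unrolling is Horner's rule unrolled four steps: with h the running accumulator and m1..m4 the next four (high-bit-padded) 16-byte blocks, working mod p = 2^130 - 5,

	h' = ((((h + m1)*r + m2)*r + m3)*r + m4)*r
	   = (h + m1)*r⁴ + m2*r³ + m3*r² + m4*r   (mod p)

so four dependent multiplies by r become one vector pass multiplying by the precomputed {r⁴, r³, r², r}, which is why the code sets up the nine r/s vectors described above. One repair note: the clamp constant in the header lost its repeated 'f' digits in this archive; per RFC 7539 the clamp is r &= 0x0ffffffc0ffffffc0ffffffc0fffffff.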
Re: [PATCH 1/5] An optimized Chacha20 implementation with 8-way unrolling for ppc64le.
Hi Michael, It's in the IBM repo. Thanks. -Danny On 4/25/23 7:02 AM, Michael Ellerman wrote: Danny Tsen writes: This is the recommended template to use for the IBM copyright. According to who? The documentation I've seen specifies "IBM Corp." or "IBM Corporation". cheers
Re: [PATCH 5/5] Update Kconfig and Makefile.
I was not sure at the time when I used IS_REACHABLE. I will fix it in the init code. Thanks. -Danny On 4/25/23 12:46 AM, Herbert Xu wrote: On Mon, Apr 24, 2023 at 02:47:26PM -0400, Danny Tsen wrote: +config CRYPTO_CHACHA20_P10 + tristate "Ciphers: ChaCha20, XChacha20, XChacha12 (P10 or later)" + depends on PPC64 && CPU_LITTLE_ENDIAN + select CRYPTO_SKCIPHER I thought your IS_REACHABLE test was so that you could build this without the Crypto API? Colour me confused. Cheers,
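For context, the pattern Herbert refers to (used by the equivalent ChaCha glue on other architectures) exports the library interface unconditionally and registers the skcipher algorithms only when the Crypto API skcipher core is reachable from this module. A hedged sketch of an init routine in that style, reusing the have_p10 key and algs array from patch 2/5:

	static int __init chacha_p10_init(void)
	{
		static_branch_enable(&have_p10);

		/* chacha_crypt_arch() keeps working either way; only the
		 * skcipher registration depends on the Crypto API. */
		if (!IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER))
			return 0;

		return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
	}

With that test, the Kconfig entry would not need to select CRYPTO_SKCIPHER, which is the inconsistency Herbert is pointing out.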
Re: [PATCH 4/5] Glue code for optimized Poly1305 implementation for ppc64le.
I did not notice that. Will fix it. Thanks. -Danny On 4/25/23 12:44 AM, Herbert Xu wrote: On Mon, Apr 24, 2023 at 02:47:25PM -0400, Danny Tsen wrote: + if (likely(srclen >= POLY1305_BLOCK_SIZE)) { + bytes = round_down(srclen, POLY1305_BLOCK_SIZE); + used = crypto_poly1305_setdctxkey(dctx, src, bytes); + if (likely(used)) { + srclen -= used; + src += used; + } + if (srclen >= POLY1305_BLOCK_SIZE*4) { + vsx_begin(); Your chacha code has a SIMD fallback; how come this one doesn't? Thanks,
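The fix that landed in v2 (see the poly1305-p10-glue.c hunk earlier in this archive) gates the 4-block VSX path on crypto_simd_usable(), so everything falls through to the scalar per-block poly1305_64s() loop when the SIMD state cannot be touched:

	if (crypto_simd_usable() && (srclen >= POLY1305_BLOCK_SIZE * 4)) {
		vsx_begin();
		poly1305_p10le_4blocks(&dctx->h, src, srclen);
		vsx_end();
		src += srclen - (srclen % (POLY1305_BLOCK_SIZE * 4));
		srclen %= POLY1305_BLOCK_SIZE * 4;
	}
	/* any remainder (or the whole input, if SIMD was unusable)
	 * is handled by the poly1305_64s() block loop that follows */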
Re: [PATCH 2/5] Glue code for optimized Chacha20 implementation for ppc64le.
Got it. Will fix it. Thanks. -Danny On 4/25/23 12:41 AM, Herbert Xu wrote: On Mon, Apr 24, 2023 at 02:47:23PM -0400, Danny Tsen wrote: +static int chacha_p10_stream_xor(struct skcipher_request *req, +const struct chacha_ctx *ctx, const u8 *iv) +{ + struct skcipher_walk walk; + u32 state[16]; + int err; + + err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; + + chacha_init_generic(state, ctx->key, iv); + + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = rounddown(nbytes, walk.stride); + + if (!static_branch_likely(&have_p10) || You don't need the static branch in the Crypto API code since the registration is already conditional. Cheers,
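And indeed in v2 (see the chacha-p10-glue.c hunk earlier in this archive) the per-walk test is reduced to the SIMD check alone; whether the hardware supports the code was already decided at registration time:

	if (!crypto_simd_usable()) {
		chacha_crypt_generic(state, walk.dst.virt.addr,
				     walk.src.virt.addr, nbytes,
				     ctx->nrounds);
	} else {
		vsx_begin();
		chacha_p10_do_8x(state, walk.dst.virt.addr,
				 walk.src.virt.addr, nbytes, ctx->nrounds);
		vsx_end();
	}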
Re: [PATCH 1/5] An optimized Chacha20 implementation with 8-way unrolling for ppc64le.
This is the recommended template to use for the IBM copyright. Thanks. -Danny On 4/24/23 3:40 PM, Elliott, Robert (Servers) wrote: +# Copyright 2023- IBM Inc. All rights reserved I don't think any such entity exists - you probably mean IBM Corporation.
[PATCH 5/5] Update Kconfig and Makefile.
Defined CRYPTO_CHACHA20_P10 and CRYPTO_POLY1305_P10 in Kconfig to support the optimized implementations for Power10 and later CPUs. Added the new module drivers chacha-p10-crypto and poly1305-p10-crypto. Signed-off-by: Danny Tsen --- arch/powerpc/crypto/Kconfig | 26 ++ arch/powerpc/crypto/Makefile | 4 2 files changed, 30 insertions(+) diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index 7113f9355165..f74d9dd6574b 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -111,4 +111,30 @@ config CRYPTO_AES_GCM_P10 Support for cryptographic acceleration instructions on Power10 or later CPU. This module supports stitched acceleration for AES/GCM. +config CRYPTO_CHACHA20_P10 + tristate "Ciphers: ChaCha20, XChacha20, XChacha12 (P10 or later)" + depends on PPC64 && CPU_LITTLE_ENDIAN + select CRYPTO_SKCIPHER + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + help + Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12 + stream cipher algorithms + + Architecture: PowerPC64 + - Power10 or later + - Little-endian + +config CRYPTO_POLY1305_P10 + tristate "Hash functions: Poly1305 (P10 or later)" + depends on PPC64 && CPU_LITTLE_ENDIAN + select CRYPTO_HASH + select CRYPTO_LIB_POLY1305_GENERIC + help + Poly1305 authenticator algorithm (RFC7539) + + Architecture: PowerPC64 + - Power10 or later + - Little-endian + endmenu diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index 05c7486f42c5..cd5282eff451 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -14,6 +14,8 @@ obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o +obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o +obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o md5-ppc-y := md5-asm.o md5-glue.o @@ -23,6 +25,8 @@ sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp8-ppc.o aesp8-ppc.o +chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o +poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o quiet_cmd_perl = PERL$@ cmd_perl = $(PERL) $< $(if $(CONFIG_CPU_LITTLE_ENDIAN), linux-ppc64le, linux-ppc64) > $@ -- 2.31.1
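With this patch applied, the new code can be selected with a .config fragment like the following (illustrative; =y builds the code in instead of producing modules):

	CONFIG_CRYPTO_CHACHA20_P10=m
	CONFIG_CRYPTO_POLY1305_P10=m

which yields the chacha-p10-crypto.ko and poly1305-p10-crypto.ko modules assembled from the object lists above.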
[PATCH 4/5] Glue code for optimized Poly1305 implementation for ppc64le.
Signed-off-by: Danny Tsen --- arch/powerpc/crypto/poly1305-p10-glue.c | 186 1 file changed, 186 insertions(+) create mode 100644 arch/powerpc/crypto/poly1305-p10-glue.c diff --git a/arch/powerpc/crypto/poly1305-p10-glue.c b/arch/powerpc/crypto/poly1305-p10-glue.c new file mode 100644 index ..b1800f7b6af8 --- /dev/null +++ b/arch/powerpc/crypto/poly1305-p10-glue.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Poly1305 authenticator algorithm, RFC7539. + * + * Copyright 2023- IBM Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage void poly1305_p10le_4blocks(void *h, const u8 *m, u32 mlen); +asmlinkage void poly1305_64s(void *h, const u8 *m, u32 mlen, int highbit); +asmlinkage void poly1305_emit_64(void *h, void *s, u8 *dst); + +static void vsx_begin(void) +{ + preempt_disable(); + enable_kernel_vsx(); +} + +static void vsx_end(void) +{ + disable_kernel_vsx(); + preempt_enable(); +} + +static int crypto_poly1305_p10_init(struct shash_desc *desc) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + poly1305_core_init(>h); + dctx->buflen = 0; + dctx->rset = 0; + dctx->sset = false; + + return 0; +} + +static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, + const u8 *inp, unsigned int len) +{ + unsigned int acc = 0; + + if (unlikely(!dctx->sset)) { + if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) { + struct poly1305_core_key *key = >core_r; + + key->key.r64[0] = get_unaligned_le64([0]); + key->key.r64[1] = get_unaligned_le64([8]); + inp += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + acc += POLY1305_BLOCK_SIZE; + dctx->rset = 1; + } + if (len >= POLY1305_BLOCK_SIZE) { + dctx->s[0] = get_unaligned_le32([0]); + dctx->s[1] = get_unaligned_le32([4]); + dctx->s[2] = get_unaligned_le32([8]); + dctx->s[3] = get_unaligned_le32([12]); + acc += POLY1305_BLOCK_SIZE; + dctx->sset = true; + } + } + return acc; +} + +static int crypto_poly1305_p10_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + unsigned int bytes, used; + + if (unlikely(dctx->buflen)) { + bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); + memcpy(dctx->buf + dctx->buflen, src, bytes); + src += bytes; + srclen -= bytes; + dctx->buflen += bytes; + + if (dctx->buflen == POLY1305_BLOCK_SIZE) { + if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, + POLY1305_BLOCK_SIZE))) { + vsx_begin(); + poly1305_64s(>h, dctx->buf, + POLY1305_BLOCK_SIZE, 1); + vsx_end(); + } + dctx->buflen = 0; + } + } + + if (likely(srclen >= POLY1305_BLOCK_SIZE)) { + bytes = round_down(srclen, POLY1305_BLOCK_SIZE); + used = crypto_poly1305_setdctxkey(dctx, src, bytes); + if (likely(used)) { + srclen -= used; + src += used; + } + if (srclen >= POLY1305_BLOCK_SIZE*4) { + vsx_begin(); + poly1305_p10le_4blocks(>h, src, srclen); + vsx_end(); + src += srclen - (srclen % (POLY1305_BLOCK_SIZE * 4)); + srclen %= POLY1305_BLOCK_SIZE * 4; + } + while (srclen >= POLY1305_BLOCK_SIZE) { + vsx_begin(); + poly1305_64s(>h, src, POLY1305_BLOCK_SIZE, 1); + vsx_end(); + srclen -= POLY1305_BLOCK_SIZE; + src += POLY1305_BLOCK_SIZE; + } + } + + if (unlikely(srclen)) { + dctx->buflen = srclen; + memcpy(dctx->buf, src, srclen); + } + + return 0; +} + +static int crypto_poly1305_p10_final(struct shash_desc *desc, u8 *dst) +{ + struct poly1305_desc_ctx *dctx = shash_de
[PATCH 2/5] Glue code for optimized Chacha20 implementation for ppc64le.
Signed-off-by: Danny Tsen --- arch/powerpc/crypto/chacha-p10-glue.c | 223 ++ 1 file changed, 223 insertions(+) create mode 100644 arch/powerpc/crypto/chacha-p10-glue.c diff --git a/arch/powerpc/crypto/chacha-p10-glue.c b/arch/powerpc/crypto/chacha-p10-glue.c new file mode 100644 index ..cefb150e7b3c --- /dev/null +++ b/arch/powerpc/crypto/chacha-p10-glue.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * PowerPC P10 (ppc64le) accelerated ChaCha and XChaCha stream ciphers, + * including ChaCha20 (RFC7539) + * + * Copyright 2023- IBM Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage void chacha_p10le_8x(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); + +static void vsx_begin(void) +{ + preempt_disable(); + enable_kernel_vsx(); +} + +static void vsx_end(void) +{ + disable_kernel_vsx(); + preempt_enable(); +} + +static void chacha_p10_do_8x(u32 *state, u8 *dst, const u8 *src, +unsigned int bytes, int nrounds) +{ + unsigned int l = bytes & ~0x0FF; + + if (l > 0) { + chacha_p10le_8x(state, dst, src, l, nrounds); + bytes -= l; + src += l; + dst += l; + state[12] += l / CHACHA_BLOCK_SIZE; + } + + if (bytes > 0) + chacha_crypt_generic(state, dst, src, bytes, nrounds); +} + +void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) +{ + hchacha_block_generic(state, stream, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) +{ + chacha_init_generic(state, key, iv); +} +EXPORT_SYMBOL(chacha_init_arch); + +void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, + int nrounds) +{ + if (!static_branch_likely(_p10) || bytes <= CHACHA_BLOCK_SIZE || + !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + vsx_begin(); + chacha_p10_do_8x(state, dst, src, todo, nrounds); + vsx_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +static int chacha_p10_stream_xor(struct skcipher_request *req, +const struct chacha_ctx *ctx, const u8 *iv) +{ + struct skcipher_walk walk; + u32 state[16]; + int err; + + err = skcipher_walk_virt(, req, false); + if (err) + return err; + + chacha_init_generic(state, ctx->key, iv); + + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = rounddown(nbytes, walk.stride); + + if (!static_branch_likely(_p10) || + !crypto_simd_usable()) { + chacha_crypt_generic(state, walk.dst.virt.addr, +walk.src.virt.addr, nbytes, +ctx->nrounds); + } else { + vsx_begin(); + chacha_p10_do_8x(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, ctx->nrounds); + vsx_end(); + } + err = skcipher_walk_done(, walk.nbytes - nbytes); + if (err) + break; + } + + return err; +} + +static int chacha_p10(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + + return chacha_p10_stream_xor(req, ctx, req->iv); +} + +static int xchacha_p10(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 state[16]; + u8 real_iv[16]; + + chacha_init_generic(state, ctx->key, req->iv); + 
hchacha_block_arch(state, subctx.key, ctx->nrounds); + subctx.nrounds = ctx->nrounds; + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + return chacha_p10_stream_xor(req, &subctx, real_iv); +} + +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-p10", + .base
[PATCH 3/5] An optimized Poly1305 implementation with 4-way unrolling for ppc64le.
Improve overall performance of Poly1305 for Power10 or later CPU. Signed-off-by: Danny Tsen --- arch/powerpc/crypto/poly1305-p10le_64.S | 1075 +++ 1 file changed, 1075 insertions(+) create mode 100644 arch/powerpc/crypto/poly1305-p10le_64.S diff --git a/arch/powerpc/crypto/poly1305-p10le_64.S b/arch/powerpc/crypto/poly1305-p10le_64.S new file mode 100644 index ..22fd255af87e --- /dev/null +++ b/arch/powerpc/crypto/poly1305-p10le_64.S @@ -0,0 +1,1075 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +# +# Accelerated poly1305 implementation for ppc64le. +# +# Copyright 2023- IBM Inc. All rights reserved +# +#=== +# Written by Danny Tsen +# +# Poly1305 - this version mainly using vector/VSX/Scalar +# - 26 bits limbs +# - Handle multiple 64 byte blcok. +# +# Block size 16 bytes +# key = (r, s) +# clamp r &= 0x0FFC0FFC 0x0FFC0FFF +# p = 2^130 - 5 +# a += m +# a = (r + a) % p +# a += s +# +# Improve performance by breaking down polynominal to the sum of products with +# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r +# +# 07/22/21 - this revison based on the above sum of products. Setup r^4, r^3, r^2, r and s3, s2, s1, s0 +# to 9 vectors for multiplications. +# +# setup r^4, r^3, r^2, r vectors +#vs[r^1, r^3, r^2, r^4] +#vs0 = [r0,.] +#vs1 = [r1,.] +#vs2 = [r2,.] +#vs3 = [r3,.] +#vs4 = [r4,.] +#vs5 = [r1*5,...] +#vs6 = [r2*5,...] +#vs7 = [r2*5,...] +#vs8 = [r4*5,...] +# +# Each word in a vector consists a member of a "r/s" in [a * r/s]. +# +# r0, r4*5, r3*5, r2*5, r1*5; +# r1, r0, r4*5, r3*5, r2*5; +# r2, r1, r0, r4*5, r3*5; +# r3, r2, r1, r0, r4*5; +# r4, r3, r2, r1, r0 ; +# +# +# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) +# k = 32 bytes key +# r3 = k (r, s) +# r4 = mlen +# r5 = m +# +#include +#include +#include +#include + +.machine "any" + +.text + +.macro SAVE_GPR GPR OFFSET FRAME + std \GPR,\OFFSET(\FRAME) +.endm + +.macro SAVE_VRS VRS OFFSET FRAME + li 16, \OFFSET + stvx\VRS, 16, \FRAME +.endm + +.macro SAVE_VSX VSX OFFSET FRAME + li 16, \OFFSET + stxvx \VSX, 16, \FRAME +.endm + +.macro RESTORE_GPR GPR OFFSET FRAME + ld \GPR,\OFFSET(\FRAME) +.endm + +.macro RESTORE_VRS VRS OFFSET FRAME + li 16, \OFFSET + lvx \VRS, 16, \FRAME +.endm + +.macro RESTORE_VSX VSX OFFSET FRAME + li 16, \OFFSET + lxvx\VSX, 16, \FRAME +.endm + +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-752(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + addi9, 1, 256 + SAVE_VRS 20, 0, 9 + SAVE_VRS 21, 16, 9 + SAVE_VRS 22, 32, 9 + SAVE_VRS 23, 48, 9 + SAVE_VRS 24, 64, 9 + SAVE_VRS 25, 80, 9 + SAVE_VRS 26, 96, 9 + SAVE_VRS 27, 112, 9 + SAVE_VRS 28, 128, 9 + SAVE_VRS 29, 144, 9 + SAVE_VRS 30, 160, 9 + SAVE_VRS 31, 176, 9 + + SAVE_VSX 14, 192, 9 + SAVE_VSX 15, 208, 9 + SAVE_VSX 16, 224, 9 + SAVE_VSX 17, 240, 9 + SAVE_VSX 18, 256, 9 + SAVE_VSX 19, 272, 9 + SAVE_VSX 20, 288, 9 + SAVE_VSX 21, 304, 9 + SAVE_VSX 22, 320, 9 + SAVE_VSX 23, 336, 9 + SAVE_VSX 24, 352, 9 + SAVE_VSX 25, 368, 9 + SAVE_VSX 26, 384, 9 + SAVE_VSX 27, 400, 9 + SAVE_VSX 28, 416, 9 + SAVE_VSX 29, 432, 9 + SAVE_VSX 30, 448, 9 + SAVE_VSX 31, 464, 9 +.endm # SAVE_REGS + +.macro RESTORE_REGS + addi9, 1, 256 + RESTORE_VRS 20, 0, 9 + RESTORE_VRS 21, 16, 9 + RESTORE_VRS 22, 32, 9 + RESTORE_VRS 23, 
48, 9 + RESTORE_VRS 24, 64, 9 + RESTORE_VRS 25, 80, 9 + RESTORE_VRS 26, 96, 9 + RESTORE_VRS 27, 112, 9 + RESTORE_VRS 28, 128, 9 + RESTORE_VRS 29, 144, 9 + RESTORE_VRS 30, 160, 9 + RESTORE_VRS 31, 176, 9 + + RESTORE_VSX 14, 192, 9 + RESTORE_VSX 15, 208, 9 + RESTORE_VSX 16, 224, 9 + RESTORE_VSX 17, 240, 9 + RESTORE_VSX 18, 256, 9 + RESTORE_VSX 19, 272, 9 + RESTORE_VSX 20, 288, 9 + RESTORE_VSX 21, 304, 9 + RESTORE_VSX 22, 320, 9 + RESTORE_VSX 23, 336, 9 + RESTORE_VSX 24, 352, 9 + RESTORE_VSX 25, 368, 9 + RESTORE_VSX 26, 384, 9 + RESTORE_VSX 27, 400, 9 + RESTORE_VSX 28, 416,
[PATCH 1/5] An optimized Chacha20 implementation with 8-way unrolling for ppc64le.
Improve overall performance of chacha20 encrypt and decrypt operations for Power10 or later CPU. Signed-off-by: Danny Tsen --- arch/powerpc/crypto/chacha-p10le-8x.S | 842 ++ 1 file changed, 842 insertions(+) create mode 100644 arch/powerpc/crypto/chacha-p10le-8x.S diff --git a/arch/powerpc/crypto/chacha-p10le-8x.S b/arch/powerpc/crypto/chacha-p10le-8x.S new file mode 100644 index ..7c15d17101d7 --- /dev/null +++ b/arch/powerpc/crypto/chacha-p10le-8x.S @@ -0,0 +1,842 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +# +# Accelerated chacha20 implementation for ppc64le. +# +# Copyright 2023- IBM Inc. All rights reserved +# +#=== +# Written by Danny Tsen +# +# chacha_p10le_8x(u32 *state, byte *dst, const byte *src, +# size_t len, int nrounds); +# +# do rounds, 8 quarter rounds +# 1. a += b; d ^= a; d <<<= 16; +# 2. c += d; b ^= c; b <<<= 12; +# 3. a += b; d ^= a; d <<<= 8; +# 4. c += d; b ^= c; b <<<= 7 +# +# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16 +# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12 +# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8 +# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7 +# +# 4 blocks (a b c d) +# +# a0 b0 c0 d0 +# a1 b1 c1 d1 +# ... +# a4 b4 c4 d4 +# ... +# a8 b8 c8 d8 +# ... +# a12 b12 c12 d12 +# a13 ... +# a14 ... +# a15 b15 c15 d15 +# +# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) +# Diagnal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) +# + +#include +#include +#include +#include + +.machine "any" +.text + +.macro SAVE_GPR GPR OFFSET FRAME + std \GPR,\OFFSET(\FRAME) +.endm + +.macro SAVE_VRS VRS OFFSET FRAME + li 16, \OFFSET + stvx\VRS, 16, \FRAME +.endm + +.macro SAVE_VSX VSX OFFSET FRAME + li 16, \OFFSET + stxvx \VSX, 16, \FRAME +.endm + +.macro RESTORE_GPR GPR OFFSET FRAME + ld \GPR,\OFFSET(\FRAME) +.endm + +.macro RESTORE_VRS VRS OFFSET FRAME + li 16, \OFFSET + lvx \VRS, 16, \FRAME +.endm + +.macro RESTORE_VSX VSX OFFSET FRAME + li 16, \OFFSET + lxvx\VSX, 16, \FRAME +.endm + +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-752(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + addi9, 1, 256 + SAVE_VRS 20, 0, 9 + SAVE_VRS 21, 16, 9 + SAVE_VRS 22, 32, 9 + SAVE_VRS 23, 48, 9 + SAVE_VRS 24, 64, 9 + SAVE_VRS 25, 80, 9 + SAVE_VRS 26, 96, 9 + SAVE_VRS 27, 112, 9 + SAVE_VRS 28, 128, 9 + SAVE_VRS 29, 144, 9 + SAVE_VRS 30, 160, 9 + SAVE_VRS 31, 176, 9 + + SAVE_VSX 14, 192, 9 + SAVE_VSX 15, 208, 9 + SAVE_VSX 16, 224, 9 + SAVE_VSX 17, 240, 9 + SAVE_VSX 18, 256, 9 + SAVE_VSX 19, 272, 9 + SAVE_VSX 20, 288, 9 + SAVE_VSX 21, 304, 9 + SAVE_VSX 22, 320, 9 + SAVE_VSX 23, 336, 9 + SAVE_VSX 24, 352, 9 + SAVE_VSX 25, 368, 9 + SAVE_VSX 26, 384, 9 + SAVE_VSX 27, 400, 9 + SAVE_VSX 28, 416, 9 + SAVE_VSX 29, 432, 9 + SAVE_VSX 30, 448, 9 + SAVE_VSX 31, 464, 9 +.endm # SAVE_REGS + +.macro RESTORE_REGS + addi9, 1, 256 + RESTORE_VRS 20, 0, 9 + RESTORE_VRS 21, 16, 9 + RESTORE_VRS 22, 32, 9 + RESTORE_VRS 23, 48, 9 + RESTORE_VRS 24, 64, 9 + RESTORE_VRS 25, 80, 9 + RESTORE_VRS 26, 96, 9 + RESTORE_VRS 27, 112, 9 + RESTORE_VRS 28, 128, 9 + RESTORE_VRS 29, 144, 9 + 
RESTORE_VRS 30, 160, 9 + RESTORE_VRS 31, 176, 9 + + RESTORE_VSX 14, 192, 9 + RESTORE_VSX 15, 208, 9 + RESTORE_VSX 16, 224, 9 + RESTORE_VSX 17, 240, 9 + RESTORE_VSX 18, 256, 9 + RESTORE_VSX 19, 272, 9 + RESTORE_VSX 20, 288, 9 + RESTORE_VSX 21, 304, 9 + RESTORE_VSX 22, 320, 9 + RESTORE_VSX 23, 336, 9 + RESTORE_VSX 24, 352, 9 + RESTORE_VSX 25, 368, 9 + RESTORE_VSX 26, 384, 9 + RESTORE_VSX 27, 400, 9 + RESTORE_VSX 28, 416, 9 + RESTORE_VSX 29, 432, 9 + RESTORE_VSX 30, 448, 9 + RESTORE_VSX 31, 464, 9 + + RESTORE_GPR 14, 112, 1 + RESTORE_GPR 15, 120, 1 + RESTORE_GPR 16, 128, 1 + RESTORE
[PATCH 0/5] crypto: Accelerated Chacha20/Poly1305 implementation
This patch series provides an accelerated/optimized Chacha20 and Poly1305 implementation for Power10 or later CPUs (ppc64le). This module implements the algorithms specified in RFC7539. The implementation provides 3.5X better performance than the baseline for Chacha20 and Poly1305 individually, and a 1.5X improvement for the combined Chacha20/Poly1305 operation. This patch has been tested with the kernel crypto module tcrypt.ko and has passed the selftest. The patch is also tested with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled. Danny Tsen (5): An optimized Chacha20 implementation with 8-way unrolling for ppc64le. Glue code for optimized Chacha20 implementation for ppc64le. An optimized Poly1305 implementation with 4-way unrolling for ppc64le. Glue code for optimized Poly1305 implementation for ppc64le. Update Kconfig and Makefile. arch/powerpc/crypto/Kconfig | 26 + arch/powerpc/crypto/Makefile | 4 + arch/powerpc/crypto/chacha-p10-glue.c | 223 + arch/powerpc/crypto/chacha-p10le-8x.S | 842 ++ arch/powerpc/crypto/poly1305-p10-glue.c | 186 arch/powerpc/crypto/poly1305-p10le_64.S | 1075 +++ 6 files changed, 2356 insertions(+) create mode 100644 arch/powerpc/crypto/chacha-p10-glue.c create mode 100644 arch/powerpc/crypto/chacha-p10le-8x.S create mode 100644 arch/powerpc/crypto/poly1305-p10-glue.c create mode 100644 arch/powerpc/crypto/poly1305-p10le_64.S -- 2.31.1
Re: [PATCH v2 1/2] Remove POWER10_CPU dependency.
Thanks Michael. -Danny On 4/14/23 8:08 AM, Michael Ellerman wrote: Danny Tsen writes: Remove Power10 dependency in Kconfig and detect Power10 feature at runtime. ... using the existing call to module_cpu_feature_match() :) Signed-off-by: Danny Tsen --- arch/powerpc/crypto/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Acked-by: Michael Ellerman (powerpc) cheers diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index 1f8f02b494e1..7113f9355165 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -96,7 +96,7 @@ config CRYPTO_AES_PPC_SPE config CRYPTO_AES_GCM_P10 tristate "Stitched AES/GCM acceleration support on P10 or later CPU (PPC)" - depends on PPC64 && POWER10_CPU && CPU_LITTLE_ENDIAN + depends on PPC64 && CPU_LITTLE_ENDIAN select CRYPTO_LIB_AES select CRYPTO_ALGAPI select CRYPTO_AEAD -- 2.31.1
[PATCH v2 2/2] Move Power10 feature, PPC_MODULE_FEATURE_P10.
Move the Power10 feature definition, PPC_MODULE_FEATURE_P10, to arch/powerpc/include/asm/cpufeature.h. Signed-off-by: Danny Tsen --- arch/powerpc/crypto/aes-gcm-p10-glue.c | 1 - arch/powerpc/include/asm/cpufeature.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/crypto/aes-gcm-p10-glue.c b/arch/powerpc/crypto/aes-gcm-p10-glue.c index 1533c8cdd26f..bd3475f5348d 100644 --- a/arch/powerpc/crypto/aes-gcm-p10-glue.c +++ b/arch/powerpc/crypto/aes-gcm-p10-glue.c @@ -22,7 +22,6 @@ #include #include -#define PPC_MODULE_FEATURE_P10 (32 + ilog2(PPC_FEATURE2_ARCH_3_1)) #define PPC_ALIGN 16 #define GCM_IV_SIZE 12 diff --git a/arch/powerpc/include/asm/cpufeature.h b/arch/powerpc/include/asm/cpufeature.h index f6f790a90367..2dcc66225e7f 100644 --- a/arch/powerpc/include/asm/cpufeature.h +++ b/arch/powerpc/include/asm/cpufeature.h @@ -22,6 +22,7 @@ */ #define PPC_MODULE_FEATURE_VEC_CRYPTO (32 + ilog2(PPC_FEATURE2_VEC_CRYPTO)) +#define PPC_MODULE_FEATURE_P10 (32 + ilog2(PPC_FEATURE2_ARCH_3_1)) #define cpu_feature(x) (x) -- 2.31.1
[PATCH v2 0/2] Remove POWER10_CPU dependency and move PPC_MODULE_FEATURE_P10.
Remove Power10 dependency in Kconfig and detect Power10 feature at runtime. Move the PPC_MODULE_FEATURE_P10 definition to arch/powerpc/include/asm/cpufeature.h. Signed-off-by: Danny Tsen Danny Tsen (2): Kconfig: Remove POWER10_CPU dependency. aes-gcm-p10-glue.c, cpufeature.h: Move Power10 feature, PPC_MODULE_FEATURE_P10. arch/powerpc/crypto/Kconfig | 2 +- arch/powerpc/crypto/aes-gcm-p10-glue.c | 1 - arch/powerpc/include/asm/cpufeature.h | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) -- 2.31.1
[PATCH v2 1/2] Remove POWER10_CPU dependency.
Remove Power10 dependency in Kconfig and detect Power10 feature at runtime. Signed-off-by: Danny Tsen --- arch/powerpc/crypto/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index 1f8f02b494e1..7113f9355165 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -96,7 +96,7 @@ config CRYPTO_AES_PPC_SPE config CRYPTO_AES_GCM_P10 tristate "Stitched AES/GCM acceleration support on P10 or later CPU (PPC)" - depends on PPC64 && POWER10_CPU && CPU_LITTLE_ENDIAN + depends on PPC64 && CPU_LITTLE_ENDIAN select CRYPTO_LIB_AES select CRYPTO_ALGAPI select CRYPTO_AEAD -- 2.31.1
Re: [PATCH] Remove POWER10_CPU dependency and move PPC_MODULE_FEATURE_P10.
On 4/13/23 8:18 AM, Danny Tsen wrote: Hi Michael, If I do a separate patch for moving PPC_MODULE_FEATURE_P10, it will break the build, since it is currently defined in aes-gcm-p10-glue.c. And P10 will be detected when the module is loaded, via module_cpu_feature_match(PPC_MODULE_FEATURE_P10, p10_init); so it won't load if the CPU is not P10. Thanks. -Danny On 4/13/23 8:12 AM, Michael Ellerman wrote: Danny Tsen writes: Remove Power10 dependency in Kconfig and detect Power10 feature at runtime. Move the PPC_MODULE_FEATURE_P10 definition to arch/powerpc/include/asm/cpufeature.h. This should be two patches, one for the Kconfig change and one moving the feature flag. I think I misunderstood. I can do two patches, one for the Kconfig change and one moving the feature flag. I'll fix it. Thanks. -Danny Also don't you need a cpu feature check in p10_init()? Otherwise the driver can be loaded on non-P10 CPUs, either by being built-in, or manually. cheers Signed-off-by: Danny Tsen --- arch/powerpc/crypto/Kconfig | 2 +- arch/powerpc/crypto/aes-gcm-p10-glue.c | 1 - arch/powerpc/include/asm/cpufeature.h | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index 1f8f02b494e1..7113f9355165 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -96,7 +96,7 @@ config CRYPTO_AES_PPC_SPE config CRYPTO_AES_GCM_P10 tristate "Stitched AES/GCM acceleration support on P10 or later CPU (PPC)" - depends on PPC64 && POWER10_CPU && CPU_LITTLE_ENDIAN + depends on PPC64 && CPU_LITTLE_ENDIAN select CRYPTO_LIB_AES select CRYPTO_ALGAPI select CRYPTO_AEAD diff --git a/arch/powerpc/crypto/aes-gcm-p10-glue.c b/arch/powerpc/crypto/aes-gcm-p10-glue.c index 1533c8cdd26f..bd3475f5348d 100644 --- a/arch/powerpc/crypto/aes-gcm-p10-glue.c +++ b/arch/powerpc/crypto/aes-gcm-p10-glue.c @@ -22,7 +22,6 @@ #include #include -#define PPC_MODULE_FEATURE_P10 (32 + ilog2(PPC_FEATURE2_ARCH_3_1)) #define PPC_ALIGN 16 #define GCM_IV_SIZE 12 diff --git a/arch/powerpc/include/asm/cpufeature.h b/arch/powerpc/include/asm/cpufeature.h index f6f790a90367..2dcc66225e7f 100644 --- a/arch/powerpc/include/asm/cpufeature.h +++ b/arch/powerpc/include/asm/cpufeature.h @@ -22,6 +22,7 @@ */ #define PPC_MODULE_FEATURE_VEC_CRYPTO (32 + ilog2(PPC_FEATURE2_VEC_CRYPTO)) +#define PPC_MODULE_FEATURE_P10 (32 + ilog2(PPC_FEATURE2_ARCH_3_1)) #define cpu_feature(x) (x) -- 2.31.1
Re: [PATCH] Remove POWER10_CPU dependency and move PPC_MODULE_FEATURE_P10.
Hi Michael, If I do a separate patch for moving PPC_MODULE_FEATURE_P10, it will break the build, since it is currently defined in aes-gcm-p10-glue.c. And P10 will be detected when the module is loaded, via module_cpu_feature_match(PPC_MODULE_FEATURE_P10, p10_init); so it won't load if the CPU is not P10. Thanks. -Danny On 4/13/23 8:12 AM, Michael Ellerman wrote: Danny Tsen writes: Remove Power10 dependency in Kconfig and detect Power10 feature at runtime. Move the PPC_MODULE_FEATURE_P10 definition to arch/powerpc/include/asm/cpufeature.h. This should be two patches, one for the Kconfig change and one moving the feature flag. Also don't you need a cpu feature check in p10_init()? Otherwise the driver can be loaded on non-P10 CPUs, either by being built-in, or manually. cheers Signed-off-by: Danny Tsen --- arch/powerpc/crypto/Kconfig | 2 +- arch/powerpc/crypto/aes-gcm-p10-glue.c | 1 - arch/powerpc/include/asm/cpufeature.h | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index 1f8f02b494e1..7113f9355165 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -96,7 +96,7 @@ config CRYPTO_AES_PPC_SPE config CRYPTO_AES_GCM_P10 tristate "Stitched AES/GCM acceleration support on P10 or later CPU (PPC)" - depends on PPC64 && POWER10_CPU && CPU_LITTLE_ENDIAN + depends on PPC64 && CPU_LITTLE_ENDIAN select CRYPTO_LIB_AES select CRYPTO_ALGAPI select CRYPTO_AEAD diff --git a/arch/powerpc/crypto/aes-gcm-p10-glue.c b/arch/powerpc/crypto/aes-gcm-p10-glue.c index 1533c8cdd26f..bd3475f5348d 100644 --- a/arch/powerpc/crypto/aes-gcm-p10-glue.c +++ b/arch/powerpc/crypto/aes-gcm-p10-glue.c @@ -22,7 +22,6 @@ #include #include -#define PPC_MODULE_FEATURE_P10 (32 + ilog2(PPC_FEATURE2_ARCH_3_1)) #define PPC_ALIGN 16 #define GCM_IV_SIZE 12 diff --git a/arch/powerpc/include/asm/cpufeature.h b/arch/powerpc/include/asm/cpufeature.h index f6f790a90367..2dcc66225e7f 100644 --- a/arch/powerpc/include/asm/cpufeature.h +++ b/arch/powerpc/include/asm/cpufeature.h @@ -22,6 +22,7 @@ */ #define PPC_MODULE_FEATURE_VEC_CRYPTO (32 + ilog2(PPC_FEATURE2_VEC_CRYPTO)) +#define PPC_MODULE_FEATURE_P10 (32 + ilog2(PPC_FEATURE2_ARCH_3_1)) #define cpu_feature(x) (x) -- 2.31.1
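For readers following along, the gate Danny describes looks roughly like this (a hedged sketch; the registration body and the gcm_aes_alg name are illustrative, not quoted from aes-gcm-p10-glue.c):

	static int __init p10_init(void)
	{
		/* reached only when the feature bit below is present */
		return crypto_register_aead(&gcm_aes_alg); /* hypothetical alg */
	}

	/* Ties both module auto-loading and the init call to
	 * PPC_FEATURE2_ARCH_3_1 (ISA 3.1, i.e. Power10): on older CPUs the
	 * generated init routine returns -ENODEV instead of registering. */
	module_cpu_feature_match(PPC_MODULE_FEATURE_P10, p10_init);

This covers the manual-modprobe case Michael raises, and the built-in case as well, because the wrapper generated by module_cpu_feature_match() performs the cpu_have_feature() check before calling p10_init().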
[PATCH] Remove POWER10_CPU dependency and move PPC_MODULE_FEATURE_P10.
Remove Power10 dependency in Kconfig and detect Power10 feature at runtime. Move the PPC_MODULE_FEATURE_P10 definition to arch/powerpc/include/asm/cpufeature.h. Signed-off-by: Danny Tsen --- arch/powerpc/crypto/Kconfig | 2 +- arch/powerpc/crypto/aes-gcm-p10-glue.c | 1 - arch/powerpc/include/asm/cpufeature.h | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index 1f8f02b494e1..7113f9355165 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -96,7 +96,7 @@ config CRYPTO_AES_PPC_SPE config CRYPTO_AES_GCM_P10 tristate "Stitched AES/GCM acceleration support on P10 or later CPU (PPC)" - depends on PPC64 && POWER10_CPU && CPU_LITTLE_ENDIAN + depends on PPC64 && CPU_LITTLE_ENDIAN select CRYPTO_LIB_AES select CRYPTO_ALGAPI select CRYPTO_AEAD diff --git a/arch/powerpc/crypto/aes-gcm-p10-glue.c b/arch/powerpc/crypto/aes-gcm-p10-glue.c index 1533c8cdd26f..bd3475f5348d 100644 --- a/arch/powerpc/crypto/aes-gcm-p10-glue.c +++ b/arch/powerpc/crypto/aes-gcm-p10-glue.c @@ -22,7 +22,6 @@ #include #include -#define PPC_MODULE_FEATURE_P10 (32 + ilog2(PPC_FEATURE2_ARCH_3_1)) #define PPC_ALIGN 16 #define GCM_IV_SIZE 12 diff --git a/arch/powerpc/include/asm/cpufeature.h b/arch/powerpc/include/asm/cpufeature.h index f6f790a90367..2dcc66225e7f 100644 --- a/arch/powerpc/include/asm/cpufeature.h +++ b/arch/powerpc/include/asm/cpufeature.h @@ -22,6 +22,7 @@ */ #define PPC_MODULE_FEATURE_VEC_CRYPTO (32 + ilog2(PPC_FEATURE2_VEC_CRYPTO)) +#define PPC_MODULE_FEATURE_P10 (32 + ilog2(PPC_FEATURE2_ARCH_3_1)) #define cpu_feature(x) (x) -- 2.31.1