Re: [PATCH v2 0/3] crypto: X25519 support for ppc64le

2024-05-31 Thread Danny Tsen

Thanks Herbert.


On 5/31/24 5:20 AM, Herbert Xu wrote:

On Thu, May 16, 2024 at 11:19:54AM -0400, Danny Tsen wrote:

This patch series provides X25519 support for ppc64le with a new module
curve25519-ppc64le.

The implementation is based on the CRYPTOGAMS Perl output from x25519-ppc64.pl
(see https://github.com/dot-asm/cryptogams/). It has been modified, and four
supporting functions have been added.

This patch has passed the selftest by running modprobe
curve25519-ppc64le.

Danny Tsen (3):
   X25519 low-level primitives for ppc64le.
   X25519 core functions for ppc64le
   Update Kconfig and Makefile for ppc64le x25519.

  arch/powerpc/crypto/Kconfig   |  11 +
  arch/powerpc/crypto/Makefile  |   2 +
  arch/powerpc/crypto/curve25519-ppc64le-core.c | 299 
  arch/powerpc/crypto/curve25519-ppc64le_asm.S  | 671 ++
  4 files changed, 983 insertions(+)
  create mode 100644 arch/powerpc/crypto/curve25519-ppc64le-core.c
  create mode 100644 arch/powerpc/crypto/curve25519-ppc64le_asm.S

--
2.31.1

All applied.  Thanks.


[PATCH v2 3/3] crypto: Update Kconfig and Makefile for ppc64le x25519.

2024-05-16 Thread Danny Tsen
Defined CRYPTO_CURVE25519_PPC64 to support X25519 for ppc64le.

Added new module curve25519-ppc64le for X25519.

Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/Kconfig  | 11 +++
 arch/powerpc/crypto/Makefile |  2 ++
 2 files changed, 13 insertions(+)

diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 1e201b7ae2fc..09ebcbdfb34f 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -2,6 +2,17 @@
 
 menu "Accelerated Cryptographic Algorithms for CPU (powerpc)"
 
+config CRYPTO_CURVE25519_PPC64
+   tristate "Public key crypto: Curve25519 (PowerPC64)"
+   depends on PPC64 && CPU_LITTLE_ENDIAN
+   select CRYPTO_LIB_CURVE25519_GENERIC
+   select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+   help
+ Curve25519 algorithm
+
+ Architecture: PowerPC64
+ - Little-endian
+
 config CRYPTO_CRC32C_VPMSUM
tristate "CRC32c"
depends on PPC64 && ALTIVEC
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index fca0e9739869..59808592f0a1 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
 obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
 obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
 obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o
+obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o
 
 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o 
aes-spe-glue.o
 md5-ppc-y := md5-asm.o md5-glue.o
@@ -29,6 +30,7 @@ aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o 
ghashp10-ppc.o aesp10-p
 chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
 poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
 vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o 
aes_xts.o ghash.o
+curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o
 
 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
 override flavour := linux-ppc64le
-- 
2.31.1



[PATCH v2 2/3] crypto: X25519 core functions for ppc64le

2024-05-16 Thread Danny Tsen
X25519 core functions to handle scalar multiplication for ppc64le.

Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/curve25519-ppc64le-core.c | 299 ++
 1 file changed, 299 insertions(+)
 create mode 100644 arch/powerpc/crypto/curve25519-ppc64le-core.c

diff --git a/arch/powerpc/crypto/curve25519-ppc64le-core.c b/arch/powerpc/crypto/curve25519-ppc64le-core.c
new file mode 100644
index ..4e3e44ea4484
--- /dev/null
+++ b/arch/powerpc/crypto/curve25519-ppc64le-core.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2024- IBM Corp.
+ *
+ * X25519 scalar multiplication with 51 bits limbs for PPC64le.
+ *   Based on RFC7748 and AArch64 optimized implementation for X25519
+ * - Algorithm 1 Scalar multiplication of a variable point
+ */
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+typedef uint64_t fe51[5];
+
+asmlinkage void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
+asmlinkage void x25519_fe51_sqr(fe51 h, const fe51 f);
+asmlinkage void x25519_fe51_mul121666(fe51 h, fe51 f);
+asmlinkage void x25519_fe51_sqr_times(fe51 h, const fe51 f, int n);
+asmlinkage void x25519_fe51_frombytes(fe51 h, const uint8_t *s);
+asmlinkage void x25519_fe51_tobytes(uint8_t *s, const fe51 h);
+asmlinkage void x25519_cswap(fe51 p, fe51 q, unsigned int bit);
+
+#define fmul x25519_fe51_mul
+#define fsqr x25519_fe51_sqr
+#define fmul121666 x25519_fe51_mul121666
+#define fe51_tobytes x25519_fe51_tobytes
+
+static void fadd(fe51 h, const fe51 f, const fe51 g)
+{
+   h[0] = f[0] + g[0];
+   h[1] = f[1] + g[1];
+   h[2] = f[2] + g[2];
+   h[3] = f[3] + g[3];
+   h[4] = f[4] + g[4];
+}
+
+/*
+ * Prime = 2 ** 255 - 19, 255 bits
+ *    (0x7fffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffed)
+ *
+ * Prime in 5 51-bit limbs
+ */
+static fe51 prime51 = { 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff};
+
+static void fsub(fe51 h, const fe51 f, const fe51 g)
+{
+   h[0] = (f[0] + ((prime51[0] * 2))) - g[0];
+   h[1] = (f[1] + ((prime51[1] * 2))) - g[1];
+   h[2] = (f[2] + ((prime51[2] * 2))) - g[2];
+   h[3] = (f[3] + ((prime51[3] * 2))) - g[3];
+   h[4] = (f[4] + ((prime51[4] * 2))) - g[4];
+}
+
+static void fe51_frombytes(fe51 h, const uint8_t *s)
+{
+   /*
+* Make sure 64-bit aligned.
+*/
+   unsigned char sbuf[32+8];
+   unsigned char *sb = PTR_ALIGN((void *)sbuf, 8);
+
+   memcpy(sb, s, 32);
+   x25519_fe51_frombytes(h, sb);
+}
+
+static void finv(fe51 o, const fe51 i)
+{
+   fe51 a0, b, c, t00;
+
+   fsqr(a0, i);
+   x25519_fe51_sqr_times(t00, a0, 2);
+
+   fmul(b, t00, i);
+   fmul(a0, b, a0);
+
+   fsqr(t00, a0);
+
+   fmul(b, t00, b);
+   x25519_fe51_sqr_times(t00, b, 5);
+
+   fmul(b, t00, b);
+   x25519_fe51_sqr_times(t00, b, 10);
+
+   fmul(c, t00, b);
+   x25519_fe51_sqr_times(t00, c, 20);
+
+   fmul(t00, t00, c);
+   x25519_fe51_sqr_times(t00, t00, 10);
+
+   fmul(b, t00, b);
+   x25519_fe51_sqr_times(t00, b, 50);
+
+   fmul(c, t00, b);
+   x25519_fe51_sqr_times(t00, c, 100);
+
+   fmul(t00, t00, c);
+   x25519_fe51_sqr_times(t00, t00, 50);
+
+   fmul(t00, t00, b);
+   x25519_fe51_sqr_times(t00, t00, 5);
+
+   fmul(o, t00, a0);
+}
+
+static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32],
+   const uint8_t point[32])
+{
+   fe51 x1, x2, z2, x3, z3;
+   uint8_t s[32];
+   unsigned int swap = 0;
+   int i;
+
+   memcpy(s, scalar, 32);
+   s[0]  &= 0xf8;
+   s[31] &= 0x7f;
+   s[31] |= 0x40;
+   fe51_frombytes(x1, point);
+
+   z2[0] = z2[1] = z2[2] = z2[3] = z2[4] = 0;
+   x3[0] = x1[0];
+   x3[1] = x1[1];
+   x3[2] = x1[2];
+   x3[3] = x1[3];
+   x3[4] = x1[4];
+
+   x2[0] = z3[0] = 1;
+   x2[1] = z3[1] = 0;
+   x2[2] = z3[2] = 0;
+   x2[3] = z3[3] = 0;
+   x2[4] = z3[4] = 0;
+
+   for (i = 254; i >= 0; --i) {
+   unsigned int k_t = 1 & (s[i / 8] >> (i & 7));
+   fe51 a, b, c, d, e;
+   fe51 da, cb, aa, bb;
+   fe51 dacb_p, dacb_m;
+
+   swap ^= k_t;
+   x25519_cswap(x2, x3, swap);
+   x25519_cswap(z2, z3, swap);
+   swap = k_t;
+
+   fsub(b, x2, z2);// B = x_2 - z_2
+   fadd(a, x2, z2);// A = x_2 + z_2
+   fsub(d, x3, z3);// D = x_3 - z_3
+   fadd(c, x3, z3);// C = x_3 + z_3
+
+   fsqr(bb, b);// BB = B^2
+   fsqr(aa, a);// AA = A^2
+   fmul(da, d, a); // DA = D * A
+   fmul(cb, c, b); // CB = C * B
+
+

[PATCH v2 0/3] crypto: X25519 support for ppc64le

2024-05-16 Thread Danny Tsen
This patch series provides X25519 support for ppc64le with a new module
curve25519-ppc64le.

The implementation is based on the CRYPTOGAMS Perl output from x25519-ppc64.pl
(see https://github.com/dot-asm/cryptogams/). It has been modified, and four
supporting functions have been added.

This patch has passed the selftest by running modprobe
curve25519-ppc64le.

Danny Tsen (3):
  X25519 low-level primitives for ppc64le.
  X25519 core functions for ppc64le
  Update Kconfig and Makefile for ppc64le x25519.

 arch/powerpc/crypto/Kconfig   |  11 +
 arch/powerpc/crypto/Makefile  |   2 +
 arch/powerpc/crypto/curve25519-ppc64le-core.c | 299 
 arch/powerpc/crypto/curve25519-ppc64le_asm.S  | 671 ++
 4 files changed, 983 insertions(+)
 create mode 100644 arch/powerpc/crypto/curve25519-ppc64le-core.c
 create mode 100644 arch/powerpc/crypto/curve25519-ppc64le_asm.S

-- 
2.31.1



[PATCH v2 1/3] crypto: X25519 low-level primitives for ppc64le.

2024-05-16 Thread Danny Tsen
Use the Perl output of x25519-ppc64.pl from CRYPTOGAMS
(see https://github.com/dot-asm/cryptogams/) and add four
supporting functions: x25519_fe51_sqr_times, x25519_fe51_frombytes,
x25519_fe51_tobytes and x25519_cswap.

Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/curve25519-ppc64le_asm.S | 671 +++
 1 file changed, 671 insertions(+)
 create mode 100644 arch/powerpc/crypto/curve25519-ppc64le_asm.S

diff --git a/arch/powerpc/crypto/curve25519-ppc64le_asm.S b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
new file mode 100644
index ..06c1febe24b9
--- /dev/null
+++ b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
@@ -0,0 +1,671 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# This code is taken from CRYPTOGAMs[1] and is included here using the option
+# in the license to distribute the code under the GPL. Therefore this program
+# is free software; you can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2 as published by the Free Software
+# Foundation.
+#
+# [1] https://github.com/dot-asm/cryptogams/
+
+# Copyright (c) 2006-2017, CRYPTOGAMS by 
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#   * Redistributions of source code must retain copyright notices,
+# this list of conditions and the following disclaimer.
+#
+#   * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+#   * Neither the name of the CRYPTOGAMS nor the names of its
+# copyright holder and contributors may be used to endorse or
+# promote products derived from this software without specific
+# prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# 
+# Written by Andy Polyakov  for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# 
+
+#
+# 
+# Written and Modified by Danny Tsen 
+# - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes
+#   and x25519_cswap
+#
+# Copyright 2024- IBM Corp.
+#
+# X25519 lower-level primitives for PPC64.
+#
+
+#include 
+
+.text
+
+.align 5
+SYM_FUNC_START(x25519_fe51_mul)
+
+   stdu1,-144(1)
+   std 21,56(1)
+   std 22,64(1)
+   std 23,72(1)
+   std 24,80(1)
+   std 25,88(1)
+   std 26,96(1)
+   std 27,104(1)
+   std 28,112(1)
+   std 29,120(1)
+   std 30,128(1)
+   std 31,136(1)
+
+   ld  6,0(5)
+   ld  7,0(4)
+   ld  8,8(4)
+   ld  9,16(4)
+   ld  10,24(4)
+   ld  11,32(4)
+
+   mulld   22,7,6
+   mulhdu  23,7,6
+
+   mulld   24,8,6
+   mulhdu  25,8,6
+
+   mulld   30,11,6
+   mulhdu  31,11,6
+   ld  4,8(5)
+   mulli   11,11,19
+
+   mulld   26,9,6
+   mulhdu  27,9,6
+
+   mulld   28,10,6
+   mulhdu  29,10,6
+   mulld   12,11,4
+   mulhdu  21,11,4
+   addc22,22,12
+   adde23,23,21
+
+   mulld   12,7,4
+   mulhdu  21,7,4
+   addc24,24,12
+   adde25,25,21
+
+   mulld   12,10,4
+   mulhdu  21,10,4
+   ld  6,16(5)
+   mulli   10,10,19
+   addc30,30,12
+   adde31,31,21
+
+   mulld   12,8,4
+   mulhdu  21,8,4
+   addc26,26,12
+   adde27,27,21
+
+   mulld   12,9,4
+   mulhdu  21,9,4
+ 

Re: [PATCH 1/3] crypto: X25519 low-level primitives for ppc64le.

2024-05-16 Thread Danny Tsen

Hi Andy,

I learned something here.  Will fix this.  Thanks.

-Danny

On 5/16/24 3:38 AM, Andy Polyakov wrote:

Hi,


+.abiversion    2


I'd prefer that was left to the compiler flags.


The problem is that it's the compiler that is responsible for providing 
this directive in the intermediate .s prior to invoking the assembler, 
and there is no assembler flag to pass through -Wa. If the concern is ABI 
neutrality, then the solution would rather be to wrap it in 
#if (_CALL_ELF-0) == 2 ... #endif. One can also make a case for


#ifdef _CALL_ELF
.abiversion _CALL_ELF
#endif

Cheers.



Re: [PATCH 1/3] crypto: X25519 low-level primitives for ppc64le.

2024-05-16 Thread Danny Tsen



On 5/15/24 11:53 PM, Michael Ellerman wrote:

Hi Danny,

Danny Tsen  writes:

Use the Perl output of x25519-ppc64.pl from CRYPTOGAMS and add three
supporting functions: x25519_fe51_sqr_times, x25519_fe51_frombytes
and x25519_fe51_tobytes.

For other algorithms we have checked-in the perl script and generated
the code at runtime. Is there a reason you've done it differently this time?


Hi Michael,

It's easier for me to read and use plain assembly that isn't mixed with Perl, 
and it's easier for me to debug and test. Also, I copied some code and made 
some modifications.



Signed-off-by: Danny Tsen 
---
  arch/powerpc/crypto/curve25519-ppc64le_asm.S | 648 +++
  1 file changed, 648 insertions(+)
  create mode 100644 arch/powerpc/crypto/curve25519-ppc64le_asm.S

diff --git a/arch/powerpc/crypto/curve25519-ppc64le_asm.S b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
new file mode 100644
index ..8a018104838a
--- /dev/null
+++ b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
@@ -0,0 +1,648 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Copyright 2024- IBM Corp.  All Rights Reserved.
  
I'm not a lawyer, but AFAIK "All Rights Reserved" is not required and
can be confusing - because we are not reserving all rights, we are
granting some rights under the GPL.

I also think the IBM copyright should be down below where your
modifications are described.

Will change that.

+# This code is taken from CRYPTOGAMs[1] and is included here using the option
+# in the license to distribute the code under the GPL. Therefore this program
+# is free software; you can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2 as published by the Free Software
+# Foundation.
+#
+# [1] https://www.openssl.org/~appro/cryptogams/
+
+# Copyright (c) 2006-2017, CRYPTOGAMS by 
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#   * Redistributions of source code must retain copyright notices,
+# this list of conditions and the following disclaimer.
+#
+#   * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+#   * Neither the name of the CRYPTOGAMS nor the names of its
+# copyright holder and contributors may be used to endorse or
+# promote products derived from this software without specific
+# prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# 
+# Written by Andy Polyakov  for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# 
+
+#
+# 
+# Written and Modified by Danny Tsen 
+# - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes

ie. here.


+# X25519 lower-level primitives for PPC64.
+#
+
+#include 
+
+.machine "any"
  
Please don't add new .machine directives unless they are required.



+.abiversion 2

I'd prefer that was left to the compiler flags.


Ok.

Thanks.

-Danny



cheers



Re: [PATCH 2/3] crypto: X25519 core functions for ppc64le

2024-05-15 Thread Danny Tsen

Hi Andy,

Thanks for the info.  I should be able to do it.  I was hoping an 
assembly guru like you could show me some tricks here if there are any :)


Thanks.

-Danny

On 5/15/24 8:33 AM, Andy Polyakov wrote:

+static void cswap(fe51 p, fe51 q, unsigned int bit)
+{
+    u64 t, i;
+    u64 c = 0 - (u64) bit;
+
+    for (i = 0; i < 5; ++i) {
+    t = c & (p[i] ^ q[i]);
+    p[i] ^= t;
+    q[i] ^= t;
+    }
+}


The "c" in cswap stands for "constant-time," and the problem is that 
contemporary compilers have exhibited the ability to produce 
non-constant-time machine code as a result of compiling the above 
kind of technique. The outcome is platform-specific, and ironically 
some of the PPC code generators were observed to generate the "most" 
non-constant-time code. "Most" in the sense that the execution-time 
variations would be the easiest to catch.


Just to substantiate the point, consider 
https://godbolt.org/z/faYnEcPT7, and note the conditional branch in 
the middle of the loop, which flies in the face of constant-time-ness. 
In case you object to the 'bit &= 1' on line 7 in the C code: indeed, if you 
comment it out, the generated code will be fine. But the point is that 
the compiler is capable of and was in fact observed to figure out that 
the caller passes either one or zero and generate the machine code in 
the assembly window. In other words 'bit &= 1' is just a reflection of 
what the caller does.


... the permanent solution is to do it in assembly. I can put 
together something...


Though you should be able to do this just as well :-) So should I or 
would you?


Cheers.



Re: [PATCH 2/3] crypto: X25519 core functions for ppc64le

2024-05-15 Thread Danny Tsen

Hi Andy,

Points taken.  And I much appreciate the help.

Thanks.

-Danny

On 5/15/24 3:29 AM, Andy Polyakov wrote:

Hi,


+static void cswap(fe51 p, fe51 q, unsigned int bit)
+{
+    u64 t, i;
+    u64 c = 0 - (u64) bit;
+
+    for (i = 0; i < 5; ++i) {
+    t = c & (p[i] ^ q[i]);
+    p[i] ^= t;
+    q[i] ^= t;
+    }
+}


The "c" in cswap stands for "constant-time," and the problem is that 
contemporary compilers have exhibited the ability to produce 
non-constant-time machine code as a result of compiling the above 
kind of technique. The outcome is platform-specific, and ironically 
some of the PPC code generators were observed to generate the "most" 
non-constant-time code. "Most" in the sense that the execution-time variations 
would be the easiest to catch. One way to work around the problem, at 
least for the time being, is to add 'asm volatile("" : "+r"(c))' after 
you calculate 'c'. But there is no guarantee that the next compiler 
version won't see through it, hence the permanent solution is to do it 
in assembly. I can put together something...


Cheers.
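
For illustration, a minimal C sketch of the interim workaround described
above (hypothetical, not taken from the posted patch): the empty asm
statement hides the mask from the optimizer, so it cannot prove that c is
only ever 0 or ~0 and turn the masked swap back into a conditional branch.

#include <stdint.h>

static void cswap_barrier(uint64_t p[5], uint64_t q[5], unsigned int bit)
{
	uint64_t c = 0 - (uint64_t) bit;	/* all-zeros or all-ones mask */
	uint64_t t;
	int i;

	asm volatile("" : "+r"(c));		/* optimization barrier on the mask */

	for (i = 0; i < 5; ++i) {
		t = c & (p[i] ^ q[i]);		/* non-zero only when the mask is all ones */
		p[i] ^= t;
		q[i] ^= t;
	}
}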



Re: [PATCH 1/3] crypto: X25519 low-level primitives for ppc64le.

2024-05-15 Thread Danny Tsen

See inline.

On 5/15/24 4:06 AM, Andy Polyakov wrote:

Hi,


+SYM_FUNC_START(x25519_fe51_sqr_times)
...
+
+.Lsqr_times_loop:
...
+
+    std    9,16(3)
+    std    10,24(3)
+    std    11,32(3)
+    std    7,0(3)
+    std    8,8(3)
+    bdnz    .Lsqr_times_loop


I see no reason why the stores can't be moved outside the loop in 
question.



Yeah.  I'll fix it.



+SYM_FUNC_START(x25519_fe51_frombytes)
+.align    5
+
+    li    12, -1
+    srdi    12, 12, 13    # 0x7ffffffffffff
+
+    ld    5, 0(4)
+    ld    6, 8(4)
+    ld    7, 16(4)
+    ld    8, 24(4)


Is there an actual guarantee that the byte input is 64-bit aligned? While 
it is true that the processor is obliged to handle misaligned loads and 
stores by the ISA specification, them being inefficient doesn't go 
against it. Most notably, the inefficiency is likely to be noticed at 
page boundaries. What I'm trying to say is that it would be more 
appropriate to avoid the unaligned loads (and stores).


Good point.  Maybe I can handle it by making the input 64-bit aligned.

Thanks.




Cheers.
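
For illustration, a minimal sketch of the bounce-buffer approach (roughly
what the v2 patch ends up doing in fe51_frombytes); the aligned local
buffer and the helper name are illustrative, and x25519_fe51_frombytes()
is the assembly routine added by this series:

#include <stdint.h>
#include <string.h>

static void fe51_frombytes_aligned(uint64_t h[5], const uint8_t *s)
{
	/* Copy the caller's 32 bytes into a buffer known to be 8-byte
	 * aligned so the assembly routine never issues misaligned loads. */
	uint8_t sbuf[32] __attribute__((aligned(8)));

	memcpy(sbuf, s, 32);
	x25519_fe51_frombytes(h, sbuf);
}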



Re: [PATCH 1/3] crypto: X25519 low-level primitives for ppc64le.

2024-05-15 Thread Danny Tsen

Thank you Andy.  Will fix this.

On 5/15/24 3:11 AM, Andy Polyakov wrote:

Hi,

Couple of remarks inline.


+# [1] https://www.openssl.org/~appro/cryptogams/


https://github.com/dot-asm/cryptogams/ is arguably better reference.


+SYM_FUNC_START(x25519_fe51_mul)
+.align    5


The goal is to align the label, not the first instruction after the 
directive. It's not a problem in this spot, in the beginning of the 
module that is, but further below it's likely to inject redundant nops 
between the label and meaningful code. But since the directive in 
question is not position-sensitive one can resolve this by changing 
the order of the directive and the SYM_FUNC_START macro.


Cheers.



[PATCH 3/3] crypto: Update Kconfig and Makefile for ppc64le x25519.

2024-05-14 Thread Danny Tsen
Defined CRYPTO_CURVE25519_PPC64 to support X25519 for ppc64le.

Added new module curve25519-ppc64le for X25519.

Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/Kconfig  | 11 +++
 arch/powerpc/crypto/Makefile |  2 ++
 2 files changed, 13 insertions(+)

diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 1e201b7ae2fc..09ebcbdfb34f 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -2,6 +2,17 @@
 
 menu "Accelerated Cryptographic Algorithms for CPU (powerpc)"
 
+config CRYPTO_CURVE25519_PPC64
+   tristate "Public key crypto: Curve25519 (PowerPC64)"
+   depends on PPC64 && CPU_LITTLE_ENDIAN
+   select CRYPTO_LIB_CURVE25519_GENERIC
+   select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+   help
+ Curve25519 algorithm
+
+ Architecture: PowerPC64
+ - Little-endian
+
 config CRYPTO_CRC32C_VPMSUM
tristate "CRC32c"
depends on PPC64 && ALTIVEC
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index fca0e9739869..59808592f0a1 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
 obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
 obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
 obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o
+obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o
 
 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o 
aes-spe-glue.o
 md5-ppc-y := md5-asm.o md5-glue.o
@@ -29,6 +30,7 @@ aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o 
ghashp10-ppc.o aesp10-p
 chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
 poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
 vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o 
aes_xts.o ghash.o
+curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o
 
 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
 override flavour := linux-ppc64le
-- 
2.31.1



[PATCH 2/3] crypto: X25519 core functions for ppc64le

2024-05-14 Thread Danny Tsen
X25519 core functions to handle scalar multiplication for ppc64le.

Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/curve25519-ppc64le-core.c | 299 ++
 1 file changed, 299 insertions(+)
 create mode 100644 arch/powerpc/crypto/curve25519-ppc64le-core.c

diff --git a/arch/powerpc/crypto/curve25519-ppc64le-core.c b/arch/powerpc/crypto/curve25519-ppc64le-core.c
new file mode 100644
index ..6a8b5efc40ce
--- /dev/null
+++ b/arch/powerpc/crypto/curve25519-ppc64le-core.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2024- IBM Corp. All rights reserved.
+ *
+ * X25519 scalar multiplication with 51 bits limbs for PPC64le.
+ *   Based on RFC7748 and AArch64 optimized implementation for X25519
+ * - Algorithm 1 Scalar multiplication of a variable point
+ */
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+typedef uint64_t fe51[5];
+
+asmlinkage void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
+asmlinkage void x25519_fe51_sqr(fe51 h, const fe51 f);
+asmlinkage void x25519_fe51_mul121666(fe51 h, fe51 f);
+asmlinkage void x25519_fe51_sqr_times(fe51 h, const fe51 f, int n);
+asmlinkage void x25519_fe51_frombytes(fe51 h, const uint8_t *s);
+asmlinkage void x25519_fe51_tobytes(uint8_t *s, const fe51 h);
+
+#define fmul x25519_fe51_mul
+#define fsqr x25519_fe51_sqr
+#define fmul121666 x25519_fe51_mul121666
+#define fe51_tobytes x25519_fe51_tobytes
+#define fe51_frombytes x25519_fe51_frombytes
+
+static void cswap(fe51 p, fe51 q, unsigned int bit)
+{
+   u64 t, i;
+   u64 c = 0 - (u64) bit;
+
+   for (i = 0; i < 5; ++i) {
+   t = c & (p[i] ^ q[i]);
+   p[i] ^= t;
+   q[i] ^= t;
+   }
+}
+
+static void fadd(fe51 h, const fe51 f, const fe51 g)
+{
+   h[0] = f[0] + g[0];
+   h[1] = f[1] + g[1];
+   h[2] = f[2] + g[2];
+   h[3] = f[3] + g[3];
+   h[4] = f[4] + g[4];
+}
+
+/*
+ * Prime = 2 ** 255 - 19, 255 bits
+ *    (0x7fffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffed)
+ *
+ * Prime in 5 51-bit limbs
+ */
+static fe51 prime51 = { 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff};
+
+static void fsub(fe51 h, const fe51 f, const fe51 g)
+{
+   h[0] = (f[0] + ((prime51[0] * 2))) - g[0];
+   h[1] = (f[1] + ((prime51[1] * 2))) - g[1];
+   h[2] = (f[2] + ((prime51[2] * 2))) - g[2];
+   h[3] = (f[3] + ((prime51[3] * 2))) - g[3];
+   h[4] = (f[4] + ((prime51[4] * 2))) - g[4];
+}
+
+static void finv(fe51 o, const fe51 i)
+{
+   fe51 a0, b, c, t00;
+
+   fsqr(a0, i);
+   x25519_fe51_sqr_times(t00, a0, 2);
+
+   fmul(b, t00, i);
+   fmul(a0, b, a0);
+
+   fsqr(t00, a0);
+
+   fmul(b, t00, b);
+   x25519_fe51_sqr_times(t00, b, 5);
+
+   fmul(b, t00, b);
+   x25519_fe51_sqr_times(t00, b, 10);
+
+   fmul(c, t00, b);
+   x25519_fe51_sqr_times(t00, c, 20);
+
+   fmul(t00, t00, c);
+   x25519_fe51_sqr_times(t00, t00, 10);
+
+   fmul(b, t00, b);
+   x25519_fe51_sqr_times(t00, b, 50);
+
+   fmul(c, t00, b);
+   x25519_fe51_sqr_times(t00, c, 100);
+
+   fmul(t00, t00, c);
+   x25519_fe51_sqr_times(t00, t00, 50);
+
+   fmul(t00, t00, b);
+   x25519_fe51_sqr_times(t00, t00, 5);
+
+   fmul(o, t00, a0);
+}
+
+static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32],
+   const uint8_t point[32])
+{
+   fe51 x1, x2, z2, x3, z3;
+   uint8_t s[32];
+   unsigned int swap = 0;
+   int i;
+
+   memcpy(s, scalar, 32);
+   s[0]  &= 0xf8;
+   s[31] &= 0x7f;
+   s[31] |= 0x40;
+   fe51_frombytes(x1, point);
+
+   z2[0] = z2[1] = z2[2] = z2[3] = z2[4] = 0;
+   x3[0] = x1[0];
+   x3[1] = x1[1];
+   x3[2] = x1[2];
+   x3[3] = x1[3];
+   x3[4] = x1[4];
+
+   x2[0] = z3[0] = 1;
+   x2[1] = z3[1] = 0;
+   x2[2] = z3[2] = 0;
+   x2[3] = z3[3] = 0;
+   x2[4] = z3[4] = 0;
+
+   for (i = 254; i >= 0; --i) {
+   unsigned int k_t = 1 & (s[i / 8] >> (i & 7));
+   fe51 a, b, c, d, e;
+   fe51 da, cb, aa, bb;
+   fe51 dacb_p, dacb_m;
+
+   swap ^= k_t;
+   cswap(x2, x3, swap);
+   cswap(z2, z3, swap);
+   swap = k_t;
+
+   fsub(b, x2, z2);// B = x_2 - z_2
+   fadd(a, x2, z2);// A = x_2 + z_2
+   fsub(d, x3, z3);// D = x_3 - z_3
+   fadd(c, x3, z3);// C = x_3 + z_3
+
+   fsqr(bb, b);// BB = B^2
+   fsqr(aa, a);// AA = A^2
+   fmul(da, d, a); // DA = D * A
+   fmul(cb, c, b); // CB = C * B
+
+   fsub(e, aa, bb

[PATCH 0/3] crypto: X25519 support for ppc64le

2024-05-14 Thread Danny Tsen
This patch series provides X25519 support for ppc64le with a new module
curve25519-ppc64le.

The implementation is based on the CRYPTOGAMS Perl output from x25519-ppc64.pl.
It has been modified, and three supporting functions have been added.

This patch has passed the selftest by running modprobe
curve25519-ppc64le.

Danny Tsen (3):
  X25519 low-level primitives for ppc64le.
  X25519 core functions to handle scalar multiplication for ppc64le.
  Update Kconfig and Makefile.

 arch/powerpc/crypto/Kconfig   |  11 +
 arch/powerpc/crypto/Makefile  |   2 +
 arch/powerpc/crypto/curve25519-ppc64le-core.c | 299 
 arch/powerpc/crypto/curve25519-ppc64le_asm.S  | 648 ++
 4 files changed, 960 insertions(+)
 create mode 100644 arch/powerpc/crypto/curve25519-ppc64le-core.c
 create mode 100644 arch/powerpc/crypto/curve25519-ppc64le_asm.S

-- 
2.31.1



[PATCH 1/3] crypto: X25519 low-level primitives for ppc64le.

2024-05-14 Thread Danny Tsen
Use the Perl output of x25519-ppc64.pl from CRYPTOGAMS and add three
supporting functions: x25519_fe51_sqr_times, x25519_fe51_frombytes
and x25519_fe51_tobytes.

Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/curve25519-ppc64le_asm.S | 648 +++
 1 file changed, 648 insertions(+)
 create mode 100644 arch/powerpc/crypto/curve25519-ppc64le_asm.S

diff --git a/arch/powerpc/crypto/curve25519-ppc64le_asm.S b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
new file mode 100644
index ..8a018104838a
--- /dev/null
+++ b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
@@ -0,0 +1,648 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Copyright 2024- IBM Corp.  All Rights Reserved.
+#
+# This code is taken from CRYPTOGAMs[1] and is included here using the option
+# in the license to distribute the code under the GPL. Therefore this program
+# is free software; you can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2 as published by the Free Software
+# Foundation.
+#
+# [1] https://www.openssl.org/~appro/cryptogams/
+
+# Copyright (c) 2006-2017, CRYPTOGAMS by 
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#   * Redistributions of source code must retain copyright notices,
+# this list of conditions and the following disclaimer.
+#
+#   * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+#   * Neither the name of the CRYPTOGAMS nor the names of its
+# copyright holder and contributors may be used to endorse or
+# promote products derived from this software without specific
+# prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# 
+# Written by Andy Polyakov  for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# 
+
+#
+# 
+# Written and Modified by Danny Tsen 
+# - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes
+#
+# X25519 lower-level primitives for PPC64.
+#
+
+#include 
+
+.machine "any"
+.abiversion 2
+.text
+
+SYM_FUNC_START(x25519_fe51_mul)
+.align 5
+
+   stdu1,-144(1)
+   std 21,56(1)
+   std 22,64(1)
+   std 23,72(1)
+   std 24,80(1)
+   std 25,88(1)
+   std 26,96(1)
+   std 27,104(1)
+   std 28,112(1)
+   std 29,120(1)
+   std 30,128(1)
+   std 31,136(1)
+
+   ld  6,0(5)
+   ld  7,0(4)
+   ld  8,8(4)
+   ld  9,16(4)
+   ld  10,24(4)
+   ld  11,32(4)
+
+   mulld   22,7,6
+   mulhdu  23,7,6
+
+   mulld   24,8,6
+   mulhdu  25,8,6
+
+   mulld   30,11,6
+   mulhdu  31,11,6
+   ld  4,8(5)
+   mulli   11,11,19
+
+   mulld   26,9,6
+   mulhdu  27,9,6
+
+   mulld   28,10,6
+   mulhdu  29,10,6
+   mulld   12,11,4
+   mulhdu  21,11,4
+   addc22,22,12
+   adde23,23,21
+
+   mulld   12,7,4
+   mulhdu  21,7,4
+   addc24,24,12
+   adde25,25,21
+
+   mulld   12,10,4
+   mulhdu  21,10,4
+   ld  6,16(5)
+   mulli   10,10,19
+   addc30,30,12
+   adde31,31,21
+
+   mulld   12,8,4
+   mulhdu  21,8,4
+   addc26,26,12
+   adde27,27,21
+
+   mulld   12,9,4
+   mulhdu  21,9,4
+   

Re: [PATCH] crypto:vmx: Move ppc vmx directory to arch/powerpc/crypto.

2024-01-26 Thread Danny Tsen

Thanks Herbert.

-Danny

On 1/26/24 4:58 PM, Herbert Xu wrote:

On Tue, Jan 02, 2024 at 03:58:56PM -0500, Danny Tsen wrote:

Relocate all crypto files in vmx driver to arch/powerpc/crypto directory
and remove vmx directory.

drivers/crypto/vmx/aes.c rename to arch/powerpc/crypto/aes.c
drivers/crypto/vmx/aes_cbc.c rename to arch/powerpc/crypto/aes_cbc.c
drivers/crypto/vmx/aes_ctr.c rename to arch/powerpc/crypto/aes_ctr.c
drivers/crypto/vmx/aes_xts.c rename to arch/powerpc/crypto/aes_xts.c
drivers/crypto/vmx/aesp8-ppc.h rename to arch/powerpc/crypto/aesp8-ppc.h
drivers/crypto/vmx/aesp8-ppc.pl rename to arch/powerpc/crypto/aesp8-ppc.pl
drivers/crypto/vmx/ghash.c rename to arch/powerpc/crypto/ghash.c
drivers/crypto/vmx/ghashp8-ppc.pl rename to arch/powerpc/crypto/ghashp8-ppc.pl
drivers/crypto/vmx/vmx.c rename to arch/powerpc/crypto/vmx.c

deleted files:
drivers/crypto/vmx/Makefile
drivers/crypto/vmx/Kconfig
drivers/crypto/vmx/ppc-xlate.pl

This patch has been tested and has passed the selftest.  The patch was also
tested with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.

Signed-off-by: Danny Tsen 
---
  arch/powerpc/crypto/Kconfig   |  20 ++
  arch/powerpc/crypto/Makefile  |  20 +-
  .../crypto/vmx => arch/powerpc/crypto}/aes.c  |   0
  .../vmx => arch/powerpc/crypto}/aes_cbc.c |   0
  .../vmx => arch/powerpc/crypto}/aes_ctr.c |   0
  .../vmx => arch/powerpc/crypto}/aes_xts.c |   0
  .../vmx => arch/powerpc/crypto}/aesp8-ppc.h   |   0
  .../vmx => arch/powerpc/crypto}/aesp8-ppc.pl  |   0
  .../vmx => arch/powerpc/crypto}/ghash.c   |   0
  .../powerpc/crypto}/ghashp8-ppc.pl|   0
  .../crypto/vmx => arch/powerpc/crypto}/vmx.c  |   0
  drivers/crypto/Kconfig|  14 +-
  drivers/crypto/Makefile   |   2 +-
  drivers/crypto/vmx/.gitignore |   3 -
  drivers/crypto/vmx/Kconfig|  14 --
  drivers/crypto/vmx/Makefile   |  23 --
  drivers/crypto/vmx/ppc-xlate.pl   | 231 --
  17 files changed, 46 insertions(+), 281 deletions(-)
  rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes.c (100%)
  rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_cbc.c (100%)
  rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_ctr.c (100%)
  rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_xts.c (100%)
  rename {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.h (100%)
  rename {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.pl (100%)
  rename {drivers/crypto/vmx => arch/powerpc/crypto}/ghash.c (100%)
  rename {drivers/crypto/vmx => arch/powerpc/crypto}/ghashp8-ppc.pl (100%)
  rename {drivers/crypto/vmx => arch/powerpc/crypto}/vmx.c (100%)
  delete mode 100644 drivers/crypto/vmx/.gitignore
  delete mode 100644 drivers/crypto/vmx/Kconfig
  delete mode 100644 drivers/crypto/vmx/Makefile
  delete mode 100644 drivers/crypto/vmx/ppc-xlate.pl

Patch applied.  Thanks.


[PATCH] crypto:vmx: Move ppc vmx directory to arch/powerpc/crypto.

2024-01-02 Thread Danny Tsen
Relocate all crypto files in vmx driver to arch/powerpc/crypto directory
and remove vmx directory.

drivers/crypto/vmx/aes.c rename to arch/powerpc/crypto/aes.c
drivers/crypto/vmx/aes_cbc.c rename to arch/powerpc/crypto/aes_cbc.c
drivers/crypto/vmx/aes_ctr.c rename to arch/powerpc/crypto/aes_ctr.c
drivers/crypto/vmx/aes_xts.c rename to arch/powerpc/crypto/aes_xts.c
drivers/crypto/vmx/aesp8-ppc.h rename to arch/powerpc/crypto/aesp8-ppc.h
drivers/crypto/vmx/aesp8-ppc.pl rename to arch/powerpc/crypto/aesp8-ppc.pl
drivers/crypto/vmx/ghash.c rename to arch/powerpc/crypto/ghash.c
drivers/crypto/vmx/ghashp8-ppc.pl rename to arch/powerpc/crypto/ghashp8-ppc.pl
drivers/crypto/vmx/vmx.c rename to arch/powerpc/crypto/vmx.c

deleted files:
drivers/crypto/vmx/Makefile
drivers/crypto/vmx/Kconfig
drivers/crypto/vmx/ppc-xlate.pl

This patch has been tested and has passed the selftest.  The patch was also
tested with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.

Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/Kconfig   |  20 ++
 arch/powerpc/crypto/Makefile  |  20 +-
 .../crypto/vmx => arch/powerpc/crypto}/aes.c  |   0
 .../vmx => arch/powerpc/crypto}/aes_cbc.c |   0
 .../vmx => arch/powerpc/crypto}/aes_ctr.c |   0
 .../vmx => arch/powerpc/crypto}/aes_xts.c |   0
 .../vmx => arch/powerpc/crypto}/aesp8-ppc.h   |   0
 .../vmx => arch/powerpc/crypto}/aesp8-ppc.pl  |   0
 .../vmx => arch/powerpc/crypto}/ghash.c   |   0
 .../powerpc/crypto}/ghashp8-ppc.pl|   0
 .../crypto/vmx => arch/powerpc/crypto}/vmx.c  |   0
 drivers/crypto/Kconfig|  14 +-
 drivers/crypto/Makefile   |   2 +-
 drivers/crypto/vmx/.gitignore |   3 -
 drivers/crypto/vmx/Kconfig|  14 --
 drivers/crypto/vmx/Makefile   |  23 --
 drivers/crypto/vmx/ppc-xlate.pl   | 231 --
 17 files changed, 46 insertions(+), 281 deletions(-)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes.c (100%)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_cbc.c (100%)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_ctr.c (100%)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_xts.c (100%)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.h (100%)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.pl (100%)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/ghash.c (100%)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/ghashp8-ppc.pl (100%)
 rename {drivers/crypto/vmx => arch/powerpc/crypto}/vmx.c (100%)
 delete mode 100644 drivers/crypto/vmx/.gitignore
 delete mode 100644 drivers/crypto/vmx/Kconfig
 delete mode 100644 drivers/crypto/vmx/Makefile
 delete mode 100644 drivers/crypto/vmx/ppc-xlate.pl

diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 6fc2248ca561..1e201b7ae2fc 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -137,4 +137,24 @@ config CRYPTO_POLY1305_P10
  - Power10 or later
  - Little-endian
 
+config CRYPTO_DEV_VMX
+   bool "Support for VMX cryptographic acceleration instructions"
+   depends on PPC64 && VSX
+   help
+     Support for VMX cryptographic acceleration instructions.
+
+config CRYPTO_DEV_VMX_ENCRYPT
+   tristate "Encryption acceleration support on P8 CPU"
+   depends on CRYPTO_DEV_VMX
+   select CRYPTO_AES
+   select CRYPTO_CBC
+   select CRYPTO_CTR
+   select CRYPTO_GHASH
+   select CRYPTO_XTS
+   default m
+   help
+ Support for VMX cryptographic acceleration instructions on Power8 CPU.
+ This module supports acceleration for AES and GHASH in hardware. If you
+ choose 'M' here, this module will be called vmx-crypto.
+
 endmenu
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index ebdac1b9eb9a..fca0e9739869 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o
 obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
 obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
 obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
+obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o
 
 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o 
aes-spe-glue.o
 md5-ppc-y := md5-asm.o md5-glue.o
@@ -27,14 +28,29 @@ crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o 
crct10dif-vpmsum_glue.o
 aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o 
aesp10-ppc.o
 chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
 poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
+vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o 
aes_xts.o ghash.o
+
+ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
+override flavour := linux-ppc64le
+e

Re: [PATCH v2] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.

2023-09-15 Thread Danny Tsen

Still waiting for the CCLA to send to OpenSSL.

Thanks.

-Danny

On 9/15/23 8:29 AM, Michael Ellerman wrote:

Danny Tsen  writes:

Improve AES/XTS performance of 6-way unrolling for PowerPC up
to 17% with tcrypt.  This is done by using one instruction,
vpermxor, to replace xor and vsldoi.

The same changes were applied to OpenSSL code and a pull request was
submitted.

https://github.com/openssl/openssl/pull/21812

Still unmerged as of today.

cheers


Re: [PATCH v2] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.

2023-09-15 Thread Danny Tsen

Thanks Herbert.

-Danny

On 9/15/23 5:41 AM, Herbert Xu wrote:

On Wed, Aug 30, 2023 at 09:49:11AM -0400, Danny Tsen wrote:

Improve AES/XTS performance of 6-way unrolling for PowerPC up
to 17% with tcrypt.  This is done by using one instruction,
vpermxor, to replace xor and vsldoi.

The same changes were applied to OpenSSL code and a pull request was
submitted.

This patch has been tested with the kernel crypto module tcrypt.ko and
has passed the selftest.  The patch is also tested with
CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.

Signed-off-by: Danny Tsen 
---
  drivers/crypto/vmx/aesp8-ppc.pl | 141 +---
  1 file changed, 92 insertions(+), 49 deletions(-)

Patch applied.  Thanks.


Re: [PATCH] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.

2023-08-30 Thread Danny Tsen

Hi Michael,

I just submitted the v2 patch.

Thanks.

-Danny

On 8/29/23 11:37 PM, Michael Ellerman wrote:

Danny Tsen  writes:

Improve AES/XTS performance of 6-way unrolling for PowerPC up
to 17% with tcrypt.  This is done by using one instruction,
vpermxor, to replace xor and vsldoi.

This patch has been tested with the kernel crypto module tcrypt.ko and
has passed the selftest.  The patch is also tested with
CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.

Signed-off-by: Danny Tsen 
---
  drivers/crypto/vmx/aesp8-ppc.pl | 141 +---
  1 file changed, 92 insertions(+), 49 deletions(-)

That's CRYPTOGAMS code, and is so far largely unchanged from the
original. I see you've sent the same change to OpenSSL, but it's not
merged yet. Please document that in the change log; we want to keep the
code in sync as much as possible, and document any divergences.

cheers


diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl
index 50a0a18f35da..f729589d792e 100644
--- a/drivers/crypto/vmx/aesp8-ppc.pl
+++ b/drivers/crypto/vmx/aesp8-ppc.pl
@@ -132,11 +132,12 @@ rcon:
  .long 0x1b00, 0x1b00, 0x1b00, 0x1b00  ?rev
  .long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c  ?rev
  .long 0,0,0,0 ?asis
+.long  0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
  Lconsts:
mflrr0
bcl 20,31,\$+4
mflr$ptr #v "distance between . and rcon
-   addi$ptr,$ptr,-0x48
+   addi$ptr,$ptr,-0x58
mtlrr0
blr
.long   0
@@ -2495,6 +2496,17 @@ _aesp8_xts_encrypt6x:
li  $x70,0x70
mtspr   256,r0
  
+	xxlor		2, 32+$eighty7, 32+$eighty7

+   vsldoi  $eighty7,$tmp,$eighty7,1# 0x010101..87
+   xxlor   1, 32+$eighty7, 32+$eighty7
+
+   # Load XOR Lconsts.
+   mr  $x70, r6
+   bl  Lconsts
+   lxvw4x  0, $x40, r6 # load XOR contents
+   mr  r6, $x70
+   li  $x70,0x70
+
subi$rounds,$rounds,3   # -4 in total
  
  	lvx		$rndkey0,$x00,$key1	# load key schedule

@@ -2537,69 +2549,77 @@ Load_xts_enc_key:
?vperm  v31,v31,$twk5,$keyperm
lvx v25,$x10,$key_  # pre-load round[2]
  
+	# Switch to use the following codes with 0x010101..87 to generate tweak.

+   # eighty7 = 0x010101..87
+   # vsrab tmp, tweak, seven   # next tweak value, right shift 
7 bits
+   # vand  tmp, tmp, eighty7   # last byte with carry
+   # vaddubm   tweak, tweak, tweak # left shift 1 bit (x2)
+   # xxlor vsx, 0, 0
+   # vpermxor  tweak, tweak, tmp, vsx
+
 vperm  $in0,$inout,$inptail,$inpperm
 subi   $inp,$inp,31# undo "caller"
vxor$twk0,$tweak,$rndkey0
vsrab   $tmp,$tweak,$seven  # next tweak value
vaddubm $tweak,$tweak,$tweak
-   vsldoi  $tmp,$tmp,$tmp,15
vand$tmp,$tmp,$eighty7
 vxor   $out0,$in0,$twk0
-   vxor$tweak,$tweak,$tmp
+   xxlor   32+$in1, 0, 0
+   vpermxor$tweak, $tweak, $tmp, $in1
  
  	 lvx_u		$in1,$x10,$inp

vxor$twk1,$tweak,$rndkey0
vsrab   $tmp,$tweak,$seven  # next tweak value
vaddubm $tweak,$tweak,$tweak
-   vsldoi  $tmp,$tmp,$tmp,15
 le?vperm   $in1,$in1,$in1,$leperm
vand$tmp,$tmp,$eighty7
 vxor   $out1,$in1,$twk1
-   vxor$tweak,$tweak,$tmp
+   xxlor   32+$in2, 0, 0
+   vpermxor$tweak, $tweak, $tmp, $in2
  
  	 lvx_u		$in2,$x20,$inp

 andi.  $taillen,$len,15
vxor$twk2,$tweak,$rndkey0
vsrab   $tmp,$tweak,$seven  # next tweak value
vaddubm $tweak,$tweak,$tweak
-   vsldoi  $tmp,$tmp,$tmp,15
 le?vperm   $in2,$in2,$in2,$leperm
vand$tmp,$tmp,$eighty7
 vxor   $out2,$in2,$twk2
-   vxor$tweak,$tweak,$tmp
+   xxlor   32+$in3, 0, 0
+   vpermxor$tweak, $tweak, $tmp, $in3
  
  	 lvx_u		$in3,$x30,$inp

 sub$len,$len,$taillen
vxor$twk3,$tweak,$rndkey0
vsrab   $tmp,$tweak,$seven  # next tweak value
vaddubm $tweak,$tweak,$tweak
-   vsldoi  $tmp,$tmp,$tmp,15
 le?vperm   $in3,$in3,$in3,$leperm
vand$tmp,$tmp,$eighty7
 vxor   $out3,$in3,$twk3
-   vxor$tweak,$tweak,$tmp
+   xxlor   32+$in4, 0, 0
+   vpermxor$tweak, $tweak, $tmp, $in4
  
  	 lvx_u		$in4,$x40,$inp

 subi   $len,$len,0x60
vxor  

[PATCH v2] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.

2023-08-30 Thread Danny Tsen
Improve AES/XTS performance of 6-way unrolling for PowerPC up
to 17% with tcrypt.  This is done by using one instruction,
vpermxor, to replace xor and vsldoi.

The same changes were applied to OpenSSL code and a pull request was
submitted.

This patch has been tested with the kernel crypto module tcrypt.ko and
has passed the selftest.  The patch is also tested with
CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.

Signed-off-by: Danny Tsen 
---
 drivers/crypto/vmx/aesp8-ppc.pl | 141 +---
 1 file changed, 92 insertions(+), 49 deletions(-)

diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl
index 50a0a18f35da..f729589d792e 100644
--- a/drivers/crypto/vmx/aesp8-ppc.pl
+++ b/drivers/crypto/vmx/aesp8-ppc.pl
@@ -132,11 +132,12 @@ rcon:
 .long  0x1b00, 0x1b00, 0x1b00, 0x1b00  ?rev
 .long  0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c  ?rev
 .long  0,0,0,0 ?asis
+.long  0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
 Lconsts:
mflrr0
bcl 20,31,\$+4
mflr$ptr #v "distance between . and rcon
-   addi$ptr,$ptr,-0x48
+   addi$ptr,$ptr,-0x58
mtlrr0
blr
.long   0
@@ -2495,6 +2496,17 @@ _aesp8_xts_encrypt6x:
li  $x70,0x70
mtspr   256,r0
 
+   xxlor   2, 32+$eighty7, 32+$eighty7
+   vsldoi  $eighty7,$tmp,$eighty7,1# 0x010101..87
+   xxlor   1, 32+$eighty7, 32+$eighty7
+
+   # Load XOR Lconsts.
+   mr  $x70, r6
+   bl  Lconsts
+   lxvw4x  0, $x40, r6 # load XOR contents
+   mr  r6, $x70
+   li  $x70,0x70
+
subi$rounds,$rounds,3   # -4 in total
 
lvx $rndkey0,$x00,$key1 # load key schedule
@@ -2537,69 +2549,77 @@ Load_xts_enc_key:
?vperm  v31,v31,$twk5,$keyperm
lvx v25,$x10,$key_  # pre-load round[2]
 
+   # Switch to use the following codes with 0x010101..87 to generate tweak.
+   # eighty7 = 0x010101..87
+   # vsrab tmp, tweak, seven   # next tweak value, right shift 
7 bits
+   # vand  tmp, tmp, eighty7   # last byte with carry
+   # vaddubm   tweak, tweak, tweak # left shift 1 bit (x2)
+   # xxlor vsx, 0, 0
+   # vpermxor  tweak, tweak, tmp, vsx
+
 vperm  $in0,$inout,$inptail,$inpperm
 subi   $inp,$inp,31# undo "caller"
vxor$twk0,$tweak,$rndkey0
vsrab   $tmp,$tweak,$seven  # next tweak value
vaddubm $tweak,$tweak,$tweak
-   vsldoi  $tmp,$tmp,$tmp,15
vand$tmp,$tmp,$eighty7
 vxor   $out0,$in0,$twk0
-   vxor$tweak,$tweak,$tmp
+   xxlor   32+$in1, 0, 0
+   vpermxor$tweak, $tweak, $tmp, $in1
 
 lvx_u  $in1,$x10,$inp
vxor$twk1,$tweak,$rndkey0
vsrab   $tmp,$tweak,$seven  # next tweak value
vaddubm $tweak,$tweak,$tweak
-   vsldoi  $tmp,$tmp,$tmp,15
 le?vperm   $in1,$in1,$in1,$leperm
vand$tmp,$tmp,$eighty7
 vxor   $out1,$in1,$twk1
-   vxor$tweak,$tweak,$tmp
+   xxlor   32+$in2, 0, 0
+   vpermxor$tweak, $tweak, $tmp, $in2
 
 lvx_u  $in2,$x20,$inp
 andi.  $taillen,$len,15
vxor$twk2,$tweak,$rndkey0
vsrab   $tmp,$tweak,$seven  # next tweak value
vaddubm $tweak,$tweak,$tweak
-   vsldoi  $tmp,$tmp,$tmp,15
 le?vperm   $in2,$in2,$in2,$leperm
vand$tmp,$tmp,$eighty7
 vxor   $out2,$in2,$twk2
-   vxor$tweak,$tweak,$tmp
+   xxlor   32+$in3, 0, 0
+   vpermxor$tweak, $tweak, $tmp, $in3
 
 lvx_u  $in3,$x30,$inp
 sub$len,$len,$taillen
vxor$twk3,$tweak,$rndkey0
vsrab   $tmp,$tweak,$seven  # next tweak value
vaddubm $tweak,$tweak,$tweak
-   vsldoi  $tmp,$tmp,$tmp,15
 le?vperm   $in3,$in3,$in3,$leperm
vand$tmp,$tmp,$eighty7
 vxor   $out3,$in3,$twk3
-   vxor$tweak,$tweak,$tmp
+   xxlor   32+$in4, 0, 0
+   vpermxor$tweak, $tweak, $tmp, $in4
 
 lvx_u  $in4,$x40,$inp
 subi   $len,$len,0x60
vxor$twk4,$tweak,$rndkey0
vsrab   $tmp,$tweak,$seven  # next tweak value
vaddubm $tweak,$tweak,$tweak
-   vsldoi  $tmp,$tmp,$tmp,15
 le?vperm   $in4,$in4,$in4,$leperm
vand  

[PATCH] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.

2023-08-29 Thread Danny Tsen
Improve AES/XTS performance of 6-way unrolling for PowerPC up
to 17% with tcrypt.  This is done by using one instruction,
vpermxor, to replace xor and vsldoi.

This patch has been tested with the kernel crypto module tcrypt.ko and
has passed the selftest.  The patch is also tested with
CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.

Signed-off-by: Danny Tsen 
---
 drivers/crypto/vmx/aesp8-ppc.pl | 141 +---
 1 file changed, 92 insertions(+), 49 deletions(-)

diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl
index 50a0a18f35da..f729589d792e 100644
--- a/drivers/crypto/vmx/aesp8-ppc.pl
+++ b/drivers/crypto/vmx/aesp8-ppc.pl
@@ -132,11 +132,12 @@ rcon:
 .long  0x1b00, 0x1b00, 0x1b00, 0x1b00  ?rev
 .long  0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c  ?rev
 .long  0,0,0,0 ?asis
+.long  0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
 Lconsts:
mflrr0
bcl 20,31,\$+4
mflr$ptr #v "distance between . and rcon
-   addi$ptr,$ptr,-0x48
+   addi$ptr,$ptr,-0x58
mtlrr0
blr
.long   0
@@ -2495,6 +2496,17 @@ _aesp8_xts_encrypt6x:
li  $x70,0x70
mtspr   256,r0
 
+   xxlor   2, 32+$eighty7, 32+$eighty7
+   vsldoi  $eighty7,$tmp,$eighty7,1# 0x010101..87
+   xxlor   1, 32+$eighty7, 32+$eighty7
+
+   # Load XOR Lconsts.
+   mr  $x70, r6
+   bl  Lconsts
+   lxvw4x  0, $x40, r6 # load XOR contents
+   mr  r6, $x70
+   li  $x70,0x70
+
subi$rounds,$rounds,3   # -4 in total
 
lvx $rndkey0,$x00,$key1 # load key schedule
@@ -2537,69 +2549,77 @@ Load_xts_enc_key:
?vperm  v31,v31,$twk5,$keyperm
lvx v25,$x10,$key_  # pre-load round[2]
 
+   # Switch to use the following codes with 0x010101..87 to generate tweak.
+   # eighty7 = 0x010101..87
+   # vsrab tmp, tweak, seven   # next tweak value, right shift 
7 bits
+   # vand  tmp, tmp, eighty7   # last byte with carry
+   # vaddubm   tweak, tweak, tweak # left shift 1 bit (x2)
+   # xxlor vsx, 0, 0
+   # vpermxor  tweak, tweak, tmp, vsx
+
 vperm  $in0,$inout,$inptail,$inpperm
 subi   $inp,$inp,31# undo "caller"
vxor$twk0,$tweak,$rndkey0
vsrab   $tmp,$tweak,$seven  # next tweak value
vaddubm $tweak,$tweak,$tweak
-   vsldoi  $tmp,$tmp,$tmp,15
vand$tmp,$tmp,$eighty7
 vxor   $out0,$in0,$twk0
-   vxor$tweak,$tweak,$tmp
+   xxlor   32+$in1, 0, 0
+   vpermxor$tweak, $tweak, $tmp, $in1
 
 lvx_u  $in1,$x10,$inp
vxor$twk1,$tweak,$rndkey0
vsrab   $tmp,$tweak,$seven  # next tweak value
vaddubm $tweak,$tweak,$tweak
-   vsldoi  $tmp,$tmp,$tmp,15
 le?vperm   $in1,$in1,$in1,$leperm
vand$tmp,$tmp,$eighty7
 vxor   $out1,$in1,$twk1
-   vxor$tweak,$tweak,$tmp
+   xxlor   32+$in2, 0, 0
+   vpermxor$tweak, $tweak, $tmp, $in2
 
 lvx_u  $in2,$x20,$inp
 andi.  $taillen,$len,15
vxor$twk2,$tweak,$rndkey0
vsrab   $tmp,$tweak,$seven  # next tweak value
vaddubm $tweak,$tweak,$tweak
-   vsldoi  $tmp,$tmp,$tmp,15
 le?vperm   $in2,$in2,$in2,$leperm
vand$tmp,$tmp,$eighty7
 vxor   $out2,$in2,$twk2
-   vxor$tweak,$tweak,$tmp
+   xxlor   32+$in3, 0, 0
+   vpermxor$tweak, $tweak, $tmp, $in3
 
 lvx_u  $in3,$x30,$inp
 sub$len,$len,$taillen
vxor$twk3,$tweak,$rndkey0
vsrab   $tmp,$tweak,$seven  # next tweak value
vaddubm $tweak,$tweak,$tweak
-   vsldoi  $tmp,$tmp,$tmp,15
 le?vperm   $in3,$in3,$in3,$leperm
vand$tmp,$tmp,$eighty7
 vxor   $out3,$in3,$twk3
-   vxor$tweak,$tweak,$tmp
+   xxlor   32+$in4, 0, 0
+   vpermxor$tweak, $tweak, $tmp, $in4
 
 lvx_u  $in4,$x40,$inp
 subi   $len,$len,0x60
vxor$twk4,$tweak,$rndkey0
vsrab   $tmp,$tweak,$seven  # next tweak value
vaddubm $tweak,$tweak,$tweak
-   vsldoi  $tmp,$tmp,$tmp,15
 le?vperm   $in4,$in4,$in4,$leperm
vand$tmp,$tmp,$eighty7
 vxor   $out4,$in4,$twk4
-   vxor  
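
For reference, a scalar C sketch of what the vsrab/vand/vaddubm/vpermxor
sequence above computes: the XTS tweak is multiplied by x in GF(2^128),
i.e. shifted left one bit, with the bit that falls off the top folded back
in through the 0x87 reduction constant. The two-word little-endian layout
and the helper name are illustrative only, not code from the patch.

#include <stdint.h>

static void xts_tweak_double(uint64_t t[2])
{
	/* carry is 0x87 when the top bit of the 128-bit tweak is set, else 0 */
	uint64_t carry = (uint64_t)((int64_t)t[1] >> 63) & 0x87;

	t[1] = (t[1] << 1) | (t[0] >> 63);	/* shift the 128-bit value left by one */
	t[0] = (t[0] << 1) ^ carry;		/* reduce: x^128 = x^7 + x^2 + x + 1 */
}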

RE: [PATCH v2 0/5] crypto: Accelerated Chacha20/Poly1305 implementation

2023-07-14 Thread Danny Tsen
Thanks.
-Danny

From: Herbert Xu 
Sent: Friday, July 14, 2023 4:49 PM
To: Danny Tsen 
Cc: linux-cry...@vger.kernel.org ; 
lei...@debian.org ; na...@linux.ibm.com 
; ap...@cryptogams.org ; 
linux-ker...@vger.kernel.org ; 
linuxppc-dev@lists.ozlabs.org ; 
m...@ellerman.id.au ; ltc...@linux.vnet.ibm.com 
; Danny Tsen 
Subject: [EXTERNAL] Re: [PATCH v2 0/5] crypto: Accelerated Chacha20/Poly1305 
implementation

On Wed, Apr 26, 2023 at 03:11:42PM -0400, Danny Tsen wrote:
> This patch series provide an accelerated/optimized Chacha20 and Poly1305
> implementation for Power10 or later CPU (ppc64le).  This module
> implements algorithm specified in RFC7539.  The implementation
> provides 3.5X better performance than the baseline for Chacha20 and
> Poly1305 individually and 1.5X improvement for Chacha20/Poly1305
> operation.
>
> This patch has been tested with the kernel crypto module tcrypt.ko and
> has passed the selftest.  The patch is also tested with
> CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.
>
>
> Danny Tsen (5):
>   An optimized Chacha20 implementation with 8-way unrolling for ppc64le.
>   Glue code for optmized Chacha20 implementation for ppc64le.
>   An optimized Poly1305 implementation with 4-way unrolling for ppc64le.
>   Glue code for optmized Poly1305 implementation for ppc64le.
>   Update Kconfig and Makefile.
>
>  arch/powerpc/crypto/Kconfig |   26 +
>  arch/powerpc/crypto/Makefile|4 +
>  arch/powerpc/crypto/chacha-p10-glue.c   |  221 +
>  arch/powerpc/crypto/chacha-p10le-8x.S   |  842 ++
>  arch/powerpc/crypto/poly1305-p10-glue.c |  186 
>  arch/powerpc/crypto/poly1305-p10le_64.S | 1075 +++
>  6 files changed, 2354 insertions(+)
>  create mode 100644 arch/powerpc/crypto/chacha-p10-glue.c
>  create mode 100644 arch/powerpc/crypto/chacha-p10le-8x.S
>  create mode 100644 arch/powerpc/crypto/poly1305-p10-glue.c
>  create mode 100644 arch/powerpc/crypto/poly1305-p10le_64.S
>
> --
> 2.31.1

All applied.  Thanks.
--
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH v2 5/5] Update Kconfig and Makefile.

2023-04-26 Thread Danny Tsen
Defined CRYPTO_CHACHA20_P10 and CRYPTO_POLY1305_P10 in Kconfig to
support optimized implementation for Power10 and later CPU.

Added new module driver chacha-p10-crypto and poly1305-p10-crypto.

Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/Kconfig  | 26 ++
 arch/powerpc/crypto/Makefile |  4 
 2 files changed, 30 insertions(+)

diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 7113f9355165..f74d9dd6574b 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -111,4 +111,30 @@ config CRYPTO_AES_GCM_P10
  Support for cryptographic acceleration instructions on Power10 or
  later CPU. This module supports stitched acceleration for AES/GCM.
 
+config CRYPTO_CHACHA20_P10
+   tristate "Ciphers: ChaCha20, XChacha20, XChacha12 (P10 or later)"
+   depends on PPC64 && CPU_LITTLE_ENDIAN
+   select CRYPTO_SKCIPHER
+   select CRYPTO_LIB_CHACHA_GENERIC
+   select CRYPTO_ARCH_HAVE_LIB_CHACHA
+   help
+ Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
+ stream cipher algorithms
+
+ Architecture: PowerPC64
+ - Power10 or later
+ - Little-endian
+
+config CRYPTO_POLY1305_P10
+   tristate "Hash functions: Poly1305 (P10 or later)"
+   depends on PPC64 && CPU_LITTLE_ENDIAN
+   select CRYPTO_HASH
+   select CRYPTO_LIB_POLY1305_GENERIC
+   help
+ Poly1305 authenticator algorithm (RFC7539)
+
+ Architecture: PowerPC64
+ - Power10 or later
+ - Little-endian
+
 endmenu
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index 05c7486f42c5..cd5282eff451 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -14,6 +14,8 @@ obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o
 obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o
 obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
+obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
+obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
 
 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o 
aes-spe-glue.o
 md5-ppc-y := md5-asm.o md5-glue.o
@@ -23,6 +25,8 @@ sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o
 crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o
 crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o
 aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp8-ppc.o 
aesp8-ppc.o
+chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
+poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
 
 quiet_cmd_perl = PERL$@
   cmd_perl = $(PERL) $< $(if $(CONFIG_CPU_LITTLE_ENDIAN), linux-ppc64le, 
linux-ppc64) > $@
-- 
2.31.1



[PATCH v2 2/5] Glue code for optimized Chacha20 implementation for ppc64le.

2023-04-26 Thread Danny Tsen
Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/chacha-p10-glue.c | 221 ++
 1 file changed, 221 insertions(+)
 create mode 100644 arch/powerpc/crypto/chacha-p10-glue.c

diff --git a/arch/powerpc/crypto/chacha-p10-glue.c 
b/arch/powerpc/crypto/chacha-p10-glue.c
new file mode 100644
index ..74fb86b0d209
--- /dev/null
+++ b/arch/powerpc/crypto/chacha-p10-glue.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerPC P10 (ppc64le) accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright 2023- IBM Corp. All rights reserved.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+asmlinkage void chacha_p10le_8x(u32 *state, u8 *dst, const u8 *src,
+   unsigned int len, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
+
+static void vsx_begin(void)
+{
+   preempt_disable();
+   enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+   disable_kernel_vsx();
+   preempt_enable();
+}
+
+static void chacha_p10_do_8x(u32 *state, u8 *dst, const u8 *src,
+unsigned int bytes, int nrounds)
+{
+   unsigned int l = bytes & ~0x0FF;
+
+   if (l > 0) {
+   chacha_p10le_8x(state, dst, src, l, nrounds);
+   bytes -= l;
+   src += l;
+   dst += l;
+   state[12] += l / CHACHA_BLOCK_SIZE;
+   }
+
+   if (bytes > 0)
+   chacha_crypt_generic(state, dst, src, bytes, nrounds);
+}
+
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+   hchacha_block_generic(state, stream, nrounds);
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+   chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+  int nrounds)
+{
+   if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE ||
+   !crypto_simd_usable())
+   return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+   do {
+   unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+   vsx_begin();
+   chacha_p10_do_8x(state, dst, src, todo, nrounds);
+   vsx_end();
+
+   bytes -= todo;
+   src += todo;
+   dst += todo;
+   } while (bytes);
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+static int chacha_p10_stream_xor(struct skcipher_request *req,
+const struct chacha_ctx *ctx, const u8 *iv)
+{
+   struct skcipher_walk walk;
+   u32 state[16];
+   int err;
+
+   err = skcipher_walk_virt(&walk, req, false);
+   if (err)
+   return err;
+
+   chacha_init_generic(state, ctx->key, iv);
+
+   while (walk.nbytes > 0) {
+   unsigned int nbytes = walk.nbytes;
+
+   if (nbytes < walk.total)
+   nbytes = rounddown(nbytes, walk.stride);
+
+   if (!crypto_simd_usable()) {
+   chacha_crypt_generic(state, walk.dst.virt.addr,
+walk.src.virt.addr, nbytes,
+ctx->nrounds);
+   } else {
+   vsx_begin();
+   chacha_p10_do_8x(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes, ctx->nrounds);
+   vsx_end();
+   }
+   err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+   if (err)
+   break;
+   }
+
+   return err;
+}
+
+static int chacha_p10(struct skcipher_request *req)
+{
+   struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+   struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+   return chacha_p10_stream_xor(req, ctx, req->iv);
+}
+
+static int xchacha_p10(struct skcipher_request *req)
+{
+   struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+   struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+   struct chacha_ctx subctx;
+   u32 state[16];
+   u8 real_iv[16];
+
+   chacha_init_generic(state, ctx->key, req->iv);
+   hchacha_block_arch(state, subctx.key, ctx->nrounds);
+   subctx.nrounds = ctx->nrounds;
+
+   memcpy(&real_iv[0], req->iv + 24, 8);
+   memcpy(&real_iv[8], req->iv + 16, 8);
+   return chacha_p10_stream_xor(req, &subctx, real_iv);
+}
+
+static struct skcipher_alg algs[] = {
+   {
+   .base.cra_name  = "chacha20",
+   .base.cra_driver_name   = "chacha20-p10",
+   .base.cra_priority  = 300,
+   .base.cra_blocksize = 

[PATCH v2 1/5] An optimized Chacha20 implementation with 8-way unrolling for ppc64le.

2023-04-26 Thread Danny Tsen
Improve overall performance of chacha20 encrypt and decrypt operations
for Power10 or later CPU.

Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/chacha-p10le-8x.S | 842 ++
 1 file changed, 842 insertions(+)
 create mode 100644 arch/powerpc/crypto/chacha-p10le-8x.S

diff --git a/arch/powerpc/crypto/chacha-p10le-8x.S 
b/arch/powerpc/crypto/chacha-p10le-8x.S
new file mode 100644
index ..17bedb66b822
--- /dev/null
+++ b/arch/powerpc/crypto/chacha-p10le-8x.S
@@ -0,0 +1,842 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Accelerated chacha20 implementation for ppc64le.
+#
+# Copyright 2023- IBM Corp. All rights reserved
+#
+#===
+# Written by Danny Tsen 
+#
+# chacha_p10le_8x(u32 *state, byte *dst, const byte *src,
+#   size_t len, int nrounds);
+#
+# do rounds,  8 quarter rounds
+# 1.  a += b; d ^= a; d <<<= 16;
+# 2.  c += d; b ^= c; b <<<= 12;
+# 3.  a += b; d ^= a; d <<<= 8;
+# 4.  c += d; b ^= c; b <<<= 7
+#
+# row1 = (row1 + row2),  row4 = row1 xor row4,  row4 rotate each word by 16
+# row3 = (row3 + row4),  row2 = row3 xor row2,  row2 rotate each word by 12
+# row1 = (row1 + row2), row4 = row1 xor row4,  row4 rotate each word by 8
+# row3 = (row3 + row4), row2 = row3 xor row2,  row2 rotate each word by 7
+#
+# 4 blocks (a b c d)
+#
+# a0 b0 c0 d0
+# a1 b1 c1 d1
+# ...
+# a4 b4 c4 d4
+# ...
+# a8 b8 c8 d8
+# ...
+# a12 b12 c12 d12
+# a13 ...
+# a14 ...
+# a15 b15 c15 d15
+#
+# Column round (v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
+#
+
+#include 
+#include 
+#include 
+#include 
+
+.machine   "any"
+.text
+
+.macro SAVE_GPR GPR OFFSET FRAME
+   std \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro SAVE_VRS VRS OFFSET FRAME
+   li  16, \OFFSET
+   stvx\VRS, 16, \FRAME
+.endm
+
+.macro SAVE_VSX VSX OFFSET FRAME
+   li  16, \OFFSET
+   stxvx   \VSX, 16, \FRAME
+.endm
+
+.macro RESTORE_GPR GPR OFFSET FRAME
+   ld  \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro RESTORE_VRS VRS OFFSET FRAME
+   li  16, \OFFSET
+   lvx \VRS, 16, \FRAME
+.endm
+
+.macro RESTORE_VSX VSX OFFSET FRAME
+   li  16, \OFFSET
+   lxvx\VSX, 16, \FRAME
+.endm
+
+.macro SAVE_REGS
+   mflr 0
+   std 0, 16(1)
+   stdu 1,-752(1)
+
+   SAVE_GPR 14, 112, 1
+   SAVE_GPR 15, 120, 1
+   SAVE_GPR 16, 128, 1
+   SAVE_GPR 17, 136, 1
+   SAVE_GPR 18, 144, 1
+   SAVE_GPR 19, 152, 1
+   SAVE_GPR 20, 160, 1
+   SAVE_GPR 21, 168, 1
+   SAVE_GPR 22, 176, 1
+   SAVE_GPR 23, 184, 1
+   SAVE_GPR 24, 192, 1
+   SAVE_GPR 25, 200, 1
+   SAVE_GPR 26, 208, 1
+   SAVE_GPR 27, 216, 1
+   SAVE_GPR 28, 224, 1
+   SAVE_GPR 29, 232, 1
+   SAVE_GPR 30, 240, 1
+   SAVE_GPR 31, 248, 1
+
+   addi9, 1, 256
+   SAVE_VRS 20, 0, 9
+   SAVE_VRS 21, 16, 9
+   SAVE_VRS 22, 32, 9
+   SAVE_VRS 23, 48, 9
+   SAVE_VRS 24, 64, 9
+   SAVE_VRS 25, 80, 9
+   SAVE_VRS 26, 96, 9
+   SAVE_VRS 27, 112, 9
+   SAVE_VRS 28, 128, 9
+   SAVE_VRS 29, 144, 9
+   SAVE_VRS 30, 160, 9
+   SAVE_VRS 31, 176, 9
+
+   SAVE_VSX 14, 192, 9
+   SAVE_VSX 15, 208, 9
+   SAVE_VSX 16, 224, 9
+   SAVE_VSX 17, 240, 9
+   SAVE_VSX 18, 256, 9
+   SAVE_VSX 19, 272, 9
+   SAVE_VSX 20, 288, 9
+   SAVE_VSX 21, 304, 9
+   SAVE_VSX 22, 320, 9
+   SAVE_VSX 23, 336, 9
+   SAVE_VSX 24, 352, 9
+   SAVE_VSX 25, 368, 9
+   SAVE_VSX 26, 384, 9
+   SAVE_VSX 27, 400, 9
+   SAVE_VSX 28, 416, 9
+   SAVE_VSX 29, 432, 9
+   SAVE_VSX 30, 448, 9
+   SAVE_VSX 31, 464, 9
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+   addi9, 1, 256
+   RESTORE_VRS 20, 0, 9
+   RESTORE_VRS 21, 16, 9
+   RESTORE_VRS 22, 32, 9
+   RESTORE_VRS 23, 48, 9
+   RESTORE_VRS 24, 64, 9
+   RESTORE_VRS 25, 80, 9
+   RESTORE_VRS 26, 96, 9
+   RESTORE_VRS 27, 112, 9
+   RESTORE_VRS 28, 128, 9
+   RESTORE_VRS 29, 144, 9
+   RESTORE_VRS 30, 160, 9
+   RESTORE_VRS 31, 176, 9
+
+   RESTORE_VSX 14, 192, 9
+   RESTORE_VSX 15, 208, 9
+   RESTORE_VSX 16, 224, 9
+   RESTORE_VSX 17, 240, 9
+   RESTORE_VSX 18, 256, 9
+   RESTORE_VSX 19, 272, 9
+   RESTORE_VSX 20, 288, 9
+   RESTORE_VSX 21, 304, 9
+   RESTORE_VSX 22, 320, 9
+   RESTORE_VSX 23, 336, 9
+   RESTORE_VSX 24, 352, 9
+   RESTORE_VSX 25, 368, 9
+   RESTORE_VSX 26, 384, 9
+   RESTORE_VSX 27, 400, 9
+   RESTORE_VSX 28, 416, 9
+   RESTORE_VSX 29, 432, 9
+   RESTORE_VSX 30, 448, 9
+   RESTORE_VSX 31, 464, 9
+
+   RESTORE_GPR 14, 112, 1
+   RESTORE_GPR 15, 120, 1
+   RESTORE_GPR 16, 128, 1
+   RESTORE
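
To make the "do rounds" comment near the top of this file concrete, the same
quarter round written out in C looks roughly like the sketch below (illustrative
only; rol32() is the kernel rotate helper from linux/bitops.h, and the real
implementation applies these steps to whole vectors of 32-bit lanes rather than
scalars):

#include <linux/bitops.h>

/* One ChaCha quarter round, matching steps 1-4 in the comment block:
 * add, xor, then rotate left by 16/12/8/7 bits. */
static inline void chacha_quarter_round(u32 *a, u32 *b, u32 *c, u32 *d)
{
	*a += *b; *d ^= *a; *d = rol32(*d, 16);
	*c += *d; *b ^= *c; *b = rol32(*b, 12);
	*a += *b; *d ^= *a; *d = rol32(*d, 8);
	*c += *d; *b ^= *c; *b = rol32(*b, 7);
}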

[PATCH v2 4/5] Glue code for optimized Poly1305 implementation for ppc64le.

2023-04-26 Thread Danny Tsen
Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/poly1305-p10-glue.c | 186 
 1 file changed, 186 insertions(+)
 create mode 100644 arch/powerpc/crypto/poly1305-p10-glue.c

diff --git a/arch/powerpc/crypto/poly1305-p10-glue.c 
b/arch/powerpc/crypto/poly1305-p10-glue.c
new file mode 100644
index ..95dd708573ee
--- /dev/null
+++ b/arch/powerpc/crypto/poly1305-p10-glue.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Poly1305 authenticator algorithm, RFC7539.
+ *
+ * Copyright 2023- IBM Corp. All rights reserved.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+asmlinkage void poly1305_p10le_4blocks(void *h, const u8 *m, u32 mlen);
+asmlinkage void poly1305_64s(void *h, const u8 *m, u32 mlen, int highbit);
+asmlinkage void poly1305_emit_64(void *h, void *s, u8 *dst);
+
+static void vsx_begin(void)
+{
+   preempt_disable();
+   enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+   disable_kernel_vsx();
+   preempt_enable();
+}
+
+static int crypto_poly1305_p10_init(struct shash_desc *desc)
+{
+   struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+   poly1305_core_init(&dctx->h);
+   dctx->buflen = 0;
+   dctx->rset = 0;
+   dctx->sset = false;
+
+   return 0;
+}
+
+static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
+  const u8 *inp, unsigned int len)
+{
+   unsigned int acc = 0;
+
+   if (unlikely(!dctx->sset)) {
+   if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
+   struct poly1305_core_key *key = &dctx->core_r;
+
+   key->key.r64[0] = get_unaligned_le64(&inp[0]);
+   key->key.r64[1] = get_unaligned_le64(&inp[8]);
+   inp += POLY1305_BLOCK_SIZE;
+   len -= POLY1305_BLOCK_SIZE;
+   acc += POLY1305_BLOCK_SIZE;
+   dctx->rset = 1;
+   }
+   if (len >= POLY1305_BLOCK_SIZE) {
+   dctx->s[0] = get_unaligned_le32(&inp[0]);
+   dctx->s[1] = get_unaligned_le32(&inp[4]);
+   dctx->s[2] = get_unaligned_le32(&inp[8]);
+   dctx->s[3] = get_unaligned_le32(&inp[12]);
+   acc += POLY1305_BLOCK_SIZE;
+   dctx->sset = true;
+   }
+   }
+   return acc;
+}
+
+static int crypto_poly1305_p10_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+   struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+   unsigned int bytes, used;
+
+   if (unlikely(dctx->buflen)) {
+   bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
+   memcpy(dctx->buf + dctx->buflen, src, bytes);
+   src += bytes;
+   srclen -= bytes;
+   dctx->buflen += bytes;
+
+   if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+   if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf,
+  POLY1305_BLOCK_SIZE))) {
+   vsx_begin();
+   poly1305_64s(&dctx->h, dctx->buf,
+ POLY1305_BLOCK_SIZE, 1);
+   vsx_end();
+   }
+   dctx->buflen = 0;
+   }
+   }
+
+   if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+   bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
+   used = crypto_poly1305_setdctxkey(dctx, src, bytes);
+   if (likely(used)) {
+   srclen -= used;
+   src += used;
+   }
+   if (crypto_simd_usable() && (srclen >= POLY1305_BLOCK_SIZE*4)) {
+   vsx_begin();
+   poly1305_p10le_4blocks(&dctx->h, src, srclen);
+   vsx_end();
+   src += srclen - (srclen % (POLY1305_BLOCK_SIZE * 4));
+   srclen %= POLY1305_BLOCK_SIZE * 4;
+   }
+   while (srclen >= POLY1305_BLOCK_SIZE) {
+   vsx_begin();
+   poly1305_64s(&dctx->h, src, POLY1305_BLOCK_SIZE, 1);
+   vsx_end();
+   srclen -= POLY1305_BLOCK_SIZE;
+   src += POLY1305_BLOCK_SIZE;
+   }
+   }
+
+   if (unlikely(srclen)) {
+   dctx->buflen = srclen;
+   memcpy(dctx->buf, src, srclen);
+   }
+
+   return 0;
+}
+
+static int crypto_poly1305_p10_final(struct shash_desc *desc, u8 *dst)
+{
+   stru

[PATCH v2 0/5] crypto: Accelerated Chacha20/Poly1305 implementation

2023-04-26 Thread Danny Tsen
This patch series provides an accelerated/optimized Chacha20 and Poly1305
implementation for Power10 or later CPU (ppc64le).  This module
implements the algorithm specified in RFC7539.  The implementation
provides 3.5X better performance than the baseline for Chacha20 and
Poly1305 individually and 1.5X improvement for Chacha20/Poly1305
operation.

This patch has been tested with the kernel crypto module tcrypt.ko and
has passed the selftest.  The patch is also tested with
CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.
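
As a concrete illustration of how the library interface added by patch 2/5 is
consumed, here is a minimal sketch of a kernel-side caller driving the exported
chacha_init_arch()/chacha_crypt_arch() helpers (the caller name and buffers are
illustrative; in-tree users normally go through the generic chacha_crypt()
wrapper, which dispatches to these arch hooks):

#include <crypto/chacha.h>

/* Illustrative only: encrypt one contiguous buffer with the (possibly
 * P10-accelerated) ChaCha20 backend.  key is 8 x 32-bit words, iv is the
 * 16-byte counter||nonce block, and 20 rounds selects ChaCha20. */
static void example_chacha20_encrypt(u8 *dst, const u8 *src, unsigned int len,
				     const u32 key[8], const u8 iv[16])
{
	u32 state[16];

	chacha_init_arch(state, key, iv);
	chacha_crypt_arch(state, dst, src, len, 20);
}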


Danny Tsen (5):
  An optimized Chacha20 implementation with 8-way unrolling for ppc64le.
  Glue code for optimized Chacha20 implementation for ppc64le.
  An optimized Poly1305 implementation with 4-way unrolling for ppc64le.
  Glue code for optimized Poly1305 implementation for ppc64le.
  Update Kconfig and Makefile.

 arch/powerpc/crypto/Kconfig |   26 +
 arch/powerpc/crypto/Makefile|4 +
 arch/powerpc/crypto/chacha-p10-glue.c   |  221 +
 arch/powerpc/crypto/chacha-p10le-8x.S   |  842 ++
 arch/powerpc/crypto/poly1305-p10-glue.c |  186 
 arch/powerpc/crypto/poly1305-p10le_64.S | 1075 +++
 6 files changed, 2354 insertions(+)
 create mode 100644 arch/powerpc/crypto/chacha-p10-glue.c
 create mode 100644 arch/powerpc/crypto/chacha-p10le-8x.S
 create mode 100644 arch/powerpc/crypto/poly1305-p10-glue.c
 create mode 100644 arch/powerpc/crypto/poly1305-p10le_64.S

-- 
2.31.1



[PATCH v2 3/5] An optimized Poly1305 implementation with 4-way unrolling for ppc64le.

2023-04-26 Thread Danny Tsen
Improve overall performance of Poly1305 for Power10 or later CPU.

Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/poly1305-p10le_64.S | 1075 +++
 1 file changed, 1075 insertions(+)
 create mode 100644 arch/powerpc/crypto/poly1305-p10le_64.S

diff --git a/arch/powerpc/crypto/poly1305-p10le_64.S 
b/arch/powerpc/crypto/poly1305-p10le_64.S
new file mode 100644
index ..a3c1987f1ecd
--- /dev/null
+++ b/arch/powerpc/crypto/poly1305-p10le_64.S
@@ -0,0 +1,1075 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Accelerated poly1305 implementation for ppc64le.
+#
+# Copyright 2023- IBM Corp. All rights reserved
+#
+#===
+# Written by Danny Tsen 
+#
+# Poly1305 - this version mainly using vector/VSX/Scalar
+#  - 26 bits limbs
+#  - Handle multiple 64 byte blocks.
+#
+# Block size 16 bytes
+# key = (r, s)
+# clamp r &= 0x0FFC0FFC 0x0FFC0FFF
+# p = 2^130 - 5
+# a += m
+# a = (r + a) % p
+# a += s
+#
+# Improve performance by breaking down the polynomial to the sum of products with
+# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
+#
+#  07/22/21 - this revision based on the above sum of products.  Setup r^4, r^3, r^2, r and s3, s2, s1, s0
+# to 9 vectors for multiplications.
+#
+# setup r^4, r^3, r^2, r vectors
+#vs[r^1, r^3, r^2, r^4]
+#vs0 = [r0,.]
+#vs1 = [r1,.]
+#vs2 = [r2,.]
+#vs3 = [r3,.]
+#vs4 = [r4,.]
+#vs5 = [r1*5,...]
+#vs6 = [r2*5,...]
+#vs7 = [r2*5,...]
+#vs8 = [r4*5,...]
+#
+#  Each word in a vector consists of a member of a "r/s" in [a * r/s].
+#
+# r0, r4*5, r3*5, r2*5, r1*5;
+# r1, r0,   r4*5, r3*5, r2*5;
+# r2, r1,   r0,   r4*5, r3*5;
+# r3, r2,   r1,   r0,   r4*5;
+# r4, r3,   r2,   r1,   r0  ;
+#
+#
+# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
+#  k = 32 bytes key
+#  r3 = k (r, s)
+#  r4 = mlen
+#  r5 = m
+#
+#include 
+#include 
+#include 
+#include 
+
+.machine "any"
+
+.text
+
+.macro SAVE_GPR GPR OFFSET FRAME
+   std \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro SAVE_VRS VRS OFFSET FRAME
+   li  16, \OFFSET
+   stvx\VRS, 16, \FRAME
+.endm
+
+.macro SAVE_VSX VSX OFFSET FRAME
+   li  16, \OFFSET
+   stxvx   \VSX, 16, \FRAME
+.endm
+
+.macro RESTORE_GPR GPR OFFSET FRAME
+   ld  \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro RESTORE_VRS VRS OFFSET FRAME
+   li  16, \OFFSET
+   lvx \VRS, 16, \FRAME
+.endm
+
+.macro RESTORE_VSX VSX OFFSET FRAME
+   li  16, \OFFSET
+   lxvx\VSX, 16, \FRAME
+.endm
+
+.macro SAVE_REGS
+   mflr 0
+   std 0, 16(1)
+   stdu 1,-752(1)
+
+   SAVE_GPR 14, 112, 1
+   SAVE_GPR 15, 120, 1
+   SAVE_GPR 16, 128, 1
+   SAVE_GPR 17, 136, 1
+   SAVE_GPR 18, 144, 1
+   SAVE_GPR 19, 152, 1
+   SAVE_GPR 20, 160, 1
+   SAVE_GPR 21, 168, 1
+   SAVE_GPR 22, 176, 1
+   SAVE_GPR 23, 184, 1
+   SAVE_GPR 24, 192, 1
+   SAVE_GPR 25, 200, 1
+   SAVE_GPR 26, 208, 1
+   SAVE_GPR 27, 216, 1
+   SAVE_GPR 28, 224, 1
+   SAVE_GPR 29, 232, 1
+   SAVE_GPR 30, 240, 1
+   SAVE_GPR 31, 248, 1
+
+   addi9, 1, 256
+   SAVE_VRS 20, 0, 9
+   SAVE_VRS 21, 16, 9
+   SAVE_VRS 22, 32, 9
+   SAVE_VRS 23, 48, 9
+   SAVE_VRS 24, 64, 9
+   SAVE_VRS 25, 80, 9
+   SAVE_VRS 26, 96, 9
+   SAVE_VRS 27, 112, 9
+   SAVE_VRS 28, 128, 9
+   SAVE_VRS 29, 144, 9
+   SAVE_VRS 30, 160, 9
+   SAVE_VRS 31, 176, 9
+
+   SAVE_VSX 14, 192, 9
+   SAVE_VSX 15, 208, 9
+   SAVE_VSX 16, 224, 9
+   SAVE_VSX 17, 240, 9
+   SAVE_VSX 18, 256, 9
+   SAVE_VSX 19, 272, 9
+   SAVE_VSX 20, 288, 9
+   SAVE_VSX 21, 304, 9
+   SAVE_VSX 22, 320, 9
+   SAVE_VSX 23, 336, 9
+   SAVE_VSX 24, 352, 9
+   SAVE_VSX 25, 368, 9
+   SAVE_VSX 26, 384, 9
+   SAVE_VSX 27, 400, 9
+   SAVE_VSX 28, 416, 9
+   SAVE_VSX 29, 432, 9
+   SAVE_VSX 30, 448, 9
+   SAVE_VSX 31, 464, 9
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+   addi9, 1, 256
+   RESTORE_VRS 20, 0, 9
+   RESTORE_VRS 21, 16, 9
+   RESTORE_VRS 22, 32, 9
+   RESTORE_VRS 23, 48, 9
+   RESTORE_VRS 24, 64, 9
+   RESTORE_VRS 25, 80, 9
+   RESTORE_VRS 26, 96, 9
+   RESTORE_VRS 27, 112, 9
+   RESTORE_VRS 28, 128, 9
+   RESTORE_VRS 29, 144, 9
+   RESTORE_VRS 30, 160, 9
+   RESTORE_VRS 31, 176, 9
+
+   RESTORE_VSX 14, 192, 9
+   RESTORE_VSX 15, 208, 9
+   RESTORE_VSX 16, 224, 9
+   RESTORE_VSX 17, 240, 9
+   RESTORE_VSX 18, 256, 9
+   RESTORE_VSX 19, 272, 9
+   RESTORE_VSX 20, 288, 9
+   RESTORE_VSX 21, 304, 9
+   RESTORE_VSX 22, 320, 9
+   RESTORE_VSX 23, 336, 9
+   RESTORE_VSX 24, 352, 9
+   RESTORE_VSX 25, 368, 9
+   RESTORE_VSX 26, 384, 9
+   RESTORE_VSX 27, 400, 9
+   RESTORE_VSX 28, 416, 
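
As a side note on the sum-of-products comment near the top of this file: with
h_0 the running accumulator before these four blocks and p = 2^130 - 5, one
block step is h <- (h + m_i) * r mod p, so unrolling four blocks gives

\[
h_4 = \bigl(\bigl(\bigl((h_0 + m_1)r + m_2\bigr)r + m_3\bigr)r + m_4\bigr)r
    = (h_0 + m_1)\,r^4 + m_2\,r^3 + m_3\,r^2 + m_4\,r \pmod{2^{130}-5},
\]

which is the form the comment abbreviates (it omits the h_0 term), and which is
what lets the four per-block multiplications be evaluated against the
precomputed r^4, r^3, r^2, r vectors.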

Re: [PATCH 1/5] An optimized Chacha20 implementation with 8-way unrolling for ppc64le.

2023-04-25 Thread Danny Tsen

Hi Michael,

It's in IBM repo.

Thanks.

-Danny

On 4/25/23 7:02 AM, Michael Ellerman wrote:

Danny Tsen  writes:

This is recommended template to use for IBM copyright.

According to who?

The documentation I've seen specifies "IBM Corp." or "IBM Corporation".

cheers


Re: [PATCH 5/5] Update Kconfig and Makefile.

2023-04-25 Thread Danny Tsen
I was not sure at the time when I used IS_REACHABLE.  Will fix it in the
init code.


Thanks.

-Danny

On 4/25/23 12:46 AM, Herbert Xu wrote:

On Mon, Apr 24, 2023 at 02:47:26PM -0400, Danny Tsen wrote:

+config CRYPTO_CHACHA20_P10
+   tristate "Ciphers: ChaCha20, XChacha20, XChacha12 (P10 or later)"
+   depends on PPC64 && CPU_LITTLE_ENDIAN
+   select CRYPTO_SKCIPHER

I thought your IS_REACHABLE test was so that you could build this
without the Crypto API? Colour me confused.

Cheers,
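
For context on the IS_REACHABLE question: the usual pattern in the arch ChaCha
glue files is to always enable the library path but only register the skcipher
algorithms when the Crypto API is actually reachable. A rough sketch of what
the init/exit pair ends up looking like (names follow the glue code in this
series; the exact guard shown here is an assumption, not a quote from the
posted patch):

static int __init chacha_p10_init(void)
{
	static_branch_enable(&have_p10);

	/* The library interface (chacha_crypt_arch) works regardless;
	 * only the Crypto API registration is made conditional. */
	return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ?
		crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}

static void __exit chacha_p10_exit(void)
{
	if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER))
		crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}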


Re: [PATCH 4/5] Glue code for optimized Poly1305 implementation for ppc64le.

2023-04-25 Thread Danny Tsen

Did not notice that.  Will fix it.

Thanks.

-Danny

On 4/25/23 12:44 AM, Herbert Xu wrote:

On Mon, Apr 24, 2023 at 02:47:25PM -0400, Danny Tsen wrote:

+   if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+   bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
+   used = crypto_poly1305_setdctxkey(dctx, src, bytes);
+   if (likely(used)) {
+   srclen -= used;
+   src += used;
+   }
+   if (srclen >= POLY1305_BLOCK_SIZE*4) {
+   vsx_begin();

Your chacha code has a SIMD-fallback, how come this one doesn't?

Thanks,
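
The fallback Herbert is asking about is what the v2 posting earlier in this
archive ends up doing: gate the 4-block VSX path on crypto_simd_usable() and
leave everything else to the scalar routine. The relevant hunk from the v2
glue code reads:

	if (crypto_simd_usable() && (srclen >= POLY1305_BLOCK_SIZE * 4)) {
		vsx_begin();
		poly1305_p10le_4blocks(&dctx->h, src, srclen);
		vsx_end();
		src += srclen - (srclen % (POLY1305_BLOCK_SIZE * 4));
		srclen %= POLY1305_BLOCK_SIZE * 4;
	}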


Re: [PATCH 2/5] Glue code for optimized Chacha20 implementation for ppc64le.

2023-04-25 Thread Danny Tsen

Got it.  Will fix it.

Thanks.

-Danny


On 4/25/23 12:41 AM, Herbert Xu wrote:

On Mon, Apr 24, 2023 at 02:47:23PM -0400, Danny Tsen wrote:

+static int chacha_p10_stream_xor(struct skcipher_request *req,
+const struct chacha_ctx *ctx, const u8 *iv)
+{
+   struct skcipher_walk walk;
+   u32 state[16];
+   int err;
+
+   err = skcipher_walk_virt(&walk, req, false);
+   if (err)
+   return err;
+
+   chacha_init_generic(state, ctx->key, iv);
+
+   while (walk.nbytes > 0) {
+   unsigned int nbytes = walk.nbytes;
+
+   if (nbytes < walk.total)
+   nbytes = rounddown(nbytes, walk.stride);
+
+   if (!static_branch_likely(&have_p10) ||

You don't need the static branch in the Crypto API code since
the registration is already conditional.

Cheers,
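
The simplification Herbert suggests here is visible in the v2 glue code earlier
in this archive: since the algorithms are only registered when P10 support is
detected, the walk loop in chacha_p10_stream_xor() only needs the
SIMD-usability check:

		if (!crypto_simd_usable()) {
			chacha_crypt_generic(state, walk.dst.virt.addr,
					     walk.src.virt.addr, nbytes,
					     ctx->nrounds);
		} else {
			vsx_begin();
			chacha_p10_do_8x(state, walk.dst.virt.addr,
					 walk.src.virt.addr, nbytes, ctx->nrounds);
			vsx_end();
		}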


Re: [PATCH 1/5] An optimized Chacha20 implementation with 8-way unrolling for ppc64le.

2023-04-24 Thread Danny Tsen

This is the recommended template to use for the IBM copyright.

Thanks.

-Danny

On 4/24/23 3:40 PM, Elliott, Robert (Servers) wrote:

+# Copyright 2023- IBM Inc. All rights reserved

I don't think any such entity exists - you probably mean IBM Corporation.


[PATCH 5/5] Update Kconfig and Makefile.

2023-04-24 Thread Danny Tsen
Defined CRYPTO_CHACHA20_P10 and CRYPTO_POLY1305_P10 in Kconfig to
support optimized implementation for Power10 and later CPU.

Added new module driver chacha-p10-crypto and poly1305-p10-crypto.

Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/Kconfig  | 26 ++
 arch/powerpc/crypto/Makefile |  4 
 2 files changed, 30 insertions(+)

diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 7113f9355165..f74d9dd6574b 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -111,4 +111,30 @@ config CRYPTO_AES_GCM_P10
  Support for cryptographic acceleration instructions on Power10 or
  later CPU. This module supports stitched acceleration for AES/GCM.
 
+config CRYPTO_CHACHA20_P10
+   tristate "Ciphers: ChaCha20, XChacha20, XChacha12 (P10 or later)"
+   depends on PPC64 && CPU_LITTLE_ENDIAN
+   select CRYPTO_SKCIPHER
+   select CRYPTO_LIB_CHACHA_GENERIC
+   select CRYPTO_ARCH_HAVE_LIB_CHACHA
+   help
+ Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
+ stream cipher algorithms
+
+ Architecture: PowerPC64
+ - Power10 or later
+ - Little-endian
+
+config CRYPTO_POLY1305_P10
+   tristate "Hash functions: Poly1305 (P10 or later)"
+   depends on PPC64 && CPU_LITTLE_ENDIAN
+   select CRYPTO_HASH
+   select CRYPTO_LIB_POLY1305_GENERIC
+   help
+ Poly1305 authenticator algorithm (RFC7539)
+
+ Architecture: PowerPC64
+ - Power10 or later
+ - Little-endian
+
 endmenu
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index 05c7486f42c5..cd5282eff451 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -14,6 +14,8 @@ obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o
 obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o
 obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
+obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
+obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
 
 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o 
aes-spe-glue.o
 md5-ppc-y := md5-asm.o md5-glue.o
@@ -23,6 +25,8 @@ sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o
 crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o
 crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o
 aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp8-ppc.o 
aesp8-ppc.o
+chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
+poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
 
 quiet_cmd_perl = PERL$@
   cmd_perl = $(PERL) $< $(if $(CONFIG_CPU_LITTLE_ENDIAN), linux-ppc64le, 
linux-ppc64) > $@
-- 
2.31.1



[PATCH 4/5] Glue code for optimized Poly1305 implementation for ppc64le.

2023-04-24 Thread Danny Tsen
Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/poly1305-p10-glue.c | 186 
 1 file changed, 186 insertions(+)
 create mode 100644 arch/powerpc/crypto/poly1305-p10-glue.c

diff --git a/arch/powerpc/crypto/poly1305-p10-glue.c 
b/arch/powerpc/crypto/poly1305-p10-glue.c
new file mode 100644
index ..b1800f7b6af8
--- /dev/null
+++ b/arch/powerpc/crypto/poly1305-p10-glue.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Poly1305 authenticator algorithm, RFC7539.
+ *
+ * Copyright 2023- IBM Inc. All rights reserved.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+asmlinkage void poly1305_p10le_4blocks(void *h, const u8 *m, u32 mlen);
+asmlinkage void poly1305_64s(void *h, const u8 *m, u32 mlen, int highbit);
+asmlinkage void poly1305_emit_64(void *h, void *s, u8 *dst);
+
+static void vsx_begin(void)
+{
+   preempt_disable();
+   enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+   disable_kernel_vsx();
+   preempt_enable();
+}
+
+static int crypto_poly1305_p10_init(struct shash_desc *desc)
+{
+   struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+   poly1305_core_init(&dctx->h);
+   dctx->buflen = 0;
+   dctx->rset = 0;
+   dctx->sset = false;
+
+   return 0;
+}
+
+static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
+  const u8 *inp, unsigned int len)
+{
+   unsigned int acc = 0;
+
+   if (unlikely(!dctx->sset)) {
+   if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
+   struct poly1305_core_key *key = &dctx->core_r;
+
+   key->key.r64[0] = get_unaligned_le64(&inp[0]);
+   key->key.r64[1] = get_unaligned_le64(&inp[8]);
+   inp += POLY1305_BLOCK_SIZE;
+   len -= POLY1305_BLOCK_SIZE;
+   acc += POLY1305_BLOCK_SIZE;
+   dctx->rset = 1;
+   }
+   if (len >= POLY1305_BLOCK_SIZE) {
+   dctx->s[0] = get_unaligned_le32(&inp[0]);
+   dctx->s[1] = get_unaligned_le32(&inp[4]);
+   dctx->s[2] = get_unaligned_le32(&inp[8]);
+   dctx->s[3] = get_unaligned_le32(&inp[12]);
+   acc += POLY1305_BLOCK_SIZE;
+   dctx->sset = true;
+   }
+   }
+   return acc;
+}
+
+static int crypto_poly1305_p10_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+   struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+   unsigned int bytes, used;
+
+   if (unlikely(dctx->buflen)) {
+   bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
+   memcpy(dctx->buf + dctx->buflen, src, bytes);
+   src += bytes;
+   srclen -= bytes;
+   dctx->buflen += bytes;
+
+   if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+   if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf,
+  POLY1305_BLOCK_SIZE))) {
+   vsx_begin();
+   poly1305_64s(&dctx->h, dctx->buf,
+ POLY1305_BLOCK_SIZE, 1);
+   vsx_end();
+   }
+   dctx->buflen = 0;
+   }
+   }
+
+   if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+   bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
+   used = crypto_poly1305_setdctxkey(dctx, src, bytes);
+   if (likely(used)) {
+   srclen -= used;
+   src += used;
+   }
+   if (srclen >= POLY1305_BLOCK_SIZE*4) {
+   vsx_begin();
+   poly1305_p10le_4blocks(&dctx->h, src, srclen);
+   vsx_end();
+   src += srclen - (srclen % (POLY1305_BLOCK_SIZE * 4));
+   srclen %= POLY1305_BLOCK_SIZE * 4;
+   }
+   while (srclen >= POLY1305_BLOCK_SIZE) {
+   vsx_begin();
+   poly1305_64s(&dctx->h, src, POLY1305_BLOCK_SIZE, 1);
+   vsx_end();
+   srclen -= POLY1305_BLOCK_SIZE;
+   src += POLY1305_BLOCK_SIZE;
+   }
+   }
+
+   if (unlikely(srclen)) {
+   dctx->buflen = srclen;
+   memcpy(dctx->buf, src, srclen);
+   }
+
+   return 0;
+}
+
+static int crypto_poly1305_p10_final(struct shash_desc *desc, u8 *dst)
+{
+   struct poly1305_desc_ctx *dctx = shash_de

[PATCH 2/5] Glue code for optimized Chacha20 implementation for ppc64le.

2023-04-24 Thread Danny Tsen
Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/chacha-p10-glue.c | 223 ++
 1 file changed, 223 insertions(+)
 create mode 100644 arch/powerpc/crypto/chacha-p10-glue.c

diff --git a/arch/powerpc/crypto/chacha-p10-glue.c 
b/arch/powerpc/crypto/chacha-p10-glue.c
new file mode 100644
index ..cefb150e7b3c
--- /dev/null
+++ b/arch/powerpc/crypto/chacha-p10-glue.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerPC P10 (ppc64le) accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright 2023- IBM Inc. All rights reserved.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+asmlinkage void chacha_p10le_8x(u32 *state, u8 *dst, const u8 *src,
+   unsigned int len, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
+
+static void vsx_begin(void)
+{
+   preempt_disable();
+   enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+   disable_kernel_vsx();
+   preempt_enable();
+}
+
+static void chacha_p10_do_8x(u32 *state, u8 *dst, const u8 *src,
+unsigned int bytes, int nrounds)
+{
+   unsigned int l = bytes & ~0x0FF;
+
+   if (l > 0) {
+   chacha_p10le_8x(state, dst, src, l, nrounds);
+   bytes -= l;
+   src += l;
+   dst += l;
+   state[12] += l / CHACHA_BLOCK_SIZE;
+   }
+
+   if (bytes > 0)
+   chacha_crypt_generic(state, dst, src, bytes, nrounds);
+}
+
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+   hchacha_block_generic(state, stream, nrounds);
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+   chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+  int nrounds)
+{
+   if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE ||
+   !crypto_simd_usable())
+   return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+   do {
+   unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+   vsx_begin();
+   chacha_p10_do_8x(state, dst, src, todo, nrounds);
+   vsx_end();
+
+   bytes -= todo;
+   src += todo;
+   dst += todo;
+   } while (bytes);
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+static int chacha_p10_stream_xor(struct skcipher_request *req,
+const struct chacha_ctx *ctx, const u8 *iv)
+{
+   struct skcipher_walk walk;
+   u32 state[16];
+   int err;
+
+   err = skcipher_walk_virt(&walk, req, false);
+   if (err)
+   return err;
+
+   chacha_init_generic(state, ctx->key, iv);
+
+   while (walk.nbytes > 0) {
+   unsigned int nbytes = walk.nbytes;
+
+   if (nbytes < walk.total)
+   nbytes = rounddown(nbytes, walk.stride);
+
+   if (!static_branch_likely(&have_p10) ||
+   !crypto_simd_usable()) {
+   chacha_crypt_generic(state, walk.dst.virt.addr,
+walk.src.virt.addr, nbytes,
+ctx->nrounds);
+   } else {
+   vsx_begin();
+   chacha_p10_do_8x(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes, ctx->nrounds);
+   vsx_end();
+   }
+   err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+   if (err)
+   break;
+   }
+
+   return err;
+}
+
+static int chacha_p10(struct skcipher_request *req)
+{
+   struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+   struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+   return chacha_p10_stream_xor(req, ctx, req->iv);
+}
+
+static int xchacha_p10(struct skcipher_request *req)
+{
+   struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+   struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+   struct chacha_ctx subctx;
+   u32 state[16];
+   u8 real_iv[16];
+
+   chacha_init_generic(state, ctx->key, req->iv);
+   hchacha_block_arch(state, subctx.key, ctx->nrounds);
+   subctx.nrounds = ctx->nrounds;
+
+   memcpy(&real_iv[0], req->iv + 24, 8);
+   memcpy(&real_iv[8], req->iv + 16, 8);
+   return chacha_p10_stream_xor(req, &subctx, real_iv);
+}
+
+static struct skcipher_alg algs[] = {
+   {
+   .base.cra_name  = "chacha20",
+   .base.cra_driver_name   = "chacha20-p10",
+   .base

[PATCH 3/5] An optimized Poly1305 implementation with 4-way unrolling for ppc64le.

2023-04-24 Thread Danny Tsen
Improve overall performance of Poly1305 for Power10 or later CPU.

Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/poly1305-p10le_64.S | 1075 +++
 1 file changed, 1075 insertions(+)
 create mode 100644 arch/powerpc/crypto/poly1305-p10le_64.S

diff --git a/arch/powerpc/crypto/poly1305-p10le_64.S 
b/arch/powerpc/crypto/poly1305-p10le_64.S
new file mode 100644
index ..22fd255af87e
--- /dev/null
+++ b/arch/powerpc/crypto/poly1305-p10le_64.S
@@ -0,0 +1,1075 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Accelerated poly1305 implementation for ppc64le.
+#
+# Copyright 2023- IBM Inc. All rights reserved
+#
+#===
+# Written by Danny Tsen 
+#
+# Poly1305 - this version mainly using vector/VSX/Scalar
+#  - 26 bits limbs
+#  - Handle multiple 64 byte blocks.
+#
+# Block size 16 bytes
+# key = (r, s)
+# clamp r &= 0x0FFC0FFC 0x0FFC0FFF
+# p = 2^130 - 5
+# a += m
+# a = (r + a) % p
+# a += s
+#
+# Improve performance by breaking down the polynomial to the sum of products with
+# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
+#
+#  07/22/21 - this revision based on the above sum of products.  Setup r^4, r^3, r^2, r and s3, s2, s1, s0
+# to 9 vectors for multiplications.
+#
+# setup r^4, r^3, r^2, r vectors
+#vs[r^1, r^3, r^2, r^4]
+#vs0 = [r0,.]
+#vs1 = [r1,.]
+#vs2 = [r2,.]
+#vs3 = [r3,.]
+#vs4 = [r4,.]
+#vs5 = [r1*5,...]
+#vs6 = [r2*5,...]
+#vs7 = [r2*5,...]
+#vs8 = [r4*5,...]
+#
+#  Each word in a vector consists of a member of a "r/s" in [a * r/s].
+#
+# r0, r4*5, r3*5, r2*5, r1*5;
+# r1, r0,   r4*5, r3*5, r2*5;
+# r2, r1,   r0,   r4*5, r3*5;
+# r3, r2,   r1,   r0,   r4*5;
+# r4, r3,   r2,   r1,   r0  ;
+#
+#
+# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
+#  k = 32 bytes key
+#  r3 = k (r, s)
+#  r4 = mlen
+#  r5 = m
+#
+#include 
+#include 
+#include 
+#include 
+
+.machine "any"
+
+.text
+
+.macro SAVE_GPR GPR OFFSET FRAME
+   std \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro SAVE_VRS VRS OFFSET FRAME
+   li  16, \OFFSET
+   stvx\VRS, 16, \FRAME
+.endm
+
+.macro SAVE_VSX VSX OFFSET FRAME
+   li  16, \OFFSET
+   stxvx   \VSX, 16, \FRAME
+.endm
+
+.macro RESTORE_GPR GPR OFFSET FRAME
+   ld  \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro RESTORE_VRS VRS OFFSET FRAME
+   li  16, \OFFSET
+   lvx \VRS, 16, \FRAME
+.endm
+
+.macro RESTORE_VSX VSX OFFSET FRAME
+   li  16, \OFFSET
+   lxvx\VSX, 16, \FRAME
+.endm
+
+.macro SAVE_REGS
+   mflr 0
+   std 0, 16(1)
+   stdu 1,-752(1)
+
+   SAVE_GPR 14, 112, 1
+   SAVE_GPR 15, 120, 1
+   SAVE_GPR 16, 128, 1
+   SAVE_GPR 17, 136, 1
+   SAVE_GPR 18, 144, 1
+   SAVE_GPR 19, 152, 1
+   SAVE_GPR 20, 160, 1
+   SAVE_GPR 21, 168, 1
+   SAVE_GPR 22, 176, 1
+   SAVE_GPR 23, 184, 1
+   SAVE_GPR 24, 192, 1
+   SAVE_GPR 25, 200, 1
+   SAVE_GPR 26, 208, 1
+   SAVE_GPR 27, 216, 1
+   SAVE_GPR 28, 224, 1
+   SAVE_GPR 29, 232, 1
+   SAVE_GPR 30, 240, 1
+   SAVE_GPR 31, 248, 1
+
+   addi9, 1, 256
+   SAVE_VRS 20, 0, 9
+   SAVE_VRS 21, 16, 9
+   SAVE_VRS 22, 32, 9
+   SAVE_VRS 23, 48, 9
+   SAVE_VRS 24, 64, 9
+   SAVE_VRS 25, 80, 9
+   SAVE_VRS 26, 96, 9
+   SAVE_VRS 27, 112, 9
+   SAVE_VRS 28, 128, 9
+   SAVE_VRS 29, 144, 9
+   SAVE_VRS 30, 160, 9
+   SAVE_VRS 31, 176, 9
+
+   SAVE_VSX 14, 192, 9
+   SAVE_VSX 15, 208, 9
+   SAVE_VSX 16, 224, 9
+   SAVE_VSX 17, 240, 9
+   SAVE_VSX 18, 256, 9
+   SAVE_VSX 19, 272, 9
+   SAVE_VSX 20, 288, 9
+   SAVE_VSX 21, 304, 9
+   SAVE_VSX 22, 320, 9
+   SAVE_VSX 23, 336, 9
+   SAVE_VSX 24, 352, 9
+   SAVE_VSX 25, 368, 9
+   SAVE_VSX 26, 384, 9
+   SAVE_VSX 27, 400, 9
+   SAVE_VSX 28, 416, 9
+   SAVE_VSX 29, 432, 9
+   SAVE_VSX 30, 448, 9
+   SAVE_VSX 31, 464, 9
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+   addi9, 1, 256
+   RESTORE_VRS 20, 0, 9
+   RESTORE_VRS 21, 16, 9
+   RESTORE_VRS 22, 32, 9
+   RESTORE_VRS 23, 48, 9
+   RESTORE_VRS 24, 64, 9
+   RESTORE_VRS 25, 80, 9
+   RESTORE_VRS 26, 96, 9
+   RESTORE_VRS 27, 112, 9
+   RESTORE_VRS 28, 128, 9
+   RESTORE_VRS 29, 144, 9
+   RESTORE_VRS 30, 160, 9
+   RESTORE_VRS 31, 176, 9
+
+   RESTORE_VSX 14, 192, 9
+   RESTORE_VSX 15, 208, 9
+   RESTORE_VSX 16, 224, 9
+   RESTORE_VSX 17, 240, 9
+   RESTORE_VSX 18, 256, 9
+   RESTORE_VSX 19, 272, 9
+   RESTORE_VSX 20, 288, 9
+   RESTORE_VSX 21, 304, 9
+   RESTORE_VSX 22, 320, 9
+   RESTORE_VSX 23, 336, 9
+   RESTORE_VSX 24, 352, 9
+   RESTORE_VSX 25, 368, 9
+   RESTORE_VSX 26, 384, 9
+   RESTORE_VSX 27, 400, 9
+   RESTORE_VSX 28, 416, 

[PATCH 1/5] An optimized Chacha20 implementation with 8-way unrolling for ppc64le.

2023-04-24 Thread Danny Tsen
Improve overall performance of chacha20 encrypt and decrypt operations
for Power10 or later CPU.

Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/chacha-p10le-8x.S | 842 ++
 1 file changed, 842 insertions(+)
 create mode 100644 arch/powerpc/crypto/chacha-p10le-8x.S

diff --git a/arch/powerpc/crypto/chacha-p10le-8x.S 
b/arch/powerpc/crypto/chacha-p10le-8x.S
new file mode 100644
index ..7c15d17101d7
--- /dev/null
+++ b/arch/powerpc/crypto/chacha-p10le-8x.S
@@ -0,0 +1,842 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Accelerated chacha20 implementation for ppc64le.
+#
+# Copyright 2023- IBM Inc. All rights reserved
+#
+#===
+# Written by Danny Tsen 
+#
+# chacha_p10le_8x(u32 *state, byte *dst, const byte *src,
+#   size_t len, int nrounds);
+#
+# do rounds,  8 quarter rounds
+# 1.  a += b; d ^= a; d <<<= 16;
+# 2.  c += d; b ^= c; b <<<= 12;
+# 3.  a += b; d ^= a; d <<<= 8;
+# 4.  c += d; b ^= c; b <<<= 7
+#
+# row1 = (row1 + row2),  row4 = row1 xor row4,  row4 rotate each word by 16
+# row3 = (row3 + row4),  row2 = row3 xor row2,  row2 rotate each word by 12
+# row1 = (row1 + row2), row4 = row1 xor row4,  row4 rotate each word by 8
+# row3 = (row3 + row4), row2 = row3 xor row2,  row2 rotate each word by 7
+#
+# 4 blocks (a b c d)
+#
+# a0 b0 c0 d0
+# a1 b1 c1 d1
+# ...
+# a4 b4 c4 d4
+# ...
+# a8 b8 c8 d8
+# ...
+# a12 b12 c12 d12
+# a13 ...
+# a14 ...
+# a15 b15 c15 d15
+#
+# Column round (v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
+#
+
+#include 
+#include 
+#include 
+#include 
+
+.machine   "any"
+.text
+
+.macro SAVE_GPR GPR OFFSET FRAME
+   std \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro SAVE_VRS VRS OFFSET FRAME
+   li  16, \OFFSET
+   stvx\VRS, 16, \FRAME
+.endm
+
+.macro SAVE_VSX VSX OFFSET FRAME
+   li  16, \OFFSET
+   stxvx   \VSX, 16, \FRAME
+.endm
+
+.macro RESTORE_GPR GPR OFFSET FRAME
+   ld  \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro RESTORE_VRS VRS OFFSET FRAME
+   li  16, \OFFSET
+   lvx \VRS, 16, \FRAME
+.endm
+
+.macro RESTORE_VSX VSX OFFSET FRAME
+   li  16, \OFFSET
+   lxvx\VSX, 16, \FRAME
+.endm
+
+.macro SAVE_REGS
+   mflr 0
+   std 0, 16(1)
+   stdu 1,-752(1)
+
+   SAVE_GPR 14, 112, 1
+   SAVE_GPR 15, 120, 1
+   SAVE_GPR 16, 128, 1
+   SAVE_GPR 17, 136, 1
+   SAVE_GPR 18, 144, 1
+   SAVE_GPR 19, 152, 1
+   SAVE_GPR 20, 160, 1
+   SAVE_GPR 21, 168, 1
+   SAVE_GPR 22, 176, 1
+   SAVE_GPR 23, 184, 1
+   SAVE_GPR 24, 192, 1
+   SAVE_GPR 25, 200, 1
+   SAVE_GPR 26, 208, 1
+   SAVE_GPR 27, 216, 1
+   SAVE_GPR 28, 224, 1
+   SAVE_GPR 29, 232, 1
+   SAVE_GPR 30, 240, 1
+   SAVE_GPR 31, 248, 1
+
+   addi9, 1, 256
+   SAVE_VRS 20, 0, 9
+   SAVE_VRS 21, 16, 9
+   SAVE_VRS 22, 32, 9
+   SAVE_VRS 23, 48, 9
+   SAVE_VRS 24, 64, 9
+   SAVE_VRS 25, 80, 9
+   SAVE_VRS 26, 96, 9
+   SAVE_VRS 27, 112, 9
+   SAVE_VRS 28, 128, 9
+   SAVE_VRS 29, 144, 9
+   SAVE_VRS 30, 160, 9
+   SAVE_VRS 31, 176, 9
+
+   SAVE_VSX 14, 192, 9
+   SAVE_VSX 15, 208, 9
+   SAVE_VSX 16, 224, 9
+   SAVE_VSX 17, 240, 9
+   SAVE_VSX 18, 256, 9
+   SAVE_VSX 19, 272, 9
+   SAVE_VSX 20, 288, 9
+   SAVE_VSX 21, 304, 9
+   SAVE_VSX 22, 320, 9
+   SAVE_VSX 23, 336, 9
+   SAVE_VSX 24, 352, 9
+   SAVE_VSX 25, 368, 9
+   SAVE_VSX 26, 384, 9
+   SAVE_VSX 27, 400, 9
+   SAVE_VSX 28, 416, 9
+   SAVE_VSX 29, 432, 9
+   SAVE_VSX 30, 448, 9
+   SAVE_VSX 31, 464, 9
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+   addi9, 1, 256
+   RESTORE_VRS 20, 0, 9
+   RESTORE_VRS 21, 16, 9
+   RESTORE_VRS 22, 32, 9
+   RESTORE_VRS 23, 48, 9
+   RESTORE_VRS 24, 64, 9
+   RESTORE_VRS 25, 80, 9
+   RESTORE_VRS 26, 96, 9
+   RESTORE_VRS 27, 112, 9
+   RESTORE_VRS 28, 128, 9
+   RESTORE_VRS 29, 144, 9
+   RESTORE_VRS 30, 160, 9
+   RESTORE_VRS 31, 176, 9
+
+   RESTORE_VSX 14, 192, 9
+   RESTORE_VSX 15, 208, 9
+   RESTORE_VSX 16, 224, 9
+   RESTORE_VSX 17, 240, 9
+   RESTORE_VSX 18, 256, 9
+   RESTORE_VSX 19, 272, 9
+   RESTORE_VSX 20, 288, 9
+   RESTORE_VSX 21, 304, 9
+   RESTORE_VSX 22, 320, 9
+   RESTORE_VSX 23, 336, 9
+   RESTORE_VSX 24, 352, 9
+   RESTORE_VSX 25, 368, 9
+   RESTORE_VSX 26, 384, 9
+   RESTORE_VSX 27, 400, 9
+   RESTORE_VSX 28, 416, 9
+   RESTORE_VSX 29, 432, 9
+   RESTORE_VSX 30, 448, 9
+   RESTORE_VSX 31, 464, 9
+
+   RESTORE_GPR 14, 112, 1
+   RESTORE_GPR 15, 120, 1
+   RESTORE_GPR 16, 128, 1
+   RESTORE

[PATCH 0/5] crypto: Accelerated Chacha20/Poly1305 implementation

2023-04-24 Thread Danny Tsen
This patch series provides an accelerated/optimized Chacha20 and Poly1305
implementation for Power10 or later CPU (ppc64le).  This module
implements the algorithm specified in RFC7539.  The implementation
provides 3.5X better performance than the baseline for Chacha20 and
Poly1305 individually and 1.5X improvement for Chacha20/Poly1305
operation.

This patch has been tested with the kernel crypto module tcrypt.ko and
has passed the selftest.  The patch is also tested with
CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.


Danny Tsen (5):
  An optimized Chacha20 implementation with 8-way unrolling for ppc64le.
  Glue code for optimized Chacha20 implementation for ppc64le.
  An optimized Poly1305 implementation with 4-way unrolling for ppc64le.
  Glue code for optimized Poly1305 implementation for ppc64le.
  Update Kconfig and Makefile.

 arch/powerpc/crypto/Kconfig |   26 +
 arch/powerpc/crypto/Makefile|4 +
 arch/powerpc/crypto/chacha-p10-glue.c   |  223 +
 arch/powerpc/crypto/chacha-p10le-8x.S   |  842 ++
 arch/powerpc/crypto/poly1305-p10-glue.c |  186 
 arch/powerpc/crypto/poly1305-p10le_64.S | 1075 +++
 6 files changed, 2356 insertions(+)
 create mode 100644 arch/powerpc/crypto/chacha-p10-glue.c
 create mode 100644 arch/powerpc/crypto/chacha-p10le-8x.S
 create mode 100644 arch/powerpc/crypto/poly1305-p10-glue.c
 create mode 100644 arch/powerpc/crypto/poly1305-p10le_64.S

-- 
2.31.1



Re: [PATCH v2 1/2] Remove POWER10_CPU dependency.

2023-04-14 Thread Danny Tsen

Thanks Michael.

-Danny

On 4/14/23 8:08 AM, Michael Ellerman wrote:

Danny Tsen  writes:

Remove Power10 dependency in Kconfig and detect Power10 feature at runtime.

... using the existing call to module_cpu_feature_match() :)


Signed-off-by: Danny Tsen 
---
  arch/powerpc/crypto/Kconfig | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

Acked-by: Michael Ellerman  (powerpc)

cheers


diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 1f8f02b494e1..7113f9355165 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -96,7 +96,7 @@ config CRYPTO_AES_PPC_SPE
  
  config CRYPTO_AES_GCM_P10

tristate "Stitched AES/GCM acceleration support on P10 or later CPU 
(PPC)"
-   depends on PPC64 && POWER10_CPU && CPU_LITTLE_ENDIAN
+   depends on PPC64 && CPU_LITTLE_ENDIAN
select CRYPTO_LIB_AES
select CRYPTO_ALGAPI
select CRYPTO_AEAD
--
2.31.1


[PATCH v2 2/2] Move Power10 feature, PPC_MODULE_FEATURE_P10.

2023-04-13 Thread Danny Tsen
Move Power10 feature, PPC_MODULE_FEATURE_P10, definition to be in
arch/powerpc/include/asm/cpufeature.h.

Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/aes-gcm-p10-glue.c | 1 -
 arch/powerpc/include/asm/cpufeature.h  | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/crypto/aes-gcm-p10-glue.c 
b/arch/powerpc/crypto/aes-gcm-p10-glue.c
index 1533c8cdd26f..bd3475f5348d 100644
--- a/arch/powerpc/crypto/aes-gcm-p10-glue.c
+++ b/arch/powerpc/crypto/aes-gcm-p10-glue.c
@@ -22,7 +22,6 @@
 #include 
 #include 
 
-#define PPC_MODULE_FEATURE_P10 (32 + ilog2(PPC_FEATURE2_ARCH_3_1))
 #definePPC_ALIGN   16
 #define GCM_IV_SIZE12
 
diff --git a/arch/powerpc/include/asm/cpufeature.h 
b/arch/powerpc/include/asm/cpufeature.h
index f6f790a90367..2dcc66225e7f 100644
--- a/arch/powerpc/include/asm/cpufeature.h
+++ b/arch/powerpc/include/asm/cpufeature.h
@@ -22,6 +22,7 @@
  */
 
 #define PPC_MODULE_FEATURE_VEC_CRYPTO  (32 + 
ilog2(PPC_FEATURE2_VEC_CRYPTO))
+#define PPC_MODULE_FEATURE_P10 (32 + 
ilog2(PPC_FEATURE2_ARCH_3_1))
 
 #define cpu_feature(x) (x)
 
-- 
2.31.1



[PATCH v2 0/2] Remove POWER10_CPU dependency and move PPC_MODULE_FEATURE_P10.

2023-04-13 Thread Danny Tsen
Remove Power10 dependency in Kconfig and detect Power10 feature at runtime.
Move PPC_MODULE_FEATURE_P10 definition to be in
arch/powerpc/include/asm/cpufeature.h.

Signed-off-by: Danny Tsen 

Danny Tsen (2):
  Kconfig: Remove POWER10_CPU dependency.
  aes-gcm-p10-glue.c, cpufeature.h: Move Power10 feature, 
PPC_MODULE_FEATURE_P10.

 arch/powerpc/crypto/Kconfig| 2 +-
 arch/powerpc/crypto/aes-gcm-p10-glue.c | 1 -
 arch/powerpc/include/asm/cpufeature.h  | 1 +
 3 files changed, 2 insertions(+), 2 deletions(-)

-- 
2.31.1



[PATCH v2 1/2] Remove POWER10_CPU dependency.

2023-04-13 Thread Danny Tsen
Remove Power10 dependency in Kconfig and detect Power10 feature at runtime.

Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 1f8f02b494e1..7113f9355165 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -96,7 +96,7 @@ config CRYPTO_AES_PPC_SPE
 
 config CRYPTO_AES_GCM_P10
tristate "Stitched AES/GCM acceleration support on P10 or later CPU 
(PPC)"
-   depends on PPC64 && POWER10_CPU && CPU_LITTLE_ENDIAN
+   depends on PPC64 && CPU_LITTLE_ENDIAN
select CRYPTO_LIB_AES
select CRYPTO_ALGAPI
select CRYPTO_AEAD
-- 
2.31.1



Re: [PATCH] Remove POWER10_CPU dependency and move PPC_MODULE_FEATURE_P10.

2023-04-13 Thread Danny Tsen



On 4/13/23 8:18 AM, Danny Tsen wrote:

Hi Michael,

If I do a separate patch for moving PPC_MODULE_FEATURE_P10, it will
break the build, since the macro is currently defined in aes-gcm-p10-glue.c.
And Power10 is detected when the module is loaded via
module_cpu_feature_match(PPC_MODULE_FEATURE_P10, p10_init); so the module
won't load if the CPU is not P10.


Thanks.

-Danny

On 4/13/23 8:12 AM, Michael Ellerman wrote:

Danny Tsen  writes:
Remove Power10 dependency in Kconfig and detect Power10 feature at 
runtime.

Move PPC_MODULE_FEATURE_P10 definition to be in
arch/powerpc/include/asm/cpufeature.h.

This should be two patches, one for the Kconfig change and one moving
the feature flag.


I think I misunderstood.  I can do two patches, one for the Kconfig change
and one moving the feature flag.  I'll fix it.


Thanks.

-Danny



Also don't you need a cpu feature check in p10_init()? Otherwise the
driver can be loaded on non-P10 CPUs, either by being built-in, or
manually.

cheers


Signed-off-by: Danny Tsen 
---
  arch/powerpc/crypto/Kconfig    | 2 +-
  arch/powerpc/crypto/aes-gcm-p10-glue.c | 1 -
  arch/powerpc/include/asm/cpufeature.h  | 1 +
  3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 1f8f02b494e1..7113f9355165 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -96,7 +96,7 @@ config CRYPTO_AES_PPC_SPE
    config CRYPTO_AES_GCM_P10
  tristate "Stitched AES/GCM acceleration support on P10 or 
later CPU (PPC)"

-    depends on PPC64 && POWER10_CPU && CPU_LITTLE_ENDIAN
+    depends on PPC64 && CPU_LITTLE_ENDIAN
  select CRYPTO_LIB_AES
  select CRYPTO_ALGAPI
  select CRYPTO_AEAD
diff --git a/arch/powerpc/crypto/aes-gcm-p10-glue.c 
b/arch/powerpc/crypto/aes-gcm-p10-glue.c

index 1533c8cdd26f..bd3475f5348d 100644
--- a/arch/powerpc/crypto/aes-gcm-p10-glue.c
+++ b/arch/powerpc/crypto/aes-gcm-p10-glue.c
@@ -22,7 +22,6 @@
  #include 
  #include 
  -#define PPC_MODULE_FEATURE_P10    (32 + 
ilog2(PPC_FEATURE2_ARCH_3_1))

  #define    PPC_ALIGN    16
  #define GCM_IV_SIZE    12
  diff --git a/arch/powerpc/include/asm/cpufeature.h 
b/arch/powerpc/include/asm/cpufeature.h

index f6f790a90367..2dcc66225e7f 100644
--- a/arch/powerpc/include/asm/cpufeature.h
+++ b/arch/powerpc/include/asm/cpufeature.h
@@ -22,6 +22,7 @@
   */
    #define PPC_MODULE_FEATURE_VEC_CRYPTO    (32 + 
ilog2(PPC_FEATURE2_VEC_CRYPTO))
+#define PPC_MODULE_FEATURE_P10    (32 + 
ilog2(PPC_FEATURE2_ARCH_3_1))

    #define cpu_feature(x)    (x)
  --
2.31.1


Re: [PATCH] Remove POWER10_CPU dependency and move PPC_MODULE_FEATURE_P10.

2023-04-13 Thread Danny Tsen

Hi Michael,

If I do a separate patch for moving PPC_MODULE_FEATURE_P10, it will
break the build, since the macro is currently defined in aes-gcm-p10-glue.c.
And Power10 is detected when the module is loaded via
module_cpu_feature_match(PPC_MODULE_FEATURE_P10, p10_init); so the module
won't load if the CPU is not P10.


Thanks.

-Danny
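
For reference, the runtime detection mentioned above hinges on
module_cpu_feature_match(), which ties the module's init routine to the
Power10 feature bit. A minimal sketch (p10_init() is the name used in this
thread; the registration call and alg name are illustrative only):

#include <linux/cpufeature.h>
#include <linux/module.h>

static int __init p10_init(void)
{
	/* Reached only when PPC_FEATURE2_ARCH_3_1 (Power10) is present,
	 * courtesy of module_cpu_feature_match() below; note this guard
	 * only applies when the driver is built as a module. */
	return crypto_register_aead(&gcm_aes_alg);	/* name illustrative */
}

static void __exit p10_exit(void)
{
	crypto_unregister_aead(&gcm_aes_alg);
}

module_cpu_feature_match(PPC_MODULE_FEATURE_P10, p10_init);
module_exit(p10_exit);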

On 4/13/23 8:12 AM, Michael Ellerman wrote:

Danny Tsen  writes:

Remove Power10 dependency in Kconfig and detect Power10 feature at runtime.
Move PPC_MODULE_FEATURE_P10 definition to be in
arch/powerpc/include/asm/cpufeature.h.

This should be two patches, one for the Kconfig change and one moving
the feature flag.

Also don't you need a cpu feature check in p10_init()? Otherwise the
driver can be loaded on non-P10 CPUs, either by being built-in, or
manually.

cheers
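
Michael's concern is that module_cpu_feature_match() only guards the modular
case; when the driver is built in (or force-loaded) the init routine still
runs, so a defensive runtime check inside p10_init() would look roughly like
this (CPU_FTR_ARCH_31 is the kernel's Power10 feature bit; the sketch extends
the one above and is illustrative only):

static int __init p10_init(void)
{
	/* Cover the built-in / force-loaded case as well. */
	if (!cpu_has_feature(CPU_FTR_ARCH_31))
		return -ENODEV;

	return crypto_register_aead(&gcm_aes_alg);	/* name illustrative */
}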


Signed-off-by: Danny Tsen 
---
  arch/powerpc/crypto/Kconfig| 2 +-
  arch/powerpc/crypto/aes-gcm-p10-glue.c | 1 -
  arch/powerpc/include/asm/cpufeature.h  | 1 +
  3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 1f8f02b494e1..7113f9355165 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -96,7 +96,7 @@ config CRYPTO_AES_PPC_SPE
  
  config CRYPTO_AES_GCM_P10

tristate "Stitched AES/GCM acceleration support on P10 or later CPU 
(PPC)"
-   depends on PPC64 && POWER10_CPU && CPU_LITTLE_ENDIAN
+   depends on PPC64 && CPU_LITTLE_ENDIAN
select CRYPTO_LIB_AES
select CRYPTO_ALGAPI
select CRYPTO_AEAD
diff --git a/arch/powerpc/crypto/aes-gcm-p10-glue.c 
b/arch/powerpc/crypto/aes-gcm-p10-glue.c
index 1533c8cdd26f..bd3475f5348d 100644
--- a/arch/powerpc/crypto/aes-gcm-p10-glue.c
+++ b/arch/powerpc/crypto/aes-gcm-p10-glue.c
@@ -22,7 +22,6 @@
  #include 
  #include 
  
-#define PPC_MODULE_FEATURE_P10	(32 + ilog2(PPC_FEATURE2_ARCH_3_1))

  #define   PPC_ALIGN   16
  #define GCM_IV_SIZE   12
  
diff --git a/arch/powerpc/include/asm/cpufeature.h b/arch/powerpc/include/asm/cpufeature.h

index f6f790a90367..2dcc66225e7f 100644
--- a/arch/powerpc/include/asm/cpufeature.h
+++ b/arch/powerpc/include/asm/cpufeature.h
@@ -22,6 +22,7 @@
   */
  
  #define PPC_MODULE_FEATURE_VEC_CRYPTO			(32 + ilog2(PPC_FEATURE2_VEC_CRYPTO))

+#define PPC_MODULE_FEATURE_P10 (32 + 
ilog2(PPC_FEATURE2_ARCH_3_1))
  
  #define cpu_feature(x)		(x)
  
--

2.31.1


[PATCH] Remove POWER10_CPU dependency and move PPC_MODULE_FEATURE_P10.

2023-04-13 Thread Danny Tsen
Remove Power10 dependency in Kconfig and detect Power10 feature at runtime.
Move PPC_MODULE_FEATURE_P10 definition to be in
arch/powerpc/include/asm/cpufeature.h.

Signed-off-by: Danny Tsen 
---
 arch/powerpc/crypto/Kconfig| 2 +-
 arch/powerpc/crypto/aes-gcm-p10-glue.c | 1 -
 arch/powerpc/include/asm/cpufeature.h  | 1 +
 3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 1f8f02b494e1..7113f9355165 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -96,7 +96,7 @@ config CRYPTO_AES_PPC_SPE
 
 config CRYPTO_AES_GCM_P10
tristate "Stitched AES/GCM acceleration support on P10 or later CPU 
(PPC)"
-   depends on PPC64 && POWER10_CPU && CPU_LITTLE_ENDIAN
+   depends on PPC64 && CPU_LITTLE_ENDIAN
select CRYPTO_LIB_AES
select CRYPTO_ALGAPI
select CRYPTO_AEAD
diff --git a/arch/powerpc/crypto/aes-gcm-p10-glue.c 
b/arch/powerpc/crypto/aes-gcm-p10-glue.c
index 1533c8cdd26f..bd3475f5348d 100644
--- a/arch/powerpc/crypto/aes-gcm-p10-glue.c
+++ b/arch/powerpc/crypto/aes-gcm-p10-glue.c
@@ -22,7 +22,6 @@
 #include 
 #include 
 
-#define PPC_MODULE_FEATURE_P10 (32 + ilog2(PPC_FEATURE2_ARCH_3_1))
 #definePPC_ALIGN   16
 #define GCM_IV_SIZE12
 
diff --git a/arch/powerpc/include/asm/cpufeature.h 
b/arch/powerpc/include/asm/cpufeature.h
index f6f790a90367..2dcc66225e7f 100644
--- a/arch/powerpc/include/asm/cpufeature.h
+++ b/arch/powerpc/include/asm/cpufeature.h
@@ -22,6 +22,7 @@
  */
 
 #define PPC_MODULE_FEATURE_VEC_CRYPTO  (32 + 
ilog2(PPC_FEATURE2_VEC_CRYPTO))
+#define PPC_MODULE_FEATURE_P10 (32 + 
ilog2(PPC_FEATURE2_ARCH_3_1))
 
 #define cpu_feature(x) (x)
 
-- 
2.31.1