[PATCH 2/2] powerpc64: Add optimized assembly for sha512-compress

2024-06-20 Thread Eric Richter
This patch introduces an optimized powerpc64 assembly implementation for
sha512-compress, derived from the implementation for sha256-compress-n.

The following data was captured on a POWER 10 LPAR @ ~4.050GHz

Current C implementation:
 Algorithm          mode    Mbyte/s
 sha512           update     474.00
 sha512_224       update     474.61
 sha512_256       update     474.15
 hmac-sha512    64 bytes     104.08
 hmac-sha512   256 bytes     220.42
 hmac-sha512  1024 bytes     368.58
 hmac-sha512  4096 bytes     436.27
 hmac-sha512  single msg     460.10

With optimized assembly:
 Algorithm          mode    Mbyte/s
 sha512           update     746.96
 sha512_224       update     746.96
 sha512_256       update     746.93
 hmac-sha512    64 bytes     150.54
 hmac-sha512   256 bytes     327.58
 hmac-sha512  1024 bytes     562.49
 hmac-sha512  4096 bytes     677.38
 hmac-sha512  single msg     713.06

Signed-off-by: Eric Richter 
---

This is a complete rewrite derived from the SHA256 implementation. The
benchmark numbers above have been updated; they were collected on a
different machine, which yields slightly different baseline results.


 fat-ppc.c                           |  10 +
 powerpc64/fat/sha512-compress-2.asm |  36 +++
 powerpc64/p8/sha512-compress.asm    | 336 ++++++++++++++++++++++++++++++++
 3 files changed, 382 insertions(+)
 create mode 100644 powerpc64/fat/sha512-compress-2.asm
 create mode 100644 powerpc64/p8/sha512-compress.asm

diff --git a/fat-ppc.c b/fat-ppc.c
index aaccc116..5b6efd10 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -215,6 +215,10 @@ DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func)
 DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c)
 DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64)
 
+DECLARE_FAT_FUNC(_nettle_sha512_compress, sha512_compress_func)
+DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, c)
+DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, ppc64)
+
 /* Nop implementation for _gcm_aes_encrypt and _gcm_aes_decrypt. */
 static size_t
 gcm_aes_crypt_c (struct gcm_key *key UNUSED, unsigned rounds UNUSED,
@@ -253,6 +257,7 @@ fat_init (void)
   _nettle_gcm_aes_encrypt_vec = _nettle_gcm_aes_encrypt_ppc64;
   _nettle_gcm_aes_decrypt_vec = _nettle_gcm_aes_decrypt_ppc64;
   _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64;
+  _nettle_sha512_compress_vec = _nettle_sha512_compress_ppc64;
 }
   else
 {
@@ -264,6 +269,7 @@ fat_init (void)
   _nettle_gcm_aes_encrypt_vec = gcm_aes_crypt_c;
   _nettle_gcm_aes_decrypt_vec = gcm_aes_crypt_c;
   _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c;
+  _nettle_sha512_compress_vec = _nettle_sha512_compress_c;
 }
   if (features.have_altivec)
 {
@@ -378,3 +384,7 @@ DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
(uint32_t *state, const uint32_t *k,
 size_t blocks, const uint8_t *input),
(state, k, blocks, input))
+
+DEFINE_FAT_FUNC(_nettle_sha512_compress, void,
+   (uint64_t *state, const uint8_t *input, const uint64_t *k),
+   (state, input, k))
diff --git a/powerpc64/fat/sha512-compress-2.asm b/powerpc64/fat/sha512-compress-2.asm
new file mode 100644
index 00000000..9445e5ba
--- /dev/null
+++ b/powerpc64/fat/sha512-compress-2.asm
@@ -0,0 +1,36 @@
+C powerpc64/fat/sha512-compress-2.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   or
+
+ * the GNU General Public License as published by the Free
+   Software Foundation; either version 2 of the License, or (at your
+   option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_sha512_compress) picked up by configure
+
+define(`fat_transform', `$1_ppc64')
+include_src(`powerpc64/p8/sha512-compress.asm')
diff --git a/powerpc64/p8/sha512-compress.asm b/powerpc64/p8/sha512-compress.asm
new file mode 100644
index 00000000..bf182a45
--- /dev/null
+++ b/powerpc64/p8/sha512-compress.asm
@@ -0,0 +1,336 @@
+C powerpc64/p8/sha512-compress.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation

[PATCH 1/2] powerpc64: remove use of m4_unquote in the load step for sha256

2024-06-20 Thread Eric Richter
By passing the constant offset value into the LOAD macro, the use of
m4_unquote to compute the name of the correct constant GPR can be
avoided, improving readability.
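
As an illustration, LOAD(3, TC12) now expands in the big-endian case to

   lxvw4x  VSR(IV(3)), TC12, INPUT

where TC12 names one of the GPRs preloaded with the constant offsets
0/4/8/12, instead of the offset register name being computed with
m4_unquote at expansion time.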

Signed-off-by: Eric Richter 
---
 powerpc64/p8/sha256-compress-n.asm | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/powerpc64/p8/sha256-compress-n.asm b/powerpc64/p8/sha256-compress-n.asm
index 4848461e..309db1fa 100644
--- a/powerpc64/p8/sha256-compress-n.asm
+++ b/powerpc64/p8/sha256-compress-n.asm
@@ -177,34 +177,34 @@ define(`EXTENDROUNDS', `
 ')
 
 define(`LOAD', `
-   IF_BE(`lxvw4x   VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT')
+   IF_BE(`lxvw4x   VSR(IV($1)), $2, INPUT')
IF_LE(`
-   lxvd2x  VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT
+   lxvd2x  VSR(IV($1)), $2, INPUT
vperm   IV($1), IV($1), IV($1), VT0
')
 ')
 
 define(`DOLOADS', `
IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
-   LOAD(0)
-   LOAD(1)
-   LOAD(2)
-   LOAD(3)
+   LOAD(0, TC0)
+   LOAD(1, TC4)
+   LOAD(2, TC8)
+   LOAD(3, TC12)
   addi    INPUT, INPUT, 16
-   LOAD(4)
-   LOAD(5)
-   LOAD(6)
-   LOAD(7)
+   LOAD(4, TC0)
+   LOAD(5, TC4)
+   LOAD(6, TC8)
+   LOAD(7, TC12)
   addi    INPUT, INPUT, 16
-   LOAD(8)
-   LOAD(9)
-   LOAD(10)
-   LOAD(11)
+   LOAD(8, TC0)
+   LOAD(9, TC4)
+   LOAD(10, TC8)
+   LOAD(11, TC12)
   addi    INPUT, INPUT, 16
-   LOAD(12)
-   LOAD(13)
-   LOAD(14)
-   LOAD(15)
+   LOAD(12, TC0)
+   LOAD(13, TC4)
+   LOAD(14, TC8)
+   LOAD(15, TC12)
   addi    INPUT, INPUT, 16
 ')
 
-- 
2.45.2



Re: [PATCH v3] powerpc64: Add optimized assembly for sha256-compress-n

2024-06-18 Thread Eric Richter
On Fri, 2024-06-07 at 14:08 +0200, Niels Möller wrote:
> Eric Richter  writes:
> > +C ROUND(A B C D E F G H R EXT)
> > +define(`ROUND', `
> > +
> > + vadduwm VT1, VK, IV($9)   C VT1: k+W
> > + vadduwm VT4, $8, VT1  C VT4: H+k+W
> > +
> > + lxvw4x VSR(VK), TK, K    C Load Key
> > + addi TK, TK, 4   C Increment Pointer to next key
> > +
> > + vadduwm VT2, $4, $8   C VT2: H+D
> > + vadduwm VT2, VT2, VT1 C VT2: H+D+k+W
> 
> Could the above two instructions be changed to
> 
>  vadduwm VT2, VT4, $4    C Should be the same, (H+k+W) + D
> 
> (which would need one less register)? I realize there's a slight change
> in the dependency chain. Do you know how many cycles one of these rounds
> takes, and what's the bottleneck (I would guess either latency of the
> dependency chain between rounds, or throughput of one of the execution
> units, or instruction issue rate).
> 

Theoretically it should be about 10 cycles per round, but the actual
measured performance doesn't quite hit that due to various quirks with
scheduling.

With this change, I'm getting about a +1 MB/s gain on hmac 256 bytes,
but a slight loss of speed for the rest.

> > +define(`LOAD', `
> > + IF_BE(`lxvw4x VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT')
> > + IF_LE(`
> > + lxvd2x VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT
> > + vperm IV($1), IV($1), IV($1), VT0
> > + ')
> > +')
> > +
> > +define(`DOLOADS', `
> > + IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
> > + LOAD(0)
> > + LOAD(1)
> > + LOAD(2)
> > + LOAD(3)
> 
> If you pass the right TCx register as argument to the load macro, you
> don't need the m4 eval thing, which could make it a bit more
> readable, imo.
> 
> > + C Store non-volatile registers
> > +
> > + li T0, -8
> > + li T1, -24
> > + stvx v20, T0, SP
> > + stvx v21, T1, SP
> > + subi T0, T0, 32
> > + subi T1, T1, 32
> 
> This could probably be arranged with fewer instructions by having one
> register that is decremented as we move down in the guard area, and
> registers with constant values for indexing.
> 
> > + C Reload initial state from VSX registers
> > + xxlor VSR(VT0), VSXA, VSXA
> > + xxlor VSR(VT1), VSXB, VSXB
> > + xxlor VSR(VT2), VSXC, VSXC
> > + xxlor VSR(VT3), VSXD, VSXD
> > + xxlor VSR(VT4), VSXE, VSXE
> > + xxlor VSR(SIGA), VSXF, VSXF
> > + xxlor VSR(SIGE), VSXG, VSXG
> > + xxlor VSR(VK), VSXH, VSXH
> > +
> > + vadduwm VSA, VSA, VT0
> > + vadduwm VSB, VSB, VT1
> > + vadduwm VSC, VSC, VT2
> > + vadduwm VSD, VSD, VT3
> > + vadduwm VSE, VSE, VT4
> > + vadduwm VSF, VSF, SIGA
> > + vadduwm VSG, VSG, SIGE
> > + vadduwm VSH, VSH, VK
> 
> It's a pity that there seems to be no useful xxadd* instructions? Do
> you need all eight temporary registers, or would you get the same speed
> doing just four at a time, i.e., 4 xxlor instructions, 4 vadduwm, 4
> xxlor, 4 vadduwm? There's no alias "xxmov" or the like that could be
> used instead of xxlor?
> 

Unfortunately most of the VSX instructions (particularly those in the
p8 ISA) are for floating point operations; using them in this way is a
bit of a hack. I'll test four at a time, but it will likely be similar
performance unless the xxlors are issued on a different unit.

I'm not aware of an xxmov/xxmr extended mnemonic, but this could always
be macroed instead for clarity.
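
Something along these lines would work as a minimal sketch (xxlor with
both source operands equal is effectively a register move):

   C xxmov(DST, SRC): copy one VSX register to another via OR
   define(`xxmov', `xxlor $1, $2, $2')

so the reload sequence could then read, e.g., xxmov(VSR(VT0), VSXA).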

> Thanks for the update!
> /Niels
> 

Thanks for merging! I'll have a clean-up patch up soon, hopefully with
the SHA512 implementation as well.


[PATCH v3] powerpc64: Add optimized assembly for sha256-compress-n

2024-06-04 Thread Eric Richter
This patch introduces an optimized powerpc64 assembly implementation for
sha256-compress-n. This takes advantage of the vshasigma instruction, as
well as unrolling loops to best take advantage of running instructions
in parallel.

The following data was captured on a POWER 10 LPAR @ ~3.896GHz

Current C implementation:
 Algorithm          mode    Mbyte/s
 sha256           update     280.97
 hmac-sha256    64 bytes      80.81
 hmac-sha256   256 bytes     170.50
 hmac-sha256  1024 bytes     241.92
 hmac-sha256  4096 bytes     268.54
 hmac-sha256  single msg     276.16

With optimized assembly:
 Algorithm          mode    Mbyte/s
 sha256           update     461.45
 hmac-sha256    64 bytes     123.88
 hmac-sha256   256 bytes     268.81
 hmac-sha256  1024 bytes     390.91
 hmac-sha256  4096 bytes     438.02
 hmac-sha256  single msg     453.83

Signed-off-by: Eric Richter 
---

I split this patch to be standalone, rather than delay even further trying
to update SHA512 -- I will update the SHA512 implementation when this one
stabilizes.

Regarding the load vperm needed for little endian: unfortunately we don't
have a spare vector register to store the mask between rounds, so the best
that can be done while maintaining p8 support will be to store the mask in
a VSX register like the state values, and avoid the load. This is a
negligible performance change however, yielding around +1MB/s on larger
block counts (update, hmac 1024/4096/single msg) and -1MB/s on smaller
(hmac 64/256).

Dropping p8 support would allow the use of the lxvb16x instruction, which
needs no permute; however, that too is a negligible performance
improvement, at the cost of dropping a whole cpu set. So I see a few
options (a sketch of the two load sequences follows the list):
A) leave as-is, consider storing the mask in a VSX register
B) drop p8 support, use lxvb16x
C) have a compile-time switch to use permute on p8, and use the single
   instruction for p9 and up.
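
For reference, a sketch of the two load sequences under discussion,
with IV(0) as the destination and VT0 assumed to hold the byte-swap
mask:

   C p8-compatible little-endian load: doubleword load plus byte permute
   lxvd2x  VSR(IV(0)), 0, INPUT
   vperm   IV(0), IV(0), IV(0), VT0

   C p9-and-up alternative: endianness-correct byte load, no permute
   lxvb16x VSR(IV(0)), 0, INPUT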


v3:
 - use protected zone instead of allocating stack space
 - add GPR constants for multiples of 4 for loads
   - around +3.4 MB/s for sha256 update
 - move extend logic to its own macro called by EXTENDROUND
 - use 8 VSX registers to store previous state instead of the stack
   - around +11.0 MB/s for sha256 update

 fat-ppc.c                             |  12 +
 powerpc64/fat/sha256-compress-n-2.asm |  36 +++
 powerpc64/p8/sha256-compress-n.asm    | 364 ++++++++++++++++++++++++++++++
 3 files changed, 412 insertions(+)
 create mode 100644 powerpc64/fat/sha256-compress-n-2.asm
 create mode 100644 powerpc64/p8/sha256-compress-n.asm

diff --git a/fat-ppc.c b/fat-ppc.c
index cd76f7a1..efbeb2ec 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -203,6 +203,10 @@ DECLARE_FAT_FUNC(_nettle_poly1305_blocks, poly1305_blocks_func)
 DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, c)
 DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, ppc64)
 
+DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64)
+
 
 static void CONSTRUCTOR
 fat_init (void)
@@ -231,6 +235,8 @@ fat_init (void)
  _nettle_ghash_update_arm64() */
   _nettle_ghash_set_key_vec = _nettle_ghash_set_key_ppc64;
   _nettle_ghash_update_vec = _nettle_ghash_update_ppc64;
+
+  _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64;
 }
   else
 {
@@ -239,6 +245,7 @@ fat_init (void)
   _nettle_aes_invert_vec = _nettle_aes_invert_c;
   _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c;
   _nettle_ghash_update_vec = _nettle_ghash_update_c;
+  _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c;
 }
   if (features.have_altivec)
 {
@@ -338,3 +345,8 @@ DEFINE_FAT_FUNC(_nettle_poly1305_blocks, const uint8_t *,
  size_t blocks,
 const uint8_t *m),
(ctx, blocks, m))
+
+DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
+   (uint32_t *state, const uint32_t *k,
+size_t blocks, const uint8_t *input),
+   (state, k, blocks, input))
diff --git a/powerpc64/fat/sha256-compress-n-2.asm b/powerpc64/fat/sha256-compress-n-2.asm
new file mode 100644
index 00000000..4f4eee9d
--- /dev/null
+++ b/powerpc64/fat/sha256-compress-n-2.asm
@@ -0,0 +1,36 @@
+C powerpc64/fat/sha256-compress-n-2.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   or
+
+ * the GNU General Public License as published by the Free
+   Software Foundation; either version 2 of the License, or (at your

Re: [PATCH v2 1/2] powerpc64: Add optimized assembly for sha256-compress-n

2024-05-07 Thread Eric Richter
On Sun, 2024-05-05 at 16:10 +0200, Niels Möller wrote:
> Eric Richter  writes:
> 
> > This patch introduces an optimized powerpc64 assembly
> > implementation for
> > sha256-compress-n. This takes advantage of the vshasigma
> > instruction, as
> > well as unrolling loops to best take advantage of running
> > instructions
> > in parallel.
> 
> Thanks. I'm now having a closer read of the assembly code. Comments
> below.
> 
> > +C ROUND(A B C D E F G H R EXT)
> > +define(`ROUND', `
> > +
> > +   vadduwm VT1, VK, IV($9)   C VT1: k+W
> > +   vadduwm VT4, $8, VT1  C VT4: H+k+W
> > +
> > +   lxvw4x  VSR(VK), TK, K        C Load Key
> > +   addi    TK, TK, 4             C Increment pointer to next key
> > +
> > +   vadduwm VT2, $4, $8           C VT2: H+D
> > +   vadduwm VT2, VT2, VT1         C VT2: H+D+k+W
> > +
> > +   vshasigmaw  SIGE, $5, 1, 0b1111   C Sigma(E)  Se
> > +   vshasigmaw  SIGA, $1, 1, 0        C Sigma(A)  Sa
> > +
> > +   vxor    VT3, $2, $3           C VT3: b^c
> > +   vsel    VT0, $7, $6, $5       C VT0: Ch.
> > +   vsel    VT3, $3, $1, VT3      C VT3: Maj(a,b,c)
> > +
> > +   vadduwm VT4, VT4, VT0         C VT4: Hkw + Ch.
> > +   vadduwm VT3, VT3, VT4         C VT3: HkW + Ch. + Maj.
> > +
> > +   vadduwm VT0, VT0, VT2         C VT0: Ch. + DHKW
> > +   vadduwm $8, SIGE, SIGA        C Anext: Se + Sa
> > +   vadduwm $4, VT0, SIGE         C Dnext: Ch. + DHKW + Se
> > +   vadduwm $8, $8, VT3           C Anext: Se+Sa+HkW+Ch.+Maj.
> > +
> > +
> > +   C Schedule (data) for 16th round in future
> > +   C Extend W[i]
> > +   ifelse(`$10', `1', `
> > +   vshasigmaw  SIGE, IV($9 + 14), 0, 0b1111
> > +   vshasigmaw  SIGA, IV($9 + 1), 0, 0b0000
> > +   vadduwm IV($9), IV($9), SIGE
> > +   vadduwm IV($9), IV($9), SIGA
> > +   vadduwm IV($9), IV($9), IV($9 + 9)
> > +   ')
> > +')
> 
> I think it would be a bit simpler to take out the extend logic to its
> own macro. 
> 
> > +define(`EXTENDROUND',  `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9, 1)')
> 
> If you do that, then you would define
>   
>   define(`EXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9) EXTEND($9)')
> 
> (In other related code, input expansion is done at the beginning of a
> round iteration rather than at the end, but doing it at the end like
> you do may be better scheduling).
> 

Makes sense, I'll move that extend logic into its own macro.

You are correct, the expansion logic was moved to the end of the round
to improve scheduling on the CPU. The vshasigma instructions take more
cycles and are scheduled on a different unit than the other arithmetic
operations. Placing them at the end lets them execute in parallel with
the beginning of the next round, as there are no dependent registers
until the next in-round vshasigma instructions.
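
Concretely, the split might look like this (a sketch using the same
register names and sigma masks as the patch):

   define(`EXTEND', `
      vshasigmaw  SIGE, IV($1 + 14), 0, 0b1111  C s1(W[i+14])
      vshasigmaw  SIGA, IV($1 + 1), 0, 0b0000   C s0(W[i+1])
      vadduwm IV($1), IV($1), SIGE
      vadduwm IV($1), IV($1), SIGA
      vadduwm IV($1), IV($1), IV($1 + 9)        C W[i] += s1 + s0 + W[i+9]
   ')
   define(`EXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9) EXTEND($9)')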

> > +define(`LOAD', `
> > +   IF_BE(`lxvw4x   VSR(IV($1)), 0, INPUT')
> > +   IF_LE(`
> > +   lxvd2x  VSR(IV($1)), 0, INPUT
> > +   vperm   IV($1), IV($1), IV($1), VT0
> > +   ')
> > +   addi    INPUT, INPUT, 4
> > +')
> > +
> > +define(`DOLOADS', `
> > +   IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
> 
> Could you have a dedicated register for the permutation constant, and
> load it only once at function entry? If you have general registers to
> spare, it could also make sense to use, e.g., three registers for the
> contant values 16, 32, 48, and use for indexing. Then you don't need
> to
> update the INPUT pointer as often, and you can use the same constants
> for other load/store sequences as well.
> 

There are plenty of GPRs to spare, I will test and bench a few options
for using more GPRs as indexes.

As for VRs, unfortunately the current implementation uses all 32 VRs:
 16 for W[i]
 8 for state
 7 for round arithmetic (two of these specifically for sigma, to avoid
   a dependency bubble)
 1 for storing the key constant K

That said, I'm going to experiment with some VSX instructions to see if
it is possible to spill over certain operations into VSRs, without
needing an explicit copy back from VSR to VR.
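
A minimal sketch of that idea, mirroring the state save/restore already
in the patch (with VSXA standing in for one of the otherwise-unused
VSRs):

   C stash a vector register in a spare VSR without touching the stack
   xxlor   VSXA, VSR(VSA), VSR(VSA)
   ...
   C copy it back into a VR when vector arithmetic is needed again
   xxlor   VSR(VT0), VSXA, VSXA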

> > +   LOAD(0)
> > +   LOAD(1)
> > +   LOAD(2)
> > +   LOAD(3)
> 
> > +PROLOGUE(_nettle_sha256_compress_n)
> > +   cmpwi   0, NUMBLOCKS, 0
> > +   ble 0, .done
> > +   mtctr   NUMBLOCKS
> > +

[PATCH v2 1/2] powerpc64: Add optimized assembly for sha256-compress-n

2024-04-18 Thread Eric Richter
This patch introduces an optimized powerpc64 assembly implementation for
sha256-compress-n. This takes advantage of the vshasigma instruction, as
well as unrolling loops to best take advantage of running instructions
in parallel.

The following data was captured on a POWER 10 LPAR @ ~3.896GHz

Current C implementation:
 Algorithm          mode    Mbyte/s
 sha256           update     280.97
 hmac-sha256    64 bytes      80.81
 hmac-sha256   256 bytes     170.50
 hmac-sha256  1024 bytes     241.92
 hmac-sha256  4096 bytes     268.54
 hmac-sha256  single msg     276.16

With optimized assembly:
 Algorithm          mode    Mbyte/s
 sha256           update     446.42
 hmac-sha256    64 bytes     124.89
 hmac-sha256   256 bytes     268.90
 hmac-sha256  1024 bytes     382.06
 hmac-sha256  4096 bytes     425.38
 hmac-sha256  single msg     439.75

Signed-off-by: Eric Richter 
---
 fat-ppc.c                             |  12 +
 powerpc64/fat/sha256-compress-n-2.asm |  36 +++
 powerpc64/p8/sha256-compress-n.asm    | 323 ++++++++++++++++++++++++++++++
 3 files changed, 371 insertions(+)
 create mode 100644 powerpc64/fat/sha256-compress-n-2.asm
 create mode 100644 powerpc64/p8/sha256-compress-n.asm

diff --git a/fat-ppc.c b/fat-ppc.c
index cd76f7a1..efbeb2ec 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -203,6 +203,10 @@ DECLARE_FAT_FUNC(_nettle_poly1305_blocks, poly1305_blocks_func)
 DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, c)
 DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, ppc64)
 
+DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64)
+
 
 static void CONSTRUCTOR
 fat_init (void)
@@ -231,6 +235,8 @@ fat_init (void)
  _nettle_ghash_update_arm64() */
   _nettle_ghash_set_key_vec = _nettle_ghash_set_key_ppc64;
   _nettle_ghash_update_vec = _nettle_ghash_update_ppc64;
+
+  _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64;
 }
   else
 {
@@ -239,6 +245,7 @@ fat_init (void)
   _nettle_aes_invert_vec = _nettle_aes_invert_c;
   _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c;
   _nettle_ghash_update_vec = _nettle_ghash_update_c;
+  _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c;
 }
   if (features.have_altivec)
 {
@@ -338,3 +345,8 @@ DEFINE_FAT_FUNC(_nettle_poly1305_blocks, const uint8_t *,
  size_t blocks,
 const uint8_t *m),
(ctx, blocks, m))
+
+DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
+   (uint32_t *state, const uint32_t *k,
+size_t blocks, const uint8_t *input),
+   (state, k, blocks, input))
diff --git a/powerpc64/fat/sha256-compress-n-2.asm b/powerpc64/fat/sha256-compress-n-2.asm
new file mode 100644
index 00000000..4f4eee9d
--- /dev/null
+++ b/powerpc64/fat/sha256-compress-n-2.asm
@@ -0,0 +1,36 @@
+C powerpc64/fat/sha256-compress-n-2.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   or
+
+ * the GNU General Public License as published by the Free
+   Software Foundation; either version 2 of the License, or (at your
+   option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_sha256_compress_n) picked up by configure
+
+define(`fat_transform', `$1_ppc64')
+include_src(`powerpc64/p8/sha256-compress-n.asm')
diff --git a/powerpc64/p8/sha256-compress-n.asm b/powerpc64/p8/sha256-compress-n.asm
new file mode 100644
index 00000000..d76f337e
--- /dev/null
+++ b/powerpc64/p8/sha256-compress-n.asm
@@ -0,0 +1,323 @@
+C powerpc64/p8/sha256-compress-n.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   or
+
+ * the GNU General Public License as published

[PATCH v2 2/2] powerpc64: Add optimized assembly for sha512-compress

2024-04-18 Thread Eric Richter
This patch introduces an optimized powerpc64 assembly implementation for
sha512-compress, derived from the implementation for sha256-compress-n.

The following data was captured on a POWER 10 LPAR @ ~3.896GHz

Current C implementation:
 Algorithm          mode    Mbyte/s
 sha512           update     447.02
 sha512-224       update     444.30
 sha512-256       update     445.02
 hmac-sha512    64 bytes      97.27
 hmac-sha512   256 bytes     204.55
 hmac-sha512  1024 bytes     342.86
 hmac-sha512  4096 bytes     409.57
 hmac-sha512  single msg     433.95

With optimized assembly:
 Algorithm          mode    Mbyte/s
 sha512           update     705.36
 sha512-224       update     705.63
 sha512-256       update     705.34
 hmac-sha512    64 bytes     141.66
 hmac-sha512   256 bytes     310.26
 hmac-sha512  1024 bytes     534.22
 hmac-sha512  4096 bytes     641.74
 hmac-sha512  single msg     677.14

Signed-off-by: Eric Richter 
---
 fat-ppc.c                           |  10 +
 powerpc64/fat/sha512-compress-2.asm |  36 +++
 powerpc64/p8/sha512-compress.asm    | 327 ++++++++++++++++++++++++++++++++
 3 files changed, 373 insertions(+)
 create mode 100644 powerpc64/fat/sha512-compress-2.asm
 create mode 100644 powerpc64/p8/sha512-compress.asm

diff --git a/fat-ppc.c b/fat-ppc.c
index efbeb2ec..a228386a 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -207,6 +207,10 @@ DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func)
 DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c)
 DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64)
 
+DECLARE_FAT_FUNC(_nettle_sha512_compress, sha512_compress_func)
+DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, c)
+DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, ppc64)
+
 
 static void CONSTRUCTOR
 fat_init (void)
@@ -237,6 +241,7 @@ fat_init (void)
   _nettle_ghash_update_vec = _nettle_ghash_update_ppc64;
 
   _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64;
+  _nettle_sha512_compress_vec = _nettle_sha512_compress_ppc64;
 }
   else
 {
@@ -246,6 +251,7 @@ fat_init (void)
   _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c;
   _nettle_ghash_update_vec = _nettle_ghash_update_c;
   _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c;
+  _nettle_sha512_compress_vec = _nettle_sha512_compress_c;
 }
   if (features.have_altivec)
 {
@@ -350,3 +356,7 @@ DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
(uint32_t *state, const uint32_t *k,
 size_t blocks, const uint8_t *input),
(state, k, blocks, input))
+
+DEFINE_FAT_FUNC(_nettle_sha512_compress, void,
+   (uint64_t *state, const uint8_t *input, const uint64_t *k),
+   (state, input, k))
diff --git a/powerpc64/fat/sha512-compress-2.asm b/powerpc64/fat/sha512-compress-2.asm
new file mode 100644
index 00000000..9445e5ba
--- /dev/null
+++ b/powerpc64/fat/sha512-compress-2.asm
@@ -0,0 +1,36 @@
+C powerpc64/fat/sha512-compress-2.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   or
+
+ * the GNU General Public License as published by the Free
+   Software Foundation; either version 2 of the License, or (at your
+   option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_sha512_compress) picked up by configure
+
+define(`fat_transform', `$1_ppc64')
+include_src(`powerpc64/p8/sha512-compress.asm')
diff --git a/powerpc64/p8/sha512-compress.asm b/powerpc64/p8/sha512-compress.asm
new file mode 100644
index 00000000..83fe0e36
--- /dev/null
+++ b/powerpc64/p8/sha512-compress.asm
@@ -0,0 +1,327 @@
+C powerpc64/p8/sha512-compress.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   or
+
+ * the GNU General

[PATCH v2 0/2] Add optimized powerpc64 assembly for SHA2

2024-04-18 Thread Eric Richter
I've updated this set to use the proper conventions for register names, and
also adjusted the IV macro according to the suggestions provided.

I can also confirm that I've gotten a working build environment based on
the approach in the GitLab CI configuration, and that the ppc64
big-endian build does indeed pass tests.


Amended original cover letter:

This set introduces an optimized powerpc64 assembly implementation for
SHA256 and SHA512. These have been derived from BSD-2-Clause licensed
code authored by IBM, originally released in the IBM POWER
Cryptography Reference Implementation project[1], modified to work in
Nettle, and contributed under the GPL license.

Development of this new implementation targeted POWER 10; however, it
supports the POWER 8 ISA and above. The following commits provide the
performance data I recorded on POWER 10, though similar improvements can
be found on P8/P9.

I have tested this patch set on POWER 8 and POWER 10, hardware running
little-endian linux distributions, and via qemu-user for big-endian ppc64.


Eric Richter (2):
  powerpc64: Add optimized assembly for sha256-compress-n
  powerpc64: Add optimized assembly for sha512-compress-n

 fat-ppc.c                             |  22 ++
 powerpc64/fat/sha256-compress-n-2.asm |  36 +++
 powerpc64/fat/sha512-compress-2.asm   |  36 +++
 powerpc64/p8/sha256-compress-n.asm    | 323 +++++++++++++++++++++++++
 powerpc64/p8/sha512-compress.asm      | 327 ++++++++++++++++++++++++++
 5 files changed, 744 insertions(+)
 create mode 100644 powerpc64/fat/sha256-compress-n-2.asm
 create mode 100644 powerpc64/fat/sha512-compress-2.asm
 create mode 100644 powerpc64/p8/sha256-compress-n.asm
 create mode 100644 powerpc64/p8/sha512-compress.asm

-- 
2.44.0



Re: [PATCH 0/2] Add optimized powerpc64 assembly for SHA2

2024-04-16 Thread Eric Richter
On Thu, 2024-04-04 at 21:30 +0200, Niels Möller wrote:
> If it's this macro,
> 
>    C Convert an index for W[i] to the corresponding register
>    define(`IV', `eval($1 + VW0)')
> 
> and the argument $1 is always a numerical expression, then I'd suggest
> deleting the definitions of VW0 - VW15 (with only a comment to document
> this register usage), and something like
> 
>    define(`IV', `v`'eval($1 + 16)')
> 
> You could also consider moving the % 16 operation into this macro,
> 
>    define(`IV', `v`'eval((($1) % 16) + 16)')
> 
> which should make it clear that it can't expand to a register outside
> of the intended v16-v31 range.
> 

Thanks for the suggestion! I moved the "% 16" into that eval to clean
up those load calls.

After a bit of fiddling with m4 though, it appears that this emits
something like "v16" without applying the translation of v16 -> 16,
causing the assembler to choke. I did manage to get it to work with a
naive concatenation macro like this:

   define(`CONCAT', `$1$2')
   define(`IV', `CONCAT(v, eval((($1) % 16) + 16))')

though I feel like there is a more elegant and clear solution. I have a
v2 queued up, I can send if this is sufficient.
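
For reference, the resulting expansions stay inside the intended range:

   IV(0)    C expands to v16
   IV(15)   C expands to v31
   IV(17)   C wraps back to v17 via the % 16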

Thanks!
 - Eric


Re: [PATCH 0/2] Add optimized powerpc64 assembly for SHA2

2024-04-03 Thread Eric Richter
On Thu, 2024-03-28 at 21:04 +0100, Niels Möller wrote:
> Eric Richter  writes:
> 
> > This set introduces an optimized powerpc64 assembly implementation
> > for SHA256 and SHA512. These have been derived from BSD-2-Clause
> > licensed code authored by IBM, originally released in the IBM POWER
> > Cryptography Reference Implementation project[1], modified to work
> > in Nettle, and contributed under the GPL license.
> > 
> > Development of this new implementation targeted POWER 10; however,
> > it supports the POWER 8 ISA and above. The following commits provide
> > the performance data I recorded on POWER 10, though similar
> > improvements can be found on P8/P9.
> 
> Thanks, I've had a first quick look. Nice speedup, and it looks pretty
> good. I wasn't aware of the vshasigma instructions.
> 
> One comment on the Nettle ppc conventions: I prefer to use register
> names rather than just register numbers; that helps me avoid some
> confusion when some instructions take v1 registers and others take vs1
> registers. Preferably by configuring with ASM_FLAGS=-mregnames during
> development. For assemblers that don't like register names (seems to
> be the default), machine.m4 arranges for translation from v1 --> 1,
> etc.
> 

Ah, thanks for letting me know, I am queuing up a version that fixes
this.

I do have a macro though that calculates which register number contains
the chunk of input data based on an index -- in other words, I use
registers v16-v31 to hold the input data, and the macro just adds 16 to
the index to get the corresponding register. Right now it operates on
raw register numbers; should I adjust this macro to make it clearer
that it is operating on vector registers, or should I look into
changing how that is done?
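
(For context, the macro in question is:

   C Convert an index for W[i] to the corresponding register
   define(`IV', `eval($1 + VW0)')

with VW0 through VW15 defined as v16 through v31.)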

> > As an aside: I have tested this patch set on POWER 8 and POWER 10
> > hardware running little-endian linux distributions, however I have
> > not yet been able to test on a big-endian distro. I can confirm
> > however that the original source in IPCRI does compile and pass
> > tests for both little and big endian via qemu-user, so barring human
> > error in deriving the version for Nettle, it is expected to be
> > functional.
> 
> There are big-endian tests in the ci pipeline (hosted on the mirror
> repo at https://gitlab.com/gnutls/nettle), using cross-compiling +
> qemu-user. And I also have a similar setup locally.
> 

Thanks! I'm looking into replicating this locally as well for easier
future testing, and I'll send a v2 with the updated registers once I
confirm big-endian tests pass. Should I also open an MR to trigger the
CI?

Thanks,
 - Eric

> Regards,
> /Niels
> 
> -- 
> Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677.
> Internet email is subject to wholesale government surveillance.



[PATCH 1/2] powerpc64: Add optimized assembly for sha256-compress-n

2024-03-28 Thread Eric Richter
This patch introduces an optimized powerpc64 assembly implementation for
sha256-compress-n. This takes advantage of the vshasigma instruction, as
well as unrolling loops to best take advantage of running instructions
in parallel.

The following data was captured on a POWER 10 LPAR @ ~3.896GHz

Current C implementation:
 Algorithm          mode    Mbyte/s
 sha256           update     280.97
 hmac-sha256    64 bytes      80.81
 hmac-sha256   256 bytes     170.50
 hmac-sha256  1024 bytes     241.92
 hmac-sha256  4096 bytes     268.54
 hmac-sha256  single msg     276.16

With optimized assembly:
 Algorithm          mode    Mbyte/s
 sha256           update     446.42
 hmac-sha256    64 bytes     124.89
 hmac-sha256   256 bytes     268.90
 hmac-sha256  1024 bytes     382.06
 hmac-sha256  4096 bytes     425.38
 hmac-sha256  single msg     439.75

Signed-off-by: Eric Richter 
---
 fat-ppc.c                             |  12 +
 powerpc64/fat/sha256-compress-n-2.asm |  36 +++
 powerpc64/p8/sha256-compress-n.asm    | 339 ++++++++++++++++++++++++++++++
 3 files changed, 387 insertions(+)
 create mode 100644 powerpc64/fat/sha256-compress-n-2.asm
 create mode 100644 powerpc64/p8/sha256-compress-n.asm

diff --git a/fat-ppc.c b/fat-ppc.c
index cd76f7a1..efbeb2ec 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -203,6 +203,10 @@ DECLARE_FAT_FUNC(_nettle_poly1305_blocks, poly1305_blocks_func)
 DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, c)
 DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, ppc64)
 
+DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64)
+
 
 static void CONSTRUCTOR
 fat_init (void)
@@ -231,6 +235,8 @@ fat_init (void)
  _nettle_ghash_update_arm64() */
   _nettle_ghash_set_key_vec = _nettle_ghash_set_key_ppc64;
   _nettle_ghash_update_vec = _nettle_ghash_update_ppc64;
+
+  _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64;
 }
   else
 {
@@ -239,6 +245,7 @@ fat_init (void)
   _nettle_aes_invert_vec = _nettle_aes_invert_c;
   _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c;
   _nettle_ghash_update_vec = _nettle_ghash_update_c;
+  _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c;
 }
   if (features.have_altivec)
 {
@@ -338,3 +345,8 @@ DEFINE_FAT_FUNC(_nettle_poly1305_blocks, const uint8_t *,
  size_t blocks,
 const uint8_t *m),
(ctx, blocks, m))
+
+DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
+   (uint32_t *state, const uint32_t *k,
+size_t blocks, const uint8_t *input),
+   (state, k, blocks, input))
diff --git a/powerpc64/fat/sha256-compress-n-2.asm b/powerpc64/fat/sha256-compress-n-2.asm
new file mode 100644
index 00000000..4f4eee9d
--- /dev/null
+++ b/powerpc64/fat/sha256-compress-n-2.asm
@@ -0,0 +1,36 @@
+C powerpc64/fat/sha256-compress-n-2.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   or
+
+ * the GNU General Public License as published by the Free
+   Software Foundation; either version 2 of the License, or (at your
+   option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_sha256_compress_n) picked up by configure
+
+define(`fat_transform', `$1_ppc64')
+include_src(`powerpc64/p8/sha256-compress-n.asm')
diff --git a/powerpc64/p8/sha256-compress-n.asm b/powerpc64/p8/sha256-compress-n.asm
new file mode 100644
index 00000000..52f548dc
--- /dev/null
+++ b/powerpc64/p8/sha256-compress-n.asm
@@ -0,0 +1,339 @@
+C powerpc64/p8/sha256-compress-n.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   or
+
+ * the GNU General Public License as published

[PATCH 0/2] Add optimized powerpc64 assembly for SHA2

2024-03-28 Thread Eric Richter
This set introduces an optimized powerpc64 assembly implementation for
SHA256 and SHA512. These have been derived from BSD-2-Clause licensed
code authored by IBM, originally released in the IBM POWER
Cryptography Reference Implementation project[1], modified to work in
Nettle, and contributed under the GPL license.

Development of this new implementation targeted POWER 10; however, it
supports the POWER 8 ISA and above. The following commits provide the
performance data I recorded on POWER 10, though similar improvements can
be found on P8/P9.

As an aside: I have tested this patch set on POWER 8 and POWER 10
hardware running little-endian linux distributions, however I have not
yet been able to test on a big-endian distro. I can confirm however that
the original source in IPCRI does compile and pass tests for both little
and big endian via qemu-user, so barring human error in deriving the
version for Nettle, it is expected to be functional.

[1] https://github.com/ibm/ipcri

Eric Richter (2):
  powerpc64: Add optimized assembly for sha256-compress-n
  powerpc64: Add optimized assembly for sha512-compress-n

 powerpc64/p8/sha256-compress-n.asm | 339 ++++++++++++++++++++++++++++++++
 powerpc64/p8/sha512-compress.asm   | 345 +++++++++++++++++++++++++++++++++
 2 files changed, 684 insertions(+)
 create mode 100644 powerpc64/p8/sha256-compress-n.asm
 create mode 100644 powerpc64/p8/sha512-compress.asm

-- 
2.43.0



[PATCH 2/2] powerpc64: Add optimized assembly for sha512-compress-n

2024-03-28 Thread Eric Richter
This patch introduces an optimized powerpc64 assembly implementation for
sha512-compress, derived from the implementation for sha256-compress-n.

The following data was captured on a POWER 10 LPAR @ ~3.896GHz

Current C implementation:
 Algorithm          mode    Mbyte/s
 sha512           update     447.02
 sha512-224       update     444.30
 sha512-256       update     445.02
 hmac-sha512    64 bytes      97.27
 hmac-sha512   256 bytes     204.55
 hmac-sha512  1024 bytes     342.86
 hmac-sha512  4096 bytes     409.57
 hmac-sha512  single msg     433.95

With optimized assembly:
 Algorithm          mode    Mbyte/s
 sha512           update     705.36
 sha512-224       update     705.63
 sha512-256       update     705.34
 hmac-sha512    64 bytes     141.66
 hmac-sha512   256 bytes     310.26
 hmac-sha512  1024 bytes     534.22
 hmac-sha512  4096 bytes     641.74
 hmac-sha512  single msg     677.14

Signed-off-by: Eric Richter 
---
 fat-ppc.c                           |  10 +
 powerpc64/fat/sha512-compress-2.asm |  36 +++
 powerpc64/p8/sha512-compress.asm    | 345 ++++++++++++++++++++++++++++++++
 3 files changed, 391 insertions(+)
 create mode 100644 powerpc64/fat/sha512-compress-2.asm
 create mode 100644 powerpc64/p8/sha512-compress.asm

diff --git a/fat-ppc.c b/fat-ppc.c
index efbeb2ec..a228386a 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -207,6 +207,10 @@ DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func)
 DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c)
 DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64)
 
+DECLARE_FAT_FUNC(_nettle_sha512_compress, sha512_compress_func)
+DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, c)
+DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, ppc64)
+
 
 static void CONSTRUCTOR
 fat_init (void)
@@ -237,6 +241,7 @@ fat_init (void)
   _nettle_ghash_update_vec = _nettle_ghash_update_ppc64;
 
   _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64;
+  _nettle_sha512_compress_vec = _nettle_sha512_compress_ppc64;
 }
   else
 {
@@ -246,6 +251,7 @@ fat_init (void)
   _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c;
   _nettle_ghash_update_vec = _nettle_ghash_update_c;
   _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c;
+  _nettle_sha512_compress_vec = _nettle_sha512_compress_c;
 }
   if (features.have_altivec)
 {
@@ -350,3 +356,7 @@ DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
(uint32_t *state, const uint32_t *k,
 size_t blocks, const uint8_t *input),
(state, k, blocks, input))
+
+DEFINE_FAT_FUNC(_nettle_sha512_compress, void,
+   (uint64_t *state, const uint8_t *input, const uint64_t *k),
+   (state, input, k))
diff --git a/powerpc64/fat/sha512-compress-2.asm b/powerpc64/fat/sha512-compress-2.asm
new file mode 100644
index 00000000..9445e5ba
--- /dev/null
+++ b/powerpc64/fat/sha512-compress-2.asm
@@ -0,0 +1,36 @@
+C powerpc64/fat/sha512-compress-2.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   or
+
+ * the GNU General Public License as published by the Free
+   Software Foundation; either version 2 of the License, or (at your
+   option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_sha512_compress) picked up by configure
+
+define(`fat_transform', `$1_ppc64')
+include_src(`powerpc64/p8/sha512-compress.asm')
diff --git a/powerpc64/p8/sha512-compress.asm b/powerpc64/p8/sha512-compress.asm
new file mode 100644
index 00000000..36dd011c
--- /dev/null
+++ b/powerpc64/p8/sha512-compress.asm
@@ -0,0 +1,345 @@
+C powerpc64/p8/sha512-compress.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   or
+
+ * the GNU General