[PATCH 2/2] powerpc64: Add optimized assembly for sha512-compress
This patch introduces an optimized powerpc64 assembly implementation for sha512-compress, derived from the implementation for sha256-compress-n. The following data was captured on a POWER 10 LPAR @ ~4.050GHz Current C implementation: Algorithm mode Mbyte/s sha512 update 474.00 sha512_224 update 474.61 sha512_256 update 474.15 hmac-sha512 64 bytes 104.08 hmac-sha512256 bytes 220.42 hmac-sha512 1024 bytes 368.58 hmac-sha512 4096 bytes 436.27 hmac-sha512 single msg 460.10 With optimized assembly: Algorithm mode Mbyte/s sha512 update 746.96 sha512_224 update 746.96 sha512_256 update 746.93 hmac-sha512 64 bytes 150.54 hmac-sha512256 bytes 327.58 hmac-sha512 1024 bytes 562.49 hmac-sha512 4096 bytes 677.38 hmac-sha512 single msg 713.06 Signed-off-by: Eric Richter --- This is a complete rewrite derived from the SHA256 implementation. The benchmark numbers above have been adjusted, though they have been collected on a different machine yielding slightly different baseline results. fat-ppc.c | 10 + powerpc64/fat/sha512-compress-2.asm | 36 +++ powerpc64/p8/sha512-compress.asm| 336 3 files changed, 382 insertions(+) create mode 100644 powerpc64/fat/sha512-compress-2.asm create mode 100644 powerpc64/p8/sha512-compress.asm diff --git a/fat-ppc.c b/fat-ppc.c index aaccc116..5b6efd10 100644 --- a/fat-ppc.c +++ b/fat-ppc.c @@ -215,6 +215,10 @@ DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func) DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c) DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64) +DECLARE_FAT_FUNC(_nettle_sha512_compress, sha512_compress_func) +DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, c) +DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, ppc64) + /* Nop implementation for _gcm_aes_encrypt and _gcm_aes_decrypt. 
*/ static size_t gcm_aes_crypt_c (struct gcm_key *key UNUSED, unsigned rounds UNUSED, @@ -253,6 +257,7 @@ fat_init (void) _nettle_gcm_aes_encrypt_vec = _nettle_gcm_aes_encrypt_ppc64; _nettle_gcm_aes_decrypt_vec = _nettle_gcm_aes_decrypt_ppc64; _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64; + _nettle_sha512_compress_vec = _nettle_sha512_compress_ppc64; } else { @@ -264,6 +269,7 @@ fat_init (void) _nettle_gcm_aes_encrypt_vec = gcm_aes_crypt_c; _nettle_gcm_aes_decrypt_vec = gcm_aes_crypt_c; _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c; + _nettle_sha512_compress_vec = _nettle_sha512_compress_c; } if (features.have_altivec) { @@ -378,3 +384,7 @@ DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *, (uint32_t *state, const uint32_t *k, size_t blocks, const uint8_t *input), (state, k, blocks, input)) + +DEFINE_FAT_FUNC(_nettle_sha512_compress, void, + (uint64_t *state, const uint8_t *input, const uint64_t *k), + (state, input, k)) diff --git a/powerpc64/fat/sha512-compress-2.asm b/powerpc64/fat/sha512-compress-2.asm new file mode 100644 index ..9445e5ba --- /dev/null +++ b/powerpc64/fat/sha512-compress-2.asm @@ -0,0 +1,36 @@ +C powerpc64/fat/sha512-compress-2.asm + +ifelse(` + Copyright (C) 2024 Eric Richter, IBM Corporation + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl PROLOGUE(_nettle_sha512_compress) picked up by configure + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p8/sha512-compress.asm') diff --git a/powerpc64/p8/sha512-compress.asm b/powerpc64/p8/sha512-compress.asm new file mode 100644 index ..bf182a45 --- /dev/null +++ b/powerpc64/p8/sha512-compress.asm @@ -0,0 +1,336 @@ +C x86_64/sha512-compress.asm + +ifelse(` + Copyright (C) 2024 Eric
[PATCH 1/2] powerpc64: remove use of m4_unquote in the load step for sha256
By passing in the constant offset value into the LOAD macro, the use of m4_unquote to calculate the correct constant GPR can be avoided, improving readability. Signed-off-by: Eric Richter --- powerpc64/p8/sha256-compress-n.asm | 36 +++--- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/powerpc64/p8/sha256-compress-n.asm b/powerpc64/p8/sha256-compress-n.asm index 4848461e..309db1fa 100644 --- a/powerpc64/p8/sha256-compress-n.asm +++ b/powerpc64/p8/sha256-compress-n.asm @@ -177,34 +177,34 @@ define(`EXTENDROUNDS', ` ') define(`LOAD', ` - IF_BE(`lxvw4x VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT') + IF_BE(`lxvw4x VSR(IV($1)), $2, INPUT') IF_LE(` - lxvd2x VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT + lxvd2x VSR(IV($1)), $2, INPUT vperm IV($1), IV($1), IV($1), VT0 ') ') define(`DOLOADS', ` IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)') - LOAD(0) - LOAD(1) - LOAD(2) - LOAD(3) + LOAD(0, TC0) + LOAD(1, TC4) + LOAD(2, TC8) + LOAD(3, TC12) addiINPUT, INPUT, 16 - LOAD(4) - LOAD(5) - LOAD(6) - LOAD(7) + LOAD(4, TC0) + LOAD(5, TC4) + LOAD(6, TC8) + LOAD(7, TC12) addiINPUT, INPUT, 16 - LOAD(8) - LOAD(9) - LOAD(10) - LOAD(11) + LOAD(8, TC0) + LOAD(9, TC4) + LOAD(10, TC8) + LOAD(11, TC12) addiINPUT, INPUT, 16 - LOAD(12) - LOAD(13) - LOAD(14) - LOAD(15) + LOAD(12, TC0) + LOAD(13, TC4) + LOAD(14, TC8) + LOAD(15, TC12) addiINPUT, INPUT, 16 ') -- 2.45.2 ___ nettle-bugs mailing list -- nettle-bugs@lists.lysator.liu.se To unsubscribe send an email to nettle-bugs-le...@lists.lysator.liu.se
Re: [PATCH v3] powerpc64: Add optimized assembly for sha256-compress-n
On Fri, 2024-06-07 at 14:08 +0200, Niels Möller wrote: > Eric Richter writes: > > +C ROUND(A B C D E F G H R EXT) > > +define(`ROUND', ` > > + > > + vadduwm VT1, VK, IV($9) C VT1: k+W > > + vadduwm VT4, $8, VT1 C VT4: H+k+W > > + > > + lxvw4x VSR(VK), TK, K C Load Key > > + addi TK, TK, 4 C Increment Pointer to next key > > + > > + vadduwm VT2, $4, $8 C VT2: H+D > > + vadduwm VT2, VT2, VT1 C VT2: H+D+k+W > > Could the above two instructions be changed to > > vadduwm VT2, VT4, $4 C Should be the same,(H+k+W) + D > > (which would need one less register)? I realize there's slight change > in > the dependency chain. Do you know how many cycles one of these rounds > takes, and what's the bottleneck (I would guess either latency of the > dependency chain between rounds, or throughput of one of the > execution > units, or instruction issue rate). > Theoretically it should be about 10 cycles per round, but the actual measured performance doesn't quite hit that due to various quirks with scheduling. With this change, I'm getting about a +1 MB/s gain on hmac 256 bytes, but a slight loss of speed for the rest. > > +define(`LOAD', ` > > + IF_BE(`lxvw4x VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), > > INPUT') > > + IF_LE(` > > + lxvd2x VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT > > + vperm IV($1), IV($1), IV($1), VT0 > > + ') > > +') > > + > > +define(`DOLOADS', ` > > + IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)') > > + LOAD(0) > > + LOAD(1) > > + LOAD(2) > > + LOAD(3) > > If you pass the right TCx register as argument to the load macro, you > don't need the m4 eval thing, which could make it a bit more > readable, imo. 
> > > + C Store non-volatile registers > > + > > + li T0, -8 > > + li T1, -24 > > + stvx v20, T0, SP > > + stvx v21, T1, SP > > + subi T0, T0, 32 > > + subi T1, T1, 32 > > This could be probably be arranged with fewer instructions by having > one > register that is decremented as we move down in the guard area, and > registers with constant values for indexing. > > > + C Reload initial state from VSX registers > > + xxlor VSR(VT0), VSXA, VSXA > > + xxlor VSR(VT1), VSXB, VSXB > > + xxlor VSR(VT2), VSXC, VSXC > > + xxlor VSR(VT3), VSXD, VSXD > > + xxlor VSR(VT4), VSXE, VSXE > > + xxlor VSR(SIGA), VSXF, VSXF > > + xxlor VSR(SIGE), VSXG, VSXG > > + xxlor VSR(VK), VSXH, VSXH > > + > > + vadduwm VSA, VSA, VT0 > > + vadduwm VSB, VSB, VT1 > > + vadduwm VSC, VSC, VT2 > > + vadduwm VSD, VSD, VT3 > > + vadduwm VSE, VSE, VT4 > > + vadduwm VSF, VSF, SIGA > > + vadduwm VSG, VSG, SIGE > > + vadduwm VSH, VSH, VK > > It's a pity that there seems to be no useful xxadd* instructions? Do > you > need all eight temporary registers, or would you get the same speed > doing just four at a time, i.e., 4 xxlor instructions, 4 vadduwm, 4 > xxlor, 4 vadduwm? There's no alias "xxmov" or the like that could be > used instead of xxlor? > Unfortunately most of the VSX instructions (particularly those in the p8 ISA) are for floating point operations, using them in this way is a bit of a hack. I'll test four at a time, but it will likely be similar performance unless the xxlor's are issued on a different unit. I'm not aware of an xxmov/xxmr extended mnemonic, but this could always be macroed instead for clarity. > Thanks for the update! > /Niels > Thanks for merging! I'll have a clean-up patch up soon, hopefully with the SHA512 implementation as well. ___ nettle-bugs mailing list -- nettle-bugs@lists.lysator.liu.se To unsubscribe send an email to nettle-bugs-le...@lists.lysator.liu.se
[PATCH v3] powerpc64: Add optimized assembly for sha256-compress-n
This patch introduces an optimized powerpc64 assembly implementation for sha256-compress-n. This takes advantage of the vshasigma instruction, as well as unrolling loops to best take advantage of running instructions in parallel. The following data was captured on a POWER 10 LPAR @ ~3.896GHz Current C implementation: Algorithm mode Mbyte/s sha256 update 280.97 hmac-sha256 64 bytes 80.81 hmac-sha256256 bytes 170.50 hmac-sha256 1024 bytes 241.92 hmac-sha256 4096 bytes 268.54 hmac-sha256 single msg 276.16 With optimized assembly: Algorithm mode Mbyte/s sha256 update 461.45 hmac-sha256 64 bytes 123.88 hmac-sha256256 bytes 268.81 hmac-sha256 1024 bytes 390.91 hmac-sha256 4096 bytes 438.02 hmac-sha256 single msg 453.83 Signed-off-by: Eric Richter --- I split this patch to be standalone, rather than delay even further trying to update SHA512 -- I will update the SHA512 implementation when this one stabilizes. Regarding the load vperm needed for little endian: unfortunately we don't have a spare vector register to store the mask between rounds, so the best that can be done while maintaining p8 support will be to store the mask in a VSX register like the state values, and avoid the load. This is a negligible performance change however, yielding around +1MB/s on larger block counts (update, hmac 1024/4096/single msg) and -1MB/s on smaller (hmac 64/256). Dropping p8 support allows the use of the lxvb16x instruction, which does not need to be permuted, however that is as well a negligible performance improvement at the cost of dropping a whole cpu set. So I see a few options: A) leave as-is, consider storing the mask in a VSX register B) drop p8 support, use lxvb16x C) have a compile-time switch to use permute on p8, and use the single instruction for p9 an up. 
v3: - use protected zone instead of allocating stack space - add GPRs constants for multiples of 4 for loads - around +3.4 MB/s for sha256 update - move extend logic to its own macro called by EXTENDROUND - use 8 VSX registers to store previous state instead of the stack - around +11.0 MB/s for sha256 update fat-ppc.c | 12 + powerpc64/fat/sha256-compress-n-2.asm | 36 +++ powerpc64/p8/sha256-compress-n.asm| 364 ++ 3 files changed, 412 insertions(+) create mode 100644 powerpc64/fat/sha256-compress-n-2.asm create mode 100644 powerpc64/p8/sha256-compress-n.asm diff --git a/fat-ppc.c b/fat-ppc.c index cd76f7a1..efbeb2ec 100644 --- a/fat-ppc.c +++ b/fat-ppc.c @@ -203,6 +203,10 @@ DECLARE_FAT_FUNC(_nettle_poly1305_blocks, poly1305_blocks_func) DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, c) DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, ppc64) +DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func) +DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c) +DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64) + static void CONSTRUCTOR fat_init (void) @@ -231,6 +235,8 @@ fat_init (void) _nettle_ghash_update_arm64() */ _nettle_ghash_set_key_vec = _nettle_ghash_set_key_ppc64; _nettle_ghash_update_vec = _nettle_ghash_update_ppc64; + + _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64; } else { @@ -239,6 +245,7 @@ fat_init (void) _nettle_aes_invert_vec = _nettle_aes_invert_c; _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c; _nettle_ghash_update_vec = _nettle_ghash_update_c; + _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c; } if (features.have_altivec) { @@ -338,3 +345,8 @@ DEFINE_FAT_FUNC(_nettle_poly1305_blocks, const uint8_t *, size_t blocks, const uint8_t *m), (ctx, blocks, m)) + +DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *, + (uint32_t *state, const uint32_t *k, +size_t blocks, const uint8_t *input), + (state, k, blocks, input)) diff --git 
a/powerpc64/fat/sha256-compress-n-2.asm b/powerpc64/fat/sha256-compress-n-2.asm new file mode 100644 index ..4f4eee9d --- /dev/null +++ b/powerpc64/fat/sha256-compress-n-2.asm @@ -0,0 +1,36 @@ +C powerpc64/fat/sha256-compress-n-2.asm + +ifelse(` + Copyright (C) 2024 Eric Richter, IBM Corporation + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your
Re: [PATCH v2 1/2] powerpc64: Add optimized assembly for sha256-compress-n
On Sun, 2024-05-05 at 16:10 +0200, Niels Möller wrote: > Eric Richter writes: > > > This patch introduces an optimized powerpc64 assembly > > implementation for > > sha256-compress-n. This takes advantage of the vshasigma > > instruction, as > > well as unrolling loops to best take advantage of running > > instructions > > in parallel. > > Thanks. I'm now having a closer read of the assembly code. Comments > below. > > > +C ROUND(A B C D E F G H R EXT) > > +define(`ROUND', ` > > + > > + vadduwm VT1, VK, IV($9) C VT1: k+W > > + vadduwm VT4, $8, VT1 C VT4: H+k+W > > + > > + lxvw4x VSR(VK), TK, K C Load Key > > + addiTK, TK, 4 C Increment Pointer > > to next key > > + > > + vadduwm VT2, $4, $8 C VT2: H+D > > + vadduwm VT2, VT2, VT1 C VT2: > > H+D+k+W > > + > > + vshasigmaw SIGE, $5, 1, 0b C Sigma(E) Se > > + vshasigmaw SIGA, $1, 1, 0 C Sigma(A) Sa > > + > > + vxorVT3, $2, $3 C VT3: b^c > > + vselVT0, $7, $6, $5 C VT0: Ch. > > + vselVT3, $3, $1, VT3 C VT3: Maj(a,b,c) > > + > > + vadduwm VT4, VT4, VT0 C VT4: Hkw + > > Ch. > > + vadduwm VT3, VT3, VT4 C VT3: HkW + > > Ch. + Maj. > > + > > + vadduwm VT0, VT0, VT2 C VT0: Ch. + > > DHKW > > + vadduwm $8, SIGE, SIGA C Anext: Se > > + Sa > > + vadduwm $4, VT0, SIGE C Dnext: Ch. > > + DHKW + Se > > + vadduwm $8, $8, VT3 C Anext: > > Se+Sa+HkW+Ch.+Maj. > > + > > + > > + C Schedule (data) for 16th round in future > > + C Extend W[i] > > + ifelse(`$10', `1', ` > > + vshasigmaw SIGE, IV($9 + 14), 0, 0b > > + vshasigmaw SIGA, IV($9 + 1), 0, 0b > > + vadduwm IV($9), IV($9), SIGE > > + vadduwm IV($9), IV($9), SIGA > > + vadduwm IV($9), IV($9), IV($9 + 9) > > + ') > > +') > > I think it would be a bit simpler to take out the extend logic to its > own macro. 
> > > +define(`EXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9, > > 1)') > > If you do that, then you would define > > define(`EXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9) > EXTEND($9)') > > (In other related code, input expansion is done at the beginning of a > round iteration rather than at the end, but doing at the end like you > do > may be better scheduling). > Makes sense, I'll move that extend logic into its own macro. You are correct, the expansion logic was moved to the end of the round for an improvement to scheduling on the CPU. The vshasigma instructions take more cycles and are scheduled on a different unit than the other arithmetic operations. This allows those to work in parallel with the beginning of the next round, as there are no dependent registers until the next vshasigma instructions in-round. > > +define(`LOAD', ` > > + IF_BE(`lxvw4x VSR(IV($1)), 0, INPUT') > > + IF_LE(` > > + lxvd2x VSR(IV($1)), 0, INPUT > > + vperm IV($1), IV($1), IV($1), VT0 > > + ') > > + addiINPUT, INPUT, 4 > > +') > > + > > +define(`DOLOADS', ` > > + IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)') > > Could you have a dedicated register for the permutation constant, and > load it only once at function entry? If you have general registers to > spare, it could also make sense to use, e.g., three registers for the > contant values 16, 32, 48, and use for indexing. Then you don't need > to > update the INPUT pointer as often, and you can use the same constants > for other load/store sequences as well. > There are plenty of GPRs to spare, I will test and bench a few options for using more GPRs as indexes. 
As for VRs, unfortunately the current implementation uses all 32 VRs: 16 for W[i] 8 for state 7 for round arithmetic (two of these specifically for sigma, to avoid a dependency bubble) 1 for storing the key constant K That said, I'm going to experiment with some VSX instructions to see if it is possible to spill over certain operations into VSRs, without needing an explicit copy back from VSR to VR. > > + LOAD(0) > > + LOAD(1) > > + LOAD(2) > > + LOAD(3) > > > +PROLOGUE(_nettle_sha256_compress_n) > > + cmpwi 0, NUMBLOCKS, 0 > > + ble 0, .done > > + mtctr NUMBLOCKS > > +
[PATCH v2 1/2] powerpc64: Add optimized assembly for sha256-compress-n
This patch introduces an optimized powerpc64 assembly implementation for sha256-compress-n. This takes advantage of the vshasigma instruction, as well as unrolling loops to best take advantage of running instructions in parallel. The following data was captured on a POWER 10 LPAR @ ~3.896GHz Current C implementation: Algorithm mode Mbyte/s sha256 update 280.97 hmac-sha256 64 bytes 80.81 hmac-sha256256 bytes 170.50 hmac-sha256 1024 bytes 241.92 hmac-sha256 4096 bytes 268.54 hmac-sha256 single msg 276.16 With optimized assembly: Algorithm mode Mbyte/s sha256 update 446.42 hmac-sha256 64 bytes 124.89 hmac-sha256256 bytes 268.90 hmac-sha256 1024 bytes 382.06 hmac-sha256 4096 bytes 425.38 hmac-sha256 single msg 439.75 Signed-off-by: Eric Richter --- fat-ppc.c | 12 + powerpc64/fat/sha256-compress-n-2.asm | 36 +++ powerpc64/p8/sha256-compress-n.asm| 323 ++ 3 files changed, 371 insertions(+) create mode 100644 powerpc64/fat/sha256-compress-n-2.asm create mode 100644 powerpc64/p8/sha256-compress-n.asm diff --git a/fat-ppc.c b/fat-ppc.c index cd76f7a1..efbeb2ec 100644 --- a/fat-ppc.c +++ b/fat-ppc.c @@ -203,6 +203,10 @@ DECLARE_FAT_FUNC(_nettle_poly1305_blocks, poly1305_blocks_func) DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, c) DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, ppc64) +DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func) +DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c) +DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64) + static void CONSTRUCTOR fat_init (void) @@ -231,6 +235,8 @@ fat_init (void) _nettle_ghash_update_arm64() */ _nettle_ghash_set_key_vec = _nettle_ghash_set_key_ppc64; _nettle_ghash_update_vec = _nettle_ghash_update_ppc64; + + _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64; } else { @@ -239,6 +245,7 @@ fat_init (void) _nettle_aes_invert_vec = _nettle_aes_invert_c; _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c; _nettle_ghash_update_vec = 
_nettle_ghash_update_c; + _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c; } if (features.have_altivec) { @@ -338,3 +345,8 @@ DEFINE_FAT_FUNC(_nettle_poly1305_blocks, const uint8_t *, size_t blocks, const uint8_t *m), (ctx, blocks, m)) + +DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *, + (uint32_t *state, const uint32_t *k, +size_t blocks, const uint8_t *input), + (state, k, blocks, input)) diff --git a/powerpc64/fat/sha256-compress-n-2.asm b/powerpc64/fat/sha256-compress-n-2.asm new file mode 100644 index ..4f4eee9d --- /dev/null +++ b/powerpc64/fat/sha256-compress-n-2.asm @@ -0,0 +1,36 @@ +C powerpc64/fat/sha256-compress-n-2.asm + +ifelse(` + Copyright (C) 2024 Eric Richter, IBM Corporation + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +dnl PROLOGUE(_nettle_sha256_compress_n) picked up by configure + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p8/sha256-compress-n.asm') diff --git a/powerpc64/p8/sha256-compress-n.asm b/powerpc64/p8/sha256-compress-n.asm new file mode 100644 index ..d76f337e --- /dev/null +++ b/powerpc64/p8/sha256-compress-n.asm @@ -0,0 +1,323 @@ +C x86_64/sha256-compress-n.asm + +ifelse(` + Copyright (C) 2024 Eric Richter, IBM Corporation + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published
[PATCH v2 2/2] powerpc64: Add optimized assembly for sha512-compress
This patch introduces an optimized powerpc64 assembly implementation for sha512-compress, derived from the implementation for sha256-compress-n. The following data was captured on a POWER 10 LPAR @ ~3.896GHz Current C implementation: Algorithm mode Mbyte/s sha512 update 447.02 sha512-224 update 444.30 sha512-256 update 445.02 hmac-sha512 64 bytes 97.27 hmac-sha512256 bytes 204.55 hmac-sha512 1024 bytes 342.86 hmac-sha512 4096 bytes 409.57 hmac-sha512 single msg 433.95 With optimized assembly: Algorithm mode Mbyte/s sha512 update 705.36 sha512-224 update 705.63 sha512-256 update 705.34 hmac-sha512 64 bytes 141.66 hmac-sha512256 bytes 310.26 hmac-sha512 1024 bytes 534.22 hmac-sha512 4096 bytes 641.74 hmac-sha512 single msg 677.14 Signed-off-by: Eric Richter --- fat-ppc.c | 10 + powerpc64/fat/sha512-compress-2.asm | 36 +++ powerpc64/p8/sha512-compress.asm| 327 3 files changed, 373 insertions(+) create mode 100644 powerpc64/fat/sha512-compress-2.asm create mode 100644 powerpc64/p8/sha512-compress.asm diff --git a/fat-ppc.c b/fat-ppc.c index efbeb2ec..a228386a 100644 --- a/fat-ppc.c +++ b/fat-ppc.c @@ -207,6 +207,10 @@ DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func) DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c) DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64) +DECLARE_FAT_FUNC(_nettle_sha512_compress, sha512_compress_func) +DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, c) +DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, ppc64) + static void CONSTRUCTOR fat_init (void) @@ -237,6 +241,7 @@ fat_init (void) _nettle_ghash_update_vec = _nettle_ghash_update_ppc64; _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64; + _nettle_sha512_compress_vec = _nettle_sha512_compress_ppc64; } else { @@ -246,6 +251,7 @@ fat_init (void) _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c; _nettle_ghash_update_vec = _nettle_ghash_update_c; _nettle_sha256_compress_n_vec = 
_nettle_sha256_compress_n_c; + _nettle_sha512_compress_vec = _nettle_sha512_compress_c; } if (features.have_altivec) { @@ -350,3 +356,7 @@ DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *, (uint32_t *state, const uint32_t *k, size_t blocks, const uint8_t *input), (state, k, blocks, input)) + +DEFINE_FAT_FUNC(_nettle_sha512_compress, void, + (uint64_t *state, const uint8_t *input, const uint64_t *k), + (state, input, k)) diff --git a/powerpc64/fat/sha512-compress-2.asm b/powerpc64/fat/sha512-compress-2.asm new file mode 100644 index ..9445e5ba --- /dev/null +++ b/powerpc64/fat/sha512-compress-2.asm @@ -0,0 +1,36 @@ +C powerpc64/fat/sha512-compress-2.asm + +ifelse(` + Copyright (C) 2024 Eric Richter, IBM Corporation + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +dnl PROLOGUE(_nettle_sha512_compress) picked up by configure + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p8/sha512-compress.asm') diff --git a/powerpc64/p8/sha512-compress.asm b/powerpc64/p8/sha512-compress.asm new file mode 100644 index ..83fe0e36 --- /dev/null +++ b/powerpc64/p8/sha512-compress.asm @@ -0,0 +1,327 @@ +C x86_64/sha512-compress.asm + +ifelse(` + Copyright (C) 2024 Eric Richter, IBM Corporation + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General
[PATCH v2 0/2] Add optimized powerpc64 assembly for SHA2
I've updated this set to use the proper conventions for register names, and also adjusted the IV macro according to the suggestions provided. I can also confirm that I've gotten a working build environment based on the approach the GitLab CI configuration, and that the ppc64 big-endian build does indeed pass tests. Amended original cover letter: This set introduces an optimized powerpc64 assembly implementation for SHA256 and SHA512. This have been derived from BSD-2-Clause licensed code authored by IBM, originally released in the IBM POWER Cryptography Reference Implementation project[1], modified to work in Nettle, contributed under the GPL license. Development of this new implementation targetted POWER 10, however supports the POWER 8 and above ISA. The following commits provide the performance data I recorded on POWER 10, though similar improvements can be found on P8/P9. I have tested this patch set on POWER 8 and POWER 10, hardware running little-endian linux distributions, and via qemu-user for big-endian ppc64. Eric Richter (2): powerpc64: Add optimized assembly for sha256-compress-n powerpc64: Add optimized assembly for sha512-compress-n fat-ppc.c | 22 ++ powerpc64/fat/sha256-compress-n-2.asm | 36 +++ powerpc64/fat/sha512-compress-2.asm | 36 +++ powerpc64/p8/sha256-compress-n.asm| 323 + powerpc64/p8/sha512-compress.asm | 327 ++ 5 files changed, 744 insertions(+) create mode 100644 powerpc64/fat/sha256-compress-n-2.asm create mode 100644 powerpc64/fat/sha512-compress-2.asm create mode 100644 powerpc64/p8/sha256-compress-n.asm create mode 100644 powerpc64/p8/sha512-compress.asm -- 2.44.0 ___ nettle-bugs mailing list -- nettle-bugs@lists.lysator.liu.se To unsubscribe send an email to nettle-bugs-le...@lists.lysator.liu.se
Re: [PATCH 0/2] Add optimized powerpc64 assembly for SHA2
On Thu, 2024-04-04 at 21:30 +0200, Niels Möller wrote: > If it's this macro, > > C Convert an index for W[i] to the corresponding register > define(`IV', `eval($1 + VW0)') > > and the argument $1 is always a numerical expression, then I'd > suggest > deleting the definitions of VW0 - VW15 (with only a comment to > document > this register usage), and something like > > define(`IV', `v`'eval($1 + 16)') > > You could also consider moving the % 16 operation into this macro, > > define(`IV', `v`'eval((($1) % 16) + 16)') > > which should make it clear that it can't expand to a register outside > of > the intended v16-v31 range. > Thanks for the suggestion! I moved the "% 16" into that eval to clean up those load calls. After a bit of fiddling with m4 though, it appears that this emits something like "v16" without applying the translation of v16 -> 16, causing the assembler to choke. I did manage to get it to work with a naive concatenation macro like this: define(`CONCAT', `$1$2') define(`IV', `CONCAT(v, eval((($1) % 16) + 16))') though I feel like there is a more elegant and clear solution. I have a v2 queued up, I can send if this is sufficient. Thanks! - Eric ___ nettle-bugs mailing list -- nettle-bugs@lists.lysator.liu.se To unsubscribe send an email to nettle-bugs-le...@lists.lysator.liu.se
Re: [PATCH 0/2] Add optimized powerpc64 assembly for SHA2
On Thu, 2024-03-28 at 21:04 +0100, Niels Möller wrote: > Eric Richter writes: > > > This set introduces an optimized powerpc64 assembly implementation > > for > > SHA256 and SHA512. This have been derived from BSD-2-Clause > > licensed > > code authored by IBM, originally released in the IBM POWER > > Cryptography Reference Implementation project[1], modified to work > > in > > Nettle, contributed under the GPL license. > > > > Development of this new implementation targetted POWER 10, however > > supports the POWER 8 ISA and above. The following commits provide > > the > > performance data I recorded on POWER 10, though similar > > improvements can > > be found on P8/P9. > > Thanks, I've had a first quick look. Nice speedup, and it looks > pretty > good. I wasn't aware of the vshasigma instructions. > > One comment on the Nettle ppc conventions: I prefer to use register > names rather than just register numbers; that helps me avoid some > confusion when some instructions take v1 registers and others take > vs1 > registers. Preferably by configuring with ASM_FLAGS=-mregnames during > development. For assemblers that don't like register names (seems to > be > the default), machine.m4 arranges for translation from v1 --> 1, etc. > Ah, thanks for letting me know, I am queuing up a version that fixes this. I do have a macro though that calculates which register number contains the chunk of input data based on an index -- in other words, I use registers v16-v31 to hold the input data, the macro just adds 16 to the index to get the corresponding register. Right now it operates on raw register numbers, should I adjust this macro to be more clear that it is operating on vector registers in any way, or should I look into changing how that is done? > > As an aside: I have tested this patch set on POWER 8 and POWER 10 > > hardware running little-endian linux distributions, however I have > > not > > yet been able to test on a big-endian distro. 
I can confirm however > > that > > the original source in IPCRI does compile and pass tests for both > > little > > and big endian via qemu-user, so spare human error in deriving the > > version for Nettle, it is expected to be functional. > > There are big-endian tests in the ci pipeline (hosted on the mirror > repo > at https://gitlab.com/gnutls/nettle), using cross-compiling + qemu- > user. > And I also have a similar setup locally. > Thanks! I'm looking into replicating this locally as well for easier future testing, and I'll send a v2 with the updated registers once I confirm big-endian tests pass. Should I also open a MR to trigger the CI? Thanks, - Eric > Regards, > /Niels > > -- > Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677. > Internet email is subject to wholesale government surveillance. > ___ > nettle-bugs mailing list -- nettle-bugs@lists.lysator.liu.se > To unsubscribe send an email to > nettle-bugs-le...@lists.lysator.liu.se ___ nettle-bugs mailing list -- nettle-bugs@lists.lysator.liu.se To unsubscribe send an email to nettle-bugs-le...@lists.lysator.liu.se
[PATCH 1/2] powerpc64: Add optimized assembly for sha256-compress-n
This patch introduces an optimized powerpc64 assembly implementation for sha256-compress-n. This takes advantage of the vshasigma instruction, as well as unrolling loops to best take advantage of running instructions in parallel. The following data was captured on a POWER 10 LPAR @ ~3.896GHz Current C implementation: Algorithm mode Mbyte/s sha256 update 280.97 hmac-sha256 64 bytes 80.81 hmac-sha256256 bytes 170.50 hmac-sha256 1024 bytes 241.92 hmac-sha256 4096 bytes 268.54 hmac-sha256 single msg 276.16 With optimized assembly: Algorithm mode Mbyte/s sha256 update 446.42 hmac-sha256 64 bytes 124.89 hmac-sha256256 bytes 268.90 hmac-sha256 1024 bytes 382.06 hmac-sha256 4096 bytes 425.38 hmac-sha256 single msg 439.75 Signed-off-by: Eric Richter --- fat-ppc.c | 12 + powerpc64/fat/sha256-compress-n-2.asm | 36 +++ powerpc64/p8/sha256-compress-n.asm| 339 ++ 3 files changed, 387 insertions(+) create mode 100644 powerpc64/fat/sha256-compress-n-2.asm create mode 100644 powerpc64/p8/sha256-compress-n.asm diff --git a/fat-ppc.c b/fat-ppc.c index cd76f7a1..efbeb2ec 100644 --- a/fat-ppc.c +++ b/fat-ppc.c @@ -203,6 +203,10 @@ DECLARE_FAT_FUNC(_nettle_poly1305_blocks, poly1305_blocks_func) DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, c) DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, ppc64) +DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func) +DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c) +DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64) + static void CONSTRUCTOR fat_init (void) @@ -231,6 +235,8 @@ fat_init (void) _nettle_ghash_update_arm64() */ _nettle_ghash_set_key_vec = _nettle_ghash_set_key_ppc64; _nettle_ghash_update_vec = _nettle_ghash_update_ppc64; + + _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64; } else { @@ -239,6 +245,7 @@ fat_init (void) _nettle_aes_invert_vec = _nettle_aes_invert_c; _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c; _nettle_ghash_update_vec = 
_nettle_ghash_update_c; + _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c; } if (features.have_altivec) { @@ -338,3 +345,8 @@ DEFINE_FAT_FUNC(_nettle_poly1305_blocks, const uint8_t *, size_t blocks, const uint8_t *m), (ctx, blocks, m)) + +DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *, + (uint32_t *state, const uint32_t *k, +size_t blocks, const uint8_t *input), + (state, k, blocks, input)) diff --git a/powerpc64/fat/sha256-compress-n-2.asm b/powerpc64/fat/sha256-compress-n-2.asm new file mode 100644 index ..4f4eee9d --- /dev/null +++ b/powerpc64/fat/sha256-compress-n-2.asm @@ -0,0 +1,36 @@ +C powerpc64/fat/sha256-compress-n-2.asm + +ifelse(` + Copyright (C) 2024 Eric Richter, IBM Corporation + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +dnl PROLOGUE(_nettle_sha256_compress_n) picked up by configure + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p8/sha256-compress-n.asm') diff --git a/powerpc64/p8/sha256-compress-n.asm b/powerpc64/p8/sha256-compress-n.asm new file mode 100644 index ..52f548dc --- /dev/null +++ b/powerpc64/p8/sha256-compress-n.asm @@ -0,0 +1,339 @@ +C powerpc64/p8/sha256-compress-n.asm + +ifelse(` + Copyright (C) 2024 Eric Richter, IBM Corporation + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published
[PATCH 0/2] Add optimized powerpc64 assembly for SHA2
This set introduces an optimized powerpc64 assembly implementation for SHA256 and SHA512. This has been derived from BSD-2-Clause licensed code authored by IBM, originally released in the IBM POWER Cryptography Reference Implementation project[1], modified to work in Nettle, contributed under the GPL license. Development of this new implementation targeted POWER 10, however it supports the POWER 8 ISA and above. The following commits provide the performance data I recorded on POWER 10, though similar improvements can be found on P8/P9. As an aside: I have tested this patch set on POWER 8 and POWER 10 hardware running little-endian Linux distributions, however I have not yet been able to test on a big-endian distro. I can confirm however that the original source in IPCRI does compile and pass tests for both little and big endian via qemu-user, so barring human error in deriving the version for Nettle, it is expected to be functional. [1] https://github.com/ibm/ipcri Eric Richter (2): powerpc64: Add optimized assembly for sha256-compress-n powerpc64: Add optimized assembly for sha512-compress powerpc64/p8/sha256-compress-n.asm | 339 powerpc64/p8/sha512-compress.asm | 345 + 2 files changed, 684 insertions(+) create mode 100644 powerpc64/p8/sha256-compress-n.asm create mode 100644 powerpc64/p8/sha512-compress.asm -- 2.43.0 ___ nettle-bugs mailing list -- nettle-bugs@lists.lysator.liu.se To unsubscribe send an email to nettle-bugs-le...@lists.lysator.liu.se
[PATCH 2/2] powerpc64: Add optimized assembly for sha512-compress
This patch introduces an optimized powerpc64 assembly implementation for sha512-compress, derived from the implementation for sha256-compress-n. The following data was captured on a POWER 10 LPAR @ ~3.896GHz Current C implementation: Algorithm mode Mbyte/s sha512 update 447.02 sha512-224 update 444.30 sha512-256 update 445.02 hmac-sha512 64 bytes 97.27 hmac-sha512256 bytes 204.55 hmac-sha512 1024 bytes 342.86 hmac-sha512 4096 bytes 409.57 hmac-sha512 single msg 433.95 With optimized assembly: Algorithm mode Mbyte/s sha512 update 705.36 sha512-224 update 705.63 sha512-256 update 705.34 hmac-sha512 64 bytes 141.66 hmac-sha512256 bytes 310.26 hmac-sha512 1024 bytes 534.22 hmac-sha512 4096 bytes 641.74 hmac-sha512 single msg 677.14 Signed-off-by: Eric Richter --- fat-ppc.c | 10 + powerpc64/fat/sha512-compress-2.asm | 36 +++ powerpc64/p8/sha512-compress.asm| 345 3 files changed, 391 insertions(+) create mode 100644 powerpc64/fat/sha512-compress-2.asm create mode 100644 powerpc64/p8/sha512-compress.asm diff --git a/fat-ppc.c b/fat-ppc.c index efbeb2ec..a228386a 100644 --- a/fat-ppc.c +++ b/fat-ppc.c @@ -207,6 +207,10 @@ DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func) DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c) DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64) +DECLARE_FAT_FUNC(_nettle_sha512_compress, sha512_compress_func) +DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, c) +DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, ppc64) + static void CONSTRUCTOR fat_init (void) @@ -237,6 +241,7 @@ fat_init (void) _nettle_ghash_update_vec = _nettle_ghash_update_ppc64; _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64; + _nettle_sha512_compress_vec = _nettle_sha512_compress_ppc64; } else { @@ -246,6 +251,7 @@ fat_init (void) _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c; _nettle_ghash_update_vec = _nettle_ghash_update_c; _nettle_sha256_compress_n_vec = 
_nettle_sha256_compress_n_c; + _nettle_sha512_compress_vec = _nettle_sha512_compress_c; } if (features.have_altivec) { @@ -350,3 +356,7 @@ DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *, (uint32_t *state, const uint32_t *k, size_t blocks, const uint8_t *input), (state, k, blocks, input)) + +DEFINE_FAT_FUNC(_nettle_sha512_compress, void, + (uint64_t *state, const uint8_t *input, const uint64_t *k), + (state, input, k)) diff --git a/powerpc64/fat/sha512-compress-2.asm b/powerpc64/fat/sha512-compress-2.asm new file mode 100644 index ..9445e5ba --- /dev/null +++ b/powerpc64/fat/sha512-compress-2.asm @@ -0,0 +1,36 @@ +C powerpc64/fat/sha512-compress-2.asm + +ifelse(` + Copyright (C) 2024 Eric Richter, IBM Corporation + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +dnl PROLOGUE(_nettle_sha512_compress) picked up by configure + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p8/sha512-compress.asm') diff --git a/powerpc64/p8/sha512-compress.asm b/powerpc64/p8/sha512-compress.asm new file mode 100644 index ..36dd011c --- /dev/null +++ b/powerpc64/p8/sha512-compress.asm @@ -0,0 +1,345 @@ +C powerpc64/p8/sha512-compress.asm + +ifelse(` + Copyright (C) 2024 Eric Richter, IBM Corporation + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General