This patch introduces an optimized powerpc64 assembly implementation for
sha512-compress, derived from the implementation for sha256-compress-n.
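
As with sha256-compress-n, the hash state is kept packed two 64-bit
words per vector register, the rounds use the POWER8 vshasigmad/vsel
instructions, and the message schedule is extended in place modulo 16.
For reference, below is a minimal scalar sketch (straight from FIPS
180-4) of the dataflow each ROUND/EXTENDROUND macro invocation
implements; the identifiers are illustrative only, not part of the
patch:

    #include <stdint.h>

    #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

    /* One round, as in the ROUND macro: t1/t2 per FIPS 180-4, with
       the a-h register rotation written as an explicit shuffle.  */
    static void
    sha512_round (uint64_t *st, uint64_t k, uint64_t w)
    {
      uint64_t a = st[0], b = st[1], c = st[2], d = st[3];
      uint64_t e = st[4], f = st[5], g = st[6], h = st[7];

      /* Sigma1(e) and Ch(e,f,g); vshasigmad and vsel in the assembly */
      uint64_t t1 = h + (ROTR64 (e, 14) ^ ROTR64 (e, 18) ^ ROTR64 (e, 41))
        + ((e & f) ^ (~e & g)) + k + w;
      /* Sigma0(a) and Maj(a,b,c); vshasigmad and vxor+vsel */
      uint64_t t2 = (ROTR64 (a, 28) ^ ROTR64 (a, 34) ^ ROTR64 (a, 39))
        + ((a & b) ^ (a & c) ^ (b & c));

      st[7] = g; st[6] = f; st[5] = e; st[4] = d + t1;
      st[3] = c; st[2] = b; st[1] = a; st[0] = t1 + t2;
    }

    /* In-place schedule extension, as in EXTENDROUND (indices mod 16):
       W[i] += sigma0(W[i+1]) + W[i+9] + sigma1(W[i+14])  */
    static uint64_t
    sha512_extend (uint64_t *w, unsigned i)
    {
      uint64_t w1 = w[(i + 1) % 16], w14 = w[(i + 14) % 16];
      w[i] += (ROTR64 (w1, 1) ^ ROTR64 (w1, 8) ^ (w1 >> 7))
        + w[(i + 9) % 16]
        + (ROTR64 (w14, 19) ^ ROTR64 (w14, 61) ^ (w14 >> 6));
      return w[i];
    }

The assembly computes the same quantities, but instead of shuffling a
state array it rotates the register arguments across the sixteen ROUND
invocations.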

The following data was captured on a POWER10 LPAR running at ~3.896 GHz:

Current C implementation:
         Algorithm         mode Mbyte/s
            sha512       update  447.02
        sha512-224       update  444.30
        sha512-256       update  445.02
       hmac-sha512     64 bytes   97.27
       hmac-sha512    256 bytes  204.55
       hmac-sha512   1024 bytes  342.86
       hmac-sha512   4096 bytes  409.57
       hmac-sha512   single msg  433.95

With optimized assembly:
         Algorithm         mode Mbyte/s
            sha512       update  705.36
        sha512-224       update  705.63
        sha512-256       update  705.34
       hmac-sha512     64 bytes  141.66
       hmac-sha512    256 bytes  310.26
       hmac-sha512   1024 bytes  534.22
       hmac-sha512   4096 bytes  641.74
       hmac-sha512   single msg  677.14
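
No new API is involved; anything hashing with SHA-512 through the
public entry points picks up the optimized compressor via the fat
dispatch. A quick way to sanity-check a build, using ordinary nettle
API calls (nothing here is specific to this patch):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>
    #include <nettle/sha2.h>

    int
    main (void)
    {
      static const char msg[] = "abc";
      uint8_t digest[SHA512_DIGEST_SIZE];
      struct sha512_ctx ctx;
      size_t i;

      sha512_init (&ctx);
      sha512_update (&ctx, strlen (msg), (const uint8_t *) msg);
      sha512_digest (&ctx, SHA512_DIGEST_SIZE, digest);

      for (i = 0; i < SHA512_DIGEST_SIZE; i++)
        printf ("%02x", digest[i]);
      putchar ('\n');
      return 0;
    }

The digest of "abc" should match the FIPS 180-4 test vector
(ddaf35a1...) with both the C and assembly paths.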

Signed-off-by: Eric Richter <eric...@linux.ibm.com>
---
 fat-ppc.c                           |  10 +
 powerpc64/fat/sha512-compress-2.asm |  36 +++
 powerpc64/p8/sha512-compress.asm    | 345 ++++++++++++++++++++++++++++
 3 files changed, 391 insertions(+)
 create mode 100644 powerpc64/fat/sha512-compress-2.asm
 create mode 100644 powerpc64/p8/sha512-compress.asm

diff --git a/fat-ppc.c b/fat-ppc.c
index efbeb2ec..a228386a 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -207,6 +207,10 @@ DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func)
 DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c)
 DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64)
 
+DECLARE_FAT_FUNC(_nettle_sha512_compress, sha512_compress_func)
+DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, c)
+DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, ppc64)
+
 
 static void CONSTRUCTOR
 fat_init (void)
@@ -237,6 +241,7 @@ fat_init (void)
       _nettle_ghash_update_vec = _nettle_ghash_update_ppc64;
 
       _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64;
+      _nettle_sha512_compress_vec = _nettle_sha512_compress_ppc64;
     }
   else
     {
@@ -246,6 +251,7 @@ fat_init (void)
       _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c;
       _nettle_ghash_update_vec = _nettle_ghash_update_c;
       _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c;
+      _nettle_sha512_compress_vec = _nettle_sha512_compress_c;
     }
   if (features.have_altivec)
     {
@@ -350,3 +356,7 @@ DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
                (uint32_t *state, const uint32_t *k,
                 size_t blocks, const uint8_t *input),
                (state, k, blocks, input))
+
+DEFINE_FAT_FUNC(_nettle_sha512_compress, void,
+               (uint64_t *state, const uint8_t *input, const uint64_t *k),
+               (state, input, k))
diff --git a/powerpc64/fat/sha512-compress-2.asm b/powerpc64/fat/sha512-compress-2.asm
new file mode 100644
index 00000000..9445e5ba
--- /dev/null
+++ b/powerpc64/fat/sha512-compress-2.asm
@@ -0,0 +1,36 @@
+C powerpc64/fat/sha512-compress-2.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_sha512_compress) picked up by configure
+
+define(`fat_transform', `$1_ppc64')
+include_src(`powerpc64/p8/sha512-compress.asm')
diff --git a/powerpc64/p8/sha512-compress.asm b/powerpc64/p8/sha512-compress.asm
new file mode 100644
index 00000000..36dd011c
--- /dev/null
+++ b/powerpc64/p8/sha512-compress.asm
@@ -0,0 +1,345 @@
+C powerpc64/p8/sha512-compress.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+.file "sha512-compress.asm"
+
+C Parameters in
+define(`SP', `1')
+define(`STATE', `3')
+define(`INPUT', `4')
+define(`K', `5')
+
+define(`T0', `7')
+define(`T1', `8')
+define(`TK', `9')
+define(`COUNT', `10')
+
+C State registers
+define(`VSA', `0')
+define(`VSB', `1')
+define(`VSC', `2')
+define(`VSD', `3')
+define(`VSE', `4')
+define(`VSF', `5')
+define(`VSG', `6')
+define(`VSH', `7')
+
+C Current K values
+define(`VK', `8')
+
+C Temp registers for math
+define(`VT0', `9')
+define(`VT1', `10')
+define(`VT2', `11')
+define(`VT3', `12')
+define(`VT4', `13')
+
+C Convenience named registers for sigma(a) and sigma(e)
+define(`SIGA', `14')
+define(`SIGE', `15')
+
+C Input words W[i]. Not directly referenced, but defined here to keep track
+define(`VW0', `16')
+define(`VW1', `17')
+define(`VW2', `18')
+define(`VW3', `19')
+define(`VW4', `20')
+define(`VW5', `21')
+define(`VW6', `22')
+define(`VW7', `23')
+define(`VW8', `24')
+define(`VW9', `25')
+define(`VW10', `26')
+define(`VW11', `27')
+define(`VW12', `28')
+define(`VW13', `29')
+define(`VW14', `30')
+define(`VW15', `31')
+
+C Convert an index for W[i] to the corresponding register
+define(`IV', `eval($1 + VW0)')
+
+C ROUND(A B C D E F G H R EXT)
+define(`ROUND', ` 
+
+       vaddudm VT1, VK, IV($9)             C VT1: k+W
+       vaddudm VT4, $8, VT1                C VT4: H+k+W
+
+       lxvd2x  VSR(VK), TK, K              C Load Key
+       addi    TK, TK, 8                   C Increment Pointer to next key
+
+       vaddudm VT2, $4, $8                 C VT2: H+D
+       vaddudm VT2, VT2, VT1               C VT2: H+D+k+W
+
+       vshasigmad      SIGE, $5, 1, 0b1111 C Sigma(E)  Se
+       vshasigmad      SIGA, $1, 1, 0      C Sigma(A)  Sa
+
+       vxor    VT3, $2, $3                 C VT3: b^c
+       vsel    VT0, $7, $6, $5             C VT0: Ch.
+       vsel    VT3, $3, $1, VT3            C VT3: Maj(a,b,c)
+
+       vaddudm VT4, VT4, VT0               C VT4: H+k+W + Ch
+       vaddudm VT3, VT3, VT4               C VT3: H+k+W + Ch + Maj
+
+       vaddudm VT0, VT0, VT2               C VT0: Ch + D+H+k+W
+       vaddudm $8, SIGE, SIGA              C Anext: Se + Sa
+       vaddudm $4, VT0, SIGE               C Dnext: Ch + D+H+k+W + Se
+       vaddudm $8, $8, VT3                 C Anext: Se + Sa + H+k+W + Ch + Maj
+
+
+       C Extend the message schedule: compute the W value consumed
+       C 16 rounds from now (indices taken mod 16)
+       ifelse(`$10', `1', `
+               vshasigmad      SIGE, IV(($9 + 14) % 16), 0, 0b1111
+               vshasigmad      SIGA, IV(($9 + 1) % 16), 0, 0b0000
+               vaddudm         IV($9), IV($9), SIGE
+               vaddudm         IV($9), IV($9), SIGA
+               vaddudm         IV($9), IV($9), IV(($9 + 9) % 16)
+       ')
+')
+
+define(`EXTENDROUND',  `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9, 1)')
+define(`NOEXTENDROUND',        `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9, 0)')
+
+define(`NOEXTENDROUNDS', `
+       NOEXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 0)
+       NOEXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 1)
+       NOEXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 2)
+       NOEXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 3)
+
+       NOEXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 4)
+       NOEXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 5)
+       NOEXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 6)
+       NOEXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 7)
+
+       NOEXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 8)
+       NOEXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 9)
+       NOEXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 10)
+       NOEXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 11)
+
+       NOEXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 12)
+       NOEXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 13)
+       NOEXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 14)
+       NOEXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 15)
+')
+
+define(`EXTENDROUNDS', `
+       EXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 0)
+       EXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 1)
+       EXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 2)
+       EXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 3)
+
+       EXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 4)
+       EXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 5)
+       EXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 6)
+       EXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 7)
+
+       EXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 8)
+       EXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 9)
+       EXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 10)
+       EXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 11)
+
+       EXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 12)
+       EXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 13)
+       EXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 14)
+       EXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 15)
+')
+
+define(`LOAD', `
+       IF_BE(`lxvd2x   VSR(IV($1)), 0, INPUT')
+       IF_LE(`
+               lxvd2x  VSR(IV($1)), 0, INPUT
+               vperm   IV($1), IV($1), IV($1), VT0
+       ')
+       addi    INPUT, INPUT, 8
+')
+
+define(`DOLOADS', `
+       IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
+       LOAD(0)
+       LOAD(1)
+       LOAD(2)
+       LOAD(3)
+
+       LOAD(4)
+       LOAD(5)
+       LOAD(6)
+       LOAD(7)
+
+       LOAD(8)
+       LOAD(9)
+       LOAD(10)
+       LOAD(11)
+
+       LOAD(12)
+       LOAD(13)
+       LOAD(14)
+       LOAD(15)
+')
+
+.text
+PROLOGUE(_nettle_sha512_compress)
+
+       C Store non-volatile registers
+       subi    SP, SP, 64+(12*16)
+       std     T0,     24(SP)
+       std     T1,     16(SP)
+       std     COUNT,  8(SP)
+
+       li      T0, 32
+       stvx    20, 0, SP
+       subi    T0, T0, 16
+       stvx    21, T0, SP
+       subi    T0, T0, 16
+       stvx    22, T0, SP
+       subi    T0, T0, 16
+       stvx    23, T0, SP
+       subi    T0, T0, 16
+       stvx    24, T0, SP
+       subi    T0, T0, 16
+       stvx    25, T0, SP
+       subi    T0, T0, 16
+       stvx    26, T0, SP
+       subi    T0, T0, 16
+       stvx    27, T0, SP
+       subi    T0, T0, 16
+       stvx    28, T0, SP
+       subi    T0, T0, 16
+       stvx    29, T0, SP
+       subi    T0, T0, 16
+       stvx    30, T0, SP
+       subi    T0, T0, 16
+       stvx    31, T0, SP
+
+       C Load state values
+       li      T0, 16
+       lxvd2x  VSR(VSA), 0, STATE      C VSA contains A, B
+       lxvd2x  VSR(VSC), T0, STATE     C VSC contains C, D
+       addi    T0, T0, 16
+       lxvd2x  VSR(VSE), T0, STATE     C VSE contains E, F
+       addi    T0, T0, 16
+       lxvd2x  VSR(VSG), T0, STATE     C VSG contains G, H
+
+       li      TK, 0
+       lxvd2x  VSR(VK), TK, K
+       addi    TK, TK, 8 C might need to be moved, or use swizzle
+
+       DOLOADS
+
+       C Spread the packed state (A,B in VSA, ..., G,H in VSG) across VSA-VSH
+       vsldoi  VSB, VSA, VSA, 8
+       vsldoi  VSD, VSC, VSC, 8
+       vsldoi  VSF, VSE, VSE, 8
+       vsldoi  VSH, VSG, VSG, 8
+
+       EXTENDROUNDS
+       EXTENDROUNDS
+       EXTENDROUNDS
+       EXTENDROUNDS
+       NOEXTENDROUNDS
+
+       DATA_LOAD_VEC(VT4, .pack_lr, T0)
+
+       C Reload the initial state values from STATE
+       li      T0, 16
+       lxvd2x  VSR(VT0), 0, STATE
+       lxvd2x  VSR(VT1), T0, STATE
+       addi    T0, T0, 16
+       lxvd2x  VSR(VT2), T0, STATE
+       addi    T0, T0, 16
+       lxvd2x  VSR(VT3), T0, STATE
+
+       C Repack the state from VSA-VSH into VSA,VSC,VSE,VSG for storing
+       vperm   VSA, VSA, VSB, VT4
+       vperm   VSC, VSC, VSD, VT4
+       vperm   VSE, VSE, VSF, VT4
+       vperm   VSG, VSG, VSH, VT4
+
+       vaddudm VSA, VSA, VT0
+       vaddudm VSC, VSC, VT1
+       vaddudm VSE, VSE, VT2
+       vaddudm VSG, VSG, VT3
+
+       li      T0, 16
+       stxvd2x VSR(VSA), 0, STATE
+       stxvd2x VSR(VSC), T0, STATE
+       addi    T0, T0, 16
+       stxvd2x VSR(VSE), T0, STATE
+       addi    T0, T0, 16
+       stxvd2x VSR(VSG), T0, STATE
+
+       C Restore nonvolatile registers
+       li      T0, 32
+       lvx     20, 0, SP
+       subi    T0, T0, 16
+       lvx     21, T0, SP
+       subi    T0, T0, 16
+       lvx     22, T0, SP
+       subi    T0, T0, 16
+       lvx     23, T0, SP
+       subi    T0, T0, 16
+       lvx     24, T0, SP
+       subi    T0, T0, 16
+       lvx     25, T0, SP
+       subi    T0, T0, 16
+       lvx     26, T0, SP
+       subi    T0, T0, 16
+       lvx     27, T0, SP
+       subi    T0, T0, 16
+       lvx     28, T0, SP
+       subi    T0, T0, 16
+       lvx     29, T0, SP
+       subi    T0, T0, 16
+       lvx     30, T0, SP
+       subi    T0, T0, 16
+       lvx     31, T0, SP
+
+       ld      T0,     24(SP)
+       ld      T1,     16(SP)
+       ld      COUNT,  8(SP)
+       addi    SP, SP, 64+(12*16)
+
+       mr 3, INPUT
+
+       blr
+EPILOGUE(_nettle_sha512_compress)
+
+IF_LE(`
+.data
+.align 4
+.load_swap:
+       .byte 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7
+')
+.align 4
+.pack_lr:
+       IF_BE(`.byte 0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23')
+       IF_LE(`.byte 23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0')
-- 
2.44.0
