[PATCH v2 5/6] ecc: Add powerpc64 assembly for ecc_25519_modp
From: Martin Schwenke Signed-off-by: Martin Schwenke Signed-off-by: Alastair D'Silva --- powerpc64/ecc-curve25519-modp.asm | 101 ++ 1 file changed, 101 insertions(+) create mode 100644 powerpc64/ecc-curve25519-modp.asm diff --git a/powerpc64/ecc-curve25519-modp.asm b/powerpc64/ecc-curve25519-modp.asm new file mode 100644 index ..8d87eeaf --- /dev/null +++ b/powerpc64/ecc-curve25519-modp.asm @@ -0,0 +1,101 @@ +C powerpc64/ecc-25519-modp.asm + +ifelse(` + Copyright (C) 2021 Martin Schwenke & Alastair D´Silva, IBM Corporation + + Based on x86_64/ecc-25519-modp.asm + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+')
+
+       .file "ecc-25519-modp.asm"
+
+define(`RP', `r4')
+define(`XP', `r5')
+
+define(`U0', `r6')
+define(`U1', `r7')
+define(`U2', `r8')
+define(`U3', `r9')
+define(`T0', `r10')
+define(`T1', `r11')
+define(`M', `r12')
+
+define(`UN', `r3') C Overlaps unused modulo input
+       C Folds the 8 limbs at xp (byte offsets 0..56) into 4 limbs stored at rp.
+       C void ecc_curve25519_modp (const struct ecc_modulo *p, mp_limb_t *rp, mp_limb_t *xp)
+       .text
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_ecc_curve25519_modp)
+
+       C First fold the limbs affecting bit 255
+       ld      UN, 56(XP)      C xp[7]
+       li      M, 38           C 2^256 = 38 (mod 2^255 - 19)
+       mulhdu  T1, M, UN
+       mulld   UN, M, UN
+       ld      U3, 24(XP)
+       li      T0, 0
+       addc    U3, UN, U3
+       adde    T0, T1, T0      C <T0, U3> = xp[3] + 38 * xp[7]
+
+       ld      UN, 40(XP)      C xp[5]
+       mulhdu  U2, M, UN
+       mulld   UN, M, UN       C <U2, UN> = 38 * xp[5], added in below
+
+       addc    U3, U3, U3
+       adde    T0, T0, T0      C <T0, U3> shifted left one bit
+       srdi    U3, U3, 1       C Undo shift, clear high bit
+
+       C Fold the high limb again, together with XP[5]
+       li      T1, 19          C 2^255 = 19 (mod 2^255 - 19)
+       mulld   T0, T1, T0
+       ld      U0, 0(XP)
+       ld      U1, 8(XP)
+       ld      T1, 16(XP)
+       addc    U0, T0, U0
+       adde    U1, UN, U1
+       ld      T0, 32(XP)      C xp[4]
+       adde    U2, U2, T1      C U2 = high limb of 38 * xp[5], plus xp[2] and carry
+       addze   U3, U3
+
+       mulhdu  T1, M, T0
+       mulld   T0, M, T0       C <T1, T0> = 38 * xp[4]
+       addc    U0, T0, U0
+       adde    U1, T1, U1      C carry out is consumed by the adde into U2 below
+       std     U0, 0(RP)
+       std     U1, 8(RP)
+
+       ld      T0, 48(XP)      C xp[6]
+       mulhdu  T1, M, T0
+       mulld   UN, M, T0       C <T1, UN> = 38 * xp[6]
+       adde    U2, UN, U2
+       adde    U3, T1, U3
+       std     U2, 16(RP)
+       std     U3, 24(RP)
+
+       blr
+EPILOGUE(_nettle_ecc_curve25519_modp)
-- 2.34.1 ___ nettle-bugs mailing list nettle-bugs@lists.lysator.liu.se http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs
[PATCH v2 6/6] ecc: Add powerpc64 assembly for ecc_448_modp
From: Martin Schwenke Signed-off-by: Martin Schwenke Signed-off-by: Amitay Isaacs --- powerpc64/ecc-curve448-modp.asm | 174 1 file changed, 174 insertions(+) create mode 100644 powerpc64/ecc-curve448-modp.asm diff --git a/powerpc64/ecc-curve448-modp.asm b/powerpc64/ecc-curve448-modp.asm new file mode 100644 index ..42ed1eb1 --- /dev/null +++ b/powerpc64/ecc-curve448-modp.asm @@ -0,0 +1,174 @@ +C powerpc/ecc-curve448-modp.asm + +ifelse(` + Copyright (C) 2021 Martin Schwenke & Amitay Isaacs, IBM Corporation + + Based on x86_64/ecc-curve448-modp.asm + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+')
+
+       .file "ecc-curve448-modp.asm"
+
+define(`SP', `r1')
+
+define(`RP', `r4')
+define(`XP', `r5')
+
+define(`X0', `r3')
+define(`X1', `r9')
+define(`X2', `r10')
+define(`X3', `r11')
+define(`X4', `r12')
+define(`X5', `r14')
+define(`X6', `r15')
+define(`X7', `r16')
+define(`T0', `r6')
+define(`T1', `r7')
+define(`T2', `r8')
+define(`TT', `r17')
+
+define(`LO', `TT') C Overlap
+       C Folds the 14 limbs at xp (byte offsets 0..104) into 7 limbs stored at rp.
+       C void ecc_curve448_modp (const struct ecc_modulo *p, mp_limb_t *rp, mp_limb_t *xp)
+       .text
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_ecc_curve448_modp)
+
+       std     r14, -32(SP)    C Save callee-save r14-r17 below SP
+       std     r15, -24(SP)
+       std     r16, -16(SP)
+       std     r17, -8(SP)
+
+       C First load the values to be shifted by 32.
+       ld      T0, 88(XP)      C use for X0, X1
+       ld      T1, 96(XP)      C use for X2
+       ld      T2, 104(XP)     C use for X3
+       ld      X4, 56(XP)
+       ld      X5, 64(XP)
+       ld      X6, 72(XP)
+       ld      X7, 80(XP)
+
+       C Multiply by 2^32
+       sldi    X0, T0, 32      C <X7,...,X0> = 2^32 * <xp[10],...,xp[7], xp[13],...,xp[11]>
+       srdi    LO, T0, 32
+       sldi    X1, T1, 32
+       or      X1, X1, LO
+       srdi    LO, T1, 32
+       sldi    X2, T2, 32
+       or      X2, X2, LO
+       srdi    LO, T2, 32
+       sldi    X3, X4, 32
+       or      X3, X3, LO
+       srdi    LO, X4, 32
+       sldi    X4, X5, 32
+       or      X4, X4, LO
+       srdi    LO, X5, 32
+       sldi    X5, X6, 32
+       or      X5, X5, LO
+       srdi    LO, X6, 32
+       sldi    X6, X7, 32
+       or      X6, X6, LO
+
+       srdi    X7, X7, 32
+
+       C Multiply by 2
+       addc    T0, T0, T0      C <T2, T1, T0> = 2 * <xp[13], xp[12], xp[11]>
+       adde    T1, T1, T1
+       adde    T2, T2, T2
+       addze   X7, X7          C carry out goes to X7
+
+       C Main additions
+       ld      TT, 56(XP)      C <X3,...,X0> += <xp[10],...,xp[7]>
+       addc    X0, TT, X0
+       ld      TT, 64(XP)
+       adde    X1, TT, X1
+       ld      TT, 72(XP)
+       adde    X2, TT, X2
+       ld      TT, 80(XP)
+       adde    X3, TT, X3
+       adde    X4, T0, X4      C <X6, X5, X4> += <T2, T1, T0>
+       adde    X5, T1, X5
+       adde    X6, T2, X6
+       addze   X7, X7
+
+       ld      T0, 0(XP)       C <X6,...,X0> += <xp[6],...,xp[0]>
+       addc    X0, T0, X0
+       ld      T1, 8(XP)
+       adde    X1, T1, X1
+       ld      T2, 16(XP)
+       adde    X2, T2, X2
+       ld      TT, 24(XP)
+       adde    X3, TT, X3
+       ld      T0, 32(XP)
+       adde    X4, T0, X4
+       ld      T1, 40(XP)
+       adde    X5, T1, X5
+       ld      T2, 48(XP)
+       adde    X6, T2, X6
+       addze   X7, X7
+
+       C X7 wraparound
+       sldi    T0, X7, 32      C <T1, T0> = X7 * 2^32, added at limb 3
+       srdi    T1, X7, 32
+       li      T2, 0
+       addc    X0, X7, X0      C X7 itself is added at limb 0
+       addze   X1, X1
+       addze   X2, X2
+       adde    X3, T0, X3
+       adde    X4, T1, X4
+       addze   X5, X5
+       addze   X6, X6
+       addze   T2, T2          C T2 = carry out of X6
+
+       C Final carry wraparound. Carry T2 > 0 only if
+       C X6 is zero, so carry is absorbed.
+       sldi    T0, T2, 32
+
+       addc    X0, T2, X0
+       addze   X1, X1
+       addze   X2, X2
+       adde    X3, T0, X3
+       addze   X4, X4
+       addze   X5, X5
+       addze   X6, X6
+
+       std     X0, 0(RP)
+       std     X1, 8(RP)
+       std     X2, 16(RP)
+       std     X3, 24(RP)
+       std     X4, 32(RP)
+       std     X5, 40(RP)
+       std     X6, 48(RP)
+
+       ld      r14, -32(SP)    C Restore callee-save registers
+       ld      r15, -24(SP)
+       ld      r16, -16(SP)
+       ld      r17, -8(SP)
+
+       blr
+EPILOGUE(_nettle_ecc_curve448_modp)
-- 2.34.1 _
[PATCH v2 0/6] Add powerpc64 assembly for elliptic curves
Hi, This series of patches adds the powerpc64 assembly for modp/redc functions for elliptic curves P192, P224, P256, P384, P521, X25519 and X448. It results in 15-30% performance improvements as measured on a POWER9 system using hogweed-benchmark. I posted the modified code in the earlier email thread, but I think posting them as a separate series will make them easier to cherry-pick. V2 changes: - Use actual register names when storing/restoring from stack - Drop m4 definitions which are not in use - Simplify C2 folding for P192 curve Amitay Isaacs (2): ecc: Add powerpc64 assembly for ecc_192_modp ecc: Add powerpc64 assembly for ecc_224_modp Martin Schwenke (4): ecc: Add powerpc64 assembly for ecc_384_modp ecc: Add powerpc64 assembly for ecc_521_modp ecc: Add powerpc64 assembly for ecc_25519_modp ecc: Add powerpc64 assembly for ecc_448_modp powerpc64/ecc-curve25519-modp.asm | 101 + powerpc64/ecc-curve448-modp.asm | 174 +++ powerpc64/ecc-secp192r1-modp.asm | 87 powerpc64/ecc-secp224r1-modp.asm | 123 powerpc64/ecc-secp384r1-modp.asm | 227 ++ powerpc64/ecc-secp521r1-modp.asm | 166 ++ 6 files changed, 878 insertions(+) create mode 100644 powerpc64/ecc-curve25519-modp.asm create mode 100644 powerpc64/ecc-curve448-modp.asm create mode 100644 powerpc64/ecc-secp192r1-modp.asm create mode 100644 powerpc64/ecc-secp224r1-modp.asm create mode 100644 powerpc64/ecc-secp384r1-modp.asm create mode 100644 powerpc64/ecc-secp521r1-modp.asm -- 2.34.1 ___ nettle-bugs mailing list nettle-bugs@lists.lysator.liu.se http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs
[PATCH v2 2/6] ecc: Add powerpc64 assembly for ecc_224_modp
Signed-off-by: Amitay Isaacs --- powerpc64/ecc-secp224r1-modp.asm | 123 +++ 1 file changed, 123 insertions(+) create mode 100644 powerpc64/ecc-secp224r1-modp.asm diff --git a/powerpc64/ecc-secp224r1-modp.asm b/powerpc64/ecc-secp224r1-modp.asm new file mode 100644 index ..e4bbf366 --- /dev/null +++ b/powerpc64/ecc-secp224r1-modp.asm @@ -0,0 +1,123 @@ +C powerpc64/ecc-secp224r1-modp.asm + +ifelse(` + Copyright (C) 2021 Amitay Isaacs, IBM Corporation + + Based on x86_64/ecc-secp224r1-modp.asm + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+')
+
+       .file "ecc-secp224r1-modp.asm"
+
+define(`SP', `r1')
+
+define(`RP', `r4')
+define(`XP', `r5')
+
+define(`T0', `r6')
+define(`T1', `r7')
+define(`H0', `r8')
+define(`H1', `r9')
+define(`H2', `r10')
+define(`F0', `r11')
+define(`F1', `r12')
+define(`F2', `r14')
+define(`T2', `r3')
+       C Folds the 8 limbs at xp (byte offsets 0..56) into 4 limbs stored at rp.
+       C void ecc_secp224r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, mp_limb_t *xp)
+       .text
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_ecc_secp224r1_modp)
+       std     r14, -8(SP)     C Save callee-save r14 (F2) below SP
+
+       ld      H0, 48(XP)      C xp[6]
+       ld      H1, 56(XP)      C xp[7]
+       C set (F2, F1, F0) <-- (H1, H0) << 32
+       sldi    F0, H0, 32
+       srdi    F1, H0, 32
+       sldi    T0, H1, 32
+       srdi    F2, H1, 32
+       or      F1, T0, F1
+
+       li      H2, 0
+       ld      T0, 16(XP)
+       ld      T1, 24(XP)
+       subfc   T0, F0, T0      C <H1, H0, T1, T0> = <H1, H0, xp[3], xp[2]> - <F2, F1, F0>
+       subfe   T1, F1, T1
+       subfe   H0, F2, H0
+       addme   H1, H1          C propagate borrow
+
+       ld      T2, 32(XP)
+       addc    H0, T2, H0      C <H2, H1, H0> += <xp[5], xp[4]>
+       ld      T2, 40(XP)
+       adde    H1, T2, H1
+       addze   H2, H2
+
+       C Set (F2, F1, F0) <-- (H2, H1, H0) << 32
+       sldi    F0, H0, 32
+       srdi    F1, H0, 32
+       addc    H0, T0, H0      C interleaved: <H2, H1, H0> += <T1, T0>
+       sldi    T0, H1, 32      C the shifts read the values from before that addition
+       srdi    F2, H1, 32
+       adde    H1, T1, H1
+       sldi    T1, H2, 32
+       addze   H2, H2
+       or      F1, T0, F1
+       or      F2, T1, F2
+
+       ld      T0, 0(XP)
+       ld      T1, 8(XP)
+       subfc   T0, F0, T0      C <H2, H1, H0, T1, T0> = <H2, H1, H0, xp[1], xp[0]> - <F2, F1, F0>
+       subfe   T1, F1, T1
+       subfe   H0, F2, H0
+       addme   H1, H1
+       addme   H2, H2
+
+       srdi    F0, H1, 32      C F0 = <H2, H1> >> 32
+       sldi    F1, H2, 32
+       or      F0, F1, F0
+       clrrdi  F1, H1, 32      C F1 = H1 with the low 32 bits cleared
+       mr      F2, H2
+       clrldi  H1, H1, 32      C H1 keeps only its low 32 bits
+
+       subfc   T0, F0, T0      C T0 -= F0, borrow propagated into <F2, F1>
+       addme   F1, F1
+       addme   F2, F2
+       addc    T1, F1, T1      C <H1, H0, T1> += <F2, F1>
+       adde    H0, F2, H0
+       addze   H1, H1
+
+       std     T0, 0(RP)
+       std     T1, 8(RP)
+       std     H0, 16(RP)
+       std     H1, 24(RP)
+
+       ld      r14, -8(SP)     C Restore r14
+
+       blr
+EPILOGUE(_nettle_ecc_secp224r1_modp)
-- 2.34.1 ___ nettle-bugs mailing list nettle-bugs@lists.lysator.liu.se http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs
[PATCH v2 4/6] ecc: Add powerpc64 assembly for ecc_521_modp
From: Martin Schwenke Signed-off-by: Martin Schwenke Signed-off-by: Alastair D'Silva --- powerpc64/ecc-secp521r1-modp.asm | 166 +++ 1 file changed, 166 insertions(+) create mode 100644 powerpc64/ecc-secp521r1-modp.asm diff --git a/powerpc64/ecc-secp521r1-modp.asm b/powerpc64/ecc-secp521r1-modp.asm new file mode 100644 index ..e989f9cf --- /dev/null +++ b/powerpc64/ecc-secp521r1-modp.asm @@ -0,0 +1,166 @@ +C powerpc64/ecc-secp521r1-modp.asm + +ifelse(` + Copyright (C) 2021 Martin Schwenke & Alastair D´Silva, IBM Corporation + + Based on x86_64/ecc-secp521r1-modp.asm + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+')
+
+       .file "ecc-secp521r1-modp.asm"
+
+define(`SP', `r1')
+
+define(`RP', `r4')
+define(`XP', `r5')
+
+define(`U0', `r6')
+define(`U1', `r7')
+define(`U2', `r8')
+define(`U3', `r9')
+define(`U4', `r10')
+define(`U5', `r11')
+define(`U6', `r12')
+define(`U7', `r14')
+define(`U8', `r15')
+define(`U9', `r16')
+
+define(`T0', `r3')
+define(`T1', `r17')
+
+       C Folds the 18 limbs at xp (byte offsets 0..136) into 9 limbs stored at rp.
+       C void ecc_secp521r1_modp (const struct ecc_modulo *p, mp_limb_t *rp, mp_limb_t *xp)
+       .text
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_ecc_secp521r1_modp)
+
+       std     r14, -32(SP)    C Save callee-save r14-r17 below SP
+       std     r15, -24(SP)
+       std     r16, -16(SP)
+       std     r17, -8(SP)
+
+       C Read top 9 limbs (xp[9] .. xp[17]), shift left 55 bits
+       ld      U1, 72(XP)
+       sldi    U0, U1, 55
+       srdi    U1, U1, 9
+
+       ld      T0, 80(XP)
+       srdi    U2, T0, 9
+       sldi    T0, T0, 55
+       or      U1, T0, U1
+
+       ld      T0, 88(XP)
+       srdi    U3, T0, 9
+       sldi    T0, T0, 55
+       or      U2, T0, U2
+
+       ld      T0, 96(XP)
+       srdi    U4, T0, 9
+       sldi    T0, T0, 55
+       or      U3, T0, U3
+
+       ld      T0, 104(XP)
+       srdi    U5, T0, 9
+       sldi    T0, T0, 55
+       or      U4, T0, U4
+
+       ld      T0, 112(XP)
+       srdi    U6, T0, 9
+       sldi    T0, T0, 55
+       or      U5, T0, U5
+
+       ld      T0, 120(XP)
+       srdi    U7, T0, 9
+       sldi    T0, T0, 55
+       or      U6, T0, U6
+
+       ld      T0, 128(XP)
+       srdi    U8, T0, 9
+       sldi    T0, T0, 55
+       or      U7, T0, U7
+
+       ld      T0, 136(XP)
+       srdi    U9, T0, 9
+       sldi    T0, T0, 55
+       or      U8, T0, U8
+
+       ld      T0, 0(XP)       C <U9,...,U0> += <xp[8],...,xp[0]>
+       ld      T1, 8(XP)
+       addc    U0, T0, U0
+       adde    U1, T1, U1
+       ld      T0, 16(XP)
+       ld      T1, 24(XP)
+       adde    U2, T0, U2
+       adde    U3, T1, U3
+       ld      T0, 32(XP)
+       ld      T1, 40(XP)
+       adde    U4, T0, U4
+       adde    U5, T1, U5
+       ld      T0, 48(XP)
+       ld      T1, 56(XP)
+       adde    U6, T0, U6
+       adde    U7, T1, U7
+       ld      T0, 64(XP)
+       adde    U8, T0, U8
+       addze   U9, U9
+
+       C Top limbs are <U9, U8>. Keep low 9 bits of U8, and fold the
+       C top bits (at most 65 bits).
+       srdi    T0, U8, 9
+       andi.   U8, U8, 0x1ff
+       srdi    T1, U9, 9
+       sldi    U9, U9, 55
+       or      T0, U9, T0      C <T1, T0> = <U9, U8> >> 9
+
+       addc    U0, T0, U0
+       adde    U1, T1, U1
+       addze   U2, U2
+       addze   U3, U3
+       addze   U4, U4
+       addze   U5, U5
+       addze   U6, U6
+       addze   U7, U7
+       addze   U8, U8
+
+       std     U0, 0(RP)
+       std     U1, 8(RP)
+       std     U2, 16(RP)
+       std     U3, 24(RP)
+       std     U4, 32(RP)
+       std     U5, 40(RP)
+       std     U6, 48(RP)
+       std     U7, 56(RP)
+       std     U8, 64(RP)
+
+       ld      r14, -32(SP)    C Restore callee-save registers
+       ld      r15, -24(SP)
+       ld      r16, -16(SP)
+       ld      r17, -8(SP)
+
+       blr
+EPILOGUE(_nettle_ecc_secp521r1_modp)
-- 2.34.1 ___ nettle-bugs mailing list nettle-bugs@lists.lysator.liu.se http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs
[PATCH v2 1/6] ecc: Add powerpc64 assembly for ecc_192_modp
Signed-off-by: Amitay Isaacs --- powerpc64/ecc-secp192r1-modp.asm | 87 1 file changed, 87 insertions(+) create mode 100644 powerpc64/ecc-secp192r1-modp.asm diff --git a/powerpc64/ecc-secp192r1-modp.asm b/powerpc64/ecc-secp192r1-modp.asm new file mode 100644 index ..ee38ec60 --- /dev/null +++ b/powerpc64/ecc-secp192r1-modp.asm @@ -0,0 +1,87 @@ +C powerpc64/ecc-secp192r1-modp.asm + +ifelse(` + Copyright (C) 2021 Amitay Isaacs, IBM Corporation + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+')
+
+       .file "ecc-secp192r1-modp.asm"
+
+define(`RP', `r4')
+define(`XP', `r5')
+
+define(`T0', `r6')
+define(`T1', `r7')
+define(`T2', `r8')
+define(`T3', `r9')
+define(`C1', `r10')
+define(`C2', `r11')
+       C Folds the 6 limbs at xp (byte offsets 0..40) into 3 limbs stored at rp.
+       C void ecc_secp192r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, mp_limb_t *xp)
+       .text
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_ecc_secp192r1_modp)
+       ld      T0, 0(XP)
+       ld      T1, 8(XP)
+       ld      T2, 16(XP)
+
+       li      C1, 0           C C1 accumulates carries out of T2
+       li      C2, 0
+
+       ld      T3, 24(XP)      C xp[3] is added to limbs 0 and 1
+       addc    T0, T3, T0
+       adde    T1, T3, T1
+       addze   T2, T2
+       addze   C1, C1
+
+       ld      T3, 32(XP)      C xp[4] is added to limbs 1 and 2
+       addc    T1, T3, T1
+       adde    T2, T3, T2
+       addze   C1, C1
+
+       ld      T3, 40(XP)      C xp[5] is added to limbs 0, 1 and 2
+       addc    T0, T3, T0
+       adde    T1, T3, T1
+       adde    T2, T3, T2
+       addze   C1, C1
+
+       addc    T0, C1, T0      C fold C1 into limbs 0 and 1
+       adde    T1, C1, T1
+       addze   T2, T2
+       addze   C2, C2          C C2 = carry out of that fold
+
+       addc    T0, C2, T0      C fold C2 into limbs 0 and 1
+       adde    T1, C2, T1
+       addze   T2, T2
+
+       std     T0, 0(RP)
+       std     T1, 8(RP)
+       std     T2, 16(RP)
+
+       blr
+EPILOGUE(_nettle_ecc_secp192r1_modp)
-- 2.34.1 ___ nettle-bugs mailing list nettle-bugs@lists.lysator.liu.se http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs
[PATCH v2 3/6] ecc: Add powerpc64 assembly for ecc_384_modp
From: Martin Schwenke Signed-off-by: Martin Schwenke Signed-off-by: Amitay Isaacs Signed-off-by: Alastair D'Silva --- powerpc64/ecc-secp384r1-modp.asm | 227 +++ 1 file changed, 227 insertions(+) create mode 100644 powerpc64/ecc-secp384r1-modp.asm diff --git a/powerpc64/ecc-secp384r1-modp.asm b/powerpc64/ecc-secp384r1-modp.asm new file mode 100644 index ..d673bf1e --- /dev/null +++ b/powerpc64/ecc-secp384r1-modp.asm @@ -0,0 +1,227 @@ +C powerpc64/ecc-secp384r1-modp.asm + +ifelse(` + Copyright (C) 2021 Martin Schwenke, Amitay Isaacs & Alastair D´Silva, IBM Corporation + + Based on x86_64/ecc-secp256r1-redc.asm + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + + .file "ecc-secp384r1-modp.asm" + +C Register usage: + +define(`SP', `r1') + +define(`RP', `r4') +define(`XP', `r5') + +define(`D5', `r6') +define(`T0', `r7') +define(`T1', `r8') +define(`T2', `r9') +define(`T3', `r10') +define(`T4', `r11') +define(`T5', `r12') +define(`H0', `r14') +define(`H1', `r15') +define(`H2', `r16') +define(`H3', `r17') +define(`H4', `r18') +define(`H5', `r19') +define(`C2', `r3') +define(`C0', H5) C Overlap +define(`TMP', XP) C Overlap + + + C void ecc_secp384r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, mp_limb_t *xp) + .text +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_ecc_secp384r1_modp) + + std r14, -48(SP) + std r15, -40(SP) + std r16, -32(SP) + std r17, -24(SP) + std r18, -16(SP) + std r19, -8(SP) + + C First get top 2 limbs, which need folding twice. + C B^10 = B^6 + B^4 + 2^32 (B-1)B^4. + C We handle the terms as follow: + C + C B^6: Folded immediatly. + C + C B^4: Delayed, added in in the next folding. + C + C 2^32(B-1) B^4: Low half limb delayed until the next + C folding. Top 1.5 limbs subtracted and shifter now, resulting + C in 2.5 limbs. The low limb saved in D5, high 1.5 limbs added + C in. 
+ + ld H4, 80(XP) + ld H5, 88(XP) + C Shift right 32 bits, into H1, H0 + srdiH1, H5, 32 + sldiD5, H5, 32 + srdiH0, H4, 32 + or H0, H0, D5 + + C H1 H0 + C - H1 H0 + C + C H1 H0 D5 + subfic D5, H0, 0 + subfe H0, H1, H0 + addme H1, H1 + + li C2, 0 + addcH0, H4, H0 + addeH1, H5, H1 + addze C2, C2 + + C Add in to high part + ld T1, 48(XP) + ld T2, 56(XP) + addcH0, T1, H0 + addeH1, T2, H1 + addze C2, C2 C Do C2 later + + C +1 term + ld T0, 0(XP) + ld T1, 8(XP) + ld T2, 16(XP) + ld T3, 24(XP) + ld T4, 32(XP) + ld T5, 40(XP) + ld H2, 64(XP) + ld H3, 72(XP) + addcT0, H0, T0 + addeT1, H1, T1 + addeT2, H2, T2 + addeT3, H3, T3 + addeT4, H4, T4 + addeT5, H5, T5 + li C0, 0 + addze C0, C0 + + C +B^2 term + addcT2, H0, T2 + addeT3, H1, T3 + addeT4, H2, T4 + addeT5, H3, T5 + addze C0, C0 + + C Shift left, including low half of H4 + sldiH4, H4, 32 + srdiTMP, H3, 32 + or H4, TMP, H4 + + sldiH3, H3, 32 + srdiTMP, H2, 32 + or H3, TMP, H3 + + sldiH2, H2, 32 + srdiTMP, H1, 32 + or H2, TMP, H2 + + sldiH1, H1, 32 + srdiTMP, H0, 32 + or H1, TMP, H1 + + sldiH0, H0, 32 + + C H4 H3 H2 H1 H0 0 + C - H4 H3 H2 H1 H0 + C --- + C H4 H3 H2 H1 H0 TMP + + subfic TMP, H0, 0 + subfe H0, H1, H0 + subfe H1, H2, H1 + subfe H2, H3, H2 + subfe H3, H4, H3 + addme H4, H4 + + addcT0, TMP, T0 + addeT1, H0, T1 +
Re: [PATCH 2/7] ecc: Add powerpc64 assembly for ecc_224_modp
Updated version using actual register names for storing and restoring from stack. Amitay. -- The manager administers, the leader innovates. The manager maintains, the leader develops. The manager relies on systems, the leader relies on people. The manager counts on controls, the leader counts on trust. The manager does things right, the leader does the right thing. - Fortune Magazine C powerpc64/ecc-secp224r1-modp.asm ifelse(` Copyright (C) 2021 Amitay Isaacs, IBM Corporation Based on x86_64/ecc-secp224r1-modp.asm This file is part of GNU Nettle. GNU Nettle is free software: you can redistribute it and/or modify it under the terms of either: * the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. or * the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. or both in parallel, as here. GNU Nettle is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received copies of the GNU General Public License and the GNU Lesser General Public License along with this program. If not, see http://www.gnu.org/licenses/. 
')

        .file "ecc-secp224r1-modp.asm"

define(`SP', `r1')

define(`RP', `r4')
define(`XP', `r5')

define(`T0', `r6')
define(`T1', `r7')
define(`H0', `r8')
define(`H1', `r9')
define(`H2', `r10')
define(`F0', `r11')
define(`F1', `r12')
define(`F2', `r14')
define(`T2', `r3')

        C Folds the 8 limbs at xp (byte offsets 0..56) into 4 limbs stored at rp.
        C The first argument (r3) is not read; r3 is reused as the temporary T2.
        C void ecc_secp224r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, mp_limb_t *xp)
        .text
define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_ecc_secp224r1_modp)
        std     r14, -8(SP)     C Save callee-save r14 (F2) below SP

        ld      H0, 48(XP)      C xp[6]
        ld      H1, 56(XP)      C xp[7]
        C set (F2, F1, F0) <-- (H1, H0) << 32
        sldi    F0, H0, 32
        srdi    F1, H0, 32
        sldi    T0, H1, 32
        srdi    F2, H1, 32
        or      F1, T0, F1

        li      H2, 0
        ld      T0, 16(XP)
        ld      T1, 24(XP)
        C <H1, H0, T1, T0> = <H1, H0, xp[3], xp[2]> - <F2, F1, F0>
        subfc   T0, F0, T0
        subfe   T1, F1, T1
        subfe   H0, F2, H0
        addme   H1, H1          C propagate borrow

        C <H2, H1, H0> += <xp[5], xp[4]>
        ld      T2, 32(XP)
        addc    H0, T2, H0
        ld      T2, 40(XP)
        adde    H1, T2, H1
        addze   H2, H2

        C Set (F2, F1, F0) <-- (H2, H1, H0) << 32
        C Interleaved with <H2, H1, H0> += <T1, T0>; each shift reads
        C its source register before the addition updates it.
        sldi    F0, H0, 32
        srdi    F1, H0, 32
        addc    H0, T0, H0
        sldi    T0, H1, 32
        srdi    F2, H1, 32
        adde    H1, T1, H1
        sldi    T1, H2, 32
        addze   H2, H2
        or      F1, T0, F1
        or      F2, T1, F2

        ld      T0, 0(XP)
        ld      T1, 8(XP)
        C <H2, H1, H0, T1, T0> = <H2, H1, H0, xp[1], xp[0]> - <F2, F1, F0>
        subfc   T0, F0, T0
        subfe   T1, F1, T1
        subfe   H0, F2, H0
        addme   H1, H1
        addme   H2, H2

        srdi    F0, H1, 32      C F0 = <H2, H1> >> 32
        sldi    F1, H2, 32
        or      F0, F1, F0
        clrrdi  F1, H1, 32      C F1 = H1 with the low 32 bits cleared
        mr      F2, H2
        clrldi  H1, H1, 32      C H1 keeps only its low 32 bits

        subfc   T0, F0, T0      C T0 -= F0, borrow propagated into <F2, F1>
        addme   F1, F1
        addme   F2, F2
        addc    T1, F1, T1      C <H1, H0, T1> += <F2, F1>
        adde    H0, F2, H0
        addze   H1, H1

        std     T0, 0(RP)
        std     T1, 8(RP)
        std     H0, 16(RP)
        std     H1, 24(RP)

        ld      r14, -8(SP)     C Restore r14

        blr
EPILOGUE(_nettle_ecc_secp224r1_modp)
___ nettle-bugs mailing list nettle-bugs@lists.lysator.liu.se http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs
Re: [PATCH 4/7] ecc: Add powerpc64 assembly for ecc_384_modp
On Tue, 2022-01-04 at 21:28 +0100, Niels Möller wrote: > > > +define(`FUNC_ALIGN', `5') > > +PROLOGUE(_nettle_ecc_secp384r1_modp) > > + > > + std H0, -48(SP) > > + std H1, -40(SP) > > + std H2, -32(SP) > > + std H3, -24(SP) > > + std H4, -16(SP) > > + std H5, -8(SP) > > I find it clearer to use register names rather than the m4 defines > for > save and restore of callee-save registers. Here's the modified code which uses the actual registers when saving and restoring from stack. Amitay. -- Before marriage, a man yearns for the woman he loves. After marriage, the 'Y' becomes silent. C powerpc64/ecc-secp384r1-modp.asm ifelse(` Copyright (C) 2021 Martin Schwenke, Amitay Isaacs & Alastair D´Silva, IBM Corporation Based on x86_64/ecc-secp256r1-redc.asm This file is part of GNU Nettle. GNU Nettle is free software: you can redistribute it and/or modify it under the terms of either: * the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. or * the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. or both in parallel, as here. GNU Nettle is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received copies of the GNU General Public License and the GNU Lesser General Public License along with this program. If not, see http://www.gnu.org/licenses/. 
') .file "ecc-secp384r1-modp.asm" C Register usage: define(`SP', `r1') define(`RP', `r4') define(`XP', `r5') define(`D5', `r6') define(`T0', `r7') define(`T1', `r8') define(`T2', `r9') define(`T3', `r10') define(`T4', `r11') define(`T5', `r12') define(`H0', `r14') define(`H1', `r15') define(`H2', `r16') define(`H3', `r17') define(`H4', `r18') define(`H5', `r19') define(`C2', `r3') define(`C0', H5)C Overlap define(`TMP', XP) C Overlap C void ecc_secp384r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, mp_limb_t *xp) .text define(`FUNC_ALIGN', `5') PROLOGUE(_nettle_ecc_secp384r1_modp) std r14, -48(SP) std r15, -40(SP) std r16, -32(SP) std r17, -24(SP) std r18, -16(SP) std r19, -8(SP) C First get top 2 limbs, which need folding twice. C B^10 = B^6 + B^4 + 2^32 (B-1)B^4. C We handle the terms as follow: C C B^6: Folded immediatly. C C B^4: Delayed, added in in the next folding. C C 2^32(B-1) B^4: Low half limb delayed until the next C folding. Top 1.5 limbs subtracted and shifter now, resulting C in 2.5 limbs. The low limb saved in D5, high 1.5 limbs added C in. 
ld H4, 80(XP) ld H5, 88(XP) C Shift right 32 bits, into H1, H0 srdiH1, H5, 32 sldiD5, H5, 32 srdiH0, H4, 32 or H0, H0, D5 C H1 H0 C - H1 H0 C C H1 H0 D5 subfic D5, H0, 0 subfe H0, H1, H0 addme H1, H1 li C2, 0 addcH0, H4, H0 addeH1, H5, H1 addze C2, C2 C Add in to high part ld T1, 48(XP) ld T2, 56(XP) addcH0, T1, H0 addeH1, T2, H1 addze C2, C2 C Do C2 later C +1 term ld T0, 0(XP) ld T1, 8(XP) ld T2, 16(XP) ld T3, 24(XP) ld T4, 32(XP) ld T5, 40(XP) ld H2, 64(XP) ld H3, 72(XP) addcT0, H0, T0 addeT1, H1, T1 addeT2, H2, T2 addeT3, H3, T3 addeT4, H4, T4 addeT5, H5, T5 li C0, 0 addze C0, C0 C +B^2 term addcT2, H0, T2 addeT3, H1, T3 addeT4, H2, T4 addeT5, H3, T5 addze C0, C0 C Shift left, including low half of H4 sldiH4, H4, 32 srdiTMP, H3, 32 or H4, TMP, H4 sldiH3, H3, 32 srdiTMP, H2, 32 or H3, TMP, H3 sldiH2, H2, 32 srdiTMP, H1, 32 or H2, TMP, H2 sldiH1, H1, 32 srdiTMP, H0, 32 or H1, TMP, H1 sldiH0, H0, 32 C H4 H3 H2 H1 H0 0 C - H4 H3 H2 H1 H0 C --- C H4 H3 H2 H1 H0 TMP subfic TMP, H0, 0 subfe H0, H1, H0 subfe H1, H2, H1 subfe H2, H3, H2 subfe
Re: [PATCH 1/7] ecc: Add powerpc64 assembly for ecc_192_modp
Here's the updated code for P192 curve after simplifying C2 folding. Amitay. -- Retirement: When you quit working just before your heart does. Retirement: When you quit working just before your heart does. C powerpc64/ecc-secp192r1-modp.asm ifelse(` Copyright (C) 2021 Amitay Isaacs, IBM Corporation This file is part of GNU Nettle. GNU Nettle is free software: you can redistribute it and/or modify it under the terms of either: * the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. or * the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. or both in parallel, as here. GNU Nettle is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received copies of the GNU General Public License and the GNU Lesser General Public License along with this program. If not, see http://www.gnu.org/licenses/. 
')

        .file "ecc-secp192r1-modp.asm"

define(`RP', `r4')
define(`XP', `r5')

define(`T0', `r6')
define(`T1', `r7')
define(`T2', `r8')
define(`T3', `r9')
define(`C1', `r10')
define(`C2', `r11')

        C Folds the 6 limbs at xp (byte offsets 0..40) into 3 limbs stored at rp.
        C The first argument (r3) is not read.
        C void ecc_secp192r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, mp_limb_t *xp)
        .text
define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_ecc_secp192r1_modp)
        ld      T0, 0(XP)
        ld      T1, 8(XP)
        ld      T2, 16(XP)

        li      C1, 0           C C1 accumulates carries out of T2
        li      C2, 0

        C xp[3] is added to limbs 0 and 1
        ld      T3, 24(XP)
        addc    T0, T3, T0
        adde    T1, T3, T1
        addze   T2, T2
        addze   C1, C1

        C xp[4] is added to limbs 1 and 2
        ld      T3, 32(XP)
        addc    T1, T3, T1
        adde    T2, T3, T2
        addze   C1, C1

        C xp[5] is added to limbs 0, 1 and 2
        ld      T3, 40(XP)
        addc    T0, T3, T0
        adde    T1, T3, T1
        adde    T2, T3, T2
        addze   C1, C1

        C Fold the accumulated carries C1 into limbs 0 and 1;
        C C2 collects the carry out of this fold.
        addc    T0, C1, T0
        adde    T1, C1, T1
        addze   T2, T2
        addze   C2, C2

        C Fold C2 the same way
        addc    T0, C2, T0
        adde    T1, C2, T1
        addze   T2, T2

        std     T0, 0(RP)
        std     T1, 8(RP)
        std     T2, 16(RP)

        blr
EPILOGUE(_nettle_ecc_secp192r1_modp)
___ nettle-bugs mailing list nettle-bugs@lists.lysator.liu.se http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs
Re: [Arm64, S390x] Optimize Chacha20
On Thu, Jan 20, 2022 at 11:08 PM Maamoun TK wrote: > On Thu, Jan 20, 2022 at 10:32 PM Niels Möller > wrote: > >> Maamoun TK writes: >> >> > As far as I understand, SIMD is called Advanced SIMD on AArch64 and it's >> > standard for this architecture. simd is enabled by default in GCC but it >> > can be disabled with nosimd option as I can see in here >> > https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html which is why I >> made >> > a specific config option for it. >> >> If it's present on all known aarch64 systems (and HWCAP_ASIMD flag >> always set), I think we can keep things simpler and use the code >> unconditionally, with no extra subdir, no fat build function pointers or >> configure flag. >> > > Ok, I'll commit the changes with vanilla assembly files. > Done! The MR is updated https://git.lysator.liu.se/nettle/nettle/-/merge_requests/37 regards, Mamone > > >> I've pushed the merge button for the s390x merge request. >> > > Nice! I've made various tests on each core function so merging the changes > is gonna be ok. > > In another topic, I'm making experiments on your poly1305 optimizing tips > and I'll get back to you once I'm up to something. > > regards, > Mamone > > Regards, >> /Niels >> >> -- >> Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677. >> Internet email is subject to wholesale government surveillance. >> > ___ nettle-bugs mailing list nettle-bugs@lists.lysator.liu.se http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs
Re: [Arm64, S390x] Optimize Chacha20
On Thu, Jan 20, 2022 at 10:32 PM Niels Möller wrote: > Maamoun TK writes: > > > As far as I understand, SIMD is called Advanced SIMD on AArch64 and it's > > standard for this architecture. simd is enabled by default in GCC but it > > can be disabled with nosimd option as I can see in here > > https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html which is why I > made > > a specific config option for it. > > If it's present on all known aarch64 systems (and HWCAP_ASIMD flag > always set), I think we can keep things simpler and use the code > unconditionally, with no extra subdir, no fat build function pointers or > configure flag. > Ok, I'll commit the changes with vanilla assembly files. > I've pushed the merge button for the s390x merge request. > Nice! I've made various tests on each core function so merging the changes is gonna be ok. In another topic, I'm making experiments on your poly1305 optimizing tips and I'll get back to you once I'm up to something. regards, Mamone Regards, > /Niels > > -- > Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677. > Internet email is subject to wholesale government surveillance. > ___ nettle-bugs mailing list nettle-bugs@lists.lysator.liu.se http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs
Re: [Arm64, S390x] Optimize Chacha20
Maamoun TK writes: > As far as I understand, SIMD is called Advanced SIMD on AArch64 and it's > standard for this architecture. simd is enabled by default in GCC but it > can be disabled with nosimd option as I can see in here > https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html which is why I made > a specific config option for it. If it's present on all known aarch64 systems (and HWCAP_ASIMD flag always set), I think we can keep things simpler and use the code unconditionally, with no extra subdir, no fat build function pointers or configure flag. I've pushed the merge button for the s390x merge request. Regards, /Niels -- Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677. Internet email is subject to wholesale government surveillance. ___ nettle-bugs mailing list nettle-bugs@lists.lysator.liu.se http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs
Re: [Arm64, PowerPC64, S390x] Optimize Poly1305
Maamoun TK writes: > Wider multiplication would improve the performance for 64-bit general > registers but as the case for the current SIMD implementation, the radix > 2^26 fits well there. If multiply throughput is the bottleneck, it makes sense to do as much work as possible per multiply. So I don't think I understand the benefits of interleaving, can you explain? Let's consider the 64-bit case, since that's less writing. B = 2^64 as usual. Then the state is H = h_2 B^2 + h_1 B + h_0 (with h_2 rather small, depending on how far we normalize for each block, let's assume at most 3 bits, or maybe even h_2 <= 4). R = r_1 B + r_0 By the spec, high 4 bits of both r_0 and r_1, and low 2 bits of r_1 are zero, which makes multiplication R H (mod p) particularly nice. We get R H = r_0 h_0 + B (r_1 h_0 + r_0 h_1) + B^2 (r_1 h_1 + r_0 h_2) + B^3 r_1 h_2 But then B^2 = 5/4 (mod p), and hence B^2 r_1 = 5 r_1 / 4 (mod p), where the "/ 4" is just shifting out the two low zero bits. So let r_1' = 5 r_1 / 4, R H = r_0 h_0 + r_1' h_1 + B (r_1 h_0 + r_0 h_1 + r_1' h_2 + B r_0 h_2) These are 4 long multiplications (64x64 --> 128) and two short, 64x64 --> 64, for the products involving h_2. (The 32-bit version would be 16 long multiplications and 4 short). From the zero high bits, we also get bounds on these terms, f_0 = r_0 h_0 + r_1' h_1 < 2^124 + 5*2^122 = 9*2^122 f_1 = r_1 h_0 + r_0 h_1 + r_1' h_2 + B r_0 h_2 < 2^125 + 5*2^61 + 2^127 So these two chains can be added together as 128-bit quantities with no overflow, in any order, there's plenty of parallelism. E.g., power vmsumudm might be useful. For final folding, we need to split f_1 into top 62 and low 66 bits, multiply low part by 5 (fits in 64 bits), and add into f_0, which still fits in 128 bits. And then take the top 64 bits of f_0 and add into f_1 (result <= 2^66 bits). The current C implementation uses radix 26, and 25 multiplies (32x32 --> 64) per block. And quite a lot of shifts. 
A radix 32 variant analogous to the above would need 16 long multiplies and 4 short. I'd expect that to be faster on most machines, but I'd have to try that out. In contrast, trying to use a similar scheme for multiplying by (r^2 (mod p)), as needed for an interleaved version, seems more expensive. There are several contributions to the cost: * First, the accumulation of products by power of B needs to take into account carry, as result can exceed 2^128, so one would need something closer to general schoolbook multiplication. * Second, since r^2 (mod p) may exceed 2^128, we need three words rather than two, so three more short multiplications to add in. * Third, we can't pre-divide key words by 4, since low bits are no longer guaranteed to be zero. This gives more expensive reduction, with more multiplies by 5. The first two points make smaller radix more attractive; if we need three words for both factors, we can distribute the bits to ensure some of the most significant bits are zero. > Since the loop of block iteration is moved to inside the assembly > implementation, computing one multiple of key at the function prologue > should be ok. For large messages, that's fine, but may add a significant cost for messages of just two blocks. Regards, /Niels -- Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677. Internet email is subject to wholesale government surveillance. ___ nettle-bugs mailing list nettle-bugs@lists.lysator.liu.se http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs