[PATCH v2 5/6] ecc: Add powerpc64 assembly for ecc_25519_modp

2022-01-20 Thread Amitay Isaacs
From: Martin Schwenke 

Signed-off-by: Martin Schwenke 
Signed-off-by: Alastair D'Silva 
---
 powerpc64/ecc-curve25519-modp.asm | 101 ++
 1 file changed, 101 insertions(+)
 create mode 100644 powerpc64/ecc-curve25519-modp.asm

diff --git a/powerpc64/ecc-curve25519-modp.asm 
b/powerpc64/ecc-curve25519-modp.asm
new file mode 100644
index ..8d87eeaf
--- /dev/null
+++ b/powerpc64/ecc-curve25519-modp.asm
@@ -0,0 +1,101 @@
+C powerpc64/ecc-curve25519-modp.asm
+
+ifelse(`
+   Copyright (C) 2021 Martin Schwenke & Alastair D´Silva, IBM Corporation
+
+   Based on x86_64/ecc-25519-modp.asm
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   or
+
+ * the GNU General Public License as published by the Free
+   Software Foundation; either version 2 of the License, or (at your
+   option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+   .file "ecc-curve25519-modp.asm"
+
+define(`RP', `r4')     C Result pointer, 4 limbs are stored
+define(`XP', `r5')     C Input pointer, 8 limbs are loaded
+
+define(`U0', `r6')
+define(`U1', `r7')
+define(`U2', `r8')
+define(`U3', `r9')
+define(`T0', `r10')
+define(`T1', `r11')
+define(`M', `r12')     C Holds the multiplier constant 38
+
+define(`UN', r3)   C Overlaps unused modulo input
+
+   C void ecc_curve25519_modp (const struct ecc_modulo *p, mp_limb_t *rp, mp_limb_t *xp)
+   .text
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_ecc_curve25519_modp)
+
+   C First fold the limbs affecting bit 255
+   ld      UN, 56(XP)      C UN = xp[7]
+   li      M, 38           C 2^256 = 38 (mod 2^255 - 19)
+   mulhdu  T1, M, UN       C T1 = high limb of 38 * xp[7]
+   mulld   UN, M, UN       C UN = low limb of 38 * xp[7]
+   ld      U3, 24(XP)      C U3 = xp[3]
+   li      T0, 0
+   addc    U3, UN, U3      C U3 += low limb, carry out in CA
+   adde    T0, T1, T0      C T0 = high limb + carry
+
+   ld      UN, 40(XP)      C UN = xp[5]
+   mulhdu  U2, M, UN       C U2 = high limb of 38 * xp[5]
+   mulld   UN, M, UN       C UN = low limb of 38 * xp[5]
+
+   addc    U3, U3, U3      C Shift T0:U3 left by one bit, so that
+   adde    T0, T0, T0      C T0 holds everything from bit 255 upwards
+   srdi    U3, U3, 1   C Undo shift, clear high bit
+
+   C Fold the high limb again, together with xp[5]
+   li      T1, 19          C 2^255 = 19 (mod 2^255 - 19)
+   mulld   T0, T1, T0      C T0 = 19 * (bits from 255 upwards)
+   ld      U0, 0(XP)       C U0 = xp[0]
+   ld      U1, 8(XP)       C U1 = xp[1]
+   ld      T1, 16(XP)      C T1 = xp[2]
+   addc    U0, T0, U0      C Start of carry chain U0 .. U3
+   adde    U1, UN, U1      C U1 += low limb of 38 * xp[5]
+   ld      T0, 32(XP)      C T0 = xp[4], loads leave CA alone
+   adde    U2, U2, T1      C U2 = xp[2] + high limb of 38 * xp[5]
+   addze   U3, U3
+
+   mulhdu  T1, M, T0
+   mulld   T0, M, T0       C T1:T0 = 38 * xp[4]
+   addc    U0, T0, U0      C New carry chain, added at limbs 0 and 1
+   adde    U1, T1, U1
+   std     U0, 0(RP)
+   std     U1, 8(RP)
+
+   ld      T0, 48(XP)      C T0 = xp[6]
+   mulhdu  T1, M, T0
+   mulld   UN, M, T0       C T1:UN = 38 * xp[6]
+   adde    U2, UN, U2      C Continues the carry chain from U1 above
+   adde    U3, T1, U3
+   std     U2, 16(RP)
+   std     U3, 24(RP)
+
+   blr
+EPILOGUE(_nettle_ecc_curve25519_modp)
-- 
2.34.1

___
nettle-bugs mailing list
nettle-bugs@lists.lysator.liu.se
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs


[PATCH v2 6/6] ecc: Add powerpc64 assembly for ecc_448_modp

2022-01-20 Thread Amitay Isaacs
From: Martin Schwenke 

Signed-off-by: Martin Schwenke 
Signed-off-by: Amitay Isaacs 
---
 powerpc64/ecc-curve448-modp.asm | 174 
 1 file changed, 174 insertions(+)
 create mode 100644 powerpc64/ecc-curve448-modp.asm

diff --git a/powerpc64/ecc-curve448-modp.asm b/powerpc64/ecc-curve448-modp.asm
new file mode 100644
index ..42ed1eb1
--- /dev/null
+++ b/powerpc64/ecc-curve448-modp.asm
@@ -0,0 +1,174 @@
+C powerpc64/ecc-curve448-modp.asm
+
+ifelse(`
+   Copyright (C) 2021 Martin Schwenke & Amitay Isaacs, IBM Corporation
+
+   Based on x86_64/ecc-curve448-modp.asm
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   or
+
+ * the GNU General Public License as published by the Free
+   Software Foundation; either version 2 of the License, or (at your
+   option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+   .file "ecc-curve448-modp.asm"
+
+define(`SP', `r1')
+
+define(`RP', `r4')     C Result pointer, 7 limbs are stored
+define(`XP', `r5')     C Input pointer, 14 limbs are loaded
+
+define(`X0', `r3')     C Reuses the register of the unused modulo argument
+define(`X1', `r9')
+define(`X2', `r10')
+define(`X3', `r11')
+define(`X4', `r12')
+define(`X5', `r14')
+define(`X6', `r15')
+define(`X7', `r16')
+define(`T0', `r6')
+define(`T1', `r7')
+define(`T2', `r8')
+define(`TT', `r17')
+
+define(`LO', `TT') C Overlap
+
+   C void ecc_curve448_modp (const struct ecc_modulo *p, mp_limb_t *rp, mp_limb_t *xp)
+   .text
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_ecc_curve448_modp)
+
+   std     r14, -32(SP)    C Save r14-r17 (X5, X6, X7, TT) below SP;
+   std     r15, -24(SP)    C SP itself is not moved
+   std     r16, -16(SP)
+   std     r17, -8(SP)
+
+   C First load the values to be shifted by 32.
+   ld      T0, 88(XP)  C use for X0, X1
+   ld      T1, 96(XP)  C use for X2
+   ld      T2, 104(XP) C use for X3
+   ld      X4, 56(XP)
+   ld      X5, 64(XP)
+   ld      X6, 72(XP)
+   ld      X7, 80(XP)
+
+   C Multiply by 2^32
+   sldi    X0, T0, 32
+   srdi    LO, T0, 32      C LO moves each high half into the next limb
+   sldi    X1, T1, 32
+   or      X1, X1, LO
+   srdi    LO, T1, 32
+   sldi    X2, T2, 32
+   or      X2, X2, LO
+   srdi    LO, T2, 32
+   sldi    X3, X4, 32      C X4 still holds xp[7] here
+   or      X3, X3, LO
+   srdi    LO, X4, 32
+   sldi    X4, X5, 32
+   or      X4, X4, LO
+   srdi    LO, X5, 32
+   sldi    X5, X6, 32
+   or      X5, X5, LO
+   srdi    LO, X6, 32
+   sldi    X6, X7, 32
+   or      X6, X6, LO
+
+   srdi    X7, X7, 32      C X7 = the 32 bits shifted out at the top
+
+   C Multiply by 2
+   addc    T0, T0, T0
+   adde    T1, T1, T1
+   adde    T2, T2, T2
+   addze   X7, X7          C Bit shifted out of T2 is collected in X7
+
+   C Main additions
+   ld      TT, 56(XP)      C Reload xp[7] .. xp[10], add at limbs 0 .. 3
+   addc    X0, TT, X0
+   ld      TT, 64(XP)
+   adde    X1, TT, X1
+   ld      TT, 72(XP)
+   adde    X2, TT, X2
+   ld      TT, 80(XP)
+   adde    X3, TT, X3
+   adde    X4, T0, X4      C Doubled xp[11] .. xp[13] at limbs 4 .. 6
+   adde    X5, T1, X5
+   adde    X6, T2, X6
+   addze   X7, X7          C Carry out is collected in X7
+
+   ld      T0, 0(XP)       C Add the low half xp[0] .. xp[6]
+   addc    X0, T0, X0
+   ld      T1, 8(XP)
+   adde    X1, T1, X1
+   ld      T2, 16(XP)
+   adde    X2, T2, X2
+   ld      TT, 24(XP)
+   adde    X3, TT, X3
+   ld      T0, 32(XP)
+   adde    X4, T0, X4
+   ld      T1, 40(XP)
+   adde    X5, T1, X5
+   ld      T2, 48(XP)
+   adde    X6, T2, X6
+   addze   X7, X7
+
+   C X7 wraparound
+   sldi    T0, X7, 32      C T1:T0 = X7 << 32
+   srdi    T1, X7, 32
+   li      T2, 0
+   addc    X0, X7, X0      C Add X7 at limb 0 ...
+   addze   X1, X1
+   addze   X2, X2
+   adde    X3, T0, X3      C ... and X7 << 32 at limbs 3 and 4
+   adde    X4, T1, X4
+   addze   X5, X5
+   addze   X6, X6
+   addze   T2, T2          C T2 = carry out of X6
+
+   C Final carry wraparound. Carry T2 > 0 only if
+   C X6 is zero, so carry is absorbed.
+   sldi    T0, T2, 32
+
+   addc    X0, T2, X0      C Same pattern as above: T2 at limb 0,
+   addze   X1, X1
+   addze   X2, X2
+   adde    X3, T0, X3      C and T2 << 32 at limb 3
+   addze   X4, X4
+   addze   X5, X5
+   addze   X6, X6
+
+   std     X0, 0(RP)
+   std     X1, 8(RP)
+   std     X2, 16(RP)
+   std     X3, 24(RP)
+   std     X4, 32(RP)
+   std     X5, 40(RP)
+   std     X6, 48(RP)
+
+   ld      r14, -32(SP)    C Restore the registers saved on entry
+   ld      r15, -24(SP)
+   ld      r16, -16(SP)
+   ld      r17, -8(SP)
+
+   blr
+EPILOGUE(_nettle_ecc_curve448_modp)
-- 
2.34.1

_

[PATCH v2 0/6] Add powerpc64 assembly for elliptic curves

2022-01-20 Thread Amitay Isaacs
Hi,

This series of patches adds powerpc64 assembly for the modp/redc functions
of the elliptic curves P192, P224, P256, P384, P521, X25519 and X448. It
results in 15-30% performance improvements as measured on a POWER9 system
using hogweed-benchmark.

I posted the modified code in the earlier email thread, but I think
posting it as a separate series will make the patches easier to cherry-pick.


V2 changes:
  - Use actual register names when storing/restoring from stack
  - Drop m4 definitions which are not in use
  - Simplify C2 folding for P192 curve

Amitay Isaacs (2):
  ecc: Add powerpc64 assembly for ecc_192_modp
  ecc: Add powerpc64 assembly for ecc_224_modp

Martin Schwenke (4):
  ecc: Add powerpc64 assembly for ecc_384_modp
  ecc: Add powerpc64 assembly for ecc_521_modp
  ecc: Add powerpc64 assembly for ecc_25519_modp
  ecc: Add powerpc64 assembly for ecc_448_modp

 powerpc64/ecc-curve25519-modp.asm | 101 +
 powerpc64/ecc-curve448-modp.asm   | 174 +++
 powerpc64/ecc-secp192r1-modp.asm  |  87 
 powerpc64/ecc-secp224r1-modp.asm  | 123 
 powerpc64/ecc-secp384r1-modp.asm  | 227 ++
 powerpc64/ecc-secp521r1-modp.asm  | 166 ++
 6 files changed, 878 insertions(+)
 create mode 100644 powerpc64/ecc-curve25519-modp.asm
 create mode 100644 powerpc64/ecc-curve448-modp.asm
 create mode 100644 powerpc64/ecc-secp192r1-modp.asm
 create mode 100644 powerpc64/ecc-secp224r1-modp.asm
 create mode 100644 powerpc64/ecc-secp384r1-modp.asm
 create mode 100644 powerpc64/ecc-secp521r1-modp.asm

-- 
2.34.1

___
nettle-bugs mailing list
nettle-bugs@lists.lysator.liu.se
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs


[PATCH v2 2/6] ecc: Add powerpc64 assembly for ecc_224_modp

2022-01-20 Thread Amitay Isaacs
Signed-off-by: Amitay Isaacs 
---
 powerpc64/ecc-secp224r1-modp.asm | 123 +++
 1 file changed, 123 insertions(+)
 create mode 100644 powerpc64/ecc-secp224r1-modp.asm

diff --git a/powerpc64/ecc-secp224r1-modp.asm b/powerpc64/ecc-secp224r1-modp.asm
new file mode 100644
index ..e4bbf366
--- /dev/null
+++ b/powerpc64/ecc-secp224r1-modp.asm
@@ -0,0 +1,123 @@
+C powerpc64/ecc-secp224r1-modp.asm
+
+ifelse(`
+   Copyright (C) 2021 Amitay Isaacs, IBM Corporation
+
+   Based on x86_64/ecc-secp224r1-modp.asm
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   or
+
+ * the GNU General Public License as published by the Free
+   Software Foundation; either version 2 of the License, or (at your
+   option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+   .file "ecc-secp224r1-modp.asm"
+
+define(`SP', `r1')
+
+define(`RP', `r4')     C Result pointer, 4 limbs are stored
+define(`XP', `r5')     C Input pointer, 8 limbs are loaded
+
+define(`T0', `r6')
+define(`T1', `r7')
+define(`H0', `r8')
+define(`H1', `r9')
+define(`H2', `r10')
+define(`F0', `r11')
+define(`F1', `r12')
+define(`F2', `r14')
+define(`T2', `r3')     C Reuses the register of the unused modulo argument
+
+   C void ecc_secp224r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, mp_limb_t *xp)
+   .text
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_ecc_secp224r1_modp)
+   std     r14, -8(SP)     C Save r14 (F2) below SP; SP is not moved
+
+   ld      H0, 48(XP)      C H0 = xp[6]
+   ld      H1, 56(XP)      C H1 = xp[7]
+   C set (F2, F1, F0) <-- (H1, H0) << 32
+   sldi    F0, H0, 32
+   srdi    F1, H0, 32
+   sldi    T0, H1, 32
+   srdi    F2, H1, 32
+   or      F1, T0, F1
+
+   li      H2, 0
+   ld      T0, 16(XP)      C T0 = xp[2]
+   ld      T1, 24(XP)      C T1 = xp[3]
+   subfc   T0, F0, T0      C T0 = T0 - F0, CA clear on borrow
+   subfe   T1, F1, T1      C T1 = T1 - F1 - borrow
+   subfe   H0, F2, H0      C H0 = H0 - F2 - borrow
+   addme   H1, H1          C H1 = H1 - borrow
+
+   ld      T2, 32(XP)
+   addc    H0, T2, H0      C H0 += xp[4]
+   ld      T2, 40(XP)
+   adde    H1, T2, H1      C H1 += xp[5] + carry
+   addze   H2, H2          C H2 = carry out
+
+   C Set (F2, F1, F0) <-- (H2, H1, H0) << 32
+   sldi    F0, H0, 32
+   srdi    F1, H0, 32
+   addc    H0, T0, H0      C Interleaved: (H2, H1, H0) += (T1, T0)
+   sldi    T0, H1, 32      C Shifts leave CA alone; H1 not yet updated
+   srdi    F2, H1, 32
+   adde    H1, T1, H1
+   sldi    T1, H2, 32      C Uses H2 before the carry is added to it
+   addze   H2, H2
+   or      F1, T0, F1
+   or      F2, T1, F2
+
+   ld      T0, 0(XP)       C T0 = xp[0]
+   ld      T1, 8(XP)       C T1 = xp[1]
+   subfc   T0, F0, T0      C (H2, H1, H0, T1, T0) -= (F2, F1, F0)
+   subfe   T1, F1, T1
+   subfe   H0, F2, H0
+   addme   H1, H1
+   addme   H2, H2
+
+   srdi    F0, H1, 32
+   sldi    F1, H2, 32
+   or      F0, F1, F0      C F0 = high half of H1 | low half of H2 << 32
+   clrrdi  F1, H1, 32      C F1 = H1 with the low 32 bits cleared
+   mr      F2, H2
+   clrldi  H1, H1, 32      C H1 keeps only its low 32 bits
+
+   subfc   T0, F0, T0      C T0 -= F0
+   addme   F1, F1          C Borrow is taken out of (F2, F1)
+   addme   F2, F2
+   addc    T1, F1, T1      C (H1, H0, T1) += (F2, F1)
+   adde    H0, F2, H0
+   addze   H1, H1
+
+   std     T0, 0(RP)
+   std     T1, 8(RP)
+   std     H0, 16(RP)
+   std     H1, 24(RP)
+
+   ld      r14, -8(SP)     C Restore r14
+
+   blr
+EPILOGUE(_nettle_ecc_secp224r1_modp)
-- 
2.34.1

___
nettle-bugs mailing list
nettle-bugs@lists.lysator.liu.se
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs


[PATCH v2 4/6] ecc: Add powerpc64 assembly for ecc_521_modp

2022-01-20 Thread Amitay Isaacs
From: Martin Schwenke 

Signed-off-by: Martin Schwenke 
Signed-off-by: Alastair D'Silva 
---
 powerpc64/ecc-secp521r1-modp.asm | 166 +++
 1 file changed, 166 insertions(+)
 create mode 100644 powerpc64/ecc-secp521r1-modp.asm

diff --git a/powerpc64/ecc-secp521r1-modp.asm b/powerpc64/ecc-secp521r1-modp.asm
new file mode 100644
index ..e989f9cf
--- /dev/null
+++ b/powerpc64/ecc-secp521r1-modp.asm
@@ -0,0 +1,166 @@
+C powerpc64/ecc-secp521r1-modp.asm
+
+ifelse(`
+   Copyright (C) 2021 Martin Schwenke & Alastair D´Silva, IBM Corporation
+
+   Based on x86_64/ecc-secp521r1-modp.asm
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   or
+
+ * the GNU General Public License as published by the Free
+   Software Foundation; either version 2 of the License, or (at your
+   option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+   .file "ecc-secp521r1-modp.asm"
+
+define(`SP', `r1')
+
+define(`RP', `r4')     C Result pointer, 9 limbs are stored
+define(`XP', `r5')     C Input pointer, 18 limbs are loaded
+
+define(`U0', `r6')
+define(`U1', `r7')
+define(`U2', `r8')
+define(`U3', `r9')
+define(`U4', `r10')
+define(`U5', `r11')
+define(`U6', `r12')
+define(`U7', `r14')
+define(`U8', `r15')
+define(`U9', `r16')
+
+define(`T0', `r3')     C Reuses the register of the unused modulo argument
+define(`T1', `r17')
+
+
+   C void ecc_secp521r1_modp (const struct ecc_modulo *p, mp_limb_t *rp, mp_limb_t *xp)
+   .text
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_ecc_secp521r1_modp)
+
+   std     r14, -32(SP)    C Save r14-r17 (U7, U8, U9, T1) below SP;
+   std     r15, -24(SP)    C SP itself is not moved
+   std     r16, -16(SP)
+   std     r17, -8(SP)
+
+   C Read the top 9 limbs xp[9] .. xp[17], shift left 55 bits
+   ld      U1, 72(XP)
+   sldi    U0, U1, 55      C U0 = low 9 bits of xp[9], moved to the top
+   srdi    U1, U1, 9       C Remaining bits start the next limb
+
+   ld      T0, 80(XP)
+   srdi    U2, T0, 9
+   sldi    T0, T0, 55
+   or      U1, T0, U1
+
+   ld      T0, 88(XP)
+   srdi    U3, T0, 9
+   sldi    T0, T0, 55
+   or      U2, T0, U2
+
+   ld      T0, 96(XP)
+   srdi    U4, T0, 9
+   sldi    T0, T0, 55
+   or      U3, T0, U3
+
+   ld      T0, 104(XP)
+   srdi    U5, T0, 9
+   sldi    T0, T0, 55
+   or      U4, T0, U4
+
+   ld      T0, 112(XP)
+   srdi    U6, T0, 9
+   sldi    T0, T0, 55
+   or      U5, T0, U5
+
+   ld      T0, 120(XP)
+   srdi    U7, T0, 9
+   sldi    T0, T0, 55
+   or      U6, T0, U6
+
+   ld      T0, 128(XP)
+   srdi    U8, T0, 9
+   sldi    T0, T0, 55
+   or      U7, T0, U7
+
+   ld      T0, 136(XP)
+   srdi    U9, T0, 9       C U9 = bits shifted out at the top
+   sldi    T0, T0, 55
+   or      U8, T0, U8
+
+   ld      T0, 0(XP)       C Add the low 9 limbs xp[0] .. xp[8]
+   ld      T1, 8(XP)
+   addc    U0, T0, U0
+   adde    U1, T1, U1
+   ld      T0, 16(XP)
+   ld      T1, 24(XP)
+   adde    U2, T0, U2
+   adde    U3, T1, U3
+   ld      T0, 32(XP)
+   ld      T1, 40(XP)
+   adde    U4, T0, U4
+   adde    U5, T1, U5
+   ld      T0, 48(XP)
+   ld      T1, 56(XP)
+   adde    U6, T0, U6
+   adde    U7, T1, U7
+   ld      T0, 64(XP)
+   adde    U8, T0, U8
+   addze   U9, U9          C Carry out goes into U9
+
+   C Top limbs are <U9, U8>. Keep low 9 bits of 8, and fold the
+   C top bits (at most 65 bits).
+   srdi    T0, U8, 9
+   andi.   U8, U8, 0x1ff   C Record form, so CR0 is written as well
+   srdi    T1, U9, 9
+   sldi    U9, U9, 55
+   or      T0, U9, T0      C T1:T0 = (U9:U8) >> 9
+
+   addc    U0, T0, U0      C Add the folded bits at limbs 0 and 1
+   adde    U1, T1, U1
+   addze   U2, U2          C Propagate the carry up to U8
+   addze   U3, U3
+   addze   U4, U4
+   addze   U5, U5
+   addze   U6, U6
+   addze   U7, U7
+   addze   U8, U8
+
+   std     U0, 0(RP)
+   std     U1, 8(RP)
+   std     U2, 16(RP)
+   std     U3, 24(RP)
+   std     U4, 32(RP)
+   std     U5, 40(RP)
+   std     U6, 48(RP)
+   std     U7, 56(RP)
+   std     U8, 64(RP)
+
+   ld      r14, -32(SP)    C Restore the registers saved on entry
+   ld      r15, -24(SP)
+   ld      r16, -16(SP)
+   ld      r17, -8(SP)
+
+   blr
+EPILOGUE(_nettle_ecc_secp521r1_modp)
-- 
2.34.1

___
nettle-bugs mailing list
nettle-bugs@lists.lysator.liu.se
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs


[PATCH v2 1/6] ecc: Add powerpc64 assembly for ecc_192_modp

2022-01-20 Thread Amitay Isaacs
Signed-off-by: Amitay Isaacs 
---
 powerpc64/ecc-secp192r1-modp.asm | 87 
 1 file changed, 87 insertions(+)
 create mode 100644 powerpc64/ecc-secp192r1-modp.asm

diff --git a/powerpc64/ecc-secp192r1-modp.asm b/powerpc64/ecc-secp192r1-modp.asm
new file mode 100644
index ..ee38ec60
--- /dev/null
+++ b/powerpc64/ecc-secp192r1-modp.asm
@@ -0,0 +1,87 @@
+C powerpc64/ecc-secp192r1-modp.asm
+
+ifelse(`
+   Copyright (C) 2021 Amitay Isaacs, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   or
+
+ * the GNU General Public License as published by the Free
+   Software Foundation; either version 2 of the License, or (at your
+   option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+   .file "ecc-secp192r1-modp.asm"
+
+define(`RP', `r4')     C Result pointer, 3 limbs are stored
+define(`XP', `r5')     C Input pointer, 6 limbs are loaded
+
+define(`T0', `r6')
+define(`T1', `r7')
+define(`T2', `r8')
+define(`T3', `r9')
+define(`C1', `r10')
+define(`C2', `r11')
+
+   C void ecc_secp192r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, mp_limb_t *xp)
+   .text
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_ecc_secp192r1_modp)
+   ld      T0, 0(XP)       C (T2, T1, T0) = xp[2], xp[1], xp[0]
+   ld      T1, 8(XP)
+   ld      T2, 16(XP)
+
+   li      C1, 0           C C1 collects carries out of T2
+   li      C2, 0
+
+   ld      T3, 24(XP)      C xp[3] is added at limbs 0 and 1
+   addc    T0, T3, T0
+   adde    T1, T3, T1
+   addze   T2, T2
+   addze   C1, C1
+
+   ld      T3, 32(XP)      C xp[4] is added at limbs 1 and 2
+   addc    T1, T3, T1
+   adde    T2, T3, T2
+   addze   C1, C1
+
+   ld      T3, 40(XP)      C xp[5] is added at limbs 0, 1 and 2
+   addc    T0, T3, T0
+   adde    T1, T3, T1
+   adde    T2, T3, T2
+   addze   C1, C1
+
+   addc    T0, C1, T0      C Fold C1 like xp[3], at limbs 0 and 1
+   adde    T1, C1, T1
+   addze   T2, T2
+   addze   C2, C2          C C2 = carry out of this fold
+
+   addc    T0, C2, T0      C Fold C2 the same way
+   adde    T1, C2, T1
+   addze   T2, T2
+
+   std     T0, 0(RP)
+   std     T1, 8(RP)
+   std     T2, 16(RP)
+
+   blr
+EPILOGUE(_nettle_ecc_secp192r1_modp)
-- 
2.34.1

___
nettle-bugs mailing list
nettle-bugs@lists.lysator.liu.se
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs


[PATCH v2 3/6] ecc: Add powerpc64 assembly for ecc_384_modp

2022-01-20 Thread Amitay Isaacs
From: Martin Schwenke 

Signed-off-by: Martin Schwenke 
Signed-off-by: Amitay Isaacs 
Signed-off-by: Alastair D'Silva 
---
 powerpc64/ecc-secp384r1-modp.asm | 227 +++
 1 file changed, 227 insertions(+)
 create mode 100644 powerpc64/ecc-secp384r1-modp.asm

diff --git a/powerpc64/ecc-secp384r1-modp.asm b/powerpc64/ecc-secp384r1-modp.asm
new file mode 100644
index ..d673bf1e
--- /dev/null
+++ b/powerpc64/ecc-secp384r1-modp.asm
@@ -0,0 +1,227 @@
+C powerpc64/ecc-secp384r1-modp.asm
+
+ifelse(`
+   Copyright (C) 2021 Martin Schwenke, Amitay Isaacs & Alastair D´Silva, IBM 
Corporation
+
+   Based on x86_64/ecc-secp256r1-redc.asm
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your
+   option) any later version.
+
+   or
+
+ * the GNU General Public License as published by the Free
+   Software Foundation; either version 2 of the License, or (at your
+   option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+   .file "ecc-secp384r1-modp.asm"
+
+C Register usage:
+
+define(`SP', `r1')
+
+define(`RP', `r4')
+define(`XP', `r5')
+
+define(`D5', `r6')
+define(`T0', `r7')
+define(`T1', `r8')
+define(`T2', `r9')
+define(`T3', `r10')
+define(`T4', `r11')
+define(`T5', `r12')
+define(`H0', `r14')
+define(`H1', `r15')
+define(`H2', `r16')
+define(`H3', `r17')
+define(`H4', `r18')
+define(`H5', `r19')
+define(`C2', `r3')
+define(`C0', H5)   C Overlap
+define(`TMP', XP)  C Overlap
+
+
+   C void ecc_secp384r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, 
mp_limb_t *xp)
+   .text
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_ecc_secp384r1_modp)
+
+   std r14, -48(SP)
+   std r15, -40(SP)
+   std r16, -32(SP)
+   std r17, -24(SP)
+   std r18, -16(SP)
+   std r19, -8(SP)
+
+   C First get top 2 limbs, which need folding twice.
+   C B^10 = B^6 + B^4 + 2^32 (B-1)B^4.
+   C We handle the terms as follows:
+   C
+   C B^6: Folded immediately.
+   C
+   C B^4: Delayed, added in in the next folding.
+   C
+   C 2^32(B-1) B^4: Low half limb delayed until the next
+   C folding. Top 1.5 limbs subtracted and shifted now, resulting
+   C in 2.5 limbs. The low limb saved in D5, high 1.5 limbs added
+   C in.
+
+   ld  H4, 80(XP)
+   ld  H5, 88(XP)
+   C Shift right 32 bits, into H1, H0
+   srdiH1, H5, 32
+   sldiD5, H5, 32
+   srdiH0, H4, 32
+   or  H0, H0, D5
+
+   C   H1 H0
+   C   -  H1 H0
+   C   
+   C   H1 H0 D5
+   subfic  D5, H0, 0
+   subfe   H0, H1, H0
+   addme   H1, H1
+
+   li  C2, 0
+   addcH0, H4, H0
+   addeH1, H5, H1
+   addze   C2, C2
+
+   C Add in to high part
+   ld  T1, 48(XP)
+   ld  T2, 56(XP)
+   addcH0, T1, H0
+   addeH1, T2, H1
+   addze   C2, C2  C Do C2 later
+
+   C +1 term
+   ld  T0, 0(XP)
+   ld  T1, 8(XP)
+   ld  T2, 16(XP)
+   ld  T3, 24(XP)
+   ld  T4, 32(XP)
+   ld  T5, 40(XP)
+   ld  H2, 64(XP)
+   ld  H3, 72(XP)
+   addcT0, H0, T0
+   addeT1, H1, T1
+   addeT2, H2, T2
+   addeT3, H3, T3
+   addeT4, H4, T4
+   addeT5, H5, T5
+   li  C0, 0
+   addze   C0, C0
+
+   C +B^2 term
+   addcT2, H0, T2
+   addeT3, H1, T3
+   addeT4, H2, T4
+   addeT5, H3, T5
+   addze   C0, C0
+
+   C Shift left, including low half of H4
+   sldiH4, H4, 32
+   srdiTMP, H3, 32
+   or  H4, TMP, H4
+
+   sldiH3, H3, 32
+   srdiTMP, H2, 32
+   or  H3, TMP, H3
+
+   sldiH2, H2, 32
+   srdiTMP, H1, 32
+   or  H2, TMP, H2
+
+   sldiH1, H1, 32
+   srdiTMP, H0, 32
+   or  H1, TMP, H1
+
+   sldiH0, H0, 32
+
+   C   H4 H3 H2 H1 H0  0
+   C  -   H4 H3 H2 H1 H0
+   C  ---
+   C   H4 H3 H2 H1 H0 TMP
+
+   subfic  TMP, H0, 0
+   subfe   H0, H1, H0
+   subfe   H1, H2, H1
+   subfe   H2, H3, H2
+   subfe   H3, H4, H3
+   addme   H4, H4
+
+   addcT0, TMP, T0
+   addeT1, H0, T1
+ 

Re: [PATCH 2/7] ecc: Add powerpc64 assembly for ecc_224_modp

2022-01-20 Thread Amitay Isaacs
Updated version using actual register names for storing and restoring
from stack.

Amitay.
-- 

The manager administers, the leader innovates. The manager maintains,
the
leader develops. The manager relies on systems, the leader relies on
people.
The manager counts on controls, the leader counts on trust. The manager
does
things right, the leader does the right thing. - Fortune Magazine
C powerpc64/ecc-secp224r1-modp.asm

ifelse(`
   Copyright (C) 2021 Amitay Isaacs, IBM Corporation

   Based on x86_64/ecc-secp224r1-modp.asm

   This file is part of GNU Nettle.

   GNU Nettle is free software: you can redistribute it and/or
   modify it under the terms of either:

 * the GNU Lesser General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   or

 * the GNU General Public License as published by the Free
   Software Foundation; either version 2 of the License, or (at your
   option) any later version.

   or both in parallel, as here.

   GNU Nettle is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received copies of the GNU General Public License and
   the GNU Lesser General Public License along with this program.  If
   not, see http://www.gnu.org/licenses/.
')

.file "ecc-secp224r1-modp.asm"

define(`SP', `r1')

define(`RP', `r4')      C Result pointer, 4 limbs are stored
define(`XP', `r5')      C Input pointer, 8 limbs are loaded

define(`T0', `r6')
define(`T1', `r7')
define(`H0', `r8')
define(`H1', `r9')
define(`H2', `r10')
define(`F0', `r11')
define(`F1', `r12')
define(`F2', `r14')
define(`T2', `r3')      C Reuses the register of the unused modulo argument

C void ecc_secp224r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, mp_limb_t *xp)
C
C Loads the 8 limbs xp[0] .. xp[7] and stores 4 limbs at rp. The high
C limbs are folded into the low ones in three rounds; each round
C shifts the high part left by 32 bits into (F2, F1, F0), subtracts
C that from the lower limbs and adds the unshifted high part further
C up. The m argument is not read. r14 is the only callee-saved
C register used, it is kept below SP without moving SP.
.text
define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_ecc_secp224r1_modp)
    std     r14, -8(SP)     C Save r14 (F2)

    ld      H0, 48(XP)      C H0 = xp[6]
    ld      H1, 56(XP)      C H1 = xp[7]
    C set (F2, F1, F0) <-- (H1, H0) << 32
    sldi    F0, H0, 32
    srdi    F1, H0, 32
    sldi    T0, H1, 32
    srdi    F2, H1, 32
    or      F1, T0, F1

    li      H2, 0
    ld      T0, 16(XP)      C T0 = xp[2]
    ld      T1, 24(XP)      C T1 = xp[3]
    subfc   T0, F0, T0      C T0 = T0 - F0, CA clear on borrow
    subfe   T1, F1, T1      C T1 = T1 - F1 - borrow
    subfe   H0, F2, H0      C H0 = H0 - F2 - borrow
    addme   H1, H1          C H1 = H1 - borrow

    ld      T2, 32(XP)
    addc    H0, T2, H0      C H0 += xp[4]
    ld      T2, 40(XP)
    adde    H1, T2, H1      C H1 += xp[5] + carry
    addze   H2, H2          C H2 = carry out

    C Set (F2, F1, F0) <-- (H2, H1, H0) << 32
    C Interleaved with (H2, H1, H0) += (T1, T0). The shifts do not
    C touch CA, and each H register is shifted before it is updated.
    sldi    F0, H0, 32
    srdi    F1, H0, 32
    addc    H0, T0, H0
    sldi    T0, H1, 32      C T0 is free again after the addc
    srdi    F2, H1, 32
    adde    H1, T1, H1
    sldi    T1, H2, 32      C T1 is free again after the adde
    addze   H2, H2
    or      F1, T0, F1
    or      F2, T1, F2

    ld      T0, 0(XP)       C T0 = xp[0]
    ld      T1, 8(XP)       C T1 = xp[1]
    subfc   T0, F0, T0      C (H2, H1, H0, T1, T0) -= (F2, F1, F0)
    subfe   T1, F1, T1
    subfe   H0, F2, H0
    addme   H1, H1
    addme   H2, H2

    C Split off everything above the low 32 bits of H1
    srdi    F0, H1, 32
    sldi    F1, H2, 32
    or      F0, F1, F0      C F0 = high half of H1 | low half of H2 << 32
    clrrdi  F1, H1, 32      C F1 = H1 with the low 32 bits cleared
    mr      F2, H2
    clrldi  H1, H1, 32      C H1 keeps only its low 32 bits

    subfc   T0, F0, T0      C T0 -= F0
    addme   F1, F1          C Borrow is taken out of (F2, F1)
    addme   F2, F2
    addc    T1, F1, T1      C (H1, H0, T1) += (F2, F1)
    adde    H0, F2, H0
    addze   H1, H1

    std     T0, 0(RP)
    std     T1, 8(RP)
    std     H0, 16(RP)
    std     H1, 24(RP)

    ld      r14, -8(SP)     C Restore r14

    blr
EPILOGUE(_nettle_ecc_secp224r1_modp)
___
nettle-bugs mailing list
nettle-bugs@lists.lysator.liu.se
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs


Re: [PATCH 4/7] ecc: Add powerpc64 assembly for ecc_384_modp

2022-01-20 Thread Amitay Isaacs
On Tue, 2022-01-04 at 21:28 +0100, Niels Möller wrote:
> 
> > +define(`FUNC_ALIGN', `5')
> > +PROLOGUE(_nettle_ecc_secp384r1_modp)
> > +
> > +   std H0, -48(SP)
> > +   std H1, -40(SP)
> > +   std H2, -32(SP)
> > +   std H3, -24(SP)
> > +   std H4, -16(SP)
> > +   std H5, -8(SP)
> 
> I find it clearer to use register names rather than the m4 defines
> for
> save and restore of callee-save registers.

Here's the modified code which uses the actual registers when saving
and restoring from stack.

Amitay.
-- 

Before marriage, a man yearns for the woman he loves. After marriage,
the
'Y' becomes silent.
C powerpc64/ecc-secp384r1-modp.asm

ifelse(`
   Copyright (C) 2021 Martin Schwenke, Amitay Isaacs & Alastair D´Silva, IBM 
Corporation

   Based on x86_64/ecc-secp256r1-redc.asm

   This file is part of GNU Nettle.

   GNU Nettle is free software: you can redistribute it and/or
   modify it under the terms of either:

 * the GNU Lesser General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   or

 * the GNU General Public License as published by the Free
   Software Foundation; either version 2 of the License, or (at your
   option) any later version.

   or both in parallel, as here.

   GNU Nettle is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received copies of the GNU General Public License and
   the GNU Lesser General Public License along with this program.  If
   not, see http://www.gnu.org/licenses/.
')

.file "ecc-secp384r1-modp.asm"

C Register usage:

define(`SP', `r1')

define(`RP', `r4')
define(`XP', `r5')

define(`D5', `r6')
define(`T0', `r7')
define(`T1', `r8')
define(`T2', `r9')
define(`T3', `r10')
define(`T4', `r11')
define(`T5', `r12')
define(`H0', `r14')
define(`H1', `r15')
define(`H2', `r16')
define(`H3', `r17')
define(`H4', `r18')
define(`H5', `r19')
define(`C2', `r3')
define(`C0', H5)C Overlap
define(`TMP', XP)   C Overlap


C void ecc_secp384r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, 
mp_limb_t *xp)
.text
define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_ecc_secp384r1_modp)

std r14, -48(SP)
std r15, -40(SP)
std r16, -32(SP)
std r17, -24(SP)
std r18, -16(SP)
std r19, -8(SP)

C First get top 2 limbs, which need folding twice.
C B^10 = B^6 + B^4 + 2^32 (B-1)B^4.
C We handle the terms as follows:
C
C B^6: Folded immediately.
C
C B^4: Delayed, added in in the next folding.
C
C 2^32(B-1) B^4: Low half limb delayed until the next
C folding. Top 1.5 limbs subtracted and shifted now, resulting
C in 2.5 limbs. The low limb saved in D5, high 1.5 limbs added
C in.

ld  H4, 80(XP)
ld  H5, 88(XP)
C Shift right 32 bits, into H1, H0
srdiH1, H5, 32
sldiD5, H5, 32
srdiH0, H4, 32
or  H0, H0, D5

C   H1 H0
C   -  H1 H0
C   
C   H1 H0 D5
subfic  D5, H0, 0
subfe   H0, H1, H0
addme   H1, H1

li  C2, 0
addcH0, H4, H0
addeH1, H5, H1
addze   C2, C2

C Add in to high part
ld  T1, 48(XP)
ld  T2, 56(XP)
addcH0, T1, H0
addeH1, T2, H1
addze   C2, C2  C Do C2 later

C +1 term
ld  T0, 0(XP)
ld  T1, 8(XP)
ld  T2, 16(XP)
ld  T3, 24(XP)
ld  T4, 32(XP)
ld  T5, 40(XP)
ld  H2, 64(XP)
ld  H3, 72(XP)
addcT0, H0, T0
addeT1, H1, T1
addeT2, H2, T2
addeT3, H3, T3
addeT4, H4, T4
addeT5, H5, T5
li  C0, 0
addze   C0, C0

C +B^2 term
addcT2, H0, T2
addeT3, H1, T3
addeT4, H2, T4
addeT5, H3, T5
addze   C0, C0

C Shift left, including low half of H4
sldiH4, H4, 32
srdiTMP, H3, 32
or  H4, TMP, H4

sldiH3, H3, 32
srdiTMP, H2, 32
or  H3, TMP, H3

sldiH2, H2, 32
srdiTMP, H1, 32
or  H2, TMP, H2

sldiH1, H1, 32
srdiTMP, H0, 32
or  H1, TMP, H1

sldiH0, H0, 32

C   H4 H3 H2 H1 H0  0
C  -   H4 H3 H2 H1 H0
C  ---
C   H4 H3 H2 H1 H0 TMP

subfic  TMP, H0, 0
subfe   H0, H1, H0
subfe   H1, H2, H1
subfe   H2, H3, H2
subfe  

Re: [PATCH 1/7] ecc: Add powerpc64 assembly for ecc_192_modp

2022-01-20 Thread Amitay Isaacs
Here's the updated code for P192 curve after simplifying C2 folding.

Amitay.
-- 

Retirement: When you quit working just before your heart does.
Retirement: When you quit working just before your heart does.
C powerpc64/ecc-secp192r1-modp.asm

ifelse(`
   Copyright (C) 2021 Amitay Isaacs, IBM Corporation

   This file is part of GNU Nettle.

   GNU Nettle is free software: you can redistribute it and/or
   modify it under the terms of either:

 * the GNU Lesser General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   or

 * the GNU General Public License as published by the Free
   Software Foundation; either version 2 of the License, or (at your
   option) any later version.

   or both in parallel, as here.

   GNU Nettle is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received copies of the GNU General Public License and
   the GNU Lesser General Public License along with this program.  If
   not, see http://www.gnu.org/licenses/.
')

.file "ecc-secp192r1-modp.asm"

C Register aliases used by _nettle_ecc_secp192r1_modp.
C NOTE(review): r3 is not referenced anywhere in this file; presumably it
C carries the unused ecc_modulo pointer argument -- confirm against the ABI.
define(`RP', `r4') C Destination pointer: three limbs are stored through it
define(`XP', `r5') C Source pointer: six limbs are loaded through it

define(`T0', `r6') C Accumulator for result limb 0
define(`T1', `r7') C Accumulator for result limb 1
define(`T2', `r8') C Accumulator for result limb 2
define(`T3', `r9') C Scratch: the high input limb currently being folded in
define(`C1', `r10') C Sum of the carries out of T2 while folding the high limbs
define(`C2', `r11') C Carry out of T2 when C1 itself is folded back in

C void ecc_secp192r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, mp_limb_t *xp)
C
C Folds the six 64-bit limbs x0..x5 at XP down to three limbs stored at RP.
C With B = 2^64, the folding below uses B^3 = B + 1 (mod p), which holds for
C the secp192r1 prime p = 2^192 - 2^64 - 1.  Consequently
C
C   B^3 x3 = (B + 1) x3
C   B^4 x4 = (B^2 + B) x4
C   B^5 x5 = (B^2 + B + 1) x5
C
C so x3 is added to limbs 0 and 1, x4 to limbs 1 and 2, and x5 to all three.
C Carries out of limb 2 have weight B^3 and are folded back the same way.
C
C The m argument is never read.  The output fits in three limbs; no final
C comparison against p is performed here.  Only the aliased scratch
C registers T0-T3, C1, C2 are written, and no stack is used.
        .text
define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_ecc_secp192r1_modp)
        C Low half of the input: T2:T1:T0 = x2:x1:x0
        ld      T0, 0(XP)
        ld      T1, 8(XP)
        ld      T2, 16(XP)

        li      C1, 0
        li      C2, 0

        C + (B + 1) x3
        ld      T3, 24(XP)
        addc    T0, T3, T0
        adde    T1, T3, T1
        addze   T2, T2                  C propagate carry into limb 2
        addze   C1, C1                  C carry out of limb 2 (weight B^3)

        C + (B^2 + B) x4
        ld      T3, 32(XP)
        addc    T1, T3, T1
        adde    T2, T3, T2
        addze   C1, C1

        C + (B^2 + B + 1) x5
        ld      T3, 40(XP)
        addc    T0, T3, T0
        adde    T1, T3, T1
        adde    T2, T3, T2
        addze   C1, C1

        C Fold the accumulated carries: + (B + 1) C1, with C1 <= 3 since it
        C received at most one carry from each of the three passes above.
        addc    T0, C1, T0
        adde    T1, C1, T1
        addze   T2, T2
        addze   C2, C2                  C C2 is 0 or 1

        C Fold the last carry: + (B + 1) C2.  If C2 is 1 the three-limb value
        C wrapped around and is therefore small, so this addition cannot
        C carry out of T2 again and no further pass is needed.
        addc    T0, C2, T0
        adde    T1, C2, T1
        addze   T2, T2

        std     T0, 0(RP)
        std     T1, 8(RP)
        std     T2, 16(RP)

        blr
EPILOGUE(_nettle_ecc_secp192r1_modp)
___
nettle-bugs mailing list
nettle-bugs@lists.lysator.liu.se
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs


Re: [Arm64, S390x] Optimize Chacha20

2022-01-20 Thread Maamoun TK
On Thu, Jan 20, 2022 at 11:08 PM Maamoun TK 
wrote:

> On Thu, Jan 20, 2022 at 10:32 PM Niels Möller 
> wrote:
>
>> Maamoun TK  writes:
>>
>> > As far as I understand, SIMD is called Advanced SIMD on AArch64 and it's
>> > standard for this architecture. simd is enabled by default in GCC but it
>> > can be disabled with nosimd option as I can see in here
>> > https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html which is why I
>> made
>> > a specific config option for it.
>>
>> If it's present on all known aarch64 systems (and HWCAP_ASIMD flag
>> always set), I think we can keep things simpler and use the code
>> unconditionally, with no extra subdir, no fat build function pointers or
>> configure flag.
>>
>
> Ok, I'll commit the changes with vanilla assembly files.
>

Done! The MR is updated
https://git.lysator.liu.se/nettle/nettle/-/merge_requests/37

regards,
Mamone


>
>
>> I've pushed the merge button for the s390x merge request.
>>
>
> Nice! I've made various tests on each core function so merging the changes
> is gonna be ok.
>
> In another topic, I'm making experiments on your poly1305 optimizing tips
> and I'll get back to you once I'm up to something.
>
> regards,
> Mamone
>
> Regards,
>> /Niels
>>
>> --
>> Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677.
>> Internet email is subject to wholesale government surveillance.
>>
>
___
nettle-bugs mailing list
nettle-bugs@lists.lysator.liu.se
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs


Re: [Arm64, S390x] Optimize Chacha20

2022-01-20 Thread Maamoun TK
On Thu, Jan 20, 2022 at 10:32 PM Niels Möller  wrote:

> Maamoun TK  writes:
>
> > As far as I understand, SIMD is called Advanced SIMD on AArch64 and it's
> > standard for this architecture. simd is enabled by default in GCC but it
> > can be disabled with nosimd option as I can see in here
> > https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html which is why I
> made
> > a specific config option for it.
>
> If it's present on all known aarch64 systems (and HWCAP_ASIMD flag
> always set), I think we can keep things simpler and use the code
> unconditionally, with no extra subdir, no fat build function pointers or
> configure flag.
>

Ok, I'll commit the changes with vanilla assembly files.


> I've pushed the merge button for the s390x merge request.
>

Nice! I've made various tests on each core function so merging the changes
is gonna be ok.

In another topic, I'm making experiments on your poly1305 optimizing tips
and I'll get back to you once I'm up to something.

regards,
Mamone

Regards,
> /Niels
>
> --
> Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677.
> Internet email is subject to wholesale government surveillance.
>
___
nettle-bugs mailing list
nettle-bugs@lists.lysator.liu.se
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs


Re: [Arm64, S390x] Optimize Chacha20

2022-01-20 Thread Niels Möller
Maamoun TK  writes:

> As far as I understand, SIMD is called Advanced SIMD on AArch64 and it's
> standard for this architecture. simd is enabled by default in GCC but it
> can be disabled with nosimd option as I can see in here
> https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html which is why I made
> a specific config option for it.

If it's present on all known aarch64 systems (and HWCAP_ASIMD flag
always set), I think we can keep things simpler and use the code
unconditionally, with no extra subdir, no fat build function pointers or
configure flag.

I've pushed the merge button for the s390x merge request.

Regards,
/Niels

-- 
Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677.
Internet email is subject to wholesale government surveillance.
___
nettle-bugs mailing list
nettle-bugs@lists.lysator.liu.se
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs


Re: [Arm64, PowerPC64, S390x] Optimize Poly1305

2022-01-20 Thread Niels Möller
Maamoun TK  writes:

> Wider multiplication would improve the performance for 64-bit general
> registers but as the case for the current SIMD implementation, the radix
> 2^26 fits well there.

If multiply throughput is the bottleneck, it makes sense to do as much
work as possible per multiply. So I don't think I understand the
benefits of interleaving, can you explain?

Let's consider the 64-bit case, since that's less writing. B = 2^64 as
usual. Then the state is

  H = h_2 B^2 + h_1 B + h_0 

(with h_2 rather small, depending on how far we normalize for each
block, let's assume at most 3 bits, or maybe even h_2 <= 4).

  R = r_1 B + r_0

By the spec, high 4 bits of both r_0 and r_1, and low 2 bits of r_1 are
zero, which makes multiplication R H (mod p) particularly nice.

We get 

  R H = r_0 h_0 + B (r_1 h_0 + r_0 h_1) 
  + B^2 (r_1 h_1 + r_0 h_2) + B^3 r_1 h_2

But then B^2 = 5/4 (mod p), and hence B^2 r_1 = 5 r_1 / 4 (mod p), where
the "/ 4" is just shifting out the two low zero bits. So let r_1' = 5
r_1 / 4,

  R H = r_0 h_0 + r_1' h_1 + B (r_1 h_0 + r_0 h_1 + r_1' h_2 + B r_0 h_2)

These are 4 long multiplications (64x64 --> 128) and two short, 64x64
--> 64, for the products involving h_2. (The 32-bit version would be 16 long
multiplications and 4 short).

From the zero high bits, we also get bounds on these terms,

 f_0 = r_0 h_0 + r_1' h_1 < 2^124 + 5*2^122 = 9*2^122

 f_1 = r_1 h_0 + r_0 h_1 + r_1' h_2 + B r_0 h_2
< 2^125 + 5*2^61 + 2^127

So these two chains can be added together as 128-bit quantities with no
overflow, in any order; there's plenty of parallelism. E.g., power
vmsumudm might be useful.

For final folding, we need to split f_1 into top 62 and low 66 bits,
multiply low part by 5 (fits in 64 bits), and add into f_0, which still
fits in 128 bits.

And then take the top 64 bits of f_0 and add into f_1 (result <= 2^66
bits).

The current C implementation uses radix 26, and 25 multiplies (32x32
--> 64) per block. And quite a lot of shifts. A radix 32 variant
analogous to the above would need 16 long multiplies and 4 short. I'd
expect that to be faster on most machines, but I'd have to try that out.


In contrast, trying to use a similar scheme for multiplying by (r^2 (mod
p)), as needed for an interleaved version, seems more expensive. There
are several contributions to the cost:

* First, the accumulation of products by power of B needs to take into
  account carry, as result can exceed 2^128, so one would need something
  closer to general schoolbook multiplication.

* Second, since r^2 (mod p) may exceed 2^128, we need three words rather
  than two, so three more short multiplications to add in.

* Third, we can't pre-divide key words by 4, since low bits are no longer
  guaranteed to be zero. This gives more expensive reduction, with more
  multiplies by 5.

The first two points make a smaller radix more attractive; if we need
three words for both factors, we can distribute the bits to ensure some
of the most significant bits are zero. 

> Since the loop of block iteration is moved to inside the assembly
> implementation, computing one multiple of key at the function prologue
> should be ok.

For large messages, that's fine, but may add a significant cost for
messages of just two blocks.

Regards,
/Niels

-- 
Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677.
Internet email is subject to wholesale government surveillance.
___
nettle-bugs mailing list
nettle-bugs@lists.lysator.liu.se
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs