x86 sha_ni

2018-02-07 Thread Niels Möller
Hi,

I've been trying out the sha_ni instructions available on some newer
x86_64 processors.

The replacement for sha1-compress.asm below seems to run at roughly 2
cycles/byte when I benchmark it on an "AMD Ryzen 7 1700X" CPU in the GCC
compile farm. That is still slightly slower than OpenSSL; to squeeze out
a few more cycles, it might help to change the sha1_compress interface to
let it process more than one 64-byte block at a time.

I hope to be able to wire it up via fat-x86_64.c reasonably soon. In the
meantime, if anyone wants to try it out, just change the
sha1-compress.asm symlink to point to this file.

Regards,
/Niels

-8<

C x86_64/sha_ni/sha1-compress.asm

ifelse(<
   Copyright (C) 2018 Niels Möller

   This file is part of GNU Nettle.

   GNU Nettle is free software: you can redistribute it and/or
   modify it under the terms of either:

 * the GNU Lesser General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   or

 * the GNU General Public License as published by the Free
   Software Foundation; either version 2 of the License, or (at your
   option) any later version.

   or both in parallel, as here.

   GNU Nettle is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received copies of the GNU General Public License and
   the GNU Lesser General Public License along with this program.  If
   not, see http://www.gnu.org/licenses/.
>)

C Register usage.

C Arguments
C STATE is the first argument (uint32_t *state), INPUT the second
C (uint8_t *input).
define(<STATE>,<%rdi>)dnl
define(<INPUT>,<%rsi>)dnl

C MSG0-MSG3 hold the four 16-byte quarters of the input block, loaded
C from offsets 0, 16, 32 and 48 of INPUT, and are then reused in place
C for the message schedule.
define(<MSG0>,<%xmm0>)
define(<MSG1>,<%xmm1>)
define(<MSG2>,<%xmm2>)
define(<MSG3>,<%xmm3>)
C ABCD holds the first four state words, loaded from (STATE).
define(<ABCD>,<%xmm4>)
C E0 and E1 alternate as the E operand of successive four-round groups.
define(<E0>,<%xmm5>)
define(<E1>,<%xmm6>)
C Copies of the incoming state, added back after the last round.
define(<ABCD_ORIG>, <%xmm7>)
define(<E_ORIG>, <%xmm8>)
C Byte-reversal mask for pshufb, loaded from .Lswap_mask.
define(<SWAP_MASK>,<%xmm9>)
C QROUND(M0, M1, M2, M3, E0, E1, TYPE)
C
C Emits one group of four SHA-1 rounds together with the message
C schedule updates that accompany it.
C
C   M0      message register consumed by this group
C   M1      message register finalized with sha1msg2, using M0
C   M2      message register that gets M0 xor-ed into it
C   M3      message register that gets a sha1msg1 step with M0
C   E0      E operand for this group; updated by sha1nexte using M0
C   E1      receives a copy of ABCD taken before the rounds, to serve
C           as the E operand of the following group
C   TYPE    immediate (0-3) passed to sha1rnds4
C
C The dollar sign in front of the immediate is written as a quoted
C string so that m4 emits it literally, followed by the TYPE argument.
define(<QROUND>, <
	sha1nexte $1, $5
	movdqa	ABCD, $6
	sha1msg2 $1, $2
	sha1rnds4 <$>$7, $5, ABCD
	sha1msg1 $1, $4
	pxor	$1, $3
>)

.file "sha1-compress.asm"

C _nettle_sha1_compress(uint32_t *state, uint8_t *input)
C
C Processes one 64-byte input block with the sha_ni instructions and
C updates the state in place.
C
C   STATE   20 bytes are read and written: 16 bytes at offset 0 (ABCD)
C           and 4 bytes at offset 16 (E). Accessed with unaligned moves.
C   INPUT   64 bytes are read, as four unaligned 16-byte loads.
C
C Uses the vector registers named in the register usage section and no
C general purpose registers besides the two arguments.

	.text
	ALIGN(16)
C pshufb control bytes that reverse the order of all 16 bytes in a
C register. Loaded with movdqa, hence the 16-byte alignment above.
.Lswap_mask:
	.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
PROLOGUE(_nettle_sha1_compress)
	C save all registers that need to be saved
	C NOTE(review): W64_ENTRY/W64_EXIT are defined outside this file;
	C presumably (2, 10) means 2 arguments and 10 xmm registers - confirm.
	W64_ENTRY(2, 10)
	movups	(STATE), ABCD
	movd	16(STATE), E0
	movups	(INPUT), MSG0
	movdqa	.Lswap_mask(%rip), SWAP_MASK
	C 0x1b reverses the order of the four 32-bit words, which also
	C moves E from the lowest to the highest word of E0.
	pshufd	$0x1b, ABCD, ABCD
	pshufd	$0x1b, E0, E0
	C Keep the incoming state for the final addition.
	movdqa	ABCD, ABCD_ORIG
	movdqa	E0, E_ORIG
	pshufb	SWAP_MASK, MSG0

	C First group: the message words are added to E directly.
	paddd	MSG0, E0
	movdqa	ABCD, E1
	sha1rnds4 $0, E0, ABCD	C Rounds 0-3

	movups	16(INPUT), MSG1
	pshufb	SWAP_MASK, MSG1

	sha1nexte MSG1, E1
	movdqa	ABCD, E0
	sha1rnds4 $0, E1, ABCD	C Rounds 4-7
	sha1msg1 MSG1, MSG0

	movups	32(INPUT), MSG2
	pshufb	SWAP_MASK, MSG2

	sha1nexte MSG2, E0
	movdqa	ABCD, E1
	sha1rnds4 $0, E0, ABCD	C Rounds 8-11
	sha1msg1 MSG2, MSG1
	pxor	MSG2, MSG0

	movups	48(INPUT), MSG3
	pshufb	SWAP_MASK, MSG3

	C From here on every group has the same shape; the message and E
	C registers rotate roles from one QROUND to the next.
	QROUND(MSG3, MSG0, MSG1, MSG2, E1, E0, 0)	C Rounds 12-15
	QROUND(MSG0, MSG1, MSG2, MSG3, E0, E1, 0)	C Rounds 16-19

	QROUND(MSG1, MSG2, MSG3, MSG0, E1, E0, 1)	C Rounds 20-23
	QROUND(MSG2, MSG3, MSG0, MSG1, E0, E1, 1)	C Rounds 24-27
	QROUND(MSG3, MSG0, MSG1, MSG2, E1, E0, 1)	C Rounds 28-31
	QROUND(MSG0, MSG1, MSG2, MSG3, E0, E1, 1)	C Rounds 32-35
	QROUND(MSG1, MSG2, MSG3, MSG0, E1, E0, 1)	C Rounds 36-39

	QROUND(MSG2, MSG3, MSG0, MSG1, E0, E1, 2)	C Rounds 40-43
	QROUND(MSG3, MSG0, MSG1, MSG2, E1, E0, 2)	C Rounds 44-47
	QROUND(MSG0, MSG1, MSG2, MSG3, E0, E1, 2)	C Rounds 48-51
	QROUND(MSG1, MSG2, MSG3, MSG0, E1, E0, 2)	C Rounds 52-55
	QROUND(MSG2, MSG3, MSG0, MSG1, E0, E1, 2)	C Rounds 56-59

	QROUND(MSG3, MSG0, MSG1, MSG2, E1, E0, 3)	C Rounds 60-63
	QROUND(MSG0, MSG1, MSG2, MSG3, E0, E1, 3)	C Rounds 64-67

	C The last three groups are written out by hand, dropping the
	C schedule steps whose results would no longer be used.
	sha1nexte MSG1, E1
	movdqa	ABCD, E0
	sha1msg2 MSG1, MSG2
	sha1rnds4 $3, E1, ABCD	C Rounds 68-71
	pxor	MSG1, MSG3

	sha1nexte MSG2, E0
	movdqa	ABCD, E1
	sha1msg2 MSG2, MSG3
	sha1rnds4 $3, E0, ABCD	C Rounds 72-75

	sha1nexte MSG3, E1
	movdqa	ABCD, E0
	sha1rnds4 $3, E1, ABCD	C Rounds 76-79

	C Add the saved input state to the result of the 80 rounds.
	sha1nexte E_ORIG, E0
	paddd	ABCD_ORIG, ABCD

	C Undo the word reversal done on entry and store the new state.
	pshufd	$0x1b, ABCD, ABCD
	movups	ABCD, (STATE)
	pshufd	$0x1b, E0, E0
	movd	E0, 16(STATE)

	W64_EXIT(2, 10)
	ret
EPILOGUE(_nettle_sha1_compress)

-- 
Niels Möller. PGP-encrypted email is preferred. 

easier version checks

2018-02-07 Thread Nikos Mavrogiannopoulos
What about extending the macros in version.h with a simple to use
combined version number?


From e96108cbb92a923e02349a0d3b672a9b2b94c8b9 Mon Sep 17 00:00:00 2001
From: Nikos Mavrogiannopoulos 
Date: Wed, 7 Feb 2018 11:29:07 +0100
Subject: [PATCH] version.h: introduce NETTLE_VERSION

That macro is easier to use in #if clauses. E.g., to check for
version 3.4 or later, do that with:
  #if NETTLE_VERSION >= 0x0304
---
 configure.ac | 1 +
 version.h.in | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/configure.ac b/configure.ac
index 41bf0998..d06f55c9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -21,6 +21,7 @@ MAJOR_VERSION=`echo $PACKAGE_VERSION | sed 's/^\([[^.]]*\)\..*/\1/'`
 MINOR_VERSION=`echo $PACKAGE_VERSION | sed 's/^[[^.]]*\.\([[0-9]]*\).*/\1/'`
 AC_SUBST([MAJOR_VERSION])
 AC_SUBST([MINOR_VERSION])
+AC_SUBST([NUMBER_VERSION], $(printf "0x%02x%02x" ${MAJOR_VERSION} ${MINOR_VERSION}))
 
 AC_CANONICAL_HOST
 
diff --git a/version.h.in b/version.h.in
index cf429f25..7f10b995 100644
--- a/version.h.in
+++ b/version.h.in
@@ -43,6 +43,8 @@ extern "C" {
 #define NETTLE_VERSION_MAJOR @MAJOR_VERSION@
 #define NETTLE_VERSION_MINOR @MINOR_VERSION@
 
+#define NETTLE_VERSION @NUMBER_VERSION@
+
 #define NETTLE_USE_MINI_GMP @NETTLE_USE_MINI_GMP@
 
 /* We need a preprocessor constant for GMP_NUMB_BITS, simply using
-- 
2.14.3

___
nettle-bugs mailing list
nettle-bugs@lists.lysator.liu.se
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs