Taken from https://github.com/floodyberry/chacha-opt (released by author as public-domain-or-MIT, so I guess ok to borrow).
On x86/sse2 and x86_64: 80 to 100% faster. Passes the regression tests on linux/debian/stretch x86 and x86_64; benchmarks were run with a patched nettle-3.4.1 (due to the ABI break in 3.5). *Not* tested on win{32,64} (important: the win64 ABI differs). chacha-opt also contains x86{,_64}-{ssse3,avx{,2},xop} optimized code, but I don't have hardware to test it (and there are differences in structure/argument layout that need to be corrected and tested). WIP, armv6 and arm/neon will follow a bit later.

P.S. After that I will probably take a look at poly1305 and likely try to borrow license-compatible ARM asm from somewhere (the current nettle code is painfully slow). gcrypt is somewhat faster than nettle and is LGPLv2.1+; CRYPTOGAMS definitely has the fastest crypto, but it is BSD-3-clause-or-GPLv2+, which is, AFAIK, LGPL-compatible, though I'm not sure whether that is acceptable for inclusion in nettle.

P.P.S. The previously posted arm neon gcm patch breaks x86_64 compilation; I will post a trivial fix later.
>From 7b7d54558fe4fbd20d722ee01d63b13961217416 Mon Sep 17 00:00:00 2001
From: "Yuriy M. Kaminskiy" <yum...@gmail.com>
Date: Mon, 11 Mar 2019 20:51:57 +0300
Subject: [PATCH 1/3] chacha: prepare for multiblock asm implementation

---
 chacha-crypt.c    | 89 +++++++++++++++++++++++++++++++++++++++++++------------
 chacha-internal.h |  4 +++
 configure.ac      |  3 ++
 3 files changed, 77 insertions(+), 19 deletions(-)

diff --git a/chacha-crypt.c b/chacha-crypt.c
index 63d799ce..03e50d97 100644
--- a/chacha-crypt.c
+++ b/chacha-crypt.c
@@ -54,34 +54,85 @@
 
 #define CHACHA_ROUNDS 20
 
+/* is the pointer aligned on a word boundary? */
+static int
+chacha_is_aligned(const void *p) {
+  return ((size_t)p & (sizeof(size_t) - 1)) == 0;
+}
+
+#ifdef HAVE_NATIVE_chacha_blocks
 void
-chacha_crypt(struct chacha_ctx *ctx,
-	     size_t length,
-	     uint8_t *c,
-	     const uint8_t *m)
+_nettle_chacha_blocks_c
+#else
+#undef _chacha_blocks
+static void
+_chacha_blocks
+#endif
+  (struct chacha_ctx *ctx, const uint8_t *m, uint8_t *c, size_t length, size_t rounds)
 {
-  if (!length)
-    return;
-
-  for (;;)
+  while (length >= CHACHA_BLOCK_SIZE)
     {
       uint32_t x[_CHACHA_STATE_LENGTH];
+      uint32_t *dst = m ? x : (uint32_t *)c;
+      _chacha_core (dst, ctx->state, rounds);
+      ctx->state[13] += (++ctx->state[12] == 0);
+      /* stopping at 2^70 length per nonce is user's responsibility */
 
-      _chacha_core (x, ctx->state, CHACHA_ROUNDS);
+      if (m)
+	{
+	  memxor3 (c, m, x, CHACHA_BLOCK_SIZE);
+	  m += CHACHA_BLOCK_SIZE;
+	}
+      c += CHACHA_BLOCK_SIZE;
+      length -= CHACHA_BLOCK_SIZE;
+    }
+  if (length)
+    {
+      uint32_t x[_CHACHA_STATE_LENGTH];
+      _chacha_core (x, ctx->state, rounds);
       ctx->state[13] += (++ctx->state[12] == 0);
-      /* stopping at 2^70 length per nonce is user's responsibility */
-
-      if (length <= CHACHA_BLOCK_SIZE)
-	{
+
+      if (m)
+	{
 	  memxor3 (c, m, x, length);
-	  return;
+	  m += length;
 	}
-      memxor3 (c, m, x, CHACHA_BLOCK_SIZE);
+      else
+	memcpy (c, x, length);
+    }
+}
 
-      length -= CHACHA_BLOCK_SIZE;
-      c += CHACHA_BLOCK_SIZE;
-      m += CHACHA_BLOCK_SIZE;
-    }
+void
+chacha_crypt(struct chacha_ctx *ctx,
+	     size_t length,
+	     uint8_t *c,
+	     const uint8_t *m)
+{
+  unsigned char buffer[16 * CHACHA_BLOCK_SIZE];
+  int in_aligned, out_aligned;
+  in_aligned = chacha_is_aligned(m);
+  out_aligned = chacha_is_aligned(c);
+  if (in_aligned && out_aligned)
+    {
+      return _chacha_blocks(ctx, m, c, length, CHACHA_ROUNDS);
+    }
+  while (length)
+    {
+      const size_t bytes = (length > sizeof(buffer)) ? sizeof(buffer) : length;
+      const unsigned char *src = m;
+      unsigned char *dst = (out_aligned) ? c : buffer;
+      if (!in_aligned)
+	{
+	  memcpy(buffer, m, bytes);
+	  src = buffer;
+	}
+      _chacha_blocks(ctx, src, dst, bytes, CHACHA_ROUNDS);
+      if (!out_aligned)
+	memcpy(c, buffer, bytes);
+      if (m) m += bytes;
+      c += bytes;
+      length -= bytes;
+    }
 }
diff --git a/chacha-internal.h b/chacha-internal.h
index 1bca8e74..2e2baea0 100644
--- a/chacha-internal.h
+++ b/chacha-internal.h
@@ -39,8 +39,12 @@
 #include "nettle-types.h"
 
 #define _chacha_core _nettle_chacha_core
+#define _chacha_blocks _nettle_chacha_blocks
 
 void
 _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds);
 
+void
+_chacha_blocks(struct chacha_ctx *ctx, const uint8_t *src, uint8_t *dst, size_t length, size_t rounds);
+
 #endif /* NETTLE_CHACHA_INTERNAL_H_INCLUDED */
diff --git a/configure.ac b/configure.ac
index 4a482bdd..21c932a5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -468,6 +468,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 
 # Assembler files which generate additional object files if they are used.
 asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \
   aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
+  chacha-blocks.asm chacha-blocks-2.asm \
   chacha-core-internal-2.asm \
   gcm-hash.asm gcm-hash-2.asm \
   salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
@@ -572,6 +573,8 @@ AH_VERBATIM([HAVE_NATIVE],
 [/* Define to 1 each of the following for which a native (ie. CPU specific)
    implementation of the corresponding routine exists.  */
 #undef HAVE_NATIVE_chacha_core
+#undef HAVE_NATIVE_chacha_blocks
+#undef HAVE_NATIVE_hchacha
 #undef HAVE_NATIVE_ecc_192_modp
 #undef HAVE_NATIVE_ecc_192_redc
 #undef HAVE_NATIVE_ecc_224_modp
-- 
2.11.0
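For anyone who wants to sanity-check the new dispatch logic: both the aligned fast path and the unaligned bounce-buffer path are reachable through the ordinary public API, so a quick self-test needs nothing beyond nettle/chacha.h. A minimal sketch (it only uses the existing chacha_set_key/chacha_set_nonce/chacha_crypt entry points; the +1 offset is just there to force the misaligned path):

#include <stdio.h>
#include <string.h>
#include <nettle/chacha.h>

int
main(void)
{
  static const uint8_t key[CHACHA_KEY_SIZE] = {0};
  static const uint8_t nonce[CHACHA_NONCE_SIZE] = {0};
  /* one extra byte so the source can be made deliberately misaligned */
  uint8_t src[1 + 4 * CHACHA_BLOCK_SIZE] = {0};
  uint8_t out_a[4 * CHACHA_BLOCK_SIZE], out_u[4 * CHACHA_BLOCK_SIZE];
  struct chacha_ctx ctx;

  /* aligned input and output: should go straight to _chacha_blocks */
  chacha_set_key (&ctx, key);
  chacha_set_nonce (&ctx, nonce);
  chacha_crypt (&ctx, sizeof(out_a), out_a, src);

  /* misaligned input: must produce the same keystream via the bounce buffer */
  chacha_set_key (&ctx, key);
  chacha_set_nonce (&ctx, nonce);
  chacha_crypt (&ctx, sizeof(out_u), out_u, src + 1);

  printf ("%s\n", memcmp (out_a, out_u, sizeof(out_a)) == 0 ? "ok" : "MISMATCH");
  return 0;
}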
>From 87ad780f609971c64644bc1271fb24483c1cee75 Mon Sep 17 00:00:00 2001
From: "Yuriy M. Kaminskiy" <yum...@gmail.com>
Date: Tue, 12 Mar 2019 12:22:53 +0300
Subject: [PATCH 2/3] chacha-blocks: add x86_64 sse2 implementation

Adapted from the public-domain implementation by Andrew Moon
<liquid...@gmail.com>, https://github.com/floodyberry/chacha-opt

Before (AMD K8 @ 2.5GHz):
 STREAM enc |      6.27 ns/B     152.1 MiB/s     15.67 c/B
After:
 STREAM enc |      3.02 ns/B     316.3 MiB/s      7.54 c/B
---
 x86_64/chacha-blocks.asm | 826 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 826 insertions(+)
 create mode 100644 x86_64/chacha-blocks.asm

diff --git a/x86_64/chacha-blocks.asm b/x86_64/chacha-blocks.asm
new file mode 100644
index 00000000..d8ade58a
--- /dev/null
+++ b/x86_64/chacha-blocks.asm
@@ -0,0 +1,826 @@
+C x86_64/chacha-blocks.asm
+
+ifelse(<
+   Copyright (C) 2014 Andrew Moon <liquid...@gmail.com>
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>) + +C This file was released as Public-Domain-or-MIT at +C https://github.com/floodyberry/chacha-opt + + .file "chacha-blocks.asm" + + C chacha_blocks(struct chacha_ctx *ctx, + C const uint8_t *src, uint8_t *dst, size_t length, + C size_t rounds) + + .text +chacha_blocks_sse2_local: +PROLOGUE(_nettle_chacha_blocks) +.cfi_startproc +W64_ENTRY(5, 16) +pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_rel_offset %rbx, 0 +pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_rel_offset %rbp, 0 +movq %rsp, %rbp +.cfi_def_cfa_register %rbp +andq $~63, %rsp +subq $512, %rsp +C movq $0x3320646e61707865, %rax +C movq $0x6b20657479622d32, %r8 +C movd %rax, %xmm8 +C movd %r8, %xmm14 +C punpcklqdq %xmm14, %xmm8 +C movdqu 0(%rdi), %xmm9 +C movdqu 16(%rdi), %xmm10 +C movdqu 32(%rdi), %xmm11 + movdqu 0(%rdi), %xmm8 + movdqu 16(%rdi), %xmm9 + movdqu 32(%rdi), %xmm10 + movdqu 48(%rdi), %xmm11 +C movq 48(%rdi), %rax + movq %r8, %rax +movq $1, %r9 +movdqa %xmm8, 0(%rsp) +movdqa %xmm9, 16(%rsp) +movdqa %xmm10, 32(%rsp) +movdqa %xmm11, 48(%rsp) +movq %rax, 64(%rsp) +cmpq $256, %rcx +jb .Lchacha_blocks_sse2_below256 +pshufd $0x00, %xmm8, %xmm0 +pshufd $0x55, %xmm8, %xmm1 +pshufd $0xaa, %xmm8, %xmm2 +pshufd $0xff, %xmm8, %xmm3 +movdqa %xmm0, 128(%rsp) +movdqa %xmm1, 144(%rsp) +movdqa %xmm2, 160(%rsp) +movdqa %xmm3, 176(%rsp) +pshufd $0x00, %xmm9, %xmm0 +pshufd $0x55, %xmm9, %xmm1 +pshufd $0xaa, %xmm9, %xmm2 +pshufd $0xff, %xmm9, %xmm3 +movdqa %xmm0, 192(%rsp) +movdqa %xmm1, 208(%rsp) +movdqa %xmm2, 224(%rsp) +movdqa %xmm3, 240(%rsp) +pshufd $0x00, %xmm10, %xmm0 +pshufd $0x55, %xmm10, %xmm1 +pshufd $0xaa, %xmm10, %xmm2 +pshufd $0xff, %xmm10, %xmm3 +movdqa %xmm0, 256(%rsp) +movdqa %xmm1, 272(%rsp) +movdqa %xmm2, 288(%rsp) +movdqa %xmm3, 304(%rsp) +pshufd $0xaa, %xmm11, %xmm0 +pshufd $0xff, %xmm11, %xmm1 +movdqa %xmm0, 352(%rsp) +movdqa %xmm1, 368(%rsp) +jmp .Lchacha_blocks_sse2_atleast256 +.p2align 6,,63 +.Lchacha_blocks_sse2_atleast256: +movq 48(%rsp), %rax +leaq 1(%rax), %r8 +leaq 2(%rax), %r9 +leaq 3(%rax), %r10 +leaq 4(%rax), %rbx +movl %eax, 320(%rsp) +movl %r8d, 4+320(%rsp) +movl %r9d, 8+320(%rsp) +movl %r10d, 12+320(%rsp) +shrq $32, %rax +shrq $32, %r8 +shrq $32, %r9 +shrq $32, %r10 +movl %eax, 336(%rsp) +movl %r8d, 4+336(%rsp) +movl %r9d, 8+336(%rsp) +movl %r10d, 12+336(%rsp) +movq %rbx, 48(%rsp) +movq 64(%rsp), %rax +movdqa 128(%rsp), %xmm0 +movdqa 144(%rsp), %xmm1 +movdqa 160(%rsp), %xmm2 +movdqa 176(%rsp), %xmm3 +movdqa 192(%rsp), %xmm4 +movdqa 208(%rsp), %xmm5 +movdqa 224(%rsp), %xmm6 +movdqa 240(%rsp), %xmm7 +movdqa 256(%rsp), %xmm8 +movdqa 272(%rsp), %xmm9 +movdqa 288(%rsp), %xmm10 +movdqa 304(%rsp), %xmm11 +movdqa 320(%rsp), %xmm12 +movdqa 336(%rsp), %xmm13 +movdqa 352(%rsp), %xmm14 +movdqa 368(%rsp), %xmm15 +.Lchacha_blocks_sse2_mainloop1: +paddd %xmm4, %xmm0 +paddd %xmm5, %xmm1 +pxor %xmm0, %xmm12 +pxor %xmm1, %xmm13 +paddd %xmm6, %xmm2 +paddd %xmm7, %xmm3 +movdqa %xmm6, 96(%rsp) +pxor %xmm2, %xmm14 +pxor %xmm3, %xmm15 +pshuflw $0xb1,%xmm12,%xmm12 +pshufhw $0xb1,%xmm12,%xmm12 +pshuflw $0xb1,%xmm13,%xmm13 +pshufhw $0xb1,%xmm13,%xmm13 +pshuflw $0xb1,%xmm14,%xmm14 +pshufhw $0xb1,%xmm14,%xmm14 +pshuflw $0xb1,%xmm15,%xmm15 +pshufhw $0xb1,%xmm15,%xmm15 +paddd %xmm12, %xmm8 +paddd %xmm13, %xmm9 +paddd %xmm14, %xmm10 +paddd %xmm15, %xmm11 +movdqa %xmm12, 112(%rsp) +pxor %xmm8, %xmm4 +pxor %xmm9, %xmm5 +movdqa 96(%rsp), %xmm6 +movdqa %xmm4, %xmm12 +pslld $ 12, %xmm4 +psrld $20, %xmm12 +pxor %xmm12, %xmm4 +movdqa %xmm5, %xmm12 +pslld $ 12, %xmm5 +psrld $20, %xmm12 +pxor %xmm12, %xmm5 +pxor %xmm10, %xmm6 +pxor %xmm11, 
%xmm7 +movdqa %xmm6, %xmm12 +pslld $ 12, %xmm6 +psrld $20, %xmm12 +pxor %xmm12, %xmm6 +movdqa %xmm7, %xmm12 +pslld $ 12, %xmm7 +psrld $20, %xmm12 +pxor %xmm12, %xmm7 +movdqa 112(%rsp), %xmm12 +paddd %xmm4, %xmm0 +paddd %xmm5, %xmm1 +pxor %xmm0, %xmm12 +pxor %xmm1, %xmm13 +paddd %xmm6, %xmm2 +paddd %xmm7, %xmm3 +movdqa %xmm6, 96(%rsp) +pxor %xmm2, %xmm14 +pxor %xmm3, %xmm15 +movdqa %xmm12, %xmm6 +pslld $ 8, %xmm12 +psrld $24, %xmm6 +pxor %xmm6, %xmm12 +movdqa %xmm13, %xmm6 +pslld $ 8, %xmm13 +psrld $24, %xmm6 +pxor %xmm6, %xmm13 +paddd %xmm12, %xmm8 +paddd %xmm13, %xmm9 +movdqa %xmm14, %xmm6 +pslld $ 8, %xmm14 +psrld $24, %xmm6 +pxor %xmm6, %xmm14 +movdqa %xmm15, %xmm6 +pslld $ 8, %xmm15 +psrld $24, %xmm6 +pxor %xmm6, %xmm15 +paddd %xmm14, %xmm10 +paddd %xmm15, %xmm11 +movdqa %xmm12, 112(%rsp) +pxor %xmm8, %xmm4 +pxor %xmm9, %xmm5 +movdqa 96(%rsp), %xmm6 +movdqa %xmm4, %xmm12 +pslld $ 7, %xmm4 +psrld $25, %xmm12 +pxor %xmm12, %xmm4 +movdqa %xmm5, %xmm12 +pslld $ 7, %xmm5 +psrld $25, %xmm12 +pxor %xmm12, %xmm5 +pxor %xmm10, %xmm6 +pxor %xmm11, %xmm7 +movdqa %xmm6, %xmm12 +pslld $ 7, %xmm6 +psrld $25, %xmm12 +pxor %xmm12, %xmm6 +movdqa %xmm7, %xmm12 +pslld $ 7, %xmm7 +psrld $25, %xmm12 +pxor %xmm12, %xmm7 +movdqa 112(%rsp), %xmm12 +paddd %xmm5, %xmm0 +paddd %xmm6, %xmm1 +pxor %xmm0, %xmm15 +pxor %xmm1, %xmm12 +paddd %xmm7, %xmm2 +paddd %xmm4, %xmm3 +movdqa %xmm7, 96(%rsp) +pxor %xmm2, %xmm13 +pxor %xmm3, %xmm14 +pshuflw $0xb1,%xmm15,%xmm15 +pshufhw $0xb1,%xmm15,%xmm15 +pshuflw $0xb1,%xmm12,%xmm12 +pshufhw $0xb1,%xmm12,%xmm12 +pshuflw $0xb1,%xmm13,%xmm13 +pshufhw $0xb1,%xmm13,%xmm13 +pshuflw $0xb1,%xmm14,%xmm14 +pshufhw $0xb1,%xmm14,%xmm14 +paddd %xmm15, %xmm10 +paddd %xmm12, %xmm11 +paddd %xmm13, %xmm8 +paddd %xmm14, %xmm9 +movdqa %xmm15, 112(%rsp) +pxor %xmm10, %xmm5 +pxor %xmm11, %xmm6 +movdqa 96(%rsp), %xmm7 +movdqa %xmm5, %xmm15 +pslld $ 12, %xmm5 +psrld $20, %xmm15 +pxor %xmm15, %xmm5 +movdqa %xmm6, %xmm15 +pslld $ 12, %xmm6 +psrld $20, %xmm15 +pxor %xmm15, %xmm6 +pxor %xmm8, %xmm7 +pxor %xmm9, %xmm4 +movdqa %xmm7, %xmm15 +pslld $ 12, %xmm7 +psrld $20, %xmm15 +pxor %xmm15, %xmm7 +movdqa %xmm4, %xmm15 +pslld $ 12, %xmm4 +psrld $20, %xmm15 +pxor %xmm15, %xmm4 +movdqa 112(%rsp), %xmm15 +paddd %xmm5, %xmm0 +paddd %xmm6, %xmm1 +pxor %xmm0, %xmm15 +pxor %xmm1, %xmm12 +paddd %xmm7, %xmm2 +paddd %xmm4, %xmm3 +movdqa %xmm7, 96(%rsp) +pxor %xmm2, %xmm13 +pxor %xmm3, %xmm14 +movdqa %xmm15, %xmm7 +pslld $ 8, %xmm15 +psrld $24, %xmm7 +pxor %xmm7, %xmm15 +movdqa %xmm12, %xmm7 +pslld $ 8, %xmm12 +psrld $24, %xmm7 +pxor %xmm7, %xmm12 +paddd %xmm15, %xmm10 +paddd %xmm12, %xmm11 +movdqa %xmm13, %xmm7 +pslld $ 8, %xmm13 +psrld $24, %xmm7 +pxor %xmm7, %xmm13 +movdqa %xmm14, %xmm7 +pslld $ 8, %xmm14 +psrld $24, %xmm7 +pxor %xmm7, %xmm14 +paddd %xmm13, %xmm8 +paddd %xmm14, %xmm9 +movdqa %xmm15, 112(%rsp) +pxor %xmm10, %xmm5 +pxor %xmm11, %xmm6 +movdqa 96(%rsp), %xmm7 +movdqa %xmm5, %xmm15 +pslld $ 7, %xmm5 +psrld $25, %xmm15 +pxor %xmm15, %xmm5 +movdqa %xmm6, %xmm15 +pslld $ 7, %xmm6 +psrld $25, %xmm15 +pxor %xmm15, %xmm6 +pxor %xmm8, %xmm7 +pxor %xmm9, %xmm4 +movdqa %xmm7, %xmm15 +pslld $ 7, %xmm7 +psrld $25, %xmm15 +pxor %xmm15, %xmm7 +movdqa %xmm4, %xmm15 +pslld $ 7, %xmm4 +psrld $25, %xmm15 +pxor %xmm15, %xmm4 +movdqa 112(%rsp), %xmm15 +subq $2, %rax +jnz .Lchacha_blocks_sse2_mainloop1 +paddd 128(%rsp), %xmm0 +paddd 144(%rsp), %xmm1 +paddd 160(%rsp), %xmm2 +paddd 176(%rsp), %xmm3 +paddd 192(%rsp), %xmm4 +paddd 208(%rsp), %xmm5 +paddd 224(%rsp), %xmm6 +paddd 240(%rsp), %xmm7 +paddd 256(%rsp), %xmm8 +paddd 
272(%rsp), %xmm9 +paddd 288(%rsp), %xmm10 +paddd 304(%rsp), %xmm11 +paddd 320(%rsp), %xmm12 +paddd 336(%rsp), %xmm13 +paddd 352(%rsp), %xmm14 +paddd 368(%rsp), %xmm15 +movdqa %xmm8, 384(%rsp) +movdqa %xmm9, 400(%rsp) +movdqa %xmm10, 416(%rsp) +movdqa %xmm11, 432(%rsp) +movdqa %xmm12, 448(%rsp) +movdqa %xmm13, 464(%rsp) +movdqa %xmm14, 480(%rsp) +movdqa %xmm15, 496(%rsp) +movdqa %xmm0, %xmm8 +movdqa %xmm2, %xmm9 +movdqa %xmm4, %xmm10 +movdqa %xmm6, %xmm11 +punpckhdq %xmm1, %xmm0 +punpckhdq %xmm3, %xmm2 +punpckhdq %xmm5, %xmm4 +punpckhdq %xmm7, %xmm6 +punpckldq %xmm1, %xmm8 +punpckldq %xmm3, %xmm9 +punpckldq %xmm5, %xmm10 +punpckldq %xmm7, %xmm11 +movdqa %xmm0, %xmm1 +movdqa %xmm4, %xmm3 +movdqa %xmm8, %xmm5 +movdqa %xmm10, %xmm7 +punpckhqdq %xmm2, %xmm0 +punpckhqdq %xmm6, %xmm4 +punpckhqdq %xmm9, %xmm8 +punpckhqdq %xmm11, %xmm10 +punpcklqdq %xmm2, %xmm1 +punpcklqdq %xmm6, %xmm3 +punpcklqdq %xmm9, %xmm5 +punpcklqdq %xmm11, %xmm7 +andq %rsi, %rsi +jz .Lchacha_blocks_sse2_noinput1 +movdqu 0(%rsi), %xmm2 +movdqu 16(%rsi), %xmm6 +movdqu 64(%rsi), %xmm9 +movdqu 80(%rsi), %xmm11 +movdqu 128(%rsi), %xmm12 +movdqu 144(%rsi), %xmm13 +movdqu 192(%rsi), %xmm14 +movdqu 208(%rsi), %xmm15 +pxor %xmm2, %xmm5 +pxor %xmm6, %xmm7 +pxor %xmm9, %xmm8 +pxor %xmm11, %xmm10 +pxor %xmm12, %xmm1 +pxor %xmm13, %xmm3 +pxor %xmm14, %xmm0 +pxor %xmm15, %xmm4 +movdqu %xmm5, 0(%rdx) +movdqu %xmm7, 16(%rdx) +movdqu %xmm8, 64(%rdx) +movdqu %xmm10, 80(%rdx) +movdqu %xmm1, 128(%rdx) +movdqu %xmm3, 144(%rdx) +movdqu %xmm0, 192(%rdx) +movdqu %xmm4, 208(%rdx) +movdqa 384(%rsp), %xmm0 +movdqa 400(%rsp), %xmm1 +movdqa 416(%rsp), %xmm2 +movdqa 432(%rsp), %xmm3 +movdqa 448(%rsp), %xmm4 +movdqa 464(%rsp), %xmm5 +movdqa 480(%rsp), %xmm6 +movdqa 496(%rsp), %xmm7 +movdqa %xmm0, %xmm8 +movdqa %xmm2, %xmm9 +movdqa %xmm4, %xmm10 +movdqa %xmm6, %xmm11 +punpckldq %xmm1, %xmm8 +punpckldq %xmm3, %xmm9 +punpckhdq %xmm1, %xmm0 +punpckhdq %xmm3, %xmm2 +punpckldq %xmm5, %xmm10 +punpckldq %xmm7, %xmm11 +punpckhdq %xmm5, %xmm4 +punpckhdq %xmm7, %xmm6 +movdqa %xmm8, %xmm1 +movdqa %xmm0, %xmm3 +movdqa %xmm10, %xmm5 +movdqa %xmm4, %xmm7 +punpcklqdq %xmm9, %xmm1 +punpcklqdq %xmm11, %xmm5 +punpckhqdq %xmm9, %xmm8 +punpckhqdq %xmm11, %xmm10 +punpcklqdq %xmm2, %xmm3 +punpcklqdq %xmm6, %xmm7 +punpckhqdq %xmm2, %xmm0 +punpckhqdq %xmm6, %xmm4 +movdqu 32(%rsi), %xmm2 +movdqu 48(%rsi), %xmm6 +movdqu 96(%rsi), %xmm9 +movdqu 112(%rsi), %xmm11 +movdqu 160(%rsi), %xmm12 +movdqu 176(%rsi), %xmm13 +movdqu 224(%rsi), %xmm14 +movdqu 240(%rsi), %xmm15 +pxor %xmm2, %xmm1 +pxor %xmm6, %xmm5 +pxor %xmm9, %xmm8 +pxor %xmm11, %xmm10 +pxor %xmm12, %xmm3 +pxor %xmm13, %xmm7 +pxor %xmm14, %xmm0 +pxor %xmm15, %xmm4 +movdqu %xmm1, 32(%rdx) +movdqu %xmm5, 48(%rdx) +movdqu %xmm8, 96(%rdx) +movdqu %xmm10, 112(%rdx) +movdqu %xmm3, 160(%rdx) +movdqu %xmm7, 176(%rdx) +movdqu %xmm0, 224(%rdx) +movdqu %xmm4, 240(%rdx) +addq $256, %rsi +jmp .Lchacha_blocks_sse2_mainloop_cont +.Lchacha_blocks_sse2_noinput1: +movdqu %xmm5, 0(%rdx) +movdqu %xmm7, 16(%rdx) +movdqu %xmm8, 64(%rdx) +movdqu %xmm10, 80(%rdx) +movdqu %xmm1, 128(%rdx) +movdqu %xmm3, 144(%rdx) +movdqu %xmm0, 192(%rdx) +movdqu %xmm4, 208(%rdx) +movdqa 384(%rsp), %xmm0 +movdqa 400(%rsp), %xmm1 +movdqa 416(%rsp), %xmm2 +movdqa 432(%rsp), %xmm3 +movdqa 448(%rsp), %xmm4 +movdqa 464(%rsp), %xmm5 +movdqa 480(%rsp), %xmm6 +movdqa 496(%rsp), %xmm7 +movdqa %xmm0, %xmm8 +movdqa %xmm2, %xmm9 +movdqa %xmm4, %xmm10 +movdqa %xmm6, %xmm11 +punpckldq %xmm1, %xmm8 +punpckldq %xmm3, %xmm9 +punpckhdq %xmm1, %xmm0 +punpckhdq %xmm3, %xmm2 +punpckldq 
%xmm5, %xmm10 +punpckldq %xmm7, %xmm11 +punpckhdq %xmm5, %xmm4 +punpckhdq %xmm7, %xmm6 +movdqa %xmm8, %xmm1 +movdqa %xmm0, %xmm3 +movdqa %xmm10, %xmm5 +movdqa %xmm4, %xmm7 +punpcklqdq %xmm9, %xmm1 +punpcklqdq %xmm11, %xmm5 +punpckhqdq %xmm9, %xmm8 +punpckhqdq %xmm11, %xmm10 +punpcklqdq %xmm2, %xmm3 +punpcklqdq %xmm6, %xmm7 +punpckhqdq %xmm2, %xmm0 +punpckhqdq %xmm6, %xmm4 +movdqu %xmm1, 32(%rdx) +movdqu %xmm5, 48(%rdx) +movdqu %xmm8, 96(%rdx) +movdqu %xmm10, 112(%rdx) +movdqu %xmm3, 160(%rdx) +movdqu %xmm7, 176(%rdx) +movdqu %xmm0, 224(%rdx) +movdqu %xmm4, 240(%rdx) +.Lchacha_blocks_sse2_mainloop_cont: +addq $256, %rdx +subq $256, %rcx +cmp $256, %rcx +jae .Lchacha_blocks_sse2_atleast256 +movdqa 0(%rsp), %xmm8 +movdqa 16(%rsp), %xmm9 +movdqa 32(%rsp), %xmm10 +movdqa 48(%rsp), %xmm11 +movq $1, %r9 +.Lchacha_blocks_sse2_below256: +movq %r9, %xmm5 +andq %rcx, %rcx +jz .Lchacha_blocks_sse2_done +cmpq $64, %rcx +jae .Lchacha_blocks_sse2_above63 +movq %rdx, %r9 +andq %rsi, %rsi +jz .Lchacha_blocks_sse2_noinput2 +movq %rcx, %r10 +movq %rsp, %rdx +addq %r10, %rsi +addq %r10, %rdx +negq %r10 +.Lchacha_blocks_sse2_copyinput: +movb (%rsi, %r10), %al +movb %al, (%rdx, %r10) +incq %r10 +jnz .Lchacha_blocks_sse2_copyinput +movq %rsp, %rsi +.Lchacha_blocks_sse2_noinput2: +movq %rsp, %rdx +.Lchacha_blocks_sse2_above63: +movdqa %xmm8, %xmm0 +movdqa %xmm9, %xmm1 +movdqa %xmm10, %xmm2 +movdqa %xmm11, %xmm3 +movq 64(%rsp), %rax +.Lchacha_blocks_sse2_mainloop2: +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +pshuflw $0xb1,%xmm3,%xmm3 +pshufhw $0xb1,%xmm3,%xmm3 +paddd %xmm3, %xmm2 +pxor %xmm2, %xmm1 +movdqa %xmm1,%xmm4 +pslld $12, %xmm1 +psrld $20, %xmm4 +pxor %xmm4, %xmm1 +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +movdqa %xmm3,%xmm4 +pslld $8, %xmm3 +psrld $24, %xmm4 +pshufd $0x93,%xmm0,%xmm0 +pxor %xmm4, %xmm3 +paddd %xmm3, %xmm2 +pshufd $0x4e,%xmm3,%xmm3 +pxor %xmm2, %xmm1 +pshufd $0x39,%xmm2,%xmm2 +movdqa %xmm1,%xmm4 +pslld $7, %xmm1 +psrld $25, %xmm4 +pxor %xmm4, %xmm1 +subq $2, %rax +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +pshuflw $0xb1,%xmm3,%xmm3 +pshufhw $0xb1,%xmm3,%xmm3 +paddd %xmm3, %xmm2 +pxor %xmm2, %xmm1 +movdqa %xmm1,%xmm4 +pslld $12, %xmm1 +psrld $20, %xmm4 +pxor %xmm4, %xmm1 +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +movdqa %xmm3,%xmm4 +pslld $8, %xmm3 +psrld $24, %xmm4 +pshufd $0x39,%xmm0,%xmm0 +pxor %xmm4, %xmm3 +paddd %xmm3, %xmm2 +pshufd $0x4e,%xmm3,%xmm3 +pxor %xmm2, %xmm1 +pshufd $0x93,%xmm2,%xmm2 +movdqa %xmm1,%xmm4 +pslld $7, %xmm1 +psrld $25, %xmm4 +pxor %xmm4, %xmm1 +jnz .Lchacha_blocks_sse2_mainloop2 +paddd %xmm8, %xmm0 +paddd %xmm9, %xmm1 +paddd %xmm10, %xmm2 +paddd %xmm11, %xmm3 +andq %rsi, %rsi +jz .Lchacha_blocks_sse2_noinput3 +movdqu 0(%rsi), %xmm12 +movdqu 16(%rsi), %xmm13 +movdqu 32(%rsi), %xmm14 +movdqu 48(%rsi), %xmm15 +pxor %xmm12, %xmm0 +pxor %xmm13, %xmm1 +pxor %xmm14, %xmm2 +pxor %xmm15, %xmm3 +addq $64, %rsi +.Lchacha_blocks_sse2_noinput3: +movdqu %xmm0, 0(%rdx) +movdqu %xmm1, 16(%rdx) +movdqu %xmm2, 32(%rdx) +movdqu %xmm3, 48(%rdx) +paddq %xmm5, %xmm11 +cmpq $64, %rcx +jbe .Lchacha_blocks_sse2_mainloop2_finishup +addq $64, %rdx +subq $64, %rcx +jmp .Lchacha_blocks_sse2_below256 +.Lchacha_blocks_sse2_mainloop2_finishup: +cmpq $64, %rcx +je .Lchacha_blocks_sse2_done +addq %rcx, %r9 +addq %rcx, %rdx +negq %rcx +.Lchacha_blocks_sse2_copyoutput: +movb (%rdx, %rcx), %al +movb %al, (%r9, %rcx) +incq %rcx +jnz .Lchacha_blocks_sse2_copyoutput +.Lchacha_blocks_sse2_done: +C movdqu %xmm11, 32(%rdi) + movdqu %xmm11, 48(%rdi) +movq %rbp, %rsp +.cfi_def_cfa_register %rsp +popq %rbp 
+.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp +popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx +W64_EXIT(5, 16) +ret +.cfi_endproc +EPILOGUE(_nettle_chacha_blocks) + +hchacha_sse2_local: +PROLOGUE(_nettle_hchacha) +.cfi_startproc +W64_ENTRY(4, 5) +movq $0x3320646e61707865, %rax +movq $0x6b20657479622d32, %r8 +movd %rax, %xmm0 +movd %r8, %xmm4 +punpcklqdq %xmm4, %xmm0 +movdqu 0(%rdi), %xmm1 +movdqu 16(%rdi), %xmm2 +movdqu 0(%rsi), %xmm3 +.Lhchacha_sse2_mainloop: +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +pshuflw $0xb1,%xmm3,%xmm3 +pshufhw $0xb1,%xmm3,%xmm3 +paddd %xmm3, %xmm2 +pxor %xmm2, %xmm1 +movdqa %xmm1,%xmm4 +pslld $12, %xmm1 +psrld $20, %xmm4 +pxor %xmm4, %xmm1 +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +movdqa %xmm3,%xmm4 +pslld $8, %xmm3 +psrld $24, %xmm4 +pshufd $0x93,%xmm0,%xmm0 +pxor %xmm4, %xmm3 +paddd %xmm3, %xmm2 +pshufd $0x4e,%xmm3,%xmm3 +pxor %xmm2, %xmm1 +pshufd $0x39,%xmm2,%xmm2 +movdqa %xmm1,%xmm4 +pslld $7, %xmm1 +psrld $25, %xmm4 +pxor %xmm4, %xmm1 +subq $2, %rcx +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +pshuflw $0xb1,%xmm3,%xmm3 +pshufhw $0xb1,%xmm3,%xmm3 +paddd %xmm3, %xmm2 +pxor %xmm2, %xmm1 +movdqa %xmm1,%xmm4 +pslld $12, %xmm1 +psrld $20, %xmm4 +pxor %xmm4, %xmm1 +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +movdqa %xmm3,%xmm4 +pslld $8, %xmm3 +psrld $24, %xmm4 +pshufd $0x39,%xmm0,%xmm0 +pxor %xmm4, %xmm3 +paddd %xmm3, %xmm2 +pshufd $0x4e,%xmm3,%xmm3 +pxor %xmm2, %xmm1 +pshufd $0x93,%xmm2,%xmm2 +movdqa %xmm1,%xmm4 +pslld $7, %xmm1 +psrld $25, %xmm4 +pxor %xmm4, %xmm1 +ja .Lhchacha_sse2_mainloop +movdqu %xmm0, 0(%rdx) +movdqu %xmm3, 16(%rdx) +W64_EXIT(4, 5) +ret +.cfi_endproc +EPILOGUE(_nettle_hchacha) + +ifelse(W64_ABI,yes,,< +C Not used in nettle, broken with W64 ABI +PROLOGUE(_nettle_chacha) +.cfi_startproc +C W64_ENTRY(6, 2) +pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_rel_offset %rbp, 0 +movq %rsp, %rbp +.cfi_def_cfa_register %rbp +subq $64, %rsp +andq $~63, %rsp +movdqu 0(%rdi), %xmm0 +movdqu 16(%rdi), %xmm1 +movdqa %xmm0, 0(%rsp) +movdqa %xmm1, 16(%rsp) +xorq %rdi, %rdi +movq %rdi, 32(%rsp) +movq 0(%rsi), %rsi +movq %rsi, 40(%rsp) +C movq %r9, 48(%rsp) +movq %rsp, %rdi +movq %rdx, %rsi +movq %rcx, %rdx +movq %r8, %rcx + movq %r9, %r8 +call chacha_blocks_sse2_local +pxor %xmm0, %xmm0 +movdqa %xmm0, 0(%rsp) +movdqa %xmm0, 16(%rsp) +movdqa %xmm0, 32(%rsp) +movq %rbp, %rsp +.cfi_def_cfa_register %rsp +popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp +C W64_EXIT(6, 2) +ret +.cfi_endproc +EPILOGUE(_nettle_chacha) + +PROLOGUE(_nettle_xchacha) +.cfi_startproc +W64_ENTRY(6, 1) +pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_rel_offset %rbp, 0 +pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_rel_offset %rbx, 0 +movq %rsp, %rbp +.cfi_def_cfa_register %rbp +subq $64, %rsp +andq $~63, %rsp +movq %rsp, %rbx +xorq %rax, %rax +movq %rax, 32(%rbx) +movq 16(%rsi), %rax +movq %rax, 40(%rbx) +C movq %r9, 48(%rbx) + pushq %r9 +pushq %rdx +pushq %rcx +pushq %r8 +movq %rbx, %rdx +movq %r9, %rcx +call hchacha_sse2_local +movq %rbx, %rdi +popq %rcx +popq %rdx +popq %rsi + popq %r8 +call chacha_blocks_sse2_local +pxor %xmm0, %xmm0 +movdqa %xmm0, 0(%rbx) +movdqa %xmm0, 16(%rbx) +movdqa %xmm0, 32(%rbx) +movq %rbp, %rsp +.cfi_def_cfa_register %rsp +popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx +popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp +W64_EXIT(6, 1) +ret +.cfi_endproc +EPILOGUE(_nettle_xchacha) +>) -- 2.11.0
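One detail worth double-checking in review: the C fallback advances the block counter as two 32-bit state words with an explicit carry (ctx->state[13] += (++ctx->state[12] == 0)), while the asm above loads state+48 as one quadword and bumps it with leaq (movq 48(%rsp), %rax / leaq 1(%rax), ...). On a little-endian target these are the same operation; a tiny stand-alone illustration of the equivalence (illustration only, not part of the patch):

#include <assert.h>
#include <stdint.h>
#include <string.h>

int
main(void)
{
  uint32_t state[2] = { 0xffffffff, 7 };   /* state[12], state[13] */
  uint64_t counter;

  /* the asm view: a single little-endian 64-bit counter */
  memcpy (&counter, state, sizeof(counter));
  counter++;

  /* the C view: 32-bit increment with carry into the high word */
  state[1] += (++state[0] == 0);

  uint64_t check;
  memcpy (&check, state, sizeof(check));
  assert (check == counter);   /* holds on little-endian targets */
  return 0;
}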
>From 21b21748be8218a0a481174bf6a80cbbd749555b Mon Sep 17 00:00:00 2001
From: "Yuriy M. Kaminskiy" <yum...@gmail.com>
Date: Mon, 11 Mar 2019 21:20:56 +0300
Subject: [PATCH 3/3] chacha-blocks: add (fat) x86/sse2 implementation

Adapted from the public-domain implementation by Andrew Moon
<liquid...@gmail.com>, https://github.com/floodyberry/chacha-opt

Before (AMD K8 @ 2.5GHz):
 STREAM enc |      5.54 ns/B     172.0 MiB/s     13.86 c/B
After:
 STREAM enc |      3.11 ns/B     306.6 MiB/s      7.78 c/B
---
 Makefile.in                  |   3 +-
 configure.ac                 |  11 +
 fat-setup.h                  |   3 +
 fat-x86.c                    | 232 +++++++++++
 x86/chacha_constants_x86.inc |   7 +
 x86/fat/chacha-blocks.asm    |  37 ++
 x86/sse2/chacha-blocks.asm   | 949 +++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 1241 insertions(+), 1 deletion(-)
 create mode 100644 fat-x86.c
 create mode 100644 x86/chacha_constants_x86.inc
 create mode 100644 x86/fat/chacha-blocks.asm
 create mode 100644 x86/sse2/chacha-blocks.asm

diff --git a/Makefile.in b/Makefile.in
index 83250cf3..d1cfc461 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -187,7 +187,7 @@ hogweed_SOURCES = sexp.c sexp-format.c \
 	ed25519-sha512-pubkey.c \
 	ed25519-sha512-sign.c ed25519-sha512-verify.c
 
-OPT_SOURCES = fat-x86_64.c fat-arm.c mini-gmp.c
+OPT_SOURCES = fat-x86.c fat-x86_64.c fat-arm.c mini-gmp.c
 
 HEADERS = aes.h arcfour.h arctwo.h asn1.h blowfish.h \
 	base16.h base64.h bignum.h buffer.h camellia.h cast128.h \
@@ -585,6 +585,7 @@ distdir: $(DISTFILES)
 	  fi ; \
 	done
 	set -e; for d in sparc32 sparc64 x86 \
+		x86/fat x86/sse2 \
 		x86_64 x86_64/aesni x86_64/fat \
 		arm arm/neon arm/v6 arm/fat ; do \
 	  mkdir "$(distdir)/$$d" ; \
diff --git a/configure.ac b/configure.ac
index 21c932a5..fd6abbed 100644
--- a/configure.ac
+++ b/configure.ac
@@ -419,6 +419,17 @@ if test "x$enable_assembler" = xyes ; then
       fi
     else
       asm_path=x86
+      if test "x$enable_fat" = xyes ; then
+	asm_path="x86/fat $asm_path"
+	OPT_NETTLE_SOURCES="fat-x86.c $OPT_NETTLE_SOURCES"
+      else
+	if test "x$enable_x86_aesni" = xyes ; then
+	  asm_path="x86/aesni $asm_path"
+	fi
+	if test "x$enable_x86_sse2" = xyes ; then
+	  asm_path="x86/sse2 $asm_path"
+	fi
+      fi
     fi
     ;;
   *sparc*)
diff --git a/fat-setup.h b/fat-setup.h
index b623ebf9..73c8e7a7 100644
--- a/fat-setup.h
+++ b/fat-setup.h
@@ -176,3 +176,6 @@ typedef void umac_nh_n_func (uint64_t *out, unsigned n, const uint32_t *key,
 			     unsigned length, const uint8_t *msg);
 
 typedef void chacha_core_func(uint32_t *dst, const uint32_t *src, unsigned rounds);
+
+struct chacha_ctx;
+typedef void chacha_blocks_func(struct chacha_ctx *ctx, const uint8_t *src, uint8_t *dst, size_t length, size_t rounds);
diff --git a/fat-x86.c b/fat-x86.c
new file mode 100644
index 00000000..9d4e6bda
--- /dev/null
+++ b/fat-x86.c
@@ -0,0 +1,232 @@
+/* fat-x86.c
+
+   Copyright (C) 2015 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#define _GNU_SOURCE
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "nettle-types.h"
+
+#include "aes-internal.h"
+#include "memxor.h"
+#include "fat-setup.h"
+
+#if 0
+void _nettle_cpuid (uint32_t input, uint32_t regs[4]);
+#endif
+
+struct x86_features
+{
+#if 0
+  enum x86_vendor { X86_OTHER, X86_INTEL, X86_AMD } vendor;
+  int have_aesni;
+#endif
+  int have_sse2;
+};
+
+#define SKIP(s, slen, literal, llen)				\
+  (((slen) >= (llen) && memcmp ((s), (literal), llen) == 0)	\
+   ? ((slen) -= (llen), (s) += (llen), 1) : 0)
+#define MATCH(s, slen, literal, llen)				\
+  ((slen) == (llen) && memcmp ((s), (literal), llen) == 0)
+
+static void
+get_x86_features (struct x86_features *features)
+{
+  const char *s;
+#if 0
+  features->vendor = X86_OTHER;
+  features->have_aesni = 0;
+#endif
+  features->have_sse2 = 0;
+
+  s = secure_getenv (ENV_OVERRIDE);
+  if (s)
+    for (;;)
+      {
+	const char *sep = strchr (s, ',');
+	size_t length = sep ? (size_t) (sep - s) : strlen(s);
+
+#if 0
+	if (SKIP (s, length, "vendor:", 7))
+	  {
+	    if (MATCH(s, length, "intel", 5))
+	      features->vendor = X86_INTEL;
+	    else if (MATCH(s, length, "amd", 3))
+	      features->vendor = X86_AMD;
+	  }
+	else
+	if (MATCH (s, length, "aesni", 5))
+	  features->have_aesni = 1;
+	else
+#endif
+	if (MATCH (s, length, "sse2", 4))
+	  features->have_sse2 = 1;
+	if (!sep)
+	  break;
+	s = sep + 1;
+      }
+  else
+    {
+#if 0
+      uint32_t cpuid_data[4];
+      _nettle_cpuid (0, cpuid_data);
+      if (memcmp (cpuid_data + 1, "Genu" "ntel" "ineI", 12) == 0)
+	features->vendor = X86_INTEL;
+      else if (memcmp (cpuid_data + 1, "Auth" "cAMD" "enti", 12) == 0)
+	features->vendor = X86_AMD;
+
+      _nettle_cpuid (1, cpuid_data);
+      if (cpuid_data[2] & 0x02000000)
+	features->have_aesni = 1;
+#endif
+#if __GNUC__ && __GNUC__ >= 6 /* arbitrary */
+      if (__builtin_cpu_supports("sse2"))
+	features->have_sse2 = 1;
+#endif
+    }
+}
+
+#if 0
+DECLARE_FAT_FUNC(_nettle_aes_encrypt, aes_crypt_internal_func)
+DECLARE_FAT_FUNC_VAR(aes_encrypt, aes_crypt_internal_func, x86_64)
+DECLARE_FAT_FUNC_VAR(aes_encrypt, aes_crypt_internal_func, aesni)
+
+DECLARE_FAT_FUNC(_nettle_aes_decrypt, aes_crypt_internal_func)
+DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, x86_64)
+DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, aesni)
+
+DECLARE_FAT_FUNC(nettle_memxor, memxor_func)
+DECLARE_FAT_FUNC_VAR(memxor, memxor_func, x86_64)
+DECLARE_FAT_FUNC_VAR(memxor, memxor_func, sse2)
+#endif
+DECLARE_FAT_FUNC(_nettle_chacha_blocks, chacha_blocks_func)
+DECLARE_FAT_FUNC_VAR(chacha_blocks, chacha_blocks_func, c)
+DECLARE_FAT_FUNC_VAR(chacha_blocks, chacha_blocks_func, sse2)
+
+/* This function should usually be called only once, at startup. But
+   it is idempotent, and on x86, pointer updates are atomic, so
+   there's no danger if it is called simultaneously from multiple
+   threads. */
+static void CONSTRUCTOR
+fat_init (void)
+{
+  struct x86_features features;
+  int verbose;
+
+  /* FIXME: Replace all getenv calls by getenv_secure? */
+  verbose = getenv (ENV_VERBOSE) != NULL;
+  if (verbose)
+    fprintf (stderr, "libnettle: fat library initialization.\n");
+
+  get_x86_features (&features);
+#if 0
+  if (verbose)
+    {
+      const char * const vendor_names[3] =
+	{ "other", "intel", "amd" };
+      fprintf (stderr, "libnettle: cpu features: vendor:%s%s\n",
+	       vendor_names[features.vendor],
+	       features.have_aesni ? ",aesni" : "");
+    }
+  if (features.have_aesni)
+    {
+      if (verbose)
+	fprintf (stderr, "libnettle: using aes instructions.\n");
+      _nettle_aes_encrypt_vec = _nettle_aes_encrypt_aesni;
+      _nettle_aes_decrypt_vec = _nettle_aes_decrypt_aesni;
+    }
+  else
+    {
+      if (verbose)
+	fprintf (stderr, "libnettle: not using aes instructions.\n");
+      _nettle_aes_encrypt_vec = _nettle_aes_encrypt_x86_64;
+      _nettle_aes_decrypt_vec = _nettle_aes_decrypt_x86_64;
+    }
+
+  if (features.vendor == X86_INTEL)
+    {
+      if (verbose)
+	fprintf (stderr, "libnettle: intel SSE2 will be used for memxor.\n");
+      nettle_memxor_vec = _nettle_memxor_sse2;
+    }
+  else
+    {
+      if (verbose)
+	fprintf (stderr, "libnettle: intel SSE2 will not be used for memxor.\n");
+      nettle_memxor_vec = _nettle_memxor_x86_64;
+    }
+#endif
+  if (features.have_sse2)
+    {
+      if (verbose)
+	fprintf (stderr, "libnettle: using sse2 instructions.\n");
+      _nettle_chacha_blocks_vec = _nettle_chacha_blocks_sse2;
+    }
+  else
+    {
+      if (verbose)
+	fprintf (stderr, "libnettle: not using sse2 instructions.\n");
+      _nettle_chacha_blocks_vec = _nettle_chacha_blocks_c;
+    }
+}
+
+#if 0
+DEFINE_FAT_FUNC(_nettle_aes_encrypt, void,
+		(unsigned rounds, const uint32_t *keys,
+		 const struct aes_table *T,
+		 size_t length, uint8_t *dst,
+		 const uint8_t *src),
+		(rounds, keys, T, length, dst, src))
+
+DEFINE_FAT_FUNC(_nettle_aes_decrypt, void,
+		(unsigned rounds, const uint32_t *keys,
+		 const struct aes_table *T,
+		 size_t length, uint8_t *dst,
+		 const uint8_t *src),
+		(rounds, keys, T, length, dst, src))
+
+DEFINE_FAT_FUNC(nettle_memxor, void *,
+		(void *dst, const void *src, size_t n),
+		(dst, src, n))
+#endif
+
+DEFINE_FAT_FUNC(_nettle_chacha_blocks, void,
+		(struct chacha_ctx *ctx, const uint8_t *src, uint8_t *dst, size_t length, size_t rounds),
+		(ctx, src, dst, length, rounds))
diff --git a/x86/chacha_constants_x86.inc b/x86/chacha_constants_x86.inc
new file mode 100644
index 00000000..79b9881b
--- /dev/null
+++ b/x86/chacha_constants_x86.inc
@@ -0,0 +1,7 @@
+.section .rodata
+.p2align 4,,15
+chacha_constants:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 /* "expand 32-byte k" */
+.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */
+.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */
+
diff --git a/x86/fat/chacha-blocks.asm b/x86/fat/chacha-blocks.asm
new file mode 100644
index 00000000..98d5c8a6
--- /dev/null
+++ b/x86/fat/chacha-blocks.asm
@@ -0,0 +1,37 @@
+C x86/fat/chacha-blocks.asm
+
+ifelse(<
+   Copyright (C) 2015 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+dnl PROLOGUE(_nettle_chacha_blocks) picked up by configure
+dnl PROLOGUE(_nettle_hchacha) picked up by configure
+
+define(<fat_transform>, <$1_sse2>)
+include_src(<x86/sse2/chacha-blocks.asm>)
diff --git a/x86/sse2/chacha-blocks.asm b/x86/sse2/chacha-blocks.asm
new file mode 100644
index 00000000..786db7b7
--- /dev/null
+++ b/x86/sse2/chacha-blocks.asm
@@ -0,0 +1,949 @@
+C x86/sse2/chacha-blocks.asm
+
+ifelse(<
+   Copyright (C) 2014 Andrew Moon <liquid...@gmail.com>
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>) + +C This file was released as Public-Domain-or-MIT at +C https://github.com/floodyberry/chacha-opt + + .file "chacha-blocks.asm" + + C chacha_blocks(struct chacha_ctx *ctx, + C const uint8_t *src, uint8_t *dst, size_t length, + C unsigned rounds) + .text +.macro LOAD_VAR_PIC var, reg + call 1f + 1: + popl \reg + leal \var - 1b(\reg), \reg +.endm + ALIGN(16) +chacha_blocks_sse2_local: +PROLOGUE(_nettle_chacha_blocks) +.cfi_startproc +pushl %ebp +.cfi_adjust_cfa_offset 4 +.cfi_rel_offset %ebp, 0 +.cfi_def_cfa_register %ebp + pushl %ebx +.cfi_adjust_cfa_offset 4 +.cfi_rel_offset %ebx, 0 + pushl %esi +.cfi_adjust_cfa_offset 4 +.cfi_rel_offset %esi, 0 + pushl %edi +.cfi_adjust_cfa_offset 4 +.cfi_rel_offset %edi, 0 +movl %esp, %ebp +andl $~63, %esp +sub $704, %esp +C movl %ebx,68(%esp) +C movl %esi,72(%esp +C movl %edi,76(%esp) +C movl 8(%ebp),%ecx + movl 8+12(%ebp),%ecx +movl %ecx,84(%esp) +C movl 12(%ebp),%esi + movl 12+12(%ebp),%esi +C movl 16(%ebp),%edx + movl 16+12(%ebp),%edx +C movl 20(%ebp),%eax + movl 20+12(%ebp),%eax +C LOAD_VAR_PIC chacha_constants, %ebx +C movdqa 0(%ebx), %xmm0 +C movdqu 0(%ecx),%xmm1 +C movdqu 16(%ecx),%xmm2 +C movdqu 32(%ecx),%xmm3 + movdqu 0(%ecx), %xmm0 + movdqu 16(%ecx), %xmm1 + movdqu 32(%ecx), %xmm2 + movdqu 48(%ecx), %xmm3 +movdqa %xmm0,0(%esp) +movdqa %xmm1,16(%esp) +movdqa %xmm2,32(%esp) +movdqa %xmm3,48(%esp) +C movl 48(%ecx),%ecx + movl 24+12(%ebp),%ecx +movl %ecx,88(%esp) +cmpl $0,%eax +jbe .Lchacha_blocks_sse2_done +cmpl $256,%eax +jb .Lchacha_blocks_sse2_bytesbetween1and255 +pshufd $0x00, %xmm0, %xmm4 +pshufd $0x55, %xmm0, %xmm5 +pshufd $0xaa, %xmm0, %xmm6 +pshufd $0xff, %xmm0, %xmm0 +movdqa %xmm4,128(%esp) +movdqa %xmm5,144(%esp) +movdqa %xmm6,160(%esp) +movdqa %xmm0,176(%esp) +pshufd $0x00, %xmm1, %xmm0 +pshufd $0x55, %xmm1, %xmm4 +pshufd $0xaa, %xmm1, %xmm5 +pshufd $0xff, %xmm1, %xmm1 +movdqa %xmm0,192(%esp) +movdqa %xmm4,208(%esp) +movdqa %xmm5,224(%esp) +movdqa %xmm1,240(%esp) +pshufd $0x00, %xmm2, %xmm0 +pshufd $0x55, %xmm2, %xmm1 +pshufd $0xaa, %xmm2, %xmm4 +pshufd $0xff, %xmm2, %xmm2 +movdqa %xmm0,256(%esp) +movdqa %xmm1,272(%esp) +movdqa %xmm4,288(%esp) +movdqa %xmm2,304(%esp) +pshufd $0xaa, %xmm3, %xmm0 +pshufd $0xff, %xmm3, %xmm1 +movdqa %xmm0,352(%esp) +movdqa %xmm1,368(%esp) +.Lchacha_blocks_sse2_bytesatleast256: +movl 48(%esp),%ecx +movl 4+48(%esp),%ebx +movl %ecx,320(%esp) +movl %ebx,336(%esp) +addl $1,%ecx +adcl $0,%ebx +movl %ecx,4+320(%esp) +movl %ebx,4+336(%esp) +addl $1,%ecx +adcl $0,%ebx +movl %ecx,8+320(%esp) +movl %ebx,8+336(%esp) +addl $1,%ecx +adcl $0,%ebx +movl %ecx,12+320(%esp) +movl %ebx,12+336(%esp) +addl $1,%ecx +adcl $0,%ebx +movl %ecx,48(%esp) +movl %ebx,4+48(%esp) +movl %eax,92(%esp) +movl 88(%esp),%eax +movdqa 160(%esp),%xmm0 +movdqa 224(%esp),%xmm1 +movdqa 288(%esp),%xmm2 +movdqa 352(%esp),%xmm3 +movdqa 176(%esp),%xmm4 +movdqa 240(%esp),%xmm5 +movdqa 304(%esp),%xmm6 +movdqa 368(%esp),%xmm7 +movdqa %xmm0,480(%esp) +movdqa %xmm1,544(%esp) +movdqa %xmm2,608(%esp) +movdqa %xmm3,672(%esp) +movdqa %xmm4,496(%esp) +movdqa %xmm5,560(%esp) +movdqa %xmm6,624(%esp) +movdqa %xmm7,688(%esp) +movdqa 128(%esp),%xmm0 +movdqa 192(%esp),%xmm1 +movdqa 256(%esp),%xmm2 +movdqa 320(%esp),%xmm3 +movdqa 144(%esp),%xmm4 +movdqa 208(%esp),%xmm5 +movdqa 272(%esp),%xmm6 +movdqa 336(%esp),%xmm7 +jmp .Lchacha_blocks_sse2_mainloop1 +.p2align 6 +.Lchacha_blocks_sse2_mainloop1: +paddd %xmm1, %xmm0 +paddd %xmm5, %xmm4 +pxor %xmm0, %xmm3 +pxor %xmm4, %xmm7 +pshuflw $0xb1,%xmm3,%xmm3 +pshuflw $0xb1,%xmm7,%xmm7 +pshufhw $0xb1,%xmm3,%xmm3 +pshufhw 
$0xb1,%xmm7,%xmm7 +movdqa %xmm4, 464(%esp) +paddd %xmm3, %xmm2 +paddd %xmm7, %xmm6 +pxor %xmm2, %xmm1 +pxor %xmm6, %xmm5 +movdqa %xmm1, %xmm4 +pslld $12, %xmm1 +psrld $20, %xmm4 +por %xmm4, %xmm1 +movdqa %xmm5, %xmm4 +pslld $12, %xmm5 +psrld $20, %xmm4 +por %xmm4, %xmm5 +movdqa 464(%esp), %xmm4 +paddd %xmm1, %xmm0 +paddd %xmm5, %xmm4 +pxor %xmm0, %xmm3 +pxor %xmm4, %xmm7 +movdqa %xmm0, 448(%esp) +movdqa %xmm4, 464(%esp) +movdqa %xmm3, %xmm4 +pslld $8, %xmm3 +psrld $24, %xmm4 +por %xmm4, %xmm3 +movdqa %xmm7, %xmm4 +pslld $8, %xmm7 +psrld $24, %xmm4 +por %xmm4, %xmm7 +paddd %xmm3, %xmm2 +paddd %xmm7, %xmm6 +movdqa %xmm3, 640(%esp) +movdqa %xmm7, 656(%esp) +pxor %xmm2, %xmm1 +pxor %xmm6, %xmm5 +movdqa %xmm2, 576(%esp) +movdqa %xmm6, 592(%esp) +movdqa %xmm1,%xmm4 +pslld $7, %xmm1 +movdqa 672(%esp), %xmm3 +psrld $25, %xmm4 +movdqa 688(%esp), %xmm7 +por %xmm4, %xmm1 +movdqa %xmm1, 512(%esp) +movdqa %xmm5, %xmm0 +pslld $7, %xmm5 +movdqa 608(%esp), %xmm2 +psrld $25, %xmm0 +movdqa 624(%esp), %xmm6 +por %xmm0, %xmm5 +movdqa %xmm5, 528(%esp) +movdqa 480(%esp), %xmm0 +movdqa 496(%esp), %xmm4 +movdqa 544(%esp), %xmm1 +movdqa 560(%esp), %xmm5 +paddd %xmm1, %xmm0 +paddd %xmm5, %xmm4 +pxor %xmm0, %xmm3 +pxor %xmm4, %xmm7 +pshuflw $0xb1,%xmm3,%xmm3 +pshuflw $0xb1,%xmm7,%xmm7 +pshufhw $0xb1,%xmm3,%xmm3 +pshufhw $0xb1,%xmm7,%xmm7 +movdqa %xmm4, 496(%esp) +paddd %xmm3, %xmm2 +paddd %xmm7, %xmm6 +pxor %xmm2, %xmm1 +pxor %xmm6, %xmm5 +movdqa %xmm1, %xmm4 +pslld $12, %xmm1 +psrld $20, %xmm4 +por %xmm4, %xmm1 +movdqa %xmm5, %xmm4 +pslld $12, %xmm5 +psrld $20, %xmm4 +por %xmm4, %xmm5 +movdqa 496(%esp), %xmm4 +paddd %xmm1, %xmm0 +paddd %xmm5, %xmm4 +pxor %xmm0, %xmm3 +pxor %xmm4, %xmm7 +movdqa %xmm0, 480(%esp) +movdqa %xmm4, 496(%esp) +movdqa %xmm3, %xmm4 +pslld $8, %xmm3 +psrld $24, %xmm4 +por %xmm4, %xmm3 +movdqa %xmm7, %xmm4 +pslld $8, %xmm7 +psrld $24, %xmm4 +por %xmm4, %xmm7 +paddd %xmm3, %xmm2 +paddd %xmm7, %xmm6 +movdqa %xmm3, 672(%esp) +pxor %xmm2, %xmm1 +pxor %xmm6, %xmm5 +movdqa %xmm1,%xmm4 +pslld $7, %xmm1 +psrld $25, %xmm4 +movdqa 640(%esp), %xmm3 +por %xmm4, %xmm1 +movdqa %xmm5, %xmm0 +pslld $7, %xmm5 +psrld $25, %xmm0 +por %xmm0, %xmm5 +movdqa %xmm5, 560(%esp) +movdqa 448(%esp), %xmm0 +movdqa 528(%esp), %xmm5 +movdqa 464(%esp), %xmm4 +paddd %xmm5, %xmm0 +paddd %xmm1, %xmm4 +pxor %xmm0, %xmm7 +pxor %xmm4, %xmm3 +pshuflw $0xb1,%xmm7,%xmm7 +pshuflw $0xb1,%xmm3,%xmm3 +pshufhw $0xb1,%xmm7,%xmm7 +pshufhw $0xb1,%xmm3,%xmm3 +movdqa %xmm4, 464(%esp) +paddd %xmm7, %xmm2 +paddd %xmm3, %xmm6 +pxor %xmm2, %xmm5 +pxor %xmm6, %xmm1 +movdqa %xmm5, %xmm4 +pslld $12, %xmm5 +psrld $20, %xmm4 +por %xmm4, %xmm5 +movdqa %xmm1, %xmm4 +pslld $12, %xmm1 +psrld $20, %xmm4 +por %xmm4, %xmm1 +movdqa 464(%esp), %xmm4 +paddd %xmm5, %xmm0 +paddd %xmm1, %xmm4 +pxor %xmm0, %xmm7 +pxor %xmm4, %xmm3 +movdqa %xmm0, 448(%esp) +movdqa %xmm4, 464(%esp) +movdqa %xmm7, %xmm4 +pslld $8, %xmm7 +psrld $24, %xmm4 +por %xmm4, %xmm7 +movdqa %xmm3, %xmm4 +pslld $8, %xmm3 +psrld $24, %xmm4 +por %xmm4, %xmm3 +paddd %xmm7, %xmm2 +paddd %xmm3, %xmm6 +movdqa %xmm7, 688(%esp) +movdqa %xmm3, 640(%esp) +pxor %xmm2, %xmm5 +pxor %xmm6, %xmm1 +movdqa %xmm2, 608(%esp) +movdqa %xmm6, 624(%esp) +movdqa %xmm5,%xmm4 +pslld $7, %xmm5 +movdqa 656(%esp), %xmm7 +psrld $25, %xmm4 +movdqa 672(%esp), %xmm3 +por %xmm4, %xmm5 +movdqa %xmm5, 528(%esp) +movdqa %xmm1, %xmm0 +pslld $7, %xmm1 +movdqa 576(%esp), %xmm2 +psrld $25, %xmm0 +movdqa 592(%esp), %xmm6 +por %xmm0, %xmm1 +movdqa %xmm1, 544(%esp) +movdqa 480(%esp), %xmm0 +movdqa 496(%esp), %xmm4 +movdqa 560(%esp), %xmm5 
+movdqa 512(%esp), %xmm1 +paddd %xmm5, %xmm0 +paddd %xmm1, %xmm4 +pxor %xmm0, %xmm7 +pxor %xmm4, %xmm3 +pshuflw $0xb1,%xmm7,%xmm7 +pshuflw $0xb1,%xmm3,%xmm3 +pshufhw $0xb1,%xmm7,%xmm7 +pshufhw $0xb1,%xmm3,%xmm3 +movdqa %xmm4, 496(%esp) +paddd %xmm7, %xmm2 +paddd %xmm3, %xmm6 +pxor %xmm2, %xmm5 +pxor %xmm6, %xmm1 +movdqa %xmm5, %xmm4 +pslld $12, %xmm5 +psrld $20, %xmm4 +por %xmm4, %xmm5 +movdqa %xmm1, %xmm4 +pslld $12, %xmm1 +psrld $20, %xmm4 +por %xmm4, %xmm1 +movdqa 496(%esp), %xmm4 +paddd %xmm5, %xmm0 +paddd %xmm1, %xmm4 +pxor %xmm0, %xmm7 +pxor %xmm4, %xmm3 +movdqa %xmm0, 480(%esp) +movdqa %xmm4, 496(%esp) +movdqa %xmm7, %xmm4 +pslld $8, %xmm7 +psrld $24, %xmm4 +por %xmm4, %xmm7 +movdqa %xmm3, %xmm4 +pslld $8, %xmm3 +psrld $24, %xmm4 +por %xmm4, %xmm3 +paddd %xmm7, %xmm2 +paddd %xmm3, %xmm6 +movdqa %xmm3, 672(%esp) +pxor %xmm2, %xmm5 +pxor %xmm6, %xmm1 +movdqa %xmm5,%xmm4 +pslld $7, %xmm5 +psrld $25, %xmm4 +movdqa 640(%esp), %xmm3 +por %xmm4, %xmm5 +movdqa %xmm5, 560(%esp) +movdqa %xmm1, %xmm0 +pslld $7, %xmm1 +psrld $25, %xmm0 +por %xmm0, %xmm1 +movdqa 448(%esp), %xmm0 +movdqa 464(%esp), %xmm4 +movdqa 528(%esp), %xmm5 +subl $2,%eax +ja .Lchacha_blocks_sse2_mainloop1 +movdqa %xmm0, 448(%esp) +movdqa %xmm1, 512(%esp) +movdqa %xmm2, 576(%esp) +movdqa %xmm3, 640(%esp) +movdqa %xmm4, 464(%esp) +movdqa %xmm5, 528(%esp) +movdqa %xmm6, 592(%esp) +movdqa %xmm7, 656(%esp) +cmpl $0,%esi +movdqa 448(%esp),%xmm0 +movdqa 464(%esp),%xmm1 +movdqa 480(%esp),%xmm2 +movdqa 496(%esp),%xmm3 +paddd 128(%esp), %xmm0 +paddd 144(%esp), %xmm1 +paddd 160(%esp), %xmm2 +paddd 176(%esp), %xmm3 +movdqa %xmm0, %xmm4 +movdqa %xmm2, %xmm5 +punpckldq %xmm1, %xmm4 +punpckldq %xmm3, %xmm5 +punpckhdq %xmm1, %xmm0 +punpckhdq %xmm3, %xmm2 +movdqa %xmm4, %xmm1 +movdqa %xmm0, %xmm3 +punpcklqdq %xmm5, %xmm4 +punpckhqdq %xmm5, %xmm1 +punpcklqdq %xmm2, %xmm0 +punpckhqdq %xmm2, %xmm3 +jbe .Lchacha_blocks_sse2_noinput1 +movdqu 0(%esi), %xmm2 +movdqu 64(%esi), %xmm5 +movdqu 128(%esi), %xmm6 +movdqu 192(%esi), %xmm7 +pxor %xmm2, %xmm4 +pxor %xmm5, %xmm1 +pxor %xmm6, %xmm0 +pxor %xmm7, %xmm3 +movdqu %xmm4, 0(%edx) +movdqu %xmm1, 64(%edx) +movdqu %xmm0, 128(%edx) +movdqu %xmm3, 192(%edx) +movdqa 512(%esp),%xmm0 +movdqa 528(%esp),%xmm1 +movdqa 544(%esp),%xmm2 +movdqa 560(%esp),%xmm3 +paddd 192(%esp), %xmm0 +paddd 208(%esp), %xmm1 +paddd 224(%esp), %xmm2 +paddd 240(%esp), %xmm3 +movdqa %xmm0, %xmm4 +movdqa %xmm2, %xmm5 +punpckldq %xmm1, %xmm4 +punpckldq %xmm3, %xmm5 +punpckhdq %xmm1, %xmm0 +punpckhdq %xmm3, %xmm2 +movdqa %xmm4, %xmm1 +movdqa %xmm0, %xmm3 +punpcklqdq %xmm5, %xmm4 +punpckhqdq %xmm5, %xmm1 +punpcklqdq %xmm2, %xmm0 +punpckhqdq %xmm2, %xmm3 +movdqu 16+0(%esi), %xmm2 +movdqu 16+64(%esi), %xmm5 +movdqu 16+128(%esi), %xmm6 +movdqu 16+192(%esi), %xmm7 +pxor %xmm2, %xmm4 +pxor %xmm5, %xmm1 +pxor %xmm6, %xmm0 +pxor %xmm7, %xmm3 +movdqu %xmm4, 16+0(%edx) +movdqu %xmm1, 16+64(%edx) +movdqu %xmm0, 16+128(%edx) +movdqu %xmm3, 16+192(%edx) +movdqa 576(%esp),%xmm0 +movdqa 592(%esp),%xmm1 +movdqa 608(%esp),%xmm2 +movdqa 624(%esp),%xmm3 +paddd 256(%esp), %xmm0 +paddd 272(%esp), %xmm1 +paddd 288(%esp), %xmm2 +paddd 304(%esp), %xmm3 +movdqa %xmm0, %xmm4 +movdqa %xmm2, %xmm5 +punpckldq %xmm1, %xmm4 +punpckldq %xmm3, %xmm5 +punpckhdq %xmm1, %xmm0 +punpckhdq %xmm3, %xmm2 +movdqa %xmm4, %xmm1 +movdqa %xmm0, %xmm3 +punpcklqdq %xmm5, %xmm4 +punpckhqdq %xmm5, %xmm1 +punpcklqdq %xmm2, %xmm0 +punpckhqdq %xmm2, %xmm3 +movdqu 32+0(%esi), %xmm2 +movdqu 32+64(%esi), %xmm5 +movdqu 32+128(%esi), %xmm6 +movdqu 32+192(%esi), %xmm7 +pxor %xmm2, %xmm4 +pxor 
%xmm5, %xmm1 +pxor %xmm6, %xmm0 +pxor %xmm7, %xmm3 +movdqu %xmm4, 32+0(%edx) +movdqu %xmm1, 32+64(%edx) +movdqu %xmm0, 32+128(%edx) +movdqu %xmm3, 32+192(%edx) +movdqa 640(%esp),%xmm0 +movdqa 656(%esp),%xmm1 +movdqa 672(%esp),%xmm2 +movdqa 688(%esp),%xmm3 +paddd 320(%esp), %xmm0 +paddd 336(%esp), %xmm1 +paddd 352(%esp), %xmm2 +paddd 368(%esp), %xmm3 +movdqa %xmm0, %xmm4 +movdqa %xmm2, %xmm5 +punpckldq %xmm1, %xmm4 +punpckldq %xmm3, %xmm5 +punpckhdq %xmm1, %xmm0 +punpckhdq %xmm3, %xmm2 +movdqa %xmm4, %xmm1 +movdqa %xmm0, %xmm3 +punpcklqdq %xmm5, %xmm4 +punpckhqdq %xmm5, %xmm1 +punpcklqdq %xmm2, %xmm0 +punpckhqdq %xmm2, %xmm3 +movdqu 48+0(%esi), %xmm2 +movdqu 48+64(%esi), %xmm5 +movdqu 48+128(%esi), %xmm6 +movdqu 48+192(%esi), %xmm7 +pxor %xmm2, %xmm4 +pxor %xmm5, %xmm1 +pxor %xmm6, %xmm0 +pxor %xmm7, %xmm3 +movdqu %xmm4, 48+0(%edx) +movdqu %xmm1, 48+64(%edx) +movdqu %xmm0, 48+128(%edx) +movdqu %xmm3, 48+192(%edx) +addl $256,%esi +jmp .Lchacha_blocks_sse2_mainloop1_cont +.Lchacha_blocks_sse2_noinput1: +movdqu %xmm4, 0(%edx) +movdqu %xmm1, 64(%edx) +movdqu %xmm0, 128(%edx) +movdqu %xmm3, 192(%edx) +movdqa 512(%esp),%xmm0 +movdqa 528(%esp),%xmm1 +movdqa 544(%esp),%xmm2 +movdqa 560(%esp),%xmm3 +paddd 192(%esp), %xmm0 +paddd 208(%esp), %xmm1 +paddd 224(%esp), %xmm2 +paddd 240(%esp), %xmm3 +movdqa %xmm0, %xmm4 +movdqa %xmm2, %xmm5 +punpckldq %xmm1, %xmm4 +punpckldq %xmm3, %xmm5 +punpckhdq %xmm1, %xmm0 +punpckhdq %xmm3, %xmm2 +movdqa %xmm4, %xmm1 +movdqa %xmm0, %xmm3 +punpcklqdq %xmm5, %xmm4 +punpckhqdq %xmm5, %xmm1 +punpcklqdq %xmm2, %xmm0 +punpckhqdq %xmm2, %xmm3 +movdqu %xmm4, 16+0(%edx) +movdqu %xmm1, 16+64(%edx) +movdqu %xmm0, 16+128(%edx) +movdqu %xmm3, 16+192(%edx) +movdqa 576(%esp),%xmm0 +movdqa 592(%esp),%xmm1 +movdqa 608(%esp),%xmm2 +movdqa 624(%esp),%xmm3 +paddd 256(%esp), %xmm0 +paddd 272(%esp), %xmm1 +paddd 288(%esp), %xmm2 +paddd 304(%esp), %xmm3 +movdqa %xmm0, %xmm4 +movdqa %xmm2, %xmm5 +punpckldq %xmm1, %xmm4 +punpckldq %xmm3, %xmm5 +punpckhdq %xmm1, %xmm0 +punpckhdq %xmm3, %xmm2 +movdqa %xmm4, %xmm1 +movdqa %xmm0, %xmm3 +punpcklqdq %xmm5, %xmm4 +punpckhqdq %xmm5, %xmm1 +punpcklqdq %xmm2, %xmm0 +punpckhqdq %xmm2, %xmm3 +movdqu %xmm4, 32+0(%edx) +movdqu %xmm1, 32+64(%edx) +movdqu %xmm0, 32+128(%edx) +movdqu %xmm3, 32+192(%edx) +movdqa 640(%esp),%xmm0 +movdqa 656(%esp),%xmm1 +movdqa 672(%esp),%xmm2 +movdqa 688(%esp),%xmm3 +paddd 320(%esp), %xmm0 +paddd 336(%esp), %xmm1 +paddd 352(%esp), %xmm2 +paddd 368(%esp), %xmm3 +movdqa %xmm0, %xmm4 +movdqa %xmm2, %xmm5 +punpckldq %xmm1, %xmm4 +punpckldq %xmm3, %xmm5 +punpckhdq %xmm1, %xmm0 +punpckhdq %xmm3, %xmm2 +movdqa %xmm4, %xmm1 +movdqa %xmm0, %xmm3 +punpcklqdq %xmm5, %xmm4 +punpckhqdq %xmm5, %xmm1 +punpcklqdq %xmm2, %xmm0 +punpckhqdq %xmm2, %xmm3 +movdqu %xmm4, 48+0(%edx) +movdqu %xmm1, 48+64(%edx) +movdqu %xmm0, 48+128(%edx) +movdqu %xmm3, 48+192(%edx) +.Lchacha_blocks_sse2_mainloop1_cont: +movl 92(%esp),%eax +subl $256,%eax +addl $256,%edx +cmpl $256,%eax +jae .Lchacha_blocks_sse2_bytesatleast256 +cmpl $0,%eax +jbe .Lchacha_blocks_sse2_done +.Lchacha_blocks_sse2_bytesbetween1and255: +cmpl $64,%eax +jae .Lchacha_blocks_sse2_nocopy +movl %edx,92(%esp) +cmpl $0,%esi +jbe .Lchacha_blocks_sse2_noinput2 +leal 128(%esp),%edi +movl %eax,%ecx +rep movsb +leal 128(%esp),%esi +.Lchacha_blocks_sse2_noinput2: +leal 128(%esp),%edx +.Lchacha_blocks_sse2_nocopy: +movl %eax,80(%esp) +movdqa 0(%esp),%xmm0 +movdqa 16(%esp),%xmm1 +movdqa 32(%esp),%xmm2 +movdqa 48(%esp),%xmm3 +movl 88(%esp),%eax +.Lchacha_blocks_sse2_mainloop2: +paddd %xmm1, %xmm0 +pxor 
%xmm0, %xmm3 +pshuflw $0xb1,%xmm3,%xmm3 +pshufhw $0xb1,%xmm3,%xmm3 +paddd %xmm3, %xmm2 +pxor %xmm2, %xmm1 +movdqa %xmm1,%xmm4 +pslld $12, %xmm1 +psrld $20, %xmm4 +pxor %xmm4, %xmm1 +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +movdqa %xmm3,%xmm4 +pslld $8, %xmm3 +psrld $24, %xmm4 +pshufd $0x93,%xmm0,%xmm0 +pxor %xmm4, %xmm3 +paddd %xmm3, %xmm2 +pshufd $0x4e,%xmm3,%xmm3 +pxor %xmm2, %xmm1 +pshufd $0x39,%xmm2,%xmm2 +movdqa %xmm1,%xmm4 +pslld $7, %xmm1 +psrld $25, %xmm4 +pxor %xmm4, %xmm1 +subl $2, %eax +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +pshuflw $0xb1,%xmm3,%xmm3 +pshufhw $0xb1,%xmm3,%xmm3 +paddd %xmm3, %xmm2 +pxor %xmm2, %xmm1 +movdqa %xmm1,%xmm4 +pslld $12, %xmm1 +psrld $20, %xmm4 +pxor %xmm4, %xmm1 +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +movdqa %xmm3,%xmm4 +pslld $8, %xmm3 +psrld $24, %xmm4 +pshufd $0x39,%xmm0,%xmm0 +pxor %xmm4, %xmm3 +paddd %xmm3, %xmm2 +pshufd $0x4e,%xmm3,%xmm3 +pxor %xmm2, %xmm1 +pshufd $0x93,%xmm2,%xmm2 +movdqa %xmm1,%xmm4 +pslld $7, %xmm1 +psrld $25, %xmm4 +pxor %xmm4, %xmm1 +ja .Lchacha_blocks_sse2_mainloop2 +paddd 0(%esp), %xmm0 +paddd 16(%esp), %xmm1 +paddd 32(%esp), %xmm2 +paddd 48(%esp), %xmm3 +cmpl $0,%esi +jbe .Lchacha_blocks_sse2_noinput3 +movdqu 0(%esi),%xmm4 +movdqu 16(%esi),%xmm5 +movdqu 32(%esi),%xmm6 +movdqu 48(%esi),%xmm7 +pxor %xmm4, %xmm0 +pxor %xmm5, %xmm1 +pxor %xmm6, %xmm2 +pxor %xmm7, %xmm3 +addl $64,%esi +.Lchacha_blocks_sse2_noinput3: +movdqu %xmm0,0(%edx) +movdqu %xmm1,16(%edx) +movdqu %xmm2,32(%edx) +movdqu %xmm3,48(%edx) +movl 80(%esp),%eax +movl 48(%esp),%ecx +movl 4+48(%esp),%ebx +addl $1,%ecx +adcl $0,%ebx +movl %ecx,48(%esp) +movl %ebx,4+48(%esp) +cmpl $64,%eax +ja .Lchacha_blocks_sse2_bytesatleast65 +jae .Lchacha_blocks_sse2_bytesatleast64 +movl %edx,%esi +movl 92(%esp),%edi +movl %eax,%ecx +rep movsb +.Lchacha_blocks_sse2_bytesatleast64: +.Lchacha_blocks_sse2_done: +movl 84(%esp),%eax +movdqa 48(%esp),%xmm0 +C movdqu %xmm0,32(%eax) + movdqu %xmm0,48(%eax) +movl 64(%esp),%eax +C movl 68(%esp),%ebx +C movl 72(%esp),%esi +C movl 76(%esp),%edi +.cfi_remember_state +movl %ebp, %esp +.cfi_def_cfa_register %esp + popl %edi +.cfi_adjust_cfa_offset -4 +.cfi_restore %edi + popl %esi +.cfi_adjust_cfa_offset -4 +.cfi_restore %esi + popl %ebx +.cfi_adjust_cfa_offset -4 +.cfi_restore %ebx +popl %ebp +.cfi_adjust_cfa_offset -4 +.cfi_restore %ebp +ret +.cfi_restore_state +.Lchacha_blocks_sse2_bytesatleast65: +subl $64,%eax +addl $64,%edx +jmp .Lchacha_blocks_sse2_bytesbetween1and255 +.cfi_endproc +EPILOGUE(_nettle_chacha_blocks) + + +hchacha_sse2_local: +PROLOGUE(_nettle_hchacha) +.cfi_startproc +LOAD_VAR_PIC chacha_constants, %eax +movdqa 0(%eax), %xmm0 +movl 4(%esp), %eax +movl 8(%esp), %edx +movdqu 0(%eax), %xmm1 +movdqu 16(%eax), %xmm2 +movdqu 0(%edx), %xmm3 +movl 12(%esp), %edx +movl 16(%esp), %ecx +.Lhchacha_sse2_mainloop: +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +pshuflw $0xb1,%xmm3,%xmm3 +pshufhw $0xb1,%xmm3,%xmm3 +paddd %xmm3, %xmm2 +pxor %xmm2, %xmm1 +movdqa %xmm1,%xmm4 +pslld $12, %xmm1 +psrld $20, %xmm4 +pxor %xmm4, %xmm1 +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +movdqa %xmm3,%xmm4 +pslld $8, %xmm3 +psrld $24, %xmm4 +pshufd $0x93,%xmm0,%xmm0 +pxor %xmm4, %xmm3 +paddd %xmm3, %xmm2 +pshufd $0x4e,%xmm3,%xmm3 +pxor %xmm2, %xmm1 +pshufd $0x39,%xmm2,%xmm2 +movdqa %xmm1,%xmm4 +pslld $7, %xmm1 +psrld $25, %xmm4 +pxor %xmm4, %xmm1 +subl $2, %ecx +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +pshuflw $0xb1,%xmm3,%xmm3 +pshufhw $0xb1,%xmm3,%xmm3 +paddd %xmm3, %xmm2 +pxor %xmm2, %xmm1 +movdqa %xmm1,%xmm4 +pslld $12, %xmm1 +psrld $20, %xmm4 +pxor %xmm4, %xmm1 +paddd 
%xmm1, %xmm0 +pxor %xmm0, %xmm3 +movdqa %xmm3,%xmm4 +pslld $8, %xmm3 +psrld $24, %xmm4 +pshufd $0x39,%xmm0,%xmm0 +pxor %xmm4, %xmm3 +paddd %xmm3, %xmm2 +pshufd $0x4e,%xmm3,%xmm3 +pxor %xmm2, %xmm1 +pshufd $0x93,%xmm2,%xmm2 +movdqa %xmm1,%xmm4 +pslld $7, %xmm1 +psrld $25, %xmm4 +pxor %xmm4, %xmm1 +ja .Lhchacha_sse2_mainloop +movdqu %xmm0, 0(%edx) +movdqu %xmm3, 16(%edx) +ret +.cfi_endproc +EPILOGUE(_nettle_hchacha) + +PROLOGUE(_nettle_chacha) +.cfi_startproc +pushl %ebp +.cfi_adjust_cfa_offset 4 +.cfi_rel_offset %ebp, 0 +pushl %ebx +.cfi_adjust_cfa_offset 4 +.cfi_rel_offset %ebx, 0 +movl %esp, %ebp +.cfi_def_cfa_register %ebp +subl $64, %esp +andl $~63, %esp +movl %esp, %ebx +movl 12(%ebp), %ecx +xorl %edx, %edx +movdqu 0(%ecx), %xmm0 +movdqu 16(%ecx), %xmm1 +movdqa %xmm0, 0(%ebx) +movdqa %xmm1, 16(%ebx) +movl 16(%ebp), %ecx +movl %edx, 32(%ebx) +movl %edx, 36(%ebx) +movl 0(%ecx), %eax +movl 4(%ecx), %edx +movl %eax, 40(%ebx) +movl %edx, 44(%ebx) +movl 32(%ebp), %eax +movl %eax, 48(%ebx) +pushl 28(%ebp) +pushl 24(%ebp) +pushl 20(%ebp) +pushl %ebx +call chacha_blocks_sse2_local +pxor %xmm0, %xmm0 +movdqa %xmm0, 0(%ebx) +movdqa %xmm0, 16(%ebx) +movdqa %xmm0, 32(%ebx) +movl %ebp, %esp +.cfi_def_cfa_register %esp +popl %ebx +.cfi_adjust_cfa_offset -4 +.cfi_restore %ebx +popl %ebp +.cfi_adjust_cfa_offset -4 +.cfi_restore %ebp +ret +.cfi_endproc +EPILOGUE(_nettle_chacha) + +PROLOGUE(_nettle_xchacha) +.cfi_startproc +pushl %ebp +.cfi_adjust_cfa_offset 4 +.cfi_rel_offset %ebp, 0 +pushl %ebx +.cfi_adjust_cfa_offset 4 +.cfi_rel_offset %ebx, 0 +movl %esp, %ebp +.cfi_def_cfa_register %ebp +subl $64, %esp +andl $~63, %esp +movl %esp, %ebx +pushl 32(%ebp) +pushl %ebx +pushl 16(%ebp) +pushl 12(%ebp) +call hchacha_sse2_local +xorl %edx, %edx +movl 16(%ebp), %ecx +movl 32(%ebx), %edx +movl 36(%ebx), %edx +movl 16(%ecx), %eax +movl %eax, 40(%ebx) +movl 20(%ecx), %eax +movl %eax, 44(%ebx) +C movl 32(%ebp), %eax +C movl %eax, 48(%ebx) + pushl 32(%ebp) +pushl 28(%ebp) +pushl 24(%ebp) +pushl 20(%ebp) +pushl %ebx +call chacha_blocks_sse2_local +pxor %xmm0, %xmm0 +movdqa %xmm0, 0(%ebx) +movdqa %xmm0, 16(%ebx) +movdqa %xmm0, 32(%ebx) +movl %ebp, %esp +.cfi_def_cfa_register %esp +popl %ebx +.cfi_adjust_cfa_offset -4 +.cfi_restore %ebx +popl %ebp +.cfi_adjust_cfa_offset -4 +.cfi_restore %ebp +ret +.cfi_endproc +EPILOGUE(_nettle_xchacha) + +include_src(<x86/chacha_constants_x86.inc>) + -- 2.11.0
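For readers who have not looked inside nettle's fat machinery before: DECLARE_FAT_FUNC/DEFINE_FAT_FUNC from fat-setup.h essentially expand to a function pointer that a CONSTRUCTOR-marked init routine repoints once, after probing the CPU. A stripped-down sketch of the pattern fat-x86.c above relies on (illustrative names only, not nettle code; the real macros also support an ifunc-based variant):

#include <stdint.h>

/* two interchangeable implementations of the same operation */
static uint32_t add_c (uint32_t a, uint32_t b)    { return a + b; }
static uint32_t add_sse2 (uint32_t a, uint32_t b) { return a + b; /* would use SSE2 */ }

/* the dispatch pointer; fat_init() repoints it once at load time */
static uint32_t (*add_vec) (uint32_t, uint32_t) = add_c;

__attribute__((constructor))
static void
fat_init (void)
{
#if defined(__GNUC__)
  if (__builtin_cpu_supports ("sse2"))
    add_vec = add_sse2;
#endif
}

/* public entry point: one indirect call, no per-call CPU probing */
uint32_t
add (uint32_t a, uint32_t b)
{
  return add_vec (a, b);
}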