From: Ondrej Mosnacek <omosna...@gmail.com>

This patch adds optimized implementations of AEGIS-128, AEGIS-128L,
and AEGIS-256, utilizing the AES-NI and SSE2 x86 extensions.

Signed-off-by: Ondrej Mosnacek <omosna...@gmail.com>
---
 arch/x86/crypto/Makefile               |   8 +
 arch/x86/crypto/aegis128-aesni-asm.S   | 749 ++++++++++++++++++++++
 arch/x86/crypto/aegis128-aesni-glue.c  | 407 ++++++++++++
 arch/x86/crypto/aegis128l-aesni-asm.S  | 825 +++++++++++++++++++++++++
 arch/x86/crypto/aegis128l-aesni-glue.c | 407 ++++++++++++
 arch/x86/crypto/aegis256-aesni-asm.S   | 702 +++++++++++++++++++++
 arch/x86/crypto/aegis256-aesni-glue.c  | 407 ++++++++++++
 crypto/Kconfig                         |  24 +
 8 files changed, 3529 insertions(+)
 create mode 100644 arch/x86/crypto/aegis128-aesni-asm.S
 create mode 100644 arch/x86/crypto/aegis128-aesni-glue.c
 create mode 100644 arch/x86/crypto/aegis128l-aesni-asm.S
 create mode 100644 arch/x86/crypto/aegis128l-aesni-glue.c
 create mode 100644 arch/x86/crypto/aegis256-aesni-asm.S
 create mode 100644 arch/x86/crypto/aegis256-aesni-glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 5f07333bb224..c183553a4bd6 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -38,6 +38,10 @@ obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
 obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
 
+obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o
+obj-$(CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2) += aegis128l-aesni.o
+obj-$(CONFIG_CRYPTO_AEGIS256_AESNI_SSE2) += aegis256-aesni.o
+
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
        obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
@@ -72,6 +76,10 @@ salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
 chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
 
+aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
+aegis128l-aesni-y := aegis128l-aesni-asm.o aegis128l-aesni-glue.o
+aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
+
 ifeq ($(avx_supported),yes)
        camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
                                        camellia_aesni_avx_glue.o
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
new file mode 100644
index 000000000000..9254e0b6cc06
--- /dev/null
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -0,0 +1,749 @@
+/*
+ * AES-NI + SSE2 implementation of AEGIS-128
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosna...@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+/*
+ * SSE register roles. STATE0-STATE4 hold the five AEGIS-128 state words.
+ * KEY and MSG intentionally alias the same register (%xmm5): KEY is only
+ * used by crypto_aegis128_aesni_init, MSG by all other routines.
+ * T0 and T1 are scratch registers.
+ */
+#define STATE0 %xmm0
+#define STATE1 %xmm1
+#define STATE2 %xmm2
+#define STATE3 %xmm3
+#define STATE4 %xmm4
+#define KEY    %xmm5
+#define MSG    %xmm5
+#define T0     %xmm6
+#define T1     %xmm7
+
+/* Argument registers of the (state, length, src, dst) entry points. */
+#define STATEP %rdi
+#define LEN    %rsi
+#define SRC    %rdx
+#define DST    %rcx
+
+/*
+ * Initialization constants: 32 consecutive Fibonacci numbers reduced
+ * modulo 256, split into two 16-byte blocks.
+ */
+.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
+.align 16
+.Laegis128_const_0:
+       .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+       .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Laegis128_const_1:
+       .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+       .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+/*
+ * Byte indices 0..15; compared against the tail length in
+ * crypto_aegis128_aesni_dec_tail to build a per-byte mask.
+ */
+.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
+.align 16
+.Laegis128_counter:
+       .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+       .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+
+.text
+
+/*
+ * aegis128_update: one state-update round, without the message XOR
+ * input:
+ *   STATE[0-4] - input state
+ * output:
+ *   STATE[0-4] - output state (shifted positions)
+ * changed:
+ *   T0
+ *
+ * Every register goes through one AES round (aesenc) and is XORed with the
+ * pre-update value of the next register; STATE3 uses the copy of the old
+ * STATE4 saved in T0. The result is therefore rotated by one register:
+ * the callers XOR the message block into STATE4 right after the macro and
+ * shift the register roles by one for the following block.
+ */
+.macro aegis128_update
+       movdqa STATE4, T0
+       aesenc STATE0, STATE4
+       aesenc STATE1, STATE0
+       aesenc STATE2, STATE1
+       aesenc STATE3, STATE2
+       aesenc T0,     STATE3
+.endm
+
+/*
+ * __load_partial: internal ABI
+ *
+ * Loads the LEN & 0xF trailing bytes found at SRC into MSG, zero-padded in
+ * the upper bytes, without reading past the end of the data. The pieces are
+ * gathered from the end towards the start (1, 2, 4, then 8 bytes), each
+ * located at SRC plus LEN with the lower size bits masked off.
+ *
+ * input:
+ *   LEN - bytes
+ *   SRC - src
+ * output:
+ *   MSG  - message block
+ * changed:
+ *   T0
+ *   %r8
+ *   %r9
+ *   flags
+ */
+__load_partial:
+       xor %r9, %r9
+       pxor MSG, MSG
+
+       /* bit 0 of LEN set: fetch the single last byte */
+       mov LEN, %r8
+       and $0x1, %r8
+       jz .Lld_partial_1
+
+       mov LEN, %r8
+       and $0x1E, %r8
+       add SRC, %r8
+       mov (%r8), %r9b
+
+.Lld_partial_1:
+       /* bit 1 of LEN set: make room and fetch a 2-byte piece below it */
+       mov LEN, %r8
+       and $0x2, %r8
+       jz .Lld_partial_2
+
+       mov LEN, %r8
+       and $0x1C, %r8
+       add SRC, %r8
+       shl $0x10, %r9
+       mov (%r8), %r9w
+
+.Lld_partial_2:
+       /* bit 2 of LEN set: make room and fetch a 4-byte piece below it */
+       mov LEN, %r8
+       and $0x4, %r8
+       jz .Lld_partial_4
+
+       mov LEN, %r8
+       and $0x18, %r8
+       add SRC, %r8
+       shl $32, %r9
+       mov (%r8), %r8d
+       xor %r8, %r9
+
+.Lld_partial_4:
+       /* up to 7 bytes collected so far go into the low half of MSG */
+       movq %r9, MSG
+
+       /* bit 3 of LEN set: move them to the high half, load 8 bytes below */
+       mov LEN, %r8
+       and $0x8, %r8
+       jz .Lld_partial_8
+
+       mov LEN, %r8
+       and $0x10, %r8
+       add SRC, %r8
+       pslldq $8, MSG
+       movq (%r8), T0
+       pxor T0, MSG
+
+.Lld_partial_8:
+       ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ *
+ * Writes the low LEN bytes of T0 to DST in pieces of 8, 4, 2 and 1 bytes,
+ * so that nothing beyond DST + LEN is written (for LEN below 16).
+ * NOTE(review): LEN is compared as a full 64-bit value with signed jumps;
+ * callers are expected to pass a small, zero-extended length.
+ *
+ * input:
+ *   LEN - bytes
+ *   DST - dst
+ *   T0   - message block
+ * changed:
+ *   T0
+ *   %r8
+ *   %r9
+ *   %r10
+ *   flags
+ */
+__store_partial:
+       mov LEN, %r8
+       mov DST, %r9
+
+       /* %r10 always holds the not-yet-written bytes, lowest byte first */
+       movq T0, %r10
+
+       cmp $8, %r8
+       jl .Lst_partial_8
+
+       /* store the low 8 bytes, continue with the high half of T0 */
+       mov %r10, (%r9)
+       psrldq $8, T0
+       movq T0, %r10
+
+       sub $8, %r8
+       add $8, %r9
+
+.Lst_partial_8:
+       cmp $4, %r8
+       jl .Lst_partial_4
+
+       mov %r10d, (%r9)
+       shr $32, %r10
+
+       sub $4, %r8
+       add $4, %r9
+
+.Lst_partial_4:
+       cmp $2, %r8
+       jl .Lst_partial_2
+
+       mov %r10w, (%r9)
+       shr $0x10, %r10
+
+       sub $2, %r8
+       add $2, %r9
+
+.Lst_partial_2:
+       cmp $1, %r8
+       jl .Lst_partial_1
+
+       mov %r10b, (%r9)
+
+.Lst_partial_1:
+       ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_aegis128_aesni_init(void *state, const void *key,
+ *                                 const void *iv);
+ *
+ * Builds the initial state from the 16-byte key and IV, runs ten update
+ * rounds and writes the five state blocks to 'state'.
+ * The key is read with movdqa and must therefore be 16-byte aligned; the
+ * IV and the state buffer are accessed with unaligned moves.
+ */
+ENTRY(crypto_aegis128_aesni_init)
+       FRAME_BEGIN
+
+       /* load IV: */
+       movdqu (%rdx), T1
+
+       /* load key: */
+       movdqa (%rsi), KEY
+       pxor KEY, T1            /* T1 = KEY xor IV from here on */
+       movdqa T1, STATE0
+       movdqa KEY, STATE3
+       movdqa KEY, STATE4
+
+       /* load the constants: */
+       movdqa .Laegis128_const_0, STATE2
+       movdqa .Laegis128_const_1, STATE1
+       pxor STATE2, STATE3
+       pxor STATE1, STATE4
+
+       /*
+        * update 10 times with KEY / KEY xor IV; the XOR target moves back
+        * by one register per round because aegis128_update leaves the
+        * state rotated by one register:
+        */
+       aegis128_update; pxor KEY, STATE4
+       aegis128_update; pxor T1,  STATE3
+       aegis128_update; pxor KEY, STATE2
+       aegis128_update; pxor T1,  STATE1
+       aegis128_update; pxor KEY, STATE0
+       aegis128_update; pxor T1,  STATE4
+       aegis128_update; pxor KEY, STATE3
+       aegis128_update; pxor T1,  STATE2
+       aegis128_update; pxor KEY, STATE1
+       aegis128_update; pxor T1,  STATE0
+
+       /* store the state (ten rounds = two full rotations, so in order): */
+       movdqu STATE0, 0x00(STATEP)
+       movdqu STATE1, 0x10(STATEP)
+       movdqu STATE2, 0x20(STATEP)
+       movdqu STATE3, 0x30(STATEP)
+       movdqu STATE4, 0x40(STATEP)
+
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis128_aesni_init)
+
+/*
+ * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
+ *                               const void *data);
+ *
+ * Absorbs every complete 16-byte block of 'data' into the state; a
+ * trailing partial block is ignored (the caller pads it). If length is
+ * below 16 the state is left untouched.
+ *
+ * The main loop is unrolled five times so that the register rotation done
+ * by aegis128_update is tracked statically. There is one copy for 16-byte
+ * aligned input (movdqa) and one for unaligned input (movdqu). The
+ * .Lad_out_N exits write the registers back in the order that matches the
+ * number of rounds executed modulo 5.
+ */
+ENTRY(crypto_aegis128_aesni_ad)
+       FRAME_BEGIN
+
+       /*
+        * 'length' is a 32-bit argument, so bits 63:32 of %rsi are not
+        * guaranteed to be zero; zero-extend before LEN is used as 64-bit.
+        */
+       mov %esi, %esi
+
+       cmp $0x10, LEN
+       jb .Lad_out
+
+       /* load the state: */
+       movdqu 0x00(STATEP), STATE0
+       movdqu 0x10(STATEP), STATE1
+       movdqu 0x20(STATEP), STATE2
+       movdqu 0x30(STATEP), STATE3
+       movdqu 0x40(STATEP), STATE4
+
+       /* choose the aligned or the unaligned loop: */
+       mov SRC, %r8
+       and $0xF, %r8
+       jnz .Lad_u_loop
+
+.align 8
+.Lad_a_loop:
+       movdqa 0x00(SRC), MSG
+       aegis128_update
+       pxor MSG, STATE4
+       sub $0x10, LEN
+       cmp $0x10, LEN
+       jl .Lad_out_1
+
+       movdqa 0x10(SRC), MSG
+       aegis128_update
+       pxor MSG, STATE3
+       sub $0x10, LEN
+       cmp $0x10, LEN
+       jl .Lad_out_2
+
+       movdqa 0x20(SRC), MSG
+       aegis128_update
+       pxor MSG, STATE2
+       sub $0x10, LEN
+       cmp $0x10, LEN
+       jl .Lad_out_3
+
+       movdqa 0x30(SRC), MSG
+       aegis128_update
+       pxor MSG, STATE1
+       sub $0x10, LEN
+       cmp $0x10, LEN
+       jl .Lad_out_4
+
+       movdqa 0x40(SRC), MSG
+       aegis128_update
+       pxor MSG, STATE0
+       sub $0x10, LEN
+       cmp $0x10, LEN
+       jl .Lad_out_0
+
+       add $0x50, SRC
+       jmp .Lad_a_loop
+
+.align 8
+.Lad_u_loop:
+       movdqu 0x00(SRC), MSG
+       aegis128_update
+       pxor MSG, STATE4
+       sub $0x10, LEN
+       cmp $0x10, LEN
+       jl .Lad_out_1
+
+       movdqu 0x10(SRC), MSG
+       aegis128_update
+       pxor MSG, STATE3
+       sub $0x10, LEN
+       cmp $0x10, LEN
+       jl .Lad_out_2
+
+       movdqu 0x20(SRC), MSG
+       aegis128_update
+       pxor MSG, STATE2
+       sub $0x10, LEN
+       cmp $0x10, LEN
+       jl .Lad_out_3
+
+       movdqu 0x30(SRC), MSG
+       aegis128_update
+       pxor MSG, STATE1
+       sub $0x10, LEN
+       cmp $0x10, LEN
+       jl .Lad_out_4
+
+       movdqu 0x40(SRC), MSG
+       aegis128_update
+       pxor MSG, STATE0
+       sub $0x10, LEN
+       cmp $0x10, LEN
+       jl .Lad_out_0
+
+       add $0x50, SRC
+       jmp .Lad_u_loop
+
+       /* store the state (N = rounds executed modulo 5): */
+.Lad_out_0:
+       movdqu STATE0, 0x00(STATEP)
+       movdqu STATE1, 0x10(STATEP)
+       movdqu STATE2, 0x20(STATEP)
+       movdqu STATE3, 0x30(STATEP)
+       movdqu STATE4, 0x40(STATEP)
+       FRAME_END
+       ret
+
+.Lad_out_1:
+       movdqu STATE4, 0x00(STATEP)
+       movdqu STATE0, 0x10(STATEP)
+       movdqu STATE1, 0x20(STATEP)
+       movdqu STATE2, 0x30(STATEP)
+       movdqu STATE3, 0x40(STATEP)
+       FRAME_END
+       ret
+
+.Lad_out_2:
+       movdqu STATE3, 0x00(STATEP)
+       movdqu STATE4, 0x10(STATEP)
+       movdqu STATE0, 0x20(STATEP)
+       movdqu STATE1, 0x30(STATEP)
+       movdqu STATE2, 0x40(STATEP)
+       FRAME_END
+       ret
+
+.Lad_out_3:
+       movdqu STATE2, 0x00(STATEP)
+       movdqu STATE3, 0x10(STATEP)
+       movdqu STATE4, 0x20(STATEP)
+       movdqu STATE0, 0x30(STATEP)
+       movdqu STATE1, 0x40(STATEP)
+       FRAME_END
+       ret
+
+.Lad_out_4:
+       movdqu STATE1, 0x00(STATEP)
+       movdqu STATE2, 0x10(STATEP)
+       movdqu STATE3, 0x20(STATEP)
+       movdqu STATE4, 0x30(STATEP)
+       movdqu STATE0, 0x40(STATEP)
+       FRAME_END
+       ret
+
+       /* less than one block: nothing absorbed, state not written */
+.Lad_out:
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis128_aesni_ad)
+
+/*
+ * encrypt_block: encrypt the 16-byte block number \i of the current
+ * 5-block group and absorb its plaintext into the state.
+ *
+ *   a       - 'a' for aligned, 'u' for unaligned loads/stores (movdq\a)
+ *   s0..s4  - the registers currently holding state words 0..4
+ *   i       - block index within the group (0..4)
+ *
+ * ciphertext = plaintext xor s1 xor s4 xor (s2 and s3)
+ *
+ * Leaves through .Lenc_out_\i when fewer than 16 bytes remain afterwards.
+ * Changes MSG, T0, T1, LEN and flags.
+ */
+.macro encrypt_block a s0 s1 s2 s3 s4 i
+       movdq\a (\i * 0x10)(SRC), MSG
+       movdqa MSG, T0
+       pxor \s1, T0
+       pxor \s4, T0
+       movdqa \s2, T1
+       pand \s3, T1
+       pxor T1, T0
+       movdq\a T0, (\i * 0x10)(DST)
+
+       /* the update rotates the state, so \s4 now holds the new word 0 */
+       aegis128_update
+       pxor MSG, \s4
+
+       sub $0x10, LEN
+       cmp $0x10, LEN
+       jl .Lenc_out_\i
+.endm
+
+/*
+ * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
+ *                                const void *src, void *dst);
+ *
+ * Encrypts every complete 16-byte block of 'src' into 'dst' and updates
+ * the state; a trailing partial block is left for
+ * crypto_aegis128_aesni_enc_tail. If length is below 16 nothing is done.
+ *
+ * The aligned loop is used only when both src and dst are 16-byte aligned.
+ * .Lenc_out_N is taken after block N of a 5-block group and writes the
+ * rotated registers back in state order.
+ */
+ENTRY(crypto_aegis128_aesni_enc)
+       FRAME_BEGIN
+
+       /*
+        * 'length' is a 32-bit argument, so bits 63:32 of %rsi are not
+        * guaranteed to be zero; zero-extend before LEN is used as 64-bit.
+        */
+       mov %esi, %esi
+
+       cmp $0x10, LEN
+       jb .Lenc_out
+
+       /* load the state: */
+       movdqu 0x00(STATEP), STATE0
+       movdqu 0x10(STATEP), STATE1
+       movdqu 0x20(STATEP), STATE2
+       movdqu 0x30(STATEP), STATE3
+       movdqu 0x40(STATEP), STATE4
+
+       /* any low address bit set in src or dst -> unaligned loop: */
+       mov  SRC,  %r8
+       or   DST,  %r8
+       and $0xF, %r8
+       jnz .Lenc_u_loop
+
+.align 8
+.Lenc_a_loop:
+       encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
+       encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
+       encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
+       encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
+       encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+       add $0x50, SRC
+       add $0x50, DST
+       jmp .Lenc_a_loop
+
+.align 8
+.Lenc_u_loop:
+       encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
+       encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
+       encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
+       encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
+       encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+       add $0x50, SRC
+       add $0x50, DST
+       jmp .Lenc_u_loop
+
+       /* store the state: */
+.Lenc_out_0:
+       movdqu STATE4, 0x00(STATEP)
+       movdqu STATE0, 0x10(STATEP)
+       movdqu STATE1, 0x20(STATEP)
+       movdqu STATE2, 0x30(STATEP)
+       movdqu STATE3, 0x40(STATEP)
+       FRAME_END
+       ret
+
+.Lenc_out_1:
+       movdqu STATE3, 0x00(STATEP)
+       movdqu STATE4, 0x10(STATEP)
+       movdqu STATE0, 0x20(STATEP)
+       movdqu STATE1, 0x30(STATEP)
+       movdqu STATE2, 0x40(STATEP)
+       FRAME_END
+       ret
+
+.Lenc_out_2:
+       movdqu STATE2, 0x00(STATEP)
+       movdqu STATE3, 0x10(STATEP)
+       movdqu STATE4, 0x20(STATEP)
+       movdqu STATE0, 0x30(STATEP)
+       movdqu STATE1, 0x40(STATEP)
+       FRAME_END
+       ret
+
+.Lenc_out_3:
+       movdqu STATE1, 0x00(STATEP)
+       movdqu STATE2, 0x10(STATEP)
+       movdqu STATE3, 0x20(STATEP)
+       movdqu STATE4, 0x30(STATEP)
+       movdqu STATE0, 0x40(STATEP)
+       FRAME_END
+       ret
+
+.Lenc_out_4:
+       movdqu STATE0, 0x00(STATEP)
+       movdqu STATE1, 0x10(STATEP)
+       movdqu STATE2, 0x20(STATEP)
+       movdqu STATE3, 0x30(STATEP)
+       movdqu STATE4, 0x40(STATEP)
+       FRAME_END
+       ret
+
+       /* less than one block: nothing processed, state not written */
+.Lenc_out:
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis128_aesni_enc)
+
+/*
+ * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
+ *                                     const void *src, void *dst);
+ *
+ * Encrypts the final partial block: 'length' bytes (the glue code passes
+ * 1..15) are loaded zero-padded, encrypted, and exactly 'length'
+ * ciphertext bytes are written to 'dst'. The zero-padded plaintext is
+ * then absorbed into the state.
+ */
+ENTRY(crypto_aegis128_aesni_enc_tail)
+       FRAME_BEGIN
+
+       /*
+        * 'length' is a 32-bit argument, so bits 63:32 of %rsi are not
+        * guaranteed to be zero; __store_partial compares LEN as 64-bit.
+        */
+       mov %esi, %esi
+
+       /* load the state: */
+       movdqu 0x00(STATEP), STATE0
+       movdqu 0x10(STATEP), STATE1
+       movdqu 0x20(STATEP), STATE2
+       movdqu 0x30(STATEP), STATE3
+       movdqu 0x40(STATEP), STATE4
+
+       /* encrypt message: */
+       call __load_partial
+
+       movdqa MSG, T0
+       pxor STATE1, T0
+       pxor STATE4, T0
+       movdqa STATE2, T1
+       pand STATE3, T1
+       pxor T1, T0
+
+       call __store_partial
+
+       /* absorb the padded plaintext (MSG is not touched by the store): */
+       aegis128_update
+       pxor MSG, STATE4
+
+       /* store the state (rotated by one register after one update): */
+       movdqu STATE4, 0x00(STATEP)
+       movdqu STATE0, 0x10(STATEP)
+       movdqu STATE1, 0x20(STATEP)
+       movdqu STATE2, 0x30(STATEP)
+       movdqu STATE3, 0x40(STATEP)
+
+       FRAME_END
+       /* without this return execution would run on into the next function */
+       ret
+ENDPROC(crypto_aegis128_aesni_enc_tail)
+
+/*
+ * decrypt_block: decrypt the 16-byte block number \i of the current
+ * 5-block group and absorb the recovered plaintext into the state.
+ *
+ *   a       - 'a' for aligned, 'u' for unaligned loads/stores (movdq\a)
+ *   s0..s4  - the registers currently holding state words 0..4
+ *   i       - block index within the group (0..4)
+ *
+ * plaintext = ciphertext xor s1 xor s4 xor (s2 and s3)
+ *
+ * Leaves through .Ldec_out_\i when fewer than 16 bytes remain afterwards.
+ * Changes MSG, T0, T1, LEN and flags.
+ */
+.macro decrypt_block a s0 s1 s2 s3 s4 i
+       movdq\a (\i * 0x10)(SRC), MSG
+       pxor \s1, MSG
+       pxor \s4, MSG
+       movdqa \s2, T1
+       pand \s3, T1
+       pxor T1, MSG
+       movdq\a MSG, (\i * 0x10)(DST)
+
+       /* MSG now holds the plaintext, which is what gets absorbed */
+       aegis128_update
+       pxor MSG, \s4
+
+       sub $0x10, LEN
+       cmp $0x10, LEN
+       jl .Ldec_out_\i
+.endm
+
+/*
+ * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
+ *                                const void *src, void *dst);
+ *
+ * Decrypts every complete 16-byte block of 'src' into 'dst' and updates
+ * the state; a trailing partial block is left for
+ * crypto_aegis128_aesni_dec_tail. If length is below 16 nothing is done.
+ *
+ * The aligned loop is used only when both src and dst are 16-byte aligned.
+ * .Ldec_out_N is taken after block N of a 5-block group and writes the
+ * rotated registers back in state order.
+ */
+ENTRY(crypto_aegis128_aesni_dec)
+       FRAME_BEGIN
+
+       /*
+        * 'length' is a 32-bit argument, so bits 63:32 of %rsi are not
+        * guaranteed to be zero; zero-extend before LEN is used as 64-bit.
+        */
+       mov %esi, %esi
+
+       cmp $0x10, LEN
+       jb .Ldec_out
+
+       /* load the state: */
+       movdqu 0x00(STATEP), STATE0
+       movdqu 0x10(STATEP), STATE1
+       movdqu 0x20(STATEP), STATE2
+       movdqu 0x30(STATEP), STATE3
+       movdqu 0x40(STATEP), STATE4
+
+       /* any low address bit set in src or dst -> unaligned loop: */
+       mov  SRC, %r8
+       or   DST, %r8
+       and $0xF, %r8
+       jnz .Ldec_u_loop
+
+.align 8
+.Ldec_a_loop:
+       decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
+       decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
+       decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
+       decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
+       decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+       add $0x50, SRC
+       add $0x50, DST
+       jmp .Ldec_a_loop
+
+.align 8
+.Ldec_u_loop:
+       decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
+       decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
+       decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
+       decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
+       decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+       add $0x50, SRC
+       add $0x50, DST
+       jmp .Ldec_u_loop
+
+       /* store the state: */
+.Ldec_out_0:
+       movdqu STATE4, 0x00(STATEP)
+       movdqu STATE0, 0x10(STATEP)
+       movdqu STATE1, 0x20(STATEP)
+       movdqu STATE2, 0x30(STATEP)
+       movdqu STATE3, 0x40(STATEP)
+       FRAME_END
+       ret
+
+.Ldec_out_1:
+       movdqu STATE3, 0x00(STATEP)
+       movdqu STATE4, 0x10(STATEP)
+       movdqu STATE0, 0x20(STATEP)
+       movdqu STATE1, 0x30(STATEP)
+       movdqu STATE2, 0x40(STATEP)
+       FRAME_END
+       ret
+
+.Ldec_out_2:
+       movdqu STATE2, 0x00(STATEP)
+       movdqu STATE3, 0x10(STATEP)
+       movdqu STATE4, 0x20(STATEP)
+       movdqu STATE0, 0x30(STATEP)
+       movdqu STATE1, 0x40(STATEP)
+       FRAME_END
+       ret
+
+.Ldec_out_3:
+       movdqu STATE1, 0x00(STATEP)
+       movdqu STATE2, 0x10(STATEP)
+       movdqu STATE3, 0x20(STATEP)
+       movdqu STATE4, 0x30(STATEP)
+       movdqu STATE0, 0x40(STATEP)
+       FRAME_END
+       ret
+
+.Ldec_out_4:
+       movdqu STATE0, 0x00(STATEP)
+       movdqu STATE1, 0x10(STATEP)
+       movdqu STATE2, 0x20(STATEP)
+       movdqu STATE3, 0x30(STATEP)
+       movdqu STATE4, 0x40(STATEP)
+       FRAME_END
+       ret
+
+       /* less than one block: nothing processed, state not written */
+.Ldec_out:
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis128_aesni_dec)
+
+/*
+ * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
+ *                                     const void *src, void *dst);
+ *
+ * Decrypts the final partial block: 'length' bytes (the glue code passes
+ * 1..15) are loaded zero-padded, decrypted, and exactly 'length'
+ * plaintext bytes are written to 'dst'. The plaintext, with the bytes
+ * beyond 'length' cleared again, is then absorbed into the state.
+ */
+ENTRY(crypto_aegis128_aesni_dec_tail)
+       FRAME_BEGIN
+
+       /*
+        * 'length' is a 32-bit argument, so bits 63:32 of %rsi are not
+        * guaranteed to be zero; __store_partial compares LEN as 64-bit.
+        */
+       mov %esi, %esi
+
+       /* load the state: */
+       movdqu 0x00(STATEP), STATE0
+       movdqu 0x10(STATEP), STATE1
+       movdqu 0x20(STATEP), STATE2
+       movdqu 0x30(STATEP), STATE3
+       movdqu 0x40(STATEP), STATE4
+
+       /* decrypt message: */
+       call __load_partial
+
+       pxor STATE1, MSG
+       pxor STATE4, MSG
+       movdqa STATE2, T1
+       pand STATE3, T1
+       pxor T1, MSG
+
+       movdqa MSG, T0
+       call __store_partial
+
+       /*
+        * mask with byte count: the padding bytes of MSG now contain
+        * keystream, so clear them. The low byte of LEN is broadcast to all
+        * 16 bytes of T0 and compared with the byte indices 0..15, giving
+        * 0xff exactly for the indices below LEN.
+        */
+       movq LEN, T0
+       punpcklbw T0, T0
+       punpcklbw T0, T0
+       punpcklbw T0, T0
+       punpcklbw T0, T0
+       movdqa .Laegis128_counter, T1
+       pcmpgtb T1, T0
+       pand T0, MSG
+
+       aegis128_update
+       pxor MSG, STATE4
+
+       /* store the state (rotated by one register after one update): */
+       movdqu STATE4, 0x00(STATEP)
+       movdqu STATE0, 0x10(STATEP)
+       movdqu STATE1, 0x20(STATEP)
+       movdqu STATE2, 0x30(STATEP)
+       movdqu STATE3, 0x40(STATEP)
+
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis128_aesni_dec_tail)
+
+/*
+ * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
+ *                                  unsigned int assoclen,
+ *                                  unsigned int cryptlen);
+ *
+ * Finalizes the state and XORs the resulting 16-byte tag into *tag_xor.
+ * The length block holds the bit counts of the associated data (low
+ * 64 bits) and of the message (high 64 bits); it is XORed with state
+ * word 3 and absorbed seven times. The tag is the XOR of all five state
+ * words. The state buffer itself is not written back.
+ */
+ENTRY(crypto_aegis128_aesni_final)
+       FRAME_BEGIN
+
+       /* load the state: */
+       movdqu 0x00(STATEP), STATE0
+       movdqu 0x10(STATEP), STATE1
+       movdqu 0x20(STATEP), STATE2
+       movdqu 0x30(STATEP), STATE3
+       movdqu 0x40(STATEP), STATE4
+
+       /*
+        * prepare length block: both lengths are 32-bit arguments, so only
+        * the low halves of %rdx/%rcx are defined; movd zero-extends them.
+        */
+       movd %edx, MSG
+       movd %ecx, T0
+       pslldq $8, T0
+       pxor T0, MSG
+       psllq $3, MSG /* multiply by 8 (to get bit count) */
+
+       pxor STATE3, MSG
+
+       /* update state: */
+       aegis128_update; pxor MSG, STATE4
+       aegis128_update; pxor MSG, STATE3
+       aegis128_update; pxor MSG, STATE2
+       aegis128_update; pxor MSG, STATE1
+       aegis128_update; pxor MSG, STATE0
+       aegis128_update; pxor MSG, STATE4
+       aegis128_update; pxor MSG, STATE3
+
+       /* xor tag (all five words, so the register rotation is irrelevant): */
+       movdqu (%rsi), MSG
+
+       pxor STATE0, MSG
+       pxor STATE1, MSG
+       pxor STATE2, MSG
+       pxor STATE3, MSG
+       pxor STATE4, MSG
+
+       movdqu MSG, (%rsi)
+
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis128_aesni_final)
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
new file mode 100644
index 000000000000..5de7c0d46edf
--- /dev/null
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -0,0 +1,407 @@
+/*
+ * The AEGIS-128 Authenticated-Encryption Algorithm
+ *   Glue for AES-NI + SSE2 implementation
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosna...@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cpufeature.h>
+
+#define AEGIS128_BLOCK_ALIGN 16
+#define AEGIS128_BLOCK_SIZE 16
+#define AEGIS128_NONCE_SIZE 16
+#define AEGIS128_STATE_BLOCKS 5
+#define AEGIS128_KEY_SIZE 16
+#define AEGIS128_MIN_AUTH_SIZE 8
+#define AEGIS128_MAX_AUTH_SIZE 16
+
+/*
+ * Routines implemented in aegis128-aesni-asm.S. They use SSE registers and
+ * are called between kernel_fpu_begin() and kernel_fpu_end() in this file.
+ */
+asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv);
+
+asmlinkage void crypto_aegis128_aesni_ad(
+               void *state, unsigned int length, const void *data);
+
+asmlinkage void crypto_aegis128_aesni_enc(
+               void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_dec(
+               void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_enc_tail(
+               void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_dec_tail(
+               void *state, unsigned int length, const void *src, void *dst);
+
+/*
+ * The associated-data length comes first, then the message length; this is
+ * the order used by the assembly and by the caller in this file.
+ */
+asmlinkage void crypto_aegis128_aesni_final(
+               void *state, void *tag_xor, unsigned int assoclen,
+               unsigned int cryptlen);
+
+/* One 16-byte AEGIS block, aligned to AEGIS128_BLOCK_ALIGN. */
+struct aegis_block {
+       u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN);
+};
+
+/* The cipher state handed to the assembly routines: five blocks. */
+struct aegis_state {
+       struct aegis_block blocks[AEGIS128_STATE_BLOCKS];
+};
+
+/* Per-transform context: just the key, stored as an aligned block. */
+struct aegis_ctx {
+       struct aegis_block key;
+};
+
+/*
+ * Direction-specific operations used by the shared encrypt/decrypt path:
+ * how to start the scatterlist walk, how to process whole blocks and how
+ * to process the final partial block.
+ */
+struct aegis_crypt_ops {
+       int (*skcipher_walk_init)(struct skcipher_walk *walk,
+                                 struct aead_request *req, bool atomic);
+
+       void (*crypt_blocks)(void *state, unsigned int length, const void *src,
+                            void *dst);
+       void (*crypt_tail)(void *state, unsigned int length, const void *src,
+                          void *dst);
+};
+
+/*
+ * Feed 'assoclen' bytes of associated data from the scatterlist into the
+ * state. Bytes that do not fill a whole block at the end of a scatterlist
+ * segment are carried over in a bounce buffer; the very last partial block
+ * is zero-padded before it is absorbed.
+ */
+static void crypto_aegis128_aesni_process_ad(
+               struct aegis_state *state, struct scatterlist *sg_src,
+               unsigned int assoclen)
+{
+       struct scatter_walk walk;
+       struct aegis_block partial;
+       unsigned int buffered = 0;
+
+       scatterwalk_start(&walk, sg_src);
+       while (assoclen != 0) {
+               unsigned int size = scatterwalk_clamp(&walk, assoclen);
+               unsigned int remaining = size;
+               void *mapped = scatterwalk_map(&walk);
+               const u8 *cursor = mapped;
+
+               if (buffered + size >= AEGIS128_BLOCK_SIZE) {
+                       /* first complete and flush a carried-over block */
+                       if (buffered > 0) {
+                               unsigned int fill =
+                                       AEGIS128_BLOCK_SIZE - buffered;
+
+                               memcpy(partial.bytes + buffered, cursor, fill);
+                               crypto_aegis128_aesni_ad(state,
+                                                        AEGIS128_BLOCK_SIZE,
+                                                        partial.bytes);
+                               buffered = 0;
+                               remaining -= fill;
+                               cursor += fill;
+                       }
+
+                       /* the assembly consumes whole blocks only */
+                       crypto_aegis128_aesni_ad(state, remaining, cursor);
+
+                       cursor += remaining - remaining % AEGIS128_BLOCK_SIZE;
+                       remaining %= AEGIS128_BLOCK_SIZE;
+               }
+
+               /* keep whatever is left of this segment for the next round */
+               memcpy(partial.bytes + buffered, cursor, remaining);
+               buffered += remaining;
+               assoclen -= size;
+
+               scatterwalk_unmap(mapped);
+               scatterwalk_advance(&walk, size);
+               scatterwalk_done(&walk, 0, assoclen);
+       }
+
+       if (buffered > 0) {
+               memset(partial.bytes + buffered, 0,
+                      AEGIS128_BLOCK_SIZE - buffered);
+               crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE,
+                                        partial.bytes);
+       }
+}
+
+/*
+ * Run the message through the cipher using the direction-specific 'ops'.
+ *
+ * This is called with the FPU claimed (preemption disabled), so the walk
+ * is started in atomic mode and never sleeps.
+ *
+ * A walk step may return a byte count that is not a multiple of the block
+ * size before the end of the data. Only the whole blocks of such a step
+ * are processed; the remainder is handed back to skcipher_walk_done() so
+ * that it is presented again with the next step. The tail routine is used
+ * solely for the final chunk of fewer than AEGIS128_BLOCK_SIZE bytes.
+ */
+static void crypto_aegis128_aesni_process_crypt(
+               struct aegis_state *state, struct aead_request *req,
+               const struct aegis_crypt_ops *ops)
+{
+       struct skcipher_walk walk;
+       unsigned int full;
+
+       ops->skcipher_walk_init(&walk, req, true);
+
+       while (walk.nbytes >= AEGIS128_BLOCK_SIZE) {
+               full = walk.nbytes & ~(AEGIS128_BLOCK_SIZE - 1);
+
+               ops->crypt_blocks(state, full, walk.src.virt.addr,
+                                 walk.dst.virt.addr);
+
+               skcipher_walk_done(&walk, walk.nbytes - full);
+       }
+
+       if (walk.nbytes) {
+               ops->crypt_tail(state, walk.nbytes, walk.src.virt.addr,
+                               walk.dst.virt.addr);
+
+               skcipher_walk_done(&walk, 0);
+       }
+}
+
+/*
+ * Return the transform context, rounded up to the alignment of
+ * struct aegis_ctx.
+ */
+static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead)
+{
+       u8 *raw = crypto_aead_ctx(aead);
+
+       return (void *)PTR_ALIGN(raw, __alignof__(struct aegis_ctx));
+}
+
+/*
+ * Store the key in the transform context. Only keys of exactly
+ * AEGIS128_KEY_SIZE bytes are accepted; anything else flags the transform
+ * with CRYPTO_TFM_RES_BAD_KEY_LEN and returns -EINVAL.
+ */
+static int crypto_aegis128_aesni_setkey(struct crypto_aead *aead,
+                                       const u8 *key, unsigned int keylen)
+{
+       struct aegis_ctx *ctx;
+
+       if (keylen == AEGIS128_KEY_SIZE) {
+               ctx = crypto_aegis128_aesni_ctx(aead);
+               memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE);
+               return 0;
+       }
+
+       crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+       return -EINVAL;
+}
+
+/*
+ * Accept tag sizes from AEGIS128_MIN_AUTH_SIZE to AEGIS128_MAX_AUTH_SIZE
+ * bytes inclusive; nothing is stored here.
+ */
+static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm,
+                                               unsigned int authsize)
+{
+       if (authsize < AEGIS128_MIN_AUTH_SIZE ||
+           authsize > AEGIS128_MAX_AUTH_SIZE)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Common encrypt/decrypt path: initialize the state from key and IV,
+ * absorb the associated data, process 'cryptlen' message bytes through
+ * 'ops' and XOR the computed tag into *tag_xor. The whole sequence runs
+ * with the FPU claimed.
+ */
+static void crypto_aegis128_aesni_crypt(struct aead_request *req,
+                                       struct aegis_block *tag_xor,
+                                       unsigned int cryptlen,
+                                       const struct aegis_crypt_ops *ops)
+{
+       struct aegis_ctx *ctx =
+               crypto_aegis128_aesni_ctx(crypto_aead_reqtfm(req));
+       struct aegis_state st;
+
+       kernel_fpu_begin();
+
+       crypto_aegis128_aesni_init(&st, ctx->key.bytes, req->iv);
+       crypto_aegis128_aesni_process_ad(&st, req->src, req->assoclen);
+       crypto_aegis128_aesni_process_crypt(&st, req, ops);
+       crypto_aegis128_aesni_final(&st, tag_xor, req->assoclen, cryptlen);
+
+       kernel_fpu_end();
+}
+
+/*
+ * Encrypt the request and append the first 'authsize' bytes of the tag to
+ * the destination, right after the associated data and the ciphertext.
+ */
+static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
+{
+       static const struct aegis_crypt_ops OPS = {
+               .skcipher_walk_init = skcipher_walk_aead_encrypt,
+               .crypt_blocks = crypto_aegis128_aesni_enc,
+               .crypt_tail = crypto_aegis128_aesni_enc_tail,
+       };
+
+       unsigned int authsize = crypto_aead_authsize(crypto_aead_reqtfm(req));
+       unsigned int len = req->cryptlen;
+       /* zero-initialized, so the XOR done by the final step yields the tag */
+       struct aegis_block tag = {};
+
+       crypto_aegis128_aesni_crypt(req, &tag, len, &OPS);
+
+       scatterwalk_map_and_copy(tag.bytes, req->dst, req->assoclen + len,
+                                authsize, 1);
+       return 0;
+}
+
+/*
+ * Decrypt the request and verify its tag. The received tag is read from
+ * the source and the computed tag is XORed into it, so a valid message
+ * leaves 'authsize' zero bytes. Returns -EBADMSG on mismatch.
+ */
+static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
+{
+       static const struct aegis_block expected = {};
+
+       static const struct aegis_crypt_ops OPS = {
+               .skcipher_walk_init = skcipher_walk_aead_decrypt,
+               .crypt_blocks = crypto_aegis128_aesni_dec,
+               .crypt_tail = crypto_aegis128_aesni_dec_tail,
+       };
+
+       unsigned int authsize = crypto_aead_authsize(crypto_aead_reqtfm(req));
+       unsigned int len = req->cryptlen - authsize;
+       struct aegis_block tag;
+
+       scatterwalk_map_and_copy(tag.bytes, req->src, req->assoclen + len,
+                                authsize, 0);
+
+       crypto_aegis128_aesni_crypt(req, &tag, len, &OPS);
+
+       /* constant-time comparison of the leading authsize bytes */
+       if (crypto_memneq(tag.bytes, expected.bytes, authsize))
+               return -EBADMSG;
+
+       return 0;
+}
+
+/* The internal algorithm needs no per-transform setup; always succeeds. */
+static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead)
+{
+       return 0;
+}
+
+/* Nothing to release: init_tfm allocates nothing. */
+static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead)
+{
+}
+
+/* Forward the key to the cryptd transform stored in the wrapper context. */
+static int cryptd_aegis128_aesni_setkey(struct crypto_aead *aead,
+                                       const u8 *key, unsigned int keylen)
+{
+       struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+       return crypto_aead_setkey(&(*slot)->base, key, keylen);
+}
+
+/* Forward the tag size to the cryptd transform stored in the context. */
+static int cryptd_aegis128_aesni_setauthsize(struct crypto_aead *aead,
+                                            unsigned int authsize)
+{
+       struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+       return crypto_aead_setauthsize(&(*slot)->base, authsize);
+}
+
+/*
+ * Encrypt through the wrapper. The request goes to the cryptd transform
+ * when the FPU cannot be used right now, or when we are in atomic context
+ * while requests are already queued on cryptd; otherwise the child
+ * transform is invoked directly.
+ */
+static int cryptd_aegis128_aesni_encrypt(struct aead_request *req)
+{
+       struct cryptd_aead **slot = crypto_aead_ctx(crypto_aead_reqtfm(req));
+       struct cryptd_aead *cryptd_tfm = *slot;
+       struct crypto_aead *target;
+
+       if (!irq_fpu_usable() ||
+           (in_atomic() && cryptd_aead_queued(cryptd_tfm)))
+               target = &cryptd_tfm->base;
+       else
+               target = cryptd_aead_child(cryptd_tfm);
+
+       aead_request_set_tfm(req, target);
+
+       return crypto_aead_encrypt(req);
+}
+
+/*
+ * Decrypt through the wrapper. The request goes to the cryptd transform
+ * when the FPU cannot be used right now, or when we are in atomic context
+ * while requests are already queued on cryptd; otherwise the child
+ * transform is invoked directly.
+ */
+static int cryptd_aegis128_aesni_decrypt(struct aead_request *req)
+{
+       struct cryptd_aead **slot = crypto_aead_ctx(crypto_aead_reqtfm(req));
+       struct cryptd_aead *cryptd_tfm = *slot;
+       struct crypto_aead *target;
+
+       if (!irq_fpu_usable() ||
+           (in_atomic() && cryptd_aead_queued(cryptd_tfm)))
+               target = &cryptd_tfm->base;
+       else
+               target = cryptd_aead_child(cryptd_tfm);
+
+       aead_request_set_tfm(req, target);
+
+       return crypto_aead_decrypt(req);
+}
+
+/*
+ * Allocate the cryptd transform wrapping the internal "__aegis128-aesni"
+ * algorithm, remember it in the context and adopt its request size.
+ */
+static int cryptd_aegis128_aesni_init_tfm(struct crypto_aead *aead)
+{
+       struct cryptd_aead **slot = crypto_aead_ctx(aead);
+       struct cryptd_aead *wrapped;
+
+       wrapped = cryptd_alloc_aead("__aegis128-aesni", CRYPTO_ALG_INTERNAL,
+                                   CRYPTO_ALG_INTERNAL);
+       if (IS_ERR(wrapped))
+               return PTR_ERR(wrapped);
+
+       *slot = wrapped;
+       crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&wrapped->base));
+       return 0;
+}
+
+/* Release the cryptd transform allocated by cryptd_aegis128_aesni_init_tfm. */
+static void cryptd_aegis128_aesni_exit_tfm(struct crypto_aead *aead)
+{
+       struct cryptd_aead **slot = crypto_aead_ctx(aead);
+       struct cryptd_aead *wrapped = *slot;
+
+       cryptd_free_aead(wrapped);
+}
+
+/*
+ * Two registrations:
+ *   [0] "__aegis128" / "__aegis128-aesni": the actual implementation,
+ *       flagged CRYPTO_ALG_INTERNAL. Its context is over-allocated by the
+ *       alignment of struct aegis_ctx so that crypto_aegis128_aesni_ctx()
+ *       can align it by hand.
+ *   [1] "aegis128" / "aegis128-aesni": the asynchronous wrapper (priority
+ *       400) whose context is a single pointer to the cryptd transform.
+ */
+static struct aead_alg crypto_aegis128_aesni_alg[] = {
+       {
+               .setkey = crypto_aegis128_aesni_setkey,
+               .setauthsize = crypto_aegis128_aesni_setauthsize,
+               .encrypt = crypto_aegis128_aesni_encrypt,
+               .decrypt = crypto_aegis128_aesni_decrypt,
+               .init = crypto_aegis128_aesni_init_tfm,
+               .exit = crypto_aegis128_aesni_exit_tfm,
+
+               .ivsize = AEGIS128_NONCE_SIZE,
+               .maxauthsize = AEGIS128_MAX_AUTH_SIZE,
+               .chunksize = AEGIS128_BLOCK_SIZE,
+
+               .base = {
+                       .cra_flags = CRYPTO_ALG_INTERNAL,
+                       .cra_blocksize = 1,
+                       .cra_ctxsize = sizeof(struct aegis_ctx) +
+                               __alignof__(struct aegis_ctx),
+                       .cra_alignmask = 0,
+
+                       .cra_name = "__aegis128",
+                       .cra_driver_name = "__aegis128-aesni",
+
+                       .cra_module = THIS_MODULE,
+               }
+       }, {
+               .setkey = cryptd_aegis128_aesni_setkey,
+               .setauthsize = cryptd_aegis128_aesni_setauthsize,
+               .encrypt = cryptd_aegis128_aesni_encrypt,
+               .decrypt = cryptd_aegis128_aesni_decrypt,
+               .init = cryptd_aegis128_aesni_init_tfm,
+               .exit = cryptd_aegis128_aesni_exit_tfm,
+
+               .ivsize = AEGIS128_NONCE_SIZE,
+               .maxauthsize = AEGIS128_MAX_AUTH_SIZE,
+               .chunksize = AEGIS128_BLOCK_SIZE,
+
+               .base = {
+                       .cra_flags = CRYPTO_ALG_ASYNC,
+                       .cra_blocksize = 1,
+                       .cra_ctxsize = sizeof(struct cryptd_aead *),
+                       .cra_alignmask = 0,
+
+                       .cra_priority = 400,
+
+                       .cra_name = "aegis128",
+                       .cra_driver_name = "aegis128-aesni",
+
+                       .cra_module = THIS_MODULE,
+               }
+       }
+};
+
+/*
+ * x86_match_cpu() succeeds as soon as ANY entry of the table matches, so
+ * the table must not be used to express "AES-NI and SSE2". It only lists
+ * AES-NI (which also drives module autoloading); SSE2 is checked
+ * separately in the init function.
+ */
+static const struct x86_cpu_id aesni_cpu_id[] = {
+       X86_FEATURE_MATCH(X86_FEATURE_AES),
+       {}
+};
+MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
+
+/*
+ * Register both algorithms, but only on CPUs that provide AES-NI as well
+ * as SSE2; otherwise the assembly would fault on an undefined instruction.
+ * Returns -ENODEV when a feature is missing, else the registration result.
+ */
+static int __init crypto_aegis128_aesni_module_init(void)
+{
+       if (!x86_match_cpu(aesni_cpu_id) ||
+           !boot_cpu_has(X86_FEATURE_XMM2))
+               return -ENODEV;
+
+       return crypto_register_aeads(crypto_aegis128_aesni_alg,
+                                    ARRAY_SIZE(crypto_aegis128_aesni_alg));
+}
+
+/* Unregister both algorithms registered by the module init function. */
+static void __exit crypto_aegis128_aesni_module_exit(void)
+{
+       crypto_unregister_aeads(crypto_aegis128_aesni_alg,
+                               ARRAY_SIZE(crypto_aegis128_aesni_alg));
+}
+
+module_init(crypto_aegis128_aesni_module_init);
+module_exit(crypto_aegis128_aesni_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosna...@gmail.com>");
+MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation");
+MODULE_ALIAS_CRYPTO("aegis128");
+MODULE_ALIAS_CRYPTO("aegis128-aesni");
diff --git a/arch/x86/crypto/aegis128l-aesni-asm.S b/arch/x86/crypto/aegis128l-aesni-asm.S
new file mode 100644
index 000000000000..9263c344f2c7
--- /dev/null
+++ b/arch/x86/crypto/aegis128l-aesni-asm.S
@@ -0,0 +1,825 @@
+/*
+ * AES-NI + SSE2 implementation of AEGIS-128L
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosna...@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STATE0 %xmm0
+#define STATE1 %xmm1
+#define STATE2 %xmm2
+#define STATE3 %xmm3
+#define STATE4 %xmm4
+#define STATE5 %xmm5
+#define STATE6 %xmm6
+#define STATE7 %xmm7
+#define MSG0   %xmm8
+#define MSG1   %xmm9
+#define T0     %xmm10
+#define T1     %xmm11
+#define T2     %xmm12
+#define T3     %xmm13
+
+#define STATEP %rdi
+#define LEN    %rsi
+#define SRC    %rdx
+#define DST    %rcx
+
+/*
+ * Two 16-byte constants that crypto_aegis128l_aesni_init loads into, and XORs
+ * into, the initial state registers.
+ */
+.section .rodata.cst16.aegis128l_const, "aM", @progbits, 32
+.align 16
+.Laegis128l_const_0:
+       .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+       .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Laegis128l_const_1:
+       .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+       .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+/*
+ * Byte indices 0..15 and 16..31.  crypto_aegis128l_aesni_dec_tail compares
+ * them against the message length to build a mask of the valid bytes.
+ */
+.section .rodata.cst16.aegis128l_counter, "aM", @progbits, 16
+.align 16
+.Laegis128l_counter0:
+       .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+       .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+.Laegis128l_counter1:
+       .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+       .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+
+.text
+
+/*
+ * __load_partial: internal ABI
+ *
+ * Load a partial message (fewer than 32 bytes) into MSG0:MSG1 and pad the
+ * rest with zeros.  Only bits 0-4 of LEN are examined.  The data is gathered
+ * in power-of-two sized pieces, one per set bit of LEN, starting with the
+ * piece at the highest offset, so nothing at or beyond SRC + (LEN & 0x1F) is
+ * read.
+ *
+ * input:
+ *   LEN - bytes
+ *   SRC - src
+ * output:
+ *   MSG0 - first message block
+ *   MSG1 - second message block
+ * changed:
+ *   T0
+ *   %r8
+ *   %r9
+ *   flags
+ */
+__load_partial:
+       xor %r9, %r9
+       pxor MSG0, MSG0
+       pxor MSG1, MSG1
+
+       /* LEN bit 0: fetch the single byte at offset LEN & 0x1E */
+       mov LEN, %r8
+       and $0x1, %r8
+       jz .Lld_partial_1
+
+       mov LEN, %r8
+       and $0x1E, %r8
+       add SRC, %r8
+       mov (%r8), %r9b
+
+.Lld_partial_1:
+       /* LEN bit 1: shift what we have up and fetch 2 bytes at LEN & 0x1C */
+       mov LEN, %r8
+       and $0x2, %r8
+       jz .Lld_partial_2
+
+       mov LEN, %r8
+       and $0x1C, %r8
+       add SRC, %r8
+       shl $0x10, %r9
+       mov (%r8), %r9w
+
+.Lld_partial_2:
+       /* LEN bit 2: shift up and fetch 4 bytes at LEN & 0x18 */
+       mov LEN, %r8
+       and $0x4, %r8
+       jz .Lld_partial_4
+
+       mov LEN, %r8
+       and $0x18, %r8
+       add SRC, %r8
+       shl $32, %r9
+       mov (%r8), %r8d
+       xor %r8, %r9
+
+.Lld_partial_4:
+       /* up to 7 bytes collected so far go into the low half of MSG0 */
+       movq %r9, MSG0
+
+       /* LEN bit 3: move them to the high half, fetch 8 bytes at LEN & 0x10 */
+       mov LEN, %r8
+       and $0x8, %r8
+       jz .Lld_partial_8
+
+       mov LEN, %r8
+       and $0x10, %r8
+       add SRC, %r8
+       pslldq $8, MSG0
+       movq (%r8), T0
+       pxor T0, MSG0
+
+.Lld_partial_8:
+       /* LEN bit 4: the pieces above form block 1; block 0 is a full load */
+       mov LEN, %r8
+       and $0x10, %r8
+       jz .Lld_partial_16
+
+       movdqa MSG0, MSG1
+       movdqu (SRC), MSG0
+
+.Lld_partial_16:
+       ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ *
+ * Store the first LEN bytes of T0:T1 to DST, in pieces of 16, 8, 4, 2 and
+ * 1 bytes.  LEN and DST themselves are left untouched (working copies are
+ * kept in %r8 and %r9).
+ *
+ * input:
+ *   LEN - bytes
+ *   DST - dst
+ *   T0   - first message block
+ *   T1   - second message block
+ * changed:
+ *   T0
+ *   %r8
+ *   %r9
+ *   %r10
+ *   flags
+ */
+__store_partial:
+       mov LEN, %r8
+       mov DST, %r9
+
+       cmp $16, %r8
+       jl .Lst_partial_16
+
+       /* at least 16 bytes: write block 0 whole, continue with block 1 */
+       movdqu T0, (%r9)
+       movdqa T1, T0
+
+       sub $16, %r8
+       add $16, %r9
+
+.Lst_partial_16:
+       /* %r10 holds the next (up to) 8 bytes still to be written */
+       movq T0, %r10
+
+       cmp $8, %r8
+       jl .Lst_partial_8
+
+       mov %r10, (%r9)
+       psrldq $8, T0
+       movq T0, %r10
+
+       sub $8, %r8
+       add $8, %r9
+
+.Lst_partial_8:
+       cmp $4, %r8
+       jl .Lst_partial_4
+
+       mov %r10d, (%r9)
+       shr $32, %r10
+
+       sub $4, %r8
+       add $4, %r9
+
+.Lst_partial_4:
+       cmp $2, %r8
+       jl .Lst_partial_2
+
+       mov %r10w, (%r9)
+       shr $0x10, %r10
+
+       sub $2, %r8
+       add $2, %r9
+
+.Lst_partial_2:
+       cmp $1, %r8
+       jl .Lst_partial_1
+
+       mov %r10b, (%r9)
+
+.Lst_partial_1:
+       ret
+ENDPROC(__store_partial)
+
+/*
+ * update: one state update round without message injection.
+ *
+ * Every STATEi becomes AESRound(STATEi) ^ STATE((i + 1) % 8), where the
+ * right-hand values are the ones from before the update (the old STATE7 is
+ * kept in T0 for the last step).
+ *
+ * changed: T0
+ */
+.macro update
+       movdqa STATE7, T0
+       aesenc STATE0, STATE7
+       aesenc STATE1, STATE0
+       aesenc STATE2, STATE1
+       aesenc STATE3, STATE2
+       aesenc STATE4, STATE3
+       aesenc STATE5, STATE4
+       aesenc STATE6, STATE5
+       aesenc T0,     STATE6
+.endm
+
+/*
+ * update0 .. update7: state update round followed by message injection.
+ *
+ * Each variant runs "update" and then XORs MSG0 and MSG1 into two state
+ * registers that are four positions apart.  The target registers step down
+ * by one from one variant to the next; in the unrolled loops of this file,
+ * variant N is applied to the N-th block of every group of eight.
+ *
+ * changed: T0 (through "update")
+ */
+.macro update0
+       update
+       pxor MSG0, STATE7
+       pxor MSG1, STATE3
+.endm
+
+.macro update1
+       update
+       pxor MSG0, STATE6
+       pxor MSG1, STATE2
+.endm
+
+.macro update2
+       update
+       pxor MSG0, STATE5
+       pxor MSG1, STATE1
+.endm
+
+.macro update3
+       update
+       pxor MSG0, STATE4
+       pxor MSG1, STATE0
+.endm
+
+.macro update4
+       update
+       pxor MSG0, STATE3
+       pxor MSG1, STATE7
+.endm
+
+.macro update5
+       update
+       pxor MSG0, STATE2
+       pxor MSG1, STATE6
+.endm
+
+.macro update6
+       update
+       pxor MSG0, STATE1
+       pxor MSG1, STATE5
+.endm
+
+.macro update7
+       update
+       pxor MSG0, STATE0
+       pxor MSG1, STATE4
+.endm
+
+/*
+ * state_load: read the eight 16-byte state blocks at STATEP (offsets 0x00 to
+ * 0x70) into STATE0..STATE7.  Unaligned loads are used, so STATEP needs no
+ * particular alignment.
+ */
+.macro state_load
+       movdqu 0x00(STATEP), STATE0
+       movdqu 0x10(STATEP), STATE1
+       movdqu 0x20(STATEP), STATE2
+       movdqu 0x30(STATEP), STATE3
+       movdqu 0x40(STATEP), STATE4
+       movdqu 0x50(STATEP), STATE5
+       movdqu 0x60(STATEP), STATE6
+       movdqu 0x70(STATEP), STATE7
+.endm
+
+/*
+ * state_store: write eight registers back to the state at STATEP.
+ *
+ * The last argument (\s7) goes to offset 0x00 and \s0..\s6 follow at offsets
+ * 0x10..0x70, i.e. the argument list is stored rotated by one position.
+ */
+.macro state_store s0 s1 s2 s3 s4 s5 s6 s7
+       movdqu \s7, 0x00(STATEP)
+       movdqu \s0, 0x10(STATEP)
+       movdqu \s1, 0x20(STATEP)
+       movdqu \s2, 0x30(STATEP)
+       movdqu \s3, 0x40(STATEP)
+       movdqu \s4, 0x50(STATEP)
+       movdqu \s5, 0x60(STATEP)
+       movdqu \s6, 0x70(STATEP)
+.endm
+
+/*
+ * state_store0 .. state_store7: state_store with the register order that
+ * goes with the corresponding updateN.  Every user in this file invokes
+ * state_storeN when updateN was the last update executed.
+ */
+.macro state_store0
+       state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7
+.endm
+
+.macro state_store1
+       state_store STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6
+.endm
+
+.macro state_store2
+       state_store STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
+.endm
+
+.macro state_store3
+       state_store STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4
+.endm
+
+.macro state_store4
+       state_store STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3
+.endm
+
+.macro state_store5
+       state_store STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2
+.endm
+
+.macro state_store6
+       state_store STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1
+.endm
+
+.macro state_store7
+       state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0
+.endm
+
+/*
+ * void crypto_aegis128l_aesni_init(void *state, const void *key,
+ *                                  const void *iv);
+ *
+ * Build the initial state from the 16-byte key and IV, run ten update rounds
+ * and store the result to *state.  The key is read with an aligned load and
+ * must therefore be 16-byte aligned; the IV may be unaligned.
+ */
+ENTRY(crypto_aegis128l_aesni_init)
+       FRAME_BEGIN
+
+       /* load key: */
+       movdqa (%rsi), MSG1
+       movdqa MSG1, STATE0
+       movdqa MSG1, STATE4
+       movdqa MSG1, STATE5
+       movdqa MSG1, STATE6
+       movdqa MSG1, STATE7
+
+       /* load IV: */
+       movdqu (%rdx), MSG0
+       pxor MSG0, STATE0
+       pxor MSG0, STATE4
+
+       /* load the constants: */
+       movdqa .Laegis128l_const_0, STATE2
+       movdqa .Laegis128l_const_1, STATE1
+       movdqa STATE1, STATE3
+       pxor STATE2, STATE5
+       pxor STATE1, STATE6
+       pxor STATE2, STATE7
+
+       /* update 10 times with IV and KEY: */
+       update0
+       update1
+       update2
+       update3
+       update4
+       update5
+       update6
+       update7
+       update0
+       update1
+
+       /* the last round was update1, so use the matching store order */
+       state_store1
+
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis128l_aesni_init)
+
+/*
+ * ad_block: absorb the \i-th 32-byte block of the current group of eight.
+ *
+ *   \a - "a" for aligned or "u" for unaligned loads from SRC
+ *   \i - block index 0..7; selects the load offset, the update variant and
+ *        the exit label
+ *
+ * Subtracts 32 from LEN and leaves through .Lad_out_\i once fewer than 32
+ * bytes remain.
+ */
+.macro ad_block a i
+       movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
+       movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
+       update\i
+       sub $0x20, LEN
+       cmp $0x20, LEN
+       jl .Lad_out_\i
+.endm
+
+/*
+ * void crypto_aegis128l_aesni_ad(void *state, unsigned int length,
+ *                                const void *data);
+ *
+ * Absorb all complete 32-byte blocks of associated data into the state.
+ * A trailing remainder of fewer than 32 bytes is ignored; if length is
+ * below 32 the state is not touched at all.
+ */
+ENTRY(crypto_aegis128l_aesni_ad)
+       FRAME_BEGIN
+
+       cmp $0x20, LEN
+       jb .Lad_out
+
+       state_load
+
+       /* choose the aligned or the unaligned loop from the low bits of SRC */
+       mov  SRC, %r8
+       and $0xf, %r8
+       jnz .Lad_u_loop
+
+.align 8
+.Lad_a_loop:
+       ad_block a 0
+       ad_block a 1
+       ad_block a 2
+       ad_block a 3
+       ad_block a 4
+       ad_block a 5
+       ad_block a 6
+       ad_block a 7
+
+       /* eight blocks (0x100 bytes) consumed; the loop exits via ad_block */
+       add $0x100, SRC
+       jmp .Lad_a_loop
+
+.align 8
+.Lad_u_loop:
+       ad_block u 0
+       ad_block u 1
+       ad_block u 2
+       ad_block u 3
+       ad_block u 4
+       ad_block u 5
+       ad_block u 6
+       ad_block u 7
+
+       add $0x100, SRC
+       jmp .Lad_u_loop
+
+       /* one exit per block index, each with the matching store order */
+.Lad_out_0:
+       state_store0
+       FRAME_END
+       ret
+
+.Lad_out_1:
+       state_store1
+       FRAME_END
+       ret
+
+.Lad_out_2:
+       state_store2
+       FRAME_END
+       ret
+
+.Lad_out_3:
+       state_store3
+       FRAME_END
+       ret
+
+.Lad_out_4:
+       state_store4
+       FRAME_END
+       ret
+
+.Lad_out_5:
+       state_store5
+       FRAME_END
+       ret
+
+.Lad_out_6:
+       state_store6
+       FRAME_END
+       ret
+
+.Lad_out_7:
+       state_store7
+       FRAME_END
+       ret
+
+.Lad_out:
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis128l_aesni_ad)
+
+/*
+ * crypt: XOR 32 bytes derived from the state into the block pair m0:m1.
+ *
+ *   m0 ^= s1 ^ s6 ^ (s2 & s3)
+ *   m1 ^= s2 ^ s5 ^ (s6 & s7)
+ *
+ * s0 and s4 are accepted for symmetry but not used.
+ *
+ * changed: T3
+ */
+.macro crypt m0 m1 s0 s1 s2 s3 s4 s5 s6 s7
+       pxor \s1, \m0
+       pxor \s6, \m0
+       movdqa \s2, T3
+       pand \s3, T3
+       pxor T3, \m0
+
+       pxor \s2, \m1
+       pxor \s5, \m1
+       movdqa \s6, T3
+       pand \s7, T3
+       pxor T3, \m1
+.endm
+
+/*
+ * crypt0 .. crypt7: "crypt" with the state register list rotated by N
+ * positions.  In this file cryptN is always followed by updateN.
+ */
+.macro crypt0 m0 m1
+       crypt \m0 \m1 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7
+.endm
+
+.macro crypt1 m0 m1
+       crypt \m0 \m1 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6
+.endm
+
+.macro crypt2 m0 m1
+       crypt \m0 \m1 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
+.endm
+
+.macro crypt3 m0 m1
+       crypt \m0 \m1 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4
+.endm
+
+.macro crypt4 m0 m1
+       crypt \m0 \m1 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3
+.endm
+
+.macro crypt5 m0 m1
+       crypt \m0 \m1 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2
+.endm
+
+.macro crypt6 m0 m1
+       crypt \m0 \m1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1
+.endm
+
+.macro crypt7 m0 m1
+       crypt \m0 \m1 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0
+.endm
+
+/*
+ * encrypt_block: encrypt the \i-th 32-byte block of the current group.
+ *
+ *   \a - "a" for aligned or "u" for unaligned accesses to SRC and DST
+ *   \i - block index 0..7
+ *
+ * The input stays in MSG0/MSG1 while a copy in T0/T1 is run through crypt
+ * and written to DST, so the state update absorbs the unencrypted input.
+ * Leaves through .Lenc_out_\i once fewer than 32 bytes remain.
+ */
+.macro encrypt_block a i
+       movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
+       movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
+       movdqa MSG0, T0
+       movdqa MSG1, T1
+       crypt\i T0, T1
+       movdq\a T0, (\i * 0x20 + 0x00)(DST)
+       movdq\a T1, (\i * 0x20 + 0x10)(DST)
+
+       update\i
+
+       sub $0x20, LEN
+       cmp $0x20, LEN
+       jl .Lenc_out_\i
+.endm
+
+/*
+ * decrypt_block: decrypt the \i-th 32-byte block of the current group.
+ *
+ *   \a - "a" for aligned or "u" for unaligned accesses to SRC and DST
+ *   \i - block index 0..7
+ *
+ * crypt is applied to MSG0/MSG1 in place, so both the data written to DST
+ * and the data absorbed by the state update are the output of crypt.
+ * Leaves through .Ldec_out_\i once fewer than 32 bytes remain.
+ */
+.macro decrypt_block a i
+       movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
+       movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
+       crypt\i MSG0, MSG1
+       movdq\a MSG0, (\i * 0x20 + 0x00)(DST)
+       movdq\a MSG1, (\i * 0x20 + 0x10)(DST)
+
+       update\i
+
+       sub $0x20, LEN
+       cmp $0x20, LEN
+       jl .Ldec_out_\i
+.endm
+
+/*
+ * void crypto_aegis128l_aesni_enc(void *state, unsigned int length,
+ *                                 const void *src, void *dst);
+ *
+ * Encrypt all complete 32-byte blocks from src to dst and update the state.
+ * A trailing remainder of fewer than 32 bytes is left unprocessed; if length
+ * is below 32 nothing is done.
+ */
+ENTRY(crypto_aegis128l_aesni_enc)
+       FRAME_BEGIN
+
+       cmp $0x20, LEN
+       jb .Lenc_out
+
+       state_load
+
+       /* the aligned loop needs both SRC and DST to be 16-byte aligned */
+       mov  SRC, %r8
+       or   DST, %r8
+       and $0xf, %r8
+       jnz .Lenc_u_loop
+
+.align 8
+.Lenc_a_loop:
+       encrypt_block a 0
+       encrypt_block a 1
+       encrypt_block a 2
+       encrypt_block a 3
+       encrypt_block a 4
+       encrypt_block a 5
+       encrypt_block a 6
+       encrypt_block a 7
+
+       /* eight blocks (0x100 bytes) done; the loop exits via encrypt_block */
+       add $0x100, SRC
+       add $0x100, DST
+       jmp .Lenc_a_loop
+
+.align 8
+.Lenc_u_loop:
+       encrypt_block u 0
+       encrypt_block u 1
+       encrypt_block u 2
+       encrypt_block u 3
+       encrypt_block u 4
+       encrypt_block u 5
+       encrypt_block u 6
+       encrypt_block u 7
+
+       add $0x100, SRC
+       add $0x100, DST
+       jmp .Lenc_u_loop
+
+       /* one exit per block index, each with the matching store order */
+.Lenc_out_0:
+       state_store0
+       FRAME_END
+       ret
+
+.Lenc_out_1:
+       state_store1
+       FRAME_END
+       ret
+
+.Lenc_out_2:
+       state_store2
+       FRAME_END
+       ret
+
+.Lenc_out_3:
+       state_store3
+       FRAME_END
+       ret
+
+.Lenc_out_4:
+       state_store4
+       FRAME_END
+       ret
+
+.Lenc_out_5:
+       state_store5
+       FRAME_END
+       ret
+
+.Lenc_out_6:
+       state_store6
+       FRAME_END
+       ret
+
+.Lenc_out_7:
+       state_store7
+       FRAME_END
+       ret
+
+.Lenc_out:
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis128l_aesni_enc)
+
+/*
+ * void crypto_aegis128l_aesni_enc_tail(void *state, unsigned int length,
+ *                                      const void *src, void *dst);
+ *
+ * Encrypt a final partial block of fewer than 32 bytes: load `length` bytes
+ * zero-padded, run a copy through crypt0, store `length` bytes to dst, then
+ * absorb the zero-padded input (still in MSG0/MSG1) into the state and write
+ * the state back.
+ */
+ENTRY(crypto_aegis128l_aesni_enc_tail)
+       FRAME_BEGIN
+
+       state_load
+
+       /* encrypt message: */
+       call __load_partial
+
+       movdqa MSG0, T0
+       movdqa MSG1, T1
+       crypt0 T0, T1
+
+       call __store_partial
+
+       update0
+
+       state_store0
+
+       FRAME_END
+       /* without this, execution would run on into the following function */
+       ret
+ENDPROC(crypto_aegis128l_aesni_enc_tail)
+
+/*
+ * void crypto_aegis128l_aesni_dec(void *state, unsigned int length,
+ *                                 const void *src, void *dst);
+ *
+ * Decrypt all complete 32-byte blocks from src to dst and update the state.
+ * A trailing remainder of fewer than 32 bytes is left unprocessed; if length
+ * is below 32 nothing is done.
+ */
+ENTRY(crypto_aegis128l_aesni_dec)
+       FRAME_BEGIN
+
+       cmp $0x20, LEN
+       jb .Ldec_out
+
+       state_load
+
+       /* the aligned loop needs both SRC and DST to be 16-byte aligned */
+       mov  SRC, %r8
+       or   DST, %r8
+       and $0xF, %r8
+       jnz .Ldec_u_loop
+
+.align 8
+.Ldec_a_loop:
+       decrypt_block a 0
+       decrypt_block a 1
+       decrypt_block a 2
+       decrypt_block a 3
+       decrypt_block a 4
+       decrypt_block a 5
+       decrypt_block a 6
+       decrypt_block a 7
+
+       /* eight blocks (0x100 bytes) done; the loop exits via decrypt_block */
+       add $0x100, SRC
+       add $0x100, DST
+       jmp .Ldec_a_loop
+
+.align 8
+.Ldec_u_loop:
+       decrypt_block u 0
+       decrypt_block u 1
+       decrypt_block u 2
+       decrypt_block u 3
+       decrypt_block u 4
+       decrypt_block u 5
+       decrypt_block u 6
+       decrypt_block u 7
+
+       add $0x100, SRC
+       add $0x100, DST
+       jmp .Ldec_u_loop
+
+       /* one exit per block index, each with the matching store order */
+.Ldec_out_0:
+       state_store0
+       FRAME_END
+       ret
+
+.Ldec_out_1:
+       state_store1
+       FRAME_END
+       ret
+
+.Ldec_out_2:
+       state_store2
+       FRAME_END
+       ret
+
+.Ldec_out_3:
+       state_store3
+       FRAME_END
+       ret
+
+.Ldec_out_4:
+       state_store4
+       FRAME_END
+       ret
+
+.Ldec_out_5:
+       state_store5
+       FRAME_END
+       ret
+
+.Ldec_out_6:
+       state_store6
+       FRAME_END
+       ret
+
+.Ldec_out_7:
+       state_store7
+       FRAME_END
+       ret
+
+.Ldec_out:
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis128l_aesni_dec)
+
+/*
+ * void crypto_aegis128l_aesni_dec_tail(void *state, unsigned int length,
+ *                                      const void *src, void *dst);
+ *
+ * Decrypt a final partial block of fewer than 32 bytes, store `length`
+ * bytes to dst, and absorb the result into the state.
+ *
+ * __load_partial pads the input with zeros, so after crypt0 the bytes past
+ * `length` are no longer zero.  They are cleared again with a byte mask
+ * before the state update.
+ */
+ENTRY(crypto_aegis128l_aesni_dec_tail)
+       FRAME_BEGIN
+
+       state_load
+
+       /* decrypt message: */
+       call __load_partial
+
+       crypt0 MSG0, MSG1
+
+       /* __store_partial takes its data in T0/T1 and clobbers T0 */
+       movdqa MSG0, T0
+       movdqa MSG1, T1
+       call __store_partial
+
+       /* mask with byte count: */
+       movq LEN, T0
+       /* four interleaves replicate the low byte of LEN into all 16 bytes */
+       punpcklbw T0, T0
+       punpcklbw T0, T0
+       punpcklbw T0, T0
+       punpcklbw T0, T0
+       movdqa T0, T1
+       movdqa .Laegis128l_counter0, T2
+       movdqa .Laegis128l_counter1, T3
+       /* a mask byte becomes 0xff where LEN > byte index, 0x00 elsewhere */
+       pcmpgtb T2, T0
+       pcmpgtb T3, T1
+       pand T0, MSG0
+       pand T1, MSG1
+
+       update0
+
+       state_store0
+
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis128l_aesni_dec_tail)
+
+/*
+ * void crypto_aegis128l_aesni_final(void *state, void *tag_xor,
+ *                                   u64 assoclen, u64 cryptlen);
+ *
+ * Absorb the two lengths (converted to bit counts) in seven more update
+ * rounds and XOR the resulting 16-byte tag into *tag_xor.  assoclen and
+ * cryptlen are read as full 64-bit values from %rdx and %rcx.  The state
+ * in memory is only read, it is not written back.
+ */
+ENTRY(crypto_aegis128l_aesni_final)
+       FRAME_BEGIN
+
+       state_load
+
+       /* prepare length block: */
+       movq %rdx, MSG0
+       movq %rcx, T0
+       pslldq $8, T0
+       pxor T0, MSG0
+       psllq $3, MSG0 /* multiply by 8 (to get bit count) */
+
+       /* the same block, mixed with STATE2, is injected as MSG0 and MSG1 */
+       pxor STATE2, MSG0
+       movdqa MSG0, MSG1
+
+       /* update state: */
+       update0
+       update1
+       update2
+       update3
+       update4
+       update5
+       update6
+
+       /* xor tag: */
+       movdqu (%rsi), T0
+
+       /* every state register except STATE0 contributes to the tag */
+       pxor STATE1, T0
+       pxor STATE2, T0
+       pxor STATE3, T0
+       pxor STATE4, T0
+       pxor STATE5, T0
+       pxor STATE6, T0
+       pxor STATE7, T0
+
+       movdqu T0, (%rsi)
+
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis128l_aesni_final)
diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c
new file mode 100644
index 000000000000..876e4866e633
--- /dev/null
+++ b/arch/x86/crypto/aegis128l-aesni-glue.c
@@ -0,0 +1,407 @@
+/*
+ * The AEGIS-128L Authenticated-Encryption Algorithm
+ *   Glue for AES-NI + SSE2 implementation
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosna...@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+#define AEGIS128L_BLOCK_ALIGN 16
+#define AEGIS128L_BLOCK_SIZE 32
+#define AEGIS128L_NONCE_SIZE 16
+#define AEGIS128L_STATE_BLOCKS 8
+#define AEGIS128L_KEY_SIZE 16
+#define AEGIS128L_MIN_AUTH_SIZE 8
+#define AEGIS128L_MAX_AUTH_SIZE 16
+
+/* Entry points implemented in aegis128l-aesni-asm.S. */
+
+/* Build the initial state from key and IV. */
+asmlinkage void crypto_aegis128l_aesni_init(void *state, void *key, void *iv);
+
+/* Absorb the complete 32-byte blocks of associated data. */
+asmlinkage void crypto_aegis128l_aesni_ad(
+               void *state, unsigned int length, const void *data);
+
+/* Encrypt / decrypt the complete 32-byte blocks of src into dst. */
+asmlinkage void crypto_aegis128l_aesni_enc(
+               void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128l_aesni_dec(
+               void *state, unsigned int length, const void *src, void *dst);
+
+/* Encrypt / decrypt a final partial block of fewer than 32 bytes. */
+asmlinkage void crypto_aegis128l_aesni_enc_tail(
+               void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128l_aesni_dec_tail(
+               void *state, unsigned int length, const void *src, void *dst);
+
+/*
+ * XOR the authentication tag into *tag_xor.  The assembly reads assoclen and
+ * cryptlen, in this order, as full 64-bit registers, so they are declared as
+ * u64: the compiler then zero-extends narrower arguments instead of leaving
+ * the upper register halves unspecified.
+ */
+asmlinkage void crypto_aegis128l_aesni_final(
+               void *state, void *tag_xor, u64 assoclen, u64 cryptlen);
+
+/* A buffer of AEGIS128L_BLOCK_SIZE bytes, aligned to AEGIS128L_BLOCK_ALIGN. */
+struct aegis_block {
+       u8 bytes[AEGIS128L_BLOCK_SIZE] __aligned(AEGIS128L_BLOCK_ALIGN);
+};
+
+/*
+ * Cipher state handed to the assembly routines as an opaque pointer.  The
+ * assembly loads and stores eight 16-byte blocks at offsets 0x00..0x70 of it.
+ */
+struct aegis_state {
+       struct aegis_block blocks[AEGIS128L_STATE_BLOCKS];
+};
+
+/*
+ * Per-transform context: the key, kept in an aligned block.  setkey copies
+ * AEGIS128L_KEY_SIZE bytes into it.
+ */
+struct aegis_ctx {
+       struct aegis_block key;
+};
+
+/*
+ * Direction-specific operations, so that encryption and decryption can share
+ * one driver routine.
+ */
+struct aegis_crypt_ops {
+       /* starts the scatterlist walk over the request's text */
+       int (*skcipher_walk_init)(struct skcipher_walk *walk,
+                                 struct aead_request *req, bool atomic);
+
+       /* processes the complete blocks contained in `length` bytes */
+       void (*crypt_blocks)(void *state, unsigned int length, const void *src,
+                            void *dst);
+       /* processes a final partial block of `length` bytes */
+       void (*crypt_tail)(void *state, unsigned int length, const void *src,
+                          void *dst);
+};
+
+/*
+ * Feed `assoclen` bytes of associated data from the scatterlist `sg_src`
+ * into the state.
+ *
+ * The assembly routine only consumes complete blocks, so bytes left over at
+ * the end of one scatterlist segment are collected in `buf` (`pos` is the
+ * fill level) and completed with the start of the next segment.  Whatever
+ * is left at the very end is zero-padded to a full block.
+ */
+static void crypto_aegis128l_aesni_process_ad(
+               struct aegis_state *state, struct scatterlist *sg_src,
+               unsigned int assoclen)
+{
+       struct scatter_walk walk;
+       struct aegis_block buf;
+       unsigned int pos = 0;
+
+       scatterwalk_start(&walk, sg_src);
+       while (assoclen != 0) {
+               unsigned int size = scatterwalk_clamp(&walk, assoclen);
+               unsigned int left = size;
+               void *mapped = scatterwalk_map(&walk);
+               const u8 *src = (const u8 *)mapped;
+
+               if (pos + size >= AEGIS128L_BLOCK_SIZE) {
+                       if (pos > 0) {
+                               /* complete and flush the buffered block */
+                               unsigned int fill = AEGIS128L_BLOCK_SIZE - pos;
+                               memcpy(buf.bytes + pos, src, fill);
+                               crypto_aegis128l_aesni_ad(state,
+                                                         AEGIS128L_BLOCK_SIZE,
+                                                         buf.bytes);
+                               pos = 0;
+                               left -= fill;
+                               src += fill;
+                       }
+
+                       /* consumes the complete blocks and skips the rest */
+                       crypto_aegis128l_aesni_ad(state, left, src);
+
+                       src += left & ~(AEGIS128L_BLOCK_SIZE - 1);
+                       left &= AEGIS128L_BLOCK_SIZE - 1;
+               }
+
+               /* keep the remainder for the next segment */
+               memcpy(buf.bytes + pos, src, left);
+               pos += left;
+               assoclen -= size;
+
+               scatterwalk_unmap(mapped);
+               scatterwalk_advance(&walk, size);
+               scatterwalk_done(&walk, 0, assoclen);
+       }
+
+       if (pos > 0) {
+               memset(buf.bytes + pos, 0, AEGIS128L_BLOCK_SIZE - pos);
+               crypto_aegis128l_aesni_ad(state, AEGIS128L_BLOCK_SIZE, buf.bytes);
+       }
+}
+
+/*
+ * Encrypt or decrypt the text of `req` with the routines in `ops`.
+ *
+ * A walk step may have any length of at least one block.  Only its complete
+ * blocks are processed; the remainder is handed back to the walk through
+ * skcipher_walk_done() so that it is presented again together with the data
+ * that follows.  The tail routine pads and finishes a partial block, so it
+ * must run at most once, on the very last bytes of the request.
+ *
+ * The caller holds the FPU (preemption is disabled), hence the walk is
+ * started in atomic mode so that it never sleeps.
+ */
+static void crypto_aegis128l_aesni_process_crypt(
+               struct aegis_state *state, struct aead_request *req,
+               const struct aegis_crypt_ops *ops)
+{
+       struct skcipher_walk walk;
+       unsigned int tail;
+
+       ops->skcipher_walk_init(&walk, req, true);
+
+       while (walk.nbytes >= AEGIS128L_BLOCK_SIZE) {
+               tail = walk.nbytes & (AEGIS128L_BLOCK_SIZE - 1);
+
+               ops->crypt_blocks(state, walk.nbytes - tail,
+                                 walk.src.virt.addr, walk.dst.virt.addr);
+
+               /* report the unprocessed remainder back to the walk */
+               skcipher_walk_done(&walk, tail);
+       }
+
+       if (walk.nbytes) {
+               ops->crypt_tail(state, walk.nbytes, walk.src.virt.addr,
+                               walk.dst.virt.addr);
+               skcipher_walk_done(&walk, 0);
+       }
+}
+
+/*
+ * Return the driver context of `aead`: the transform's context area, rounded
+ * up to the alignment required by struct aegis_ctx.
+ */
+static struct aegis_ctx *crypto_aegis128l_aesni_ctx(struct crypto_aead *aead)
+{
+       u8 *base = crypto_aead_ctx(aead);
+
+       return (void *)PTR_ALIGN(base, __alignof__(struct aegis_ctx));
+}
+
+/*
+ * Store the key in the transform context.
+ *
+ * Returns 0 on success.  For any length other than AEGIS128L_KEY_SIZE the
+ * CRYPTO_TFM_RES_BAD_KEY_LEN flag is set on the transform and -EINVAL is
+ * returned.
+ */
+static int crypto_aegis128l_aesni_setkey(struct crypto_aead *aead,
+                                        const u8 *key, unsigned int keylen)
+{
+       struct aegis_ctx *actx;
+
+       if (keylen == AEGIS128L_KEY_SIZE) {
+               actx = crypto_aegis128l_aesni_ctx(aead);
+               memcpy(actx->key.bytes, key, AEGIS128L_KEY_SIZE);
+               return 0;
+       }
+
+       crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+       return -EINVAL;
+}
+
+/*
+ * Validate a requested tag length: returns 0 if it lies within
+ * [AEGIS128L_MIN_AUTH_SIZE, AEGIS128L_MAX_AUTH_SIZE], -EINVAL otherwise.
+ */
+static int crypto_aegis128l_aesni_setauthsize(struct crypto_aead *tfm,
+                                             unsigned int authsize)
+{
+       if (authsize < AEGIS128L_MIN_AUTH_SIZE ||
+           authsize > AEGIS128L_MAX_AUTH_SIZE)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Common driver for encryption and decryption.
+ *
+ * Runs the whole computation inside one kernel_fpu_begin()/kernel_fpu_end()
+ * section: initialise the state from key and IV, absorb the associated data
+ * found at the start of req->src, process `cryptlen` bytes of text with
+ * `ops`, and finally XOR the computed tag into *tag_xor.
+ */
+static void crypto_aegis128l_aesni_crypt(struct aead_request *req,
+                                        struct aegis_block *tag_xor,
+                                        unsigned int cryptlen,
+                                        const struct aegis_crypt_ops *ops)
+{
+       struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+       struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(tfm);
+       struct aegis_state state;
+
+       kernel_fpu_begin();
+
+       crypto_aegis128l_aesni_init(&state, ctx->key.bytes, req->iv);
+       crypto_aegis128l_aesni_process_ad(&state, req->src, req->assoclen);
+       crypto_aegis128l_aesni_process_crypt(&state, req, ops);
+       crypto_aegis128l_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
+
+       kernel_fpu_end();
+}
+
+/*
+ * AEAD encrypt: encrypt req->cryptlen bytes and append the tag, truncated to
+ * the configured tag length, right after the ciphertext in req->dst.
+ * Always returns 0.
+ */
+static int crypto_aegis128l_aesni_encrypt(struct aead_request *req)
+{
+       static const struct aegis_crypt_ops OPS = {
+               .skcipher_walk_init = skcipher_walk_aead_encrypt,
+               .crypt_blocks = crypto_aegis128l_aesni_enc,
+               .crypt_tail = crypto_aegis128l_aesni_enc_tail,
+       };
+
+       /* starts out as zero, so the XOR in the driver yields the tag itself */
+       struct aegis_block tag = {};
+       unsigned int taglen = crypto_aead_authsize(crypto_aead_reqtfm(req));
+       unsigned int textlen = req->cryptlen;
+       unsigned int tag_offset = req->assoclen + textlen;
+
+       crypto_aegis128l_aesni_crypt(req, &tag, textlen, &OPS);
+
+       scatterwalk_map_and_copy(tag.bytes, req->dst, tag_offset, taglen, 1);
+       return 0;
+}
+
+/*
+ * AEAD decrypt: req->cryptlen covers ciphertext plus tag.  The received tag
+ * is read from behind the ciphertext and the computed tag is XORed into it,
+ * so the result is all-zero exactly when the two agree.
+ *
+ * Returns 0 if the tag verifies, -EBADMSG otherwise.
+ */
+static int crypto_aegis128l_aesni_decrypt(struct aead_request *req)
+{
+       static const struct aegis_block zeros = {};
+
+       static const struct aegis_crypt_ops OPS = {
+               .skcipher_walk_init = skcipher_walk_aead_decrypt,
+               .crypt_blocks = crypto_aegis128l_aesni_dec,
+               .crypt_tail = crypto_aegis128l_aesni_dec_tail,
+       };
+
+       struct aegis_block tag;
+       unsigned int taglen = crypto_aead_authsize(crypto_aead_reqtfm(req));
+       unsigned int textlen = req->cryptlen - taglen;
+
+       scatterwalk_map_and_copy(tag.bytes, req->src,
+                                req->assoclen + textlen, taglen, 0);
+
+       crypto_aegis128l_aesni_crypt(req, &tag, textlen, &OPS);
+
+       if (crypto_memneq(tag.bytes, zeros.bytes, taglen))
+               return -EBADMSG;
+
+       return 0;
+}
+
+/* Transform constructor of the internal algorithm: nothing to set up. */
+static int crypto_aegis128l_aesni_init_tfm(struct crypto_aead *aead)
+{
+       return 0;
+}
+
+/* Transform destructor of the internal algorithm: nothing to release. */
+static void crypto_aegis128l_aesni_exit_tfm(struct crypto_aead *aead)
+{
+}
+
+/*
+ * setkey of the public algorithm: pass the key on to the cryptd transform
+ * stored in this transform's context and return its result.
+ */
+static int cryptd_aegis128l_aesni_setkey(struct crypto_aead *aead,
+                                        const u8 *key, unsigned int keylen)
+{
+       struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+       return crypto_aead_setkey(&(*slot)->base, key, keylen);
+}
+
+/*
+ * setauthsize of the public algorithm: pass the tag length on to the cryptd
+ * transform stored in this transform's context and return its result.
+ */
+static int cryptd_aegis128l_aesni_setauthsize(struct crypto_aead *aead,
+                                             unsigned int authsize)
+{
+       struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+       return crypto_aead_setauthsize(&(*slot)->base, authsize);
+}
+
+/*
+ * encrypt of the public algorithm.
+ *
+ * The request is redirected to the cryptd transform by default.  If the FPU
+ * is usable right now, and we are either not in atomic context or cryptd has
+ * nothing queued, it is redirected to cryptd's child transform instead.
+ */
+static int cryptd_aegis128l_aesni_encrypt(struct aead_request *req)
+{
+       struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+       struct cryptd_aead **slot = crypto_aead_ctx(tfm);
+       struct cryptd_aead *cryptd_tfm = *slot;
+       struct crypto_aead *target = &cryptd_tfm->base;
+
+       if (irq_fpu_usable()) {
+               if (!in_atomic() || !cryptd_aead_queued(cryptd_tfm))
+                       target = cryptd_aead_child(cryptd_tfm);
+       }
+
+       aead_request_set_tfm(req, target);
+
+       return crypto_aead_encrypt(req);
+}
+
+/*
+ * decrypt of the public algorithm.
+ *
+ * The request is redirected to the cryptd transform by default.  If the FPU
+ * is usable right now, and we are either not in atomic context or cryptd has
+ * nothing queued, it is redirected to cryptd's child transform instead.
+ */
+static int cryptd_aegis128l_aesni_decrypt(struct aead_request *req)
+{
+       struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+       struct cryptd_aead **slot = crypto_aead_ctx(tfm);
+       struct cryptd_aead *cryptd_tfm = *slot;
+       struct crypto_aead *target = &cryptd_tfm->base;
+
+       if (irq_fpu_usable()) {
+               if (!in_atomic() || !cryptd_aead_queued(cryptd_tfm))
+                       target = cryptd_aead_child(cryptd_tfm);
+       }
+
+       aead_request_set_tfm(req, target);
+
+       return crypto_aead_decrypt(req);
+}
+
+/*
+ * Transform constructor of the public algorithm: allocate a cryptd AEAD on
+ * top of the internal "__aegis128l-aesni" driver, remember it in the context
+ * and adopt its request size.
+ *
+ * Returns 0 on success or the error encoded by cryptd_alloc_aead().
+ */
+static int cryptd_aegis128l_aesni_init_tfm(struct crypto_aead *aead)
+{
+       struct cryptd_aead **slot = crypto_aead_ctx(aead);
+       struct cryptd_aead *wrapped;
+
+       wrapped = cryptd_alloc_aead("__aegis128l-aesni", CRYPTO_ALG_INTERNAL,
+                                   CRYPTO_ALG_INTERNAL);
+       if (IS_ERR(wrapped))
+               return PTR_ERR(wrapped);
+
+       crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&wrapped->base));
+       *slot = wrapped;
+       return 0;
+}
+
+/*
+ * Transform destructor of the public algorithm: free the cryptd transform
+ * stored in the context.
+ */
+static void cryptd_aegis128l_aesni_exit_tfm(struct crypto_aead *aead)
+{
+       struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+       cryptd_free_aead(*ctx);
+}
+
+/*
+ * The two algorithms registered by this module:
+ *
+ *  [0] "__aegis128l-aesni", flagged CRYPTO_ALG_INTERNAL, which does the
+ *      actual work with the assembly routines;
+ *  [1] "aegis128l-aesni", flagged CRYPTO_ALG_ASYNC, the wrapper that
+ *      forwards requests through cryptd.
+ */
+static struct aead_alg crypto_aegis128l_aesni_alg[] = {
+       {
+               .setkey = crypto_aegis128l_aesni_setkey,
+               .setauthsize = crypto_aegis128l_aesni_setauthsize,
+               .encrypt = crypto_aegis128l_aesni_encrypt,
+               .decrypt = crypto_aegis128l_aesni_decrypt,
+               .init = crypto_aegis128l_aesni_init_tfm,
+               .exit = crypto_aegis128l_aesni_exit_tfm,
+
+               .ivsize = AEGIS128L_NONCE_SIZE,
+               .maxauthsize = AEGIS128L_MAX_AUTH_SIZE,
+               .chunksize = AEGIS128L_BLOCK_SIZE,
+
+               .base = {
+                       .cra_flags = CRYPTO_ALG_INTERNAL,
+                       .cra_blocksize = 1,
+                       /*
+                        * room for the context plus the slack consumed when
+                        * crypto_aegis128l_aesni_ctx() aligns the pointer
+                        */
+                       .cra_ctxsize = sizeof(struct aegis_ctx) +
+                               __alignof__(struct aegis_ctx),
+                       .cra_alignmask = 0,
+
+                       .cra_name = "__aegis128l",
+                       .cra_driver_name = "__aegis128l-aesni",
+
+                       .cra_module = THIS_MODULE,
+               }
+       }, {
+               .setkey = cryptd_aegis128l_aesni_setkey,
+               .setauthsize = cryptd_aegis128l_aesni_setauthsize,
+               .encrypt = cryptd_aegis128l_aesni_encrypt,
+               .decrypt = cryptd_aegis128l_aesni_decrypt,
+               .init = cryptd_aegis128l_aesni_init_tfm,
+               .exit = cryptd_aegis128l_aesni_exit_tfm,
+
+               .ivsize = AEGIS128L_NONCE_SIZE,
+               .maxauthsize = AEGIS128L_MAX_AUTH_SIZE,
+               .chunksize = AEGIS128L_BLOCK_SIZE,
+
+               .base = {
+                       .cra_flags = CRYPTO_ALG_ASYNC,
+                       .cra_blocksize = 1,
+                       /* the context only holds the cryptd transform pointer */
+                       .cra_ctxsize = sizeof(struct cryptd_aead *),
+                       .cra_alignmask = 0,
+
+                       .cra_priority = 400,
+
+                       .cra_name = "aegis128l",
+                       .cra_driver_name = "aegis128l-aesni",
+
+                       .cra_module = THIS_MODULE,
+               }
+       }
+};
+
+/*
+ * CPU features this driver relies on (AES-NI and SSE2).  The table is passed
+ * to x86_match_cpu() by the module init function and is also exported through
+ * MODULE_DEVICE_TABLE().  The empty entry terminates the list.
+ */
+static const struct x86_cpu_id aesni_cpu_id[] = {
+       X86_FEATURE_MATCH(X86_FEATURE_AES),
+       X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+       {}
+};
+MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
+
+/*
+ * Module entry point: register the AEGIS-128L AEAD algorithms, but only on
+ * CPUs that can execute the assembly routines.
+ *
+ * x86_match_cpu() reports a match as soon as ANY entry of the table matches,
+ * so on its own it would accept a CPU that has SSE2 but lacks AES-NI.  Each
+ * required feature is therefore also tested explicitly.
+ *
+ * Returns 0 on success, -ENODEV if a required CPU feature is missing, or the
+ * error returned by crypto_register_aeads().
+ */
+static int __init crypto_aegis128l_aesni_module_init(void)
+{
+       if (!x86_match_cpu(aesni_cpu_id) ||
+           !boot_cpu_has(X86_FEATURE_AES) ||
+           !boot_cpu_has(X86_FEATURE_XMM2))
+               return -ENODEV;
+
+       return crypto_register_aeads(crypto_aegis128l_aesni_alg,
+                                    ARRAY_SIZE(crypto_aegis128l_aesni_alg));
+}
+
+/* Module exit point: unregister every algorithm registered at init time. */
+static void __exit crypto_aegis128l_aesni_module_exit(void)
+{
+       crypto_unregister_aeads(crypto_aegis128l_aesni_alg,
+                               ARRAY_SIZE(crypto_aegis128l_aesni_alg));
+}
+
+module_init(crypto_aegis128l_aesni_module_init);
+module_exit(crypto_aegis128l_aesni_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosna...@gmail.com>");
+MODULE_DESCRIPTION("AEGIS-128L AEAD algorithm -- AESNI+SSE2 implementation");
+MODULE_ALIAS_CRYPTO("aegis128l");
+MODULE_ALIAS_CRYPTO("aegis128l-aesni");
diff --git a/arch/x86/crypto/aegis256-aesni-asm.S b/arch/x86/crypto/aegis256-aesni-asm.S
new file mode 100644
index 000000000000..1d977d515bf9
--- /dev/null
+++ b/arch/x86/crypto/aegis256-aesni-asm.S
@@ -0,0 +1,702 @@
+/*
+ * AES-NI + SSE2 implementation of AEGIS-256
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosna...@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+/* The six 128-bit words of the AEGIS-256 state, held in SSE registers. */
+#define STATE0 %xmm0
+#define STATE1 %xmm1
+#define STATE2 %xmm2
+#define STATE3 %xmm3
+#define STATE4 %xmm4
+#define STATE5 %xmm5
+/* Current message block and scratch registers. */
+#define MSG    %xmm6
+#define T0     %xmm7
+#define T1     %xmm8
+#define T2     %xmm9
+#define T3     %xmm10
+
+/*
+ * The first four integer argument registers, named after the role they have
+ * in the ad/enc/dec entry points.  The init and final entry points take
+ * different arguments and address %rsi/%rdx/%rcx directly.
+ */
+#define STATEP %rdi
+#define LEN    %rsi
+#define SRC    %rdx
+#define DST    %rcx
+
+/*
+ * The two 16-byte constants that crypto_aegis256_aesni_init mixes into the
+ * initial state.
+ */
+.section .rodata.cst16.aegis256_const, "aM", @progbits, 32
+.align 16
+.Laegis256_const_0:
+       .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+       .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Laegis256_const_1:
+       .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+       .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+/*
+ * Byte indices 0..15.  crypto_aegis256_aesni_dec_tail compares them against
+ * the tail length to build a per-byte mask.
+ */
+.section .rodata.cst16.aegis256_counter, "aM", @progbits, 16
+.align 16
+.Laegis256_counter:
+       .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+       .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+
+.text
+
+/*
+ * __load_partial: internal ABI
+ *
+ * Assemble a partial block from memory into MSG, with the unused high bytes
+ * left zero.  The block is gathered in up to four pieces of 1, 2, 4 and 8
+ * bytes, selected by bits 0..3 of LEN; for LEN < 16 this reads exactly the
+ * LEN bytes starting at SRC and nothing beyond them.
+ *
+ * input:
+ *   LEN - bytes
+ *   SRC - src
+ * output:
+ *   MSG  - message block
+ * changed:
+ *   T0
+ *   %r8
+ *   %r9
+ */
+__load_partial:
+       xor %r9, %r9
+       pxor MSG, MSG
+
+       /* bit 0 of LEN set: fetch the trailing single byte */
+       mov LEN, %r8
+       and $0x1, %r8
+       jz .Lld_partial_1
+
+       /* its offset is LEN with bit 0 cleared */
+       mov LEN, %r8
+       and $0x1E, %r8
+       add SRC, %r8
+       mov (%r8), %r9b
+
+.Lld_partial_1:
+       /* bit 1 of LEN set: fetch the 2-byte piece */
+       mov LEN, %r8
+       and $0x2, %r8
+       jz .Lld_partial_2
+
+       /* shift what was gathered so far up and place the piece below it */
+       mov LEN, %r8
+       and $0x1C, %r8
+       add SRC, %r8
+       shl $0x10, %r9
+       mov (%r8), %r9w
+
+.Lld_partial_2:
+       /* bit 2 of LEN set: fetch the 4-byte piece */
+       mov LEN, %r8
+       and $0x4, %r8
+       jz .Lld_partial_4
+
+       mov LEN, %r8
+       and $0x18, %r8
+       add SRC, %r8
+       shl $32, %r9
+       /* the 32-bit load zero-extends, so xor only fills the low half */
+       mov (%r8), %r8d
+       xor %r8, %r9
+
+.Lld_partial_4:
+       /* up to 7 gathered bytes now form the low qword of MSG */
+       movq %r9, MSG
+
+       /* bit 3 of LEN set: fetch the 8-byte piece */
+       mov LEN, %r8
+       and $0x8, %r8
+       jz .Lld_partial_8
+
+       /* move the gathered bytes to the high qword, load 8 bytes below */
+       mov LEN, %r8
+       and $0x10, %r8
+       add SRC, %r8
+       pslldq $8, MSG
+       movq (%r8), T0
+       pxor T0, MSG
+
+.Lld_partial_8:
+       ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ *
+ * Write the low bytes of T0 to DST in pieces of 8, 4, 2 and 1 bytes; for
+ * LEN < 16 exactly LEN bytes are written.
+ *
+ * input:
+ *   LEN - bytes
+ *   DST - dst
+ *   T0   - message block
+ * changed:
+ *   T0 (shifted right by 8 bytes when LEN >= 8)
+ *   %r8
+ *   %r9
+ *   %r10
+ */
+__store_partial:
+       /* work on copies so that LEN and DST stay intact for the caller */
+       mov LEN, %r8
+       mov DST, %r9
+
+       movq T0, %r10
+
+       cmp $8, %r8
+       jl .Lst_partial_8
+
+       /* store the low qword, then continue with the high qword */
+       mov %r10, (%r9)
+       psrldq $8, T0
+       movq T0, %r10
+
+       sub $8, %r8
+       add $8, %r9
+
+.Lst_partial_8:
+       cmp $4, %r8
+       jl .Lst_partial_4
+
+       /* store 4 bytes and drop them from the pending value */
+       mov %r10d, (%r9)
+       shr $32, %r10
+
+       sub $4, %r8
+       add $4, %r9
+
+.Lst_partial_4:
+       cmp $2, %r8
+       jl .Lst_partial_2
+
+       /* store 2 bytes and drop them from the pending value */
+       mov %r10w, (%r9)
+       shr $0x10, %r10
+
+       sub $2, %r8
+       add $2, %r9
+
+.Lst_partial_2:
+       cmp $1, %r8
+       jl .Lst_partial_1
+
+       /* store the last byte */
+       mov %r10b, (%r9)
+
+.Lst_partial_1:
+       ret
+ENDPROC(__store_partial)
+
+/*
+ * One state update without the message word.  Each state register goes
+ * through one AES round (aesenc) with another state register as round key:
+ *   STATE5 <- aesenc(STATE5, key STATE0)
+ *   STATE0 <- aesenc(STATE0, key STATE1)
+ *   ...
+ *   STATE4 <- aesenc(STATE4, key old STATE5)
+ * T0 is clobbered: it preserves the old STATE5 for the last round.
+ *
+ * The registers are not moved afterwards, so each update leaves the logical
+ * state words rotated by one register relative to the STATEn names.
+ */
+.macro update
+       movdqa STATE5, T0
+       aesenc STATE0, STATE5
+       aesenc STATE1, STATE0
+       aesenc STATE2, STATE1
+       aesenc STATE3, STATE2
+       aesenc STATE4, STATE3
+       aesenc T0,     STATE4
+.endm
+
+/*
+ * updateN m: run "update" and xor the message word m into one register.
+ * The target register steps down by one from update0 (STATE5) to update5
+ * (STATE0), which follows the rotation described above; the callers in this
+ * file issue the variants in the cyclic order 0, 1, ..., 5, 0, ...
+ */
+.macro update0 m
+       update
+       pxor \m, STATE5
+.endm
+
+.macro update1 m
+       update
+       pxor \m, STATE4
+.endm
+
+.macro update2 m
+       update
+       pxor \m, STATE3
+.endm
+
+.macro update3 m
+       update
+       pxor \m, STATE2
+.endm
+
+.macro update4 m
+       update
+       pxor \m, STATE1
+.endm
+
+.macro update5 m
+       update
+       pxor \m, STATE0
+.endm
+
+/*
+ * Load the six state words from the 96-byte buffer at STATEP into
+ * STATE0..STATE5.  Unaligned loads are used, so no alignment of the buffer
+ * is required here.
+ */
+.macro state_load
+       movdqu 0x00(STATEP), STATE0
+       movdqu 0x10(STATEP), STATE1
+       movdqu 0x20(STATEP), STATE2
+       movdqu 0x30(STATEP), STATE3
+       movdqu 0x40(STATEP), STATE4
+       movdqu 0x50(STATEP), STATE5
+.endm
+
+/*
+ * Write six registers back to the buffer at STATEP.  Note the order: the
+ * LAST argument goes to offset 0x00 and the first five follow at 0x10..0x50.
+ */
+.macro state_store s0 s1 s2 s3 s4 s5
+       movdqu \s5, 0x00(STATEP)
+       movdqu \s0, 0x10(STATEP)
+       movdqu \s1, 0x20(STATEP)
+       movdqu \s2, 0x30(STATEP)
+       movdqu \s3, 0x40(STATEP)
+       movdqu \s4, 0x50(STATEP)
+.endm
+
+/*
+ * state_storeN: store variant to use when the most recent update was
+ * updateN.  Each variant passes the registers rotated by one more position,
+ * undoing the register rotation that the updates accumulated.
+ */
+.macro state_store0
+       state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
+.endm
+
+.macro state_store1
+       state_store STATE5 STATE0 STATE1 STATE2 STATE3 STATE4
+.endm
+
+.macro state_store2
+       state_store STATE4 STATE5 STATE0 STATE1 STATE2 STATE3
+.endm
+
+.macro state_store3
+       state_store STATE3 STATE4 STATE5 STATE0 STATE1 STATE2
+.endm
+
+.macro state_store4
+       state_store STATE2 STATE3 STATE4 STATE5 STATE0 STATE1
+.endm
+
+.macro state_store5
+       state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE0
+.endm
+
+/*
+ * void crypto_aegis256_aesni_init(void *state, const void *key,
+ *                                 const void *iv);
+ *
+ * Build the initial state from a 32-byte key (%rsi) and a 32-byte IV (%rdx)
+ * and store it at state (%rdi).
+ *
+ * The key is read with movdqa and must therefore be 16-byte aligned; the IV
+ * and the state buffer are accessed with unaligned moves.
+ */
+ENTRY(crypto_aegis256_aesni_init)
+       FRAME_BEGIN
+
+       /* load key: */
+       movdqa 0x00(%rsi), MSG
+       movdqa 0x10(%rsi), T1
+       movdqa MSG, STATE4
+       movdqa T1, STATE5
+
+       /* load IV: */
+       movdqu 0x00(%rdx), T2
+       movdqu 0x10(%rdx), T3
+       /* T2/T3 = key ^ IV; they double as message words below */
+       pxor MSG, T2
+       pxor T1, T3
+       movdqa T2, STATE0
+       movdqa T3, STATE1
+
+       /* load the constants: */
+       movdqa .Laegis256_const_0, STATE3
+       movdqa .Laegis256_const_1, STATE2
+       pxor STATE3, STATE4
+       pxor STATE2, STATE5
+
+       /*
+        * update 16 times, cycling through the message words
+        * key[0], key[1], key[0] ^ iv[0], key[1] ^ iv[1]:
+        */
+       update0 MSG
+       update1 T1
+       update2 T2
+       update3 T3
+       update4 MSG
+       update5 T1
+       update0 T2
+       update1 T3
+       update2 MSG
+       update3 T1
+       update4 T2
+       update5 T3
+       update0 MSG
+       update1 T1
+       update2 T2
+       update3 T3
+
+       /* the last update was update3, so store with the matching variant */
+       state_store3
+
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis256_aesni_init)
+
+/*
+ * ad_block a i: absorb the 16-byte block at SRC + i * 16.
+ *   a - load flavour: "a" expands to movdqa, "u" to movdqu
+ *   i - position in the unrolled loop; selects update\i and .Lad_out_\i
+ * LEN is reduced by 16; when fewer than 16 bytes remain control leaves the
+ * loop through .Lad_out_\i, which stores the state with state_store\i.
+ */
+.macro ad_block a i
+       movdq\a (\i * 0x10)(SRC), MSG
+       update\i MSG
+       sub $0x10, LEN
+       cmp $0x10, LEN
+       jl .Lad_out_\i
+.endm
+
+/*
+ * void crypto_aegis256_aesni_ad(void *state, unsigned int length,
+ *                               const void *data);
+ *
+ * Absorb the whole 16-byte blocks of associated data into the state.
+ * Trailing bytes that do not fill a block are ignored, and when length is
+ * below 16 the state is neither loaded nor written.
+ *
+ * NOTE(review): length is declared unsigned int but is compared and
+ * decremented as the full 64-bit LEN register -- this relies on the caller
+ * passing it zero-extended.
+ */
+ENTRY(crypto_aegis256_aesni_ad)
+       FRAME_BEGIN
+
+       cmp $0x10, LEN
+       jb .Lad_out
+
+       state_load
+
+       /* pick the aligned or the unaligned loop based on the source address */
+       mov  SRC, %r8
+       and $0xf, %r8
+       jnz .Lad_u_loop
+
+.align 8
+.Lad_a_loop:
+       /* six blocks per iteration bring the register rotation full circle */
+       ad_block a 0
+       ad_block a 1
+       ad_block a 2
+       ad_block a 3
+       ad_block a 4
+       ad_block a 5
+
+       add $0x60, SRC
+       jmp .Lad_a_loop
+
+.align 8
+.Lad_u_loop:
+       ad_block u 0
+       ad_block u 1
+       ad_block u 2
+       ad_block u 3
+       ad_block u 4
+       ad_block u 5
+
+       add $0x60, SRC
+       jmp .Lad_u_loop
+
+       /* one exit per loop position, each with the matching store variant */
+.Lad_out_0:
+       state_store0
+       FRAME_END
+       ret
+
+.Lad_out_1:
+       state_store1
+       FRAME_END
+       ret
+
+.Lad_out_2:
+       state_store2
+       FRAME_END
+       ret
+
+.Lad_out_3:
+       state_store3
+       FRAME_END
+       ret
+
+.Lad_out_4:
+       state_store4
+       FRAME_END
+       ret
+
+.Lad_out_5:
+       state_store5
+       FRAME_END
+       ret
+
+       /* nothing was processed: leave the stored state as it is */
+.Lad_out:
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis256_aesni_ad)
+
+/*
+ * crypt m s0..s5: xor the keystream block into m, i.e.
+ *   m ^= s1 ^ s4 ^ s5 ^ (s2 & s3)
+ * s0 is accepted for symmetry but not used.  T3 is clobbered.
+ */
+.macro crypt m s0 s1 s2 s3 s4 s5
+       pxor \s1, \m
+       pxor \s4, \m
+       pxor \s5, \m
+       movdqa \s2, T3
+       pand \s3, T3
+       pxor T3, \m
+.endm
+
+/*
+ * cryptN m: "crypt" with the state registers rotated by N positions, for
+ * use at loop position N (before update\N has run).
+ */
+.macro crypt0 m
+       crypt \m STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
+.endm
+
+.macro crypt1 m
+       crypt \m STATE5 STATE0 STATE1 STATE2 STATE3 STATE4
+.endm
+
+.macro crypt2 m
+       crypt \m STATE4 STATE5 STATE0 STATE1 STATE2 STATE3
+.endm
+
+.macro crypt3 m
+       crypt \m STATE3 STATE4 STATE5 STATE0 STATE1 STATE2
+.endm
+
+.macro crypt4 m
+       crypt \m STATE2 STATE3 STATE4 STATE5 STATE0 STATE1
+.endm
+
+.macro crypt5 m
+       crypt \m STATE1 STATE2 STATE3 STATE4 STATE5 STATE0
+.endm
+
+/*
+ * encrypt_block a i: encrypt the 16-byte block at SRC + i * 16 into
+ * DST + i * 16 ("a"/"u" select aligned/unaligned moves), then absorb the
+ * PLAINTEXT block with update\i.  Leaves through .Lenc_out_\i when fewer
+ * than 16 bytes remain.  Clobbers T0 and T3.
+ */
+.macro encrypt_block a i
+       movdq\a (\i * 0x10)(SRC), MSG
+       /* encrypt a copy; MSG keeps the plaintext for the state update */
+       movdqa MSG, T0
+       crypt\i T0
+       movdq\a T0, (\i * 0x10)(DST)
+
+       update\i MSG
+
+       sub $0x10, LEN
+       cmp $0x10, LEN
+       jl .Lenc_out_\i
+.endm
+
+/*
+ * decrypt_block a i: decrypt the 16-byte block at SRC + i * 16 into
+ * DST + i * 16, then absorb the recovered plaintext with update\i.  Leaves
+ * through .Ldec_out_\i when fewer than 16 bytes remain.  Clobbers T3.
+ */
+.macro decrypt_block a i
+       movdq\a (\i * 0x10)(SRC), MSG
+       /* decrypt in place: MSG holds the plaintext from here on */
+       crypt\i MSG
+       movdq\a MSG, (\i * 0x10)(DST)
+
+       update\i MSG
+
+       sub $0x10, LEN
+       cmp $0x10, LEN
+       jl .Ldec_out_\i
+.endm
+
+/*
+ * void crypto_aegis256_aesni_enc(void *state, unsigned int length,
+ *                                const void *src, void *dst);
+ *
+ * Encrypt the whole 16-byte blocks of src into dst and update the state.
+ * Trailing bytes that do not fill a block are left untouched, and when
+ * length is below 16 the state is neither loaded nor written.
+ */
+ENTRY(crypto_aegis256_aesni_enc)
+       FRAME_BEGIN
+
+       cmp $0x10, LEN
+       jb .Lenc_out
+
+       state_load
+
+       /* the aligned loop needs BOTH pointers to be 16-byte aligned */
+       mov  SRC, %r8
+       or   DST, %r8
+       and $0xf, %r8
+       jnz .Lenc_u_loop
+
+.align 8
+.Lenc_a_loop:
+       encrypt_block a 0
+       encrypt_block a 1
+       encrypt_block a 2
+       encrypt_block a 3
+       encrypt_block a 4
+       encrypt_block a 5
+
+       add $0x60, SRC
+       add $0x60, DST
+       jmp .Lenc_a_loop
+
+.align 8
+.Lenc_u_loop:
+       encrypt_block u 0
+       encrypt_block u 1
+       encrypt_block u 2
+       encrypt_block u 3
+       encrypt_block u 4
+       encrypt_block u 5
+
+       add $0x60, SRC
+       add $0x60, DST
+       jmp .Lenc_u_loop
+
+       /* one exit per loop position, each with the matching store variant */
+.Lenc_out_0:
+       state_store0
+       FRAME_END
+       ret
+
+.Lenc_out_1:
+       state_store1
+       FRAME_END
+       ret
+
+.Lenc_out_2:
+       state_store2
+       FRAME_END
+       ret
+
+.Lenc_out_3:
+       state_store3
+       FRAME_END
+       ret
+
+.Lenc_out_4:
+       state_store4
+       FRAME_END
+       ret
+
+.Lenc_out_5:
+       state_store5
+       FRAME_END
+       ret
+
+       /* nothing was processed: leave the stored state as it is */
+.Lenc_out:
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis256_aesni_enc)
+
+/*
+ * void crypto_aegis256_aesni_enc_tail(void *state, unsigned int length,
+ *                                     const void *src, void *dst);
+ *
+ * Encrypt the final partial block (length < 16): the bytes are loaded
+ * zero-padded, only length bytes of ciphertext are written to dst, and the
+ * zero-padded plaintext block is absorbed into the state.
+ */
+ENTRY(crypto_aegis256_aesni_enc_tail)
+       FRAME_BEGIN
+
+       state_load
+
+       /* encrypt message: */
+       call __load_partial
+
+       /* encrypt a copy; MSG keeps the padded plaintext for the update */
+       movdqa MSG, T0
+       crypt0 T0
+
+       call __store_partial
+
+       update0 MSG
+
+       state_store0
+
+       FRAME_END
+       /*
+        * Return explicitly: without this the function would fall through
+        * into whatever code follows it in the text section.
+        */
+       ret
+ENDPROC(crypto_aegis256_aesni_enc_tail)
+
+/*
+ * void crypto_aegis256_aesni_dec(void *state, unsigned int length,
+ *                                const void *src, void *dst);
+ *
+ * Decrypt the whole 16-byte blocks of src into dst and update the state
+ * with the recovered plaintext.  Trailing bytes that do not fill a block
+ * are left untouched, and when length is below 16 the state is neither
+ * loaded nor written.
+ */
+ENTRY(crypto_aegis256_aesni_dec)
+       FRAME_BEGIN
+
+       cmp $0x10, LEN
+       jb .Ldec_out
+
+       state_load
+
+       /* the aligned loop needs BOTH pointers to be 16-byte aligned */
+       mov  SRC, %r8
+       or   DST, %r8
+       and $0xF, %r8
+       jnz .Ldec_u_loop
+
+.align 8
+.Ldec_a_loop:
+       decrypt_block a 0
+       decrypt_block a 1
+       decrypt_block a 2
+       decrypt_block a 3
+       decrypt_block a 4
+       decrypt_block a 5
+
+       add $0x60, SRC
+       add $0x60, DST
+       jmp .Ldec_a_loop
+
+.align 8
+.Ldec_u_loop:
+       decrypt_block u 0
+       decrypt_block u 1
+       decrypt_block u 2
+       decrypt_block u 3
+       decrypt_block u 4
+       decrypt_block u 5
+
+       add $0x60, SRC
+       add $0x60, DST
+       jmp .Ldec_u_loop
+
+       /* one exit per loop position, each with the matching store variant */
+.Ldec_out_0:
+       state_store0
+       FRAME_END
+       ret
+
+.Ldec_out_1:
+       state_store1
+       FRAME_END
+       ret
+
+.Ldec_out_2:
+       state_store2
+       FRAME_END
+       ret
+
+.Ldec_out_3:
+       state_store3
+       FRAME_END
+       ret
+
+.Ldec_out_4:
+       state_store4
+       FRAME_END
+       ret
+
+.Ldec_out_5:
+       state_store5
+       FRAME_END
+       ret
+
+       /* nothing was processed: leave the stored state as it is */
+.Ldec_out:
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis256_aesni_dec)
+
+/*
+ * void crypto_aegis256_aesni_dec_tail(void *state, unsigned int length,
+ *                                     const void *src, void *dst);
+ *
+ * Decrypt the final partial block (length < 16): only length bytes of
+ * plaintext are written to dst, and the plaintext, zeroed beyond length, is
+ * absorbed into the state.
+ */
+ENTRY(crypto_aegis256_aesni_dec_tail)
+       FRAME_BEGIN
+
+       state_load
+
+       /* decrypt message: */
+       call __load_partial
+
+       crypt0 MSG
+
+       /* __store_partial takes its data in T0 and clobbers it */
+       movdqa MSG, T0
+       call __store_partial
+
+       /*
+        * mask with byte count: the padding bytes of MSG now hold keystream
+        * rather than zeros and must be cleared before the state update.
+        * Four punpcklbw steps replicate the low byte of LEN into all 16
+        * bytes of T0; pcmpgtb then yields 0xff for every byte index that is
+        * smaller than LEN.
+        */
+       movq LEN, T0
+       punpcklbw T0, T0
+       punpcklbw T0, T0
+       punpcklbw T0, T0
+       punpcklbw T0, T0
+       movdqa .Laegis256_counter, T1
+       pcmpgtb T1, T0
+       pand T0, MSG
+
+       update0 MSG
+
+       state_store0
+
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis256_aesni_dec_tail)
+
+/*
+ * void crypto_aegis256_aesni_final(void *state, void *tag_xor,
+ *                                  u64 assoclen, u64 cryptlen);
+ *
+ * Finish the computation: absorb the length block seven times and xor the
+ * xor-sum of all six state words into the 16 bytes at tag_xor.  The state
+ * buffer is read but not written back.
+ *
+ * Both lengths are read as full 64-bit registers (%rdx, %rcx), so the
+ * caller must pass them as 64-bit values.
+ */
+ENTRY(crypto_aegis256_aesni_final)
+       FRAME_BEGIN
+
+       state_load
+
+       /* prepare length block: assoclen in the low, cryptlen in the high qword */
+       movq %rdx, MSG
+       movq %rcx, T0
+       pslldq $8, T0
+       pxor T0, MSG
+       psllq $3, MSG /* multiply by 8 (to get bit count) */
+
+       pxor STATE3, MSG
+
+       /* update state: */
+       update0 MSG
+       update1 MSG
+       update2 MSG
+       update3 MSG
+       update4 MSG
+       update5 MSG
+       update0 MSG
+
+       /* xor tag: */
+       movdqu (%rsi), MSG
+
+       /* all six words are combined, so the register rotation is irrelevant */
+       pxor STATE0, MSG
+       pxor STATE1, MSG
+       pxor STATE2, MSG
+       pxor STATE3, MSG
+       pxor STATE4, MSG
+       pxor STATE5, MSG
+
+       movdqu MSG, (%rsi)
+
+       FRAME_END
+       ret
+ENDPROC(crypto_aegis256_aesni_final)
diff --git a/arch/x86/crypto/aegis256-aesni-glue.c 
b/arch/x86/crypto/aegis256-aesni-glue.c
new file mode 100644
index 000000000000..3181655dd862
--- /dev/null
+++ b/arch/x86/crypto/aegis256-aesni-glue.c
@@ -0,0 +1,407 @@
+/*
+ * The AEGIS-256 Authenticated-Encryption Algorithm
+ *   Glue for AES-NI + SSE2 implementation
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosna...@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+/* All sizes are in bytes. */
+#define AEGIS256_BLOCK_ALIGN 16
+#define AEGIS256_BLOCK_SIZE 16
+#define AEGIS256_NONCE_SIZE 32
+/* number of AEGIS256_BLOCK_SIZE words in the cipher state */
+#define AEGIS256_STATE_BLOCKS 6
+#define AEGIS256_KEY_SIZE 32
+/* bounds enforced by crypto_aegis256_aesni_setauthsize() */
+#define AEGIS256_MIN_AUTH_SIZE 8
+#define AEGIS256_MAX_AUTH_SIZE 16
+
+/*
+ * Entry points implemented in aegis256-aesni-asm.S.  They use SSE registers
+ * and are called from this file between kernel_fpu_begin() and
+ * kernel_fpu_end().
+ *
+ * NOTE(review): the assembly treats "length" as a full 64-bit register
+ * although it is declared unsigned int here; this relies on the compiler
+ * passing it zero-extended.
+ */
+
+/* Build the initial state from a 32-byte key and a 32-byte IV. */
+asmlinkage void crypto_aegis256_aesni_init(void *state, void *key, void *iv);
+
+/* Absorb the whole 16-byte blocks of associated data. */
+asmlinkage void crypto_aegis256_aesni_ad(
+               void *state, unsigned int length, const void *data);
+
+/* Encrypt/decrypt the whole 16-byte blocks of src into dst. */
+asmlinkage void crypto_aegis256_aesni_enc(
+               void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis256_aesni_dec(
+               void *state, unsigned int length, const void *src, void *dst);
+
+/* Encrypt/decrypt a final block of fewer than 16 bytes. */
+asmlinkage void crypto_aegis256_aesni_enc_tail(
+               void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis256_aesni_dec_tail(
+               void *state, unsigned int length, const void *src, void *dst);
+
+/*
+ * Absorb the length block and xor the resulting tag into the 16 bytes at
+ * tag_xor.
+ *
+ * The parameter order and width follow the assembly: the third argument is
+ * the associated-data length, the fourth the message length, and both are
+ * read as full 64-bit registers.  Declaring them u64 makes the compiler
+ * zero-extend the 32-bit lengths passed by the caller instead of leaving the
+ * upper register halves unspecified.
+ */
+asmlinkage void crypto_aegis256_aesni_final(
+               void *state, void *tag_xor, u64 assoclen, u64 cryptlen);
+
+/* One 16-byte AEGIS word, aligned to AEGIS256_BLOCK_ALIGN. */
+struct aegis_block {
+       u8 bytes[AEGIS256_BLOCK_SIZE] __aligned(AEGIS256_BLOCK_ALIGN);
+};
+
+/* The cipher state handed to the assembly: AEGIS256_STATE_BLOCKS words. */
+struct aegis_state {
+       struct aegis_block blocks[AEGIS256_STATE_BLOCKS];
+};
+
+/*
+ * Per-transform context: the raw key.
+ *
+ * An AEGIS-256 key is AEGIS256_KEY_SIZE (32) bytes, i.e. two 16-byte blocks,
+ * so a single struct aegis_block is too small for it: setkey copies
+ * AEGIS256_KEY_SIZE bytes into key.bytes and the init assembly reads both
+ * halves back.  The buffer keeps the "key.bytes" spelling used by the
+ * callers and stays block-aligned because the assembly loads it with
+ * aligned moves.
+ */
+struct aegis_ctx {
+       struct {
+               u8 bytes[AEGIS256_KEY_SIZE] __aligned(AEGIS256_BLOCK_ALIGN);
+       } key;
+};
+
+/*
+ * The operations that differ between encryption and decryption, so that
+ * both directions can share one code path.
+ */
+struct aegis_crypt_ops {
+       /* starts the walk over the request's payload */
+       int (*skcipher_walk_init)(struct skcipher_walk *walk,
+                                 struct aead_request *req, bool atomic);
+
+       /* processes the whole 16-byte blocks of a chunk */
+       void (*crypt_blocks)(void *state, unsigned int length, const void *src,
+                            void *dst);
+       /* processes a trailing chunk shorter than one block */
+       void (*crypt_tail)(void *state, unsigned int length, const void *src,
+                          void *dst);
+};
+
+/*
+ * Feed the associated data of a request into the AEGIS-256 state.
+ *
+ * The scatterlist is walked segment by segment.  Whole 16-byte blocks are
+ * handed to the assembly straight from the mapped segment; bytes that do not
+ * fill a block are collected in a stack buffer, which is completed from the
+ * next segment or zero-padded and absorbed once the data ends.
+ *
+ * @state:    AEGIS state to update
+ * @sg_src:   scatterlist holding the associated data
+ * @assoclen: number of associated-data bytes
+ */
+static void crypto_aegis256_aesni_process_ad(
+               struct aegis_state *state, struct scatterlist *sg_src,
+               unsigned int assoclen)
+{
+       struct scatter_walk sgwalk;
+       struct aegis_block partial;
+       unsigned int buffered = 0;
+
+       scatterwalk_start(&sgwalk, sg_src);
+
+       while (assoclen != 0) {
+               unsigned int seg_len = scatterwalk_clamp(&sgwalk, assoclen);
+               unsigned int remaining = seg_len;
+               void *vaddr = scatterwalk_map(&sgwalk);
+               const u8 *cursor = vaddr;
+
+               if (buffered + seg_len >= AEGIS256_BLOCK_SIZE) {
+                       unsigned int whole;
+
+                       if (buffered != 0) {
+                               /* top up the carried-over block first */
+                               unsigned int need =
+                                       AEGIS256_BLOCK_SIZE - buffered;
+
+                               memcpy(&partial.bytes[buffered], cursor, need);
+                               crypto_aegis256_aesni_ad(state,
+                                                        AEGIS256_BLOCK_SIZE,
+                                                        partial.bytes);
+                               buffered = 0;
+                               remaining -= need;
+                               cursor += need;
+                       }
+
+                       /* the assembly consumes whole blocks only */
+                       crypto_aegis256_aesni_ad(state, remaining, cursor);
+
+                       whole = remaining & ~(AEGIS256_BLOCK_SIZE - 1);
+                       cursor += whole;
+                       remaining -= whole;
+               }
+
+               /* keep the sub-block remainder for the next segment */
+               memcpy(&partial.bytes[buffered], cursor, remaining);
+               buffered += remaining;
+               assoclen -= seg_len;
+
+               scatterwalk_unmap(vaddr);
+               scatterwalk_advance(&sgwalk, seg_len);
+               scatterwalk_done(&sgwalk, 0, assoclen);
+       }
+
+       if (buffered != 0) {
+               /* zero-pad and absorb the final partial block */
+               memset(&partial.bytes[buffered], 0,
+                      AEGIS256_BLOCK_SIZE - buffered);
+               crypto_aegis256_aesni_ad(state, AEGIS256_BLOCK_SIZE,
+                                        partial.bytes);
+       }
+}
+
+/*
+ * Encrypt or decrypt the payload of a request, updating the AEGIS state.
+ *
+ * @state: AEGIS state to update
+ * @req:   request whose payload is walked
+ * @ops:   direction-specific walk-init, block and tail routines
+ *
+ * This runs between kernel_fpu_begin() and kernel_fpu_end(), so the walk
+ * must not sleep and is started in atomic mode.
+ *
+ * A walk step may return a byte count that is not a multiple of the block
+ * size before the data ends (e.g. at a scatterlist boundary).  Only the
+ * whole blocks of such a step are processed; the remainder is handed back
+ * to the walk, which presents it again together with the following data.
+ * The tail routine pads its input as the final block, so it is invoked only
+ * once fewer than AEGIS256_BLOCK_SIZE bytes are left in total.
+ */
+static void crypto_aegis256_aesni_process_crypt(
+               struct aegis_state *state, struct aead_request *req,
+               const struct aegis_crypt_ops *ops)
+{
+       struct skcipher_walk walk;
+
+       ops->skcipher_walk_init(&walk, req, true);
+
+       while (walk.nbytes >= AEGIS256_BLOCK_SIZE) {
+               ops->crypt_blocks(state,
+                                 walk.nbytes & ~(AEGIS256_BLOCK_SIZE - 1),
+                                 walk.src.virt.addr, walk.dst.virt.addr);
+
+               /* report the unprocessed sub-block remainder to the walk */
+               skcipher_walk_done(&walk,
+                                  walk.nbytes & (AEGIS256_BLOCK_SIZE - 1));
+       }
+
+       if (walk.nbytes) {
+               ops->crypt_tail(state, walk.nbytes, walk.src.virt.addr,
+                               walk.dst.virt.addr);
+               skcipher_walk_done(&walk, 0);
+       }
+}
+
+/*
+ * Return the transform's context, rounded up to the alignment of struct
+ * aegis_ctx (cra_ctxsize reserves the extra room for this).
+ */
+static struct aegis_ctx *crypto_aegis256_aesni_ctx(struct crypto_aead *aead)
+{
+       u8 *raw = crypto_aead_ctx(aead);
+
+       return (void *)PTR_ALIGN(raw, __alignof__(struct aegis_ctx));
+}
+
+/*
+ * Store the key in the transform context.
+ *
+ * Returns 0 on success.  For any key length other than AEGIS256_KEY_SIZE
+ * it flags CRYPTO_TFM_RES_BAD_KEY_LEN on the transform and returns -EINVAL.
+ *
+ * NOTE(review): AEGIS256_KEY_SIZE bytes are copied into ctx->key.bytes, so
+ * that buffer must be at least that large.
+ */
+static int crypto_aegis256_aesni_setkey(struct crypto_aead *aead,
+                                       const u8 *key, unsigned int keylen)
+{
+       struct aegis_ctx *actx;
+
+       if (keylen != AEGIS256_KEY_SIZE) {
+               crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+               return -EINVAL;
+       }
+
+       actx = crypto_aegis256_aesni_ctx(aead);
+       memcpy(actx->key.bytes, key, AEGIS256_KEY_SIZE);
+
+       return 0;
+}
+
+/*
+ * Accept tag sizes from AEGIS256_MIN_AUTH_SIZE to AEGIS256_MAX_AUTH_SIZE
+ * bytes (inclusive); anything else yields -EINVAL.
+ */
+static int crypto_aegis256_aesni_setauthsize(struct crypto_aead *tfm,
+                                               unsigned int authsize)
+{
+       if (authsize < AEGIS256_MIN_AUTH_SIZE ||
+           authsize > AEGIS256_MAX_AUTH_SIZE)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Run one complete AEGIS-256 pass over a request: initialise the state from
+ * key and IV, absorb the associated data, process the payload with @ops and
+ * xor the computed tag into @tag_xor.
+ *
+ * @req:      the AEAD request
+ * @tag_xor:  16-byte block the tag is xor-ed into
+ * @cryptlen: payload length in bytes, without the tag
+ * @ops:      encryption or decryption routines
+ *
+ * Everything between kernel_fpu_begin() and kernel_fpu_end() may use the
+ * SSE registers.
+ */
+static void crypto_aegis256_aesni_crypt(struct aead_request *req,
+                                       struct aegis_block *tag_xor,
+                                       unsigned int cryptlen,
+                                       const struct aegis_crypt_ops *ops)
+{
+       struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+       struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(tfm);
+       struct aegis_state state;
+
+       kernel_fpu_begin();
+
+       crypto_aegis256_aesni_init(&state, ctx->key.bytes, req->iv);
+       crypto_aegis256_aesni_process_ad(&state, req->src, req->assoclen);
+       crypto_aegis256_aesni_process_crypt(&state, req, ops);
+       /* argument order: associated-data length first, then payload length */
+       crypto_aegis256_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
+
+       kernel_fpu_end();
+}
+
+/*
+ * Encrypt a request and append the authentication tag.
+ *
+ * The tag block starts out as zeros, so after the xor performed by the
+ * final step it holds the tag itself; its first authsize bytes are written
+ * to the destination right behind the associated data and the ciphertext.
+ * Always returns 0.
+ */
+static int crypto_aegis256_aesni_encrypt(struct aead_request *req)
+{
+       static const struct aegis_crypt_ops enc_ops = {
+               .skcipher_walk_init = skcipher_walk_aead_encrypt,
+               .crypt_blocks = crypto_aegis256_aesni_enc,
+               .crypt_tail = crypto_aegis256_aesni_enc_tail,
+       };
+
+       struct aegis_block computed = {};
+       unsigned int taglen = crypto_aead_authsize(crypto_aead_reqtfm(req));
+       unsigned int msglen = req->cryptlen;
+
+       crypto_aegis256_aesni_crypt(req, &computed, msglen, &enc_ops);
+
+       scatterwalk_map_and_copy(computed.bytes, req->dst,
+                                req->assoclen + msglen, taglen, 1);
+
+       return 0;
+}
+
+/*
+ * Decrypt a request and verify its authentication tag.
+ *
+ * The received tag is read from behind the ciphertext and the computed tag
+ * is xor-ed into it; the result is all zeros exactly when both agree.  The
+ * comparison uses crypto_memneq().
+ *
+ * Returns 0 when the tag matches and -EBADMSG otherwise.
+ */
+static int crypto_aegis256_aesni_decrypt(struct aead_request *req)
+{
+       static const struct aegis_block all_zero = {};
+
+       static const struct aegis_crypt_ops dec_ops = {
+               .skcipher_walk_init = skcipher_walk_aead_decrypt,
+               .crypt_blocks = crypto_aegis256_aesni_dec,
+               .crypt_tail = crypto_aegis256_aesni_dec_tail,
+       };
+
+       struct aegis_block received;
+       unsigned int taglen = crypto_aead_authsize(crypto_aead_reqtfm(req));
+       unsigned int msglen = req->cryptlen - taglen;
+
+       scatterwalk_map_and_copy(received.bytes, req->src,
+                                req->assoclen + msglen, taglen, 0);
+
+       crypto_aegis256_aesni_crypt(req, &received, msglen, &dec_ops);
+
+       if (crypto_memneq(received.bytes, all_zero.bytes, taglen))
+               return -EBADMSG;
+
+       return 0;
+}
+
+/* Transform constructor of the internal algorithm: nothing to set up. */
+static int crypto_aegis256_aesni_init_tfm(struct crypto_aead *aead)
+{
+       return 0;
+}
+
+/* Transform destructor of the internal algorithm: nothing to release. */
+static void crypto_aegis256_aesni_exit_tfm(struct crypto_aead *aead)
+{
+}
+
+/* Forward the key to the cryptd transform stored in the context. */
+static int cryptd_aegis256_aesni_setkey(struct crypto_aead *aead,
+                                       const u8 *key, unsigned int keylen)
+{
+       struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+       return crypto_aead_setkey(&(*slot)->base, key, keylen);
+}
+
+/* Forward the tag size to the cryptd transform stored in the context. */
+static int cryptd_aegis256_aesni_setauthsize(struct crypto_aead *aead,
+                                            unsigned int authsize)
+{
+       struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+       return crypto_aead_setauthsize(&(*slot)->base, authsize);
+}
+
+/*
+ * Encrypt through the wrapper algorithm.
+ *
+ * The request is redirected to the cryptd child transform when the FPU is
+ * usable and either we are not in atomic context or cryptd has nothing
+ * queued; otherwise it goes to the cryptd transform itself.
+ */
+static int cryptd_aegis256_aesni_encrypt(struct aead_request *req)
+{
+       struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+       struct cryptd_aead **slot = crypto_aead_ctx(tfm);
+       struct cryptd_aead *cryptd = *slot;
+       struct crypto_aead *target;
+
+       if (irq_fpu_usable() &&
+           (!in_atomic() || !cryptd_aead_queued(cryptd)))
+               target = cryptd_aead_child(cryptd);
+       else
+               target = &cryptd->base;
+
+       aead_request_set_tfm(req, target);
+
+       return crypto_aead_encrypt(req);
+}
+
+/*
+ * Decrypt through the wrapper algorithm.
+ *
+ * The request is redirected to the cryptd child transform when the FPU is
+ * usable and either we are not in atomic context or cryptd has nothing
+ * queued; otherwise it goes to the cryptd transform itself.
+ */
+static int cryptd_aegis256_aesni_decrypt(struct aead_request *req)
+{
+       struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+       struct cryptd_aead **slot = crypto_aead_ctx(tfm);
+       struct cryptd_aead *cryptd = *slot;
+       struct crypto_aead *target;
+
+       if (irq_fpu_usable() &&
+           (!in_atomic() || !cryptd_aead_queued(cryptd)))
+               target = cryptd_aead_child(cryptd);
+       else
+               target = &cryptd->base;
+
+       aead_request_set_tfm(req, target);
+
+       return crypto_aead_decrypt(req);
+}
+
+/*
+ * Transform constructor of the wrapper algorithm: allocate a cryptd
+ * transform around the internal "__aegis256-aesni" driver, remember it in
+ * the context and adopt its request size.
+ *
+ * Returns 0 on success or the error reported by cryptd_alloc_aead().
+ */
+static int cryptd_aegis256_aesni_init_tfm(struct crypto_aead *aead)
+{
+       struct cryptd_aead **slot = crypto_aead_ctx(aead);
+       struct cryptd_aead *inner;
+
+       inner = cryptd_alloc_aead("__aegis256-aesni", CRYPTO_ALG_INTERNAL,
+                                 CRYPTO_ALG_INTERNAL);
+       if (IS_ERR(inner))
+               return PTR_ERR(inner);
+
+       *slot = inner;
+       crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&inner->base));
+
+       return 0;
+}
+
+/* Transform destructor of the wrapper algorithm: free the cryptd transform. */
+static void cryptd_aegis256_aesni_exit_tfm(struct crypto_aead *aead)
+{
+       struct cryptd_aead **slot = crypto_aead_ctx(aead);
+       struct cryptd_aead *inner = *slot;
+
+       cryptd_free_aead(inner);
+}
+
+/*
+ * The two algorithms registered by this module:
+ *   [0] the internal implementation ("__aegis256-aesni") that calls the
+ *       assembly directly, and
+ *   [1] the wrapper ("aegis256-aesni") that forwards every operation to a
+ *       cryptd transform allocated around [0].
+ */
+static struct aead_alg crypto_aegis256_aesni_alg[] = {
+       {
+               .setkey = crypto_aegis256_aesni_setkey,
+               .setauthsize = crypto_aegis256_aesni_setauthsize,
+               .encrypt = crypto_aegis256_aesni_encrypt,
+               .decrypt = crypto_aegis256_aesni_decrypt,
+               .init = crypto_aegis256_aesni_init_tfm,
+               .exit = crypto_aegis256_aesni_exit_tfm,
+
+               .ivsize = AEGIS256_NONCE_SIZE,
+               .maxauthsize = AEGIS256_MAX_AUTH_SIZE,
+               .chunksize = AEGIS256_BLOCK_SIZE,
+
+               .base = {
+                       .cra_flags = CRYPTO_ALG_INTERNAL,
+                       .cra_blocksize = 1,
+                       /*
+                        * extra room so that crypto_aegis256_aesni_ctx()
+                        * can align the context itself
+                        */
+                       .cra_ctxsize = sizeof(struct aegis_ctx) +
+                               __alignof__(struct aegis_ctx),
+                       .cra_alignmask = 0,
+
+                       .cra_name = "__aegis256",
+                       .cra_driver_name = "__aegis256-aesni",
+
+                       .cra_module = THIS_MODULE,
+               }
+       }, {
+               .setkey = cryptd_aegis256_aesni_setkey,
+               .setauthsize = cryptd_aegis256_aesni_setauthsize,
+               .encrypt = cryptd_aegis256_aesni_encrypt,
+               .decrypt = cryptd_aegis256_aesni_decrypt,
+               .init = cryptd_aegis256_aesni_init_tfm,
+               .exit = cryptd_aegis256_aesni_exit_tfm,
+
+               .ivsize = AEGIS256_NONCE_SIZE,
+               .maxauthsize = AEGIS256_MAX_AUTH_SIZE,
+               .chunksize = AEGIS256_BLOCK_SIZE,
+
+               .base = {
+                       .cra_flags = CRYPTO_ALG_ASYNC,
+                       .cra_blocksize = 1,
+                       /* the context holds only the cryptd transform pointer */
+                       .cra_ctxsize = sizeof(struct cryptd_aead *),
+                       .cra_alignmask = 0,
+
+                       .cra_priority = 400,
+
+                       .cra_name = "aegis256",
+                       .cra_driver_name = "aegis256-aesni",
+
+                       .cra_module = THIS_MODULE,
+               }
+       }
+};
+
+/*
+ * CPU features this driver is advertised for.  The table feeds
+ * MODULE_DEVICE_TABLE() below and is passed to x86_match_cpu() at module
+ * init.
+ *
+ * NOTE(review): x86_match_cpu() reports a match as soon as any single entry
+ * matches, so two entries express "AES or XMM2", not "AES and XMM2" --
+ * confirm that the init-time check does not rely on this table alone.
+ */
+static const struct x86_cpu_id aesni_cpu_id[] = {
+       X86_FEATURE_MATCH(X86_FEATURE_AES),
+       X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+       {}
+};
+MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
+
+/*
+ * Module entry point: register the AEAD algorithms, but only on CPUs that
+ * provide AES-NI *and* SSE2.
+ *
+ * x86_match_cpu() succeeds as soon as ANY entry of the table matches, so on
+ * its own it would accept a CPU that has SSE2 but lacks AES-NI and the
+ * algorithms would be registered on hardware that cannot execute them.
+ * Each required feature is therefore also tested explicitly.
+ *
+ * Returns 0 on success, -ENODEV when a required CPU feature is missing, or
+ * the error returned by crypto_register_aeads().
+ */
+static int __init crypto_aegis256_aesni_module_init(void)
+{
+       if (!x86_match_cpu(aesni_cpu_id) ||
+           !boot_cpu_has(X86_FEATURE_AES) ||
+           !boot_cpu_has(X86_FEATURE_XMM2))
+               return -ENODEV;
+
+       return crypto_register_aeads(crypto_aegis256_aesni_alg,
+                                    ARRAY_SIZE(crypto_aegis256_aesni_alg));
+}
+
+/* Module exit point: unregister every entry of crypto_aegis256_aesni_alg. */
+static void __exit crypto_aegis256_aesni_module_exit(void)
+{
+       crypto_unregister_aeads(crypto_aegis256_aesni_alg,
+                               ARRAY_SIZE(crypto_aegis256_aesni_alg));
+}
+
+/* Hook the init/exit functions into the module loader. */
+module_init(crypto_aegis256_aesni_module_init);
+module_exit(crypto_aegis256_aesni_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosna...@gmail.com>");
+MODULE_DESCRIPTION("AEGIS-256 AEAD algorithm -- AESNI+SSE2 implementation");
+/* Aliases for the algorithm name and for the driver name. */
+MODULE_ALIAS_CRYPTO("aegis256");
+MODULE_ALIAS_CRYPTO("aegis256-aesni");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 48856238a490..d8d123ea47c6 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -310,6 +310,30 @@ config CRYPTO_AEGIS256
        help
         Support for the AEGIS-256 dedicated AEAD algorithm.
 
+config CRYPTO_AEGIS128_AESNI_SSE2
+       tristate "AEGIS-128 AEAD algorithm (x86_64 AESNI+SSE2 implementation)"
+       depends on X86 && 64BIT
+       select CRYPTO_AEAD
+       select CRYPTO_CRYPTD
+       help
+        AESNI+SSE2 implementation of the AEGIS-128 dedicated AEAD algorithm.
+
+config CRYPTO_AEGIS128L_AESNI_SSE2
+       tristate "AEGIS-128L AEAD algorithm (x86_64 AESNI+SSE2 implementation)"
+       depends on X86 && 64BIT
+       select CRYPTO_AEAD
+       select CRYPTO_CRYPTD
+       help
+        AESNI+SSE2 implementation of the AEGIS-128L dedicated AEAD algorithm.
+
+config CRYPTO_AEGIS256_AESNI_SSE2
+       tristate "AEGIS-256 AEAD algorithm (x86_64 AESNI+SSE2 implementation)"
+       depends on X86 && 64BIT
+       select CRYPTO_AEAD
+       select CRYPTO_CRYPTD
+       help
+        AESNI+SSE2 implementation of the AEGIS-256 dedicated AEAD algorithm.
+
 config CRYPTO_SEQIV
        tristate "Sequence Number IV Generator"
        select CRYPTO_AEAD
-- 
2.17.0

Reply via email to