Module Name:	src
Committed By:	riastradh
Date:		Mon Jun 29 23:29:40 UTC 2020
Modified Files:
	src/sys/arch/x86/conf: files.x86
	src/sys/arch/x86/x86: identcpu.c
Added Files:
	src/sys/crypto/aes/arch/x86: aes_ni.c aes_ni.h aes_ni_64.S files.aesni

Log Message:
Add x86 AES-NI support.

Limited to amd64 for now.  In principle, AES-NI should work in 32-bit
mode, and there may even be some 32-bit-only CPUs that support AES-NI,
but that requires work to adapt the assembly.

To generate a diff of this commit:
cvs rdiff -u -r1.111 -r1.112 src/sys/arch/x86/conf/files.x86
cvs rdiff -u -r1.107 -r1.108 src/sys/arch/x86/x86/identcpu.c
cvs rdiff -u -r0 -r1.1 src/sys/crypto/aes/arch/x86/aes_ni.c \
    src/sys/crypto/aes/arch/x86/aes_ni.h \
    src/sys/crypto/aes/arch/x86/aes_ni_64.S \
    src/sys/crypto/aes/arch/x86/files.aesni

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
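The new code is reached from cpu_probe() only when CPUID leaf 1 advertises
AES-NI (CPUID2_AES, %ecx bit 25).  A minimal userland sketch of the same
feature test, for illustration only and not part of this change (main() and
the use of GCC/Clang's <cpuid.h> are assumptions of the sketch; CPUID2_AES
mirrors the kernel's definition in x86/specialreg.h):

#include <cpuid.h>		/* GCC/Clang wrapper for the CPUID instruction */
#include <stdio.h>

#define CPUID2_AES	(1u << 25)	/* CPUID.1:%ecx bit 25 */

int
main(void)
{
	unsigned eax, ebx, ecx, edx;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;	/* CPUID leaf 1 not available */
	printf("AES-NI: %s\n", (ecx & CPUID2_AES) ? "present" : "absent");
	return 0;
}
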
Modified files: Index: src/sys/arch/x86/conf/files.x86 diff -u src/sys/arch/x86/conf/files.x86:1.111 src/sys/arch/x86/conf/files.x86:1.112 --- src/sys/arch/x86/conf/files.x86:1.111 Wed May 6 19:45:12 2020 +++ src/sys/arch/x86/conf/files.x86 Mon Jun 29 23:29:39 2020 @@ -1,4 +1,4 @@ -# $NetBSD: files.x86,v 1.111 2020/05/06 19:45:12 bouyer Exp $ +# $NetBSD: files.x86,v 1.112 2020/06/29 23:29:39 riastradh Exp $ # options for MP configuration through the MP spec defflag opt_mpbios.h MPBIOS MPDEBUG MPBIOS_SCANPCI @@ -165,3 +165,6 @@ file arch/x86/pci/pciide_machdep.c pciid file arch/x86/pci/pci_bus_fixup.c pci_bus_fixup file arch/x86/pci/pci_addr_fixup.c pci_addr_fixup + +# AES-NI +include "crypto/aes/arch/x86/files.aesni" Index: src/sys/arch/x86/x86/identcpu.c diff -u src/sys/arch/x86/x86/identcpu.c:1.107 src/sys/arch/x86/x86/identcpu.c:1.108 --- src/sys/arch/x86/x86/identcpu.c:1.107 Sat Apr 25 15:26:18 2020 +++ src/sys/arch/x86/x86/identcpu.c Mon Jun 29 23:29:39 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: identcpu.c,v 1.107 2020/04/25 15:26:18 bouyer Exp $ */ +/* $NetBSD: identcpu.c,v 1.108 2020/06/29 23:29:39 riastradh Exp $ */ /*- * Copyright (c) 1999, 2000, 2001, 2006, 2007, 2008 The NetBSD Foundation, Inc. @@ -30,7 +30,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.107 2020/04/25 15:26:18 bouyer Exp $"); +__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.108 2020/06/29 23:29:39 riastradh Exp $"); #include "opt_xen.h" @@ -39,6 +39,8 @@ __KERNEL_RCSID(0, "$NetBSD: identcpu.c,v #include <sys/device.h> #include <sys/cpu.h> +#include <crypto/aes/arch/x86/aes_ni.h> + #include <uvm/uvm_extern.h> #include <machine/specialreg.h> @@ -995,6 +997,10 @@ cpu_probe(struct cpu_info *ci) /* Early patch of text segment. */ x86_patch(true); #endif +#ifdef __x86_64__ /* not yet implemented on i386 */ + if (cpu_feature[1] & CPUID2_AES) + aes_md_init(&aes_ni_impl); +#endif } else { /* * If not first. Warn about cpu_feature mismatch for Added files: Index: src/sys/crypto/aes/arch/x86/aes_ni.c diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_ni.c:1.1 --- /dev/null Mon Jun 29 23:29:40 2020 +++ src/sys/crypto/aes/arch/x86/aes_ni.c Mon Jun 29 23:29:40 2020 @@ -0,0 +1,252 @@ +/* $NetBSD: aes_ni.c,v 1.1 2020/06/29 23:29:40 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(1, "$NetBSD: aes_ni.c,v 1.1 2020/06/29 23:29:40 riastradh Exp $"); + +#include <sys/types.h> +#include <sys/systm.h> + +#include <crypto/aes/aes.h> +#include <crypto/aes/arch/x86/aes_ni.h> + +#include <x86/cpuvar.h> +#include <x86/fpu.h> +#include <x86/specialreg.h> + +static void +aesni_setenckey(struct aesenc *enc, const uint8_t key[static 16], + uint32_t nrounds) +{ + + switch (nrounds) { + case 10: + aesni_setenckey128(enc, key); + break; + case 12: + aesni_setenckey192(enc, key); + break; + case 14: + aesni_setenckey256(enc, key); + break; + default: + panic("invalid AES rounds: %u", nrounds); + } +} + +static void +aesni_setenckey_impl(struct aesenc *enc, const uint8_t key[static 16], + uint32_t nrounds) +{ + + fpu_kern_enter(); + aesni_setenckey(enc, key, nrounds); + fpu_kern_leave(); +} + +static void +aesni_setdeckey_impl(struct aesdec *dec, const uint8_t key[static 16], + uint32_t nrounds) +{ + struct aesenc enc; + + fpu_kern_enter(); + aesni_setenckey(&enc, key, nrounds); + aesni_enctodec(&enc, dec, nrounds); + fpu_kern_leave(); + + explicit_memset(&enc, 0, sizeof enc); +} + +static void +aesni_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + + fpu_kern_enter(); + aesni_enc(enc, in, out, nrounds); + fpu_kern_leave(); +} + +static void +aesni_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + + fpu_kern_enter(); + aesni_dec(dec, in, out, nrounds); + fpu_kern_leave(); +} + +static void +aesni_cbc_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + + KASSERT(nbytes % 16 == 0); + + fpu_kern_enter(); + aesni_cbc_enc(enc, in, out, nbytes, iv, nrounds); + fpu_kern_leave(); +} + +static void +aesni_cbc_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + + KASSERT(nbytes % 16 == 0); + + fpu_kern_enter(); + + if (nbytes % 128) { + aesni_cbc_dec1(dec, in, out, nbytes % 128, iv, nrounds); + in += nbytes % 128; + out += nbytes % 128; + nbytes -= nbytes % 128; + } + + KASSERT(nbytes % 128 == 0); + if (nbytes) + aesni_cbc_dec8(dec, in, out, nbytes, iv, nrounds); + + fpu_kern_leave(); +} + +static void +aesni_xts_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + + KASSERT(nbytes % 16 == 0); + + fpu_kern_enter(); + + if (nbytes % 128) { + aesni_xts_enc1(enc, in, out, nbytes % 128, iv, nrounds); + in += nbytes % 128; + out += nbytes % 128; + nbytes -= nbytes % 128; + } + + KASSERT(nbytes % 128 == 0); + if (nbytes) + aesni_xts_enc8(enc, in, out, nbytes, iv, nrounds); + + fpu_kern_leave(); +} + +static void +aesni_xts_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 
16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + + KASSERT(nbytes % 16 == 0); + + fpu_kern_enter(); + + if (nbytes % 128) { + aesni_xts_dec1(dec, in, out, nbytes % 128, iv, nrounds); + in += nbytes % 128; + out += nbytes % 128; + nbytes -= nbytes % 128; + } + + KASSERT(nbytes % 128 == 0); + if (nbytes) + aesni_xts_dec8(dec, in, out, nbytes, iv, nrounds); + + fpu_kern_leave(); +} + +static int +aesni_xts_update_selftest(void) +{ + static const struct { + uint8_t in[16], out[16]; + } cases[] = { + {{1}, {2}}, + {{0,0,0,0x80}, {0,0,0,0,1}}, + {{0,0,0,0,0,0,0,0x80}, {0,0,0,0,0,0,0,0,1}}, + {{0,0,0,0x80,0,0,0,0x80}, {0,0,0,0,1,0,0,0,1}}, + {{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0x80}, {0x87}}, + {{0,0,0,0,0,0,0,0x80,0,0,0,0,0,0,0,0x80}, + {0x87,0,0,0,0,0,0,0,1}}, + {{0,0,0,0x80,0,0,0,0,0,0,0,0,0,0,0,0x80}, {0x87,0,0,0,1}}, + {{0,0,0,0x80,0,0,0,0x80,0,0,0,0,0,0,0,0x80}, + {0x87,0,0,0,1,0,0,0,1}}, + }; + unsigned i; + uint8_t tweak[16]; + + for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { + aesni_xts_update(cases[i].in, tweak); + if (memcmp(tweak, cases[i].out, 16)) + return -1; + } + + /* Success! */ + return 0; +} + +static int +aesni_probe(void) +{ + int result = 0; + + /* Verify that the CPU supports AES-NI. */ + if ((cpu_feature[1] & CPUID2_AES) == 0) + return -1; + + fpu_kern_enter(); + + /* Verify that our XTS tweak update logic works. */ + if (aesni_xts_update_selftest()) + result = -1; + + fpu_kern_leave(); + + return result; +} + +struct aes_impl aes_ni_impl = { + .ai_name = "Intel AES-NI", + .ai_probe = aesni_probe, + .ai_setenckey = aesni_setenckey_impl, + .ai_setdeckey = aesni_setdeckey_impl, + .ai_enc = aesni_enc_impl, + .ai_dec = aesni_dec_impl, + .ai_cbc_enc = aesni_cbc_enc_impl, + .ai_cbc_dec = aesni_cbc_dec_impl, + .ai_xts_enc = aesni_xts_enc_impl, + .ai_xts_dec = aesni_xts_dec_impl, +}; Index: src/sys/crypto/aes/arch/x86/aes_ni.h diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_ni.h:1.1 --- /dev/null Mon Jun 29 23:29:40 2020 +++ src/sys/crypto/aes/arch/x86/aes_ni.h Mon Jun 29 23:29:40 2020 @@ -0,0 +1,68 @@ +/* $NetBSD: aes_ni.h,v 1.1 2020/06/29 23:29:40 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _CRYPTO_AES_ARCH_X86_AES_NI_H +#define _CRYPTO_AES_ARCH_X86_AES_NI_H + +#include <sys/types.h> + +#include <crypto/aes/aes.h> + +/* Assembly routines */ + +void aesni_setenckey128(struct aesenc *, const uint8_t[static 16]); +void aesni_setenckey192(struct aesenc *, const uint8_t[static 24]); +void aesni_setenckey256(struct aesenc *, const uint8_t[static 32]); + +void aesni_enctodec(const struct aesenc *, struct aesdec *, uint32_t); + +void aesni_enc(const struct aesenc *, const uint8_t[static 16], + uint8_t[static 16], uint32_t); +void aesni_dec(const struct aesdec *, const uint8_t[static 16], + uint8_t[static 16], uint32_t); + +void aesni_cbc_enc(const struct aesenc *, const uint8_t[static 16], + uint8_t[static 16], size_t, uint8_t[static 16], uint32_t); +void aesni_cbc_dec1(const struct aesdec *, const uint8_t[static 16], + uint8_t[static 16], size_t, const uint8_t[static 16], uint32_t); +void aesni_cbc_dec8(const struct aesdec *, const uint8_t[static 128], + uint8_t[static 128], size_t, const uint8_t[static 16], uint32_t); + +void aesni_xts_enc1(const struct aesenc *, const uint8_t[static 16], + uint8_t[static 16], size_t, uint8_t[static 16], uint32_t); +void aesni_xts_enc8(const struct aesenc *, const uint8_t[static 128], + uint8_t[static 128], size_t, uint8_t[static 16], uint32_t); +void aesni_xts_dec1(const struct aesdec *, const uint8_t[static 16], + uint8_t[static 16], size_t, uint8_t[static 16], uint32_t); +void aesni_xts_dec8(const struct aesdec *, const uint8_t[static 128], + uint8_t[static 128], size_t, uint8_t[static 16], uint32_t); +void aesni_xts_update(const uint8_t[static 16], uint8_t[static 16]); + +extern struct aes_impl aes_ni_impl; + +#endif /* _CRYPTO_AES_ARCH_X86_AES_NI_H */ Index: src/sys/crypto/aes/arch/x86/aes_ni_64.S diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_ni_64.S:1.1 --- /dev/null Mon Jun 29 23:29:40 2020 +++ src/sys/crypto/aes/arch/x86/aes_ni_64.S Mon Jun 29 23:29:40 2020 @@ -0,0 +1,1095 @@ +/* $NetBSD: aes_ni_64.S,v 1.1 2020/06/29 23:29:40 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <machine/asm.h> + +/* + * MOVDQA/MOVDQU are Move Double Quadword (Aligned/Unaligned), defined + * to operate on integers; MOVAPS/MOVUPS are Move (Aligned/Unaligned) + * Packed Single, defined to operate on binary32 floats. They have + * exactly the same architectural effects (move a 128-bit quantity from + * memory into an xmm register). + * + * In principle, they might have different microarchitectural effects + * so that MOVAPS/MOVUPS might incur a penalty when the register is + * later used for integer paths, but in practice they don't. So we use + * the one whose instruction encoding is shorter -- MOVAPS/MOVUPS. + */ +#define movdqa movaps +#define movdqu movups + +/* + * aesni_setenckey128(struct aesenc *enckey@rdi, const uint8_t key[16] @rsi) + * + * Expand a 16-byte AES-128 key into 10 round keys. + * + * Standard ABI calling convention. + */ +ENTRY(aesni_setenckey128) + movdqu (%rsi),%xmm0 /* load master key into %xmm0 */ + movdqa %xmm0,(%rdi) /* store master key as the first round key */ + lea 0x10(%rdi),%rdi /* advance %rdi to next round key */ + aeskeygenassist $0x1,%xmm0,%xmm2 + call aesni_expand128 + aeskeygenassist $0x2,%xmm0,%xmm2 + call aesni_expand128 + aeskeygenassist $0x4,%xmm0,%xmm2 + call aesni_expand128 + aeskeygenassist $0x8,%xmm0,%xmm2 + call aesni_expand128 + aeskeygenassist $0x10,%xmm0,%xmm2 + call aesni_expand128 + aeskeygenassist $0x20,%xmm0,%xmm2 + call aesni_expand128 + aeskeygenassist $0x40,%xmm0,%xmm2 + call aesni_expand128 + aeskeygenassist $0x80,%xmm0,%xmm2 + call aesni_expand128 + aeskeygenassist $0x1b,%xmm0,%xmm2 + call aesni_expand128 + aeskeygenassist $0x36,%xmm0,%xmm2 + call aesni_expand128 + ret +END(aesni_setenckey128) + +/* + * aesni_setenckey192(struct aesenc *enckey@rdi, const uint8_t key[24] @rsi) + * + * Expand a 24-byte AES-192 key into 12 round keys. + * + * Standard ABI calling convention. + */ +ENTRY(aesni_setenckey192) + movdqu (%rsi),%xmm0 /* load master key [0:128) into %xmm0 */ + movq 0x10(%rsi),%xmm1 /* load master key [128:192) into %xmm1 */ + movdqa %xmm0,(%rdi) /* store master key [0:128) as round key */ + lea 0x10(%rdi),%rdi /* advance %rdi to next round key */ + aeskeygenassist $0x1,%xmm1,%xmm2 + call aesni_expand192a + aeskeygenassist $0x2,%xmm0,%xmm2 + call aesni_expand192b + aeskeygenassist $0x4,%xmm1,%xmm2 + call aesni_expand192a + aeskeygenassist $0x8,%xmm0,%xmm2 + call aesni_expand192b + aeskeygenassist $0x10,%xmm1,%xmm2 + call aesni_expand192a + aeskeygenassist $0x20,%xmm0,%xmm2 + call aesni_expand192b + aeskeygenassist $0x40,%xmm1,%xmm2 + call aesni_expand192a + aeskeygenassist $0x80,%xmm0,%xmm2 + call aesni_expand192b + ret +END(aesni_setenckey192) + +/* + * aesni_setenckey256(struct aesenc *enckey@rdi, const uint8_t key[32] @rsi) + * + * Expand a 32-byte AES-256 key into 14 round keys. + * + * Standard ABI calling convention. 
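+ *
+ * The aeskeygenassist immediates below are AES round constants.  They
+ * matter only for the aesni_expand256a steps: aesni_expand256b picks
+ * out the SubWord-only word of the aeskeygenassist result, so the
+ * immediate passed on those steps is irrelevant.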
+ */ +ENTRY(aesni_setenckey256) + movdqu (%rsi),%xmm0 /* load master key [0:128) into %xmm0 */ + movdqu 0x10(%rsi),%xmm1 /* load master key [128:256) into %xmm1 */ + movdqa %xmm0,(%rdi) /* store master key [0:128) as round key */ + movdqa %xmm1,0x10(%rdi) /* store master key [128:256) as round key */ + lea 0x20(%rdi),%rdi /* advance %rdi to next round key */ + aeskeygenassist $0x1,%xmm1,%xmm2 + call aesni_expand256a + aeskeygenassist $0x1,%xmm0,%xmm2 + call aesni_expand256b + aeskeygenassist $0x2,%xmm1,%xmm2 + call aesni_expand256a + aeskeygenassist $0x2,%xmm0,%xmm2 + call aesni_expand256b + aeskeygenassist $0x4,%xmm1,%xmm2 + call aesni_expand256a + aeskeygenassist $0x4,%xmm0,%xmm2 + call aesni_expand256b + aeskeygenassist $0x8,%xmm1,%xmm2 + call aesni_expand256a + aeskeygenassist $0x8,%xmm0,%xmm2 + call aesni_expand256b + aeskeygenassist $0x10,%xmm1,%xmm2 + call aesni_expand256a + aeskeygenassist $0x10,%xmm0,%xmm2 + call aesni_expand256b + aeskeygenassist $0x20,%xmm1,%xmm2 + call aesni_expand256a + aeskeygenassist $0x20,%xmm0,%xmm2 + call aesni_expand256b + aeskeygenassist $0x40,%xmm1,%xmm2 + call aesni_expand256a + ret +END(aesni_setenckey256) + +/* + * aesni_expand128(uint128_t *rkp@rdi, uint128_t prk@xmm0, + * uint128_t keygenassist@xmm2) + * + * 1. Compute the AES-128 round key using the previous round key. + * 2. Store it at *rkp. + * 3. Set %xmm0 to it. + * 4. Advance %rdi to point at the next round key. + * + * Internal ABI. On entry: + * + * %rdi = rkp, pointer to round key to compute + * %xmm0 = (prk[0], prk[1], prk[2], prk[3]) + * %xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])) ^ RCON) + * + * On exit: + * + * %rdi = &rkp[1], rkp advanced by one round key + * %xmm0 = rk, the round key we just computed + * %xmm2 = garbage + * %xmm4 = garbage + * %xmm5 = garbage + * %xmm6 = garbage + * + * Note: %xmm1 is preserved (as are %xmm3 and %xmm7 through %xmm15, + * and all other registers). + */ + .text + _ALIGN_TEXT + .type aesni_expand128,@function +aesni_expand128: + /* + * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]), + * i.e., set each word of %xmm2 to t := Rot(SubWord(prk[3])) ^ RCON. + */ + pshufd $0b11111111,%xmm2,%xmm2 + + /* + * %xmm4 := (0, prk[0], prk[1], prk[2]) + * %xmm5 := (0, 0, prk[0], prk[1]) + * %xmm6 := (0, 0, 0, prk[0]) + */ + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm5 + movdqa %xmm0,%xmm6 + pslldq $4,%xmm4 + pslldq $8,%xmm5 + pslldq $12,%xmm6 + + /* + * %xmm0 := (rk[0] = t ^ prk[0], + * rk[1] = t ^ prk[0] ^ prk[1], + * rk[2] = t ^ prk[0] ^ prk[1] ^ prk[2], + * rk[3] = t ^ prk[0] ^ prk[1] ^ prk[2] ^ prk[3]) + */ + pxor %xmm2,%xmm0 + pxor %xmm4,%xmm0 + pxor %xmm5,%xmm0 + pxor %xmm6,%xmm0 + + movdqa %xmm0,(%rdi) /* store round key */ + lea 0x10(%rdi),%rdi /* advance to next round key address */ + ret +END(aesni_expand128) + +/* + * aesni_expand192a(uint128_t *rkp@rdi, uint128_t prk@xmm0, + * uint64_t rklo@xmm1, uint128_t keygenassist@xmm2) + * + * Set even-numbered AES-192 round key. + * + * Internal ABI. 
On entry: + * + * %rdi = rkp, pointer to two round keys to compute + * %xmm0 = (prk[0], prk[1], prk[2], prk[3]) + * %xmm1 = (rklo[0], rklo[1], xxx, xxx) + * %xmm2 = (xxx, t = Rot(SubWord(rklo[1])) ^ RCON, xxx, xxx) + * + * On exit: + * + * %rdi = &rkp[2], rkp advanced by two round keys + * %xmm0 = nrk, second round key we just computed + * %xmm1 = rk, first round key we just computed + * %xmm2 = garbage + * %xmm4 = garbage + * %xmm5 = garbage + * %xmm6 = garbage + * %xmm7 = garbage + */ + .text + _ALIGN_TEXT + .type aesni_expand192a,@function +aesni_expand192a: + /* + * %xmm2 := (%xmm2[1], %xmm2[1], %xmm2[1], %xmm2[1]), + * i.e., set each word of %xmm2 to t := Rot(SubWord(rklo[1])) ^ RCON. + */ + pshufd $0b01010101,%xmm2,%xmm2 + + /* + * We need to compute: + * + * rk[0] := rklo[0] + * rk[1] := rklo[1] + * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] + * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] + * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2] + * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] + * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0] + * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0] + * ^ rklo[1] + */ + + /* + * %xmm4 := (prk[0], prk[1], prk[2], prk[3]) + * %xmm5 := (0, prk[0], prk[1], prk[2]) + * %xmm6 := (0, 0, prk[0], prk[1]) + * %xmm7 := (0, 0, 0, prk[0]) + */ + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm5 + movdqa %xmm0,%xmm6 + movdqa %xmm0,%xmm7 + pslldq $4,%xmm5 + pslldq $8,%xmm6 + pslldq $12,%xmm7 + + /* %xmm4 := (rk[2], rk[3], nrk[0], nrk[1]) */ + pxor %xmm2,%xmm4 + pxor %xmm5,%xmm4 + pxor %xmm6,%xmm4 + pxor %xmm7,%xmm4 + + /* + * At this point, rk is split across %xmm1 (rk[0],rk[1],...) and + * %xmm4 (rk[2],rk[3],...); nrk is in %xmm4 (...,nrk[0],nrk[1]); + * and we have yet to compute nrk[2] or nrk[3], which requires + * rklo[0] and rklo[1] in %xmm1 (rklo[0], rklo[1], ...). We need + * nrk to end up in %xmm0 at the end, so gather rk into %xmm1 and + * nrk into %xmm0. + */ + + /* %xmm0 := (nrk[0], nrk[1], nrk[1], nrk[1]) */ + pshufd $0b11111110,%xmm4,%xmm0 + + /* + * %xmm6 := (0, 0, rklo[0], rklo[1]) + * %xmm7 := (0, 0, 0, rklo[0]) + */ + movdqa %xmm1,%xmm6 + movdqa %xmm1,%xmm7 + + pslldq $8,%xmm6 + pslldq $12,%xmm7 + + /* + * %xmm0 := (nrk[0], + * nrk[1], + * nrk[2] = nrk[1] ^ rklo[0], + * nrk[3] = nrk[1] ^ rklo[0] ^ rklo[1]) + */ + pxor %xmm6,%xmm0 + pxor %xmm7,%xmm0 + + /* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */ + shufps $0b01000100,%xmm4,%xmm1 + + movdqa %xmm1,(%rdi) /* store round key */ + movdqa %xmm0,0x10(%rdi) /* store next round key */ + lea 0x20(%rdi),%rdi /* advance two round keys */ + ret +END(aesni_expand192a) + +/* + * aesni_expand192b(uint128_t *roundkey@rdi, uint128_t prk@xmm0, + * uint128_t keygenassist@xmm2) + * + * Set odd-numbered AES-192 round key. + * + * Internal ABI. On entry: + * + * %rdi = rkp, pointer to round key to compute + * %xmm0 = (prk[0], prk[1], prk[2], prk[3]) + * %xmm1 = (xxx, xxx, pprk[2], pprk[3]) + * %xmm2 = (xxx, xxx, xxx, t = Rot(Sub(prk[3])) ^ RCON) + * + * On exit: + * + * %rdi = &rkp[1], rkp advanced by one round key + * %xmm0 = rk, the round key we just computed + * %xmm1 = (nrk[0], nrk[1], xxx, xxx), half of next round key + * %xmm2 = garbage + * %xmm4 = garbage + * %xmm5 = garbage + * %xmm6 = garbage + * %xmm7 = garbage + */ + .text + _ALIGN_TEXT + .type aesni_expand192b,@function +aesni_expand192b: + /* + * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]), + * i.e., set each word of %xmm2 to t := Rot(Sub(prk[3])) ^ RCON. 
+ */ + pshufd $0b11111111,%xmm2,%xmm2 + + /* + * We need to compute: + * + * rk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] + * rk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] + * rk[2] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0] + * rk[3] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0] + * ^ prk[1] + * nrk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0] + * ^ prk[1] ^ prk[2] + * nrk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0] + * ^ prk[1] ^ prk[2] ^ prk[3] + */ + + /* %xmm1 := (pprk[2], pprk[3], prk[0], prk[1]) */ + shufps $0b01001110,%xmm0,%xmm1 + + /* + * %xmm5 := (0, pprk[2], pprk[3], prk[0]) + * %xmm6 := (0, 0, pprk[2], pprk[3]) + * %xmm7 := (0, 0, 0, pprk[2]) + */ + movdqa %xmm1,%xmm5 + movdqa %xmm1,%xmm6 + movdqa %xmm1,%xmm7 + pslldq $4,%xmm5 + pslldq $8,%xmm6 + pslldq $12,%xmm7 + + /* %xmm1 := (rk[0], rk[1], rk[2], rk[3) */ + pxor %xmm2,%xmm1 + pxor %xmm5,%xmm1 + pxor %xmm6,%xmm1 + pxor %xmm7,%xmm1 + + /* %xmm4 := (prk[2], prk[3], xxx, xxx) */ + pshufd $0b00001110,%xmm0,%xmm4 + + /* %xmm5 := (0, prk[2], xxx, xxx) */ + movdqa %xmm4,%xmm5 + pslldq $4,%xmm5 + + /* %xmm0 := (rk[0], rk[1], rk[2], rk[3]) */ + movdqa %xmm1,%xmm0 + + /* %xmm1 := (rk[3], rk[3], xxx, xxx) */ + shufps $0b00001111,%xmm1,%xmm1 + + /* + * %xmm1 := (nrk[0] = rk[3] ^ prk[2], + * nrk[1] = rk[3] ^ prk[2] ^ prk[3], + * xxx, + * xxx) + */ + pxor %xmm4,%xmm1 + pxor %xmm5,%xmm1 + + movdqa %xmm0,(%rdi) /* store round key */ + lea 0x10(%rdi),%rdi /* advance to next round key address */ + ret +END(aesni_expand192b) + +/* + * aesni_expand256a(uint128_t *rkp@rdi, uint128_t pprk@xmm0, + * uint128_t prk@xmm1, uint128_t keygenassist@xmm2) + * + * Set even-numbered AES-256 round key. + * + * Internal ABI. On entry: + * + * %rdi = rkp, pointer to round key to compute + * %xmm0 = (pprk[0], pprk[1], pprk[2], pprk[3]) + * %xmm1 = (prk[0], prk[1], prk[2], prk[3]) + * %xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3]))) + * + * On exit: + * + * %rdi = &rkp[1], rkp advanced by one round key + * %xmm0 = rk, the round key we just computed + * %xmm1 = prk, previous round key, preserved from entry + * %xmm2 = garbage + * %xmm4 = garbage + * %xmm5 = garbage + * %xmm6 = garbage + * + * The computation turns out to be the same as for AES-128; the + * previous round key does not figure into it, only the + * previous-previous round key. + */ + aesni_expand256a = aesni_expand128 + +/* + * aesni_expand256b(uint128_t *rkp@rdi, uint128_t prk@xmm0, + * uint128_t pprk@xmm1, uint128_t keygenassist@xmm2) + * + * Set odd-numbered AES-256 round key. + * + * Internal ABI. On entry: + * + * %rdi = rkp, pointer to round key to compute + * %xmm0 = (prk[0], prk[1], prk[2], prk[3]) + * %xmm1 = (pprk[0], pprk[1], pprk[2], pprk[3]) + * %xmm2 = (xxx, xxx, t = Sub(prk[3]), xxx) + * + * On exit: + * + * %rdi = &rkp[1], rkp advanced by one round key + * %xmm0 = prk, previous round key, preserved from entry + * %xmm1 = rk, the round key we just computed + * %xmm2 = garbage + * %xmm4 = garbage + * %xmm5 = garbage + * %xmm6 = garbage + */ + .text + _ALIGN_TEXT + .type aesni_expand256b,@function +aesni_expand256b: + /* + * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]), + * i.e., set each word of %xmm2 to t := Sub(prk[3]). 
+ */ + pshufd $0b10101010,%xmm2,%xmm2 + + /* + * %xmm4 := (0, pprk[0], pprk[1], pprk[2]) + * %xmm5 := (0, 0, pprk[0], pprk[1]) + * %xmm6 := (0, 0, 0, pprk[0]) + */ + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm5 + movdqa %xmm1,%xmm6 + pslldq $4,%xmm4 + pslldq $8,%xmm5 + pslldq $12,%xmm6 + + /* + * %xmm0 := (rk[0] = t ^ pprk[0], + * rk[1] = t ^ pprk[0] ^ pprk[1], + * rk[2] = t ^ pprk[0] ^ pprk[1] ^ pprk[2], + * rk[3] = t ^ pprk[0] ^ pprk[1] ^ pprk[2] ^ pprk[3]) + */ + pxor %xmm2,%xmm1 + pxor %xmm4,%xmm1 + pxor %xmm5,%xmm1 + pxor %xmm6,%xmm1 + + movdqa %xmm1,(%rdi) /* store round key */ + lea 0x10(%rdi),%rdi /* advance to next round key address */ + ret +END(aesni_expand256b) + +/* + * aesni_enctodec(const struct aesenc *enckey@rdi, struct aesdec *deckey@rsi, + * uint32_t nrounds@rdx) + * + * Convert AES encryption round keys to AES decryption round keys. + * `rounds' must be between 10 and 14. + * + * Standard ABI calling convention. + */ +ENTRY(aesni_enctodec) + shl $4,%edx /* rdx := byte offset of last round key */ + movdqa (%rdi,%rdx),%xmm0 /* load last round key */ + movdqa %xmm0,(%rsi) /* store last round key verbatim */ +1: sub $0x10,%rdx /* advance to next round key */ + lea 0x10(%rsi),%rsi + jz 2f /* stop if this is the last one */ + movdqa (%rdi,%rdx),%xmm0 /* load round key */ + aesimc %xmm0,%xmm0 /* convert encryption to decryption */ + movdqa %xmm0,(%rsi) /* store round key */ + jmp 1b +2: movdqa (%rdi),%xmm0 /* load first round key */ + movdqa %xmm0,(%rsi) /* store first round key verbatim */ + ret +END(aesni_enctodec) + +/* + * aesni_enc(const struct aesenc *enckey@rdi, const uint8_t in[16] @rsi, + * uint8_t out[16] @rdx, uint32_t nrounds@ecx) + * + * Encrypt a single block. + * + * Standard ABI calling convention. + */ +ENTRY(aesni_enc) + movdqu (%rsi),%xmm0 + call aesni_enc1 + movdqu %xmm0,(%rdx) + ret +END(aesni_enc) + +/* + * aesni_dec(const struct aesdec *deckey@rdi, const uint8_t in[16] @rsi, + * uint8_t out[16] @rdx, uint32_t nrounds@ecx) + * + * Decrypt a single block. + * + * Standard ABI calling convention. + */ +ENTRY(aesni_dec) + movdqu (%rsi),%xmm0 + call aesni_dec1 + movdqu %xmm0,(%rdx) + ret +END(aesni_dec) + +/* + * aesni_cbc_enc(const struct aesenc *enckey@rdi, const uint8_t *in@rsi, + * uint8_t *out@rdx, size_t nbytes@rcx, uint8_t iv[16] @r8, + * uint32_t nrounds@r9d) + * + * Encrypt a contiguous sequence of blocks with AES-CBC. + * + * nbytes must be an integral multiple of 16. + * + * Standard ABI calling convention. + */ +ENTRY(aesni_cbc_enc) + cmp $0,%rcx + jz 2f + mov %rcx,%r10 /* r10 := nbytes */ + movdqu (%r8),%xmm0 /* xmm0 := chaining value */ +1: movdqu (%rsi),%xmm1 /* xmm1 := plaintext block */ + lea 0x10(%rsi),%rsi + pxor %xmm1,%xmm0 /* xmm0 := cv ^ ptxt */ + mov %r9d,%ecx /* ecx := nrounds */ + call aesni_enc1 /* xmm0 := ciphertext block */ + movdqu %xmm0,(%rdx) + lea 0x10(%rdx),%rdx + sub $0x10,%r10 + jnz 1b /* repeat if r10 is nonzero */ + movdqu %xmm0,(%r8) /* store chaining value */ +2: ret +END(aesni_cbc_enc) + +/* + * aesni_cbc_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi, + * uint8_t *out@rdx, size_t nbytes@rcx, const uint8_t iv[16] @r8, + * uint32_t nrounds@r9) + * + * Decrypt a contiguous sequence of blocks with AES-CBC. + * + * nbytes must be a positive integral multiple of 16. This routine + * is not vectorized; use aesni_cbc_dec8 for >=8 blocks at once. + * + * Standard ABI calling convention. 
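+ *
+ * CBC decryption parallelizes, unlike CBC encryption, because each
+ * plaintext block depends only on ciphertext: P[i] = Dec(C[i]) ^ C[i-1],
+ * with C[-1] = iv.  That is why aesni_cbc_dec8 below can push eight
+ * blocks through the AES rounds at once while aesni_cbc_enc is an
+ * inherently serial chain.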
+ */ +ENTRY(aesni_cbc_dec1) + push %rbp /* create stack frame uint128[1] */ + mov %rsp,%rbp + sub $0x10,%rsp + movdqu (%r8),%xmm8 /* xmm8 := iv */ + movdqa %xmm8,(%rsp) /* save iv */ + mov %rcx,%r10 /* r10 := nbytes */ + movdqu -0x10(%rsi,%r10),%xmm0 /* xmm0 := last ciphertext block */ + movdqu %xmm0,(%r8) /* update iv */ +1: mov %r9d,%ecx /* ecx := nrounds */ + call aesni_dec1 /* xmm0 := cv ^ ptxt */ + sub $0x10,%r10 + jz 2f /* first block if r10 is now zero */ + movdqu -0x10(%rsi,%r10),%xmm8 /* xmm8 := chaining value */ + pxor %xmm8,%xmm0 /* xmm0 := ptxt */ + movdqu %xmm0,(%rdx,%r10) /* store plaintext block */ + movdqa %xmm8,%xmm0 /* move cv = ciphertext block */ + jmp 1b +2: pxor (%rsp),%xmm0 /* xmm0 := ptxt */ + movdqu %xmm0,(%rdx) /* store first plaintext block */ + leave + ret +END(aesni_cbc_dec1) + +/* + * aesni_cbc_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi, + * uint8_t *out@rdx, size_t nbytes@rcx, const uint8_t iv[16] @r8, + * uint32_t nrounds@r9) + * + * Decrypt a contiguous sequence of 8-block units with AES-CBC. + * + * nbytes must be a positive integral multiple of 128. + * + * Standard ABI calling convention. + */ +ENTRY(aesni_cbc_dec8) + push %rbp /* create stack frame uint128[1] */ + mov %rsp,%rbp + sub $0x10,%rsp + movdqu (%r8),%xmm8 /* xmm8 := iv */ + movdqa %xmm8,(%rsp) /* save iv */ + mov %rcx,%r10 /* r10 := nbytes */ + movdqu -0x10(%rsi,%r10),%xmm7 /* xmm7 := ciphertext block[n-1] */ + movdqu %xmm7,(%r8) /* update iv */ +1: movdqu -0x20(%rsi,%r10),%xmm6 /* xmm6 := ciphertext block[n-2] */ + movdqu -0x30(%rsi,%r10),%xmm5 /* xmm5 := ciphertext block[n-3] */ + movdqu -0x40(%rsi,%r10),%xmm4 /* xmm4 := ciphertext block[n-4] */ + movdqu -0x50(%rsi,%r10),%xmm3 /* xmm3 := ciphertext block[n-5] */ + movdqu -0x60(%rsi,%r10),%xmm2 /* xmm2 := ciphertext block[n-6] */ + movdqu -0x70(%rsi,%r10),%xmm1 /* xmm1 := ciphertext block[n-7] */ + movdqu -0x80(%rsi,%r10),%xmm0 /* xmm0 := ciphertext block[n-8] */ + movdqa %xmm6,%xmm15 /* xmm[8+i] := cv[i], 0<i<8 */ + movdqa %xmm5,%xmm14 + movdqa %xmm4,%xmm13 + movdqa %xmm3,%xmm12 + movdqa %xmm2,%xmm11 + movdqa %xmm1,%xmm10 + movdqa %xmm0,%xmm9 + mov %r9d,%ecx /* ecx := nrounds */ + call aesni_dec8 /* xmm[i] := cv[i] ^ ptxt[i], 0<=i<8 */ + pxor %xmm15,%xmm7 /* xmm[i] := ptxt[i], 0<i<8 */ + pxor %xmm14,%xmm6 + pxor %xmm13,%xmm5 + pxor %xmm12,%xmm4 + pxor %xmm11,%xmm3 + pxor %xmm10,%xmm2 + pxor %xmm9,%xmm1 + movdqu %xmm7,-0x10(%rdx,%r10) /* store plaintext blocks */ + movdqu %xmm6,-0x20(%rdx,%r10) + movdqu %xmm5,-0x30(%rdx,%r10) + movdqu %xmm4,-0x40(%rdx,%r10) + movdqu %xmm3,-0x50(%rdx,%r10) + movdqu %xmm2,-0x60(%rdx,%r10) + movdqu %xmm1,-0x70(%rdx,%r10) + sub $0x80,%r10 + jz 2f /* first block if r10 is now zero */ + movdqu -0x10(%rsi,%r10),%xmm7 /* xmm7 := cv[0] */ + pxor %xmm7,%xmm0 /* xmm0 := ptxt[0] */ + movdqu %xmm0,(%rdx,%r10) /* store plaintext block */ + jmp 1b +2: pxor (%rsp),%xmm0 /* xmm0 := ptxt[0] */ + movdqu %xmm0,(%rdx) /* store first plaintext block */ + leave + ret +END(aesni_cbc_dec8) + +/* + * aesni_xts_enc1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi, + * uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8, + * uint32_t nrounds@r9d) + * + * Encrypt a contiguous sequence of blocks with AES-XTS. + * + * nbytes must be a positive integral multiple of 16. This routine + * is not vectorized; use aesni_xts_enc8 for >=8 blocks at once. + * + * Standard ABI calling convention. 
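+ *
+ * Each block is handled as C[i] = AES(P[i] ^ T[i]) ^ T[i], where the
+ * tweak is advanced per block by T[i+1] = T[i] * x in GF(2^128) via
+ * aesni_xts_mulx.  The tweaks depend only on the starting tweak, not
+ * on the data, which is what lets aesni_xts_enc8 precompute eight of
+ * them and encrypt eight blocks in parallel.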
+ */ +ENTRY(aesni_xts_enc1) + mov %rcx,%r10 /* r10 := nbytes */ + movdqu (%r8),%xmm15 /* xmm15 := tweak */ +1: movdqu (%rsi),%xmm0 /* xmm0 := ptxt */ + lea 0x10(%rsi),%rsi /* advance rdi to next block */ + pxor %xmm15,%xmm0 /* xmm0 := ptxt ^ tweak */ + mov %r9d,%ecx /* ecx := nrounds */ + call aesni_enc1 /* xmm0 := AES(ptxt ^ tweak) */ + pxor %xmm15,%xmm0 /* xmm0 := AES(ptxt ^ tweak) ^ tweak */ + movdqu %xmm0,(%rdx) /* store ciphertext block */ + lea 0x10(%rdx),%rdx /* advance rsi to next block */ + call aesni_xts_mulx /* xmm15 *= x; trash xmm0 */ + sub $0x10,%r10 + jnz 1b /* repeat if more blocks */ + movdqu %xmm15,(%r8) /* update tweak */ + ret +END(aesni_xts_enc1) + +/* + * aesni_xts_enc8(const struct aesenc *enckey@rdi, const uint8_t *in@rsi, + * uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8, + * uint32_t nrounds@r9d) + * + * Encrypt a contiguous sequence of blocks with AES-XTS. + * + * nbytes must be a positive integral multiple of 128. + * + * Standard ABI calling convention. + */ +ENTRY(aesni_xts_enc8) + push %rbp /* create stack frame uint128[1] */ + mov %rsp,%rbp + sub $0x10,%rsp + mov %rcx,%r10 /* r10 := nbytes */ + movdqu (%r8),%xmm15 /* xmm15 := tweak[0] */ +1: movdqa %xmm15,%xmm8 /* xmm8 := tweak[0] */ + call aesni_xts_mulx /* xmm15 := tweak[1] */ + movdqa %xmm15,%xmm9 /* xmm9 := tweak[1] */ + call aesni_xts_mulx /* xmm15 := tweak[2] */ + movdqa %xmm15,%xmm10 /* xmm10 := tweak[2] */ + call aesni_xts_mulx /* xmm15 := tweak[3] */ + movdqa %xmm15,%xmm11 /* xmm11 := tweak[3] */ + call aesni_xts_mulx /* xmm15 := tweak[4] */ + movdqa %xmm15,%xmm12 /* xmm12 := tweak[4] */ + call aesni_xts_mulx /* xmm15 := tweak[5] */ + movdqa %xmm15,%xmm13 /* xmm13 := tweak[5] */ + call aesni_xts_mulx /* xmm15 := tweak[6] */ + movdqa %xmm15,%xmm14 /* xmm14 := tweak[6] */ + call aesni_xts_mulx /* xmm15 := tweak[7] */ + movdqu (%rsi),%xmm0 /* xmm[i] := ptxt[i] */ + movdqu 0x10(%rsi),%xmm1 + movdqu 0x20(%rsi),%xmm2 + movdqu 0x30(%rsi),%xmm3 + movdqu 0x40(%rsi),%xmm4 + movdqu 0x50(%rsi),%xmm5 + movdqu 0x60(%rsi),%xmm6 + movdqu 0x70(%rsi),%xmm7 + lea 0x80(%rsi),%rsi /* advance rsi to next block group */ + movdqa %xmm8,(%rsp) /* save tweak[0] */ + pxor %xmm8,%xmm0 /* xmm[i] := ptxt[i] ^ tweak[i] */ + pxor %xmm9,%xmm1 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm6 + pxor %xmm15,%xmm7 + mov %r9d,%ecx /* ecx := nrounds */ + call aesni_enc8 /* xmm[i] := AES(ptxt[i] ^ tweak[i]) */ + pxor (%rsp),%xmm0 /* xmm[i] := AES(...) ^ tweak[i] */ + pxor %xmm9,%xmm1 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm6 + pxor %xmm15,%xmm7 + movdqu %xmm0,(%rdx) /* store ciphertext blocks */ + movdqu %xmm1,0x10(%rdx) + movdqu %xmm2,0x20(%rdx) + movdqu %xmm3,0x30(%rdx) + movdqu %xmm4,0x40(%rdx) + movdqu %xmm5,0x50(%rdx) + movdqu %xmm6,0x60(%rdx) + movdqu %xmm7,0x70(%rdx) + lea 0x80(%rdx),%rdx /* advance rdx to next block group */ + call aesni_xts_mulx /* xmm15 := tweak[8] */ + sub $0x80,%r10 + jnz 1b /* repeat if more block groups */ + movdqu %xmm15,(%r8) /* update tweak */ + leave + ret +END(aesni_xts_enc8) + +/* + * aesni_xts_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi, + * uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8, + * uint32_t nrounds@r9d) + * + * Decrypt a contiguous sequence of blocks with AES-XTS. + * + * nbytes must be a positive integral multiple of 16. This routine + * is not vectorized; use aesni_xts_dec8 for >=8 blocks at once. + * + * Standard ABI calling convention. 
+ */
+ENTRY(aesni_xts_dec1)
+	mov	%rcx,%r10		/* r10 := nbytes */
+	movdqu	(%r8),%xmm15		/* xmm15 := tweak */
+1:	movdqu	(%rsi),%xmm0		/* xmm0 := ctxt */
+	lea	0x10(%rsi),%rsi		/* advance rsi to next block */
+	pxor	%xmm15,%xmm0		/* xmm0 := ctxt ^ tweak */
+	mov	%r9d,%ecx		/* ecx := nrounds */
+	call	aesni_dec1		/* xmm0 := AES(ctxt ^ tweak) */
+	pxor	%xmm15,%xmm0		/* xmm0 := AES(ctxt ^ tweak) ^ tweak */
+	movdqu	%xmm0,(%rdx)		/* store plaintext block */
+	lea	0x10(%rdx),%rdx		/* advance rdx to next block */
+	call	aesni_xts_mulx		/* xmm15 *= x; trash xmm0 */
+	sub	$0x10,%r10
+	jnz	1b			/* repeat if more blocks */
+	movdqu	%xmm15,(%r8)		/* update tweak */
+	ret
+END(aesni_xts_dec1)
+
+/*
+ * aesni_xts_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
+ *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
+ *     uint32_t nrounds@r9d)
+ *
+ * Decrypt a contiguous sequence of blocks with AES-XTS.
+ *
+ * nbytes must be a positive integral multiple of 128.
+ *
+ * Standard ABI calling convention.
+ */
+ENTRY(aesni_xts_dec8)
+	push	%rbp			/* create stack frame uint128[1] */
+	mov	%rsp,%rbp
+	sub	$0x10,%rsp
+	mov	%rcx,%r10		/* r10 := nbytes */
+	movdqu	(%r8),%xmm15		/* xmm15 := tweak[0] */
+1:	movdqa	%xmm15,%xmm8		/* xmm8 := tweak[0] */
+	call	aesni_xts_mulx		/* xmm15 := tweak[1] */
+	movdqa	%xmm15,%xmm9		/* xmm9 := tweak[1] */
+	call	aesni_xts_mulx		/* xmm15 := tweak[2] */
+	movdqa	%xmm15,%xmm10		/* xmm10 := tweak[2] */
+	call	aesni_xts_mulx		/* xmm15 := tweak[3] */
+	movdqa	%xmm15,%xmm11		/* xmm11 := tweak[3] */
+	call	aesni_xts_mulx		/* xmm15 := tweak[4] */
+	movdqa	%xmm15,%xmm12		/* xmm12 := tweak[4] */
+	call	aesni_xts_mulx		/* xmm15 := tweak[5] */
+	movdqa	%xmm15,%xmm13		/* xmm13 := tweak[5] */
+	call	aesni_xts_mulx		/* xmm15 := tweak[6] */
+	movdqa	%xmm15,%xmm14		/* xmm14 := tweak[6] */
+	call	aesni_xts_mulx		/* xmm15 := tweak[7] */
+	movdqu	(%rsi),%xmm0		/* xmm[i] := ctxt[i] */
+	movdqu	0x10(%rsi),%xmm1
+	movdqu	0x20(%rsi),%xmm2
+	movdqu	0x30(%rsi),%xmm3
+	movdqu	0x40(%rsi),%xmm4
+	movdqu	0x50(%rsi),%xmm5
+	movdqu	0x60(%rsi),%xmm6
+	movdqu	0x70(%rsi),%xmm7
+	lea	0x80(%rsi),%rsi		/* advance rsi to next block group */
+	movdqa	%xmm8,(%rsp)		/* save tweak[0] */
+	pxor	%xmm8,%xmm0		/* xmm[i] := ctxt[i] ^ tweak[i] */
+	pxor	%xmm9,%xmm1
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm3
+	pxor	%xmm12,%xmm4
+	pxor	%xmm13,%xmm5
+	pxor	%xmm14,%xmm6
+	pxor	%xmm15,%xmm7
+	mov	%r9d,%ecx		/* ecx := nrounds */
+	call	aesni_dec8		/* xmm[i] := AES(ctxt[i] ^ tweak[i]) */
+	pxor	(%rsp),%xmm0		/* xmm[i] := AES(...) ^ tweak[i] */
+	pxor	%xmm9,%xmm1
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm3
+	pxor	%xmm12,%xmm4
+	pxor	%xmm13,%xmm5
+	pxor	%xmm14,%xmm6
+	pxor	%xmm15,%xmm7
+	movdqu	%xmm0,(%rdx)		/* store plaintext blocks */
+	movdqu	%xmm1,0x10(%rdx)
+	movdqu	%xmm2,0x20(%rdx)
+	movdqu	%xmm3,0x30(%rdx)
+	movdqu	%xmm4,0x40(%rdx)
+	movdqu	%xmm5,0x50(%rdx)
+	movdqu	%xmm6,0x60(%rdx)
+	movdqu	%xmm7,0x70(%rdx)
+	lea	0x80(%rdx),%rdx		/* advance rdx to next block group */
+	call	aesni_xts_mulx		/* xmm15 := tweak[8] */
+	sub	$0x80,%r10
+	jnz	1b			/* repeat if more block groups */
+	movdqu	%xmm15,(%r8)		/* update tweak */
+	leave
+	ret
+END(aesni_xts_dec8)
+
+/*
+ * aesni_xts_mulx(tweak@xmm15)
+ *
+ * Multiply xmm15 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
+ * Uses %xmm0 as temporary.
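+ *
+ * A C reference for the same update, for illustration only (the helper
+ * name is hypothetical, not part of this file).  The tweak is taken
+ * little-endian, so the bit shifted out of byte 15 folds back into
+ * byte 0 as the reduction constant 0x87:
+ *
+ *	static void
+ *	xts_mulx_ref(uint8_t t[16])
+ *	{
+ *		unsigned i, carry = t[15] >> 7;
+ *
+ *		for (i = 15; i > 0; i--)
+ *			t[i] = (t[i] << 1) | (t[i - 1] >> 7);
+ *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
+ *	}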
+ */ + .text + _ALIGN_TEXT + .type aesni_xts_mulx,@function +aesni_xts_mulx: + /* + * Simultaneously determine + * (a) whether the high bit of the low quadword must be + * shifted into the low bit of the high quadword, and + * (b) whether the high bit of the high quadword must be + * carried into x^128 = x^7 + x^2 + x + 1. + */ + pxor %xmm0,%xmm0 /* xmm0 := 0 */ + pcmpgtq %xmm15,%xmm0 /* xmm0[i] := -1 if 0 > xmm15[i] else 0 */ + pshufd $0b01001110,%xmm0,%xmm0 /* swap halves of xmm0 */ + pand xtscarry(%rip),%xmm0 /* copy xtscarry according to mask */ + psllq $1,%xmm15 /* shift */ + pxor %xmm0,%xmm15 /* incorporate (a) and (b) */ + ret +END(aesni_xts_mulx) + + .section .rodata + .align 16 + .type xtscarry,@object +xtscarry: + .byte 0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0 +END(xtscarry) + +/* + * aesni_xts_update(const uint8_t in[16] @rdi, uint8_t out[16] @rsi) + * + * Update an AES-XTS tweak. + * + * Standard ABI calling convention. + */ +ENTRY(aesni_xts_update) + movdqu (%rdi),%xmm15 + call aesni_xts_mulx + movdqu %xmm15,(%rsi) + ret +END(aesni_xts_update) + +/* + * aesni_enc1(const struct aesenc *enckey@rdi, uint128_t block@xmm0, + * uint32_t nrounds@ecx) + * + * Encrypt a single AES block in %xmm0. + * + * Internal ABI. Uses %rax and %xmm8 as temporaries. Destroys %ecx. + */ + .text + _ALIGN_TEXT + .type aesni_enc1,@function +aesni_enc1: + pxor (%rdi),%xmm0 /* xor in first round key */ + shl $4,%ecx /* ecx := total byte size of round keys */ + lea 0x10(%rdi,%rcx),%rax /* rax := end of round key array */ + neg %rcx /* rcx := byte offset of round key from end */ +1: movdqa (%rax,%rcx),%xmm8 /* load round key */ + add $0x10,%rcx + jz 2f /* stop if this is the last one */ + aesenc %xmm8,%xmm0 + jmp 1b +2: aesenclast %xmm8,%xmm0 + ret +END(aesni_enc1) + +/* + * aesni_enc8(const struct aesenc *enckey@rdi, uint128_t block0@xmm0, ..., + * block7@xmm7, uint32_t nrounds@ecx) + * + * Encrypt eight AES blocks in %xmm0 through %xmm7 in parallel. + * + * Internal ABI. Uses %rax and %xmm8 as temporaries. Destroys %ecx. + */ + .text + _ALIGN_TEXT + .type aesni_enc8,@function +aesni_enc8: + movdqa (%rdi),%xmm8 /* xor in first round key */ + pxor %xmm8,%xmm0 + pxor %xmm8,%xmm1 + pxor %xmm8,%xmm2 + pxor %xmm8,%xmm3 + pxor %xmm8,%xmm4 + pxor %xmm8,%xmm5 + pxor %xmm8,%xmm6 + pxor %xmm8,%xmm7 + shl $4,%ecx /* ecx := total byte size of round keys */ + lea 0x10(%rdi,%rcx),%rax /* rax := end of round key array */ + neg %rcx /* rcx := byte offset of round key from end */ +1: movdqa (%rax,%rcx),%xmm8 /* load round key */ + add $0x10,%rcx + jz 2f /* stop if this is the last one */ + aesenc %xmm8,%xmm0 + aesenc %xmm8,%xmm1 + aesenc %xmm8,%xmm2 + aesenc %xmm8,%xmm3 + aesenc %xmm8,%xmm4 + aesenc %xmm8,%xmm5 + aesenc %xmm8,%xmm6 + aesenc %xmm8,%xmm7 + jmp 1b +2: aesenclast %xmm8,%xmm0 + aesenclast %xmm8,%xmm1 + aesenclast %xmm8,%xmm2 + aesenclast %xmm8,%xmm3 + aesenclast %xmm8,%xmm4 + aesenclast %xmm8,%xmm5 + aesenclast %xmm8,%xmm6 + aesenclast %xmm8,%xmm7 + ret +END(aesni_enc8) + +/* + * aesni_dec1(const struct aesdec *deckey@rdi, uint128_t block@xmm0, + * uint32_t nrounds@ecx) + * + * Decrypt a single AES block in %xmm0. + * + * Internal ABI. Uses %rax and %xmm8 as temporaries. Destroys %ecx. 
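+ *
+ * The deckey schedule is the one produced by aesni_enctodec: the
+ * encryption round keys in reverse order, with AESIMC applied to all
+ * but the first and last, so this loop can walk the array forward
+ * with aesdec just as aesni_enc1 walks the enckey schedule with
+ * aesenc.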
+ */ + .text + _ALIGN_TEXT + .type aesni_dec1,@function +aesni_dec1: + pxor (%rdi),%xmm0 /* xor in first round key */ + shl $4,%ecx /* ecx := byte offset of round key */ + lea 0x10(%rdi,%rcx),%rax /* rax := pointer to round key */ + neg %rcx /* rcx := byte offset of round key from end */ +1: movdqa (%rax,%rcx),%xmm8 /* load round key */ + add $0x10,%rcx + jz 2f /* stop if this is the last one */ + aesdec %xmm8,%xmm0 + jmp 1b +2: aesdeclast %xmm8,%xmm0 + ret +END(aesni_dec1) + +/* + * aesni_dec8(const struct aesdec *deckey@rdi, uint128_t block0@xmm0, ..., + * block7@xmm7, uint32_t nrounds@ecx) + * + * Decrypt eight AES blocks in %xmm0 through %xmm7 in parallel. + * + * Internal ABI. Uses %xmm8 as temporary. Destroys %rcx. + */ + .text + _ALIGN_TEXT + .type aesni_dec8,@function +aesni_dec8: + movdqa (%rdi),%xmm8 /* xor in first round key */ + pxor %xmm8,%xmm0 + pxor %xmm8,%xmm1 + pxor %xmm8,%xmm2 + pxor %xmm8,%xmm3 + pxor %xmm8,%xmm4 + pxor %xmm8,%xmm5 + pxor %xmm8,%xmm6 + pxor %xmm8,%xmm7 + shl $4,%ecx /* ecx := byte offset of round key */ + lea 0x10(%rdi,%rcx),%rax /* rax := pointer to round key */ + neg %rcx /* rcx := byte offset of round key from end */ +1: movdqa (%rax,%rcx),%xmm8 /* load round key */ + add $0x10,%rcx + jz 2f /* stop if this is the last one */ + aesdec %xmm8,%xmm0 + aesdec %xmm8,%xmm1 + aesdec %xmm8,%xmm2 + aesdec %xmm8,%xmm3 + aesdec %xmm8,%xmm4 + aesdec %xmm8,%xmm5 + aesdec %xmm8,%xmm6 + aesdec %xmm8,%xmm7 + jmp 1b +2: aesdeclast %xmm8,%xmm0 + aesdeclast %xmm8,%xmm1 + aesdeclast %xmm8,%xmm2 + aesdeclast %xmm8,%xmm3 + aesdeclast %xmm8,%xmm4 + aesdeclast %xmm8,%xmm5 + aesdeclast %xmm8,%xmm6 + aesdeclast %xmm8,%xmm7 + ret +END(aesni_dec8) Index: src/sys/crypto/aes/arch/x86/files.aesni diff -u /dev/null src/sys/crypto/aes/arch/x86/files.aesni:1.1 --- /dev/null Mon Jun 29 23:29:40 2020 +++ src/sys/crypto/aes/arch/x86/files.aesni Mon Jun 29 23:29:40 2020 @@ -0,0 +1,6 @@ +# $NetBSD: files.aesni,v 1.1 2020/06/29 23:29:40 riastradh Exp $ + +ifdef amd64 # amd64-only for now; i386 left as exercise for reader +file crypto/aes/arch/x86/aes_ni.c aes +file crypto/aes/arch/x86/aes_ni_64.S aes +endif