Module Name:	src
Committed By:	riastradh
Date:		Mon Jun 29 23:56:31 UTC 2020
Modified Files:
	src/sys/arch/aarch64/aarch64: cpu.c
	src/sys/arch/aarch64/conf: files.aarch64
	src/sys/arch/arm/conf: files.arm
	src/sys/arch/arm/vfp: vfp_init.c

Added Files:
	src/sys/crypto/aes/arch/arm: aes_neon.c aes_neon.h aes_neon_impl.c
	    aes_neon_impl.h aes_neon_subr.c arm_neon.h files.aesneon

Log Message:
New permutation-based AES implementation using ARM NEON.

Also derived from Mike Hamburg's public-domain vpaes code.


To generate a diff of this commit:
cvs rdiff -u -r1.49 -r1.50 src/sys/arch/aarch64/aarch64/cpu.c
cvs rdiff -u -r1.23 -r1.24 src/sys/arch/aarch64/conf/files.aarch64
cvs rdiff -u -r1.157 -r1.158 src/sys/arch/arm/conf/files.arm
cvs rdiff -u -r1.65 -r1.66 src/sys/arch/arm/vfp/vfp_init.c
cvs rdiff -u -r0 -r1.1 src/sys/crypto/aes/arch/arm/aes_neon.c \
    src/sys/crypto/aes/arch/arm/aes_neon.h \
    src/sys/crypto/aes/arch/arm/aes_neon_impl.c \
    src/sys/crypto/aes/arch/arm/aes_neon_impl.h \
    src/sys/crypto/aes/arch/arm/aes_neon_subr.c \
    src/sys/crypto/aes/arch/arm/arm_neon.h \
    src/sys/crypto/aes/arch/arm/files.aesneon

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
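For context before the diffs: consumers never call aes_neon_* directly.
They go through the MI aes(9) API, and aes_md_init() registers the
machine-dependent implementation that the API dispatches to.  A minimal
sketch of a consumer, assuming the names declared in
src/sys/crypto/aes/aes.h at this time; the fragment below is
illustrative only and is not part of the commit:

	#include <crypto/aes/aes.h>

	static void
	encrypt_one_block(const uint8_t key[static 16],
	    const uint8_t in[static 16], uint8_t out[static 16])
	{
		struct aesenc enc;

		/*
		 * Key expansion and encryption both dispatch to the
		 * implementation selected at boot -- aes_neon_impl, if
		 * the CPU has Advanced SIMD but not ARMv8.0-AES.
		 */
		aes_setenckey128(&enc, key);
		aes_enc(&enc, in, out, AES_128_NROUNDS);
	}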
Modified files: Index: src/sys/arch/aarch64/aarch64/cpu.c diff -u src/sys/arch/aarch64/aarch64/cpu.c:1.49 src/sys/arch/aarch64/aarch64/cpu.c:1.50 --- src/sys/arch/aarch64/aarch64/cpu.c:1.49 Mon Jun 29 23:31:41 2020 +++ src/sys/arch/aarch64/aarch64/cpu.c Mon Jun 29 23:56:30 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: cpu.c,v 1.49 2020/06/29 23:31:41 riastradh Exp $ */ +/* $NetBSD: cpu.c,v 1.50 2020/06/29 23:56:30 riastradh Exp $ */ /* * Copyright (c) 2017 Ryo Shimizu <r...@nerv.org> @@ -27,7 +27,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.49 2020/06/29 23:31:41 riastradh Exp $"); +__KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.50 2020/06/29 23:56:30 riastradh Exp $"); #include "locators.h" #include "opt_arm_debug.h" @@ -45,6 +45,7 @@ __KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.49 #include <sys/systm.h> #include <crypto/aes/arch/arm/aes_armv8.h> +#include <crypto/aes/arch/arm/aes_neon.h> #include <aarch64/armreg.h> #include <aarch64/cpu.h> @@ -601,16 +602,24 @@ cpu_setup_aes(device_t dv, struct cpu_in { struct aarch64_sysctl_cpu_id *id = &ci->ci_id; - /* Verify that it is supported. */ + /* Check for ARMv8.0-AES support. */ switch (__SHIFTOUT(id->ac_aa64isar0, ID_AA64ISAR0_EL1_AES)) { case ID_AA64ISAR0_EL1_AES_AES: case ID_AA64ISAR0_EL1_AES_PMUL: - break; - default: + aes_md_init(&aes_armv8_impl); return; + default: + break; } - aes_md_init(&aes_armv8_impl); + /* Failing that, check for SIMD support. */ + switch (__SHIFTOUT(id->ac_aa64pfr0, ID_AA64PFR0_EL1_ADVSIMD)) { + case ID_AA64PFR0_EL1_ADV_SIMD_IMPL: + aes_md_init(&aes_neon_impl); + return; + default: + break; + } } #ifdef MULTIPROCESSOR Index: src/sys/arch/aarch64/conf/files.aarch64 diff -u src/sys/arch/aarch64/conf/files.aarch64:1.23 src/sys/arch/aarch64/conf/files.aarch64:1.24 --- src/sys/arch/aarch64/conf/files.aarch64:1.23 Mon Jun 29 23:31:41 2020 +++ src/sys/arch/aarch64/conf/files.aarch64 Mon Jun 29 23:56:30 2020 @@ -1,4 +1,4 @@ -# $NetBSD: files.aarch64,v 1.23 2020/06/29 23:31:41 riastradh Exp $ +# $NetBSD: files.aarch64,v 1.24 2020/06/29 23:56:30 riastradh Exp $ defflag opt_cpuoptions.h AARCH64_ALIGNMENT_CHECK defflag opt_cpuoptions.h AARCH64_EL0_STACK_ALIGNMENT_CHECK @@ -141,3 +141,6 @@ file dev/tprof/tprof_armv8.c tprof nee # ARMv8.0-AES include "crypto/aes/arch/arm/files.aesarmv8" + +# vpaes with ARM NEON +include "crypto/aes/arch/arm/files.aesneon" Index: src/sys/arch/arm/conf/files.arm diff -u src/sys/arch/arm/conf/files.arm:1.157 src/sys/arch/arm/conf/files.arm:1.158 --- src/sys/arch/arm/conf/files.arm:1.157 Sat Apr 18 11:00:38 2020 +++ src/sys/arch/arm/conf/files.arm Mon Jun 29 23:56:31 2020 @@ -1,4 +1,4 @@ -# $NetBSD: files.arm,v 1.157 2020/04/18 11:00:38 skrll Exp $ +# $NetBSD: files.arm,v 1.158 2020/06/29 23:56:31 riastradh Exp $ # temporary define to allow easy moving to ../arch/arm/arm32 defflag ARM32 @@ -262,3 +262,7 @@ file arch/arm/arm/linux_trap.c compat_l # profiling support file dev/tprof/tprof_armv7.c tprof + +# vpaes with ARM NEON -- disabled for now pending arm32 kernel fpu +# support and ctf +include "crypto/aes/arch/arm/files.aesneon" Index: src/sys/arch/arm/vfp/vfp_init.c diff -u src/sys/arch/arm/vfp/vfp_init.c:1.65 src/sys/arch/arm/vfp/vfp_init.c:1.66 --- src/sys/arch/arm/vfp/vfp_init.c:1.65 Mon Jun 29 23:54:06 2020 +++ src/sys/arch/arm/vfp/vfp_init.c Mon Jun 29 23:56:31 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: vfp_init.c,v 1.65 2020/06/29 23:54:06 riastradh Exp $ */ +/* $NetBSD: vfp_init.c,v 1.66 2020/06/29 23:56:31 riastradh Exp $ */ /* * Copyright (c) 2008 ARM Ltd @@ -32,7 +32,7 @@ #include 
"opt_cputypes.h" #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: vfp_init.c,v 1.65 2020/06/29 23:54:06 riastradh Exp $"); +__KERNEL_RCSID(0, "$NetBSD: vfp_init.c,v 1.66 2020/06/29 23:56:31 riastradh Exp $"); #include <sys/param.h> #include <sys/types.h> @@ -50,6 +50,9 @@ __KERNEL_RCSID(0, "$NetBSD: vfp_init.c,v #include <uvm/uvm_extern.h> /* for pmap.h */ +#include <crypto/aes/aes.h> +#include <crypto/aes/arch/arm/aes_neon.h> + #ifdef FPU_VFP #ifdef CPU_CORTEX @@ -402,8 +405,11 @@ vfp_attach(struct cpu_info *ci) install_coproc_handler(VFP_COPROC, vfp_handler); install_coproc_handler(VFP_COPROC2, vfp_handler); #ifdef CPU_CORTEX - if (cpu_neon_present) - install_coproc_handler(CORE_UNKNOWN_HANDLER, neon_handler); + if (cpu_neon_present) { + install_coproc_handler(CORE_UNKNOWN_HANDLER, + neon_handler); + aes_md_init(&aes_neon_impl); + } #endif } } Added files: Index: src/sys/crypto/aes/arch/arm/aes_neon.c diff -u /dev/null src/sys/crypto/aes/arch/arm/aes_neon.c:1.1 --- /dev/null Mon Jun 29 23:56:31 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon.c Mon Jun 29 23:56:31 2020 @@ -0,0 +1,610 @@ +/* $NetBSD: aes_neon.c,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES + * software, at <https://crypto.stanford.edu/vpaes/>, described in + * + * Mike Hamburg, `Accelerating AES with Vector Permute + * Instructions', in Christophe Clavier and Kris Gaj (eds.), + * Cryptographic Hardware and Embedded Systems -- CHES 2009, + * Springer LNCS 5747, pp. 18-32. 
+ * + * https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2 + */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.1 2020/06/29 23:56:31 riastradh Exp $"); + +#include <sys/types.h> + +#include <sys/systm.h> + +#include "aes_neon_impl.h" + +static const uint8x16_t +mc_forward[4] = { + {0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04, + 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C}, + {0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08, + 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00}, + {0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C, + 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04}, + {0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00, + 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08}, +}, +mc_backward[4] = { + {0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06, + 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E}, + {0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02, + 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A}, + {0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E, + 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06}, + {0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A, + 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02}, +}, +ipt[2] = { + {0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2, + 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA}, + {0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C, + 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD}, +}, +opt[2] = { + {0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF, + 0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7}, + {0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01, + 0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1}, +}, +dipt[2] = { + {0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F, + 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15}, + {0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86, + 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12}, +}, +sb1[2] = { + {0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1, + 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5}, + {0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36, + 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B}, +}, +sb2[2] = { + {0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2, + 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E}, + {0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69, + 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2}, +}, +sbo[2] = { + {0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0, + 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15}, + {0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF, + 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E}, +}, +dsb9[2] = { + {0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85, + 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA}, + {0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0, + 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72}, +}, +dsbd[2] = { + {0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D, + 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5}, + {0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C, + 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29}, +}, +dsbb[2] = { + {0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0, + 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60}, + {0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1, + 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3}, +}, +dsbe[2] = { + {0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46, + 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22}, + {0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C, + 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94}, +}, +dsbo[2] = { + {0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13, + 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7}, + {0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12, + 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA}, +}, +dks1[2] = { + {0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6, + 0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A}, + {0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45, + 0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B}, +}, +dks2[2] = { + {0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27, + 0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46}, + {0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81, + 0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73}, +}, 
+dks3[2] = { + {0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03, + 0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8}, + {0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE, + 0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5}, +}, +dks4[2] = { + {0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3, + 0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0}, + {0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0, + 0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F}, +}, +deskew[2] = { + {0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07, + 0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D}, + {0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F, + 0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28}, +}, +sr[4] = { + {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, + 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}, + {0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03, + 0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B}, + {0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F, + 0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07}, + {0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B, + 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03}, +}, +rcon = {0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F, + 0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70}, +s63 = {0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B, + 0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B}, +of = {0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F, + 0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F}, +inv = {0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E, + 0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04}, +inva = {0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01, + 0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03}; + +static inline uint8x16_t +loadroundkey(const void *rkp) +{ + return vld1q_u8(rkp); +} + +static inline void +storeroundkey(void *rkp, uint8x16_t rk) +{ + vst1q_u8(rkp, rk); +} + +/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g. */ +static inline void +bytes2nybbles(uint8x16_t *restrict lo, uint8x16_t *restrict hi, uint8x16_t x) +{ + + *lo = of & x; + *hi = of & vshrq_n_u8(x, 4); +} + +/* + * t is a pair of maps respectively from low and high nybbles to bytes. + * Apply t the nybbles, and add the results in GF(2). 
+ */ +static uint8x16_t +aes_schedule_transform(uint8x16_t x, const uint8x16_t t[static 2]) +{ + uint8x16_t lo, hi; + + bytes2nybbles(&lo, &hi, x); + return vqtbl1q_u8(t[0], lo) ^ vqtbl1q_u8(t[1], hi); +} + +static inline void +subbytes(uint8x16_t *io, uint8x16_t *jo, uint8x16_t x, uint8x16_t inv_, + uint8x16_t inva_) +{ + uint8x16_t k, i, ak, j; + + bytes2nybbles(&k, &i, x); + ak = vqtbl1q_u8(inva_, k); + j = i ^ k; + *io = j ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, i)); + *jo = i ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, j)); +} + +static uint8x16_t +aes_schedule_low_round(uint8x16_t rk, uint8x16_t prk) +{ + uint8x16_t io, jo; + + /* smear prk */ + prk ^= vextq_u8(vdupq_n_u8(0), prk, 12); + prk ^= vextq_u8(vdupq_n_u8(0), prk, 8); + prk ^= s63; + + /* subbytes */ + subbytes(&io, &jo, rk, inv, inva); + rk = vqtbl1q_u8(sb1[0], io) ^ vqtbl1q_u8(sb1[1], jo); + + /* add in smeared stuff */ + return rk ^ prk; +} + +static uint8x16_t +aes_schedule_round(uint8x16_t rk, uint8x16_t prk, uint8x16_t *rcon_rot) +{ + uint32x4_t rk32; + + /* extract rcon from rcon_rot */ + prk ^= vextq_u8(*rcon_rot, vdupq_n_u8(0), 15); + *rcon_rot = vextq_u8(*rcon_rot, *rcon_rot, 15); + + /* rotate */ + rk32 = vreinterpretq_u32_u8(rk); + rk32 = vdupq_n_u32(vgetq_lane_u32(rk32, 3)); + rk = vreinterpretq_u8_u32(rk32); + rk = vextq_u8(rk, rk, 1); + + return aes_schedule_low_round(rk, prk); +} + +static uint8x16_t +aes_schedule_mangle_enc(uint8x16_t x, uint8x16_t sr_i) +{ + uint8x16_t y = vdupq_n_u8(0); + + x ^= s63; + + x = vqtbl1q_u8(x, mc_forward[0]); + y ^= x; + x = vqtbl1q_u8(x, mc_forward[0]); + y ^= x; + x = vqtbl1q_u8(x, mc_forward[0]); + y ^= x; + + return vqtbl1q_u8(y, sr_i); +} + +static uint8x16_t +aes_schedule_mangle_last_enc(uint8x16_t x, uint8x16_t sr_i) +{ + + return aes_schedule_transform(vqtbl1q_u8(x, sr_i) ^ s63, opt); +} + +static uint8x16_t +aes_schedule_mangle_dec(uint8x16_t x, uint8x16_t sr_i) +{ + uint8x16_t y = vdupq_n_u8(0); + + x = aes_schedule_transform(x, dks1); + y = vqtbl1q_u8(y ^ x, mc_forward[0]); + x = aes_schedule_transform(x, dks2); + y = vqtbl1q_u8(y ^ x, mc_forward[0]); + x = aes_schedule_transform(x, dks3); + y = vqtbl1q_u8(y ^ x, mc_forward[0]); + x = aes_schedule_transform(x, dks4); + y = vqtbl1q_u8(y ^ x, mc_forward[0]); + + return vqtbl1q_u8(y, sr_i); +} + +static uint8x16_t +aes_schedule_mangle_last_dec(uint8x16_t x) +{ + + return aes_schedule_transform(x ^ s63, deskew); +} + +static uint8x16_t +aes_schedule_192_smear(uint8x16_t prkhi, uint8x16_t prk) +{ + uint32x4_t prkhi32 = vreinterpretq_u32_u8(prkhi); + uint32x4_t prk32 = vreinterpretq_u32_u8(prk); + uint32x4_t rk32; + + rk32 = prkhi32; + rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prkhi32, 2), + vdupq_n_u32(vgetq_lane_u32(prkhi32, 0)), + 3); + rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prk32, 2), + vdupq_n_u32(vgetq_lane_u32(prk32, 3)), + 0); + + return vreinterpretq_u8_u32(rk32); +} + +static uint8x16_t +aes_schedule_192_smearhi(uint8x16_t rk) +{ + uint64x2_t rk64 = vreinterpretq_u64_u8(rk); + + rk64 = vsetq_lane_u64(0, rk64, 0); + + return vreinterpretq_u8_u64(rk64); +} + +void +aes_neon_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds) +{ + uint32_t *rk32 = enc->aese_aes.aes_rk; + uint8x16_t mrk; /* mangled round key */ + uint8x16_t rk; /* round key */ + uint8x16_t prk; /* previous round key */ + uint8x16_t rcon_rot = rcon; + uint64_t i = 3; + + /* input transform */ + rk = aes_schedule_transform(vld1q_u8(key), ipt); + storeroundkey(rk32, rk); + rk32 += 4; + + switch (nrounds) { + case 10: + for (;;) { + rk = 
aes_schedule_round(rk, rk, &rcon_rot); + if (--nrounds == 0) + break; + mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 += 4; + } + break; + case 12: { + uint8x16_t prkhi; /* high half of previous round key */ + + prk = rk; + rk = aes_schedule_transform(vld1q_u8(key + 8), ipt); + prkhi = aes_schedule_192_smearhi(rk); + for (;;) { + prk = aes_schedule_round(rk, prk, &rcon_rot); + rk = vextq_u8(prkhi, prk, 8); + + mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 += 4; + rk = aes_schedule_192_smear(prkhi, prk); + prkhi = aes_schedule_192_smearhi(rk); + + mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 += 4; + rk = prk = aes_schedule_round(rk, prk, &rcon_rot); + if ((nrounds -= 3) == 0) + break; + + mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 += 4; + rk = aes_schedule_192_smear(prkhi, prk); + prkhi = aes_schedule_192_smearhi(rk); + } + break; + } + case 14: { + uint8x16_t pprk; /* previous previous round key */ + + prk = rk; + rk = aes_schedule_transform(vld1q_u8(key + 16), ipt); + for (;;) { + mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 += 4; + pprk = rk; + + /* high round */ + rk = prk = aes_schedule_round(rk, prk, &rcon_rot); + if ((nrounds -= 2) == 0) + break; + mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 += 4; + + /* low round */ + rk = vreinterpretq_u8_u32( + vdupq_n_u32( + vgetq_lane_u32(vreinterpretq_u32_u8(rk), + 3))); + rk = aes_schedule_low_round(rk, pprk); + } + break; + } + default: + panic("invalid number of AES rounds: %u", nrounds); + } + storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4])); +} + +void +aes_neon_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds) +{ + uint32_t *rk32 = dec->aesd_aes.aes_rk; + uint8x16_t mrk; /* mangled round key */ + uint8x16_t ork; /* original round key */ + uint8x16_t rk; /* round key */ + uint8x16_t prk; /* previous round key */ + uint8x16_t rcon_rot = rcon; + unsigned i = nrounds == 12 ? 
0 : 2; + + ork = vld1q_u8(key); + + /* input transform */ + rk = aes_schedule_transform(ork, ipt); + + /* go from end */ + rk32 += 4*nrounds; + storeroundkey(rk32, vqtbl1q_u8(ork, sr[i])); + rk32 -= 4; + i ^= 3; + + switch (nrounds) { + case 10: + for (;;) { + rk = aes_schedule_round(rk, rk, &rcon_rot); + if (--nrounds == 0) + break; + mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 -= 4; + } + break; + case 12: { + uint8x16_t prkhi; /* high half of previous round key */ + + prk = rk; + rk = aes_schedule_transform(vld1q_u8(key + 8), ipt); + prkhi = aes_schedule_192_smearhi(rk); + for (;;) { + prk = aes_schedule_round(rk, prk, &rcon_rot); + rk = vextq_u8(prkhi, prk, 8); + + mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 -= 4; + rk = aes_schedule_192_smear(prkhi, prk); + prkhi = aes_schedule_192_smearhi(rk); + + mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 -= 4; + rk = prk = aes_schedule_round(rk, prk, &rcon_rot); + if ((nrounds -= 3) == 0) + break; + + mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 -= 4; + rk = aes_schedule_192_smear(prkhi, prk); + prkhi = aes_schedule_192_smearhi(rk); + } + break; + } + case 14: { + uint8x16_t pprk; /* previous previous round key */ + + prk = rk; + rk = aes_schedule_transform(vld1q_u8(key + 16), ipt); + for (;;) { + mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 -= 4; + pprk = rk; + + /* high round */ + rk = prk = aes_schedule_round(rk, prk, &rcon_rot); + if ((nrounds -= 2) == 0) + break; + mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 -= 4; + + /* low round */ + rk = vreinterpretq_u8_u32( + vdupq_n_u32( + vgetq_lane_u32(vreinterpretq_u32_u8(rk), + 3))); + rk = aes_schedule_low_round(rk, pprk); + } + break; + } + default: + panic("invalid number of AES rounds: %u", nrounds); + } + storeroundkey(rk32, aes_schedule_mangle_last_dec(rk)); +} + +uint8x16_t +aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds) +{ + const uint32_t *rk32 = enc->aese_aes.aes_rk; + uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv; + uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva; + uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0]; + uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1]; + uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0]; + uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1]; + uint8x16_t io, jo; + unsigned rmod4 = 0; + + x = aes_schedule_transform(x, ipt); + x ^= loadroundkey(rk32); + for (;;) { + uint8x16_t A, A2, A2_B, A2_B_D; + + subbytes(&io, &jo, x, inv_, inva_); + + rk32 += 4; + rmod4 = (rmod4 + 1) % 4; + if (--nrounds == 0) + break; + + A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo); + A ^= loadroundkey(rk32); + A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo); + A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]); + A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]); + x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]); + } + x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo); + x ^= loadroundkey(rk32); + return vqtbl1q_u8(x, sr[rmod4]); +} + +uint8x16_t +aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds) +{ + const uint32_t *rk32 = dec->aesd_aes.aes_rk; + unsigned i = 3 & ~(nrounds - 1); + uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv; + uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva; + uint8x16_t io, jo, mc; + + x = aes_schedule_transform(x, dipt); + x ^= 
loadroundkey(rk32); + rk32 += 4; + + mc = mc_forward[3]; + for (;;) { + subbytes(&io, &jo, x, inv_, inva_); + if (--nrounds == 0) + break; + + x = vqtbl1q_u8(dsb9[0], io) ^ vqtbl1q_u8(dsb9[1], jo); + x ^= loadroundkey(rk32); + rk32 += 4; /* next round key */ + + x = vqtbl1q_u8(x, mc); + x ^= vqtbl1q_u8(dsbd[0], io) ^ vqtbl1q_u8(dsbd[1], jo); + + x = vqtbl1q_u8(x, mc); + x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo); + + x = vqtbl1q_u8(x, mc); + x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo); + + mc = vextq_u8(mc, mc, 12); + } + x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo); + x ^= loadroundkey(rk32); + return vqtbl1q_u8(x, sr[i]); +} Index: src/sys/crypto/aes/arch/arm/aes_neon.h diff -u /dev/null src/sys/crypto/aes/arch/arm/aes_neon.h:1.1 --- /dev/null Mon Jun 29 23:56:31 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon.h Mon Jun 29 23:56:31 2020 @@ -0,0 +1,62 @@ +/* $NetBSD: aes_neon.h,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_H +#define _CRYPTO_AES_ARCH_ARM_AES_NEON_H + +#include <crypto/aes/aes.h> + +/* + * These functions MUST NOT use any vector registers for parameters or + * results -- the caller is compiled with -mfloat-abi=soft in the + * kernel, and dynamically turns on the vector unit just before calling + * them. Internal subroutines that pass vector parameters are declared + * in aes_neon_impl.h instead. 
+ */ + +void aes_neon_setenckey(struct aesenc *, const uint8_t *, unsigned); +void aes_neon_setdeckey(struct aesdec *, const uint8_t *, unsigned); + +void aes_neon_enc(const struct aesenc *, const uint8_t[static 16], + uint8_t[static 16], uint32_t); +void aes_neon_dec(const struct aesdec *, const uint8_t[static 16], + uint8_t[static 16], uint32_t); +void aes_neon_cbc_enc(const struct aesenc *, const uint8_t[static 16], + uint8_t[static 16], size_t, uint8_t[static 16], uint32_t); +void aes_neon_cbc_dec(const struct aesdec *, const uint8_t[static 16], + uint8_t[static 16], size_t, uint8_t[static 16], uint32_t); +void aes_neon_xts_enc(const struct aesenc *, const uint8_t[static 16], + uint8_t[static 16], size_t, uint8_t[static 16], uint32_t); +void aes_neon_xts_dec(const struct aesdec *, const uint8_t[static 16], + uint8_t[static 16], size_t, uint8_t[static 16], uint32_t); + +int aes_neon_selftest(void); + +extern struct aes_impl aes_neon_impl; + +#endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_H */ Index: src/sys/crypto/aes/arch/arm/aes_neon_impl.c diff -u /dev/null src/sys/crypto/aes/arch/arm/aes_neon_impl.c:1.1 --- /dev/null Mon Jun 29 23:56:31 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_impl.c Mon Jun 29 23:56:31 2020 @@ -0,0 +1,178 @@ +/* $NetBSD: aes_neon_impl.c,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(1, "$NetBSD: aes_neon_impl.c,v 1.1 2020/06/29 23:56:31 riastradh Exp $"); + +#include <sys/types.h> +#include <sys/proc.h> + +#include <crypto/aes/aes.h> +#include <crypto/aes/arch/arm/aes_neon.h> + +#include <arm/fpu.h> + +#ifdef __aarch64__ +#include <aarch64/armreg.h> +#else +#include <arm/locore.h> +#endif + +static void +aes_neon_setenckey_impl(struct aesenc *enc, const uint8_t *key, + uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_neon_setenckey(enc, key, nrounds); + fpu_kern_leave(); +} + +static void +aes_neon_setdeckey_impl(struct aesdec *dec, const uint8_t *key, + uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_neon_setdeckey(dec, key, nrounds); + fpu_kern_leave(); +} + +static void +aes_neon_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_neon_enc(enc, in, out, nrounds); + fpu_kern_leave(); +} + +static void +aes_neon_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_neon_dec(dec, in, out, nrounds); + fpu_kern_leave(); +} + +static void +aes_neon_cbc_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + + if (nbytes == 0) + return; + fpu_kern_enter(); + aes_neon_cbc_enc(enc, in, out, nbytes, iv, nrounds); + fpu_kern_leave(); +} + +static void +aes_neon_cbc_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + + if (nbytes == 0) + return; + fpu_kern_enter(); + aes_neon_cbc_dec(dec, in, out, nbytes, iv, nrounds); + fpu_kern_leave(); +} + +static void +aes_neon_xts_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + + if (nbytes == 0) + return; + fpu_kern_enter(); + aes_neon_xts_enc(enc, in, out, nbytes, iv, nrounds); + fpu_kern_leave(); +} + +static void +aes_neon_xts_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + + if (nbytes == 0) + return; + fpu_kern_enter(); + aes_neon_xts_dec(dec, in, out, nbytes, iv, nrounds); + fpu_kern_leave(); +} + +static int +aes_neon_probe(void) +{ +#ifdef __aarch64__ + struct aarch64_sysctl_cpu_id *id; +#endif + int result = 0; + + /* Verify that the CPU supports NEON. 
*/ +#ifdef __aarch64__ + id = &curcpu()->ci_id; + switch (__SHIFTOUT(id->ac_aa64pfr0, ID_AA64PFR0_EL1_ADVSIMD)) { + case ID_AA64PFR0_EL1_ADV_SIMD_IMPL: + break; + default: + return -1; + } +#else + if (!cpu_neon_present) + return -1; +#endif + + fpu_kern_enter(); + result = aes_neon_selftest(); + fpu_kern_leave(); + + return result; +} + +struct aes_impl aes_neon_impl = { + .ai_name = "ARM NEON vpaes", + .ai_probe = aes_neon_probe, + .ai_setenckey = aes_neon_setenckey_impl, + .ai_setdeckey = aes_neon_setdeckey_impl, + .ai_enc = aes_neon_enc_impl, + .ai_dec = aes_neon_dec_impl, + .ai_cbc_enc = aes_neon_cbc_enc_impl, + .ai_cbc_dec = aes_neon_cbc_dec_impl, + .ai_xts_enc = aes_neon_xts_enc_impl, + .ai_xts_dec = aes_neon_xts_dec_impl, +}; Index: src/sys/crypto/aes/arch/arm/aes_neon_impl.h diff -u /dev/null src/sys/crypto/aes/arch/arm/aes_neon_impl.h:1.1 --- /dev/null Mon Jun 29 23:56:31 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_impl.h Mon Jun 29 23:56:31 2020 @@ -0,0 +1,42 @@ +/* $NetBSD: aes_neon_impl.h,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H +#define _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H + +#include <sys/types.h> + +#include "arm_neon.h" + +#include <crypto/aes/aes.h> +#include <crypto/aes/arch/arm/aes_neon.h> + +uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned); +uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned); + +#endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H */ Index: src/sys/crypto/aes/arch/arm/aes_neon_subr.c diff -u /dev/null src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.1 --- /dev/null Mon Jun 29 23:56:31 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c Mon Jun 29 23:56:31 2020 @@ -0,0 +1,218 @@ +/* $NetBSD: aes_neon_subr.c,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.1 2020/06/29 23:56:31 riastradh Exp $"); + +#include <sys/systm.h> + +#include <lib/libkern/libkern.h> + +#include <crypto/aes/arch/arm/aes_neon.h> + +#include "aes_neon_impl.h" + +static inline uint8x16_t +loadblock(const void *in) +{ + return vld1q_u8(in); +} + +static inline void +storeblock(void *out, uint8x16_t block) +{ + vst1q_u8(out, block); +} + +void +aes_neon_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + uint8x16_t block; + + block = loadblock(in); + block = aes_neon_enc1(enc, block, nrounds); + storeblock(out, block); +} + +void +aes_neon_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + uint8x16_t block; + + block = loadblock(in); + block = aes_neon_dec1(dec, block, nrounds); + storeblock(out, block); +} + +void +aes_neon_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + uint8x16_t cv; + + KASSERT(nbytes); + + cv = loadblock(iv); + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + cv ^= loadblock(in); + cv = aes_neon_enc1(enc, cv, nrounds); + storeblock(out, cv); + } + storeblock(iv, cv); +} + +void +aes_neon_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + uint8x16_t iv0, cv, b; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + iv0 = loadblock(iv); + cv = loadblock(in + nbytes - 16); + storeblock(iv, cv); + + for (;;) { + b = aes_neon_dec1(dec, cv, nrounds); + if ((nbytes -= 16) == 0) + break; + cv = loadblock(in + nbytes - 16); + storeblock(out + nbytes, b ^ cv); + } + storeblock(out, b ^ iv0); +} + +static inline uint8x16_t +aes_neon_xts_update(uint8x16_t t8) +{ + const int32x4_t zero = vdupq_n_s32(0); + const int32x4_t carry = {0x87, 1, 1, 1}; + int32x4_t t, t_; + uint32x4_t mask; + + t = vreinterpretq_s32_u8(t8); + mask = vcltq_s32(t, zero); /* -1 if high bit set else 0 */ + mask = vextq_u32(mask, mask, 3); /* rotate quarters */ + t_ = vsliq_n_s32(zero, t, 1); /* shift */ + t_ ^= carry & mask; + + return vreinterpretq_u8_s32(t_); +} + +static int +aes_neon_xts_update_selftest(void) +{ + static const struct { + uint32_t in[4], out[4]; + 
} cases[] = { + [0] = { {1}, {2} }, + [1] = { {0x80000000U,0,0,0}, {0,1,0,0} }, + [2] = { {0,0x80000000U,0,0}, {0,0,1,0} }, + [3] = { {0,0,0x80000000U,0}, {0,0,0,1} }, + [4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} }, + [5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} }, + }; + unsigned i; + uint32_t t[4]; + int result = 0; + + for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { + t[0] = cases[i].in[0]; + t[1] = cases[i].in[1]; + t[2] = cases[i].in[2]; + t[3] = cases[i].in[3]; + storeblock(t, aes_neon_xts_update(loadblock(t))); + if (t[0] != cases[i].out[0] || + t[1] != cases[i].out[1] || + t[2] != cases[i].out[2] || + t[3] != cases[i].out[3]) { + printf("%s %u:" + " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n", + __func__, i, t[0], t[1], t[2], t[3]); + result = -1; + } + } + + return result; +} + +void +aes_neon_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], + uint32_t nrounds) +{ + uint8x16_t t, b; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + t = loadblock(tweak); + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + b = t ^ loadblock(in); + b = aes_neon_enc1(enc, b, nrounds); + storeblock(out, t ^ b); + t = aes_neon_xts_update(t); + } + storeblock(tweak, t); +} + +void +aes_neon_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], + uint32_t nrounds) +{ + uint8x16_t t, b; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + t = loadblock(tweak); + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + b = t ^ loadblock(in); + b = aes_neon_dec1(dec, b, nrounds); + storeblock(out, t ^ b); + t = aes_neon_xts_update(t); + } + storeblock(tweak, t); +} + +int +aes_neon_selftest(void) +{ + + if (aes_neon_xts_update_selftest()) + return -1; + + return 0; +} Index: src/sys/crypto/aes/arch/arm/arm_neon.h diff -u /dev/null src/sys/crypto/aes/arch/arm/arm_neon.h:1.1 --- /dev/null Mon Jun 29 23:56:31 2020 +++ src/sys/crypto/aes/arch/arm/arm_neon.h Mon Jun 29 23:56:31 2020 @@ -0,0 +1,405 @@ +/* $NetBSD: arm_neon.h,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H +#define _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H + +#if defined(__GNUC__) && !defined(__clang__) + +#define _INTRINSATTR \ + __extension__ \ + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + +#ifdef __aarch64__ +typedef __Int32x4_t int32x4_t; +typedef __Int64x2_t int64x2_t; +typedef __Int8x16_t int8x16_t; +typedef __Uint32x4_t uint32x4_t; +typedef __Uint64x2_t uint64x2_t; +typedef __Uint8x16_t uint8x16_t; +#else +typedef __simd128_int32_t int32x4_t; +typedef __simd128_int64_t int64x2_t; +typedef __simd128_int8_t int8x16_t; +typedef __simd128_uint32_t uint32x4_t; +typedef __simd128_uint64_t uint64x2_t; +typedef __simd128_uint8_t uint8x16_t; + +typedef __simd64_int8_t int8x8_t; +typedef __simd64_uint8_t uint8x8_t; +typedef __builtin_neon_udi uint64x1_t; +typedef struct { uint8x8_t val[2]; } uint8x8x2_t; +#endif + +#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) +#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) +#else +#define __neon_lane_index(__v, __i) __i +#endif + +#elif defined(__clang__) + +#define _INTRINSATTR \ + __attribute__((__always_inline__, __nodebug)) + +typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; +typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t; +typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t; +typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t; +typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; +typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t; +typedef struct { uint8x8_t val[2]; } uint8x8x2_t; + +#ifdef __LITTLE_ENDIAN__ +#define __neon_lane_index(__v, __i) __i +#else +#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) +#endif + +#else + +#error Teach me how to neon in your compile! 
+
+#endif
+
+_INTRINSATTR
+static __inline uint32x4_t
+vcltq_s32(int32x4_t __v0, int32x4_t __v1)
+{
+	return (uint32x4_t)(__v0 < __v1);
+}
+
+_INTRINSATTR
+static __inline int32x4_t
+vdupq_n_s32(int32_t __x)
+{
+	return (int32x4_t) { __x, __x, __x, __x };
+}
+
+_INTRINSATTR
+static __inline uint32x4_t
+vdupq_n_u32(uint32_t __x)
+{
+	return (uint32x4_t) { __x, __x, __x, __x };
+}
+
+_INTRINSATTR
+static __inline uint8x16_t
+vdupq_n_u8(uint8_t __x)
+{
+	return (uint8x16_t) {
+		__x, __x, __x, __x, __x, __x, __x, __x,
+		__x, __x, __x, __x, __x, __x, __x, __x,
+	};
+}
+
+_INTRINSATTR
+static __inline uint32x4_t
+vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
+	return __builtin_shuffle(__hi, __lo,
+	    (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i });
+#else
+	return __builtin_shuffle(__lo, __hi,
+	    (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 });
+#endif
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+	return __builtin_neon_vextq_v((int8x16_t)__lo, (int8x16_t)__hi, __i,
+	    50);
+#else
+	uint32x4_t __lo_r = __builtin_shufflevector(__lo, __lo, 3, 2, 1, 0);
+	uint32x4_t __hi_r = __builtin_shufflevector(__hi, __hi, 3, 2, 1, 0);
+	uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,
+	    (int8x16_t)__hi_r, __i, 50);
+	return __builtin_shufflevector(__r, __r, 3, 2, 1, 0);
+#endif
+#endif
+}
+
+_INTRINSATTR
+static __inline uint8x16_t
+vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
+	return __builtin_shuffle(__hi, __lo,
+	    (uint8x16_t) {
+		16 - __i, 17 - __i, 18 - __i, 19 - __i,
+		20 - __i, 21 - __i, 22 - __i, 23 - __i,
+		24 - __i, 25 - __i, 26 - __i, 27 - __i,
+		28 - __i, 29 - __i, 30 - __i, 31 - __i,
+	});
+#else
+	return __builtin_shuffle(__lo, __hi,
+	    (uint8x16_t) {
+		__i + 0, __i + 1, __i + 2, __i + 3,
+		__i + 4, __i + 5, __i + 6, __i + 7,
+		__i + 8, __i + 9, __i + 10, __i + 11,
+		__i + 12, __i + 13, __i + 14, __i + 15,
+	});
+#endif
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+	return __builtin_neon_vextq_v((int8x16_t)__lo, (int8x16_t)__hi, __i,
+	    48);
+#else
+	uint8x16_t __lo_r = __builtin_shufflevector(__lo, __lo,
+	    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+	uint8x16_t __hi_r = __builtin_shufflevector(__hi, __hi,
+	    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+	uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,
+	    (int8x16_t)__hi_r, __i, 48);
+	return __builtin_shufflevector(__r, __r,
+	    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+#endif
+#endif
+}
+
+_INTRINSATTR
+static __inline uint32_t
+vgetq_lane_u32(uint32x4_t __v, uint8_t __i)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#ifdef __aarch64__
+	return __v[__i];
+#else
+	return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i);
+#endif
+#elif defined(__clang__)
+	return (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)__v,
+	    __neon_lane_index(__v, __i));
+#endif
+}
+
+_INTRINSATTR
+static __inline uint8x16_t
+vld1q_u8(const uint8_t *__p8)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#ifdef __aarch64__
+	const __builtin_aarch64_simd_qi *__p =
+	    (const __builtin_aarch64_simd_qi *)__p8;
+
+	return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p);
+#else
+	const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8;
+
+	return (uint8x16_t)__builtin_neon_vld1v16qi(__p);
+#endif
+#elif defined(__clang__)
+	return (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48);
+#endif
+}
+
+_INTRINSATTR
+static __inline uint8x16_t
+vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#ifdef __aarch64__
+	uint8x16_t __res;
+	__asm__("tbl %0.16b, {%1.16b}, %2.16b"
+	    : "=w"(__res) : "w"(__tab), "w"(__idx));
+	return __res;
+#else
+	/*
+	 * No native ARMv7 NEON instruction for this, so do it via two
+	 * half-width TBLs instead (vtbl2_u8 equivalent).
+	 */
+	uint64x2_t __tab64 = (uint64x2_t)__tab;
+	uint8x8_t __tablo = (uint8x8_t)__tab64[0];
+	uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
+	uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } };
+	union {
+		uint8x8x2_t __u8x8x2;
+		__builtin_neon_ti __ti;
+	} __u = { __tab8x8x2 };
+	uint64x2_t __idx64, __out64;
+	int8x8_t __idxlo, __idxhi, __outlo, __outhi;
+
+	__idx64 = (uint64x2_t)__idx;
+	__idxlo = (int8x8_t)__idx64[0];
+	__idxhi = (int8x8_t)__idx64[1];
+	__outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo);
+	__outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi);
+	__out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi };
+
+	return (uint8x16_t)__out64;
+#endif
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+	return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab,
+	    (int8x16_t)__idx, 48);
+#else
+	uint8x16_t __tab_r = __builtin_shufflevector(__tab, __tab,
+	    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+	uint8x16_t __idx_r = __builtin_shufflevector(__idx, __idx,
+	    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+	uint8x16_t __r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab_r,
+	    (int8x16_t)__idx_r, 48);
+	return __builtin_shufflevector(__r, __r,
+	    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+#endif
+#endif
+}
+
+_INTRINSATTR
+static __inline int32x4_t
+vreinterpretq_s32_u8(uint8x16_t __v)
+{
+	return (int32x4_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint32x4_t
+vreinterpretq_u32_u8(uint8x16_t __v)
+{
+	return (uint32x4_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint64x2_t
+vreinterpretq_u64_u8(uint8x16_t __v)
+{
+	return (uint64x2_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint8x16_t
+vreinterpretq_u8_s32(int32x4_t __v)
+{
+	return (uint8x16_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint8x16_t
+vreinterpretq_u8_u32(uint32x4_t __v)
+{
+	return (uint8x16_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint8x16_t
+vreinterpretq_u8_u64(uint64x2_t __v)
+{
+	return (uint8x16_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint32x4_t
+vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	__v[__neon_lane_index(__v, __i)] = __x;
+	return __v;
+#elif defined(__clang__)
+	return (uint32x4_t)__builtin_neon_vsetq_lane_i32(__x, (int32x4_t)__v,
+	    __neon_lane_index(__v, __i));
+#endif
+}
+
+_INTRINSATTR
+static __inline uint64x2_t
+vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	__v[__neon_lane_index(__v, __i)] = __x;
+	return __v;
+#elif defined(__clang__)
+	return (uint64x2_t)__builtin_neon_vsetq_lane_i64(__x, (int64x2_t)__v,
+	    __neon_lane_index(__v, __i));
+#endif
+}
+
+_INTRINSATTR
+static __inline uint8x16_t
+vshrq_n_u8(uint8x16_t __v, uint8_t __bits)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#ifdef __aarch64__
+	return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits);
+#else
+	return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits);
+#endif
+#elif defined(__clang__)
+	return (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)__v, __bits,
+	    48);
+#endif
+}
+
+_INTRINSATTR
+static __inline int32x4_t
+vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#ifdef __aarch64__
+	return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits);
+#else
+	return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits);
+#endif
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+	return __builtin_neon_vsliq_n_v(__vins, __vsh, __bits, 34);
+#else
+	int32x4_t __vins_r = __builtin_shufflevector(__vins, __vins,
+	    3, 2, 1, 0);
+	int32x4_t __vsh_r = __builtin_shufflevector(__vsh, __vsh,
+	    3, 2, 1, 0);
+	int32x4_t __r = __builtin_neon_vsliq_n_v(__vins_r, __vsh_r, __bits,
+	    34);
+	return __builtin_shufflevector(__r, __r, 3, 2, 1, 0);
+#endif
+#endif
+}
+
+_INTRINSATTR
+static __inline void
+vst1q_u8(uint8_t *__p8, uint8x16_t __v)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#ifdef __aarch64__
+	__builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8;
+
+	__builtin_aarch64_st1v16qi(__p, (int8x16_t)__v);
+#else
+	__builtin_neon_qi *__p = (__builtin_neon_qi *)__p8;
+
+	__builtin_neon_vst1v16qi(__p, (int8x16_t)__v);
+#endif
+#elif defined(__clang__)
+#ifndef __LITTLE_ENDIAN__
+	__v = __builtin_shufflevector(__v, __v,
+	    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+#endif
+	__builtin_neon_vst1q_v(__p8, __v, 48);
+#endif
+}
+
+#endif	/* _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H */

Index: src/sys/crypto/aes/arch/arm/files.aesneon
diff -u /dev/null src/sys/crypto/aes/arch/arm/files.aesneon:1.1
--- /dev/null	Mon Jun 29 23:56:31 2020
+++ src/sys/crypto/aes/arch/arm/files.aesneon	Mon Jun 29 23:56:31 2020
@@ -0,0 +1,13 @@
+# $NetBSD: files.aesneon,v 1.1 2020/06/29 23:56:31 riastradh Exp $
+
+ifdef aarch64
+makeoptions	aes	"COPTS.aes_neon.c"+="-march=armv8-a"
+makeoptions	aes	"COPTS.aes_neon_subr.c"+="-march=armv8-a"
+else
+makeoptions	aes	"COPTS.aes_neon.c"+="-mfloat-abi=softfp -mfpu=neon"
+makeoptions	aes	"COPTS.aes_neon_subr.c"+="-mfloat-abi=softfp -mfpu=neon"
+endif
+
+file	crypto/aes/arch/arm/aes_neon.c		aes
+file	crypto/aes/arch/arm/aes_neon_impl.c	aes
+file	crypto/aes/arch/arm/aes_neon_subr.c	aes
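
A note on the TBL semantics the vpaes tables rely on: vqtbl1q_u8
selects tab[idx[i]] for each output byte when idx[i] < 16 and yields
zero otherwise, which is how entries like 0x80 in the inv/inva tables
can force a zero result on a subsequent lookup.  A portable model of
the operation (illustrative only, not part of the commit):

	#include <stdint.h>

	/* Model of vqtbl1q_u8: out-of-range indices select zero. */
	static void
	tbl16(uint8_t out[static 16], const uint8_t tab[static 16],
	    const uint8_t idx[static 16])
	{
		unsigned i;

		for (i = 0; i < 16; i++)
			out[i] = idx[i] < 16 ? tab[idx[i]] : 0;
	}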
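The heart of the vpaes approach, visible in bytes2nybbles() and
aes_schedule_transform(): a GF(2)-linear transformation of bytes can be
evaluated with two 16-entry tables, one applied to the low nybble and
one to the high nybble, whose outputs are XORed.  A standalone userland
demonstration using xtime (multiplication by x in AES's GF(2^8), which
is GF(2)-linear) as the transformation; the program and its names are
illustrative, not part of the commit:

	#include <stdint.h>
	#include <stdio.h>

	/* Multiply by x in GF(2^8) modulo the AES polynomial 0x11b. */
	static uint8_t
	xtime(uint8_t b)
	{
		return (b << 1) ^ (b & 0x80 ? 0x1b : 0);
	}

	int
	main(void)
	{
		uint8_t lo[16], hi[16];
		unsigned x;

		/* One 16-entry table per nybble, as in ipt/opt above. */
		for (x = 0; x < 16; x++) {
			lo[x] = xtime(x);
			hi[x] = xtime(x << 4);
		}
		/* Linearity: xtime(x) = lo[x & 0xf] ^ hi[x >> 4]. */
		for (x = 0; x < 256; x++) {
			if ((lo[x & 0xf] ^ hi[x >> 4]) != xtime(x)) {
				printf("mismatch at %u\n", x);
				return 1;
			}
		}
		printf("ok\n");
		return 0;
	}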
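Finally, aes_neon_xts_update() implements the standard XTS tweak step:
multiply the 128-bit tweak by x in GF(2^128), reducing by
x^128 + x^7 + x^2 + x + 1, i.e. folding the bit shifted out of the top
back in as the constant 0x87.  The same step in portable C, with lanes
laid out as in the selftest vectors above (it maps {0,0,0,0x80000000}
to {0x87,0,0,0}); a sketch, not part of the commit:

	#include <stdint.h>

	/* Multiply a 128-bit XTS tweak by x; t[0] is the low 32 bits. */
	static void
	xts_update(uint32_t t[static 4])
	{
		uint32_t carry = t[3] >> 31;	/* bit shifted out the top */

		t[3] = (t[3] << 1) | (t[2] >> 31);
		t[2] = (t[2] << 1) | (t[1] >> 31);
		t[1] = (t[1] << 1) | (t[0] >> 31);
		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
	}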