Module Name:	src
Committed By:	riastradh
Date:		Mon Jun 29 23:47:54 UTC 2020
Modified Files:
	src/sys/arch/x86/conf: files.x86
	src/sys/arch/x86/x86: identcpu.c
	src/sys/crypto/aes: aes.h

Added Files:
	src/sys/crypto/aes/arch/x86: aes_sse2.c aes_sse2.h aes_sse2_dec.c
	    aes_sse2_enc.c aes_sse2_impl.c aes_sse2_impl.h files.aessse2
	    immintrin.h immintrin_ext.h

Log Message:
New SSE2-based bitsliced AES implementation.

This should work on essentially all x86 CPUs of the last two decades,
and may improve throughput over the portable C aes_ct implementation
from BearSSL by (a) reducing the number of vector operations in
sequence, and (b) batching four rather than two blocks in parallel.

Derived from BearSSL's aes_ct64 implementation, adjusted so that where
aes_ct64 uses 64-bit q[0],...,q[7], aes_sse2 uses (q[0], q[4]), ...,
(q[3], q[7]), each tuple representing a pair of 64-bit quantities
stacked in a single 128-bit register.  This translation was done very
naively, and mostly reduces the cost of ShiftRows and data movement
without doing anything to address the S-box or (Inv)MixColumns, which
spread all 64-bit quantities across separate registers and ignore the
upper halves.

Unfortunately, SSE2 -- which is all that is guaranteed on all amd64
CPUs -- doesn't have PSHUFB, which would help out a lot more.  For
example, vpaes relies on that.  Perhaps there are enough CPUs out
there with PSHUFB but not AES-NI to make it worthwhile to import or
adapt vpaes too.

Note: This includes local definitions of various Intel compiler
intrinsics for gcc and clang in terms of their __builtin_* &c.,
because the necessary header files are not available during the
kernel build.  This is a kludge -- it is expedient for now, but it
should be fixed properly.


To generate a diff of this commit:
cvs rdiff -u -r1.114 -r1.115 src/sys/arch/x86/conf/files.x86
cvs rdiff -u -r1.109 -r1.110 src/sys/arch/x86/x86/identcpu.c
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/aes.h
cvs rdiff -u -r0 -r1.1 src/sys/crypto/aes/arch/x86/aes_sse2.c \
    src/sys/crypto/aes/arch/x86/aes_sse2.h \
    src/sys/crypto/aes/arch/x86/aes_sse2_dec.c \
    src/sys/crypto/aes/arch/x86/aes_sse2_enc.c \
    src/sys/crypto/aes/arch/x86/aes_sse2_impl.c \
    src/sys/crypto/aes/arch/x86/aes_sse2_impl.h \
    src/sys/crypto/aes/arch/x86/files.aessse2 \
    src/sys/crypto/aes/arch/x86/immintrin.h \
    src/sys/crypto/aes/arch/x86/immintrin_ext.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
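As a rough sketch of the pairing described in the log message above --
using only intrinsics from the immintrin.h added by this commit, and
with illustrative helper names (pack_pair, unpack_pair) that are not
part of the commit -- one (q[i], q[i+4]) tuple is packed into and
pulled back out of a single 128-bit register like so:

	#include <stdint.h>
	#include "immintrin.h"	/* local header added by this commit */

	/*
	 * Illustrative only: pack aes_ct64-style 64-bit words q0 and q4
	 * into one 128-bit register (q0 in the low half, q4 in the high
	 * half), and pull them apart again where the S-box and
	 * (Inv)MixColumns still operate on the low 64 bits only.
	 */
	static inline __m128i
	pack_pair(uint64_t q0, uint64_t q4)
	{
		return _mm_set_epi64x((int64_t)q4, (int64_t)q0);
	}

	static inline void
	unpack_pair(__m128i q04, uint64_t *q0, uint64_t *q4)
	{
		_mm_storeu_si64(q0, q04);			/* low half */
		_mm_storeu_si64(q4, _mm_shuffle_epi32(q04, 0x0e)); /* high half */
	}

In the committed code the packing is done on registers with
_mm_unpacklo_epi64, and _mm_shuffle_epi32(q, 0x0e) moves the upper
64-bit half down into the position the S-box and (Inv)MixColumns read.
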
Modified files: Index: src/sys/arch/x86/conf/files.x86 diff -u src/sys/arch/x86/conf/files.x86:1.114 src/sys/arch/x86/conf/files.x86:1.115 --- src/sys/arch/x86/conf/files.x86:1.114 Mon Jun 29 23:39:30 2020 +++ src/sys/arch/x86/conf/files.x86 Mon Jun 29 23:47:54 2020 @@ -1,4 +1,4 @@ -# $NetBSD: files.x86,v 1.114 2020/06/29 23:39:30 riastradh Exp $ +# $NetBSD: files.x86,v 1.115 2020/06/29 23:47:54 riastradh Exp $ # options for MP configuration through the MP spec defflag opt_mpbios.h MPBIOS MPDEBUG MPBIOS_SCANPCI @@ -171,3 +171,6 @@ include "crypto/aes/arch/x86/files.aesni # VIA ACE include "crypto/aes/arch/x86/files.aesvia" + +# Bitsliced AES with SSE2 +include "crypto/aes/arch/x86/files.aessse2" Index: src/sys/arch/x86/x86/identcpu.c diff -u src/sys/arch/x86/x86/identcpu.c:1.109 src/sys/arch/x86/x86/identcpu.c:1.110 --- src/sys/arch/x86/x86/identcpu.c:1.109 Mon Jun 29 23:39:30 2020 +++ src/sys/arch/x86/x86/identcpu.c Mon Jun 29 23:47:54 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: identcpu.c,v 1.109 2020/06/29 23:39:30 riastradh Exp $ */ +/* $NetBSD: identcpu.c,v 1.110 2020/06/29 23:47:54 riastradh Exp $ */ /*- * Copyright (c) 1999, 2000, 2001, 2006, 2007, 2008 The NetBSD Foundation, Inc. @@ -30,7 +30,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.109 2020/06/29 23:39:30 riastradh Exp $"); +__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.110 2020/06/29 23:47:54 riastradh Exp $"); #include "opt_xen.h" @@ -40,6 +40,7 @@ __KERNEL_RCSID(0, "$NetBSD: identcpu.c,v #include <sys/cpu.h> #include <crypto/aes/arch/x86/aes_ni.h> +#include <crypto/aes/arch/x86/aes_sse2.h> #include <crypto/aes/arch/x86/aes_via.h> #include <uvm/uvm_extern.h> @@ -1005,6 +1006,8 @@ cpu_probe(struct cpu_info *ci) #endif if (cpu_feature[4] & CPUID_VIA_HAS_ACE) aes_md_init(&aes_via_impl); + else if (i386_has_sse && i386_has_sse2) + aes_md_init(&aes_sse2_impl); } else { /* * If not first. Warn about cpu_feature mismatch for Index: src/sys/crypto/aes/aes.h diff -u src/sys/crypto/aes/aes.h:1.1 src/sys/crypto/aes/aes.h:1.2 --- src/sys/crypto/aes/aes.h:1.1 Mon Jun 29 23:27:52 2020 +++ src/sys/crypto/aes/aes.h Mon Jun 29 23:47:54 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes.h,v 1.1 2020/06/29 23:27:52 riastradh Exp $ */ +/* $NetBSD: aes.h,v 1.2 2020/06/29 23:47:54 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -37,8 +37,9 @@ * * Expanded round keys. 
*/ -struct aes { +union aes { uint32_t aes_rk[60]; + uint64_t aes_rk64[30]; } __aligned(16); #define AES_128_NROUNDS 10 @@ -46,11 +47,11 @@ struct aes { #define AES_256_NROUNDS 14 struct aesenc { - struct aes aese_aes; + union aes aese_aes; }; struct aesdec { - struct aes aesd_aes; + union aes aesd_aes; }; struct aes_impl { Added files: Index: src/sys/crypto/aes/arch/x86/aes_sse2.c diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_sse2.c:1.1 --- /dev/null Mon Jun 29 23:47:55 2020 +++ src/sys/crypto/aes/arch/x86/aes_sse2.c Mon Jun 29 23:47:54 2020 @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2016 Thomas Pornin <por...@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(1, "$NetBSD: aes_sse2.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $"); + +#include <sys/types.h> + +#include <lib/libkern/libkern.h> + +#include "aes_sse2_impl.h" + +static void +br_range_dec32le(uint32_t *p32, size_t nwords, const void *v) +{ + const uint8_t *p8 = v; + + while (nwords --> 0) { + uint32_t x0 = *p8++; + uint32_t x1 = *p8++; + uint32_t x2 = *p8++; + uint32_t x3 = *p8++; + + *p32++ = x0 | (x1 << 8) | (x2 << 16) | (x3 << 24); + } +} + +void +aes_sse2_bitslice_Sbox(__m128i q[static 4]) +{ + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i y1, y2, y3, y4, y5, y6, y7, y8, y9; + __m128i y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; + __m128i y20, y21; + __m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; + __m128i z10, z11, z12, z13, z14, z15, z16, z17; + __m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; + __m128i t10, t11, t12, t13, t14, t15, t16, t17, t18, t19; + __m128i t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + __m128i t30, t31, t32, t33, t34, t35, t36, t37, t38, t39; + __m128i t40, t41, t42, t43, t44, t45, t46, t47, t48, t49; + __m128i t50, t51, t52, t53, t54, t55, t56, t57, t58, t59; + __m128i t60, t61, t62, t63, t64, t65, t66, t67; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = _mm_shuffle_epi32(q[3], 0x0e); + x1 = _mm_shuffle_epi32(q[2], 0x0e); + x2 = _mm_shuffle_epi32(q[1], 0x0e); + x3 = _mm_shuffle_epi32(q[0], 0x0e); + x4 = q[3]; + x5 = q[2]; + x6 = q[1]; + x7 = q[0]; + + /* + * Top linear transformation. 
+ */ + y14 = x3 ^ x5; + y13 = x0 ^ x6; + y9 = x0 ^ x3; + y8 = x0 ^ x5; + t0 = x1 ^ x2; + y1 = t0 ^ x7; + y4 = y1 ^ x3; + y12 = y13 ^ y14; + y2 = y1 ^ x0; + y5 = y1 ^ x6; + y3 = y5 ^ y8; + t1 = x4 ^ y12; + y15 = t1 ^ x5; + y20 = t1 ^ x1; + y6 = y15 ^ x7; + y10 = y15 ^ t0; + y11 = y20 ^ y9; + y7 = x7 ^ y11; + y17 = y10 ^ y11; + y19 = y10 ^ y8; + y16 = t0 ^ y11; + y21 = y13 ^ y16; + y18 = x0 ^ y16; + + /* + * Non-linear section. + */ + t2 = y12 & y15; + t3 = y3 & y6; + t4 = t3 ^ t2; + t5 = y4 & x7; + t6 = t5 ^ t2; + t7 = y13 & y16; + t8 = y5 & y1; + t9 = t8 ^ t7; + t10 = y2 & y7; + t11 = t10 ^ t7; + t12 = y9 & y11; + t13 = y14 & y17; + t14 = t13 ^ t12; + t15 = y8 & y10; + t16 = t15 ^ t12; + t17 = t4 ^ t14; + t18 = t6 ^ t16; + t19 = t9 ^ t14; + t20 = t11 ^ t16; + t21 = t17 ^ y20; + t22 = t18 ^ y19; + t23 = t19 ^ y21; + t24 = t20 ^ y18; + + t25 = t21 ^ t22; + t26 = t21 & t23; + t27 = t24 ^ t26; + t28 = t25 & t27; + t29 = t28 ^ t22; + t30 = t23 ^ t24; + t31 = t22 ^ t26; + t32 = t31 & t30; + t33 = t32 ^ t24; + t34 = t23 ^ t33; + t35 = t27 ^ t33; + t36 = t24 & t35; + t37 = t36 ^ t34; + t38 = t27 ^ t36; + t39 = t29 & t38; + t40 = t25 ^ t39; + + t41 = t40 ^ t37; + t42 = t29 ^ t33; + t43 = t29 ^ t40; + t44 = t33 ^ t37; + t45 = t42 ^ t41; + z0 = t44 & y15; + z1 = t37 & y6; + z2 = t33 & x7; + z3 = t43 & y16; + z4 = t40 & y1; + z5 = t29 & y7; + z6 = t42 & y11; + z7 = t45 & y17; + z8 = t41 & y10; + z9 = t44 & y12; + z10 = t37 & y3; + z11 = t33 & y4; + z12 = t43 & y13; + z13 = t40 & y5; + z14 = t29 & y2; + z15 = t42 & y9; + z16 = t45 & y14; + z17 = t41 & y8; + + /* + * Bottom linear transformation. + */ + t46 = z15 ^ z16; + t47 = z10 ^ z11; + t48 = z5 ^ z13; + t49 = z9 ^ z10; + t50 = z2 ^ z12; + t51 = z2 ^ z5; + t52 = z7 ^ z8; + t53 = z0 ^ z3; + t54 = z6 ^ z7; + t55 = z16 ^ z17; + t56 = z12 ^ t48; + t57 = t50 ^ t53; + t58 = z4 ^ t46; + t59 = z3 ^ t54; + t60 = t46 ^ t57; + t61 = z14 ^ t57; + t62 = t52 ^ t58; + t63 = t49 ^ t58; + t64 = z4 ^ t59; + t65 = t61 ^ t62; + t66 = z1 ^ t63; + s0 = t59 ^ t63; + s6 = t56 ^ ~t62; + s7 = t48 ^ ~t60; + t67 = t64 ^ t65; + s3 = t53 ^ t66; + s4 = t51 ^ t66; + s5 = t47 ^ t65; + s1 = t64 ^ ~s3; + s2 = t55 ^ ~t67; + + q[3] = _mm_unpacklo_epi64(s4, s0); + q[2] = _mm_unpacklo_epi64(s5, s1); + q[1] = _mm_unpacklo_epi64(s6, s2); + q[0] = _mm_unpacklo_epi64(s7, s3); +} + +void +aes_sse2_ortho(__m128i q[static 4]) +{ +#define SWAPN(cl, ch, s, x, y) do { \ + __m128i a, b; \ + a = (x); \ + b = (y); \ + (x) = (a & _mm_set1_epi64x(cl)) | \ + _mm_slli_epi64(b & _mm_set1_epi64x(cl), (s)); \ + (y) = _mm_srli_epi64(a & _mm_set1_epi64x(ch), (s)) | \ + (b & _mm_set1_epi64x(ch)); \ + } while (0) + +#define SWAP2(x, y) SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 1, x, y) +#define SWAP4(x, y) SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC, 2, x, y) +#define SWAP8(x, y) SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 4, x, y) + + SWAP2(q[0], q[1]); + SWAP2(q[2], q[3]); + + SWAP4(q[0], q[2]); + SWAP4(q[1], q[3]); + + __m128i q0 = q[0]; + __m128i q1 = q[1]; + __m128i q2 = q[2]; + __m128i q3 = q[3]; + __m128i q4 = _mm_shuffle_epi32(q[0], 0x0e); + __m128i q5 = _mm_shuffle_epi32(q[1], 0x0e); + __m128i q6 = _mm_shuffle_epi32(q[2], 0x0e); + __m128i q7 = _mm_shuffle_epi32(q[3], 0x0e); + SWAP8(q0, q4); + SWAP8(q1, q5); + SWAP8(q2, q6); + SWAP8(q3, q7); + q[0] = _mm_unpacklo_epi64(q0, q4); + q[1] = _mm_unpacklo_epi64(q1, q5); + q[2] = _mm_unpacklo_epi64(q2, q6); + q[3] = _mm_unpacklo_epi64(q3, q7); +} + +__m128i +aes_sse2_interleave_in(__m128i w) +{ + __m128i lo, hi; + + lo = _mm_shuffle_epi32(w, 0x10); + hi 
= _mm_shuffle_epi32(w, 0x32); + lo &= _mm_set1_epi64x(0x00000000FFFFFFFF); + hi &= _mm_set1_epi64x(0x00000000FFFFFFFF); + lo |= _mm_slli_epi64(lo, 16); + hi |= _mm_slli_epi64(hi, 16); + lo &= _mm_set1_epi32(0x0000FFFF); + hi &= _mm_set1_epi32(0x0000FFFF); + lo |= _mm_slli_epi64(lo, 8); + hi |= _mm_slli_epi64(hi, 8); + lo &= _mm_set1_epi16(0x00FF); + hi &= _mm_set1_epi16(0x00FF); + return lo | _mm_slli_epi64(hi, 8); +} + +__m128i +aes_sse2_interleave_out(__m128i q) +{ + __m128i lo, hi; + + lo = q; + hi = _mm_srli_si128(q, 1); + lo &= _mm_set1_epi16(0x00FF); + hi &= _mm_set1_epi16(0x00FF); + lo |= _mm_srli_epi64(lo, 8); + hi |= _mm_srli_epi64(hi, 8); + lo &= _mm_set1_epi32(0x0000FFFF); + hi &= _mm_set1_epi32(0x0000FFFF); + lo |= _mm_srli_epi64(lo, 16); + hi |= _mm_srli_epi64(hi, 16); + return (__m128i)_mm_shuffle_ps((__m128)lo, (__m128)hi, 0x88); +} + +static const unsigned char Rcon[] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 +}; + +static uint32_t +sub_word(uint32_t x) +{ + __m128i q[4]; + uint32_t y; + + memset(q, 0, sizeof(q)); + q[0] = _mm_loadu_si32(&x); + aes_sse2_ortho(q); + aes_sse2_bitslice_Sbox(q); + aes_sse2_ortho(q); + _mm_storeu_si32(&y, q[0]); + return y; +} + +unsigned +aes_sse2_keysched(uint64_t *comp_skey, const void *key, size_t key_len) +{ + unsigned num_rounds; + int i, j, k, nk, nkf; + uint32_t tmp; + uint32_t skey[60]; + + switch (key_len) { + case 16: + num_rounds = 10; + break; + case 24: + num_rounds = 12; + break; + case 32: + num_rounds = 14; + break; + default: + /* abort(); */ + return 0; + } + nk = (int)(key_len >> 2); + nkf = (int)((num_rounds + 1) << 2); + br_range_dec32le(skey, (key_len >> 2), key); + tmp = skey[(key_len >> 2) - 1]; + for (i = nk, j = 0, k = 0; i < nkf; i ++) { + if (j == 0) { + tmp = (tmp << 24) | (tmp >> 8); + tmp = sub_word(tmp) ^ Rcon[k]; + } else if (nk > 6 && j == 4) { + tmp = sub_word(tmp); + } + tmp ^= skey[i - nk]; + skey[i] = tmp; + if (++ j == nk) { + j = 0; + k ++; + } + } + + for (i = 0, j = 0; i < nkf; i += 4, j += 2) { + __m128i q[4], q0, q1, q2, q3, q4, q5, q6, q7; + __m128i w; + + w = _mm_loadu_epi8(skey + i); + q[0] = q[1] = q[2] = q[3] = aes_sse2_interleave_in(w); + aes_sse2_ortho(q); + q0 = q[0] & _mm_set1_epi64x(0x1111111111111111); + q1 = q[1] & _mm_set1_epi64x(0x2222222222222222); + q2 = q[2] & _mm_set1_epi64x(0x4444444444444444); + q3 = q[3] & _mm_set1_epi64x(0x8888888888888888); + q4 = _mm_shuffle_epi32(q0, 0x0e); + q5 = _mm_shuffle_epi32(q1, 0x0e); + q6 = _mm_shuffle_epi32(q2, 0x0e); + q7 = _mm_shuffle_epi32(q3, 0x0e); + _mm_storeu_si64(&comp_skey[j + 0], q0 | q1 | q2 | q3); + _mm_storeu_si64(&comp_skey[j + 1], q4 | q5 | q6 | q7); + } + return num_rounds; +} + +void +aes_sse2_skey_expand(uint64_t *skey, + unsigned num_rounds, const uint64_t *comp_skey) +{ + unsigned u, v, n; + + n = (num_rounds + 1) << 1; + for (u = 0, v = 0; u < n; u ++, v += 4) { + __m128i x0, x1, x2, x3; + + x0 = x1 = x2 = x3 = _mm_loadu_si64(&comp_skey[u]); + x0 &= 0x1111111111111111; + x1 &= 0x2222222222222222; + x2 &= 0x4444444444444444; + x3 &= 0x8888888888888888; + x1 = _mm_srli_epi64(x1, 1); + x2 = _mm_srli_epi64(x2, 2); + x3 = _mm_srli_epi64(x3, 3); + x0 = _mm_sub_epi64(_mm_slli_epi64(x0, 4), x0); + x1 = _mm_sub_epi64(_mm_slli_epi64(x1, 4), x1); + x2 = _mm_sub_epi64(_mm_slli_epi64(x2, 4), x2); + x3 = _mm_sub_epi64(_mm_slli_epi64(x3, 4), x3); + _mm_storeu_si64(&skey[v + 0], x0); + _mm_storeu_si64(&skey[v + 1], x1); + _mm_storeu_si64(&skey[v + 2], x2); + _mm_storeu_si64(&skey[v + 3], x3); + } +} Index: 
src/sys/crypto/aes/arch/x86/aes_sse2.h diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_sse2.h:1.1 --- /dev/null Mon Jun 29 23:47:55 2020 +++ src/sys/crypto/aes/arch/x86/aes_sse2.h Mon Jun 29 23:47:54 2020 @@ -0,0 +1,36 @@ +/* $NetBSD: aes_sse2.h,v 1.1 2020/06/29 23:47:54 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _CRYPTO_AES_ARCH_X86_AES_SSE2_H +#define _CRYPTO_AES_ARCH_X86_AES_SSE2_H + +#include <crypto/aes/aes.h> + +extern struct aes_impl aes_sse2_impl; + +#endif /* _CRYPTO_AES_ARCH_X86_AES_SSE2_H */ Index: src/sys/crypto/aes/arch/x86/aes_sse2_dec.c diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_sse2_dec.c:1.1 --- /dev/null Mon Jun 29 23:47:55 2020 +++ src/sys/crypto/aes/arch/x86/aes_sse2_dec.c Mon Jun 29 23:47:54 2020 @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2016 Thomas Pornin <por...@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(1, "$NetBSD: aes_sse2_dec.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $"); + +#include <sys/types.h> + +#include "aes_sse2_impl.h" + +/* see inner.h */ +void +aes_sse2_bitslice_invSbox(__m128i q[static 4]) +{ + /* + * See br_aes_ct_bitslice_invSbox(). This is the natural extension + * to 64-bit registers. + */ + __m128i q0, q1, q2, q3, q4, q5, q6, q7; + + q0 = ~q[0]; + q1 = ~q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = _mm_shuffle_epi32(q[0], 0x0e); + q5 = _mm_shuffle_epi32(~q[1], 0x0e); + q6 = _mm_shuffle_epi32(~q[2], 0x0e); + q7 = _mm_shuffle_epi32(q[3], 0x0e); + + q[3] = _mm_unpacklo_epi64(q5 ^ q0 ^ q2, q1 ^ q4 ^ q6); + q[2] = _mm_unpacklo_epi64(q4 ^ q7 ^ q1, q0 ^ q3 ^ q5); + q[1] = _mm_unpacklo_epi64(q3 ^ q6 ^ q0, q7 ^ q2 ^ q4); + q[0] = _mm_unpacklo_epi64(q2 ^ q5 ^ q7, q6 ^ q1 ^ q3); + + aes_sse2_bitslice_Sbox(q); + + q0 = ~q[0]; + q1 = ~q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = _mm_shuffle_epi32(q[0], 0x0e); + q5 = _mm_shuffle_epi32(~q[1], 0x0e); + q6 = _mm_shuffle_epi32(~q[2], 0x0e); + q7 = _mm_shuffle_epi32(q[3], 0x0e); + + q[3] = _mm_unpacklo_epi64(q5 ^ q0 ^ q2, q1 ^ q4 ^ q6); + q[2] = _mm_unpacklo_epi64(q4 ^ q7 ^ q1, q0 ^ q3 ^ q5); + q[1] = _mm_unpacklo_epi64(q3 ^ q6 ^ q0, q7 ^ q2 ^ q4); + q[0] = _mm_unpacklo_epi64(q2 ^ q5 ^ q7, q6 ^ q1 ^ q3); +} + +static inline void +add_round_key(__m128i q[static 4], const uint64_t sk[static 8]) +{ + q[0] ^= _mm_set_epi64x(sk[4], sk[0]); + q[1] ^= _mm_set_epi64x(sk[5], sk[1]); + q[2] ^= _mm_set_epi64x(sk[6], sk[2]); + q[3] ^= _mm_set_epi64x(sk[7], sk[3]); +} + +static inline __m128i +inv_shift_row(__m128i q) +{ + __m128i x, y0, y1, y2, y3, y4, y5, y6; + + x = q; + y0 = x & _mm_set1_epi64x(0x000000000000FFFF); + y1 = x & _mm_set1_epi64x(0x000000000FFF0000); + y2 = x & _mm_set1_epi64x(0x00000000F0000000); + y3 = x & _mm_set1_epi64x(0x000000FF00000000); + y4 = x & _mm_set1_epi64x(0x0000FF0000000000); + y5 = x & _mm_set1_epi64x(0x000F000000000000); + y6 = x & _mm_set1_epi64x(0xFFF0000000000000); + y1 = _mm_slli_epi64(y1, 4); + y2 = _mm_srli_epi64(y2, 12); + y3 = _mm_slli_epi64(y3, 8); + y4 = _mm_srli_epi64(y4, 8); + y5 = _mm_slli_epi64(y5, 12); + y6 = _mm_srli_epi64(y6, 4); + return y0 | y1 | y2 | y3 | y4 | y5 | y6; +} + +static inline void +inv_shift_rows(__m128i q[static 4]) +{ + + q[0] = inv_shift_row(q[0]); + q[1] = inv_shift_row(q[1]); + q[2] = inv_shift_row(q[2]); + q[3] = inv_shift_row(q[3]); +} + +static inline __m128i +rotr32(__m128i x) +{ + return _mm_slli_epi64(x, 32) | _mm_srli_epi64(x, 32); +} + +static inline void +inv_mix_columns(__m128i q[4]) +{ + __m128i q0, q1, q2, q3, q4, q5, q6, q7; + __m128i r0, r1, r2, r3, r4, r5, r6, r7; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + r0 = _mm_srli_epi64(q0, 16) | _mm_slli_epi64(q0, 48); + r1 = _mm_srli_epi64(q1, 16) | _mm_slli_epi64(q1, 48); + r2 = _mm_srli_epi64(q2, 16) | _mm_slli_epi64(q2, 48); + r3 = _mm_srli_epi64(q3, 16) | _mm_slli_epi64(q3, 48); + + q7 = _mm_shuffle_epi32(q3, 0x0e); + q6 = _mm_shuffle_epi32(q2, 0x0e); + q5 = _mm_shuffle_epi32(q1, 0x0e); + q4 = _mm_shuffle_epi32(q0, 0x0e); + + r7 = _mm_shuffle_epi32(r3, 0x0e); + r6 = _mm_shuffle_epi32(r2, 0x0e); + r5 = _mm_shuffle_epi32(r1, 0x0e); + r4 = _mm_shuffle_epi32(r0, 0x0e); + + s0 = q5 ^ q6 ^ q7 ^ r0 ^ r5 ^ r7 ^ rotr32(q0 ^ q5 ^ q6 ^ r0 ^ r5); + s1 = q0 ^ q5 ^ r0 ^ r1 ^ r5 ^ r6 ^ r7 ^ rotr32(q1 ^ q5 ^ q7 ^ r1 ^ r5 ^ r6); + s2 = q0 ^ q1 ^ q6 ^ r1 ^ r2 ^ r6 ^ r7 ^ rotr32(q0 ^ q2 ^ q6 ^ r2 ^ r6 ^ r7); + s3 = q0 ^ q1 ^ q2 ^ q5 ^ q6 ^ r0 ^ r2 ^ r3 
^ r5 ^ rotr32(q0 ^ q1 ^ q3 ^ q5 ^ q6 ^ q7 ^ r0 ^ r3 ^ r5 ^ r7); + s4 = q1 ^ q2 ^ q3 ^ q5 ^ r1 ^ r3 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr32(q1 ^ q2 ^ q4 ^ q5 ^ q7 ^ r1 ^ r4 ^ r5 ^ r6); + s5 = q2 ^ q3 ^ q4 ^ q6 ^ r2 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr32(q2 ^ q3 ^ q5 ^ q6 ^ r2 ^ r5 ^ r6 ^ r7); + s6 = q3 ^ q4 ^ q5 ^ q7 ^ r3 ^ r5 ^ r6 ^ r7 ^ rotr32(q3 ^ q4 ^ q6 ^ q7 ^ r3 ^ r6 ^ r7); + s7 = q4 ^ q5 ^ q6 ^ r4 ^ r6 ^ r7 ^ rotr32(q4 ^ q5 ^ q7 ^ r4 ^ r7); + + q[0] = _mm_unpacklo_epi64(s0, s4); + q[1] = _mm_unpacklo_epi64(s1, s5); + q[2] = _mm_unpacklo_epi64(s2, s6); + q[3] = _mm_unpacklo_epi64(s3, s7); +} + +/* see inner.h */ +void +aes_sse2_bitslice_decrypt(unsigned num_rounds, + const uint64_t *skey, __m128i q[static 4]) +{ + unsigned u; + + add_round_key(q, skey + (num_rounds << 3)); + for (u = num_rounds - 1; u > 0; u --) { + inv_shift_rows(q); + aes_sse2_bitslice_invSbox(q); + add_round_key(q, skey + (u << 3)); + inv_mix_columns(q); + } + inv_shift_rows(q); + aes_sse2_bitslice_invSbox(q); + add_round_key(q, skey); +} Index: src/sys/crypto/aes/arch/x86/aes_sse2_enc.c diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_sse2_enc.c:1.1 --- /dev/null Mon Jun 29 23:47:55 2020 +++ src/sys/crypto/aes/arch/x86/aes_sse2_enc.c Mon Jun 29 23:47:54 2020 @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2016 Thomas Pornin <por...@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(1, "$NetBSD: aes_sse2_enc.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $"); + +#include <sys/types.h> + +#include "aes_sse2_impl.h" + +static inline void +add_round_key(__m128i q[static 4], const uint64_t sk[static 8]) +{ + q[0] ^= _mm_set_epi64x(sk[4], sk[0]); + q[1] ^= _mm_set_epi64x(sk[5], sk[1]); + q[2] ^= _mm_set_epi64x(sk[6], sk[2]); + q[3] ^= _mm_set_epi64x(sk[7], sk[3]); +} + +static inline __m128i +shift_row(__m128i q) +{ + __m128i x, y0, y1, y2, y3, y4, y5, y6; + + x = q; + y0 = x & _mm_set1_epi64x(0x000000000000FFFF); + y1 = x & _mm_set1_epi64x(0x00000000FFF00000); + y2 = x & _mm_set1_epi64x(0x00000000000F0000); + y3 = x & _mm_set1_epi64x(0x0000FF0000000000); + y4 = x & _mm_set1_epi64x(0x000000FF00000000); + y5 = x & _mm_set1_epi64x(0xF000000000000000); + y6 = x & _mm_set1_epi64x(0x0FFF000000000000); + y1 = _mm_srli_epi64(y1, 4); + y2 = _mm_slli_epi64(y2, 12); + y3 = _mm_srli_epi64(y3, 8); + y4 = _mm_slli_epi64(y4, 8); + y5 = _mm_srli_epi64(y5, 12); + y6 = _mm_slli_epi64(y6, 4); + return y0 | y1 | y2 | y3 | y4 | y5 | y6; +} + +static inline void +shift_rows(__m128i q[static 4]) +{ + + q[0] = shift_row(q[0]); + q[1] = shift_row(q[1]); + q[2] = shift_row(q[2]); + q[3] = shift_row(q[3]); +} + +static inline __m128i +rotr32(__m128i x) +{ + return _mm_slli_epi64(x, 32) | _mm_srli_epi64(x, 32); +} + +static inline void +mix_columns(__m128i q[static 4]) +{ + __m128i q0, q1, q2, q3, q4, q5, q6, q7; + __m128i r0, r1, r2, r3, r4, r5, r6, r7; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + r0 = _mm_srli_epi64(q0, 16) | _mm_slli_epi64(q0, 48); + r1 = _mm_srli_epi64(q1, 16) | _mm_slli_epi64(q1, 48); + r2 = _mm_srli_epi64(q2, 16) | _mm_slli_epi64(q2, 48); + r3 = _mm_srli_epi64(q3, 16) | _mm_slli_epi64(q3, 48); + + q7 = _mm_shuffle_epi32(q3, 0x0e); + q6 = _mm_shuffle_epi32(q2, 0x0e); + q5 = _mm_shuffle_epi32(q1, 0x0e); + q4 = _mm_shuffle_epi32(q0, 0x0e); + + r7 = _mm_shuffle_epi32(r3, 0x0e); + r6 = _mm_shuffle_epi32(r2, 0x0e); + r5 = _mm_shuffle_epi32(r1, 0x0e); + r4 = _mm_shuffle_epi32(r0, 0x0e); + + s0 = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0); + s1 = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1); + s2 = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2); + s3 = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3); + s4 = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4); + s5 = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5); + s6 = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6); + s7 = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7); + + q[0] = _mm_unpacklo_epi64(s0, s4); + q[1] = _mm_unpacklo_epi64(s1, s5); + q[2] = _mm_unpacklo_epi64(s2, s6); + q[3] = _mm_unpacklo_epi64(s3, s7); +} + +void +aes_sse2_bitslice_encrypt(unsigned num_rounds, + const uint64_t *skey, __m128i q[static 4]) +{ + unsigned u; + + add_round_key(q, skey); + for (u = 1; u < num_rounds; u ++) { + aes_sse2_bitslice_Sbox(q); + shift_rows(q); + mix_columns(q); + add_round_key(q, skey + (u << 3)); + } + aes_sse2_bitslice_Sbox(q); + shift_rows(q); + add_round_key(q, skey + (num_rounds << 3)); +} Index: src/sys/crypto/aes/arch/x86/aes_sse2_impl.c diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_sse2_impl.c:1.1 --- /dev/null Mon Jun 29 23:47:55 2020 +++ src/sys/crypto/aes/arch/x86/aes_sse2_impl.c Mon Jun 29 23:47:54 2020 @@ -0,0 +1,611 @@ +/* $NetBSD: aes_sse2_impl.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(1, "$NetBSD: aes_sse2_impl.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $"); + +#include <sys/types.h> +#include <sys/endian.h> +#include <sys/systm.h> + +#include <crypto/aes/aes.h> +#include <crypto/aes/arch/x86/aes_sse2.h> + +#include <x86/cpu.h> +#include <x86/cpuvar.h> +#include <x86/fpu.h> +#include <x86/specialreg.h> + +#include "aes_sse2_impl.h" + +static void +aes_sse2_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds) +{ + size_t key_len; + + switch (nrounds) { + case 10: + key_len = 16; + break; + case 12: + key_len = 24; + break; + case 14: + key_len = 32; + break; + default: + panic("invalid AES nrounds: %u", nrounds); + } + + fpu_kern_enter(); + aes_sse2_keysched(rk, key, key_len); + fpu_kern_leave(); +} + +static void +aes_sse2_setenckey(struct aesenc *enc, const uint8_t *key, uint32_t nrounds) +{ + + aes_sse2_setkey(enc->aese_aes.aes_rk64, key, nrounds); +} + +static void +aes_sse2_setdeckey(struct aesdec *dec, const uint8_t *key, uint32_t nrounds) +{ + + /* + * BearSSL computes InvMixColumns on the fly -- no need for + * distinct decryption round keys. + */ + aes_sse2_setkey(dec->aesd_aes.aes_rk64, key, nrounds); +} + +static void +aes_sse2_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + uint64_t sk_exp[120]; + __m128i q[4]; + + fpu_kern_enter(); + + /* Expand round keys for bitslicing. */ + aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); + + /* Load input block interleaved with garbage blocks. */ + q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in)); + q[1] = q[2] = q[3] = _mm_setzero_si128(); + + /* Transform to bitslice, decrypt, transform from bitslice. */ + aes_sse2_ortho(q); + aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_ortho(q); + + /* Store output block. */ + _mm_storeu_epi8(out, aes_sse2_interleave_out(q[0])); + + /* Paranoia: Zero temporary buffers. 
*/ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); + + fpu_kern_leave(); +} + +static void +aes_sse2_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + uint64_t sk_exp[120]; + __m128i q[4]; + + fpu_kern_enter(); + + /* Expand round keys for bitslicing. */ + aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); + + /* Load input block interleaved with garbage blocks. */ + q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in)); + q[1] = q[2] = q[3] = _mm_setzero_si128(); + + /* Transform to bitslice, decrypt, transform from bitslice. */ + aes_sse2_ortho(q); + aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); + aes_sse2_ortho(q); + + /* Store output block. */ + _mm_storeu_epi8(out, aes_sse2_interleave_out(q[0])); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); + + fpu_kern_leave(); +} + +static void +aes_sse2_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + uint64_t sk_exp[120]; + __m128i q[4]; + __m128i cv; + + KASSERT(nbytes % 16 == 0); + + /* Skip if there's nothing to do. */ + if (nbytes == 0) + return; + + fpu_kern_enter(); + + /* Expand round keys for bitslicing. */ + aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); + + /* Load the IV. */ + cv = _mm_loadu_epi8(iv); + + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + /* Load input block and apply CV. */ + q[0] = aes_sse2_interleave_in(cv ^ _mm_loadu_epi8(in)); + + /* Transform to bitslice, encrypt, transform from bitslice. */ + aes_sse2_ortho(q); + aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_ortho(q); + + /* Remember ciphertext as CV and store output block. */ + cv = aes_sse2_interleave_out(q[0]); + _mm_storeu_epi8(out, cv); + } + + /* Store updated IV. */ + _mm_storeu_epi8(iv, cv); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); + + fpu_kern_leave(); +} + +static void +aes_sse2_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16], + uint32_t nrounds) +{ + uint64_t sk_exp[120]; + __m128i q[4]; + __m128i cv, iv, w; + + KASSERT(nbytes % 16 == 0); + + /* Skip if there's nothing to do. */ + if (nbytes == 0) + return; + + fpu_kern_enter(); + + /* Expand round keys for bitslicing. */ + aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); + + /* Load the IV. */ + iv = _mm_loadu_epi8(ivp); + + /* Load the last cipher block. */ + cv = _mm_loadu_epi8(in + nbytes - 16); + + /* Store the updated IV. */ + _mm_storeu_epi8(ivp, cv); + + /* Process the last blocks if not an even multiple of four. */ + if (nbytes % (4*16)) { + unsigned n = (nbytes/16) % 4; + + KASSERT(n > 0); + KASSERT(n < 4); + + q[1] = q[2] = q[3] = _mm_setzero_si128(); + q[n - 1] = aes_sse2_interleave_in(cv); + switch (nbytes % 64) { + case 48: + w = _mm_loadu_epi8(in + nbytes - 32); + q[1] = aes_sse2_interleave_in(w); + /*FALLTHROUGH*/ + case 32: + w = _mm_loadu_epi8(in + nbytes - 48); + q[0] = aes_sse2_interleave_in(w); + /*FALLTHROUGH*/ + case 16: + break; + } + + /* Decrypt. 
*/ + aes_sse2_ortho(q); + aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); + aes_sse2_ortho(q); + + do { + n--; + w = aes_sse2_interleave_out(q[n]); + if ((nbytes -= 16) == 0) + goto out; + cv = _mm_loadu_epi8(in + nbytes - 16); + _mm_storeu_epi8(out + nbytes, w ^ cv); + } while (n); + } + + for (;;) { + KASSERT(nbytes >= 64); + nbytes -= 64; + + /* + * 1. Set up upper cipher block from cv. + * 2. Load lower cipher block into cv and set it up. + * 3. Decrypt. + */ + q[3] = aes_sse2_interleave_in(cv); + + w = _mm_loadu_epi8(in + nbytes + 4*8); + q[2] = aes_sse2_interleave_in(w); + + w = _mm_loadu_epi8(in + nbytes + 4*4); + q[1] = aes_sse2_interleave_in(w); + + w = _mm_loadu_epi8(in + nbytes + 4*0); + q[0] = aes_sse2_interleave_in(w); + + aes_sse2_ortho(q); + aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); + aes_sse2_ortho(q); + + /* Store the upper output block. */ + w = aes_sse2_interleave_out(q[3]); + cv = _mm_loadu_epi8(in + nbytes + 4*8); + _mm_storeu_epi8(out + nbytes + 4*12, w ^ cv); + + /* Store the middle output blocks. */ + w = aes_sse2_interleave_out(q[2]); + cv = _mm_loadu_epi8(in + nbytes + 4*4); + _mm_storeu_epi8(out + nbytes + 4*8, w ^ cv); + + w = aes_sse2_interleave_out(q[1]); + cv = _mm_loadu_epi8(in + nbytes + 4*0); + _mm_storeu_epi8(out + nbytes + 4*4, w ^ cv); + + /* + * Get the first output block, but don't load the CV + * yet -- it might be the previous ciphertext block, or + * it might be the IV. + */ + w = aes_sse2_interleave_out(q[0]); + + /* Stop if we've reached the first output block. */ + if (nbytes == 0) + goto out; + + /* + * Load the preceding cipher block, and apply it as the + * chaining value to this one. + */ + cv = _mm_loadu_epi8(in + nbytes - 16); + _mm_storeu_epi8(out + nbytes, w ^ cv); + } + +out: /* Store the first output block. */ + _mm_storeu_epi8(out, w ^ iv); + + /* Paranoia: Zero temporary buffers. 
*/ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); + + fpu_kern_leave(); +} + +static inline __m128i +aes_sse2_xts_update(__m128i t) +{ + const __m128i one = _mm_set_epi64x(1, 1); + __m128i s, m, c; + + s = _mm_srli_epi64(t, 63); /* 1 if high bit set else 0 */ + m = _mm_sub_epi64(s, one); /* 0 if high bit set else -1 */ + m = _mm_shuffle_epi32(m, 0x4e); /* swap halves */ + c = _mm_set_epi64x(1, 0x87); /* carry */ + + return _mm_slli_epi64(t, 1) ^ (c & ~m); +} + +static int +aes_sse2_xts_update_selftest(void) +{ + static const struct { + uint32_t in[4], out[4]; + } cases[] = { + [0] = { {1}, {2} }, + [1] = { {0x80000000U,0,0,0}, {0,1,0,0} }, + [2] = { {0,0x80000000U,0,0}, {0,0,1,0} }, + [3] = { {0,0,0x80000000U,0}, {0,0,0,1} }, + [4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} }, + [5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} }, + }; + unsigned i; + uint32_t t[4]; + int result = 0; + + for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { + t[0] = cases[i].in[0]; + t[1] = cases[i].in[1]; + t[2] = cases[i].in[2]; + t[3] = cases[i].in[3]; + _mm_storeu_epi8(t, aes_sse2_xts_update(_mm_loadu_epi8(t))); + if (t[0] != cases[i].out[0] || + t[1] != cases[i].out[1] || + t[2] != cases[i].out[2] || + t[3] != cases[i].out[3]) { + printf("%s %u:" + " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n", + __func__, i, t[0], t[1], t[2], t[3]); + result = -1; + } + } + + return result; +} + +static void +aes_sse2_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], + uint32_t nrounds) +{ + uint64_t sk_exp[120]; + __m128i q[4]; + __m128i w; + __m128i t[5]; + unsigned i; + + KASSERT(nbytes % 16 == 0); + + /* Skip if there's nothing to do. */ + if (nbytes == 0) + return; + + fpu_kern_enter(); + + /* Expand round keys for bitslicing. */ + aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); + + /* Load tweak. */ + t[0] = _mm_loadu_epi8(tweak); + + /* Handle the first block separately if odd number. */ + if (nbytes % (4*16)) { + /* Load up the tweaked inputs. */ + for (i = 0; i < (nbytes/16) % 4; i++) { + w = _mm_loadu_epi8(in + 16*i) ^ t[i]; + q[i] = aes_sse2_interleave_in(w); + t[i + 1] = aes_sse2_xts_update(t[i]); + } + for (; i < 4; i++) + q[i] = _mm_setzero_si128(); + + /* Encrypt up to four blocks. */ + aes_sse2_ortho(q); + aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_ortho(q); + + /* Store the tweaked outputs. */ + for (i = 0; i < (nbytes/16) % 4; i++) { + w = aes_sse2_interleave_out(q[i]); + _mm_storeu_epi8(out + 16*i, w ^ t[i]); + } + + /* Advance to the next block. */ + t[0] = t[i]; + in += nbytes % (4*16); + out += nbytes % (4*16); + nbytes -= nbytes % (4*16); + if (nbytes == 0) + goto out; + } + + do { + KASSERT(nbytes % 64 == 0); + KASSERT(nbytes >= 64); + + /* Load up the tweaked inputs. */ + for (i = 0; i < 4; i++) { + w = _mm_loadu_epi8(in + 16*i) ^ t[i]; + q[i] = aes_sse2_interleave_in(w); + t[i + 1] = aes_sse2_xts_update(t[i]); + } + + /* Encrypt four blocks. */ + aes_sse2_ortho(q); + aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_ortho(q); + + /* Store the tweaked outputs. */ + for (i = 0; i < 4; i++) { + w = aes_sse2_interleave_out(q[i]); + _mm_storeu_epi8(out + 16*i, w ^ t[i]); + } + + /* Advance to the next block. */ + t[0] = t[4]; + in += 64; + out += 64; + nbytes -= 64; + } while (nbytes); + +out: /* Store the updated tweak. */ + _mm_storeu_epi8(tweak, t[0]); + + /* Paranoia: Zero temporary buffers. 
*/ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); + explicit_memset(t, 0, sizeof t); + + fpu_kern_leave(); +} + +static void +aes_sse2_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], + uint32_t nrounds) +{ + uint64_t sk_exp[120]; + __m128i q[4]; + __m128i w; + __m128i t[5]; + unsigned i; + + KASSERT(nbytes % 16 == 0); + + /* Skip if there's nothing to do. */ + if (nbytes == 0) + return; + + fpu_kern_enter(); + + /* Expand round keys for bitslicing. */ + aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); + + /* Load tweak. */ + t[0] = _mm_loadu_epi8(tweak); + + /* Handle the first block separately if odd number. */ + if (nbytes % (4*16)) { + /* Load up the tweaked inputs. */ + for (i = 0; i < (nbytes/16) % 4; i++) { + w = _mm_loadu_epi8(in + 16*i) ^ t[i]; + q[i] = aes_sse2_interleave_in(w); + t[i + 1] = aes_sse2_xts_update(t[i]); + } + for (; i < 4; i++) + q[i] = _mm_setzero_si128(); + + /* Decrypt up to four blocks. */ + aes_sse2_ortho(q); + aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); + aes_sse2_ortho(q); + + /* Store the tweaked outputs. */ + for (i = 0; i < (nbytes/16) % 4; i++) { + w = aes_sse2_interleave_out(q[i]); + _mm_storeu_epi8(out + 16*i, w ^ t[i]); + } + + /* Advance to the next block. */ + t[0] = t[i]; + in += nbytes % (4*16); + out += nbytes % (4*16); + nbytes -= nbytes % (4*16); + if (nbytes == 0) + goto out; + } + + do { + KASSERT(nbytes % 64 == 0); + KASSERT(nbytes >= 64); + + /* Load up the tweaked inputs. */ + for (i = 0; i < 4; i++) { + w = _mm_loadu_epi8(in + 16*i) ^ t[i]; + q[i] = aes_sse2_interleave_in(w); + t[i + 1] = aes_sse2_xts_update(t[i]); + } + + /* Decrypt four blocks. */ + aes_sse2_ortho(q); + aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); + aes_sse2_ortho(q); + + /* Store the tweaked outputs. */ + for (i = 0; i < 4; i++) { + w = aes_sse2_interleave_out(q[i]); + _mm_storeu_epi8(out + 16*i, w ^ t[i]); + } + + /* Advance to the next block. */ + t[0] = t[4]; + in += 64; + out += 64; + nbytes -= 64; + } while (nbytes); + +out: /* Store the updated tweak. */ + _mm_storeu_epi8(tweak, t[0]); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); + explicit_memset(t, 0, sizeof t); + + fpu_kern_leave(); +} + +static int +aes_sse2_probe(void) +{ + int result = 0; + + /* Verify that the CPU supports SSE and SSE2. 
*/ + if (!i386_has_sse) + return -1; + if (!i386_has_sse2) + return -1; + + fpu_kern_enter(); + + if (aes_sse2_xts_update_selftest()) + result = -1; + + fpu_kern_leave(); + + /* XXX test aes_sse2_bitslice_decrypt */ + /* XXX test aes_sse2_bitslice_encrypt */ + /* XXX test aes_sse2_keysched */ + /* XXX test aes_sse2_ortho */ + /* XXX test aes_sse2_skey_expand */ + + return result; +} + +struct aes_impl aes_sse2_impl = { + .ai_name = "Intel SSE2 bitsliced", + .ai_probe = aes_sse2_probe, + .ai_setenckey = aes_sse2_setenckey, + .ai_setdeckey = aes_sse2_setdeckey, + .ai_enc = aes_sse2_enc, + .ai_dec = aes_sse2_dec, + .ai_cbc_enc = aes_sse2_cbc_enc, + .ai_cbc_dec = aes_sse2_cbc_dec, + .ai_xts_enc = aes_sse2_xts_enc, + .ai_xts_dec = aes_sse2_xts_dec, +}; Index: src/sys/crypto/aes/arch/x86/aes_sse2_impl.h diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_sse2_impl.h:1.1 --- /dev/null Mon Jun 29 23:47:55 2020 +++ src/sys/crypto/aes/arch/x86/aes_sse2_impl.h Mon Jun 29 23:47:54 2020 @@ -0,0 +1,47 @@ +/* $NetBSD: aes_sse2_impl.h,v 1.1 2020/06/29 23:47:54 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _CRYPTO_AES_ARCH_X86_AES_SSE2_IMPL_H +#define _CRYPTO_AES_ARCH_X86_AES_SSE2_IMPL_H + +#include <sys/types.h> + +#include <crypto/aes/arch/x86/immintrin.h> +#include <crypto/aes/arch/x86/immintrin_ext.h> + +void aes_sse2_bitslice_Sbox(__m128i[static 4]); +void aes_sse2_bitslice_invSbox(__m128i[static 4]); +void aes_sse2_ortho(__m128i[static 4]); +__m128i aes_sse2_interleave_in(__m128i); +__m128i aes_sse2_interleave_out(__m128i); +unsigned aes_sse2_keysched(uint64_t *, const void *, size_t); +void aes_sse2_skey_expand(uint64_t *, unsigned, const uint64_t *); +void aes_sse2_bitslice_encrypt(unsigned, const uint64_t *, __m128i[static 4]); +void aes_sse2_bitslice_decrypt(unsigned, const uint64_t *, __m128i[static 4]); + +#endif /* _CRYPTO_AES_ARCH_X86_AES_SSE2_IMPL_H */ Index: src/sys/crypto/aes/arch/x86/files.aessse2 diff -u /dev/null src/sys/crypto/aes/arch/x86/files.aessse2:1.1 --- /dev/null Mon Jun 29 23:47:55 2020 +++ src/sys/crypto/aes/arch/x86/files.aessse2 Mon Jun 29 23:47:54 2020 @@ -0,0 +1,11 @@ +# $NetBSD: files.aessse2,v 1.1 2020/06/29 23:47:54 riastradh Exp $ + +makeoptions aes "COPTS.aes_sse2.c"+="-msse2" +makeoptions aes "COPTS.aes_sse2_dec.c"+="-msse2" +makeoptions aes "COPTS.aes_sse2_enc.c"+="-msse2" +makeoptions aes "COPTS.aes_sse2_impl.c"+="-msse2" + +file crypto/aes/arch/x86/aes_sse2.c aes +file crypto/aes/arch/x86/aes_sse2_dec.c aes +file crypto/aes/arch/x86/aes_sse2_enc.c aes +file crypto/aes/arch/x86/aes_sse2_impl.c aes Index: src/sys/crypto/aes/arch/x86/immintrin.h diff -u /dev/null src/sys/crypto/aes/arch/x86/immintrin.h:1.1 --- /dev/null Mon Jun 29 23:47:55 2020 +++ src/sys/crypto/aes/arch/x86/immintrin.h Mon Jun 29 23:47:54 2020 @@ -0,0 +1,216 @@ +/* $NetBSD: immintrin.h,v 1.1 2020/06/29 23:47:54 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_H +#define _SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_H + +#include <sys/types.h> + +/* + * This kludgerous header file provides definitions for the Intel + * intrinsics that work with GCC and Clang, because <immintrin.h> is + * not available during the kernel build and arranging to make it + * available is complicated. 
Please fix this properly! + */ + +#if defined(__GNUC__) && !defined(__clang__) + +#define _INTRINSATTR \ + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +#define _PACKALIAS + +typedef float __m128 __attribute__((__vector_size__(16), __may_alias__)); +typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__)); +typedef long long __m128i_u + __attribute__((__vector_size__(16), __may_alias__, __aligned__(1))); +typedef long long __v2di __attribute__((__vector_size__(16))); +typedef unsigned long long __v2du __attribute__((__vector_size__(16))); +typedef int __v4si __attribute__((__vector_size__(16))); +typedef float __v4sf __attribute__((__vector_size__(16))); +typedef short __v8hi __attribute__((__vector_size__(16))); + +#elif defined(__clang__) + +typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16))); +typedef long long __m128i + __attribute__((__vector_size__(16), __aligned__(16))); +typedef long long __m128i_u + __attribute__((__vector_size__(16), __may_alias__, __aligned__(1))); +typedef long long __v2di __attribute__((__vector_size__(16))); +typedef unsigned long long __v2du __attribute__((__vector_size__(16))); +typedef int __v4si __attribute__((__vector_size__(16))); +typedef float __v4sf __attribute__((__vector_size__(16))); +typedef short __v8hi __attribute__((__vector_size__(16))); + +#define _INTRINSATTR \ + __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \ + __min_vector_width__(128))) +#define _PACKALIAS \ + __attribute__((__packed__, __may_alias__)) + +#else + +#error Please teach me how to do Intel intrinsics for your compiler! + +#endif + +_INTRINSATTR +static __inline __m128i +_mm_loadu_si32(const void *__p) +{ + int32_t __v = ((const struct { int32_t __v; } _PACKALIAS *)__p)->__v; + return __extension__ (__m128i)(__v4si){ __v, 0, 0, 0 }; +} + +_INTRINSATTR +static __inline __m128i +_mm_loadu_si64(const void *__p) +{ + int64_t __v = ((const struct { int64_t __v; } _PACKALIAS *)__p)->__v; + return __extension__ (__m128i)(__v2di){ __v, 0 }; +} + +_INTRINSATTR +static __inline __m128i +_mm_set1_epi16(int16_t __v) +{ + return __extension__ (__m128i)(__v8hi){ + __v, __v, __v, __v, __v, __v, __v, __v + }; +} + +_INTRINSATTR +static __inline __m128i +_mm_set1_epi32(int32_t __v) +{ + return __extension__ (__m128i)(__v4si){ __v, __v, __v, __v }; +} + +_INTRINSATTR +static __inline __m128i +_mm_set1_epi64x(int64_t __v) +{ + return __extension__ (__m128i)(__v2di){ __v, __v }; +} + +_INTRINSATTR +static __inline __m128i +_mm_set_epi32(int32_t __v3, int32_t __v2, int32_t __v1, int32_t __v0) +{ + return __extension__ (__m128i)(__v4si){ __v0, __v1, __v2, __v3 }; +} + +_INTRINSATTR +static __inline __m128i +_mm_set_epi64x(int64_t __v1, int64_t __v0) +{ + return __extension__ (__m128i)(__v2di){ __v0, __v1 }; +} + +_INTRINSATTR +static __inline __m128i +_mm_setzero_si128(void) +{ + return _mm_set1_epi64x(0); +} + +#define _mm_shuffle_epi32(v,m) \ + (__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m)) + +#define _mm_shuffle_ps(x,y,m) \ + (__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x), \ + (__v4sf)(__m128)(y), (int)(m)) \ + +_INTRINSATTR +static __inline __m128i +_mm_slli_epi64(__m128i __v, uint8_t __bits) +{ + return (__m128i)__builtin_ia32_psllqi128((__v2di)__v, (int)__bits); +} + +#if defined(__GNUC__) && !defined(__clang__) +#define _mm_slli_si128(v,bytes) \ + (__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(v), \ + 8*(int)(bytes)) +#elif defined(__clang__) +#define _mm_slli_si128(v,bytes) \ + 
(__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(v), \ + (int)(bytes)) +#endif + +_INTRINSATTR +static __inline __m128i +_mm_srli_epi64(__m128i __v, uint8_t __bits) +{ + return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits); +} + +#if defined(__GNUC__) && !defined(__clang__) +#define _mm_srli_si128(v,bytes) \ + (__m128i)__builtin_ia32_psrldqi128((__m128i)(v), 8*(int)(bytes)) +#elif defined(__clang__) +#define _mm_srli_si128(v,bytes) \ + (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(v), \ + (int)(bytes)); +#endif + +_INTRINSATTR +static __inline void +_mm_storeu_si32(void *__p, __m128i __v) +{ + ((struct { int32_t __v; } _PACKALIAS *)__p)->__v = ((__v4si)__v)[0]; +} + +_INTRINSATTR +static __inline void +_mm_storeu_si64(void *__p, __m128i __v) +{ + ((struct { int64_t __v; } _PACKALIAS *)__p)->__v = ((__v2di)__v)[0]; +} + +_INTRINSATTR +static __inline __m128i +_mm_sub_epi64(__m128i __x, __m128i __y) +{ + return (__m128i)((__v2du)__x - (__v2du)__y); +} + +_INTRINSATTR +static __inline __m128i +_mm_unpacklo_epi64(__m128i __lo, __m128i __hi) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__lo, + (__v2di)__hi); +#elif defined(__clang__) + return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi, + 0, 4, 1, 5); +#endif +} + +#endif /* _SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_H */ Index: src/sys/crypto/aes/arch/x86/immintrin_ext.h diff -u /dev/null src/sys/crypto/aes/arch/x86/immintrin_ext.h:1.1 --- /dev/null Mon Jun 29 23:47:55 2020 +++ src/sys/crypto/aes/arch/x86/immintrin_ext.h Mon Jun 29 23:47:54 2020 @@ -0,0 +1,48 @@ +/* $NetBSD: immintrin_ext.h,v 1.1 2020/06/29 23:47:54 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_EXT_H +#define _SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_EXT_H + +#include "immintrin.h" + +_INTRINSATTR +static __inline __m128i +_mm_loadu_epi8(const void *__p) +{ + return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v; +} + +_INTRINSATTR +static __inline void +_mm_storeu_epi8(void *__p, __m128i __v) +{ + ((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v; +} + +#endif /* _SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_EXT_H */
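
For reference, the constant 0x87 in aes_sse2_xts_update above is the
usual XTS reduction polynomial x^128 + x^7 + x^2 + x + 1.  A scalar
model of the same tweak update, consistent with the
aes_sse2_xts_update_selftest vectors (the name xts_update_scalar is
illustrative, not part of the commit):

	#include <stdint.h>

	/*
	 * Multiply the 128-bit tweak t (two little-endian 64-bit words)
	 * by x in GF(2^128), reducing modulo x^128 + x^7 + x^2 + x + 1.
	 */
	static inline void
	xts_update_scalar(uint64_t t[2])
	{
		uint64_t carry = t[1] >> 63;		/* bit 127 about to shift out */

		t[1] = (t[1] << 1) | (t[0] >> 63);	/* shift the whole tweak left by 1 */
		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);/* fold the carry back in */
	}

For example, a tweak with only bit 127 set maps to 0x87, matching
selftest case [4] above.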