Module Name:    src
Committed By:   riastradh
Date:           Mon Jun 29 23:47:54 UTC 2020

Modified Files:
        src/sys/arch/x86/conf: files.x86
        src/sys/arch/x86/x86: identcpu.c
        src/sys/crypto/aes: aes.h
Added Files:
        src/sys/crypto/aes/arch/x86: aes_sse2.c aes_sse2.h aes_sse2_dec.c
            aes_sse2_enc.c aes_sse2_impl.c aes_sse2_impl.h files.aessse2
            immintrin.h immintrin_ext.h

Log Message:
New SSE2-based bitsliced AES implementation.

This should work on essentially all x86 CPUs of the last two decades,
and may improve throughput over the portable C aes_ct implementation
from BearSSL by

(a) reducing the number of vector operations in sequence, and
(b) batching four rather than two blocks in parallel.
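
For illustration only (not part of this commit), the four-block batch
looks roughly like this with the functions added below; the helper name
encrypt4_sketch is hypothetical, and the real kernel code additionally
brackets the work with fpu_kern_enter()/fpu_kern_leave() and zeroes its
temporary buffers:

    /*
     * Hypothetical sketch: encrypt four independent blocks at once.
     * Assumes <crypto/aes/aes.h> and the new aes_sse2_impl.h.
     */
    static void
    encrypt4_sketch(const struct aesenc *enc, const uint8_t in[static 64],
        uint8_t out[static 64], uint32_t nrounds)
    {
            uint64_t sk_exp[120];   /* expanded bitsliced round keys */
            __m128i q[4];
            unsigned i;

            aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

            /* One 16-byte block per bitslice slot.  */
            for (i = 0; i < 4; i++)
                    q[i] = aes_sse2_interleave_in(_mm_loadu_epi8(in + 16*i));

            aes_sse2_ortho(q);      /* into bitsliced representation */
            aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
            aes_sse2_ortho(q);      /* back out of bitsliced representation */

            for (i = 0; i < 4; i++)
                    _mm_storeu_epi8(out + 16*i,
                        aes_sse2_interleave_out(q[i]));
    }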

Derived from BearSSL's aes_ct64 implementation, adjusted so that where
aes_ct64 uses 64-bit q[0],...,q[7], aes_sse2 uses (q[0], q[4]), ...,
(q[3], q[7]), each tuple representing a pair of 64-bit quantities
stacked in a single 128-bit register.  This translation was done very
naively, and mostly reduces the cost of ShiftRows and data movement
without doing anything to address the S-box or (Inv)MixColumns, which
spread all 64-bit quantities across separate registers and ignore the
upper halves.
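
Concretely (illustration only), each stacked pair can be split back
into aes_ct64-style 64-bit halves and re-stacked; the helper name is
hypothetical, but the idiom is the one the S-box and ShiftRows code
below uses:

    /* Split q[0] into its two 64-bit bitslice words and re-stack them. */
    static inline void
    split_and_restack_sketch(__m128i q[static 4])
    {
            __m128i lo, hi;

            lo = q[0];                              /* aes_ct64's q[0] */
            hi = _mm_shuffle_epi32(q[0], 0x0e);     /* aes_ct64's q[4] */

            /* ... operate on lo and hi, ignoring their upper halves ... */

            q[0] = _mm_unpacklo_epi64(lo, hi);      /* re-stack the pair */
    }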

Unfortunately, SSE2 -- which is all that is guaranteed on all amd64
CPUs -- doesn't have PSHUFB, which would help out a lot more.  For
example, vpaes relies on that.  Perhaps there are enough CPUs out
there with PSHUFB but not AES-NI to make it worthwhile to import or
adapt vpaes too.
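
For reference: PSHUFB -- SSSE3's _mm_shuffle_epi8 -- permutes the 16
bytes of a register through an in-register table lookup, so on a plain,
non-bitsliced, column-major AES state all of ShiftRows collapses into a
single instruction.  A hedged userland-style sketch, not part of this
commit and not possible with bare SSE2:

    #include <tmmintrin.h>  /* SSSE3 */

    /* ShiftRows on a column-major 16-byte AES state in one PSHUFB. */
    static inline __m128i
    shift_rows_pshufb_sketch(__m128i state)
    {
            const __m128i idx = _mm_set_epi8(
                11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0);
            return _mm_shuffle_epi8(state, idx);
    }

Contrast the mask-and-shift shift_row() in aes_sse2_enc.c below, which
needs many more SSE2 and/shift/or operations for the bitsliced
equivalent of the same step.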

Note: This includes local definitions of various Intel compiler
intrinsics for gcc and clang in terms of their __builtin_* &c.,
because the necessary header files are not available during the
kernel build.  This is a kludge -- expedient for now, but we should
fix it properly.
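
The pattern is the one in the new immintrin.h below -- GCC/Clang vector
extensions plus __builtin_ia32_* calls -- excerpted here with the GCC
variant shown and the _INTRINSATTR attributes trimmed (int64_t comes
from <sys/types.h>):

    typedef long long __m128i
        __attribute__((__vector_size__(16), __may_alias__));
    typedef long long __v2di __attribute__((__vector_size__(16)));
    typedef int __v4si __attribute__((__vector_size__(16)));

    static __inline __m128i
    _mm_set_epi64x(int64_t __v1, int64_t __v0)
    {
            return __extension__ (__m128i)(__v2di){ __v0, __v1 };
    }

    #define _mm_shuffle_epi32(v,m)                                      \
            (__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m))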


To generate a diff of this commit:
cvs rdiff -u -r1.114 -r1.115 src/sys/arch/x86/conf/files.x86
cvs rdiff -u -r1.109 -r1.110 src/sys/arch/x86/x86/identcpu.c
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/aes.h
cvs rdiff -u -r0 -r1.1 src/sys/crypto/aes/arch/x86/aes_sse2.c \
    src/sys/crypto/aes/arch/x86/aes_sse2.h \
    src/sys/crypto/aes/arch/x86/aes_sse2_dec.c \
    src/sys/crypto/aes/arch/x86/aes_sse2_enc.c \
    src/sys/crypto/aes/arch/x86/aes_sse2_impl.c \
    src/sys/crypto/aes/arch/x86/aes_sse2_impl.h \
    src/sys/crypto/aes/arch/x86/files.aessse2 \
    src/sys/crypto/aes/arch/x86/immintrin.h \
    src/sys/crypto/aes/arch/x86/immintrin_ext.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/x86/conf/files.x86
diff -u src/sys/arch/x86/conf/files.x86:1.114 src/sys/arch/x86/conf/files.x86:1.115
--- src/sys/arch/x86/conf/files.x86:1.114	Mon Jun 29 23:39:30 2020
+++ src/sys/arch/x86/conf/files.x86	Mon Jun 29 23:47:54 2020
@@ -1,4 +1,4 @@
-#	$NetBSD: files.x86,v 1.114 2020/06/29 23:39:30 riastradh Exp $
+#	$NetBSD: files.x86,v 1.115 2020/06/29 23:47:54 riastradh Exp $
 
 # options for MP configuration through the MP spec
 defflag opt_mpbios.h MPBIOS MPDEBUG MPBIOS_SCANPCI
@@ -171,3 +171,6 @@ include "crypto/aes/arch/x86/files.aesni
 
 # VIA ACE
 include "crypto/aes/arch/x86/files.aesvia"
+
+# Bitsliced AES with SSE2
+include "crypto/aes/arch/x86/files.aessse2"

Index: src/sys/arch/x86/x86/identcpu.c
diff -u src/sys/arch/x86/x86/identcpu.c:1.109 src/sys/arch/x86/x86/identcpu.c:1.110
--- src/sys/arch/x86/x86/identcpu.c:1.109	Mon Jun 29 23:39:30 2020
+++ src/sys/arch/x86/x86/identcpu.c	Mon Jun 29 23:47:54 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: identcpu.c,v 1.109 2020/06/29 23:39:30 riastradh Exp $	*/
+/*	$NetBSD: identcpu.c,v 1.110 2020/06/29 23:47:54 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2006, 2007, 2008 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.109 2020/06/29 23:39:30 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.110 2020/06/29 23:47:54 riastradh Exp $");
 
 #include "opt_xen.h"
 
@@ -40,6 +40,7 @@ __KERNEL_RCSID(0, "$NetBSD: identcpu.c,v
 #include <sys/cpu.h>
 
 #include <crypto/aes/arch/x86/aes_ni.h>
+#include <crypto/aes/arch/x86/aes_sse2.h>
 #include <crypto/aes/arch/x86/aes_via.h>
 
 #include <uvm/uvm_extern.h>
@@ -1005,6 +1006,8 @@ cpu_probe(struct cpu_info *ci)
 #endif
 		if (cpu_feature[4] & CPUID_VIA_HAS_ACE)
 			aes_md_init(&aes_via_impl);
+		else if (i386_has_sse && i386_has_sse2)
+			aes_md_init(&aes_sse2_impl);
 	} else {
 		/*
 		 * If not first. Warn about cpu_feature mismatch for

Index: src/sys/crypto/aes/aes.h
diff -u src/sys/crypto/aes/aes.h:1.1 src/sys/crypto/aes/aes.h:1.2
--- src/sys/crypto/aes/aes.h:1.1	Mon Jun 29 23:27:52 2020
+++ src/sys/crypto/aes/aes.h	Mon Jun 29 23:47:54 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes.h,v 1.1 2020/06/29 23:27:52 riastradh Exp $	*/
+/*	$NetBSD: aes.h,v 1.2 2020/06/29 23:47:54 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -37,8 +37,9 @@
  *
  *	Expanded round keys.
  */
-struct aes {
+union aes {
 	uint32_t	aes_rk[60];
+	uint64_t	aes_rk64[30];
 } __aligned(16);
 
 #define	AES_128_NROUNDS	10
@@ -46,11 +47,11 @@ struct aes {
 #define	AES_256_NROUNDS	14
 
 struct aesenc {
-	struct aes	aese_aes;
+	union aes	aese_aes;
 };
 
 struct aesdec {
-	struct aes	aesd_aes;
+	union aes	aesd_aes;
 };
 
 struct aes_impl {

Added files:

Index: src/sys/crypto/aes/arch/x86/aes_sse2.c
diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_sse2.c:1.1
--- /dev/null	Mon Jun 29 23:47:55 2020
+++ src/sys/crypto/aes/arch/x86/aes_sse2.c	Mon Jun 29 23:47:54 2020
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <por...@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(1, "$NetBSD: aes_sse2.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $");
+
+#include <sys/types.h>
+
+#include <lib/libkern/libkern.h>
+
+#include "aes_sse2_impl.h"
+
+static void
+br_range_dec32le(uint32_t *p32, size_t nwords, const void *v)
+{
+	const uint8_t *p8 = v;
+
+	while (nwords --> 0) {
+		uint32_t x0 = *p8++;
+		uint32_t x1 = *p8++;
+		uint32_t x2 = *p8++;
+		uint32_t x3 = *p8++;
+
+		*p32++ = x0 | (x1 << 8) | (x2 << 16) | (x3 << 24);
+	}
+}
+
+void
+aes_sse2_bitslice_Sbox(__m128i q[static 4])
+{
+	__m128i x0, x1, x2, x3, x4, x5, x6, x7;
+	__m128i y1, y2, y3, y4, y5, y6, y7, y8, y9;
+	__m128i y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
+	__m128i y20, y21;
+	__m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9;
+	__m128i z10, z11, z12, z13, z14, z15, z16, z17;
+	__m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
+	__m128i t10, t11, t12, t13, t14, t15, t16, t17, t18, t19;
+	__m128i t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
+	__m128i t30, t31, t32, t33, t34, t35, t36, t37, t38, t39;
+	__m128i t40, t41, t42, t43, t44, t45, t46, t47, t48, t49;
+	__m128i t50, t51, t52, t53, t54, t55, t56, t57, t58, t59;
+	__m128i t60, t61, t62, t63, t64, t65, t66, t67;
+	__m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+	x0 = _mm_shuffle_epi32(q[3], 0x0e);
+	x1 = _mm_shuffle_epi32(q[2], 0x0e);
+	x2 = _mm_shuffle_epi32(q[1], 0x0e);
+	x3 = _mm_shuffle_epi32(q[0], 0x0e);
+	x4 = q[3];
+	x5 = q[2];
+	x6 = q[1];
+	x7 = q[0];
+
+	/*
+	 * Top linear transformation.
+	 */
+	y14 = x3 ^ x5;
+	y13 = x0 ^ x6;
+	y9 = x0 ^ x3;
+	y8 = x0 ^ x5;
+	t0 = x1 ^ x2;
+	y1 = t0 ^ x7;
+	y4 = y1 ^ x3;
+	y12 = y13 ^ y14;
+	y2 = y1 ^ x0;
+	y5 = y1 ^ x6;
+	y3 = y5 ^ y8;
+	t1 = x4 ^ y12;
+	y15 = t1 ^ x5;
+	y20 = t1 ^ x1;
+	y6 = y15 ^ x7;
+	y10 = y15 ^ t0;
+	y11 = y20 ^ y9;
+	y7 = x7 ^ y11;
+	y17 = y10 ^ y11;
+	y19 = y10 ^ y8;
+	y16 = t0 ^ y11;
+	y21 = y13 ^ y16;
+	y18 = x0 ^ y16;
+
+	/*
+	 * Non-linear section.
+	 */
+	t2 = y12 & y15;
+	t3 = y3 & y6;
+	t4 = t3 ^ t2;
+	t5 = y4 & x7;
+	t6 = t5 ^ t2;
+	t7 = y13 & y16;
+	t8 = y5 & y1;
+	t9 = t8 ^ t7;
+	t10 = y2 & y7;
+	t11 = t10 ^ t7;
+	t12 = y9 & y11;
+	t13 = y14 & y17;
+	t14 = t13 ^ t12;
+	t15 = y8 & y10;
+	t16 = t15 ^ t12;
+	t17 = t4 ^ t14;
+	t18 = t6 ^ t16;
+	t19 = t9 ^ t14;
+	t20 = t11 ^ t16;
+	t21 = t17 ^ y20;
+	t22 = t18 ^ y19;
+	t23 = t19 ^ y21;
+	t24 = t20 ^ y18;
+
+	t25 = t21 ^ t22;
+	t26 = t21 & t23;
+	t27 = t24 ^ t26;
+	t28 = t25 & t27;
+	t29 = t28 ^ t22;
+	t30 = t23 ^ t24;
+	t31 = t22 ^ t26;
+	t32 = t31 & t30;
+	t33 = t32 ^ t24;
+	t34 = t23 ^ t33;
+	t35 = t27 ^ t33;
+	t36 = t24 & t35;
+	t37 = t36 ^ t34;
+	t38 = t27 ^ t36;
+	t39 = t29 & t38;
+	t40 = t25 ^ t39;
+
+	t41 = t40 ^ t37;
+	t42 = t29 ^ t33;
+	t43 = t29 ^ t40;
+	t44 = t33 ^ t37;
+	t45 = t42 ^ t41;
+	z0 = t44 & y15;
+	z1 = t37 & y6;
+	z2 = t33 & x7;
+	z3 = t43 & y16;
+	z4 = t40 & y1;
+	z5 = t29 & y7;
+	z6 = t42 & y11;
+	z7 = t45 & y17;
+	z8 = t41 & y10;
+	z9 = t44 & y12;
+	z10 = t37 & y3;
+	z11 = t33 & y4;
+	z12 = t43 & y13;
+	z13 = t40 & y5;
+	z14 = t29 & y2;
+	z15 = t42 & y9;
+	z16 = t45 & y14;
+	z17 = t41 & y8;
+
+	/*
+	 * Bottom linear transformation.
+	 */
+	t46 = z15 ^ z16;
+	t47 = z10 ^ z11;
+	t48 = z5 ^ z13;
+	t49 = z9 ^ z10;
+	t50 = z2 ^ z12;
+	t51 = z2 ^ z5;
+	t52 = z7 ^ z8;
+	t53 = z0 ^ z3;
+	t54 = z6 ^ z7;
+	t55 = z16 ^ z17;
+	t56 = z12 ^ t48;
+	t57 = t50 ^ t53;
+	t58 = z4 ^ t46;
+	t59 = z3 ^ t54;
+	t60 = t46 ^ t57;
+	t61 = z14 ^ t57;
+	t62 = t52 ^ t58;
+	t63 = t49 ^ t58;
+	t64 = z4 ^ t59;
+	t65 = t61 ^ t62;
+	t66 = z1 ^ t63;
+	s0 = t59 ^ t63;
+	s6 = t56 ^ ~t62;
+	s7 = t48 ^ ~t60;
+	t67 = t64 ^ t65;
+	s3 = t53 ^ t66;
+	s4 = t51 ^ t66;
+	s5 = t47 ^ t65;
+	s1 = t64 ^ ~s3;
+	s2 = t55 ^ ~t67;
+
+	q[3] = _mm_unpacklo_epi64(s4, s0);
+	q[2] = _mm_unpacklo_epi64(s5, s1);
+	q[1] = _mm_unpacklo_epi64(s6, s2);
+	q[0] = _mm_unpacklo_epi64(s7, s3);
+}
+
+void
+aes_sse2_ortho(__m128i q[static 4])
+{
+#define SWAPN(cl, ch, s, x, y)   do { \
+		__m128i a, b; \
+		a = (x); \
+		b = (y); \
+		(x) = (a & _mm_set1_epi64x(cl)) | \
+		    _mm_slli_epi64(b & _mm_set1_epi64x(cl), (s)); \
+		(y) = _mm_srli_epi64(a & _mm_set1_epi64x(ch), (s)) | \
+		    (b & _mm_set1_epi64x(ch)); \
+	} while (0)
+
+#define SWAP2(x, y)    SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA,  1, x, y)
+#define SWAP4(x, y)    SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC,  2, x, y)
+#define SWAP8(x, y)    SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0,  4, x, y)
+
+	SWAP2(q[0], q[1]);
+	SWAP2(q[2], q[3]);
+
+	SWAP4(q[0], q[2]);
+	SWAP4(q[1], q[3]);
+
+	__m128i q0 = q[0];
+	__m128i q1 = q[1];
+	__m128i q2 = q[2];
+	__m128i q3 = q[3];
+	__m128i q4 = _mm_shuffle_epi32(q[0], 0x0e);
+	__m128i q5 = _mm_shuffle_epi32(q[1], 0x0e);
+	__m128i q6 = _mm_shuffle_epi32(q[2], 0x0e);
+	__m128i q7 = _mm_shuffle_epi32(q[3], 0x0e);
+	SWAP8(q0, q4);
+	SWAP8(q1, q5);
+	SWAP8(q2, q6);
+	SWAP8(q3, q7);
+	q[0] = _mm_unpacklo_epi64(q0, q4);
+	q[1] = _mm_unpacklo_epi64(q1, q5);
+	q[2] = _mm_unpacklo_epi64(q2, q6);
+	q[3] = _mm_unpacklo_epi64(q3, q7);
+}
+
+__m128i
+aes_sse2_interleave_in(__m128i w)
+{
+	__m128i lo, hi;
+
+	lo = _mm_shuffle_epi32(w, 0x10);
+	hi = _mm_shuffle_epi32(w, 0x32);
+	lo &= _mm_set1_epi64x(0x00000000FFFFFFFF);
+	hi &= _mm_set1_epi64x(0x00000000FFFFFFFF);
+	lo |= _mm_slli_epi64(lo, 16);
+	hi |= _mm_slli_epi64(hi, 16);
+	lo &= _mm_set1_epi32(0x0000FFFF);
+	hi &= _mm_set1_epi32(0x0000FFFF);
+	lo |= _mm_slli_epi64(lo, 8);
+	hi |= _mm_slli_epi64(hi, 8);
+	lo &= _mm_set1_epi16(0x00FF);
+	hi &= _mm_set1_epi16(0x00FF);
+	return lo | _mm_slli_epi64(hi, 8);
+}
+
+__m128i
+aes_sse2_interleave_out(__m128i q)
+{
+	__m128i lo, hi;
+
+	lo = q;
+	hi = _mm_srli_si128(q, 1);
+	lo &= _mm_set1_epi16(0x00FF);
+	hi &= _mm_set1_epi16(0x00FF);
+	lo |= _mm_srli_epi64(lo, 8);
+	hi |= _mm_srli_epi64(hi, 8);
+	lo &= _mm_set1_epi32(0x0000FFFF);
+	hi &= _mm_set1_epi32(0x0000FFFF);
+	lo |= _mm_srli_epi64(lo, 16);
+	hi |= _mm_srli_epi64(hi, 16);
+	return (__m128i)_mm_shuffle_ps((__m128)lo, (__m128)hi, 0x88);
+}
+
+static const unsigned char Rcon[] = {
+	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
+};
+
+static uint32_t
+sub_word(uint32_t x)
+{
+	__m128i q[4];
+	uint32_t y;
+
+	memset(q, 0, sizeof(q));
+	q[0] = _mm_loadu_si32(&x);
+	aes_sse2_ortho(q);
+	aes_sse2_bitslice_Sbox(q);
+	aes_sse2_ortho(q);
+	_mm_storeu_si32(&y, q[0]);
+	return y;
+}
+
+unsigned
+aes_sse2_keysched(uint64_t *comp_skey, const void *key, size_t key_len)
+{
+	unsigned num_rounds;
+	int i, j, k, nk, nkf;
+	uint32_t tmp;
+	uint32_t skey[60];
+
+	switch (key_len) {
+	case 16:
+		num_rounds = 10;
+		break;
+	case 24:
+		num_rounds = 12;
+		break;
+	case 32:
+		num_rounds = 14;
+		break;
+	default:
+		/* abort(); */
+		return 0;
+	}
+	nk = (int)(key_len >> 2);
+	nkf = (int)((num_rounds + 1) << 2);
+	br_range_dec32le(skey, (key_len >> 2), key);
+	tmp = skey[(key_len >> 2) - 1];
+	for (i = nk, j = 0, k = 0; i < nkf; i ++) {
+		if (j == 0) {
+			tmp = (tmp << 24) | (tmp >> 8);
+			tmp = sub_word(tmp) ^ Rcon[k];
+		} else if (nk > 6 && j == 4) {
+			tmp = sub_word(tmp);
+		}
+		tmp ^= skey[i - nk];
+		skey[i] = tmp;
+		if (++ j == nk) {
+			j = 0;
+			k ++;
+		}
+	}
+
+	for (i = 0, j = 0; i < nkf; i += 4, j += 2) {
+		__m128i q[4], q0, q1, q2, q3, q4, q5, q6, q7;
+		__m128i w;
+
+		w = _mm_loadu_epi8(skey + i);
+		q[0] = q[1] = q[2] = q[3] = aes_sse2_interleave_in(w);
+		aes_sse2_ortho(q);
+		q0 = q[0] & _mm_set1_epi64x(0x1111111111111111);
+		q1 = q[1] & _mm_set1_epi64x(0x2222222222222222);
+		q2 = q[2] & _mm_set1_epi64x(0x4444444444444444);
+		q3 = q[3] & _mm_set1_epi64x(0x8888888888888888);
+		q4 = _mm_shuffle_epi32(q0, 0x0e);
+		q5 = _mm_shuffle_epi32(q1, 0x0e);
+		q6 = _mm_shuffle_epi32(q2, 0x0e);
+		q7 = _mm_shuffle_epi32(q3, 0x0e);
+		_mm_storeu_si64(&comp_skey[j + 0], q0 | q1 | q2 | q3);
+		_mm_storeu_si64(&comp_skey[j + 1], q4 | q5 | q6 | q7);
+	}
+	return num_rounds;
+}
+
+void
+aes_sse2_skey_expand(uint64_t *skey,
+	unsigned num_rounds, const uint64_t *comp_skey)
+{
+	unsigned u, v, n;
+
+	n = (num_rounds + 1) << 1;
+	for (u = 0, v = 0; u < n; u ++, v += 4) {
+		__m128i x0, x1, x2, x3;
+
+		x0 = x1 = x2 = x3 = _mm_loadu_si64(&comp_skey[u]);
+		x0 &= 0x1111111111111111;
+		x1 &= 0x2222222222222222;
+		x2 &= 0x4444444444444444;
+		x3 &= 0x8888888888888888;
+		x1 = _mm_srli_epi64(x1, 1);
+		x2 = _mm_srli_epi64(x2, 2);
+		x3 = _mm_srli_epi64(x3, 3);
+		x0 = _mm_sub_epi64(_mm_slli_epi64(x0, 4), x0);
+		x1 = _mm_sub_epi64(_mm_slli_epi64(x1, 4), x1);
+		x2 = _mm_sub_epi64(_mm_slli_epi64(x2, 4), x2);
+		x3 = _mm_sub_epi64(_mm_slli_epi64(x3, 4), x3);
+		_mm_storeu_si64(&skey[v + 0], x0);
+		_mm_storeu_si64(&skey[v + 1], x1);
+		_mm_storeu_si64(&skey[v + 2], x2);
+		_mm_storeu_si64(&skey[v + 3], x3);
+	}
+}
Index: src/sys/crypto/aes/arch/x86/aes_sse2.h
diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_sse2.h:1.1
--- /dev/null	Mon Jun 29 23:47:55 2020
+++ src/sys/crypto/aes/arch/x86/aes_sse2.h	Mon Jun 29 23:47:54 2020
@@ -0,0 +1,36 @@
+/*	$NetBSD: aes_sse2.h,v 1.1 2020/06/29 23:47:54 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_CRYPTO_AES_ARCH_X86_AES_SSE2_H
+#define	_CRYPTO_AES_ARCH_X86_AES_SSE2_H
+
+#include <crypto/aes/aes.h>
+
+extern struct aes_impl aes_sse2_impl;
+
+#endif	/* _CRYPTO_AES_ARCH_X86_AES_SSE2_H */
Index: src/sys/crypto/aes/arch/x86/aes_sse2_dec.c
diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_sse2_dec.c:1.1
--- /dev/null	Mon Jun 29 23:47:55 2020
+++ src/sys/crypto/aes/arch/x86/aes_sse2_dec.c	Mon Jun 29 23:47:54 2020
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <por...@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(1, "$NetBSD: aes_sse2_dec.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $");
+
+#include <sys/types.h>
+
+#include "aes_sse2_impl.h"
+
+/* see inner.h */
+void
+aes_sse2_bitslice_invSbox(__m128i q[static 4])
+{
+	/*
+	 * See br_aes_ct_bitslice_invSbox(). This is the natural extension
+	 * to 64-bit registers.
+	 */
+	__m128i q0, q1, q2, q3, q4, q5, q6, q7;
+
+	q0 = ~q[0];
+	q1 = ~q[1];
+	q2 = q[2];
+	q3 = q[3];
+	q4 = _mm_shuffle_epi32(q[0], 0x0e);
+	q5 = _mm_shuffle_epi32(~q[1], 0x0e);
+	q6 = _mm_shuffle_epi32(~q[2], 0x0e);
+	q7 = _mm_shuffle_epi32(q[3], 0x0e);
+
+	q[3] = _mm_unpacklo_epi64(q5 ^ q0 ^ q2, q1 ^ q4 ^ q6);
+	q[2] = _mm_unpacklo_epi64(q4 ^ q7 ^ q1, q0 ^ q3 ^ q5);
+	q[1] = _mm_unpacklo_epi64(q3 ^ q6 ^ q0, q7 ^ q2 ^ q4);
+	q[0] = _mm_unpacklo_epi64(q2 ^ q5 ^ q7, q6 ^ q1 ^ q3);
+
+	aes_sse2_bitslice_Sbox(q);
+
+	q0 = ~q[0];
+	q1 = ~q[1];
+	q2 = q[2];
+	q3 = q[3];
+	q4 = _mm_shuffle_epi32(q[0], 0x0e);
+	q5 = _mm_shuffle_epi32(~q[1], 0x0e);
+	q6 = _mm_shuffle_epi32(~q[2], 0x0e);
+	q7 = _mm_shuffle_epi32(q[3], 0x0e);
+
+	q[3] = _mm_unpacklo_epi64(q5 ^ q0 ^ q2, q1 ^ q4 ^ q6);
+	q[2] = _mm_unpacklo_epi64(q4 ^ q7 ^ q1, q0 ^ q3 ^ q5);
+	q[1] = _mm_unpacklo_epi64(q3 ^ q6 ^ q0, q7 ^ q2 ^ q4);
+	q[0] = _mm_unpacklo_epi64(q2 ^ q5 ^ q7, q6 ^ q1 ^ q3);
+}
+
+static inline void
+add_round_key(__m128i q[static 4], const uint64_t sk[static 8])
+{
+	q[0] ^= _mm_set_epi64x(sk[4], sk[0]);
+	q[1] ^= _mm_set_epi64x(sk[5], sk[1]);
+	q[2] ^= _mm_set_epi64x(sk[6], sk[2]);
+	q[3] ^= _mm_set_epi64x(sk[7], sk[3]);
+}
+
+static inline __m128i
+inv_shift_row(__m128i q)
+{
+	__m128i x, y0, y1, y2, y3, y4, y5, y6;
+
+	x = q;
+	y0 = x & _mm_set1_epi64x(0x000000000000FFFF);
+	y1 = x & _mm_set1_epi64x(0x000000000FFF0000);
+	y2 = x & _mm_set1_epi64x(0x00000000F0000000);
+	y3 = x & _mm_set1_epi64x(0x000000FF00000000);
+	y4 = x & _mm_set1_epi64x(0x0000FF0000000000);
+	y5 = x & _mm_set1_epi64x(0x000F000000000000);
+	y6 = x & _mm_set1_epi64x(0xFFF0000000000000);
+	y1 = _mm_slli_epi64(y1, 4);
+	y2 = _mm_srli_epi64(y2, 12);
+	y3 = _mm_slli_epi64(y3, 8);
+	y4 = _mm_srli_epi64(y4, 8);
+	y5 = _mm_slli_epi64(y5, 12);
+	y6 = _mm_srli_epi64(y6, 4);
+	return y0 | y1 | y2 | y3 | y4 | y5 | y6;
+}
+
+static inline void
+inv_shift_rows(__m128i q[static 4])
+{
+
+	q[0] = inv_shift_row(q[0]);
+	q[1] = inv_shift_row(q[1]);
+	q[2] = inv_shift_row(q[2]);
+	q[3] = inv_shift_row(q[3]);
+}
+
+static inline __m128i
+rotr32(__m128i x)
+{
+	return _mm_slli_epi64(x, 32) | _mm_srli_epi64(x, 32);
+}
+
+static inline void
+inv_mix_columns(__m128i q[4])
+{
+	__m128i q0, q1, q2, q3, q4, q5, q6, q7;
+	__m128i r0, r1, r2, r3, r4, r5, r6, r7;
+	__m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+	q0 = q[0];
+	q1 = q[1];
+	q2 = q[2];
+	q3 = q[3];
+	r0 = _mm_srli_epi64(q0, 16) | _mm_slli_epi64(q0, 48);
+	r1 = _mm_srli_epi64(q1, 16) | _mm_slli_epi64(q1, 48);
+	r2 = _mm_srli_epi64(q2, 16) | _mm_slli_epi64(q2, 48);
+	r3 = _mm_srli_epi64(q3, 16) | _mm_slli_epi64(q3, 48);
+
+	q7 = _mm_shuffle_epi32(q3, 0x0e);
+	q6 = _mm_shuffle_epi32(q2, 0x0e);
+	q5 = _mm_shuffle_epi32(q1, 0x0e);
+	q4 = _mm_shuffle_epi32(q0, 0x0e);
+
+	r7 = _mm_shuffle_epi32(r3, 0x0e);
+	r6 = _mm_shuffle_epi32(r2, 0x0e);
+	r5 = _mm_shuffle_epi32(r1, 0x0e);
+	r4 = _mm_shuffle_epi32(r0, 0x0e);
+
+	s0 = q5 ^ q6 ^ q7 ^ r0 ^ r5 ^ r7 ^ rotr32(q0 ^ q5 ^ q6 ^ r0 ^ r5);
+	s1 = q0 ^ q5 ^ r0 ^ r1 ^ r5 ^ r6 ^ r7 ^ rotr32(q1 ^ q5 ^ q7 ^ r1 ^ r5 ^ r6);
+	s2 = q0 ^ q1 ^ q6 ^ r1 ^ r2 ^ r6 ^ r7 ^ rotr32(q0 ^ q2 ^ q6 ^ r2 ^ r6 ^ r7);
+	s3 = q0 ^ q1 ^ q2 ^ q5 ^ q6 ^ r0 ^ r2 ^ r3 ^ r5 ^ rotr32(q0 ^ q1 ^ q3 ^ q5 ^ q6 ^ q7 ^ r0 ^ r3 ^ r5 ^ r7);
+	s4 = q1 ^ q2 ^ q3 ^ q5 ^ r1 ^ r3 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr32(q1 ^ q2 ^ q4 ^ q5 ^ q7 ^ r1 ^ r4 ^ r5 ^ r6);
+	s5 = q2 ^ q3 ^ q4 ^ q6 ^ r2 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr32(q2 ^ q3 ^ q5 ^ q6 ^ r2 ^ r5 ^ r6 ^ r7);
+	s6 = q3 ^ q4 ^ q5 ^ q7 ^ r3 ^ r5 ^ r6 ^ r7 ^ rotr32(q3 ^ q4 ^ q6 ^ q7 ^ r3 ^ r6 ^ r7);
+	s7 = q4 ^ q5 ^ q6 ^ r4 ^ r6 ^ r7 ^ rotr32(q4 ^ q5 ^ q7 ^ r4 ^ r7);
+
+	q[0] = _mm_unpacklo_epi64(s0, s4);
+	q[1] = _mm_unpacklo_epi64(s1, s5);
+	q[2] = _mm_unpacklo_epi64(s2, s6);
+	q[3] = _mm_unpacklo_epi64(s3, s7);
+}
+
+/* see inner.h */
+void
+aes_sse2_bitslice_decrypt(unsigned num_rounds,
+	const uint64_t *skey, __m128i q[static 4])
+{
+	unsigned u;
+
+	add_round_key(q, skey + (num_rounds << 3));
+	for (u = num_rounds - 1; u > 0; u --) {
+		inv_shift_rows(q);
+		aes_sse2_bitslice_invSbox(q);
+		add_round_key(q, skey + (u << 3));
+		inv_mix_columns(q);
+	}
+	inv_shift_rows(q);
+	aes_sse2_bitslice_invSbox(q);
+	add_round_key(q, skey);
+}
Index: src/sys/crypto/aes/arch/x86/aes_sse2_enc.c
diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_sse2_enc.c:1.1
--- /dev/null	Mon Jun 29 23:47:55 2020
+++ src/sys/crypto/aes/arch/x86/aes_sse2_enc.c	Mon Jun 29 23:47:54 2020
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <por...@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(1, "$NetBSD: aes_sse2_enc.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $");
+
+#include <sys/types.h>
+
+#include "aes_sse2_impl.h"
+
+static inline void
+add_round_key(__m128i q[static 4], const uint64_t sk[static 8])
+{
+	q[0] ^= _mm_set_epi64x(sk[4], sk[0]);
+	q[1] ^= _mm_set_epi64x(sk[5], sk[1]);
+	q[2] ^= _mm_set_epi64x(sk[6], sk[2]);
+	q[3] ^= _mm_set_epi64x(sk[7], sk[3]);
+}
+
+static inline __m128i
+shift_row(__m128i q)
+{
+	__m128i x, y0, y1, y2, y3, y4, y5, y6;
+
+	x = q;
+	y0 = x & _mm_set1_epi64x(0x000000000000FFFF);
+	y1 = x & _mm_set1_epi64x(0x00000000FFF00000);
+	y2 = x & _mm_set1_epi64x(0x00000000000F0000);
+	y3 = x & _mm_set1_epi64x(0x0000FF0000000000);
+	y4 = x & _mm_set1_epi64x(0x000000FF00000000);
+	y5 = x & _mm_set1_epi64x(0xF000000000000000);
+	y6 = x & _mm_set1_epi64x(0x0FFF000000000000);
+	y1 = _mm_srli_epi64(y1, 4);
+	y2 = _mm_slli_epi64(y2, 12);
+	y3 = _mm_srli_epi64(y3, 8);
+	y4 = _mm_slli_epi64(y4, 8);
+	y5 = _mm_srli_epi64(y5, 12);
+	y6 = _mm_slli_epi64(y6, 4);
+	return y0 | y1 | y2 | y3 | y4 | y5 | y6;
+}
+
+static inline void
+shift_rows(__m128i q[static 4])
+{
+
+	q[0] = shift_row(q[0]);
+	q[1] = shift_row(q[1]);
+	q[2] = shift_row(q[2]);
+	q[3] = shift_row(q[3]);
+}
+
+static inline __m128i
+rotr32(__m128i x)
+{
+	return _mm_slli_epi64(x, 32) | _mm_srli_epi64(x, 32);
+}
+
+static inline void
+mix_columns(__m128i q[static 4])
+{
+	__m128i q0, q1, q2, q3, q4, q5, q6, q7;
+	__m128i r0, r1, r2, r3, r4, r5, r6, r7;
+	__m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+	q0 = q[0];
+	q1 = q[1];
+	q2 = q[2];
+	q3 = q[3];
+	r0 = _mm_srli_epi64(q0, 16) | _mm_slli_epi64(q0, 48);
+	r1 = _mm_srli_epi64(q1, 16) | _mm_slli_epi64(q1, 48);
+	r2 = _mm_srli_epi64(q2, 16) | _mm_slli_epi64(q2, 48);
+	r3 = _mm_srli_epi64(q3, 16) | _mm_slli_epi64(q3, 48);
+
+	q7 = _mm_shuffle_epi32(q3, 0x0e);
+	q6 = _mm_shuffle_epi32(q2, 0x0e);
+	q5 = _mm_shuffle_epi32(q1, 0x0e);
+	q4 = _mm_shuffle_epi32(q0, 0x0e);
+
+	r7 = _mm_shuffle_epi32(r3, 0x0e);
+	r6 = _mm_shuffle_epi32(r2, 0x0e);
+	r5 = _mm_shuffle_epi32(r1, 0x0e);
+	r4 = _mm_shuffle_epi32(r0, 0x0e);
+
+	s0 = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0);
+	s1 = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1);
+	s2 = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2);
+	s3 = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3);
+	s4 = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4);
+	s5 = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5);
+	s6 = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6);
+	s7 = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7);
+
+	q[0] = _mm_unpacklo_epi64(s0, s4);
+	q[1] = _mm_unpacklo_epi64(s1, s5);
+	q[2] = _mm_unpacklo_epi64(s2, s6);
+	q[3] = _mm_unpacklo_epi64(s3, s7);
+}
+
+void
+aes_sse2_bitslice_encrypt(unsigned num_rounds,
+	const uint64_t *skey, __m128i q[static 4])
+{
+	unsigned u;
+
+	add_round_key(q, skey);
+	for (u = 1; u < num_rounds; u ++) {
+		aes_sse2_bitslice_Sbox(q);
+		shift_rows(q);
+		mix_columns(q);
+		add_round_key(q, skey + (u << 3));
+	}
+	aes_sse2_bitslice_Sbox(q);
+	shift_rows(q);
+	add_round_key(q, skey + (num_rounds << 3));
+}
Index: src/sys/crypto/aes/arch/x86/aes_sse2_impl.c
diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_sse2_impl.c:1.1
--- /dev/null	Mon Jun 29 23:47:55 2020
+++ src/sys/crypto/aes/arch/x86/aes_sse2_impl.c	Mon Jun 29 23:47:54 2020
@@ -0,0 +1,611 @@
+/*	$NetBSD: aes_sse2_impl.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(1, "$NetBSD: aes_sse2_impl.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $");
+
+#include <sys/types.h>
+#include <sys/endian.h>
+#include <sys/systm.h>
+
+#include <crypto/aes/aes.h>
+#include <crypto/aes/arch/x86/aes_sse2.h>
+
+#include <x86/cpu.h>
+#include <x86/cpuvar.h>
+#include <x86/fpu.h>
+#include <x86/specialreg.h>
+
+#include "aes_sse2_impl.h"
+
+static void
+aes_sse2_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds)
+{
+	size_t key_len;
+
+	switch (nrounds) {
+	case 10:
+		key_len = 16;
+		break;
+	case 12:
+		key_len = 24;
+		break;
+	case 14:
+		key_len = 32;
+		break;
+	default:
+		panic("invalid AES nrounds: %u", nrounds);
+	}
+
+	fpu_kern_enter();
+	aes_sse2_keysched(rk, key, key_len);
+	fpu_kern_leave();
+}
+
+static void
+aes_sse2_setenckey(struct aesenc *enc, const uint8_t *key, uint32_t nrounds)
+{
+
+	aes_sse2_setkey(enc->aese_aes.aes_rk64, key, nrounds);
+}
+
+static void
+aes_sse2_setdeckey(struct aesdec *dec, const uint8_t *key, uint32_t nrounds)
+{
+
+	/*
+	 * BearSSL computes InvMixColumns on the fly -- no need for
+	 * distinct decryption round keys.
+	 */
+	aes_sse2_setkey(dec->aesd_aes.aes_rk64, key, nrounds);
+}
+
+static void
+aes_sse2_enc(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], uint32_t nrounds)
+{
+	uint64_t sk_exp[120];
+	__m128i q[4];
+
+	fpu_kern_enter();
+
+	/* Expand round keys for bitslicing.  */
+	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);
+
+	/* Load input block interleaved with garbage blocks.  */
+	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
+	q[1] = q[2] = q[3] = _mm_setzero_si128();
+
+	/* Transform to bitslice, decrypt, transform from bitslice.  */
+	aes_sse2_ortho(q);
+	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
+	aes_sse2_ortho(q);
+
+	/* Store output block.  */
+	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));
+
+	/* Paranoia: Zero temporary buffers.  */
+	explicit_memset(sk_exp, 0, sizeof sk_exp);
+	explicit_memset(q, 0, sizeof q);
+
+	fpu_kern_leave();
+}
+
+static void
+aes_sse2_dec(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], uint32_t nrounds)
+{
+	uint64_t sk_exp[120];
+	__m128i q[4];
+
+	fpu_kern_enter();
+
+	/* Expand round keys for bitslicing.  */
+	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);
+
+	/* Load input block interleaved with garbage blocks.  */
+	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
+	q[1] = q[2] = q[3] = _mm_setzero_si128();
+
+	/* Transform to bitslice, decrypt, transform from bitslice.  */
+	aes_sse2_ortho(q);
+	aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
+	aes_sse2_ortho(q);
+
+	/* Store output block.  */
+	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));
+
+	/* Paranoia: Zero temporary buffers.  */
+	explicit_memset(sk_exp, 0, sizeof sk_exp);
+	explicit_memset(q, 0, sizeof q);
+
+	fpu_kern_leave();
+}
+
+static void
+aes_sse2_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
+    uint32_t nrounds)
+{
+	uint64_t sk_exp[120];
+	__m128i q[4];
+	__m128i cv;
+
+	KASSERT(nbytes % 16 == 0);
+
+	/* Skip if there's nothing to do.  */
+	if (nbytes == 0)
+		return;
+
+	fpu_kern_enter();
+
+	/* Expand round keys for bitslicing.  */
+	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);
+
+	/* Load the IV.  */
+	cv = _mm_loadu_epi8(iv);
+
+	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+		/* Load input block and apply CV.  */
+		q[0] = aes_sse2_interleave_in(cv ^ _mm_loadu_epi8(in));
+
+		/* Transform to bitslice, encrypt, transform from bitslice.  */
+		aes_sse2_ortho(q);
+		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
+		aes_sse2_ortho(q);
+
+		/* Remember ciphertext as CV and store output block.  */
+		cv = aes_sse2_interleave_out(q[0]);
+		_mm_storeu_epi8(out, cv);
+	}
+
+	/* Store updated IV.  */
+	_mm_storeu_epi8(iv, cv);
+
+	/* Paranoia: Zero temporary buffers.  */
+	explicit_memset(sk_exp, 0, sizeof sk_exp);
+	explicit_memset(q, 0, sizeof q);
+
+	fpu_kern_leave();
+}
+
+static void
+aes_sse2_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16],
+    uint32_t nrounds)
+{
+	uint64_t sk_exp[120];
+	__m128i q[4];
+	__m128i cv, iv, w;
+
+	KASSERT(nbytes % 16 == 0);
+
+	/* Skip if there's nothing to do.  */
+	if (nbytes == 0)
+		return;
+
+	fpu_kern_enter();
+
+	/* Expand round keys for bitslicing.  */
+	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);
+
+	/* Load the IV.  */
+	iv = _mm_loadu_epi8(ivp);
+
+	/* Load the last cipher block.  */
+	cv = _mm_loadu_epi8(in + nbytes - 16);
+
+	/* Store the updated IV.  */
+	_mm_storeu_epi8(ivp, cv);
+
+	/* Process the last blocks if not an even multiple of four.  */
+	if (nbytes % (4*16)) {
+		unsigned n = (nbytes/16) % 4;
+
+		KASSERT(n > 0);
+		KASSERT(n < 4);
+
+		q[1] = q[2] = q[3] = _mm_setzero_si128();
+		q[n - 1] = aes_sse2_interleave_in(cv);
+		switch (nbytes % 64) {
+		case 48:
+			w = _mm_loadu_epi8(in + nbytes - 32);
+			q[1] = aes_sse2_interleave_in(w);
+			/*FALLTHROUGH*/
+		case 32:
+			w = _mm_loadu_epi8(in + nbytes - 48);
+			q[0] = aes_sse2_interleave_in(w);
+			/*FALLTHROUGH*/
+		case 16:
+			break;
+		}
+
+		/* Decrypt.  */
+		aes_sse2_ortho(q);
+		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
+		aes_sse2_ortho(q);
+
+		do {
+			n--;
+			w = aes_sse2_interleave_out(q[n]);
+			if ((nbytes -= 16) == 0)
+				goto out;
+			cv = _mm_loadu_epi8(in + nbytes - 16);
+			_mm_storeu_epi8(out + nbytes, w ^ cv);
+		} while (n);
+	}
+
+	for (;;) {
+		KASSERT(nbytes >= 64);
+		nbytes -= 64;
+
+		/*
+		 * 1. Set up upper cipher block from cv.
+		 * 2. Load lower cipher block into cv and set it up.
+		 * 3. Decrypt.
+		 */
+		q[3] = aes_sse2_interleave_in(cv);
+
+		w = _mm_loadu_epi8(in + nbytes + 4*8);
+		q[2] = aes_sse2_interleave_in(w);
+
+		w = _mm_loadu_epi8(in + nbytes + 4*4);
+		q[1] = aes_sse2_interleave_in(w);
+
+		w = _mm_loadu_epi8(in + nbytes + 4*0);
+		q[0] = aes_sse2_interleave_in(w);
+
+		aes_sse2_ortho(q);
+		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
+		aes_sse2_ortho(q);
+
+		/* Store the upper output block.  */
+		w = aes_sse2_interleave_out(q[3]);
+		cv = _mm_loadu_epi8(in + nbytes + 4*8);
+		_mm_storeu_epi8(out + nbytes + 4*12, w ^ cv);
+
+		/* Store the middle output blocks.  */
+		w = aes_sse2_interleave_out(q[2]);
+		cv = _mm_loadu_epi8(in + nbytes + 4*4);
+		_mm_storeu_epi8(out + nbytes + 4*8, w ^ cv);
+
+		w = aes_sse2_interleave_out(q[1]);
+		cv = _mm_loadu_epi8(in + nbytes + 4*0);
+		_mm_storeu_epi8(out + nbytes + 4*4, w ^ cv);
+
+		/*
+		 * Get the first output block, but don't load the CV
+		 * yet -- it might be the previous ciphertext block, or
+		 * it might be the IV.
+		 */
+		w = aes_sse2_interleave_out(q[0]);
+
+		/* Stop if we've reached the first output block.  */
+		if (nbytes == 0)
+			goto out;
+
+		/*
+		 * Load the preceding cipher block, and apply it as the
+		 * chaining value to this one.
+		 */
+		cv = _mm_loadu_epi8(in + nbytes - 16);
+		_mm_storeu_epi8(out + nbytes, w ^ cv);
+	}
+
+out:	/* Store the first output block.  */
+	_mm_storeu_epi8(out, w ^ iv);
+
+	/* Paranoia: Zero temporary buffers.  */
+	explicit_memset(sk_exp, 0, sizeof sk_exp);
+	explicit_memset(q, 0, sizeof q);
+
+	fpu_kern_leave();
+}
+
+static inline __m128i
+aes_sse2_xts_update(__m128i t)
+{
+	const __m128i one = _mm_set_epi64x(1, 1);
+	__m128i s, m, c;
+
+	s = _mm_srli_epi64(t, 63);	/* 1 if high bit set else 0 */
+	m = _mm_sub_epi64(s, one);	/* 0 if high bit set else -1 */
+	m = _mm_shuffle_epi32(m, 0x4e);	/* swap halves */
+	c = _mm_set_epi64x(1, 0x87);	/* carry */
+
+	return _mm_slli_epi64(t, 1) ^ (c & ~m);
+}
+
+static int
+aes_sse2_xts_update_selftest(void)
+{
+	static const struct {
+		uint32_t in[4], out[4];
+	} cases[] = {
+		[0] = { {1}, {2} },
+		[1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
+		[2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
+		[3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
+		[4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
+		[5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
+	};
+	unsigned i;
+	uint32_t t[4];
+	int result = 0;
+
+	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
+		t[0] = cases[i].in[0];
+		t[1] = cases[i].in[1];
+		t[2] = cases[i].in[2];
+		t[3] = cases[i].in[3];
+		_mm_storeu_epi8(t, aes_sse2_xts_update(_mm_loadu_epi8(t)));
+		if (t[0] != cases[i].out[0] ||
+		    t[1] != cases[i].out[1] ||
+		    t[2] != cases[i].out[2] ||
+		    t[3] != cases[i].out[3]) {
+			printf("%s %u:"
+			    " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
+			    __func__, i, t[0], t[1], t[2], t[3]);
+			result = -1;
+		}
+	}
+
+	return result;
+}
+
+static void
+aes_sse2_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
+    uint32_t nrounds)
+{
+	uint64_t sk_exp[120];
+	__m128i q[4];
+	__m128i w;
+	__m128i t[5];
+	unsigned i;
+
+	KASSERT(nbytes % 16 == 0);
+
+	/* Skip if there's nothing to do.  */
+	if (nbytes == 0)
+		return;
+
+	fpu_kern_enter();
+
+	/* Expand round keys for bitslicing.  */
+	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);
+
+	/* Load tweak.  */
+	t[0] = _mm_loadu_epi8(tweak);
+
+	/* Handle the first block separately if odd number.  */
+	if (nbytes % (4*16)) {
+		/* Load up the tweaked inputs.  */
+		for (i = 0; i < (nbytes/16) % 4; i++) {
+			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
+			q[i] = aes_sse2_interleave_in(w);
+			t[i + 1] = aes_sse2_xts_update(t[i]);
+		}
+		for (; i < 4; i++)
+			q[i] = _mm_setzero_si128();
+
+		/* Encrypt up to four blocks.  */
+		aes_sse2_ortho(q);
+		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
+		aes_sse2_ortho(q);
+
+		/* Store the tweaked outputs.  */
+		for (i = 0; i < (nbytes/16) % 4; i++) {
+			w = aes_sse2_interleave_out(q[i]);
+			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
+		}
+
+		/* Advance to the next block.  */
+		t[0] = t[i];
+		in += nbytes % (4*16);
+		out += nbytes % (4*16);
+		nbytes -= nbytes % (4*16);
+		if (nbytes == 0)
+			goto out;
+	}
+
+	do {
+		KASSERT(nbytes % 64 == 0);
+		KASSERT(nbytes >= 64);
+
+		/* Load up the tweaked inputs.  */
+		for (i = 0; i < 4; i++) {
+			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
+			q[i] = aes_sse2_interleave_in(w);
+			t[i + 1] = aes_sse2_xts_update(t[i]);
+		}
+
+		/* Encrypt four blocks.  */
+		aes_sse2_ortho(q);
+		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
+		aes_sse2_ortho(q);
+
+		/* Store the tweaked outputs.  */
+		for (i = 0; i < 4; i++) {
+			w = aes_sse2_interleave_out(q[i]);
+			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
+		}
+
+		/* Advance to the next block.  */
+		t[0] = t[4];
+		in += 64;
+		out += 64;
+		nbytes -= 64;
+	} while (nbytes);
+
+out:	/* Store the updated tweak.  */
+	_mm_storeu_epi8(tweak, t[0]);
+
+	/* Paranoia: Zero temporary buffers.  */
+	explicit_memset(sk_exp, 0, sizeof sk_exp);
+	explicit_memset(q, 0, sizeof q);
+	explicit_memset(t, 0, sizeof t);
+
+	fpu_kern_leave();
+}
+
+static void
+aes_sse2_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
+    uint32_t nrounds)
+{
+	uint64_t sk_exp[120];
+	__m128i q[4];
+	__m128i w;
+	__m128i t[5];
+	unsigned i;
+
+	KASSERT(nbytes % 16 == 0);
+
+	/* Skip if there's nothing to do.  */
+	if (nbytes == 0)
+		return;
+
+	fpu_kern_enter();
+
+	/* Expand round keys for bitslicing.  */
+	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);
+
+	/* Load tweak.  */
+	t[0] = _mm_loadu_epi8(tweak);
+
+	/* Handle the first block separately if odd number.  */
+	if (nbytes % (4*16)) {
+		/* Load up the tweaked inputs.  */
+		for (i = 0; i < (nbytes/16) % 4; i++) {
+			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
+			q[i] = aes_sse2_interleave_in(w);
+			t[i + 1] = aes_sse2_xts_update(t[i]);
+		}
+		for (; i < 4; i++)
+			q[i] = _mm_setzero_si128();
+
+		/* Decrypt up to four blocks.  */
+		aes_sse2_ortho(q);
+		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
+		aes_sse2_ortho(q);
+
+		/* Store the tweaked outputs.  */
+		for (i = 0; i < (nbytes/16) % 4; i++) {
+			w = aes_sse2_interleave_out(q[i]);
+			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
+		}
+
+		/* Advance to the next block.  */
+		t[0] = t[i];
+		in += nbytes % (4*16);
+		out += nbytes % (4*16);
+		nbytes -= nbytes % (4*16);
+		if (nbytes == 0)
+			goto out;
+	}
+
+	do {
+		KASSERT(nbytes % 64 == 0);
+		KASSERT(nbytes >= 64);
+
+		/* Load up the tweaked inputs.  */
+		for (i = 0; i < 4; i++) {
+			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
+			q[i] = aes_sse2_interleave_in(w);
+			t[i + 1] = aes_sse2_xts_update(t[i]);
+		}
+
+		/* Decrypt four blocks.  */
+		aes_sse2_ortho(q);
+		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
+		aes_sse2_ortho(q);
+
+		/* Store the tweaked outputs.  */
+		for (i = 0; i < 4; i++) {
+			w = aes_sse2_interleave_out(q[i]);
+			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
+		}
+
+		/* Advance to the next block.  */
+		t[0] = t[4];
+		in += 64;
+		out += 64;
+		nbytes -= 64;
+	} while (nbytes);
+
+out:	/* Store the updated tweak.  */
+	_mm_storeu_epi8(tweak, t[0]);
+
+	/* Paranoia: Zero temporary buffers.  */
+	explicit_memset(sk_exp, 0, sizeof sk_exp);
+	explicit_memset(q, 0, sizeof q);
+	explicit_memset(t, 0, sizeof t);
+
+	fpu_kern_leave();
+}
+
+static int
+aes_sse2_probe(void)
+{
+	int result = 0;
+
+	/* Verify that the CPU supports SSE and SSE2.  */
+	if (!i386_has_sse)
+		return -1;
+	if (!i386_has_sse2)
+		return -1;
+
+	fpu_kern_enter();
+
+	if (aes_sse2_xts_update_selftest())
+		result = -1;
+
+	fpu_kern_leave();
+
+	/* XXX test aes_sse2_bitslice_decrypt */
+	/* XXX test aes_sse2_bitslice_encrypt */
+	/* XXX test aes_sse2_keysched */
+	/* XXX test aes_sse2_ortho */
+	/* XXX test aes_sse2_skey_expand */
+
+	return result;
+}
+
+struct aes_impl aes_sse2_impl = {
+	.ai_name = "Intel SSE2 bitsliced",
+	.ai_probe = aes_sse2_probe,
+	.ai_setenckey = aes_sse2_setenckey,
+	.ai_setdeckey = aes_sse2_setdeckey,
+	.ai_enc = aes_sse2_enc,
+	.ai_dec = aes_sse2_dec,
+	.ai_cbc_enc = aes_sse2_cbc_enc,
+	.ai_cbc_dec = aes_sse2_cbc_dec,
+	.ai_xts_enc = aes_sse2_xts_enc,
+	.ai_xts_dec = aes_sse2_xts_dec,
+};
Index: src/sys/crypto/aes/arch/x86/aes_sse2_impl.h
diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_sse2_impl.h:1.1
--- /dev/null	Mon Jun 29 23:47:55 2020
+++ src/sys/crypto/aes/arch/x86/aes_sse2_impl.h	Mon Jun 29 23:47:54 2020
@@ -0,0 +1,47 @@
+/*	$NetBSD: aes_sse2_impl.h,v 1.1 2020/06/29 23:47:54 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_CRYPTO_AES_ARCH_X86_AES_SSE2_IMPL_H
+#define	_CRYPTO_AES_ARCH_X86_AES_SSE2_IMPL_H
+
+#include <sys/types.h>
+
+#include <crypto/aes/arch/x86/immintrin.h>
+#include <crypto/aes/arch/x86/immintrin_ext.h>
+
+void aes_sse2_bitslice_Sbox(__m128i[static 4]);
+void aes_sse2_bitslice_invSbox(__m128i[static 4]);
+void aes_sse2_ortho(__m128i[static 4]);
+__m128i aes_sse2_interleave_in(__m128i);
+__m128i aes_sse2_interleave_out(__m128i);
+unsigned aes_sse2_keysched(uint64_t *, const void *, size_t);
+void aes_sse2_skey_expand(uint64_t *, unsigned, const uint64_t *);
+void aes_sse2_bitslice_encrypt(unsigned, const uint64_t *, __m128i[static 4]);
+void aes_sse2_bitslice_decrypt(unsigned, const uint64_t *, __m128i[static 4]);
+
+#endif	/* _CRYPTO_AES_ARCH_X86_AES_SSE2_IMPL_H */
Index: src/sys/crypto/aes/arch/x86/files.aessse2
diff -u /dev/null src/sys/crypto/aes/arch/x86/files.aessse2:1.1
--- /dev/null	Mon Jun 29 23:47:55 2020
+++ src/sys/crypto/aes/arch/x86/files.aessse2	Mon Jun 29 23:47:54 2020
@@ -0,0 +1,11 @@
+#	$NetBSD: files.aessse2,v 1.1 2020/06/29 23:47:54 riastradh Exp $
+
+makeoptions	aes	"COPTS.aes_sse2.c"+="-msse2"
+makeoptions	aes	"COPTS.aes_sse2_dec.c"+="-msse2"
+makeoptions	aes	"COPTS.aes_sse2_enc.c"+="-msse2"
+makeoptions	aes	"COPTS.aes_sse2_impl.c"+="-msse2"
+
+file	crypto/aes/arch/x86/aes_sse2.c		aes
+file	crypto/aes/arch/x86/aes_sse2_dec.c	aes
+file	crypto/aes/arch/x86/aes_sse2_enc.c	aes
+file	crypto/aes/arch/x86/aes_sse2_impl.c	aes
Index: src/sys/crypto/aes/arch/x86/immintrin.h
diff -u /dev/null src/sys/crypto/aes/arch/x86/immintrin.h:1.1
--- /dev/null	Mon Jun 29 23:47:55 2020
+++ src/sys/crypto/aes/arch/x86/immintrin.h	Mon Jun 29 23:47:54 2020
@@ -0,0 +1,216 @@
+/*	$NetBSD: immintrin.h,v 1.1 2020/06/29 23:47:54 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_H
+#define	_SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_H
+
+#include <sys/types.h>
+
+/*
+ * This kludgerous header file provides definitions for the Intel
+ * intrinsics that work with GCC and Clang, because <immintrin.h> is
+ * not available during the kernel build and arranging to make it
+ * available is complicated.  Please fix this properly!
+ */
+
+#if defined(__GNUC__) && !defined(__clang__)
+
+#define	_INTRINSATTR							      \
+	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+#define	_PACKALIAS
+
+typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
+typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
+typedef long long __m128i_u
+    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
+typedef long long __v2di __attribute__((__vector_size__(16)));
+typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef float __v4sf __attribute__((__vector_size__(16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+
+#elif defined(__clang__)
+
+typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
+typedef long long __m128i
+    __attribute__((__vector_size__(16), __aligned__(16)));
+typedef long long __m128i_u
+    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
+typedef long long __v2di __attribute__((__vector_size__(16)));
+typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef float __v4sf __attribute__((__vector_size__(16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+
+#define	_INTRINSATTR							      \
+	__attribute__((__always_inline__, __nodebug__, __target__("sse2"),    \
+		__min_vector_width__(128)))
+#define	_PACKALIAS							      \
+	__attribute__((__packed__, __may_alias__))
+
+#else
+
+#error Please teach me how to do Intel intrinsics for your compiler!
+
+#endif
+
+_INTRINSATTR
+static __inline __m128i
+_mm_loadu_si32(const void *__p)
+{
+	int32_t __v = ((const struct { int32_t __v; } _PACKALIAS *)__p)->__v;
+	return __extension__ (__m128i)(__v4si){ __v, 0, 0, 0 };
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_loadu_si64(const void *__p)
+{
+	int64_t __v = ((const struct { int64_t __v; } _PACKALIAS *)__p)->__v;
+	return __extension__ (__m128i)(__v2di){ __v, 0 };
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_set1_epi16(int16_t __v)
+{
+	return __extension__ (__m128i)(__v8hi){
+	    __v, __v, __v, __v, __v, __v, __v, __v
+	};
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_set1_epi32(int32_t __v)
+{
+	return __extension__ (__m128i)(__v4si){ __v, __v, __v, __v };
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_set1_epi64x(int64_t __v)
+{
+	return __extension__ (__m128i)(__v2di){ __v, __v };
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_set_epi32(int32_t __v3, int32_t __v2, int32_t __v1, int32_t __v0)
+{
+	return __extension__ (__m128i)(__v4si){ __v0, __v1, __v2, __v3 };
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_set_epi64x(int64_t __v1, int64_t __v0)
+{
+	return __extension__ (__m128i)(__v2di){ __v0, __v1 };
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_setzero_si128(void)
+{
+	return _mm_set1_epi64x(0);
+}
+
+#define	_mm_shuffle_epi32(v,m)						      \
+	(__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m))
+
+#define	_mm_shuffle_ps(x,y,m)						      \
+	(__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x),		      \
+	    (__v4sf)(__m128)(y), (int)(m))				      \
+
+_INTRINSATTR
+static __inline __m128i
+_mm_slli_epi64(__m128i __v, uint8_t __bits)
+{
+	return (__m128i)__builtin_ia32_psllqi128((__v2di)__v, (int)__bits);
+}
+
+#if defined(__GNUC__) && !defined(__clang__)
+#define	_mm_slli_si128(v,bytes)						      \
+	(__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(v),	      \
+	    8*(int)(bytes))
+#elif defined(__clang__)
+#define	_mm_slli_si128(v,bytes)						      \
+	(__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(v),    \
+	    (int)(bytes))
+#endif
+
+_INTRINSATTR
+static __inline __m128i
+_mm_srli_epi64(__m128i __v, uint8_t __bits)
+{
+	return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits);
+}
+
+#if defined(__GNUC__) && !defined(__clang__)
+#define	_mm_srli_si128(v,bytes)						      \
+	(__m128i)__builtin_ia32_psrldqi128((__m128i)(v), 8*(int)(bytes))
+#elif defined(__clang__)
+#define	_mm_srli_si128(v,bytes)						      \
+	(__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(v),    \
+	    (int)(bytes));
+#endif
+
+_INTRINSATTR
+static __inline void
+_mm_storeu_si32(void *__p, __m128i __v)
+{
+	((struct { int32_t __v; } _PACKALIAS *)__p)->__v = ((__v4si)__v)[0];
+}
+
+_INTRINSATTR
+static __inline void
+_mm_storeu_si64(void *__p, __m128i __v)
+{
+	((struct { int64_t __v; } _PACKALIAS *)__p)->__v = ((__v2di)__v)[0];
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_sub_epi64(__m128i __x, __m128i __y)
+{
+	return (__m128i)((__v2du)__x - (__v2du)__y);
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_unpacklo_epi64(__m128i __lo, __m128i __hi)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__lo,
+	    (__v2di)__hi);
+#elif defined(__clang__)
+	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
+	    0, 4, 1, 5);
+#endif
+}
+
+#endif	/* _SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_H */
Index: src/sys/crypto/aes/arch/x86/immintrin_ext.h
diff -u /dev/null src/sys/crypto/aes/arch/x86/immintrin_ext.h:1.1
--- /dev/null	Mon Jun 29 23:47:55 2020
+++ src/sys/crypto/aes/arch/x86/immintrin_ext.h	Mon Jun 29 23:47:54 2020
@@ -0,0 +1,48 @@
+/*	$NetBSD: immintrin_ext.h,v 1.1 2020/06/29 23:47:54 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_EXT_H
+#define	_SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_EXT_H
+
+#include "immintrin.h"
+
+_INTRINSATTR
+static __inline __m128i
+_mm_loadu_epi8(const void *__p)
+{
+	return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v;
+}
+
+_INTRINSATTR
+static __inline void
+_mm_storeu_epi8(void *__p, __m128i __v)
+{
+	((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v;
+}
+
+#endif	/* _SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_EXT_H */
