Module Name:    src
Committed By:   maxv
Date:           Wed Nov  7 07:43:08 UTC 2018

Modified Files:
        src/distrib/sets/lists/comp: md.amd64
        src/distrib/sets/lists/modules: md.amd64
        src/etc: MAKEDEV.tmpl
        src/sys/conf: files majors
        src/sys/dev: Makefile
        src/sys/modules: Makefile
Added Files:
        src/sys/dev/nvmm: Makefile files.nvmm nvmm.c nvmm.h nvmm_internal.h
            nvmm_ioctl.h
        src/sys/dev/nvmm/x86: Makefile nvmm_x86.h nvmm_x86_svm.c
            nvmm_x86_svmfunc.S
        src/sys/modules/nvmm: Makefile nvmm.ioconf

Log Message:
Add NVMM - for NetBSD Virtual Machine Monitor -, a kernel driver that
provides support for hardware-accelerated virtualization on NetBSD.

It is made of an MI frontend, to which MD backends can be plugged. One
MD backend is implemented, x86-SVM, for x86 AMD CPUs.

We install

        /usr/include/dev/nvmm/nvmm.h
        /usr/include/dev/nvmm/nvmm_ioctl.h
        /usr/include/dev/nvmm/{arch}/nvmm_{arch}.h

And the kernel module. For now, the only architecture where we do that
is amd64 (arch=x86).

NVMM is not enabled by default in amd64-GENERIC, but is instead easily
modloadable.

Sent to tech-kern@ a month ago. Validated with kASan, and optimized
with tprof.


To generate a diff of this commit:
cvs rdiff -u -r1.259 -r1.260 src/distrib/sets/lists/comp/md.amd64
cvs rdiff -u -r1.77 -r1.78 src/distrib/sets/lists/modules/md.amd64
cvs rdiff -u -r1.195 -r1.196 src/etc/MAKEDEV.tmpl
cvs rdiff -u -r1.1215 -r1.1216 src/sys/conf/files
cvs rdiff -u -r1.79 -r1.80 src/sys/conf/majors
cvs rdiff -u -r1.39 -r1.40 src/sys/dev/Makefile
cvs rdiff -u -r0 -r1.1 src/sys/dev/nvmm/Makefile src/sys/dev/nvmm/files.nvmm \
    src/sys/dev/nvmm/nvmm.c src/sys/dev/nvmm/nvmm.h \
    src/sys/dev/nvmm/nvmm_internal.h src/sys/dev/nvmm/nvmm_ioctl.h
cvs rdiff -u -r0 -r1.1 src/sys/dev/nvmm/x86/Makefile \
    src/sys/dev/nvmm/x86/nvmm_x86.h src/sys/dev/nvmm/x86/nvmm_x86_svm.c \
    src/sys/dev/nvmm/x86/nvmm_x86_svmfunc.S
cvs rdiff -u -r1.209 -r1.210 src/sys/modules/Makefile
cvs rdiff -u -r0 -r1.1 src/sys/modules/nvmm/Makefile \
    src/sys/modules/nvmm/nvmm.ioconf

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/distrib/sets/lists/comp/md.amd64
diff -u src/distrib/sets/lists/comp/md.amd64:1.259 src/distrib/sets/lists/comp/md.amd64:1.260
--- src/distrib/sets/lists/comp/md.amd64:1.259	Tue Jul 17 18:55:24 2018
+++ src/distrib/sets/lists/comp/md.amd64	Wed Nov  7 07:43:07 2018
@@ -1,4 +1,4 @@
-# $NetBSD: md.amd64,v 1.259 2018/07/17 18:55:24 joerg Exp $
+# $NetBSD: md.amd64,v 1.260 2018/11/07 07:43:07 maxv Exp $
 
 ./usr/include/amd64				comp-c-include
 ./usr/include/amd64/ansi.h			comp-c-include
@@ -677,6 +677,11 @@
 ./usr/include/ieeefp.h				comp-c-include
 ./usr/include/mm_malloc.h			comp-obsolete		obsolete
 ./usr/include/mmintrin.h			comp-obsolete		obsolete
+./usr/include/dev/nvmm				comp-c-include
+./usr/include/dev/nvmm/nvmm.h			comp-c-include
+./usr/include/dev/nvmm/nvmm_ioctl.h		comp-c-include
+./usr/include/dev/nvmm/x86			comp-c-include
+./usr/include/dev/nvmm/x86/nvmm_x86.h		comp-c-include
 ./usr/include/pmmintrin.h			comp-obsolete		obsolete
 ./usr/include/x64_64				comp-obsolete		obsolete
 ./usr/include/x64_64/ansi.h			comp-obsolete		obsolete

Index: src/distrib/sets/lists/modules/md.amd64
diff -u src/distrib/sets/lists/modules/md.amd64:1.77 src/distrib/sets/lists/modules/md.amd64:1.78
--- src/distrib/sets/lists/modules/md.amd64:1.77	Tue Aug 28 09:42:10 2018
+++ src/distrib/sets/lists/modules/md.amd64	Wed Nov  7 07:43:07 2018
@@ -1,4 +1,4 @@
-# $NetBSD: md.amd64,v 1.77 2018/08/28 09:42:10 martin Exp $
+# $NetBSD: md.amd64,v 1.78 2018/11/07 07:43:07 maxv Exp $
 #
 # NOTE that there are two sets of files here:
 # @MODULEDIR@ and amd64-xen
@@ -141,6 +141,8 @@
 ./@MODULEDIR@/mt2131/mt2131.kmod		base-kernel-modules	kmod
 ./@MODULEDIR@/nvme				base-obsolete		obsolete
 ./@MODULEDIR@/nvme/nvme.kmod			base-obsolete		obsolete
+./@MODULEDIR@/nvmm				base-kernel-modules	kmod
+./@MODULEDIR@/nvmm/nvmm.kmod			base-kernel-modules	kmod
 ./@MODULEDIR@/nxt2k				base-kernel-modules	kmod
 ./@MODULEDIR@/nxt2k/nxt2k.kmod			base-kernel-modules	kmod
 ./@MODULEDIR@/odcm				base-kernel-modules	kmod

Index: src/etc/MAKEDEV.tmpl
diff -u src/etc/MAKEDEV.tmpl:1.195 src/etc/MAKEDEV.tmpl:1.196
--- src/etc/MAKEDEV.tmpl:1.195	Sun Nov  4 12:48:01 2018
+++ src/etc/MAKEDEV.tmpl	Wed Nov  7 07:43:07 2018
@@ -1,5 +1,5 @@
 #!/bin/sh -
-#	$NetBSD: MAKEDEV.tmpl,v 1.195 2018/11/04 12:48:01 maxv Exp $
+#	$NetBSD: MAKEDEV.tmpl,v 1.196 2018/11/07 07:43:07 maxv Exp $
 #
 # Copyright (c) 2003,2007,2008 The NetBSD Foundation, Inc.
 # All rights reserved.
@@ -258,6 +258,7 @@
 #	nsmb*	SMB requester
 #	nvme*	Non-Volatile Memory Host Controller Interface device driver
 #	nvme*ns* Non-Volatile Memory namespace
+#	nvmm	NetBSD Virtual Machine Monitor
 #	openfirm OpenFirmware accessor
 #	pad*	Pseudo-audio device driver
 #	pci*	PCI bus access devices
@@ -277,7 +278,7 @@
 #	stic*	PixelStamp interface chip
 #	sysmon	System Monitoring hardware
 #	tap*	virtual Ethernet device
-#	tprof   task profiler
+#	tprof	task profiler
 #	tun*	network tunnel driver
 #	twa	3ware Apache control interface
 #	twe	3ware Escalade control interface
@@ -2205,6 +2206,10 @@ nvme[0-9]*)
 	mkdev nvme$unit c %nvme_chr% $(($unit * 65536))
 	;;
 
+nvmm)
+	mkdev nvmm c %nvmm_chr% 0
+	;;
+
 autofs)
 	mkdev autofs c %autofs_chr% 0 600
 	;;

Index: src/sys/conf/files
diff -u src/sys/conf/files:1.1215 src/sys/conf/files:1.1216
--- src/sys/conf/files:1.1215	Fri Oct 19 21:09:10 2018
+++ src/sys/conf/files	Wed Nov  7 07:43:07 2018
@@ -1,4 +1,4 @@
-#	$NetBSD: files,v 1.1215 2018/10/19 21:09:10 jakllsch Exp $
+#	$NetBSD: files,v 1.1216 2018/11/07 07:43:07 maxv Exp $
 #	@(#)files.newconf	7.5 (Berkeley) 5/10/93
 
 version 	20171118
@@ -1549,6 +1549,11 @@ include "lib/libx86emu/files.x86emu"
 include	"dev/tprof/files.tprof"
 
 #
+# NetBSD Virtual Machine Monitor.
+#
+include	"dev/nvmm/files.nvmm"
+
+#
 # alternate memory device
 #
 include "dev/altmem/files.altmem"

Index: src/sys/conf/majors
diff -u src/sys/conf/majors:1.79 src/sys/conf/majors:1.80
--- src/sys/conf/majors:1.79	Sun May 20 14:08:33 2018
+++ src/sys/conf/majors	Wed Nov  7 07:43:07 2018
@@ -1,4 +1,4 @@
-# $NetBSD: majors,v 1.79 2018/05/20 14:08:33 thorpej Exp $
+# $NetBSD: majors,v 1.80 2018/11/07 07:43:07 maxv Exp $
 #
 # Device majors for Machine-Independent drivers.
 #
@@ -78,3 +78,4 @@ device-major nvme      char 341		   nvme
 device-major qemufwcfg char 342		   qemufwcfg
 device-major autofs    char 343		   autofs
 device-major gpiopps   char 344            gpiopps
+device-major nvmm      char 345            nvmm

Index: src/sys/dev/Makefile
diff -u src/sys/dev/Makefile:1.39 src/sys/dev/Makefile:1.40
--- src/sys/dev/Makefile:1.39	Sun Dec 10 20:38:14 2017
+++ src/sys/dev/Makefile	Wed Nov  7 07:43:08 2018
@@ -1,10 +1,14 @@
-#	$NetBSD: Makefile,v 1.39 2017/12/10 20:38:14 bouyer Exp $
+#	$NetBSD: Makefile,v 1.40 2018/11/07 07:43:08 maxv Exp $
 
 SUBDIR=	apm ata bluetooth dec dm dmover dtv filemon hdaudio hdmicec hid hpc \
 	i2c i2o ic ieee1394 ir isa \
 	microcode ofw pci pckbport pcmcia pud putter raidframe sbus scsipi \
 	sun tc usb vme wscons
 
+.if ${MACHINE_ARCH} == "x86_64"
+SUBDIR+= nvmm
+.endif
+
 .include <bsd.own.mk>
 
 .if ${MKISCSI} != "no"

Index: src/sys/modules/Makefile
diff -u src/sys/modules/Makefile:1.209 src/sys/modules/Makefile:1.210
--- src/sys/modules/Makefile:1.209	Tue Aug 28 03:41:38 2018
+++ src/sys/modules/Makefile	Wed Nov  7 07:43:08 2018
@@ -1,4 +1,4 @@
-#	$NetBSD: Makefile,v 1.209 2018/08/28 03:41:38 riastradh Exp $
+#	$NetBSD: Makefile,v 1.210 2018/11/07 07:43:08 maxv Exp $
 
 .include <bsd.own.mk>
 
@@ -202,6 +202,10 @@ SUBDIR+=	tprof_x86
 SUBDIR+=	vmt
 .endif
 
+.if ${MACHINE_ARCH} == "x86_64"
+SUBDIR+=	nvmm
+.endif
+
 .if ${MACHINE_ARCH} == "i386" || \
     ${MACHINE_ARCH} == "x86_64"
 SUBDIR+=	ubsec		# Builds on architectures with PCI bus

Added files:

Index: src/sys/dev/nvmm/Makefile
diff -u /dev/null src/sys/dev/nvmm/Makefile:1.1
--- /dev/null	Wed Nov  7 07:43:08 2018
+++ src/sys/dev/nvmm/Makefile	Wed Nov  7 07:43:08 2018
@@ -0,0 +1,13 @@
+#	$NetBSD: Makefile,v 1.1 2018/11/07 07:43:08 maxv Exp $
+
+.if ${MACHINE_ARCH} == "x86_64"
+SUBDIR= x86
+.endif
+
+.include <bsd.own.mk>
+
+INCSDIR= /usr/include/dev/nvmm
+
+INCS=	nvmm.h nvmm_ioctl.h
+
+.include <bsd.kinc.mk>
Index: src/sys/dev/nvmm/files.nvmm
diff -u /dev/null src/sys/dev/nvmm/files.nvmm:1.1
--- /dev/null	Wed Nov  7 07:43:08 2018
+++ src/sys/dev/nvmm/files.nvmm	Wed Nov  7 07:43:08 2018
@@ -0,0 +1,11 @@
+#	$NetBSD: files.nvmm,v 1.1 2018/11/07 07:43:08 maxv Exp $
+
+defpseudo nvmm
+
+file	dev/nvmm/nvmm.c			nvmm
+
+ifdef amd64
+file	dev/nvmm/x86/nvmm_x86_svm.c	nvmm
+file	dev/nvmm/x86/nvmm_x86_svmfunc.S	nvmm
+endif
+
Index: src/sys/dev/nvmm/nvmm.c
diff -u /dev/null src/sys/dev/nvmm/nvmm.c:1.1
--- /dev/null	Wed Nov  7 07:43:08 2018
+++ src/sys/dev/nvmm/nvmm.c	Wed Nov  7 07:43:08 2018
@@ -0,0 +1,788 @@
+/*	$NetBSD: nvmm.c,v 1.1 2018/11/07 07:43:08 maxv Exp $	*/
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.1 2018/11/07 07:43:08 maxv Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+
+#include <sys/cpu.h>
+#include <sys/conf.h>
+#include <sys/kmem.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_page.h>
+
+#include "ioconf.h"
+
+#include <dev/nvmm/nvmm.h>
+#include <dev/nvmm/nvmm_internal.h>
+#include <dev/nvmm/nvmm_ioctl.h>
+
+static struct nvmm_machine machines[NVMM_MAX_MACHINES];
+
+static const struct nvmm_impl *nvmm_impl_list[] = {
+	&nvmm_x86_svm	/* x86 AMD SVM */
+};
+
+static const struct nvmm_impl *nvmm_impl = NULL;
+
+/* -------------------------------------------------------------------------- */
+
+static int
+nvmm_machine_alloc(struct nvmm_machine **ret)
+{
+	struct nvmm_machine *mach;
+	size_t i;
+
+	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
+		mach = &machines[i];
+		/* Look for the first slot not in use; 'present' is read locked. */
+		rw_enter(&mach->lock, RW_WRITER);
+		if (mach->present) {
+			rw_exit(&mach->lock);
+			continue;
+		}
+		/* Free slot: claim it, and return with the write lock held. */
+		mach->present = true;
+		*ret = mach;
+		return 0;
+	}
+
+	return ENOBUFS;	/* every slot is already present */
+}
+
+static void
+nvmm_machine_free(struct nvmm_machine *mach)
+{
+	KASSERT(rw_write_held(&mach->lock));	/* caller must be the writer */
+	KASSERT(mach->present);
+	mach->present = false;	/* nvmm_machine_alloc() may now reuse the slot */
+}
+
+static int
+nvmm_machine_get(nvmm_machid_t machid, struct nvmm_machine **ret, bool writer)
+{
+	struct nvmm_machine *mach;
+	krw_t op = writer ? RW_WRITER : RW_READER;
+
+	if (machid >= NVMM_MAX_MACHINES) {
+		return EINVAL;
+	}
+	mach = &machines[machid];
+	/* On success the lock (reader or writer) is returned held. */
+	rw_enter(&mach->lock, op);
+	if (!mach->present) {
+		rw_exit(&mach->lock);
+		return ENOENT;
+	}
+	if (mach->procid != curproc->p_pid) {
+		rw_exit(&mach->lock);
+		return EPERM;	/* curproc is not the process that created it */
+	}
+	*ret = mach;
+	/* The caller releases the lock with nvmm_machine_put(). */
+	return 0;
+}
+
+static void
+nvmm_machine_put(struct nvmm_machine *mach)
+{
+	rw_exit(&mach->lock);	/* taken by nvmm_machine_{alloc,get}() */
+}
+
+/* -------------------------------------------------------------------------- */
+
+static int
+nvmm_vcpu_alloc(struct nvmm_machine *mach, struct nvmm_cpu **ret)
+{
+	struct nvmm_cpu *vcpu;
+	size_t i;
+
+	for (i = 0; i < NVMM_MAX_VCPUS; i++) {
+		vcpu = &mach->cpus[i];
+		/* Look for the first VCPU slot of this machine not in use. */
+		mutex_enter(&vcpu->lock);
+		if (vcpu->present) {
+			mutex_exit(&vcpu->lock);
+			continue;
+		}
+		/* Free slot: claim it, and return with its mutex held. */
+		vcpu->present = true;
+		vcpu->cpuid = i;
+		*ret = vcpu;
+		return 0;
+	}
+
+	return ENOBUFS;	/* every VCPU slot is already present */
+}
+
+static void
+nvmm_vcpu_free(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+	KASSERT(mutex_owned(&vcpu->lock));	/* caller still holds the VCPU */
+	vcpu->present = false;
+	vcpu->hcpu_last = -1;	/* same value nvmm_init() gives a fresh slot */
+}
+
+int
+nvmm_vcpu_get(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
+    struct nvmm_cpu **ret)
+{
+	struct nvmm_cpu *vcpu;
+
+	if (cpuid >= NVMM_MAX_VCPUS) {
+		return EINVAL;
+	}
+	vcpu = &mach->cpus[cpuid];
+	/* On success the VCPU mutex is returned held. */
+	mutex_enter(&vcpu->lock);
+	if (!vcpu->present) {
+		mutex_exit(&vcpu->lock);
+		return ENOENT;
+	}
+	*ret = vcpu;
+	/* The caller releases the mutex with nvmm_vcpu_put(). */
+	return 0;
+}
+
+void
+nvmm_vcpu_put(struct nvmm_cpu *vcpu)
+{
+	mutex_exit(&vcpu->lock);	/* taken by nvmm_vcpu_{alloc,get}() */
+}
+
+/* -------------------------------------------------------------------------- */
+
+static void
+nvmm_kill_machines(pid_t pid)
+{
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	size_t i, j;
+	int error;
+
+	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
+		mach = &machines[i];
+		rw_enter(&mach->lock, RW_WRITER);
+		if (!mach->present || mach->procid != pid) {
+			rw_exit(&mach->lock);
+			continue;
+		}
+
+		/* Kill it: every present VCPU first, then the machine. */
+		for (j = 0; j < NVMM_MAX_VCPUS; j++) {
+			error = nvmm_vcpu_get(mach, j, &vcpu);
+			if (error)
+				continue;
+			(*nvmm_impl->vcpu_destroy)(mach, vcpu);
+			nvmm_vcpu_free(mach, vcpu);
+			nvmm_vcpu_put(vcpu);
+		}
+		/* As in nvmm_machine_destroy(): the backend is told as well. */
+		(*nvmm_impl->machine_destroy)(mach);
+		uvmspace_free(mach->vm);
+		uao_detach(mach->uobj);
+		nvmm_machine_free(mach);
+		rw_exit(&mach->lock);
+	}
+}
+
+/* -------------------------------------------------------------------------- */
+
+static int
+nvmm_capability(struct nvmm_ioc_capability *args)
+{
+	args->cap.version = NVMM_CAPABILITY_VERSION;
+	args->cap.state_size = nvmm_impl->state_size;
+	args->cap.max_machines = NVMM_MAX_MACHINES;
+	args->cap.max_vcpus = NVMM_MAX_VCPUS;
+	args->cap.max_ram = NVMM_MAX_RAM;
+	/* The generic fields are set above; the backend callback runs last. */
+	(*nvmm_impl->capability)(&args->cap);
+
+	return 0;	/* cannot fail */
+}
+
+static int
+nvmm_machine_create(struct nvmm_ioc_machine_create *args)
+{
+	struct nvmm_machine *mach;
+	int error;
+
+	error = nvmm_machine_alloc(&mach);
+	if (error)
+		return error;
+
+	/* Curproc owns the machine. */
+	mach->procid = curproc->p_pid;
+
+	/* Create the machine vmspace. */
+	mach->gpa_begin = 0;
+	mach->gpa_end = NVMM_MAX_RAM;
+	mach->vm = uvmspace_alloc(0, mach->gpa_end - mach->gpa_begin, false);
+	mach->uobj = uao_create(mach->gpa_end - mach->gpa_begin, 0);
+	/* Grab a reference for the machine.  NOTE(review): the teardown paths
+	 * uao_detach() only once; confirm uao_create()'s ref is not leaked. */
+	uao_reference(mach->uobj);
+
+	(*nvmm_impl->machine_create)(mach);
+	/* Report the id, and drop the write lock nvmm_machine_alloc() took. */
+	args->machid = mach->machid;
+	nvmm_machine_put(mach);
+
+	return 0;
+}
+
+static int
+nvmm_machine_destroy(struct nvmm_ioc_machine_destroy *args)
+{
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	int error;
+	size_t i;
+
+	error = nvmm_machine_get(args->machid, &mach, true);
+	if (error)
+		return error;
+	/* Destroy every present VCPU first; absent slots are skipped. */
+	for (i = 0; i < NVMM_MAX_VCPUS; i++) {
+		error = nvmm_vcpu_get(mach, i, &vcpu);
+		if (error)
+			continue;
+
+		(*nvmm_impl->vcpu_destroy)(mach, vcpu);
+		nvmm_vcpu_free(mach, vcpu);
+		nvmm_vcpu_put(vcpu);
+	}
+
+	(*nvmm_impl->machine_destroy)(mach);
+
+	/* Free the machine vmspace. */
+	uvmspace_free(mach->vm);
+	uao_detach(mach->uobj);
+	/* Mark the slot free, then release the write lock taken above. */
+	nvmm_machine_free(mach);
+	nvmm_machine_put(mach);
+
+	return 0;	/* errors from the VCPU loop are not reported */
+}
+
+static int
+nvmm_machine_configure(struct nvmm_ioc_machine_configure *args)
+{
+	struct nvmm_machine *mach;
+	size_t allocsz;
+	void *data;
+	int error;
+
+	if (__predict_false(args->op >= nvmm_impl->conf_max)) {
+		return EINVAL;
+	}
+	/* The backend's table gives the structure size for this operation. */
+	allocsz = nvmm_impl->conf_sizes[args->op];
+	data = kmem_alloc(allocsz, KM_SLEEP);
+
+	error = nvmm_machine_get(args->machid, &mach, true);
+	if (error) {
+		kmem_free(data, allocsz);
+		return error;
+	}
+	/* Fetch the configuration from userland into the kernel buffer. */
+	error = copyin(args->conf, data, allocsz);
+	if (error) {
+		goto out;
+	}
+
+	error = (*nvmm_impl->machine_configure)(mach, args->op, data);
+
+out:
+	nvmm_machine_put(mach);
+	kmem_free(data, allocsz);
+	return error;
+}
+
+static int
+nvmm_vcpu_create(struct nvmm_ioc_vcpu_create *args)
+{
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	int error;
+
+	error = nvmm_machine_get(args->machid, &mach, false);
+	if (error)
+		return error;
+	/* The new VCPU slot comes back with its mutex held. */
+	error = nvmm_vcpu_alloc(mach, &vcpu);
+	if (error)
+		goto out;
+
+	error = (*nvmm_impl->vcpu_create)(mach, vcpu);
+	if (error) {
+		nvmm_vcpu_free(mach, vcpu);	/* give the slot back */
+		nvmm_vcpu_put(vcpu);
+		goto out;
+	}
+	/* Created: release the mutex nvmm_vcpu_alloc() left held. */
+	nvmm_vcpu_put(vcpu);
+
+out:
+	nvmm_machine_put(mach);
+	return error;
+}
+
+static int
+nvmm_vcpu_destroy(struct nvmm_ioc_vcpu_destroy *args)
+{
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	int error;
+
+	error = nvmm_machine_get(args->machid, &mach, false);
+	if (error)
+		return error;
+
+	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
+	if (error)
+		goto out;
+	/* Backend teardown first, then the slot is marked free and unlocked. */
+	(*nvmm_impl->vcpu_destroy)(mach, vcpu);
+	nvmm_vcpu_free(mach, vcpu);
+	nvmm_vcpu_put(vcpu);
+
+out:
+	nvmm_machine_put(mach);
+	return error;
+}
+
+static int
+nvmm_vcpu_setstate(struct nvmm_ioc_vcpu_setstate *args)
+{
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	void *data;
+	int error;
+
+	data = kmem_alloc(nvmm_impl->state_size, KM_SLEEP);
+
+	error = nvmm_machine_get(args->machid, &mach, false);
+	if (error) {
+		kmem_free(data, nvmm_impl->state_size);
+		return error;
+	}
+
+	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
+	if (error)
+		goto out;
+	/* The whole state structure is fetched, whatever args->flags says. */
+	error = copyin(args->state, data, nvmm_impl->state_size);
+	if (error) {
+		nvmm_vcpu_put(vcpu);
+		goto out;
+	}
+	/* The backend applies the state; args->flags is passed through. */
+	(*nvmm_impl->vcpu_setstate)(vcpu, data, args->flags);
+	nvmm_vcpu_put(vcpu);
+
+out:
+	nvmm_machine_put(mach);
+	kmem_free(data, nvmm_impl->state_size);
+	return error;
+}
+
+static int
+nvmm_vcpu_getstate(struct nvmm_ioc_vcpu_getstate *args)
+{
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	void *data;
+	int error;
+
+	/* Zeroed: all of it is copied out, whatever the backend fills in. */
+	data = kmem_zalloc(nvmm_impl->state_size, KM_SLEEP);
+	error = nvmm_machine_get(args->machid, &mach, false);
+	if (error) {
+		kmem_free(data, nvmm_impl->state_size);
+		return error;
+	}
+
+	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
+	if (error)
+		goto out;
+
+	(*nvmm_impl->vcpu_getstate)(vcpu, data, args->flags);
+	nvmm_vcpu_put(vcpu);
+	error = copyout(data, args->state, nvmm_impl->state_size);
+
+out:
+	nvmm_machine_put(mach);
+	kmem_free(data, nvmm_impl->state_size);
+	return error;
+}
+
+static int
+nvmm_vcpu_inject(struct nvmm_ioc_vcpu_inject *args)
+{
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	int error;
+
+	error = nvmm_machine_get(args->machid, &mach, false);
+	if (error)
+		return error;
+
+	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
+	if (error)
+		goto out;
+	/* The backend's verdict on the event is what the caller gets back. */
+	error = (*nvmm_impl->vcpu_inject)(mach, vcpu, &args->event);
+	nvmm_vcpu_put(vcpu);
+
+out:
+	nvmm_machine_put(mach);
+	return error;
+}
+
+static int
+nvmm_vcpu_run(struct nvmm_ioc_vcpu_run *args)
+{
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	int error;
+
+	error = nvmm_machine_get(args->machid, &mach, false);
+	if (error)
+		return error;
+
+	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
+	if (error)
+		goto out;
+	/* NOTE(review): any status from vcpu_run is dropped; error is 0 here. */
+	(*nvmm_impl->vcpu_run)(mach, vcpu, &args->exit);
+	nvmm_vcpu_put(vcpu);
+
+out:
+	nvmm_machine_put(mach);
+	return error;
+}
+
+/* -------------------------------------------------------------------------- */
+
+static int
+nvmm_gpa_map(struct nvmm_ioc_gpa_map *args)
+{
+	struct proc *p = curproc;
+	struct nvmm_machine *mach;
+	struct vmspace *vmspace;
+	gpaddr_t gpa;
+	vaddr_t uva;
+	int error;
+
+	error = nvmm_machine_get(args->machid, &mach, false);
+	if (error)
+		return error;
+	vmspace = p->p_vmspace;
+
+	if ((args->gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0 ||
+	    (args->hva % PAGE_SIZE) != 0) {
+		error = EINVAL;
+		goto out;
+	}
+	if (args->hva == 0) {
+		error = EINVAL;
+		goto out;
+	}
+	if (args->gpa < mach->gpa_begin || args->gpa >= mach->gpa_end) {
+		error = EINVAL;
+		goto out;
+	}
+	/* Reject an empty range, and a range that wraps around. */
+	if (args->gpa + args->size <= args->gpa) {
+		error = EINVAL;
+		goto out;
+	}
+	/* gpa_end is exclusive: the range may end exactly on it. */
+	if (args->gpa + args->size > mach->gpa_end) {
+		error = EINVAL;
+		goto out;
+	}
+	gpa = args->gpa;
+
+	/* Take a reference for the kernel. */
+	uao_reference(mach->uobj);
+	/* Map the uobj into the machine address space, as pageable. */
+	error = uvm_map(&mach->vm->vm_map, &gpa, args->size, mach->uobj,
+	    args->gpa, 0, UVM_MAPFLAG(UVM_PROT_RWX, UVM_PROT_RWX,
+	    UVM_INH_NONE, UVM_ADV_NORMAL, UVM_FLAG_FIXED));
+	if (error) {
+		uao_detach(mach->uobj);
+		goto out;
+	}
+	if (gpa != args->gpa) {
+		uao_detach(mach->uobj);
+		printf("[!] uvm_map problem\n");
+		error = EINVAL;
+		goto out;
+	}
+
+	uva = (vaddr_t)args->hva;
+	/* Take a reference for the user. */
+	uao_reference(mach->uobj);
+	/* Map the uobj into the user address space, as pageable. */
+	error = uvm_map(&vmspace->vm_map, &uva, args->size, mach->uobj,
+	    args->gpa, 0, UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW,
+	    UVM_INH_SHARE, UVM_ADV_NORMAL, UVM_FLAG_FIXED|UVM_FLAG_UNMAP));
+	if (error) {
+		uao_detach(mach->uobj);
+		/* Undo the guest-side mapping made above: all or nothing. */
+		uvm_unmap(&mach->vm->vm_map, gpa, gpa + args->size);
+		goto out;
+	}
+
+out:
+	nvmm_machine_put(mach);
+	return error;
+}
+
+static int
+nvmm_gpa_unmap(struct nvmm_ioc_gpa_unmap *args)
+{
+	struct nvmm_machine *mach;
+	gpaddr_t gpa;
+	int error;
+
+	error = nvmm_machine_get(args->machid, &mach, false);
+	if (error)
+		return error;
+
+	if ((args->gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0) {
+		error = EINVAL;
+		goto out;
+	}
+	if (args->gpa < mach->gpa_begin || args->gpa >= mach->gpa_end) {
+		error = EINVAL;
+		goto out;
+	}
+	if (args->gpa + args->size <= args->gpa) {
+		error = EINVAL;
+		goto out;
+	}
+	/* gpa_end is exclusive: the range may end exactly on it. */
+	if (args->gpa + args->size > mach->gpa_end) {
+		error = EINVAL;
+		goto out;
+	}
+	gpa = args->gpa;
+	/* Unmap the memory from the machine. */
+	uvm_unmap(&mach->vm->vm_map, gpa, gpa + args->size);
+
+out:
+	nvmm_machine_put(mach);
+	return error;
+}
+
+/* -------------------------------------------------------------------------- */
+
+static int
+nvmm_init(void)
+{
+	size_t i, n;
+	/* Select the first backend in the list whose ident() returns true. */
+	for (i = 0; i < __arraycount(nvmm_impl_list); i++) {
+		if (!(*nvmm_impl_list[i]->ident)()) {
+			continue;
+		}
+		nvmm_impl = nvmm_impl_list[i];
+		break;
+	}
+	if (nvmm_impl == NULL) {
+		printf("[!] No implementation found\n");
+		return ENOTSUP;
+	}
+	/* Number the machine slots, and set up their locks and VCPU slots. */
+	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
+		machines[i].machid = i;
+		rw_init(&machines[i].lock);
+		for (n = 0; n < NVMM_MAX_VCPUS; n++) {
+			mutex_init(&machines[i].cpus[n].lock, MUTEX_DEFAULT,
+			    IPL_NONE);
+			machines[i].cpus[n].hcpu_last = -1;
+		}
+	}
+	/* The backend initializes itself once the slots are ready. */
+	(*nvmm_impl->init)();
+
+	return 0;
+}
+
+static void
+nvmm_fini(void)
+{
+	size_t i, n;
+	/* Counterpart of nvmm_init(): locks first, then the backend. */
+	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
+		rw_destroy(&machines[i].lock);
+		for (n = 0; n < NVMM_MAX_VCPUS; n++) {
+			mutex_destroy(&machines[i].cpus[n].lock);
+		}
+		/* TODO need to free stuff, etc */
+	}
+
+	(*nvmm_impl->fini)();
+}
+
+/* -------------------------------------------------------------------------- */
+
+static int
+nvmm_open(dev_t dev, int flags, int type, struct lwp *l)
+{
+	if (minor(dev) != 0) {	/* minor 0 is the only one accepted */
+		return EXDEV;
+	}
+
+	return 0;	/* nothing is set up at open time */
+}
+
+static int
+nvmm_close(dev_t dev, int flags, int type, struct lwp *l)
+{
+	KASSERT(minor(dev) == 0);
+	/* Destroy every machine whose owner is the closing process. */
+	nvmm_kill_machines(l->l_proc->p_pid);
+
+	return 0;
+}
+
+static int
+nvmm_ioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l)
+{
+	KASSERT(minor(dev) == 0);
+	/* Each handler takes 'data' as its own nvmm_ioc_* argument type. */
+	switch (cmd) {
+	case NVMM_IOC_CAPABILITY:
+		return nvmm_capability(data);
+	case NVMM_IOC_MACHINE_CREATE:
+		return nvmm_machine_create(data);
+	case NVMM_IOC_MACHINE_DESTROY:
+		return nvmm_machine_destroy(data);
+	case NVMM_IOC_MACHINE_CONFIGURE:
+		return nvmm_machine_configure(data);
+	case NVMM_IOC_VCPU_CREATE:
+		return nvmm_vcpu_create(data);
+	case NVMM_IOC_VCPU_DESTROY:
+		return nvmm_vcpu_destroy(data);
+	case NVMM_IOC_VCPU_SETSTATE:
+		return nvmm_vcpu_setstate(data);
+	case NVMM_IOC_VCPU_GETSTATE:
+		return nvmm_vcpu_getstate(data);
+	case NVMM_IOC_VCPU_INJECT:
+		return nvmm_vcpu_inject(data);
+	case NVMM_IOC_VCPU_RUN:
+		return nvmm_vcpu_run(data);
+	case NVMM_IOC_GPA_MAP:
+		return nvmm_gpa_map(data);
+	case NVMM_IOC_GPA_UNMAP:
+		return nvmm_gpa_unmap(data);
+	default:
+		return EINVAL;	/* not an NVMM command */
+	}
+}
+
+const struct cdevsw nvmm_cdevsw = {
+	.d_open = nvmm_open,
+	.d_close = nvmm_close,	/* also destroys the closer's machines */
+	.d_read = noread,
+	.d_write = nowrite,
+	.d_ioctl = nvmm_ioctl,	/* every NVMM operation is an ioctl */
+	.d_stop = nostop,
+	.d_tty = notty,
+	.d_poll = nopoll,
+	.d_mmap = nommap,
+	.d_kqfilter = nokqfilter,
+	.d_discard = nodiscard,
+	.d_flag = D_OTHER | D_MPSAFE
+};
+
+void
+nvmmattach(int nunits)
+{
+	/* nothing: the setup is done by nvmm_init() at MODULE_CMD_INIT */
+}
+
+MODULE(MODULE_CLASS_DRIVER, nvmm, NULL);
+
+static int
+nvmm_modcmd(modcmd_t cmd, void *arg)
+{
+	int error;
+
+	switch (cmd) {
+	case MODULE_CMD_INIT:
+		error = nvmm_init();
+		if (error)
+			return error;
+		/* When built as a module, attach the character device here. */
+#if defined(_MODULE)
+		{
+			devmajor_t bmajor = NODEVMAJOR;
+			devmajor_t cmajor = 345;	/* see sys/conf/majors */
+
+			/* mknod /dev/nvmm c 345 0 */
+			error = devsw_attach("nvmm", NULL, &bmajor,
+			    &nvmm_cdevsw, &cmajor);
+			if (error) {
+				nvmm_fini();	/* undo nvmm_init() */
+				return error;
+			}
+		}
+#endif
+		return 0;
+
+	case MODULE_CMD_FINI:
+#if defined(_MODULE)
+		{
+			error = devsw_detach(NULL, &nvmm_cdevsw);
+			if (error) {
+				return error;
+			}
+		}
+#endif
+		nvmm_fini();
+		return 0;
+
+	default:
+		return ENOTTY;	/* any other module command */
+	}
+}
Index: src/sys/dev/nvmm/nvmm.h
diff -u /dev/null src/sys/dev/nvmm/nvmm.h:1.1
--- /dev/null	Wed Nov  7 07:43:08 2018
+++ src/sys/dev/nvmm/nvmm.h	Wed Nov  7 07:43:08 2018
@@ -0,0 +1,155 @@
+/*	$NetBSD: nvmm.h,v 1.1 2018/11/07 07:43:08 maxv Exp $	*/
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NVMM_H_
+#define _NVMM_H_
+
+#include <sys/types.h>
+
+#ifndef _KERNEL
+#include <stdbool.h>
+#endif
+
+typedef uint64_t	gpaddr_t;
+typedef uint64_t	gvaddr_t;
+
+typedef uint32_t	nvmm_machid_t;
+typedef uint32_t	nvmm_cpuid_t;
+
+enum nvmm_exit_reason {
+	NVMM_EXIT_NONE		= 0x0000000000000000,
+
+	/* General. */
+	NVMM_EXIT_MEMORY	= 0x0000000000000001,
+	NVMM_EXIT_IO		= 0x0000000000000002,
+	NVMM_EXIT_MSR		= 0x0000000000000003,
+	NVMM_EXIT_INT_READY	= 0x0000000000000004,
+	NVMM_EXIT_NMI_READY	= 0x0000000000000005,
+	NVMM_EXIT_SHUTDOWN	= 0x0000000000000006,
+
+	/* Instructions (x86). */
+	NVMM_EXIT_HLT		= 0x0000000000001000,
+	NVMM_EXIT_MONITOR	= 0x0000000000001001,
+	NVMM_EXIT_MWAIT		= 0x0000000000001002,
+	NVMM_EXIT_MWAIT_COND	= 0x0000000000001003,
+
+	NVMM_EXIT_INVALID	= 0xFFFFFFFFFFFFFFFF
+};
+
+enum nvmm_exit_memory_perm {
+	NVMM_EXIT_MEMORY_READ,
+	NVMM_EXIT_MEMORY_WRITE,
+	NVMM_EXIT_MEMORY_EXEC
+};
+
+struct nvmm_exit_memory {
+	enum nvmm_exit_memory_perm perm;
+	gpaddr_t gpa;
+	uint8_t inst_len;
+	uint8_t inst_bytes[15];
+	uint64_t npc;
+};
+
+enum nvmm_exit_io_type {
+	NVMM_EXIT_IO_IN,
+	NVMM_EXIT_IO_OUT
+};
+
+struct nvmm_exit_io {
+	enum nvmm_exit_io_type type;
+	uint16_t port;
+	int seg;
+	uint8_t address_size;
+	uint8_t operand_size;
+	bool rep;
+	bool str;
+	uint64_t npc;
+};
+
+enum nvmm_exit_msr_type {
+	NVMM_EXIT_MSR_RDMSR,
+	NVMM_EXIT_MSR_WRMSR
+};
+
+struct nvmm_exit_msr {
+	enum nvmm_exit_msr_type type;
+	uint64_t msr;
+	uint64_t val;
+	uint64_t npc;
+};
+
+struct nvmm_exit {
+	enum nvmm_exit_reason reason;
+	union {
+		struct nvmm_exit_memory mem;
+		struct nvmm_exit_io io;
+		struct nvmm_exit_msr msr;
+	} u;
+	uint64_t exitstate[8];
+};
+
+enum nvmm_event_type {
+	NVMM_EVENT_INTERRUPT_HW,
+	NVMM_EVENT_INTERRUPT_SW,
+	NVMM_EVENT_EXCEPTION
+};
+
+struct nvmm_event {
+	enum nvmm_event_type type;
+	uint64_t vector;
+	union {
+		/* NVMM_EVENT_INTERRUPT_HW */
+		uint8_t prio;
+
+		/* NVMM_EVENT_EXCEPTION */
+		uint64_t error;
+	} u;
+};
+
+#define NVMM_CAPABILITY_VERSION		1
+
+struct nvmm_capability {
+	uint64_t version;
+	uint64_t state_size;
+	uint64_t max_machines;
+	uint64_t max_vcpus;
+	uint64_t max_ram;
+	union {
+		struct {
+			uint64_t xcr0_mask;
+			uint64_t mxcsr_mask;
+			uint64_t conf_cpuid_maxops;
+		} x86;
+		uint64_t rsvd[8];
+	} u;
+};
+
+#endif
Index: src/sys/dev/nvmm/nvmm_internal.h
diff -u /dev/null src/sys/dev/nvmm/nvmm_internal.h:1.1
--- /dev/null	Wed Nov  7 07:43:08 2018
+++ src/sys/dev/nvmm/nvmm_internal.h	Wed Nov  7 07:43:08 2018
@@ -0,0 +1,100 @@
+/*	$NetBSD: nvmm_internal.h,v 1.1 2018/11/07 07:43:08 maxv Exp $	*/
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NVMM_INTERNAL_H_
+#define _NVMM_INTERNAL_H_
+
+/*
+ * Static limits. NVMM_MAX_VCPUS sizes the cpus[] array of struct
+ * nvmm_machine.
+ */
+#define NVMM_MAX_MACHINES	128
+#define NVMM_MAX_VCPUS		256
+/*
+ * 4GB. Built from a 64-bit operand so that the value does not wrap to zero
+ * where 'unsigned long' is 32 bits wide.
+ */
+#define NVMM_MAX_RAM		((uint64_t)4 << 30)
+
+/*
+ * One virtual CPU slot. struct nvmm_machine embeds NVMM_MAX_VCPUS of them.
+ */
+struct nvmm_cpu {
+	/* Shared. */
+	bool present;		/* presumably: this slot is in use */
+	nvmm_cpuid_t cpuid;
+	kmutex_t lock;
+
+	/* Last host CPU on which the VCPU ran. */
+	int hcpu_last;
+
+	/* Implementation-specific. */
+	void *cpudata;		/* the SVM backend stores a struct svm_cpudata here */
+};
+
+/*
+ * One virtual machine: identity, guest address space and VCPU slots.
+ */
+struct nvmm_machine {
+	bool present;		/* presumably: this slot is in use */
+	nvmm_machid_t machid;
+	pid_t procid;		/* presumably the owning process -- TODO confirm */
+	krwlock_t lock;
+
+	/* Kernel */
+	struct vmspace *vm;
+	struct uvm_object *uobj;
+	gpaddr_t gpa_begin;
+	gpaddr_t gpa_end;
+
+	/* CPU */
+	struct nvmm_cpu cpus[NVMM_MAX_VCPUS];
+
+	/* Implementation-specific */
+	void *machdata;
+};
+
+/*
+ * Operations vector of a machine-dependent backend. nvmm_x86_svm is
+ * declared in this header as one instance of it.
+ */
+struct nvmm_impl {
+	/* Backend-wide hooks. */
+	bool (*ident)(void);
+	void (*init)(void);
+	void (*fini)(void);
+	void (*capability)(struct nvmm_capability *);
+
+	/*
+	 * Presumably: conf_max is the number of configuration ops, and
+	 * conf_sizes[] gives the size of the argument of each op.
+	 */
+	size_t conf_max;
+	const size_t *conf_sizes;
+	size_t state_size;
+
+	/* Per-machine hooks. */
+	void (*machine_create)(struct nvmm_machine *);
+	void (*machine_destroy)(struct nvmm_machine *);
+	int (*machine_configure)(struct nvmm_machine *, uint64_t, void *);
+
+	/* Per-VCPU hooks. */
+	int (*vcpu_create)(struct nvmm_machine *, struct nvmm_cpu *);
+	void (*vcpu_destroy)(struct nvmm_machine *, struct nvmm_cpu *);
+	/* The uint64_t is presumably a mask of state flags -- TODO confirm. */
+	void (*vcpu_setstate)(struct nvmm_cpu *, void *, uint64_t);
+	void (*vcpu_getstate)(struct nvmm_cpu *, void *, uint64_t);
+	int (*vcpu_inject)(struct nvmm_machine *, struct nvmm_cpu *,
+	    struct nvmm_event *);
+	int (*vcpu_run)(struct nvmm_machine *, struct nvmm_cpu *,
+	    struct nvmm_exit *);
+};
+
+/*
+ * VCPU lookup and release. Presumably nvmm_vcpu_get() hands back the VCPU
+ * through its last argument and nvmm_vcpu_put() releases it -- TODO confirm
+ * against nvmm.c.
+ */
+int nvmm_vcpu_get(struct nvmm_machine *, nvmm_cpuid_t, struct nvmm_cpu **);
+void nvmm_vcpu_put(struct nvmm_cpu *);
+
+/* The x86 SVM backend. */
+extern const struct nvmm_impl nvmm_x86_svm;
+
+#endif /* _NVMM_INTERNAL_H_ */
Index: src/sys/dev/nvmm/nvmm_ioctl.h
diff -u /dev/null src/sys/dev/nvmm/nvmm_ioctl.h:1.1
--- /dev/null	Wed Nov  7 07:43:08 2018
+++ src/sys/dev/nvmm/nvmm_ioctl.h	Wed Nov  7 07:43:08 2018
@@ -0,0 +1,120 @@
+/*	$NetBSD: nvmm_ioctl.h,v 1.1 2018/11/07 07:43:08 maxv Exp $	*/
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NVMM_IOCTL_H_
+#define _NVMM_IOCTL_H_
+
+#include <dev/nvmm/nvmm.h>
+
+/* Argument of NVMM_IOC_CAPABILITY. */
+struct nvmm_ioc_capability {
+	struct nvmm_capability cap;
+};
+
+/* Argument of NVMM_IOC_MACHINE_CREATE. */
+struct nvmm_ioc_machine_create {
+	nvmm_machid_t machid;
+};
+
+/* Argument of NVMM_IOC_MACHINE_DESTROY. */
+struct nvmm_ioc_machine_destroy {
+	nvmm_machid_t machid;
+};
+
+/* Argument of NVMM_IOC_MACHINE_CONFIGURE. */
+struct nvmm_ioc_machine_configure {
+	nvmm_machid_t machid;
+	uint64_t op;		/* presumably e.g. NVMM_X86_CONF_CPUID on x86 */
+	void *conf;		/* presumably the op-specific argument */
+};
+
+/* Argument of NVMM_IOC_VCPU_CREATE. */
+struct nvmm_ioc_vcpu_create {
+	nvmm_machid_t machid;
+	nvmm_cpuid_t cpuid;
+};
+
+/* Argument of NVMM_IOC_VCPU_DESTROY. */
+struct nvmm_ioc_vcpu_destroy {
+	nvmm_machid_t machid;
+	nvmm_cpuid_t cpuid;
+};
+
+/* Argument of NVMM_IOC_VCPU_SETSTATE. */
+struct nvmm_ioc_vcpu_setstate {
+	nvmm_machid_t machid;
+	nvmm_cpuid_t cpuid;
+	uint64_t flags;		/* presumably NVMM_X64_STATE_* on x86 */
+	void *state;		/* presumably a struct nvmm_x64_state on x86 */
+};
+
+/* Argument of NVMM_IOC_VCPU_GETSTATE; same layout as the setstate one. */
+struct nvmm_ioc_vcpu_getstate {
+	nvmm_machid_t machid;
+	nvmm_cpuid_t cpuid;
+	uint64_t flags;
+	void *state;
+};
+
+/* Argument of NVMM_IOC_VCPU_INJECT. */
+struct nvmm_ioc_vcpu_inject {
+	nvmm_machid_t machid;
+	nvmm_cpuid_t cpuid;
+	struct nvmm_event event;
+};
+
+/* Argument of NVMM_IOC_VCPU_RUN. */
+struct nvmm_ioc_vcpu_run {
+	/* input */
+	nvmm_machid_t machid;
+	nvmm_cpuid_t cpuid;
+	/* output */
+	struct nvmm_exit exit;
+};
+
+/*
+ * Argument of NVMM_IOC_GPA_MAP. Going by the field names, 'hva' is a host
+ * virtual address and 'gpa' a guest physical address -- TODO confirm.
+ */
+struct nvmm_ioc_gpa_map {
+	nvmm_machid_t machid;
+	uintptr_t hva;
+	gpaddr_t gpa;
+	size_t size;
+	int flags;
+};
+
+/* Argument of NVMM_IOC_GPA_UNMAP. */
+struct nvmm_ioc_gpa_unmap {
+	nvmm_machid_t machid;
+	gpaddr_t gpa;
+	size_t size;
+};
+
+/*
+ * Ioctl commands, all in group 'N', numbered 0 to 11. Each one takes the
+ * nvmm_ioc_* structure of the same name as its argument.
+ */
+#define NVMM_IOC_CAPABILITY		_IOR ('N',  0, struct nvmm_ioc_capability)
+#define NVMM_IOC_MACHINE_CREATE		_IOWR('N',  1, struct nvmm_ioc_machine_create)
+#define NVMM_IOC_MACHINE_DESTROY	_IOW ('N',  2, struct nvmm_ioc_machine_destroy)
+#define NVMM_IOC_MACHINE_CONFIGURE	_IOW ('N',  3, struct nvmm_ioc_machine_configure)
+#define NVMM_IOC_VCPU_CREATE		_IOW ('N',  4, struct nvmm_ioc_vcpu_create)
+#define NVMM_IOC_VCPU_DESTROY		_IOW ('N',  5, struct nvmm_ioc_vcpu_destroy)
+#define NVMM_IOC_VCPU_SETSTATE		_IOW ('N',  6, struct nvmm_ioc_vcpu_setstate)
+#define NVMM_IOC_VCPU_GETSTATE		_IOW ('N',  7, struct nvmm_ioc_vcpu_getstate)
+#define NVMM_IOC_VCPU_INJECT		_IOWR('N',  8, struct nvmm_ioc_vcpu_inject)
+#define NVMM_IOC_VCPU_RUN		_IOWR('N',  9, struct nvmm_ioc_vcpu_run)
+#define NVMM_IOC_GPA_MAP		_IOW ('N', 10, struct nvmm_ioc_gpa_map)
+#define NVMM_IOC_GPA_UNMAP		_IOW ('N', 11, struct nvmm_ioc_gpa_unmap)
+
+#endif /* _NVMM_IOCTL_H_ */

Index: src/sys/dev/nvmm/x86/Makefile
diff -u /dev/null src/sys/dev/nvmm/x86/Makefile:1.1
--- /dev/null	Wed Nov  7 07:43:08 2018
+++ src/sys/dev/nvmm/x86/Makefile	Wed Nov  7 07:43:08 2018
@@ -0,0 +1,7 @@
+#	$NetBSD: Makefile,v 1.1 2018/11/07 07:43:08 maxv Exp $
+
+# Install nvmm_x86.h into /usr/include/dev/nvmm/x86.
+
+INCSDIR= /usr/include/dev/nvmm/x86
+
+INCS=	nvmm_x86.h
+
+.include <bsd.kinc.mk>
Index: src/sys/dev/nvmm/x86/nvmm_x86.h
diff -u /dev/null src/sys/dev/nvmm/x86/nvmm_x86.h:1.1
--- /dev/null	Wed Nov  7 07:43:08 2018
+++ src/sys/dev/nvmm/x86/nvmm_x86.h	Wed Nov  7 07:43:08 2018
@@ -0,0 +1,172 @@
+/*	$NetBSD: nvmm_x86.h,v 1.1 2018/11/07 07:43:08 maxv Exp $	*/
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NVMM_X86_H_
+#define _NVMM_X86_H_
+
+/*
+ * Index constants for the arrays of struct nvmm_x64_state. In each group
+ * the final NVMM_X64_N* constant is the number of entries and is used as
+ * the array size. These definitions sit outside the ASM_NVMM guard, so
+ * they remain visible when ASM_NVMM is defined.
+ */
+
+/* Segments. */
+#define NVMM_X64_SEG_CS			0
+#define NVMM_X64_SEG_DS			1
+#define NVMM_X64_SEG_ES			2
+#define NVMM_X64_SEG_FS			3
+#define NVMM_X64_SEG_GS			4
+#define NVMM_X64_SEG_SS			5
+#define NVMM_X64_SEG_GDT		6
+#define NVMM_X64_SEG_IDT		7
+#define NVMM_X64_SEG_LDT		8
+#define NVMM_X64_SEG_TR			9
+#define NVMM_X64_NSEG			10
+
+/* General Purpose Registers. */
+#define NVMM_X64_GPR_RAX		0
+#define NVMM_X64_GPR_RBX		1
+#define NVMM_X64_GPR_RCX		2
+#define NVMM_X64_GPR_RDX		3
+#define NVMM_X64_GPR_R8			4
+#define NVMM_X64_GPR_R9			5
+#define NVMM_X64_GPR_R10		6
+#define NVMM_X64_GPR_R11		7
+#define NVMM_X64_GPR_R12		8
+#define NVMM_X64_GPR_R13		9
+#define NVMM_X64_GPR_R14		10
+#define NVMM_X64_GPR_R15		11
+#define NVMM_X64_GPR_RDI		12
+#define NVMM_X64_GPR_RSI		13
+#define NVMM_X64_GPR_RBP		14
+#define NVMM_X64_GPR_RSP		15
+#define NVMM_X64_GPR_RIP		16
+#define NVMM_X64_GPR_RFLAGS		17
+#define NVMM_X64_NGPR			18
+
+/* Control Registers. */
+#define NVMM_X64_CR_CR0			0
+#define NVMM_X64_CR_CR2			1
+#define NVMM_X64_CR_CR3			2
+#define NVMM_X64_CR_CR4			3
+#define NVMM_X64_CR_CR8			4
+#define NVMM_X64_CR_XCR0		5
+#define NVMM_X64_NCR			6
+
+/* Debug Registers. */
+#define NVMM_X64_DR_DR0			0
+#define NVMM_X64_DR_DR1			1
+#define NVMM_X64_DR_DR2			2
+#define NVMM_X64_DR_DR3			3
+#define NVMM_X64_DR_DR6			4
+#define NVMM_X64_DR_DR7			5
+#define NVMM_X64_NDR			6
+
+/* MSRs. */
+#define NVMM_X64_MSR_EFER		0
+#define NVMM_X64_MSR_STAR		1
+#define NVMM_X64_MSR_LSTAR		2
+#define NVMM_X64_MSR_CSTAR		3
+#define NVMM_X64_MSR_SFMASK		4
+#define NVMM_X64_MSR_KERNELGSBASE	5
+#define NVMM_X64_MSR_SYSENTER_CS	6
+#define NVMM_X64_MSR_SYSENTER_ESP	7
+#define NVMM_X64_MSR_SYSENTER_EIP	8
+#define NVMM_X64_MSR_PAT		9
+#define NVMM_X64_NMSR			10
+
+/* Misc. */
+#define NVMM_X64_MISC_CPL		0
+#define NVMM_X64_NMISC			1
+
+#ifndef ASM_NVMM
+
+#include <sys/types.h>
+#include <x86/cpu_extended_state.h>
+
+/*
+ * One segment register: the selector plus the parts marked "hidden"
+ * (attributes, limit, base). The attribute bit-fields add up to 64 bits:
+ * 12 bits of attributes followed by 52 reserved bits.
+ */
+struct nvmm_x64_state_seg {
+	uint64_t selector;
+	struct {		/* hidden */
+		uint64_t type:5;
+		uint64_t dpl:2;
+		uint64_t p:1;
+		uint64_t avl:1;
+		uint64_t lng:1;
+		uint64_t def32:1;
+		uint64_t gran:1;
+		uint64_t rsvd:52;
+	} attrib;
+	uint64_t limit;		/* hidden */
+	uint64_t base;		/* hidden */
+};
+
+/* VM exit state indexes (presumably into nvmm_exit.exitstate[]). */
+#define NVMM_X64_EXITSTATE_CR8	0
+
+/*
+ * Flags. One bit per member of struct nvmm_x64_state, in declaration
+ * order; presumably they select which members a set/get state operation
+ * touches -- TODO confirm. NVMM_X64_STATE_ALL is the OR of all of them.
+ */
+#define NVMM_X64_STATE_SEGS	0x01
+#define NVMM_X64_STATE_GPRS	0x02
+#define NVMM_X64_STATE_CRS	0x04
+#define NVMM_X64_STATE_DRS	0x08
+#define NVMM_X64_STATE_MSRS	0x10
+#define NVMM_X64_STATE_MISC	0x20
+#define NVMM_X64_STATE_FPU	0x40
+#define NVMM_X64_STATE_ALL	\
+	(NVMM_X64_STATE_SEGS | NVMM_X64_STATE_GPRS | NVMM_X64_STATE_CRS | \
+	 NVMM_X64_STATE_DRS | NVMM_X64_STATE_MSRS | NVMM_X64_STATE_MISC | \
+	 NVMM_X64_STATE_FPU)
+
+/*
+ * Full x86-64 VCPU state. Each array is indexed by the matching
+ * NVMM_X64_* constants and sized by the matching NVMM_X64_N* constant.
+ */
+struct nvmm_x64_state {
+	struct nvmm_x64_state_seg segs[NVMM_X64_NSEG];
+	uint64_t gprs[NVMM_X64_NGPR];
+	uint64_t crs[NVMM_X64_NCR];
+	uint64_t drs[NVMM_X64_NDR];
+	uint64_t msrs[NVMM_X64_NMSR];
+	uint64_t misc[NVMM_X64_NMISC];
+	struct fxsave fpu;
+};
+
+/* Machine configuration ops. NVMM_X86_NCONF is the number of ops. */
+#define NVMM_X86_CONF_CPUID	0
+#define NVMM_X86_NCONF		1
+
+/*
+ * Argument of NVMM_X86_CONF_CPUID. Presumably, for CPUID leaf 'leaf', the
+ * bits in 'set' are forced on and the bits in 'del' are forced off in the
+ * values the guest sees -- TODO confirm against the backend.
+ */
+struct nvmm_x86_conf_cpuid {
+	uint32_t leaf;
+	struct {
+		uint32_t eax;
+		uint32_t ebx;
+		uint32_t ecx;
+		uint32_t edx;
+	} set;
+	struct {
+		uint32_t eax;
+		uint32_t ebx;
+		uint32_t ecx;
+		uint32_t edx;
+	} del;
+};
+
+#endif /* ASM_NVMM */
+
+#endif /* _NVMM_X86_H_ */
Index: src/sys/dev/nvmm/x86/nvmm_x86_svm.c
diff -u /dev/null src/sys/dev/nvmm/x86/nvmm_x86_svm.c:1.1
--- /dev/null	Wed Nov  7 07:43:08 2018
+++ src/sys/dev/nvmm/x86/nvmm_x86_svm.c	Wed Nov  7 07:43:08 2018
@@ -0,0 +1,2088 @@
+/*	$NetBSD: nvmm_x86_svm.c,v 1.1 2018/11/07 07:43:08 maxv Exp $	*/
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: nvmm_x86_svm.c,v 1.1 2018/11/07 07:43:08 maxv Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/kmem.h>
+#include <sys/cpu.h>
+#include <sys/xcall.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_page.h>
+
+#include <x86/cputypes.h>
+#include <x86/cpu_msr.h>
+#include <x86/specialreg.h>
+#include <x86/pmap.h>
+#include <x86/dbregs.h>
+#include <machine/cpuvar.h>
+
+#include <dev/nvmm/nvmm.h>
+#include <dev/nvmm/nvmm_internal.h>
+#include <dev/nvmm/x86/nvmm_x86.h>
+
+/*
+ * Defined outside this file (the commit adds nvmm_x86_svmfunc.S).
+ * Presumably takes the physical address of the VMCB and a pointer to the
+ * guest GPR array -- TODO confirm.
+ */
+int svm_vmrun(paddr_t, uint64_t *);
+
+/* MSR number; going by the name, the physical address of a host save area. */
+#define	MSR_VM_HSAVE_PA	0xC0010117
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * VMCB exit codes, presumably the values found in vmcb_ctrl.exitcode after
+ * a #VMEXIT. The numbering should be checked against the AMD64
+ * Architecture Programmer's Manual.
+ */
+
+/* Control register reads (0x00-0x0F) and writes (0x10-0x1F). */
+#define VMCB_EXITCODE_CR0_READ		0x0000
+#define VMCB_EXITCODE_CR1_READ		0x0001
+#define VMCB_EXITCODE_CR2_READ		0x0002
+#define VMCB_EXITCODE_CR3_READ		0x0003
+#define VMCB_EXITCODE_CR4_READ		0x0004
+#define VMCB_EXITCODE_CR5_READ		0x0005
+#define VMCB_EXITCODE_CR6_READ		0x0006
+#define VMCB_EXITCODE_CR7_READ		0x0007
+#define VMCB_EXITCODE_CR8_READ		0x0008
+#define VMCB_EXITCODE_CR9_READ		0x0009
+#define VMCB_EXITCODE_CR10_READ		0x000A
+#define VMCB_EXITCODE_CR11_READ		0x000B
+#define VMCB_EXITCODE_CR12_READ		0x000C
+#define VMCB_EXITCODE_CR13_READ		0x000D
+#define VMCB_EXITCODE_CR14_READ		0x000E
+#define VMCB_EXITCODE_CR15_READ		0x000F
+#define VMCB_EXITCODE_CR0_WRITE		0x0010
+#define VMCB_EXITCODE_CR1_WRITE		0x0011
+#define VMCB_EXITCODE_CR2_WRITE		0x0012
+#define VMCB_EXITCODE_CR3_WRITE		0x0013
+#define VMCB_EXITCODE_CR4_WRITE		0x0014
+#define VMCB_EXITCODE_CR5_WRITE		0x0015
+#define VMCB_EXITCODE_CR6_WRITE		0x0016
+#define VMCB_EXITCODE_CR7_WRITE		0x0017
+#define VMCB_EXITCODE_CR8_WRITE		0x0018
+#define VMCB_EXITCODE_CR9_WRITE		0x0019
+#define VMCB_EXITCODE_CR10_WRITE	0x001A
+#define VMCB_EXITCODE_CR11_WRITE	0x001B
+#define VMCB_EXITCODE_CR12_WRITE	0x001C
+#define VMCB_EXITCODE_CR13_WRITE	0x001D
+#define VMCB_EXITCODE_CR14_WRITE	0x001E
+#define VMCB_EXITCODE_CR15_WRITE	0x001F
+/* Debug register reads (0x20-0x2F) and writes (0x30-0x3F). */
+#define VMCB_EXITCODE_DR0_READ		0x0020
+#define VMCB_EXITCODE_DR1_READ		0x0021
+#define VMCB_EXITCODE_DR2_READ		0x0022
+#define VMCB_EXITCODE_DR3_READ		0x0023
+#define VMCB_EXITCODE_DR4_READ		0x0024
+#define VMCB_EXITCODE_DR5_READ		0x0025
+#define VMCB_EXITCODE_DR6_READ		0x0026
+#define VMCB_EXITCODE_DR7_READ		0x0027
+#define VMCB_EXITCODE_DR8_READ		0x0028
+#define VMCB_EXITCODE_DR9_READ		0x0029
+#define VMCB_EXITCODE_DR10_READ		0x002A
+#define VMCB_EXITCODE_DR11_READ		0x002B
+#define VMCB_EXITCODE_DR12_READ		0x002C
+#define VMCB_EXITCODE_DR13_READ		0x002D
+#define VMCB_EXITCODE_DR14_READ		0x002E
+#define VMCB_EXITCODE_DR15_READ		0x002F
+#define VMCB_EXITCODE_DR0_WRITE		0x0030
+#define VMCB_EXITCODE_DR1_WRITE		0x0031
+#define VMCB_EXITCODE_DR2_WRITE		0x0032
+#define VMCB_EXITCODE_DR3_WRITE		0x0033
+#define VMCB_EXITCODE_DR4_WRITE		0x0034
+#define VMCB_EXITCODE_DR5_WRITE		0x0035
+#define VMCB_EXITCODE_DR6_WRITE		0x0036
+#define VMCB_EXITCODE_DR7_WRITE		0x0037
+#define VMCB_EXITCODE_DR8_WRITE		0x0038
+#define VMCB_EXITCODE_DR9_WRITE		0x0039
+#define VMCB_EXITCODE_DR10_WRITE	0x003A
+#define VMCB_EXITCODE_DR11_WRITE	0x003B
+#define VMCB_EXITCODE_DR12_WRITE	0x003C
+#define VMCB_EXITCODE_DR13_WRITE	0x003D
+#define VMCB_EXITCODE_DR14_WRITE	0x003E
+#define VMCB_EXITCODE_DR15_WRITE	0x003F
+/* Exception vectors 0-31: the code is 0x40 + vector. */
+#define VMCB_EXITCODE_EXCP0		0x0040
+#define VMCB_EXITCODE_EXCP1		0x0041
+#define VMCB_EXITCODE_EXCP2		0x0042
+#define VMCB_EXITCODE_EXCP3		0x0043
+#define VMCB_EXITCODE_EXCP4		0x0044
+#define VMCB_EXITCODE_EXCP5		0x0045
+#define VMCB_EXITCODE_EXCP6		0x0046
+#define VMCB_EXITCODE_EXCP7		0x0047
+#define VMCB_EXITCODE_EXCP8		0x0048
+#define VMCB_EXITCODE_EXCP9		0x0049
+#define VMCB_EXITCODE_EXCP10		0x004A
+#define VMCB_EXITCODE_EXCP11		0x004B
+#define VMCB_EXITCODE_EXCP12		0x004C
+#define VMCB_EXITCODE_EXCP13		0x004D
+#define VMCB_EXITCODE_EXCP14		0x004E
+#define VMCB_EXITCODE_EXCP15		0x004F
+#define VMCB_EXITCODE_EXCP16		0x0050
+#define VMCB_EXITCODE_EXCP17		0x0051
+#define VMCB_EXITCODE_EXCP18		0x0052
+#define VMCB_EXITCODE_EXCP19		0x0053
+#define VMCB_EXITCODE_EXCP20		0x0054
+#define VMCB_EXITCODE_EXCP21		0x0055
+#define VMCB_EXITCODE_EXCP22		0x0056
+#define VMCB_EXITCODE_EXCP23		0x0057
+#define VMCB_EXITCODE_EXCP24		0x0058
+#define VMCB_EXITCODE_EXCP25		0x0059
+#define VMCB_EXITCODE_EXCP26		0x005A
+#define VMCB_EXITCODE_EXCP27		0x005B
+#define VMCB_EXITCODE_EXCP28		0x005C
+#define VMCB_EXITCODE_EXCP29		0x005D
+#define VMCB_EXITCODE_EXCP30		0x005E
+#define VMCB_EXITCODE_EXCP31		0x005F
+/* Interrupts, descriptor-table accesses and individual instructions. */
+#define VMCB_EXITCODE_INTR		0x0060
+#define VMCB_EXITCODE_NMI		0x0061
+#define VMCB_EXITCODE_SMI		0x0062
+#define VMCB_EXITCODE_INIT		0x0063
+#define VMCB_EXITCODE_VINTR		0x0064
+#define VMCB_EXITCODE_CR0_SEL_WRITE	0x0065
+#define VMCB_EXITCODE_IDTR_READ		0x0066
+#define VMCB_EXITCODE_GDTR_READ		0x0067
+#define VMCB_EXITCODE_LDTR_READ		0x0068
+#define VMCB_EXITCODE_TR_READ		0x0069
+#define VMCB_EXITCODE_IDTR_WRITE	0x006A
+#define VMCB_EXITCODE_GDTR_WRITE	0x006B
+#define VMCB_EXITCODE_LDTR_WRITE	0x006C
+#define VMCB_EXITCODE_TR_WRITE		0x006D
+#define VMCB_EXITCODE_RDTSC		0x006E
+#define VMCB_EXITCODE_RDPMC		0x006F
+#define VMCB_EXITCODE_PUSHF		0x0070
+#define VMCB_EXITCODE_POPF		0x0071
+#define VMCB_EXITCODE_CPUID		0x0072
+#define VMCB_EXITCODE_RSM		0x0073
+#define VMCB_EXITCODE_IRET		0x0074
+#define VMCB_EXITCODE_SWINT		0x0075
+#define VMCB_EXITCODE_INVD		0x0076
+#define VMCB_EXITCODE_PAUSE		0x0077
+#define VMCB_EXITCODE_HLT		0x0078
+#define VMCB_EXITCODE_INVLPG		0x0079
+#define VMCB_EXITCODE_INVLPGA		0x007A
+#define VMCB_EXITCODE_IOIO		0x007B
+#define VMCB_EXITCODE_MSR		0x007C
+#define VMCB_EXITCODE_TASK_SWITCH	0x007D
+#define VMCB_EXITCODE_FERR_FREEZE	0x007E
+#define VMCB_EXITCODE_SHUTDOWN		0x007F
+#define VMCB_EXITCODE_VMRUN		0x0080
+#define VMCB_EXITCODE_VMMCALL		0x0081
+#define VMCB_EXITCODE_VMLOAD		0x0082
+#define VMCB_EXITCODE_VMSAVE		0x0083
+#define VMCB_EXITCODE_STGI		0x0084
+#define VMCB_EXITCODE_CLGI		0x0085
+#define VMCB_EXITCODE_SKINIT		0x0086
+#define VMCB_EXITCODE_RDTSCP		0x0087
+#define VMCB_EXITCODE_ICEBP		0x0088
+#define VMCB_EXITCODE_WBINVD		0x0089
+#define VMCB_EXITCODE_MONITOR		0x008A
+#define VMCB_EXITCODE_MWAIT		0x008B
+#define VMCB_EXITCODE_MWAIT_CONDITIONAL	0x008C
+#define VMCB_EXITCODE_XSETBV		0x008D
+/* Write traps. Note that 0x008E has no name in this table. */
+#define VMCB_EXITCODE_EFER_WRITE_TRAP	0x008F
+#define VMCB_EXITCODE_CR0_WRITE_TRAP	0x0090
+#define VMCB_EXITCODE_CR1_WRITE_TRAP	0x0091
+#define VMCB_EXITCODE_CR2_WRITE_TRAP	0x0092
+#define VMCB_EXITCODE_CR3_WRITE_TRAP	0x0093
+#define VMCB_EXITCODE_CR4_WRITE_TRAP	0x0094
+#define VMCB_EXITCODE_CR5_WRITE_TRAP	0x0095
+#define VMCB_EXITCODE_CR6_WRITE_TRAP	0x0096
+#define VMCB_EXITCODE_CR7_WRITE_TRAP	0x0097
+#define VMCB_EXITCODE_CR8_WRITE_TRAP	0x0098
+#define VMCB_EXITCODE_CR9_WRITE_TRAP	0x0099
+#define VMCB_EXITCODE_CR10_WRITE_TRAP	0x009A
+#define VMCB_EXITCODE_CR11_WRITE_TRAP	0x009B
+#define VMCB_EXITCODE_CR12_WRITE_TRAP	0x009C
+#define VMCB_EXITCODE_CR13_WRITE_TRAP	0x009D
+#define VMCB_EXITCODE_CR14_WRITE_TRAP	0x009E
+#define VMCB_EXITCODE_CR15_WRITE_TRAP	0x009F
+/* Exit codes from 0x0400 up. */
+#define VMCB_EXITCODE_NPF		0x0400
+#define VMCB_EXITCODE_AVIC_INCOMP_IPI	0x0401
+#define VMCB_EXITCODE_AVIC_NOACCEL	0x0402
+#define VMCB_EXITCODE_VMGEXIT		0x0403
+/*
+ * Parenthesized so that the minus sign cannot combine with neighbouring
+ * tokens at the point of use. Compared against the 64-bit exitcode field
+ * it converts to all ones.
+ */
+#define VMCB_EXITCODE_INVALID		(-1)
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * VMCB control area. The structure is packed and asserted below to be
+ * exactly 1024 bytes. Each group of #defines gives the bit masks of the
+ * field it follows.
+ */
+struct vmcb_ctrl {
+	uint32_t intercept_cr;
+#define VMCB_CTRL_INTERCEPT_RCR(x)	__BIT( 0 + (x))
+#define VMCB_CTRL_INTERCEPT_WCR(x)	__BIT(16 + (x))
+
+	uint32_t intercept_dr;
+#define VMCB_CTRL_INTERCEPT_RDR(x)	__BIT( 0 + (x))
+#define VMCB_CTRL_INTERCEPT_WDR(x)	__BIT(16 + (x))
+
+	uint32_t intercept_vec;
+#define VMCB_CTRL_INTERCEPT_VEC(x)	__BIT(x)
+
+	uint32_t intercept_misc1;
+#define VMCB_CTRL_INTERCEPT_INTR	__BIT(0)
+#define VMCB_CTRL_INTERCEPT_NMI		__BIT(1)
+#define VMCB_CTRL_INTERCEPT_SMI		__BIT(2)
+#define VMCB_CTRL_INTERCEPT_INIT	__BIT(3)
+#define VMCB_CTRL_INTERCEPT_VINTR	__BIT(4)
+#define VMCB_CTRL_INTERCEPT_CR0_SPEC	__BIT(5)
+#define VMCB_CTRL_INTERCEPT_RIDTR	__BIT(6)
+#define VMCB_CTRL_INTERCEPT_RGDTR	__BIT(7)
+#define VMCB_CTRL_INTERCEPT_RLDTR	__BIT(8)
+#define VMCB_CTRL_INTERCEPT_RTR		__BIT(9)
+#define VMCB_CTRL_INTERCEPT_WIDTR	__BIT(10)
+#define VMCB_CTRL_INTERCEPT_WGDTR	__BIT(11)
+#define VMCB_CTRL_INTERCEPT_WLDTR	__BIT(12)
+#define VMCB_CTRL_INTERCEPT_WTR		__BIT(13)
+#define VMCB_CTRL_INTERCEPT_RDTSC	__BIT(14)
+#define VMCB_CTRL_INTERCEPT_RDPMC	__BIT(15)
+#define VMCB_CTRL_INTERCEPT_PUSHF	__BIT(16)
+#define VMCB_CTRL_INTERCEPT_POPF	__BIT(17)
+#define VMCB_CTRL_INTERCEPT_CPUID	__BIT(18)
+#define VMCB_CTRL_INTERCEPT_RSM		__BIT(19)
+#define VMCB_CTRL_INTERCEPT_IRET	__BIT(20)
+#define VMCB_CTRL_INTERCEPT_INTN	__BIT(21)
+#define VMCB_CTRL_INTERCEPT_INVD	__BIT(22)
+#define VMCB_CTRL_INTERCEPT_PAUSE	__BIT(23)
+#define VMCB_CTRL_INTERCEPT_HLT		__BIT(24)
+#define VMCB_CTRL_INTERCEPT_INVLPG	__BIT(25)
+#define VMCB_CTRL_INTERCEPT_INVLPGA	__BIT(26)
+#define VMCB_CTRL_INTERCEPT_IOIO_PROT	__BIT(27)
+#define VMCB_CTRL_INTERCEPT_MSR_PROT	__BIT(28)
+#define VMCB_CTRL_INTERCEPT_TASKSW	__BIT(29)
+#define VMCB_CTRL_INTERCEPT_FERR_FREEZE	__BIT(30)
+#define VMCB_CTRL_INTERCEPT_SHUTDOWN	__BIT(31)
+
+	uint32_t intercept_misc2;
+#define VMCB_CTRL_INTERCEPT_VMRUN	__BIT(0)
+#define VMCB_CTRL_INTERCEPT_VMMCALL	__BIT(1)
+#define VMCB_CTRL_INTERCEPT_VMLOAD	__BIT(2)
+#define VMCB_CTRL_INTERCEPT_VMSAVE	__BIT(3)
+#define VMCB_CTRL_INTERCEPT_STGI	__BIT(4)
+#define VMCB_CTRL_INTERCEPT_CLGI	__BIT(5)
+#define VMCB_CTRL_INTERCEPT_SKINIT	__BIT(6)
+#define VMCB_CTRL_INTERCEPT_RDTSCP	__BIT(7)
+#define VMCB_CTRL_INTERCEPT_ICEBP	__BIT(8)
+#define VMCB_CTRL_INTERCEPT_WBINVD	__BIT(9)
+#define VMCB_CTRL_INTERCEPT_MONITOR	__BIT(10)
+/*
+ * Bit 11 intercepts MWAIT unconditionally; bit 12 intercepts it only when
+ * the monitor hardware is armed.
+ */
+#define VMCB_CTRL_INTERCEPT_MWAIT	__BIT(11)
+#define VMCB_CTRL_INTERCEPT_MWAIT_ARMED	__BIT(12)
+#define VMCB_CTRL_INTERCEPT_XSETBV	__BIT(13)
+#define VMCB_CTRL_INTERCEPT_EFER_SPEC	__BIT(15)
+#define VMCB_CTRL_INTERCEPT_WCR_SPEC(x)	__BIT(16 + (x))
+
+	uint8_t  rsvd1[40];
+	uint16_t pause_filt_thresh;
+	uint16_t pause_filt_cnt;
+	uint64_t iopm_base_pa;
+	uint64_t msrpm_base_pa;
+	uint64_t tsc_offset;
+	uint32_t guest_asid;
+
+	uint32_t tlb_ctrl;
+#define VMCB_CTRL_TLB_CTRL_FLUSH_ALL			0x01
+#define VMCB_CTRL_TLB_CTRL_FLUSH_GUEST			0x03
+#define VMCB_CTRL_TLB_CTRL_FLUSH_GUEST_NONGLOBAL	0x07
+
+	uint64_t v;
+#define VMCB_CTRL_V_TPR			__BITS(7,0)
+#define VMCB_CTRL_V_IRQ			__BIT(8)
+#define VMCB_CTRL_V_VGIF		__BIT(9)
+#define VMCB_CTRL_V_INTR_PRIO		__BITS(19,16)
+#define VMCB_CTRL_V_IGN_TPR		__BIT(20)
+#define VMCB_CTRL_V_INTR_MASKING	__BIT(24)
+#define VMCB_CTRL_V_GUEST_VGIF		__BIT(25)
+#define VMCB_CTRL_V_AVIC_EN		__BIT(31)
+#define VMCB_CTRL_V_INTR_VECTOR		__BITS(39,32)
+
+	uint64_t intr;
+#define VMCB_CTRL_INTR_SHADOW		__BIT(0)
+#define VMCB_CTRL_GUEST_INTR_MASK	__BIT(1)
+
+	uint64_t exitcode;
+	uint64_t exitinfo1;
+	uint64_t exitinfo2;
+
+	uint64_t exitintinfo;
+#define VMCB_CTRL_EXITINTINFO_VECTOR	__BITS(7,0)
+#define VMCB_CTRL_EXITINTINFO_TYPE	__BITS(10,8)
+#define VMCB_CTRL_EXITINTINFO_EV	__BIT(11)
+#define VMCB_CTRL_EXITINTINFO_V		__BIT(31)
+#define VMCB_CTRL_EXITINTINFO_ERRORCODE	__BITS(63,32)
+
+	uint64_t enable1;
+#define VMCB_CTRL_ENABLE_NP		__BIT(0)
+#define VMCB_CTRL_ENABLE_SEV		__BIT(1)
+#define VMCB_CTRL_ENABLE_ES_SEV		__BIT(2)
+
+	uint64_t avic;
+#define VMCB_CTRL_AVIC_APIC_BAR		__BITS(51,0)
+
+	uint64_t ghcb;
+
+	uint64_t eventinj;
+#define VMCB_CTRL_EVENTINJ_VECTOR	__BITS(7,0)
+#define VMCB_CTRL_EVENTINJ_TYPE		__BITS(10,8)
+#define VMCB_CTRL_EVENTINJ_EV		__BIT(11)
+#define VMCB_CTRL_EVENTINJ_V		__BIT(31)
+#define VMCB_CTRL_EVENTINJ_ERRORCODE	__BITS(63,32)
+
+	uint64_t n_cr3;
+
+	uint64_t enable2;
+#define VMCB_CTRL_ENABLE_LBR		__BIT(0)
+#define VMCB_CTRL_ENABLE_VVMSAVE	__BIT(1)
+
+	uint32_t vmcb_clean;
+#define VMCB_CTRL_VMCB_CLEAN_I		__BIT(0)
+#define VMCB_CTRL_VMCB_CLEAN_IOPM	__BIT(1)
+#define VMCB_CTRL_VMCB_CLEAN_ASID	__BIT(2)
+#define VMCB_CTRL_VMCB_CLEAN_TPR	__BIT(3)
+#define VMCB_CTRL_VMCB_CLEAN_NP		__BIT(4)
+#define VMCB_CTRL_VMCB_CLEAN_CR		__BIT(5)
+#define VMCB_CTRL_VMCB_CLEAN_DR		__BIT(6)
+#define VMCB_CTRL_VMCB_CLEAN_DT		__BIT(7)
+#define VMCB_CTRL_VMCB_CLEAN_SEG	__BIT(8)
+#define VMCB_CTRL_VMCB_CLEAN_CR2	__BIT(9)
+#define VMCB_CTRL_VMCB_CLEAN_LBR	__BIT(10)
+#define VMCB_CTRL_VMCB_CLEAN_AVIC	__BIT(11)
+
+	uint32_t rsvd2;
+	uint64_t nrip;
+	uint8_t	inst_len;
+	uint8_t	inst_bytes[15];
+	uint8_t	pad[800];	/* pads the area out to 1024 bytes */
+} __packed;
+
+CTASSERT(sizeof(struct vmcb_ctrl) == 1024);
+
+/*
+ * A segment register as laid out in the VMCB state area: 16 bytes, packed
+ * (asserted below).
+ */
+struct vmcb_segment {
+	uint16_t selector;
+	uint16_t attrib;	/* hidden */
+	uint32_t limit;		/* hidden */
+	uint64_t base;		/* hidden */
+} __packed;
+
+CTASSERT(sizeof(struct vmcb_segment) == 16);
+
+/*
+ * VMCB state save area. The structure is packed and asserted below to be
+ * exactly 0xC00 bytes; the rsvdN[] arrays are gaps that keep the named
+ * fields at their fixed offsets, so no field may be moved or resized.
+ */
+struct vmcb_state {
+	struct   vmcb_segment es;
+	struct   vmcb_segment cs;
+	struct   vmcb_segment ss;
+	struct   vmcb_segment ds;
+	struct   vmcb_segment fs;
+	struct   vmcb_segment gs;
+	struct   vmcb_segment gdt;
+	struct   vmcb_segment ldt;
+	struct   vmcb_segment idt;
+	struct   vmcb_segment tr;
+	uint8_t	 rsvd1[43];
+	uint8_t	 cpl;
+	uint8_t  rsvd2[4];
+	uint64_t efer;
+	uint8_t	 rsvd3[112];
+	uint64_t cr4;
+	uint64_t cr3;
+	uint64_t cr0;
+	uint64_t dr7;
+	uint64_t dr6;
+	uint64_t rflags;
+	uint64_t rip;
+	uint8_t	 rsvd4[88];
+	uint64_t rsp;
+	uint8_t	 rsvd5[24];
+	uint64_t rax;
+	uint64_t star;
+	uint64_t lstar;
+	uint64_t cstar;
+	uint64_t sfmask;
+	uint64_t kernelgsbase;
+	uint64_t sysenter_cs;
+	uint64_t sysenter_esp;
+	uint64_t sysenter_eip;
+	uint64_t cr2;
+	uint8_t	 rsvd6[32];
+	uint64_t g_pat;
+	uint64_t dbgctl;
+	uint64_t br_from;
+	uint64_t br_to;
+	uint64_t int_from;
+	uint64_t int_to;
+	uint8_t	 pad[2408];	/* pads the area out to 0xC00 bytes */
+} __packed;
+
+CTASSERT(sizeof(struct vmcb_state) == 0xC00);
+
+/*
+ * The complete VMCB: control area followed by state save area. The
+ * assertions below pin it to exactly one page, with the state area at
+ * offset 0x400.
+ */
+struct vmcb {
+	struct vmcb_ctrl ctrl;
+	struct vmcb_state state;
+} __packed;
+
+CTASSERT(sizeof(struct vmcb) == PAGE_SIZE);
+CTASSERT(offsetof(struct vmcb, state) == 0x400);
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * One entry per possible host CPU (MAXCPUS). Going by the names, 'pa' is
+ * the physical address of that CPU's host save area -- TODO confirm.
+ */
+struct svm_hsave {
+	paddr_t pa;
+};
+
+static struct svm_hsave hsave[MAXCPUS];
+
+/*
+ * ASID bookkeeping. Presumably svm_asidmap is a bitmap of ASIDs in use,
+ * bounded by svm_maxasid and protected by svm_asidlock -- TODO confirm.
+ */
+static uint8_t *svm_asidmap __read_mostly;
+static uint32_t svm_maxasid __read_mostly;
+static kmutex_t svm_asidlock __cacheline_aligned;
+
+/* Presumably CPU features detected when the backend initializes. */
+static bool svm_decode_assist __read_mostly;
+static uint32_t svm_ctrl_tlb_flush __read_mostly;
+
+/* Default XCR0 mask: x87 and SSE state only. */
+#define SVM_XCR0_MASK_DEFAULT	(XCR0_X87|XCR0_SSE)
+static uint64_t svm_xcr0_mask __read_mostly;
+
+/* Number of cpuid configuration slots in struct svm_machdata. */
+#define SVM_NCPUIDS	32
+
+/* The VMCB is one page (see the CTASSERT on struct vmcb). */
+#define VMCB_NPAGES	1
+
+/* Size of the MSR bitmap, in pages and in bytes. */
+#define MSRBM_NPAGES	2
+#define MSRBM_SIZE	(MSRBM_NPAGES * PAGE_SIZE)
+
+/* Size of the I/O bitmap, in pages and in bytes. */
+#define IOBM_NPAGES	3
+#define IOBM_SIZE	(IOBM_NPAGES * PAGE_SIZE)
+
+/* Does not include EFER_LMSLE. */
+#define EFER_VALID \
+	(EFER_SCE|EFER_LME|EFER_LMA|EFER_NXE|EFER_SVME|EFER_FFXSR|EFER_TCE)
+
+/*
+ * Going by the names, the register bits whose modification requires a
+ * guest TLB flush -- TODO confirm against the users of these masks.
+ */
+#define EFER_TLB_FLUSH \
+	(EFER_NXE|EFER_LMA|EFER_LME)
+#define CR0_TLB_FLUSH \
+	(CR0_PG|CR0_WP|CR0_CD|CR0_NW)
+#define CR4_TLB_FLUSH \
+	(CR4_PGE|CR4_PAE|CR4_PSE)
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Per-machine data of the SVM backend: SVM_NCPUIDS cpuid configuration
+ * slots. Presumably cpuidpresent[i] tells whether cpuid[i] is in use.
+ */
+struct svm_machdata {
+	bool cpuidpresent[SVM_NCPUIDS];
+	struct nvmm_x86_conf_cpuid cpuid[SVM_NCPUIDS];
+};
+
+/* Size of the argument of each machine configuration op. */
+static const size_t svm_conf_sizes[NVMM_X86_NCONF] = {
+	[NVMM_X86_CONF_CPUID] = sizeof(struct nvmm_x86_conf_cpuid)
+};
+
+/*
+ * Per-VCPU data of the SVM backend; svm_vcpu_inject() obtains it from
+ * nvmm_cpu.cpudata. Each xxx/xxx_pa pair is presumably the kernel virtual
+ * and physical address of the same object -- TODO confirm.
+ */
+struct svm_cpudata {
+	/* x64-specific */
+	struct nvmm_x64_state state;
+
+	/* General */
+	bool shared_asid;
+	bool tlb_want_flush;
+
+	/* VMCB */
+	struct vmcb *vmcb;
+	paddr_t vmcb_pa;
+
+	/* I/O bitmap */
+	uint8_t *iobm;
+	paddr_t iobm_pa;
+
+	/* MSR bitmap */
+	uint8_t *msrbm;
+	paddr_t msrbm_pa;
+
+	/* Host state */
+	uint64_t xcr0;
+	uint64_t star;
+	uint64_t lstar;
+	uint64_t cstar;
+	uint64_t sfmask;
+	uint64_t cr2;
+	bool ts_set;
+	struct xsave_header hfpu __aligned(16);
+
+	/* Guest state */
+	bool in_nmi;		/* set by svm_vcpu_inject() when it injects an NMI */
+	uint64_t tsc_offset;
+	struct xsave_header gfpu __aligned(16);
+};
+
+/*
+ * Event types, as placed in the VMCB_CTRL_EVENTINJ_TYPE field of
+ * vmcb_ctrl.eventinj by svm_vcpu_inject().
+ */
+#define SVM_EVENT_TYPE_HW_INT	0
+#define SVM_EVENT_TYPE_NMI	2
+#define SVM_EVENT_TYPE_EXC	3
+#define SVM_EVENT_TYPE_SW_INT	4
+
+/*
+ * Called by svm_vcpu_inject() when an event cannot be injected right away.
+ * For an NMI, turn on the IRET intercept. Otherwise turn on the VINTR
+ * intercept and raise V_IRQ, with a virtual interrupt vector of 0.
+ */
+static void
+svm_event_waitexit_enable(struct vmcb *vmcb, bool nmi)
+{
+	if (nmi) {
+		vmcb->ctrl.intercept_misc1 |= VMCB_CTRL_INTERCEPT_IRET;
+		return;
+	}
+
+	vmcb->ctrl.intercept_misc1 |= VMCB_CTRL_INTERCEPT_VINTR;
+	vmcb->ctrl.v |= VMCB_CTRL_V_IRQ;
+	vmcb->ctrl.v |= __SHIFTIN(0, VMCB_CTRL_V_INTR_VECTOR);
+}
+
+/*
+ * Undo svm_event_waitexit_enable(): turn off the IRET intercept for an
+ * NMI, or the VINTR intercept and V_IRQ otherwise.
+ */
+static void
+svm_event_waitexit_disable(struct vmcb *vmcb, bool nmi)
+{
+	uint64_t vbits;
+
+	if (nmi) {
+		vmcb->ctrl.intercept_misc1 &= ~VMCB_CTRL_INTERCEPT_IRET;
+		return;
+	}
+
+	vbits = VMCB_CTRL_V_IRQ | __SHIFTIN(0, VMCB_CTRL_V_INTR_VECTOR);
+	vmcb->ctrl.intercept_misc1 &= ~VMCB_CTRL_INTERCEPT_VINTR;
+	vmcb->ctrl.v &= ~vbits;
+}
+
+/*
+ * Return 1 if the given exception vector is one of those listed below as
+ * carrying an error code, 0 otherwise.
+ */
+static inline int
+svm_event_has_error(uint64_t vector)
+{
+	if (vector == 8)			/* #DF */
+		return 1;
+	if (vector >= 10 && vector <= 14)	/* #TS, #NP, #SS, #GP, #PF */
+		return 1;
+	if (vector == 17 || vector == 30)	/* #AC, #SX */
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Program the VMCB event-injection field from 'event'.
+ *
+ * Returns 0 once the event is written to the VMCB; EINVAL for a vector
+ * >= 256, an unknown event type, or an exception whose vector is 2 or
+ * >= 32; EAGAIN when the event cannot be injected now, in which case the
+ * matching intercept is armed via svm_event_waitexit_enable().
+ */
+static int
+svm_vcpu_inject(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_event *event)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct vmcb *vmcb = cpudata->vmcb;
+	const uint64_t vector = event->vector;
+	uint64_t inj, tpr;
+	int type, err;
+
+	if (vector >= 256)
+		return EINVAL;
+
+	switch (event->type) {
+	case NVMM_EVENT_INTERRUPT_HW:
+		err = 0;
+		if (vector == 2) {
+			/* Vector 2 is the NMI; only one may be in flight. */
+			type = SVM_EVENT_TYPE_NMI;
+			if (cpudata->in_nmi) {
+				svm_event_waitexit_enable(vmcb, true);
+				return EAGAIN;
+			}
+			cpudata->in_nmi = true;
+			break;
+		}
+
+		/*
+		 * Plain interrupt: the guest must have interrupts enabled
+		 * and the priority must be above the current TPR.
+		 */
+		type = SVM_EVENT_TYPE_HW_INT;
+		tpr = __SHIFTOUT(vmcb->ctrl.v, VMCB_CTRL_V_TPR);
+		if (!(vmcb->state.rflags & PSL_I) || event->u.prio <= tpr) {
+			svm_event_waitexit_enable(vmcb, false);
+			return EAGAIN;
+		}
+		break;
+
+	case NVMM_EVENT_INTERRUPT_SW:
+		type = SVM_EVENT_TYPE_SW_INT;
+		err = 0;
+		break;
+
+	case NVMM_EVENT_EXCEPTION:
+		if (vector == 2 || vector >= 32)
+			return EINVAL;
+		type = SVM_EVENT_TYPE_EXC;
+		err = svm_event_has_error(vector);
+		break;
+
+	default:
+		return EINVAL;
+	}
+
+	inj = __SHIFTIN(vector, VMCB_CTRL_EVENTINJ_VECTOR);
+	inj |= __SHIFTIN(type, VMCB_CTRL_EVENTINJ_TYPE);
+	inj |= __SHIFTIN(err, VMCB_CTRL_EVENTINJ_EV);
+	inj |= __SHIFTIN(1, VMCB_CTRL_EVENTINJ_V);
+	inj |= __SHIFTIN(event->u.error, VMCB_CTRL_EVENTINJ_ERRORCODE);
+	vmcb->ctrl.eventinj = inj;
+
+	return 0;
+}
+
+static void
+svm_inject_ud(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+	struct nvmm_event event;
+	int ret __diagused;
+
+	event.type = NVMM_EVENT_EXCEPTION;
+	event.vector = 6;
+	event.u.error = 0;
+
+	ret = svm_vcpu_inject(mach, vcpu, &event);
+	KASSERT(ret == 0);
+}
+
+static void
+svm_inject_db(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+	struct nvmm_event event;
+	int ret __diagused;
+
+	event.type = NVMM_EVENT_EXCEPTION;
+	event.vector = 1;
+	event.u.error = 0;
+
+	ret = svm_vcpu_inject(mach, vcpu, &event);
+	KASSERT(ret == 0);
+}
+
+static void
+svm_inject_gp(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+	struct nvmm_event event;
+	int ret __diagused;
+
+	event.type = NVMM_EVENT_EXCEPTION;
+	event.vector = 13;
+	event.u.error = 0;
+
+	ret = svm_vcpu_inject(mach, vcpu, &event);
+	KASSERT(ret == 0);
+}
+
+/*
+ * Patch the CPUID results for the leaves that the kernel controls itself.
+ * Called after the host values and the per-machine overrides have been
+ * applied; any leaf other than 0x01 and 0x0D is left untouched.
+ */
+static void
+svm_inkernel_handle_cpuid(struct nvmm_cpu *vcpu, uint64_t eax, uint64_t ecx)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct nvmm_x64_state *state = &cpudata->state;
+	size_t fpusize;
+
+	if (eax == 0x00000001) {
+		/* APIC number in RBX. The rest is tunable. */
+		state->gprs[NVMM_X64_GPR_RBX] &= ~CPUID_LOCAL_APIC_ID;
+		state->gprs[NVMM_X64_GPR_RBX] |= __SHIFTIN(vcpu->cpuid,
+		    CPUID_LOCAL_APIC_ID);
+		return;
+	}
+
+	if (eax != 0x0000000D) {
+		return;
+	}
+
+	/* FPU description. Not tunable. Only sub-leaf 0 is handled. */
+	if (ecx != 0 || svm_xcr0_mask == 0) {
+		return;
+	}
+
+	/* Area size depends on whether the guest enabled SSE in XCR0. */
+	if (state->crs[NVMM_X64_CR_XCR0] & XCR0_SSE) {
+		fpusize = sizeof(struct fxsave);
+	} else {
+		fpusize = sizeof(struct save87);
+	}
+
+	/* EDX:EAX carry the supported XCR0 mask. */
+	cpudata->vmcb->state.rax = svm_xcr0_mask & 0xFFFFFFFF;
+	state->gprs[NVMM_X64_GPR_RDX] = svm_xcr0_mask >> 32;
+
+	state->gprs[NVMM_X64_GPR_RBX] = fpusize + 64; /* XSAVE header */
+	state->gprs[NVMM_X64_GPR_RCX] = sizeof(struct fxsave);
+}
+
+/*
+ * Handle a CPUID intercept entirely in the kernel.
+ *
+ * The leaf/sub-leaf requested by the guest (RAX from the VMCB, RCX from
+ * the saved GPRs) is passed to x86_cpuid2(), the four result registers
+ * are stored in the guest state, the per-machine del/set masks of the
+ * first matching configured leaf are applied, and finally the
+ * non-tunable leaves are overwritten. The guest RIP is advanced and the
+ * exit is reported as NVMM_EXIT_NONE, so the run loop re-enters the guest.
+ */
+static void
+svm_exit_cpuid(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_exit *exit)
+{
+	struct svm_machdata *machdata = mach->machdata;
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct nvmm_x64_state *state = &cpudata->state;
+	struct nvmm_x86_conf_cpuid *cpuid;
+	uint64_t eax, ecx;
+	u_int descs[4];
+	size_t i;
+
+	/* x86_cpuid2() fills descs[]; presumably the host's CPUID output. */
+	eax = cpudata->vmcb->state.rax;
+	ecx = state->gprs[NVMM_X64_GPR_RCX];
+	x86_cpuid2(eax, ecx, descs);
+
+	/* Guest RAX is kept in the VMCB, the other registers in gprs[]. */
+	cpudata->vmcb->state.rax = descs[0];
+	state->gprs[NVMM_X64_GPR_RBX] = descs[1];
+	state->gprs[NVMM_X64_GPR_RCX] = descs[2];
+	state->gprs[NVMM_X64_GPR_RDX] = descs[3];
+
+	/* Apply the first configured override matching this leaf, if any. */
+	for (i = 0; i < SVM_NCPUIDS; i++) {
+		cpuid = &machdata->cpuid[i];
+		if (!machdata->cpuidpresent[i]) {
+			continue;
+		}
+		if (cpuid->leaf != eax) {
+			continue;
+		}
+
+		/* del */
+		cpudata->vmcb->state.rax &= ~cpuid->del.eax;
+		state->gprs[NVMM_X64_GPR_RBX] &= ~cpuid->del.ebx;
+		state->gprs[NVMM_X64_GPR_RCX] &= ~cpuid->del.ecx;
+		state->gprs[NVMM_X64_GPR_RDX] &= ~cpuid->del.edx;
+
+		/* set */
+		cpudata->vmcb->state.rax |= cpuid->set.eax;
+		state->gprs[NVMM_X64_GPR_RBX] |= cpuid->set.ebx;
+		state->gprs[NVMM_X64_GPR_RCX] |= cpuid->set.ecx;
+		state->gprs[NVMM_X64_GPR_RDX] |= cpuid->set.edx;
+
+		break;
+	}
+
+	/* Overwrite non-tunable leaves. */
+	svm_inkernel_handle_cpuid(vcpu, eax, ecx);
+
+	/* For now we omit DBREGS. */
+	/* Guest is single-stepping (RFLAGS.TF): deliver the #DB ourselves. */
+	if (__predict_false(cpudata->vmcb->state.rflags & PSL_T)) {
+		svm_inject_db(mach, vcpu);
+	}
+
+	/* Step over the emulated instruction. */
+	cpudata->vmcb->state.rip = cpudata->vmcb->ctrl.nrip;
+	exit->reason = NVMM_EXIT_NONE;
+}
+
+/*
+ * Bit fields of the EXITINFO1 word reported on an IOIO intercept, as
+ * decoded by svm_exit_io().
+ */
+#define SVM_EXIT_IO_PORT	__BITS(31,16)
+#define SVM_EXIT_IO_SEG		__BITS(12,10)
+#define SVM_EXIT_IO_A64		__BIT(9)
+#define SVM_EXIT_IO_A32		__BIT(8)
+#define SVM_EXIT_IO_A16		__BIT(7)
+#define SVM_EXIT_IO_SZ32	__BIT(6)
+#define SVM_EXIT_IO_SZ16	__BIT(5)
+#define SVM_EXIT_IO_SZ8		__BIT(4)
+#define SVM_EXIT_IO_REP		__BIT(3)
+#define SVM_EXIT_IO_STR		__BIT(2)
+#define SVM_EXIT_IO_TYPE	__BIT(0)
+
+/*
+ * Maps the segment number found in the SVM_EXIT_IO_SEG field to the
+ * corresponding NVMM segment index. Only values 0 to 5 are valid.
+ */
+static const int seg_to_nvmm[] = {
+	[0] = NVMM_X64_SEG_ES,
+	[1] = NVMM_X64_SEG_CS,
+	[2] = NVMM_X64_SEG_SS,
+	[3] = NVMM_X64_SEG_DS,
+	[4] = NVMM_X64_SEG_FS,
+	[5] = NVMM_X64_SEG_GS
+};
+
+/*
+ * Translate an IOIO intercept into an NVMM_EXIT_IO exit for userland.
+ * EXITINFO1 describes the access, EXITINFO2 is reported as the next PC.
+ * The address and operand sizes are only written when the matching bit
+ * is present in EXITINFO1.
+ */
+static void
+svm_exit_io(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_exit *exit)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	const uint64_t info = cpudata->vmcb->ctrl.exitinfo1;
+	const uint64_t nextpc = cpudata->vmcb->ctrl.exitinfo2;
+	unsigned int addrsz = 0, opsz = 0;
+
+	exit->reason = NVMM_EXIT_IO;
+	exit->u.io.npc = nextpc;
+	exit->u.io.port = __SHIFTOUT(info, SVM_EXIT_IO_PORT);
+	exit->u.io.type = (info & SVM_EXIT_IO_TYPE) ?
+	    NVMM_EXIT_IO_IN : NVMM_EXIT_IO_OUT;
+
+	/* Without decode assists, fall back to ES for IN and DS for OUT. */
+	if (!svm_decode_assist) {
+		exit->u.io.seg = (exit->u.io.type == NVMM_EXIT_IO_IN) ?
+		    NVMM_X64_SEG_ES : NVMM_X64_SEG_DS;
+	} else {
+		KASSERT(__SHIFTOUT(info, SVM_EXIT_IO_SEG) < 6);
+		exit->u.io.seg = seg_to_nvmm[__SHIFTOUT(info, SVM_EXIT_IO_SEG)];
+	}
+
+	/* Widest address size wins. */
+	if (info & SVM_EXIT_IO_A64) {
+		addrsz = 8;
+	} else if (info & SVM_EXIT_IO_A32) {
+		addrsz = 4;
+	} else if (info & SVM_EXIT_IO_A16) {
+		addrsz = 2;
+	}
+	if (addrsz != 0) {
+		exit->u.io.address_size = addrsz;
+	}
+
+	/* Widest operand size wins. */
+	if (info & SVM_EXIT_IO_SZ32) {
+		opsz = 4;
+	} else if (info & SVM_EXIT_IO_SZ16) {
+		opsz = 2;
+	} else if (info & SVM_EXIT_IO_SZ8) {
+		opsz = 1;
+	}
+	if (opsz != 0) {
+		exit->u.io.operand_size = opsz;
+	}
+
+	exit->u.io.str = (info & SVM_EXIT_IO_STR) != 0;
+	exit->u.io.rep = (info & SVM_EXIT_IO_REP) != 0;
+}
+
+/*
+ * Try to emulate an intercepted RDMSR/WRMSR in the kernel.
+ *
+ * Handled here: reads and writes of PAT (backed by the VMCB g_pat field)
+ * and writes of EFER. Returns true when the access was fully handled and
+ * the guest can be resumed, false when it must be forwarded to userland.
+ *
+ * On success the guest RIP is advanced past the instruction. When the
+ * access is refused, a #GP is injected and RIP is left untouched so that
+ * the fault is reported on the offending instruction.
+ */
+static bool
+svm_inkernel_handle_msr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_exit *exit)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct nvmm_x64_state *state = &cpudata->state;
+	struct vmcb *vmcb = cpudata->vmcb;
+	uint64_t pat;
+
+	switch (exit->u.msr.type) {
+	case NVMM_EXIT_MSR_RDMSR:
+		if (exit->u.msr.msr == MSR_CR_PAT) {
+			pat = vmcb->state.g_pat;
+			/*
+			 * The guest RAX lives in the VMCB; the copy in the
+			 * software GPR array is not loaded on guest entry.
+			 */
+			vmcb->state.rax = (pat & 0xFFFFFFFF);
+			state->gprs[NVMM_X64_GPR_RDX] = (pat >> 32);
+			goto handled;
+		}
+		break;
+	case NVMM_EXIT_MSR_WRMSR:
+		if (exit->u.msr.msr == MSR_EFER) {
+			/* Reserved bits set: the write faults. */
+			if (__predict_false(exit->u.msr.val & ~EFER_VALID)) {
+				goto error;
+			}
+			/* Some EFER bits change the translation regime. */
+			if ((vmcb->state.efer ^ exit->u.msr.val) &
+			     EFER_TLB_FLUSH) {
+				cpudata->tlb_want_flush = true;
+			}
+			/* EFER_SVME must always stay set in the VMCB. */
+			vmcb->state.efer = exit->u.msr.val | EFER_SVME;
+			goto handled;
+		}
+		if (exit->u.msr.msr == MSR_CR_PAT) {
+			vmcb->state.g_pat = exit->u.msr.val;
+			goto handled;
+		}
+		break;
+	}
+
+	return false;
+
+handled:
+	/* Step over the emulated instruction. */
+	vmcb->state.rip = vmcb->ctrl.nrip;
+	return true;
+
+error:
+	/* #GP is a fault: RIP must keep pointing at the instruction. */
+	svm_inject_gp(mach, vcpu);
+	return true;
+}
+
+/*
+ * Decode an MSR intercept. EXITINFO1 is 0 for RDMSR; any other value is
+ * reported as WRMSR, and the written value (EDX:EAX) is only gathered
+ * when EXITINFO1 is exactly 1. Accesses the kernel can emulate are
+ * consumed here (NVMM_EXIT_NONE); the rest go to userland as
+ * NVMM_EXIT_MSR.
+ */
+static void
+svm_exit_msr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_exit *exit)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct nvmm_x64_state *state = &cpudata->state;
+	const uint64_t info = cpudata->vmcb->ctrl.exitinfo1;
+
+	exit->u.msr.type = (info != 0) ?
+	    NVMM_EXIT_MSR_WRMSR : NVMM_EXIT_MSR_RDMSR;
+	exit->u.msr.msr = state->gprs[NVMM_X64_GPR_RCX];
+	exit->u.msr.val = 0;
+
+	if (info == 1) {
+		/* High half from RDX, low half from the VMCB copy of RAX. */
+		uint64_t hi, lo;
+		hi = state->gprs[NVMM_X64_GPR_RDX];
+		lo = cpudata->vmcb->state.rax;
+		exit->u.msr.val = (hi << 32) | (lo & 0xFFFFFFFF);
+	}
+
+	if (!svm_inkernel_handle_msr(mach, vcpu, exit)) {
+		/* Not ours: let userland emulate the access. */
+		exit->reason = NVMM_EXIT_MSR;
+		exit->u.msr.npc = cpudata->vmcb->ctrl.nrip;
+		return;
+	}
+
+	exit->reason = NVMM_EXIT_NONE;
+}
+
+/*
+ * Handle a nested page fault. The faulting guest-physical address
+ * (EXITINFO2) is first handed to uvm_fault() on the machine's map; if
+ * that succeeds the guest is simply resumed. Otherwise the fault is
+ * reported to userland as NVMM_EXIT_MEMORY together with the access
+ * type and the instruction bytes recorded in the VMCB.
+ */
+static void
+svm_exit_npf(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_exit *exit)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct vmcb *vmcb = cpudata->vmcb;
+	gpaddr_t gpa = vmcb->ctrl.exitinfo2;
+
+	if (uvm_fault(&mach->vm->vm_map, gpa, VM_PROT_ALL) == 0) {
+		exit->reason = NVMM_EXIT_NONE;
+		return;
+	}
+
+	exit->reason = NVMM_EXIT_MEMORY;
+	exit->u.mem.gpa = gpa;
+	exit->u.mem.npc = vmcb->ctrl.nrip;
+
+	/* Write takes precedence over exec, exec over read. */
+	if (vmcb->ctrl.exitinfo1 & PGEX_W) {
+		exit->u.mem.perm = NVMM_EXIT_MEMORY_WRITE;
+	} else if (vmcb->ctrl.exitinfo1 & PGEX_X) {
+		exit->u.mem.perm = NVMM_EXIT_MEMORY_EXEC;
+	} else {
+		exit->u.mem.perm = NVMM_EXIT_MEMORY_READ;
+	}
+
+	exit->u.mem.inst_len = vmcb->ctrl.inst_len;
+	memcpy(exit->u.mem.inst_bytes, vmcb->ctrl.inst_bytes,
+	    sizeof(exit->u.mem.inst_bytes));
+}
+
+/*
+ * Emulate an intercepted XSETBV.
+ *
+ * The new value is EDX:EAX and the target register index is ECX. The
+ * write is accepted only for XCR0, from CPL 0, with no bit outside
+ * svm_xcr0_mask and with the mandatory X87 bit set; otherwise a #GP is
+ * injected and RIP is left on the instruction. On success the value is
+ * recorded as the guest XCR0 and RIP is advanced past the instruction.
+ * The exit is always consumed in the kernel (NVMM_EXIT_NONE).
+ */
+static void
+svm_exit_xsetbv(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_exit *exit)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct nvmm_x64_state *state = &cpudata->state;
+	struct vmcb *vmcb = cpudata->vmcb;
+	uint64_t val;
+
+	exit->reason = NVMM_EXIT_NONE;
+
+	/*
+	 * The guest RAX lives in the VMCB; the copy in the software GPR
+	 * array is only refreshed by svm_vcpu_getstate().
+	 */
+	val = (state->gprs[NVMM_X64_GPR_RDX] << 32) |
+	    (vmcb->state.rax & 0xFFFFFFFF);
+
+	if (__predict_false(state->gprs[NVMM_X64_GPR_RCX] != 0)) {
+		goto error;
+	} else if (__predict_false(vmcb->state.cpl != 0)) {
+		goto error;
+	} else if (__predict_false((val & ~svm_xcr0_mask) != 0)) {
+		goto error;
+	} else if (__predict_false((val & XCR0_X87) == 0)) {
+		goto error;
+	}
+
+	state->crs[NVMM_X64_CR_XCR0] = val;
+
+	/* Step over the emulated instruction, or the guest would loop. */
+	vmcb->state.rip = vmcb->ctrl.nrip;
+
+	return;
+
+error:
+	svm_inject_gp(mach, vcpu);
+}
+
+/*
+ * Set the VMCB clean bits to their default value: the I, IOPM, ASID, LBR
+ * and AVIC groups are marked clean, every other group is left dirty.
+ */
+static void
+svm_vmcb_cache_default(struct vmcb *vmcb)
+{
+	vmcb->ctrl.vmcb_clean =
+	    VMCB_CTRL_VMCB_CLEAN_AVIC |
+	    VMCB_CTRL_VMCB_CLEAN_LBR |
+	    VMCB_CTRL_VMCB_CLEAN_ASID |
+	    VMCB_CTRL_VMCB_CLEAN_IOPM |
+	    VMCB_CTRL_VMCB_CLEAN_I;
+}
+
+/* Clear every VMCB clean bit, i.e. mark all VMCB groups as dirty. */
+static void
+svm_vmcb_cache_flush(struct vmcb *vmcb)
+{
+	vmcb->ctrl.vmcb_clean = 0;
+}
+
+/*
+ * Switch the FPU context from host to guest before entering the guest.
+ * When XSAVE features are present, the host XCR0 is saved and the guest
+ * XCR0 loaded. The host CR0.TS state is remembered so that it can be
+ * re-established on leave.
+ */
+static void
+svm_vcpu_guest_fpu_enter(struct nvmm_cpu *vcpu)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+
+	if (x86_xsave_features != 0) {
+		cpudata->xcr0 = rdxcr(0);
+		wrxcr(0, cpudata->state.crs[NVMM_X64_CR_XCR0]);
+	}
+
+	/* Remember whether the host had CR0.TS set. */
+	cpudata->ts_set = (rcr0() & CR0_TS) != 0;
+
+	/* Host FPU state out, guest FPU state in. */
+	fpu_area_save(&cpudata->hfpu);
+	fpu_area_restore(&cpudata->gfpu);
+}
+
+/*
+ * Switch the FPU context back from guest to host after leaving the
+ * guest. Mirrors svm_vcpu_guest_fpu_enter() in reverse order.
+ */
+static void
+svm_vcpu_guest_fpu_leave(struct nvmm_cpu *vcpu)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+
+	/* Guest FPU state out, host FPU state in. */
+	fpu_area_save(&cpudata->gfpu);
+	fpu_area_restore(&cpudata->hfpu);
+
+	/* Put CR0.TS back if the host had it set on entry. */
+	if (cpudata->ts_set) {
+		stts();
+	}
+
+	if (x86_xsave_features != 0) {
+		/* Record the XCR0 currently loaded, then restore the host's. */
+		cpudata->state.crs[NVMM_X64_CR_XCR0] = rdxcr(0);
+		wrxcr(0, cpudata->xcr0);
+	}
+}
+
+/*
+ * Save the debug registers of the current LWP and load the guest's
+ * DR0-DR3. DR6 and DR7 are not touched here; they are kept in the VMCB
+ * (see svm_vcpu_setstate()).
+ */
+static void
+svm_vcpu_guest_dbregs_enter(struct nvmm_cpu *vcpu)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct nvmm_x64_state *state = &cpudata->state;
+
+	x86_dbregs_save(curlwp);
+
+	ldr0(state->drs[NVMM_X64_DR_DR0]);
+	ldr1(state->drs[NVMM_X64_DR_DR1]);
+	ldr2(state->drs[NVMM_X64_DR_DR2]);
+	ldr3(state->drs[NVMM_X64_DR_DR3]);
+}
+
+/*
+ * Record the guest's DR0-DR3 in the cached state, then restore the debug
+ * registers of the current LWP.
+ */
+static void
+svm_vcpu_guest_dbregs_leave(struct nvmm_cpu *vcpu)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct nvmm_x64_state *state = &cpudata->state;
+
+	state->drs[NVMM_X64_DR_DR0] = rdr0();
+	state->drs[NVMM_X64_DR_DR1] = rdr1();
+	state->drs[NVMM_X64_DR_DR2] = rdr2();
+	state->drs[NVMM_X64_DR_DR3] = rdr3();
+
+	x86_dbregs_restore(curlwp);
+}
+
+/*
+ * Save the host STAR, LSTAR, CSTAR and SFMASK MSRs and the host CR2
+ * into cpudata, so svm_vcpu_guest_misc_leave() can restore them.
+ */
+static void
+svm_vcpu_guest_misc_enter(struct nvmm_cpu *vcpu)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+
+	/* Save the fixed Host MSRs. */
+	cpudata->star = rdmsr(MSR_STAR);
+	cpudata->lstar = rdmsr(MSR_LSTAR);
+	cpudata->cstar = rdmsr(MSR_CSTAR);
+	cpudata->sfmask = rdmsr(MSR_SFMASK);
+
+	/* Save the Host CR2. */
+	cpudata->cr2 = rcr2();
+}
+
+/*
+ * Restore the host STAR, LSTAR, CSTAR and SFMASK MSRs and the host CR2
+ * from the values saved by svm_vcpu_guest_misc_enter().
+ */
+static void
+svm_vcpu_guest_misc_leave(struct nvmm_cpu *vcpu)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+
+	/* Restore the fixed Host MSRs. */
+	wrmsr(MSR_STAR, cpudata->star);
+	wrmsr(MSR_LSTAR, cpudata->lstar);
+	wrmsr(MSR_CSTAR, cpudata->cstar);
+	wrmsr(MSR_SFMASK, cpudata->sfmask);
+
+	/* Restore the Host CR2. */
+	lcr2(cpudata->cr2);
+}
+
+/*
+ * Run the VCPU until an exit must be reported to userland.
+ *
+ * With preemption disabled, the guest is entered in a loop; each #VMEXIT
+ * is dispatched on its exit code. Handlers that fully consume the exit
+ * set exit->reason to NVMM_EXIT_NONE, in which case the loop re-enters
+ * the guest. The loop stops as soon as a reason other than
+ * NVMM_EXIT_NONE is set or the scheduler asks this CPU to yield. The
+ * current virtual TPR is returned in exit->exitstate. Always returns 0.
+ */
+static int
+svm_vcpu_run(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_exit *exit)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct vmcb *vmcb = cpudata->vmcb;
+	bool tlb_need_flush = false;
+	int hcpu, s;
+
+	kpreempt_disable();
+	hcpu = cpu_number();
+
+	/* Migrated to another host CPU, or ASID shared with other VCPUs. */
+	if (vcpu->hcpu_last != hcpu || cpudata->shared_asid) {
+		tlb_need_flush = true;
+	}
+
+	if (cpudata->tlb_want_flush || tlb_need_flush) {
+		vmcb->ctrl.tlb_ctrl = svm_ctrl_tlb_flush;
+	} else {
+		vmcb->ctrl.tlb_ctrl = 0;
+	}
+
+	/* New host CPU: recompute the TSC offset, drop the VMCB cache. */
+	if (vcpu->hcpu_last != hcpu) {
+		vmcb->ctrl.tsc_offset = cpudata->tsc_offset +
+		    curcpu()->ci_data.cpu_cc_skew;
+		svm_vmcb_cache_flush(vmcb);
+	}
+
+	svm_vcpu_guest_dbregs_enter(vcpu);
+	svm_vcpu_guest_misc_enter(vcpu);
+
+	while (1) {
+		/* The FPU switch and the guest entry run at splhigh. */
+		s = splhigh();
+		svm_vcpu_guest_fpu_enter(vcpu);
+		svm_vmrun(cpudata->vmcb_pa, cpudata->state.gprs);
+		svm_vcpu_guest_fpu_leave(vcpu);
+		splx(s);
+
+		svm_vmcb_cache_default(vmcb);
+
+		/* The guest actually ran: the requested flush took place. */
+		if (vmcb->ctrl.exitcode != VMCB_EXITCODE_INVALID) {
+			if (cpudata->tlb_want_flush) {
+				cpudata->tlb_want_flush = false;
+			}
+			vcpu->hcpu_last = hcpu;
+		}
+
+		switch (vmcb->ctrl.exitcode) {
+		case VMCB_EXITCODE_INTR:
+		case VMCB_EXITCODE_NMI:
+			exit->reason = NVMM_EXIT_NONE;
+			break;
+		case VMCB_EXITCODE_VINTR:
+			/* The guest can now take an interrupt. */
+			svm_event_waitexit_disable(vmcb, false);
+			exit->reason = NVMM_EXIT_INT_READY;
+			break;
+		case VMCB_EXITCODE_IRET:
+			/* The guest left its NMI handler. */
+			svm_event_waitexit_disable(vmcb, true);
+			cpudata->in_nmi = false;
+			exit->reason = NVMM_EXIT_NMI_READY;
+			break;
+		case VMCB_EXITCODE_CPUID:
+			svm_exit_cpuid(mach, vcpu, exit);
+			break;
+		case VMCB_EXITCODE_HLT:
+			exit->reason = NVMM_EXIT_HLT;
+			break;
+		case VMCB_EXITCODE_IOIO:
+			svm_exit_io(mach, vcpu, exit);
+			break;
+		case VMCB_EXITCODE_MSR:
+			svm_exit_msr(mach, vcpu, exit);
+			break;
+		case VMCB_EXITCODE_SHUTDOWN:
+			exit->reason = NVMM_EXIT_SHUTDOWN;
+			break;
+		/* Instructions not offered to the guest: answer with #UD. */
+		case VMCB_EXITCODE_RDPMC:
+		case VMCB_EXITCODE_RSM:
+		case VMCB_EXITCODE_INVLPGA:
+		case VMCB_EXITCODE_VMRUN:
+		case VMCB_EXITCODE_VMMCALL:
+		case VMCB_EXITCODE_VMLOAD:
+		case VMCB_EXITCODE_VMSAVE:
+		case VMCB_EXITCODE_STGI:
+		case VMCB_EXITCODE_CLGI:
+		case VMCB_EXITCODE_SKINIT:
+		case VMCB_EXITCODE_RDTSCP:
+			svm_inject_ud(mach, vcpu);
+			exit->reason = NVMM_EXIT_NONE;
+			break;
+		case VMCB_EXITCODE_MONITOR:
+			exit->reason = NVMM_EXIT_MONITOR;
+			break;
+		case VMCB_EXITCODE_MWAIT:
+			exit->reason = NVMM_EXIT_MWAIT;
+			break;
+		case VMCB_EXITCODE_MWAIT_CONDITIONAL:
+			exit->reason = NVMM_EXIT_MWAIT_COND;
+			break;
+		case VMCB_EXITCODE_XSETBV:
+			svm_exit_xsetbv(mach, vcpu, exit);
+			break;
+		case VMCB_EXITCODE_NPF:
+			svm_exit_npf(mach, vcpu, exit);
+			break;
+		case VMCB_EXITCODE_FERR_FREEZE: /* ? */
+		default:
+			exit->reason = NVMM_EXIT_INVALID;
+			break;
+		}
+
+		/* If no reason to return to userland, keep rolling. */
+		if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) {
+			break;
+		}
+		if (exit->reason != NVMM_EXIT_NONE) {
+			break;
+		}
+	}
+
+	svm_vcpu_guest_misc_leave(vcpu);
+	svm_vcpu_guest_dbregs_leave(vcpu);
+
+	kpreempt_enable();
+
+	/* Report the virtual TPR as the CR8 exit state. */
+	exit->exitstate[NVMM_X64_EXITSTATE_CR8] = __SHIFTOUT(vmcb->ctrl.v,
+	    VMCB_CTRL_V_TPR);
+
+	return 0;
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Allocate npages of zeroed memory as a single physical segment and map
+ * it, wired, into the kernel map.
+ *
+ * On success returns 0 and stores the physical address of the first page
+ * in *pa and the kernel virtual address of the mapping in *va. On failure
+ * returns ENOMEM, releases whatever was allocated, and leaves *pa and *va
+ * untouched.
+ */
+static int
+svm_memalloc(paddr_t *pa, vaddr_t *va, size_t npages)
+{
+	struct pglist pglist;
+	paddr_t _pa;
+	vaddr_t _va;
+	size_t i;
+	int ret;
+
+	/* nsegs = 1: the pages form one contiguous physical range. */
+	ret = uvm_pglistalloc(npages * PAGE_SIZE, 0, ~0UL, PAGE_SIZE, 0,
+	    &pglist, 1, 0);
+	if (ret != 0)
+		return ENOMEM;
+	_pa = TAILQ_FIRST(&pglist)->phys_addr;
+	_va = uvm_km_alloc(kernel_map, npages * PAGE_SIZE, 0,
+	    UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
+	if (_va == 0)
+		goto error;
+
+	for (i = 0; i < npages; i++) {
+		pmap_kenter_pa(_va + i * PAGE_SIZE, _pa + i * PAGE_SIZE,
+		    VM_PROT_READ | VM_PROT_WRITE, PMAP_WRITE_BACK);
+	}
+	/* The new mappings must be made visible before they are used. */
+	pmap_update(pmap_kernel());
+
+	memset((void *)_va, 0, npages * PAGE_SIZE);
+
+	*pa = _pa;
+	*va = _va;
+	return 0;
+
+error:
+	for (i = 0; i < npages; i++) {
+		uvm_pagefree(PHYS_TO_VM_PAGE(_pa + i * PAGE_SIZE));
+	}
+	return ENOMEM;
+}
+
+/*
+ * Release memory obtained from svm_memalloc(): remove the kernel
+ * mappings, free the virtual range, then free each physical page.
+ * pa, va and npages must be the values used at allocation time.
+ */
+static void
+svm_memfree(paddr_t pa, vaddr_t va, size_t npages)
+{
+	size_t i;
+
+	pmap_kremove(va, npages * PAGE_SIZE);
+	pmap_update(pmap_kernel());
+	uvm_km_free(kernel_map, va, npages * PAGE_SIZE, UVM_KMF_VAONLY);
+	for (i = 0; i < npages; i++) {
+		uvm_pagefree(PHYS_TO_VM_PAGE(pa + i * PAGE_SIZE));
+	}
+}
+
+/* -------------------------------------------------------------------------- */
+
+#define SVM_MSRBM_READ	__BIT(0)
+#define SVM_MSRBM_WRITE	__BIT(1)
+
+/*
+ * Clear the read and/or write intercept bit of one MSR in the MSR
+ * permission bitmap. Each MSR owns two consecutive bits (read, write),
+ * four MSRs per byte, and the bitmap covers three MSR ranges placed at
+ * byte offsets 0x0000, 0x0800 and 0x1000. Panics if the MSR belongs to
+ * none of these ranges.
+ */
+static void
+svm_vcpu_msr_allow(uint8_t *bitmap, uint64_t msr, bool read, bool write)
+{
+	uint64_t first, slot, byte;
+	uint8_t bitoff;
+
+	if (msr < 0x00002000) {
+		/* Range 1 */
+		first = 0x00000000;
+		slot = 0x0000;
+	} else if (msr >= 0xC0000000 && msr < 0xC0002000) {
+		/* Range 2 */
+		first = 0xC0000000;
+		slot = 0x0800;
+	} else if (msr >= 0xC0010000 && msr < 0xC0012000) {
+		/* Range 3 */
+		first = 0xC0010000;
+		slot = 0x1000;
+	} else {
+		panic("%s: wrong range", __func__);
+	}
+
+	/* Four MSRs per byte, two bits per MSR. */
+	byte = ((msr - first) >> 2UL) + slot;
+	bitoff = (msr & 0x3) << 1;
+
+	if (read) {
+		bitmap[byte] &= ~(SVM_MSRBM_READ << bitoff);
+	}
+	if (write) {
+		bitmap[byte] &= ~(SVM_MSRBM_WRITE << bitoff);
+	}
+}
+
+/*
+ * Pick an ASID for the VCPU and store it in the VMCB. The first free
+ * entry of the global ASID bitmap is claimed; when none is left the VCPU
+ * falls back to the last ASID and is flagged as sharing it, which makes
+ * svm_vcpu_run() flush the TLB on every entry.
+ */
+static void
+svm_asid_alloc(struct nvmm_cpu *vcpu)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct vmcb *vmcb = cpudata->vmcb;
+	size_t asid;
+
+	mutex_enter(&svm_asidlock);
+
+	/* Look for the first ASID whose bit is clear. */
+	for (asid = 0; asid < svm_maxasid; asid++) {
+		if ((svm_asidmap[asid / 8] & __BIT(asid % 8)) == 0) {
+			break;
+		}
+	}
+
+	if (asid < svm_maxasid) {
+		svm_asidmap[asid / 8] |= __BIT(asid % 8);
+		vmcb->ctrl.guest_asid = asid;
+	} else {
+		/*
+		 * No free ASID. Use the last one, which is shared and
+		 * requires special TLB handling.
+		 */
+		cpudata->shared_asid = true;
+		vmcb->ctrl.guest_asid = svm_maxasid - 1;
+	}
+
+	mutex_exit(&svm_asidlock);
+}
+
+/*
+ * Give the VCPU's ASID back to the global bitmap. A VCPU running on the
+ * shared ASID owns no bitmap entry, so nothing is released for it.
+ */
+static void
+svm_asid_free(struct nvmm_cpu *vcpu)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct vmcb *vmcb = cpudata->vmcb;
+	size_t asid;
+
+	if (!cpudata->shared_asid) {
+		asid = vmcb->ctrl.guest_asid;
+
+		mutex_enter(&svm_asidlock);
+		svm_asidmap[asid / 8] &= ~__BIT(asid % 8);
+		mutex_exit(&svm_asidlock);
+	}
+}
+
+/*
+ * Initialize a freshly allocated VCPU: program the VMCB intercepts, the
+ * I/O and MSR permission bitmaps, the ASID, nested paging, the mandatory
+ * EFER bit, the guest XSAVE header and the TSC offset. The VMCB and both
+ * bitmaps must already be allocated (see svm_vcpu_create()).
+ */
+static void
+svm_vcpu_init(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct vmcb *vmcb = cpudata->vmcb;
+
+	/* Allow reads/writes of Control Registers. */
+	vmcb->ctrl.intercept_cr = 0;
+
+	/* Allow reads/writes of Debug Registers. */
+	vmcb->ctrl.intercept_dr = 0;
+
+	/* Allow exceptions 0 to 31. */
+	vmcb->ctrl.intercept_vec = 0;
+
+	/*
+	 * Allow:
+	 *  - SMI [smm interrupts]
+	 *  - VINTR [virtual interrupts]
+	 *  - CR0_SPEC [CR0 writes changing other fields than CR0.TS or CR0.MP]
+	 *  - RIDTR [reads of IDTR]
+	 *  - RGDTR [reads of GDTR]
+	 *  - RLDTR [reads of LDTR]
+	 *  - RTR [reads of TR]
+	 *  - WIDTR [writes of IDTR]
+	 *  - WGDTR [writes of GDTR]
+	 *  - WLDTR [writes of LDTR]
+	 *  - WTR [writes of TR]
+	 *  - RDTSC [rdtsc instruction]
+	 *  - PUSHF [pushf instruction]
+	 *  - POPF [popf instruction]
+	 *  - IRET [iret instruction]
+	 *  - INTN [int $n instructions]
+	 *  - INVD [invd instruction]
+	 *  - PAUSE [pause instruction]
+	 *  - INVLPG [invlpg instruction]
+	 *  - TASKSW [task switches]
+	 *
+	 * Intercept the rest below.
+	 *
+	 * The VINTR and IRET intercepts are turned on and off on demand by
+	 * svm_event_waitexit_enable() and svm_event_waitexit_disable().
+	 */
+	vmcb->ctrl.intercept_misc1 =
+	    VMCB_CTRL_INTERCEPT_INTR |
+	    VMCB_CTRL_INTERCEPT_NMI |
+	    VMCB_CTRL_INTERCEPT_INIT |
+	    VMCB_CTRL_INTERCEPT_RDPMC |
+	    VMCB_CTRL_INTERCEPT_CPUID |
+	    VMCB_CTRL_INTERCEPT_RSM |
+	    VMCB_CTRL_INTERCEPT_HLT |
+	    VMCB_CTRL_INTERCEPT_INVLPGA |
+	    VMCB_CTRL_INTERCEPT_IOIO_PROT |
+	    VMCB_CTRL_INTERCEPT_MSR_PROT |
+	    VMCB_CTRL_INTERCEPT_FERR_FREEZE |
+	    VMCB_CTRL_INTERCEPT_SHUTDOWN;
+
+	/*
+	 * Allow:
+	 *  - ICEBP [icebp instruction]
+	 *  - WBINVD [wbinvd instruction]
+	 *  - WCR_SPEC(0..15) [writes of CR0-15, received after instruction]
+	 *
+	 * Intercept the rest below.
+	 */
+	vmcb->ctrl.intercept_misc2 =
+	    VMCB_CTRL_INTERCEPT_VMRUN |
+	    VMCB_CTRL_INTERCEPT_VMMCALL |
+	    VMCB_CTRL_INTERCEPT_VMLOAD |
+	    VMCB_CTRL_INTERCEPT_VMSAVE |
+	    VMCB_CTRL_INTERCEPT_STGI |
+	    VMCB_CTRL_INTERCEPT_CLGI |
+	    VMCB_CTRL_INTERCEPT_SKINIT |
+	    VMCB_CTRL_INTERCEPT_RDTSCP |
+	    VMCB_CTRL_INTERCEPT_MONITOR |
+	    VMCB_CTRL_INTERCEPT_MWAIT |
+	    VMCB_CTRL_INTERCEPT_XSETBV;
+
+	/* Intercept all I/O accesses. */
+	memset(cpudata->iobm, 0xFF, IOBM_SIZE);
+	vmcb->ctrl.iopm_base_pa = cpudata->iobm_pa;
+
+	/*
+	 * Allow:
+	 *  - EFER [read]
+	 *  - STAR [read, write]
+	 *  - LSTAR [read, write]
+	 *  - CSTAR [read, write]
+	 *  - SFMASK [read, write]
+	 *  - KERNELGSBASE [read, write]
+	 *  - SYSENTER_CS [read, write]
+	 *  - SYSENTER_ESP [read, write]
+	 *  - SYSENTER_EIP [read, write]
+	 *  - FSBASE [read, write]
+	 *  - GSBASE [read, write]
+	 *
+	 * Intercept the rest.
+	 *
+	 * A set bit in the bitmap means "intercept", hence the 0xFF fill
+	 * followed by selective clearing.
+	 */
+	memset(cpudata->msrbm, 0xFF, MSRBM_SIZE);
+	svm_vcpu_msr_allow(cpudata->msrbm, MSR_EFER, true, false);
+	svm_vcpu_msr_allow(cpudata->msrbm, MSR_STAR, true, true);
+	svm_vcpu_msr_allow(cpudata->msrbm, MSR_LSTAR, true, true);
+	svm_vcpu_msr_allow(cpudata->msrbm, MSR_CSTAR, true, true);
+	svm_vcpu_msr_allow(cpudata->msrbm, MSR_SFMASK, true, true);
+	svm_vcpu_msr_allow(cpudata->msrbm, MSR_KERNELGSBASE, true, true);
+	svm_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_CS, true, true);
+	svm_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_ESP, true, true);
+	svm_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_EIP, true, true);
+	svm_vcpu_msr_allow(cpudata->msrbm, MSR_FSBASE, true, true);
+	svm_vcpu_msr_allow(cpudata->msrbm, MSR_GSBASE, true, true);
+	vmcb->ctrl.msrpm_base_pa = cpudata->msrbm_pa;
+
+	/* Generate ASID. */
+	svm_asid_alloc(vcpu);
+
+	/* Virtual TPR. */
+	vmcb->ctrl.v = VMCB_CTRL_V_INTR_MASKING;
+
+	/* Enable Nested Paging. */
+	vmcb->ctrl.enable1 = VMCB_CTRL_ENABLE_NP;
+	vmcb->ctrl.n_cr3 = mach->vm->vm_map.pmap->pm_pdirpa[0];
+
+	/* Must always be set. */
+	vmcb->state.efer = EFER_SVME;
+
+	/* Init XSAVE header. */
+	cpudata->gfpu.xsh_xstate_bv = svm_xcr0_mask;
+	cpudata->gfpu.xsh_xcomp_bv = 0;
+
+	/* Bluntly hide the host TSC. */
+	cpudata->tsc_offset = rdtsc();
+}
+
+/*
+ * Allocate and initialize the SVM-specific data of a VCPU: the cpudata
+ * structure itself, the VMCB, the I/O bitmap and the MSR bitmap.
+ *
+ * Returns 0 on success, with vcpu->cpudata pointing to the new data.
+ * Returns an error (ENOMEM) when an allocation fails; in that case
+ * everything allocated so far is released and vcpu->cpudata is NULL.
+ */
+static int
+svm_vcpu_create(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+	struct svm_cpudata *cpudata;
+	int error;
+
+	/* Allocate the SVM cpudata. */
+	cpudata = (struct svm_cpudata *)uvm_km_alloc(kernel_map,
+	    roundup(sizeof(*cpudata), PAGE_SIZE), 0,
+	    UVM_KMF_WIRED|UVM_KMF_ZERO);
+	if (cpudata == NULL) {
+		/* Nothing to unwind yet. */
+		vcpu->cpudata = NULL;
+		return ENOMEM;
+	}
+	vcpu->cpudata = cpudata;
+
+	/* VMCB */
+	error = svm_memalloc(&cpudata->vmcb_pa, (vaddr_t *)&cpudata->vmcb,
+	    VMCB_NPAGES);
+	if (error)
+		goto error;
+
+	/* I/O Bitmap */
+	error = svm_memalloc(&cpudata->iobm_pa, (vaddr_t *)&cpudata->iobm,
+	    IOBM_NPAGES);
+	if (error)
+		goto error;
+
+	/* MSR Bitmap */
+	error = svm_memalloc(&cpudata->msrbm_pa, (vaddr_t *)&cpudata->msrbm,
+	    MSRBM_NPAGES);
+	if (error)
+		goto error;
+
+	/* Init the VCPU info. */
+	svm_vcpu_init(mach, vcpu);
+
+	return 0;
+
+error:
+	/*
+	 * cpudata was zero-filled, so a non-zero physical address marks an
+	 * area that was successfully allocated.
+	 */
+	if (cpudata->vmcb_pa) {
+		svm_memfree(cpudata->vmcb_pa, (vaddr_t)cpudata->vmcb,
+		    VMCB_NPAGES);
+	}
+	if (cpudata->iobm_pa) {
+		svm_memfree(cpudata->iobm_pa, (vaddr_t)cpudata->iobm,
+		    IOBM_NPAGES);
+	}
+	if (cpudata->msrbm_pa) {
+		svm_memfree(cpudata->msrbm_pa, (vaddr_t)cpudata->msrbm,
+		    MSRBM_NPAGES);
+	}
+	uvm_km_free(kernel_map, (vaddr_t)cpudata,
+	    roundup(sizeof(*cpudata), PAGE_SIZE), UVM_KMF_WIRED);
+	/* Do not leave a pointer to freed memory behind. */
+	vcpu->cpudata = NULL;
+	return error;
+}
+
+/*
+ * Tear down the SVM-specific data of a VCPU: release its ASID, then free
+ * the VMCB, the I/O bitmap, the MSR bitmap and the cpudata structure.
+ */
+static void
+svm_vcpu_destroy(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+
+	/* Must come first: the ASID is read from the VMCB freed below. */
+	svm_asid_free(vcpu);
+
+	svm_memfree(cpudata->vmcb_pa, (vaddr_t)cpudata->vmcb, VMCB_NPAGES);
+	svm_memfree(cpudata->iobm_pa, (vaddr_t)cpudata->iobm, IOBM_NPAGES);
+	svm_memfree(cpudata->msrbm_pa, (vaddr_t)cpudata->msrbm, MSRBM_NPAGES);
+
+	uvm_km_free(kernel_map, (vaddr_t)cpudata,
+	    roundup(sizeof(*cpudata), PAGE_SIZE), UVM_KMF_WIRED);
+}
+
+/*
+ * Layout of the attrib field of a VMCB segment register, used to convert
+ * to and from the unpacked attributes of struct nvmm_x64_state_seg.
+ */
+#define SVM_SEG_ATTRIB_TYPE		__BITS(4,0)
+#define SVM_SEG_ATTRIB_DPL		__BITS(6,5)
+#define SVM_SEG_ATTRIB_P		__BIT(7)
+#define SVM_SEG_ATTRIB_AVL		__BIT(8)
+#define SVM_SEG_ATTRIB_LONG		__BIT(9)
+#define SVM_SEG_ATTRIB_DEF32		__BIT(10)
+#define SVM_SEG_ATTRIB_GRAN		__BIT(11)
+
+/*
+ * Convert one NVMM segment description into its VMCB representation,
+ * packing the individual attributes into the VMCB attrib word.
+ */
+static void
+svm_vcpu_setstate_seg(struct nvmm_x64_state_seg *seg, struct vmcb_segment *vseg)
+{
+	uint64_t packed = 0;
+
+	packed |= __SHIFTIN(seg->attrib.gran, SVM_SEG_ATTRIB_GRAN);
+	packed |= __SHIFTIN(seg->attrib.def32, SVM_SEG_ATTRIB_DEF32);
+	packed |= __SHIFTIN(seg->attrib.lng, SVM_SEG_ATTRIB_LONG);
+	packed |= __SHIFTIN(seg->attrib.avl, SVM_SEG_ATTRIB_AVL);
+	packed |= __SHIFTIN(seg->attrib.p, SVM_SEG_ATTRIB_P);
+	packed |= __SHIFTIN(seg->attrib.dpl, SVM_SEG_ATTRIB_DPL);
+	packed |= __SHIFTIN(seg->attrib.type, SVM_SEG_ATTRIB_TYPE);
+
+	vseg->base = seg->base;
+	vseg->limit = seg->limit;
+	vseg->attrib = packed;
+	vseg->selector = seg->selector;
+}
+
+/*
+ * Convert one VMCB segment register into its NVMM description, unpacking
+ * the VMCB attrib word into individual attributes.
+ */
+static void
+svm_vcpu_getstate_seg(struct nvmm_x64_state_seg *seg, struct vmcb_segment *vseg)
+{
+	const uint64_t packed = vseg->attrib;
+
+	seg->base = vseg->base;
+	seg->limit = vseg->limit;
+	seg->selector = vseg->selector;
+
+	seg->attrib.gran = __SHIFTOUT(packed, SVM_SEG_ATTRIB_GRAN);
+	seg->attrib.def32 = __SHIFTOUT(packed, SVM_SEG_ATTRIB_DEF32);
+	seg->attrib.lng = __SHIFTOUT(packed, SVM_SEG_ATTRIB_LONG);
+	seg->attrib.avl = __SHIFTOUT(packed, SVM_SEG_ATTRIB_AVL);
+	seg->attrib.p = __SHIFTOUT(packed, SVM_SEG_ATTRIB_P);
+	seg->attrib.dpl = __SHIFTOUT(packed, SVM_SEG_ATTRIB_DPL);
+	seg->attrib.type = __SHIFTOUT(packed, SVM_SEG_ATTRIB_TYPE);
+}
+
+/*
+ * Tell whether going from the cached state (cstate) to the new state
+ * (nstate) requires a guest TLB flush. Only the register groups selected
+ * in flags are compared: a change of CR3, or of a TLB-relevant bit of
+ * CR0, CR4 or EFER, requires one.
+ */
+static bool
+svm_state_tlb_flush(struct nvmm_x64_state *cstate,
+    struct nvmm_x64_state *nstate, uint64_t flags)
+{
+	if ((flags & NVMM_X64_STATE_CRS) != 0 &&
+	    ((((cstate->crs[NVMM_X64_CR_CR0] ^
+	        nstate->crs[NVMM_X64_CR_CR0]) & CR0_TLB_FLUSH) != 0) ||
+	     (cstate->crs[NVMM_X64_CR_CR3] !=
+	      nstate->crs[NVMM_X64_CR_CR3]) ||
+	     (((cstate->crs[NVMM_X64_CR_CR4] ^
+	        nstate->crs[NVMM_X64_CR_CR4]) & CR4_TLB_FLUSH) != 0))) {
+		return true;
+	}
+
+	if ((flags & NVMM_X64_STATE_MSRS) != 0 &&
+	    (((cstate->msrs[NVMM_X64_MSR_EFER] ^
+	       nstate->msrs[NVMM_X64_MSR_EFER]) & EFER_TLB_FLUSH) != 0)) {
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Install (part of) a VCPU state.
+ *
+ * data points to a struct nvmm_x64_state; flags selects which register
+ * groups are taken from it. Each selected group is first copied into the
+ * cached state (cpudata->state) and, where the hardware keeps it there,
+ * propagated to the VMCB. A guest TLB flush is requested when the new
+ * values call for one.
+ */
+static void
+svm_vcpu_setstate(struct nvmm_cpu *vcpu, void *data, uint64_t flags)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct nvmm_x64_state *cstate = &cpudata->state;
+	struct nvmm_x64_state *nstate = (struct nvmm_x64_state *)data;
+	struct vmcb *vmcb = cpudata->vmcb;
+	struct fxsave *fpustate;
+
+	/* Must run before cstate is overwritten below. */
+	if (svm_state_tlb_flush(cstate, nstate, flags)) {
+		cpudata->tlb_want_flush = true;
+	}
+
+	if (flags & NVMM_X64_STATE_SEGS) {
+		memcpy(cstate->segs, nstate->segs, sizeof(nstate->segs));
+
+		svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_CS],
+		    &vmcb->state.cs);
+		svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_DS],
+		    &vmcb->state.ds);
+		svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_ES],
+		    &vmcb->state.es);
+		svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_FS],
+		    &vmcb->state.fs);
+		svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_GS],
+		    &vmcb->state.gs);
+		svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_SS],
+		    &vmcb->state.ss);
+		svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_GDT],
+		    &vmcb->state.gdt);
+		svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_IDT],
+		    &vmcb->state.idt);
+		svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_LDT],
+		    &vmcb->state.ldt);
+		svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_TR],
+		    &vmcb->state.tr);
+	}
+
+	if (flags & NVMM_X64_STATE_GPRS) {
+		memcpy(cstate->gprs, nstate->gprs, sizeof(nstate->gprs));
+
+		/* These four registers are kept in the VMCB. */
+		vmcb->state.rip = cstate->gprs[NVMM_X64_GPR_RIP];
+		vmcb->state.rsp = cstate->gprs[NVMM_X64_GPR_RSP];
+		vmcb->state.rax = cstate->gprs[NVMM_X64_GPR_RAX];
+		vmcb->state.rflags = cstate->gprs[NVMM_X64_GPR_RFLAGS];
+	}
+
+	if (flags & NVMM_X64_STATE_CRS) {
+		memcpy(cstate->crs, nstate->crs, sizeof(nstate->crs));
+
+		vmcb->state.cr0 = cstate->crs[NVMM_X64_CR_CR0];
+		vmcb->state.cr2 = cstate->crs[NVMM_X64_CR_CR2];
+		vmcb->state.cr3 = cstate->crs[NVMM_X64_CR_CR3];
+		vmcb->state.cr4 = cstate->crs[NVMM_X64_CR_CR4];
+
+		/* CR8 is represented by the virtual TPR field. */
+		vmcb->ctrl.v &= ~VMCB_CTRL_V_TPR;
+		vmcb->ctrl.v |= __SHIFTIN(cstate->crs[NVMM_X64_CR_CR8],
+		    VMCB_CTRL_V_TPR);
+
+		/* Clear unsupported XCR0 bits, set mandatory X87 bit. */
+		if (svm_xcr0_mask != 0) {
+			cstate->crs[NVMM_X64_CR_XCR0] &= svm_xcr0_mask;
+			cstate->crs[NVMM_X64_CR_XCR0] |= XCR0_X87;
+		} else {
+			cstate->crs[NVMM_X64_CR_XCR0] = 0;
+		}
+	}
+
+	if (flags & NVMM_X64_STATE_DRS) {
+		memcpy(cstate->drs, nstate->drs, sizeof(nstate->drs));
+
+		/* Only DR6 and DR7 are kept in the VMCB. */
+		vmcb->state.dr6 = cstate->drs[NVMM_X64_DR_DR6];
+		vmcb->state.dr7 = cstate->drs[NVMM_X64_DR_DR7];
+	}
+
+	if (flags & NVMM_X64_STATE_MSRS) {
+		memcpy(cstate->msrs, nstate->msrs, sizeof(nstate->msrs));
+
+		/* Bit EFER_SVME is mandatory. */
+		cstate->msrs[NVMM_X64_MSR_EFER] |= EFER_SVME;
+
+		vmcb->state.efer = cstate->msrs[NVMM_X64_MSR_EFER];
+		vmcb->state.star = cstate->msrs[NVMM_X64_MSR_STAR];
+		vmcb->state.lstar = cstate->msrs[NVMM_X64_MSR_LSTAR];
+		vmcb->state.cstar = cstate->msrs[NVMM_X64_MSR_CSTAR];
+		vmcb->state.sfmask = cstate->msrs[NVMM_X64_MSR_SFMASK];
+		vmcb->state.kernelgsbase =
+		    cstate->msrs[NVMM_X64_MSR_KERNELGSBASE];
+		vmcb->state.sysenter_cs =
+		    cstate->msrs[NVMM_X64_MSR_SYSENTER_CS];
+		vmcb->state.sysenter_esp =
+		    cstate->msrs[NVMM_X64_MSR_SYSENTER_ESP];
+		vmcb->state.sysenter_eip =
+		    cstate->msrs[NVMM_X64_MSR_SYSENTER_EIP];
+		vmcb->state.g_pat = cstate->msrs[NVMM_X64_MSR_PAT];
+	}
+
+	if (flags & NVMM_X64_STATE_MISC) {
+		memcpy(cstate->misc, nstate->misc, sizeof(nstate->misc));
+
+		vmcb->state.cpl = cstate->misc[NVMM_X64_MISC_CPL];
+	}
+
+	CTASSERT(sizeof(cpudata->gfpu.xsh_fxsave) == sizeof(cstate->fpu));
+	if (flags & NVMM_X64_STATE_FPU) {
+		memcpy(&cstate->fpu, &nstate->fpu, sizeof(nstate->fpu));
+
+		/* The guest FPU area is what gets loaded on guest entry. */
+		memcpy(cpudata->gfpu.xsh_fxsave, &cstate->fpu,
+		    sizeof(cstate->fpu));
+
+		/* Restrict MXCSR to the bits allowed by x86_fpu_mxcsr_mask. */
+		fpustate = (struct fxsave *)cpudata->gfpu.xsh_fxsave;
+		fpustate->fx_mxcsr_mask &= x86_fpu_mxcsr_mask;
+		fpustate->fx_mxcsr &= fpustate->fx_mxcsr_mask;
+	}
+}
+
+/*
+ * Retrieve (part of) a VCPU state.
+ *
+ * data points to a struct nvmm_x64_state to fill; flags selects which
+ * register groups are wanted. For each selected group, the values the
+ * hardware keeps in the VMCB (or in the guest FPU area) are first synced
+ * into the cached state (cpudata->state), and the whole group is then
+ * copied out to the caller's structure.
+ */
+static void
+svm_vcpu_getstate(struct nvmm_cpu *vcpu, void *data, uint64_t flags)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct nvmm_x64_state *cstate = &cpudata->state;
+	struct nvmm_x64_state *nstate = (struct nvmm_x64_state *)data;
+	struct vmcb *vmcb = cpudata->vmcb;
+
+	if (flags & NVMM_X64_STATE_SEGS) {
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_CS],
+		    &vmcb->state.cs);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_DS],
+		    &vmcb->state.ds);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_ES],
+		    &vmcb->state.es);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_FS],
+		    &vmcb->state.fs);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_GS],
+		    &vmcb->state.gs);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_SS],
+		    &vmcb->state.ss);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_GDT],
+		    &vmcb->state.gdt);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_IDT],
+		    &vmcb->state.idt);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_LDT],
+		    &vmcb->state.ldt);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_TR],
+		    &vmcb->state.tr);
+
+		memcpy(nstate->segs, cstate->segs, sizeof(cstate->segs));
+	}
+
+	if (flags & NVMM_X64_STATE_GPRS) {
+		/* These four registers are kept in the VMCB. */
+		cstate->gprs[NVMM_X64_GPR_RIP] = vmcb->state.rip;
+		cstate->gprs[NVMM_X64_GPR_RSP] = vmcb->state.rsp;
+		cstate->gprs[NVMM_X64_GPR_RAX] = vmcb->state.rax;
+		cstate->gprs[NVMM_X64_GPR_RFLAGS] = vmcb->state.rflags;
+
+		memcpy(nstate->gprs, cstate->gprs, sizeof(cstate->gprs));
+	}
+
+	if (flags & NVMM_X64_STATE_CRS) {
+		cstate->crs[NVMM_X64_CR_CR0] = vmcb->state.cr0;
+		cstate->crs[NVMM_X64_CR_CR2] = vmcb->state.cr2;
+		cstate->crs[NVMM_X64_CR_CR3] = vmcb->state.cr3;
+		cstate->crs[NVMM_X64_CR_CR4] = vmcb->state.cr4;
+		/* CR8 is represented by the virtual TPR field. */
+		cstate->crs[NVMM_X64_CR_CR8] = __SHIFTOUT(vmcb->ctrl.v,
+		    VMCB_CTRL_V_TPR);
+
+		memcpy(nstate->crs, cstate->crs, sizeof(cstate->crs));
+	}
+
+	if (flags & NVMM_X64_STATE_DRS) {
+		/* Only DR6 and DR7 are kept in the VMCB. */
+		cstate->drs[NVMM_X64_DR_DR6] = vmcb->state.dr6;
+		cstate->drs[NVMM_X64_DR_DR7] = vmcb->state.dr7;
+
+		memcpy(nstate->drs, cstate->drs, sizeof(cstate->drs));
+	}
+
+	if (flags & NVMM_X64_STATE_MSRS) {
+		cstate->msrs[NVMM_X64_MSR_EFER] = vmcb->state.efer;
+		cstate->msrs[NVMM_X64_MSR_STAR] = vmcb->state.star;
+		cstate->msrs[NVMM_X64_MSR_LSTAR] = vmcb->state.lstar;
+		cstate->msrs[NVMM_X64_MSR_CSTAR] = vmcb->state.cstar;
+		cstate->msrs[NVMM_X64_MSR_SFMASK] = vmcb->state.sfmask;
+		cstate->msrs[NVMM_X64_MSR_KERNELGSBASE] =
+		    vmcb->state.kernelgsbase;
+		cstate->msrs[NVMM_X64_MSR_SYSENTER_CS] =
+		    vmcb->state.sysenter_cs;
+		cstate->msrs[NVMM_X64_MSR_SYSENTER_ESP] =
+		    vmcb->state.sysenter_esp;
+		cstate->msrs[NVMM_X64_MSR_SYSENTER_EIP] =
+		    vmcb->state.sysenter_eip;
+		cstate->msrs[NVMM_X64_MSR_PAT] = vmcb->state.g_pat;
+
+		memcpy(nstate->msrs, cstate->msrs, sizeof(cstate->msrs));
+
+		/* Hide SVME. */
+		nstate->msrs[NVMM_X64_MSR_EFER] &= ~EFER_SVME;
+	}
+
+	if (flags & NVMM_X64_STATE_MISC) {
+		cstate->misc[NVMM_X64_MISC_CPL] = vmcb->state.cpl;
+
+		memcpy(nstate->misc, cstate->misc, sizeof(cstate->misc));
+	}
+
+	CTASSERT(sizeof(cpudata->gfpu.xsh_fxsave) == sizeof(cstate->fpu));
+	if (flags & NVMM_X64_STATE_FPU) {
+		/* Sync the cache from the guest FPU area... */
+		memcpy(&cstate->fpu, cpudata->gfpu.xsh_fxsave,
+		    sizeof(cstate->fpu));
+
+		/* ...then hand the result out to the caller. */
+		memcpy(&nstate->fpu, &cstate->fpu, sizeof(cstate->fpu));
+	}
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * TLB flush hook, installed in the pmap by svm_machine_create(): flag every
+ * vCPU of the machine attached to 'pm' as wanting a TLB flush.
+ */
+static void
+svm_tlb_flush(struct pmap *pm)
+{
+	struct nvmm_machine *mach = pm->pm_data;
+	struct nvmm_cpu *vcpu;
+	size_t cpuid;
+
+	for (cpuid = 0; cpuid < NVMM_MAX_VCPUS; cpuid++) {
+		struct svm_cpudata *cpudata;
+
+		/* vCPU slots that cannot be acquired are skipped. */
+		if (nvmm_vcpu_get(mach, cpuid, &vcpu) != 0)
+			continue;
+
+		cpudata = vcpu->cpudata;
+		cpudata->tlb_want_flush = true;
+
+		nvmm_vcpu_put(vcpu);
+	}
+}
+
+/*
+ * Backend hook for machine creation: record the machine and the TLB flush
+ * handler in the pmap of the machine's VM space, then allocate the zeroed
+ * SVM-specific machine data.
+ */
+static void
+svm_machine_create(struct nvmm_machine *mach)
+{
+	struct pmap *gpmap = mach->vm->vm_map.pmap;
+
+	/* Let svm_tlb_flush() find its way back to this machine. */
+	gpmap->pm_data = (void *)mach;
+	gpmap->pm_tlb_flush = svm_tlb_flush;
+
+	mach->machdata = kmem_zalloc(sizeof(struct svm_machdata), KM_SLEEP);
+}
+
+/*
+ * Backend hook for machine destruction: release the SVM-specific machine
+ * data, passing kmem_free() the same size that svm_machine_create() used.
+ */
+static void
+svm_machine_destroy(struct nvmm_machine *mach)
+{
+	const size_t machdata_size = sizeof(struct svm_machdata);
+
+	kmem_free(mach->machdata, machdata_size);
+}
+
+/*
+ * Apply a configuration operation to a machine. Only NVMM_X86_CONF_CPUID is
+ * accepted, in which case 'data' is read as a struct nvmm_x86_conf_cpuid.
+ *
+ * An entry whose leaf is already registered is overwritten; otherwise the
+ * entry goes into the first unused slot.
+ *
+ * Returns 0 on success, EINVAL for any other op or when some bit is present
+ * in both the 'set' and the 'del' mask of a register, and ENOBUFS when the
+ * leaf is new and all SVM_NCPUIDS slots are taken.
+ */
+static int
+svm_machine_configure(struct nvmm_machine *mach, uint64_t op, void *data)
+{
+	struct svm_machdata *md = (struct svm_machdata *)mach->machdata;
+	struct nvmm_x86_conf_cpuid *conf = data;
+	size_t slot, target = SVM_NCPUIDS;
+
+	if (__predict_false(op != NVMM_X86_CONF_CPUID)) {
+		return EINVAL;
+	}
+
+	/* A given bit cannot be both forced on and forced off. */
+	if (__predict_false((conf->set.eax & conf->del.eax) != 0 ||
+	    (conf->set.ebx & conf->del.ebx) != 0 ||
+	    (conf->set.ecx & conf->del.ecx) != 0 ||
+	    (conf->set.edx & conf->del.edx) != 0)) {
+		return EINVAL;
+	}
+
+	/*
+	 * Single pass: a slot already holding this leaf wins; failing that,
+	 * fall back on the first free slot seen along the way.
+	 */
+	for (slot = 0; slot < SVM_NCPUIDS; slot++) {
+		if (md->cpuidpresent[slot]) {
+			if (md->cpuid[slot].leaf == conf->leaf) {
+				target = slot;
+				break;
+			}
+		} else if (target == SVM_NCPUIDS) {
+			target = slot;
+		}
+	}
+
+	if (target == SVM_NCPUIDS) {
+		return ENOBUFS;
+	}
+
+	md->cpuidpresent[target] = true;
+	memcpy(&md->cpuid[target], conf, sizeof(struct nvmm_x86_conf_cpuid));
+	return 0;
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Tell whether the SVM backend can be used on this machine.
+ *
+ * Requires an AMD CPU advertising SVM, the extended CPUID leaf 0x8000000a,
+ * and both the Nested Paging and nRIP features in that leaf. As a side
+ * effect, svm_decode_assist is set once those feature checks have passed.
+ * Finally, refuses when MSR_VMCR has both VMCR_SVMED and VMCR_LOCK set.
+ */
+static bool
+svm_ident(void)
+{
+	u_int regs[4];
+	uint64_t vmcr;
+
+	if (cpu_vendor != CPUVENDOR_AMD || !(cpu_feature[3] & CPUID_SVM)) {
+		return false;
+	}
+	if (curcpu()->ci_max_ext_cpuid < 0x8000000a) {
+		return false;
+	}
+
+	x86_cpuid(0x8000000a, regs);
+
+	/* Want Nested Paging and nRIP, both of them. */
+	if (!(regs[3] & CPUID_AMD_SVM_NP) ||
+	    !(regs[3] & CPUID_AMD_SVM_NRIPS)) {
+		return false;
+	}
+
+	svm_decode_assist = (regs[3] & CPUID_AMD_SVM_DecodeAssist) != 0;
+
+	/* Unusable if SVMED is set and locked that way. */
+	vmcr = rdmsr(MSR_VMCR);
+	return !((vmcr & VMCR_SVMED) && (vmcr & VMCR_LOCK));
+}
+
+/*
+ * Set up the ASID allocator: its lock, the ASID limit (svm_maxasid) and a
+ * zeroed bitmap with one bit per ASID (svm_asidmap).
+ *
+ * 'maxasid' is capped at 8192. Two ASIDs are marked as used right away:
+ * ASID 0 and ASID maxasid-1.
+ */
+static void
+svm_init_asid(uint32_t maxasid)
+{
+	size_t mapsize, shared;
+
+	mutex_init(&svm_asidlock, MUTEX_DEFAULT, IPL_NONE);
+
+	/* Arbitrary upper bound. */
+	maxasid = uimin(maxasid, 8192);
+	svm_maxasid = maxasid;
+
+	/* One bit per ASID, rounded up to whole bytes. */
+	mapsize = roundup(maxasid, 8) / 8;
+	svm_asidmap = kmem_zalloc(mapsize, KM_SLEEP);
+
+	/* ASID 0 is reserved for the host. */
+	svm_asidmap[0] |= __BIT(0);
+
+	/* The last ASID (n-1) is special, it is the shared one. */
+	shared = maxasid - 1;
+	svm_asidmap[shared / 8] |= __BIT(shared % 8);
+}
+
+/*
+ * Enable or disable SVM on the CPU this function runs on. It is the handler
+ * passed to xc_broadcast() by svm_init() (enable) and svm_fini() (disable).
+ *
+ * arg1 carries the boolean 'enable' cast to a pointer; arg2 is not used.
+ */
+static void
+svm_change_cpu(void *arg1, void *arg2)
+{
+	bool enable = (bool)arg1;
+	uint64_t msr;
+
+	/* Clear VMCR_SVMED if it is currently set, whatever 'enable' says. */
+	msr = rdmsr(MSR_VMCR);
+	if (msr & VMCR_SVMED) {
+		wrmsr(MSR_VMCR, msr & ~VMCR_SVMED);
+	}
+
+	/*
+	 * When disabling, reset the host save area address first, that is,
+	 * before EFER_SVME gets cleared below.
+	 */
+	if (!enable) {
+		wrmsr(MSR_VM_HSAVE_PA, 0);
+	}
+
+	/* Set or clear EFER_SVME according to 'enable'. */
+	msr = rdmsr(MSR_EFER);
+	if (enable) {
+		msr |= EFER_SVME;
+	} else {
+		msr &= ~EFER_SVME;
+	}
+	wrmsr(MSR_EFER, msr);
+
+	/*
+	 * When enabling, program this CPU's host save area (the page whose
+	 * physical address svm_init() stored in hsave[]) only after EFER_SVME
+	 * has been set.
+	 * NOTE(review): the ordering on both paths suggests MSR_VM_HSAVE_PA is
+	 * only meant to be written while EFER_SVME is set -- confirm against
+	 * the AMD manual before reordering anything here.
+	 */
+	if (enable) {
+		wrmsr(MSR_VM_HSAVE_PA, hsave[cpu_index(curcpu())].pa);
+	}
+}
+
+/*
+ * Backend initialization: select the guest TLB flush command, set up the
+ * ASID allocator and the XCR0 mask, allocate one zeroed page per CPU whose
+ * physical address is recorded in hsave[], and finally run
+ * svm_change_cpu(enable) through xc_broadcast(), waiting for completion.
+ */
+static void
+svm_init(void)
+{
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci;
+	struct vm_page *hpage;
+	u_int regs[4];
+
+	x86_cpuid(0x8000000a, regs);
+
+	/* The guest TLB flush command: by ASID if available, else all. */
+	svm_ctrl_tlb_flush = (regs[3] & CPUID_AMD_SVM_FlushByASID) ?
+	    VMCB_CTRL_TLB_CTRL_FLUSH_GUEST : VMCB_CTRL_TLB_CTRL_FLUSH_ALL;
+
+	/* Init the ASID; the second register of the leaf gives the count. */
+	svm_init_asid(regs[1]);
+
+	/* Init the XCR0 mask. */
+	svm_xcr0_mask = SVM_XCR0_MASK_DEFAULT & x86_xsave_features;
+
+	/*
+	 * One page per CPU, handed to MSR_VM_HSAVE_PA in svm_change_cpu().
+	 * NOTE(review): the uvm_pagealloc() result is used without a NULL
+	 * check.
+	 */
+	memset(hsave, 0, sizeof(hsave));
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		hpage = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+		hsave[cpu_index(ci)].pa = VM_PAGE_TO_PHYS(hpage);
+	}
+
+	xc_wait(xc_broadcast(0, svm_change_cpu, (void *)true, NULL));
+}
+
+/*
+ * Tear down the ASID allocator: free the bitmap, whose size is recomputed
+ * from svm_maxasid exactly as in svm_init_asid(), then destroy the lock.
+ */
+static void
+svm_fini_asid(void)
+{
+	const size_t mapsize = roundup(svm_maxasid, 8) / 8;
+
+	kmem_free(svm_asidmap, mapsize);
+	mutex_destroy(&svm_asidlock);
+}
+
+/*
+ * Backend teardown: run svm_change_cpu(disable) through xc_broadcast() and
+ * wait for it, free every page recorded in hsave[], then tear down the ASID
+ * allocator.
+ */
+static void
+svm_fini(void)
+{
+	size_t cpu;
+
+	xc_wait(xc_broadcast(0, svm_change_cpu, (void *)false, NULL));
+
+	for (cpu = 0; cpu < MAXCPUS; cpu++) {
+		/* Entries left at zero by svm_init() have no page. */
+		if (hsave[cpu].pa == 0)
+			continue;
+		uvm_pagefree(PHYS_TO_VM_PAGE(hsave[cpu].pa));
+	}
+
+	svm_fini_asid();
+}
+
+/*
+ * Fill in the x86 part of the capability structure: the number of CPUID
+ * configuration slots, and the XCR0 and MXCSR masks in use.
+ */
+static void
+svm_capability(struct nvmm_capability *cap)
+{
+	cap->u.x86.conf_cpuid_maxops = SVM_NCPUIDS;
+	cap->u.x86.mxcsr_mask = x86_fpu_mxcsr_mask;
+	cap->u.x86.xcr0_mask = svm_xcr0_mask;
+}
+
+/*
+ * The x86-SVM backend descriptor: the entry points and sizes that this file
+ * makes available through struct nvmm_impl.
+ */
+const struct nvmm_impl nvmm_x86_svm = {
+	/* Detection, global setup/teardown, capability reporting. */
+	.ident = svm_ident,
+	.init = svm_init,
+	.fini = svm_fini,
+	.capability = svm_capability,
+	/* Configuration ops and size of the vCPU state structure. */
+	.conf_max = NVMM_X86_NCONF,
+	.conf_sizes = svm_conf_sizes,
+	.state_size = sizeof(struct nvmm_x64_state),
+	/* Per-machine hooks. */
+	.machine_create = svm_machine_create,
+	.machine_destroy = svm_machine_destroy,
+	.machine_configure = svm_machine_configure,
+	/* Per-vCPU hooks. */
+	.vcpu_create = svm_vcpu_create,
+	.vcpu_destroy = svm_vcpu_destroy,
+	.vcpu_setstate = svm_vcpu_setstate,
+	.vcpu_getstate = svm_vcpu_getstate,
+	.vcpu_inject = svm_vcpu_inject,
+	.vcpu_run = svm_vcpu_run
+};
Index: src/sys/dev/nvmm/x86/nvmm_x86_svmfunc.S
diff -u /dev/null src/sys/dev/nvmm/x86/nvmm_x86_svmfunc.S:1.1
--- /dev/null	Wed Nov  7 07:43:08 2018
+++ src/sys/dev/nvmm/x86/nvmm_x86_svmfunc.S	Wed Nov  7 07:43:08 2018
@@ -0,0 +1,218 @@
+/*	$NetBSD: nvmm_x86_svmfunc.S,v 1.1 2018/11/07 07:43:08 maxv Exp $	*/
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Override user-land alignment before including asm.h */
+#define	ALIGN_DATA	.align	8
+#define ALIGN_TEXT	.align 16,0x90
+#define _ALIGN_TEXT	ALIGN_TEXT
+
+#define _LOCORE
+#include "assym.h"
+#include <machine/asm.h>
+#include <machine/segments.h>
+#include <x86/specialreg.h>
+
+#define ASM_NVMM
+#include <dev/nvmm/x86/nvmm_x86.h>
+
+	.text
+
+/*
+ * Push %rbx, %rbp and %r12-%r15 (the SysV AMD64 callee-saved GPRs other
+ * than %rsp) on the stack. Uses 48 bytes of stack.
+ */
+#define HOST_SAVE_GPRS		\
+	pushq	%rbx		;\
+	pushq	%rbp		;\
+	pushq	%r12		;\
+	pushq	%r13		;\
+	pushq	%r14		;\
+	pushq	%r15
+
+/* Pop what HOST_SAVE_GPRS pushed, in the exact reverse order. */
+#define HOST_RESTORE_GPRS	\
+	popq	%r15		;\
+	popq	%r14		;\
+	popq	%r13		;\
+	popq	%r12		;\
+	popq	%rbp		;\
+	popq	%rbx
+
+/*
+ * Read the MSR 'msr' and push its value as two quadwords: %rdx (high half)
+ * first, then %rax (low half). Clobbers %rax, %rcx, %rdx.
+ */
+#define HOST_SAVE_MSR(msr)	\
+	movq	$msr,%rcx	;\
+	rdmsr			;\
+	pushq	%rdx		;\
+	pushq	%rax
+
+/*
+ * Pop the two quadwords pushed by HOST_SAVE_MSR (low half first) and write
+ * them back to the MSR 'msr'. Clobbers %rax, %rcx, %rdx.
+ */
+#define HOST_RESTORE_MSR(msr)	\
+	popq	%rax		;\
+	popq	%rdx		;\
+	movq	$msr,%rcx	;\
+	wrmsr
+
+/*
+ * Push the selector held in the segment register 'sreg' as a 16-bit word,
+ * going through %ax. Clobbers %ax.
+ */
+#define HOST_SAVE_SEGREG(sreg)	\
+	movw	sreg,%ax	;\
+	pushw	%ax
+
+/*
+ * Pop a 16-bit selector and load it into the segment register 'sreg'.
+ * Clobbers %ax.
+ */
+#define HOST_RESTORE_SEGREG(sreg)\
+	popw	%ax		;\
+	movw	%ax,sreg
+
+/* Push the current task register selector as a 16-bit word. Clobbers %ax. */
+#define HOST_SAVE_TR		\
+	strw	%ax		;\
+	pushw	%ax
+
+/*
+ * Pop the selector pushed by HOST_SAVE_TR and reload the task register.
+ *
+ * Before the ltr, bit 0x0200 is cleared in the upper half (byte offset 4) of
+ * the descriptor that the selector designates in this CPU's GDT
+ * (CPUVAR(GDT)). That bit is the busy flag of a TSS descriptor; ltr does not
+ * accept a descriptor that is already marked busy.
+ *
+ * Clobbers %rax, %rdx.
+ */
+#define HOST_RESTORE_TR				\
+	popw	%ax				;\
+	movzwq	%ax,%rdx			;\
+	movq	CPUVAR(GDT),%rax		;\
+	andq	$~0x0200,4(%rax,%rdx, 1)	;\
+	ltrw	%dx
+
+/* Push the current LDT selector as a 16-bit word. Clobbers %ax. */
+#define HOST_SAVE_LDT		\
+	sldtw	%ax		;\
+	pushw	%ax
+
+/* Pop the selector pushed by HOST_SAVE_LDT and reload the LDT register. */
+#define HOST_RESTORE_LDT	\
+	popw	%ax		;\
+	lldtw	%ax
+
+/*
+ * All GPRs except RAX and RSP, which are taken care of in VMCB.
+ *
+ * 'reg' is the base register of the guest GPR array: each register lives in
+ * the 8-byte slot whose index is the matching NVMM_X64_GPR_* constant.
+ * 'reg' must not be one of the registers listed below; svm_vmrun uses %rax.
+ */
+
+/* Store the live GPRs into the array pointed to by 'reg'. */
+#define GUEST_SAVE_GPRS(reg)				\
+	movq	%rbx,(NVMM_X64_GPR_RBX * 8)(reg)	;\
+	movq	%rcx,(NVMM_X64_GPR_RCX * 8)(reg)	;\
+	movq	%rdx,(NVMM_X64_GPR_RDX * 8)(reg)	;\
+	movq	%r8,(NVMM_X64_GPR_R8 * 8)(reg)		;\
+	movq	%r9,(NVMM_X64_GPR_R9 * 8)(reg)		;\
+	movq	%r10,(NVMM_X64_GPR_R10 * 8)(reg)	;\
+	movq	%r11,(NVMM_X64_GPR_R11 * 8)(reg)	;\
+	movq	%r12,(NVMM_X64_GPR_R12 * 8)(reg)	;\
+	movq	%r13,(NVMM_X64_GPR_R13 * 8)(reg)	;\
+	movq	%r14,(NVMM_X64_GPR_R14 * 8)(reg)	;\
+	movq	%r15,(NVMM_X64_GPR_R15 * 8)(reg)	;\
+	movq	%rbp,(NVMM_X64_GPR_RBP * 8)(reg)	;\
+	movq	%rdi,(NVMM_X64_GPR_RDI * 8)(reg)	;\
+	movq	%rsi,(NVMM_X64_GPR_RSI * 8)(reg)
+
+/* Load the GPRs from the array pointed to by 'reg'. */
+#define GUEST_RESTORE_GPRS(reg)				\
+	movq	(NVMM_X64_GPR_RBX * 8)(reg),%rbx	;\
+	movq	(NVMM_X64_GPR_RCX * 8)(reg),%rcx	;\
+	movq	(NVMM_X64_GPR_RDX * 8)(reg),%rdx	;\
+	movq	(NVMM_X64_GPR_R8 * 8)(reg),%r8		;\
+	movq	(NVMM_X64_GPR_R9 * 8)(reg),%r9		;\
+	movq	(NVMM_X64_GPR_R10 * 8)(reg),%r10	;\
+	movq	(NVMM_X64_GPR_R11 * 8)(reg),%r11	;\
+	movq	(NVMM_X64_GPR_R12 * 8)(reg),%r12	;\
+	movq	(NVMM_X64_GPR_R13 * 8)(reg),%r13	;\
+	movq	(NVMM_X64_GPR_R14 * 8)(reg),%r14	;\
+	movq	(NVMM_X64_GPR_R15 * 8)(reg),%r15	;\
+	movq	(NVMM_X64_GPR_RBP * 8)(reg),%rbp	;\
+	movq	(NVMM_X64_GPR_RDI * 8)(reg),%rdi	;\
+	movq	(NVMM_X64_GPR_RSI * 8)(reg),%rsi
+
+/*
+ * svm_vmrun: switch to the guest, run it until it exits, switch back.
+ *
+ * In:
+ *   %rdi = PA of VMCB
+ *   %rsi = VA of guest GPR state (array indexed by NVMM_X64_GPR_*)
+ * Out:
+ *   %rax = 0, always.
+ *
+ * Everything the host needs back is kept on the stack between the two
+ * halves of the function, and the second half pops it in the exact reverse
+ * order of the first. Keep both halves in sync when touching either.
+ */
+ENTRY(svm_vmrun)
+	/* Save the Host GPRs. */
+	HOST_SAVE_GPRS
+
+	/* Disable Host interrupts. */
+	clgi
+
+	/* Save the Host TR. */
+	HOST_SAVE_TR
+
+	/* Save the variable Host MSRs. */
+	HOST_SAVE_MSR(MSR_KERNELGSBASE)
+	HOST_SAVE_MSR(MSR_GSBASE)
+	HOST_SAVE_MSR(MSR_FSBASE)
+
+	/* Reset the Host Segregs: user data selector in ds/es, zero in fs/gs. */
+	movq	$GSEL(GUDATA_SEL, SEL_UPL),%rax
+	movw	%ax,%ds
+	movw	%ax,%es
+	xorq	%rax,%rax
+	movw	%ax,%fs
+	movw	%ax,%gs
+
+	/*
+	 * Save some Host Segregs. Note that fs and gs were loaded with zero
+	 * just above, so zero is what gets pushed here and reloaded later.
+	 */
+	HOST_SAVE_SEGREG(%fs)
+	HOST_SAVE_SEGREG(%gs)
+
+	/* Save the Host LDT. */
+	HOST_SAVE_LDT
+
+	/*
+	 * Prepare RAX. Both arguments go on the stack, %rdi last so that it is
+	 * popped first: every GPR is about to be loaded with a guest value.
+	 */
+	pushq	%rsi
+	pushq	%rdi
+
+	/* Restore the Guest GPRs. */
+	movq	%rsi,%rax
+	GUEST_RESTORE_GPRS(%rax)
+
+	/* Set RAX to the PA of the VMCB (pushed from %rdi). */
+	popq	%rax
+
+	/* Run the VM. */
+	vmload	%rax
+	vmrun	%rax
+	vmsave	%rax
+
+	/* Get RAX: the VA of the guest GPR state (pushed from %rsi). */
+	popq	%rax
+
+	/* Save the Guest GPRs. */
+	GUEST_SAVE_GPRS(%rax)
+
+	/* Restore the Host LDT. */
+	HOST_RESTORE_LDT
+
+	/* Restore the Host Segregs. */
+	HOST_RESTORE_SEGREG(%gs)
+	HOST_RESTORE_SEGREG(%fs)
+
+	/* Restore the variable Host MSRs. */
+	HOST_RESTORE_MSR(MSR_FSBASE)
+	HOST_RESTORE_MSR(MSR_GSBASE)
+	HOST_RESTORE_MSR(MSR_KERNELGSBASE)
+
+	/* Restore the Host TR. */
+	HOST_RESTORE_TR
+
+	/* Enable Host interrupts. */
+	stgi
+
+	/* Restore the Host GPRs. */
+	HOST_RESTORE_GPRS
+
+	/* Return 0. */
+	xorq	%rax,%rax
+	retq
+END(svm_vmrun)

Index: src/sys/modules/nvmm/Makefile
diff -u /dev/null src/sys/modules/nvmm/Makefile:1.1
--- /dev/null	Wed Nov  7 07:43:08 2018
+++ src/sys/modules/nvmm/Makefile	Wed Nov  7 07:43:08 2018
@@ -0,0 +1,19 @@
+#	$NetBSD: Makefile,v 1.1 2018/11/07 07:43:08 maxv Exp $
+
+# Build description of the nvmm kernel module.
+
+.include "../Makefile.inc"
+# nvmm_x86_svmfunc.S includes "assym.h"; presumably this is what provides
+# it -- confirm against Makefile.assym.
+.include "../Makefile.assym"
+
+CPPFLAGS+=
+
+# Directories holding the sources listed in SRCS.
+.PATH:	${S}/dev/nvmm
+.PATH:	${S}/dev/nvmm/x86
+
+KMOD=	nvmm
+IOCONF=	nvmm.ioconf
+SRCS=	nvmm.c
+
+# The x86-SVM backend is only built on x86_64.
+.if ${MACHINE_ARCH} == "x86_64"
+SRCS+=	nvmm_x86_svm.c nvmm_x86_svmfunc.S
+.endif
+
+.include <bsd.kmodule.mk>
Index: src/sys/modules/nvmm/nvmm.ioconf
diff -u /dev/null src/sys/modules/nvmm/nvmm.ioconf:1.1
--- /dev/null	Wed Nov  7 07:43:08 2018
+++ src/sys/modules/nvmm/nvmm.ioconf	Wed Nov  7 07:43:08 2018
@@ -0,0 +1,7 @@
+#	$NetBSD: nvmm.ioconf,v 1.1 2018/11/07 07:43:08 maxv Exp $
+
+# ioconf description for the nvmm module (referenced by IOCONF= in the
+# module Makefile).
+
+ioconf		nvmm
+
+include		"conf/files"
+
+# nvmm is declared as a pseudo-device.
+pseudo-device   nvmm

Reply via email to