Module Name: src
Committed By: maxv
Date: Wed Nov 7 07:43:08 UTC 2018
Modified Files:
src/distrib/sets/lists/comp: md.amd64
src/distrib/sets/lists/modules: md.amd64
src/etc: MAKEDEV.tmpl
src/sys/conf: files majors
src/sys/dev: Makefile
src/sys/modules: Makefile
Added Files:
src/sys/dev/nvmm: Makefile files.nvmm nvmm.c nvmm.h nvmm_internal.h
nvmm_ioctl.h
src/sys/dev/nvmm/x86: Makefile nvmm_x86.h nvmm_x86_svm.c
nvmm_x86_svmfunc.S
src/sys/modules/nvmm: Makefile nvmm.ioconf
Log Message:
Add NVMM (NetBSD Virtual Machine Monitor), a kernel driver that
provides support for hardware-accelerated virtualization on NetBSD.
It is made of an MI frontend, to which MD backends can be plugged. One
MD backend is implemented, x86-SVM, for x86 AMD CPUs.
We install
/usr/include/dev/nvmm/nvmm.h
/usr/include/dev/nvmm/nvmm_ioctl.h
/usr/include/dev/nvmm/{arch}/nvmm_{arch}.h
The kernel module is installed as well. For now, amd64 (arch=x86) is the
only architecture where this is done.
NVMM is not enabled by default in amd64-GENERIC, but is instead easily
modloadable.
Sent to tech-kern@ a month ago. Validated with kASan, and optimized
with tprof.
To generate a diff of this commit:
cvs rdiff -u -r1.259 -r1.260 src/distrib/sets/lists/comp/md.amd64
cvs rdiff -u -r1.77 -r1.78 src/distrib/sets/lists/modules/md.amd64
cvs rdiff -u -r1.195 -r1.196 src/etc/MAKEDEV.tmpl
cvs rdiff -u -r1.1215 -r1.1216 src/sys/conf/files
cvs rdiff -u -r1.79 -r1.80 src/sys/conf/majors
cvs rdiff -u -r1.39 -r1.40 src/sys/dev/Makefile
cvs rdiff -u -r0 -r1.1 src/sys/dev/nvmm/Makefile src/sys/dev/nvmm/files.nvmm \
src/sys/dev/nvmm/nvmm.c src/sys/dev/nvmm/nvmm.h \
src/sys/dev/nvmm/nvmm_internal.h src/sys/dev/nvmm/nvmm_ioctl.h
cvs rdiff -u -r0 -r1.1 src/sys/dev/nvmm/x86/Makefile \
src/sys/dev/nvmm/x86/nvmm_x86.h src/sys/dev/nvmm/x86/nvmm_x86_svm.c \
src/sys/dev/nvmm/x86/nvmm_x86_svmfunc.S
cvs rdiff -u -r1.209 -r1.210 src/sys/modules/Makefile
cvs rdiff -u -r0 -r1.1 src/sys/modules/nvmm/Makefile \
src/sys/modules/nvmm/nvmm.ioconf
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/distrib/sets/lists/comp/md.amd64
diff -u src/distrib/sets/lists/comp/md.amd64:1.259 src/distrib/sets/lists/comp/md.amd64:1.260
--- src/distrib/sets/lists/comp/md.amd64:1.259 Tue Jul 17 18:55:24 2018
+++ src/distrib/sets/lists/comp/md.amd64 Wed Nov 7 07:43:07 2018
@@ -1,4 +1,4 @@
-# $NetBSD: md.amd64,v 1.259 2018/07/17 18:55:24 joerg Exp $
+# $NetBSD: md.amd64,v 1.260 2018/11/07 07:43:07 maxv Exp $
./usr/include/amd64 comp-c-include
./usr/include/amd64/ansi.h comp-c-include
@@ -677,6 +677,11 @@
./usr/include/ieeefp.h comp-c-include
./usr/include/mm_malloc.h comp-obsolete obsolete
./usr/include/mmintrin.h comp-obsolete obsolete
+./usr/include/dev/nvmm comp-c-include
+./usr/include/dev/nvmm/nvmm.h comp-c-include
+./usr/include/dev/nvmm/nvmm_ioctl.h comp-c-include
+./usr/include/dev/nvmm/x86 comp-c-include
+./usr/include/dev/nvmm/x86/nvmm_x86.h comp-c-include
./usr/include/pmmintrin.h comp-obsolete obsolete
./usr/include/x64_64 comp-obsolete obsolete
./usr/include/x64_64/ansi.h comp-obsolete obsolete
Index: src/distrib/sets/lists/modules/md.amd64
diff -u src/distrib/sets/lists/modules/md.amd64:1.77 src/distrib/sets/lists/modules/md.amd64:1.78
--- src/distrib/sets/lists/modules/md.amd64:1.77 Tue Aug 28 09:42:10 2018
+++ src/distrib/sets/lists/modules/md.amd64 Wed Nov 7 07:43:07 2018
@@ -1,4 +1,4 @@
-# $NetBSD: md.amd64,v 1.77 2018/08/28 09:42:10 martin Exp $
+# $NetBSD: md.amd64,v 1.78 2018/11/07 07:43:07 maxv Exp $
#
# NOTE that there are two sets of files here:
# @MODULEDIR@ and amd64-xen
@@ -141,6 +141,8 @@
./@MODULEDIR@/mt2131/mt2131.kmod base-kernel-modules kmod
./@MODULEDIR@/nvme base-obsolete obsolete
./@MODULEDIR@/nvme/nvme.kmod base-obsolete obsolete
+./@MODULEDIR@/nvmm base-kernel-modules kmod
+./@MODULEDIR@/nvmm/nvmm.kmod base-kernel-modules kmod
./@MODULEDIR@/nxt2k base-kernel-modules kmod
./@MODULEDIR@/nxt2k/nxt2k.kmod base-kernel-modules kmod
./@MODULEDIR@/odcm base-kernel-modules kmod
Index: src/etc/MAKEDEV.tmpl
diff -u src/etc/MAKEDEV.tmpl:1.195 src/etc/MAKEDEV.tmpl:1.196
--- src/etc/MAKEDEV.tmpl:1.195 Sun Nov 4 12:48:01 2018
+++ src/etc/MAKEDEV.tmpl Wed Nov 7 07:43:07 2018
@@ -1,5 +1,5 @@
#!/bin/sh -
-# $NetBSD: MAKEDEV.tmpl,v 1.195 2018/11/04 12:48:01 maxv Exp $
+# $NetBSD: MAKEDEV.tmpl,v 1.196 2018/11/07 07:43:07 maxv Exp $
#
# Copyright (c) 2003,2007,2008 The NetBSD Foundation, Inc.
# All rights reserved.
@@ -258,6 +258,7 @@
# nsmb* SMB requester
# nvme* Non-Volatile Memory Host Controller Interface device driver
# nvme*ns* Non-Volatile Memory namespace
+# nvmm NetBSD Virtual Machine Monitor
# openfirm OpenFirmware accessor
# pad* Pseudo-audio device driver
# pci* PCI bus access devices
@@ -277,7 +278,7 @@
# stic* PixelStamp interface chip
# sysmon System Monitoring hardware
# tap* virtual Ethernet device
-# tprof task profiler
+# tprof task profiler
# tun* network tunnel driver
# twa 3ware Apache control interface
# twe 3ware Escalade control interface
@@ -2205,6 +2206,10 @@ nvme[0-9]*)
mkdev nvme$unit c %nvme_chr% $(($unit * 65536))
;;
+nvmm)
+ mkdev nvmm c %nvmm_chr% 0
+ ;;
+
autofs)
mkdev autofs c %autofs_chr% 0 600
;;
Index: src/sys/conf/files
diff -u src/sys/conf/files:1.1215 src/sys/conf/files:1.1216
--- src/sys/conf/files:1.1215 Fri Oct 19 21:09:10 2018
+++ src/sys/conf/files Wed Nov 7 07:43:07 2018
@@ -1,4 +1,4 @@
-# $NetBSD: files,v 1.1215 2018/10/19 21:09:10 jakllsch Exp $
+# $NetBSD: files,v 1.1216 2018/11/07 07:43:07 maxv Exp $
# @(#)files.newconf 7.5 (Berkeley) 5/10/93
version 20171118
@@ -1549,6 +1549,11 @@ include "lib/libx86emu/files.x86emu"
include "dev/tprof/files.tprof"
#
+# NetBSD Virtual Machine Monitor.
+#
+include "dev/nvmm/files.nvmm"
+
+#
# alternate memory device
#
include "dev/altmem/files.altmem"
Index: src/sys/conf/majors
diff -u src/sys/conf/majors:1.79 src/sys/conf/majors:1.80
--- src/sys/conf/majors:1.79 Sun May 20 14:08:33 2018
+++ src/sys/conf/majors Wed Nov 7 07:43:07 2018
@@ -1,4 +1,4 @@
-# $NetBSD: majors,v 1.79 2018/05/20 14:08:33 thorpej Exp $
+# $NetBSD: majors,v 1.80 2018/11/07 07:43:07 maxv Exp $
#
# Device majors for Machine-Independent drivers.
#
@@ -78,3 +78,4 @@ device-major nvme char 341 nvme
device-major qemufwcfg char 342 qemufwcfg
device-major autofs char 343 autofs
device-major gpiopps char 344 gpiopps
+device-major nvmm char 345 nvmm
Index: src/sys/dev/Makefile
diff -u src/sys/dev/Makefile:1.39 src/sys/dev/Makefile:1.40
--- src/sys/dev/Makefile:1.39 Sun Dec 10 20:38:14 2017
+++ src/sys/dev/Makefile Wed Nov 7 07:43:08 2018
@@ -1,10 +1,14 @@
-# $NetBSD: Makefile,v 1.39 2017/12/10 20:38:14 bouyer Exp $
+# $NetBSD: Makefile,v 1.40 2018/11/07 07:43:08 maxv Exp $
SUBDIR= apm ata bluetooth dec dm dmover dtv filemon hdaudio hdmicec hid hpc \
i2c i2o ic ieee1394 ir isa \
microcode ofw pci pckbport pcmcia pud putter raidframe sbus scsipi \
sun tc usb vme wscons
+.if ${MACHINE_ARCH} == "x86_64"
+SUBDIR+= nvmm
+.endif
+
.include <bsd.own.mk>
.if ${MKISCSI} != "no"
Index: src/sys/modules/Makefile
diff -u src/sys/modules/Makefile:1.209 src/sys/modules/Makefile:1.210
--- src/sys/modules/Makefile:1.209 Tue Aug 28 03:41:38 2018
+++ src/sys/modules/Makefile Wed Nov 7 07:43:08 2018
@@ -1,4 +1,4 @@
-# $NetBSD: Makefile,v 1.209 2018/08/28 03:41:38 riastradh Exp $
+# $NetBSD: Makefile,v 1.210 2018/11/07 07:43:08 maxv Exp $
.include <bsd.own.mk>
@@ -202,6 +202,10 @@ SUBDIR+= tprof_x86
SUBDIR+= vmt
.endif
+.if ${MACHINE_ARCH} == "x86_64"
+SUBDIR+= nvmm
+.endif
+
.if ${MACHINE_ARCH} == "i386" || \
${MACHINE_ARCH} == "x86_64"
SUBDIR+= ubsec # Builds on architectures with PCI bus
Added files:
Index: src/sys/dev/nvmm/Makefile
diff -u /dev/null src/sys/dev/nvmm/Makefile:1.1
--- /dev/null Wed Nov 7 07:43:08 2018
+++ src/sys/dev/nvmm/Makefile Wed Nov 7 07:43:08 2018
@@ -0,0 +1,13 @@
+# $NetBSD: Makefile,v 1.1 2018/11/07 07:43:08 maxv Exp $
+
+.if ${MACHINE_ARCH} == "x86_64"
+SUBDIR= x86
+.endif
+
+.include <bsd.own.mk>
+
+INCSDIR= /usr/include/dev/nvmm
+
+INCS= nvmm.h nvmm_ioctl.h
+
+.include <bsd.kinc.mk>
Index: src/sys/dev/nvmm/files.nvmm
diff -u /dev/null src/sys/dev/nvmm/files.nvmm:1.1
--- /dev/null Wed Nov 7 07:43:08 2018
+++ src/sys/dev/nvmm/files.nvmm Wed Nov 7 07:43:08 2018
@@ -0,0 +1,11 @@
+# $NetBSD: files.nvmm,v 1.1 2018/11/07 07:43:08 maxv Exp $
+
+defpseudo nvmm
+
+file dev/nvmm/nvmm.c nvmm
+
+ifdef amd64
+file dev/nvmm/x86/nvmm_x86_svm.c nvmm
+file dev/nvmm/x86/nvmm_x86_svmfunc.S nvmm
+endif
+
Index: src/sys/dev/nvmm/nvmm.c
diff -u /dev/null src/sys/dev/nvmm/nvmm.c:1.1
--- /dev/null Wed Nov 7 07:43:08 2018
+++ src/sys/dev/nvmm/nvmm.c Wed Nov 7 07:43:08 2018
@@ -0,0 +1,788 @@
+/* $NetBSD: nvmm.c,v 1.1 2018/11/07 07:43:08 maxv Exp $ */
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.1 2018/11/07 07:43:08 maxv Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+
+#include <sys/cpu.h>
+#include <sys/conf.h>
+#include <sys/kmem.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_page.h>
+
+#include "ioconf.h"
+
+#include <dev/nvmm/nvmm.h>
+#include <dev/nvmm/nvmm_internal.h>
+#include <dev/nvmm/nvmm_ioctl.h>
+
+static struct nvmm_machine machines[NVMM_MAX_MACHINES];
+
+static const struct nvmm_impl *nvmm_impl_list[] = {
+ &nvmm_x86_svm /* x86 AMD SVM */
+};
+
+static const struct nvmm_impl *nvmm_impl = NULL;
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Reserve the first unused slot of the global machine table.
+ *
+ * On success, returns 0 and stores the slot in *ret. The slot is marked
+ * present and its lock is still held as writer; the caller is responsible
+ * for releasing it. Returns ENOBUFS when every slot is already in use.
+ */
+static int
+nvmm_machine_alloc(struct nvmm_machine **ret)
+{
+	struct nvmm_machine *slot;
+	size_t idx;
+
+	for (idx = 0; idx < NVMM_MAX_MACHINES; idx++) {
+		slot = &machines[idx];
+
+		rw_enter(&slot->lock, RW_WRITER);
+		if (!slot->present) {
+			/* Free slot: claim it and hand it back locked. */
+			slot->present = true;
+			*ret = slot;
+			return 0;
+		}
+		rw_exit(&slot->lock);
+	}
+
+	return ENOBUFS;
+}
+
+/*
+ * Mark a machine slot as unused. Both preconditions are asserted: the
+ * caller holds the machine lock as writer, and the slot is currently in
+ * use. The lock is not released here.
+ */
+static void
+nvmm_machine_free(struct nvmm_machine *mach)
+{
+ KASSERT(rw_write_held(&mach->lock));
+ KASSERT(mach->present);
+ mach->present = false;
+}
+
+/*
+ * Look up the machine 'machid' and lock it, as writer when 'writer' is
+ * true and as reader otherwise.
+ *
+ * Returns 0 with *ret set and the lock held; release it with
+ * nvmm_machine_put(). Returns EINVAL if machid is out of range, ENOENT if
+ * the slot is unused, and EPERM if the machine is not owned by the calling
+ * process. On error the lock is not held.
+ */
+static int
+nvmm_machine_get(nvmm_machid_t machid, struct nvmm_machine **ret, bool writer)
+{
+	struct nvmm_machine *mach;
+	int error = 0;
+
+	if (machid >= NVMM_MAX_MACHINES) {
+		return EINVAL;
+	}
+	mach = &machines[machid];
+
+	rw_enter(&mach->lock, writer ? RW_WRITER : RW_READER);
+	if (!mach->present) {
+		error = ENOENT;
+	} else if (mach->procid != curproc->p_pid) {
+		error = EPERM;
+	}
+	if (error) {
+		rw_exit(&mach->lock);
+		return error;
+	}
+
+	*ret = mach;
+	return 0;
+}
+
+/*
+ * Release the machine lock taken by nvmm_machine_get() or
+ * nvmm_machine_alloc().
+ */
+static void
+nvmm_machine_put(struct nvmm_machine *mach)
+{
+ rw_exit(&mach->lock);
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Reserve the first unused VCPU slot of 'mach'.
+ *
+ * On success, returns 0 and stores the slot in *ret. The slot is marked
+ * present, its cpuid is set to its index in mach->cpus[], and its mutex is
+ * still held; release it with nvmm_vcpu_put(). Returns ENOBUFS when all
+ * NVMM_MAX_VCPUS slots are in use.
+ */
+static int
+nvmm_vcpu_alloc(struct nvmm_machine *mach, struct nvmm_cpu **ret)
+{
+	struct nvmm_cpu *slot;
+	size_t idx;
+
+	for (idx = 0; idx < NVMM_MAX_VCPUS; idx++) {
+		slot = &mach->cpus[idx];
+
+		mutex_enter(&slot->lock);
+		if (!slot->present) {
+			/* Free slot: claim it and hand it back locked. */
+			slot->present = true;
+			slot->cpuid = idx;
+			*ret = slot;
+			return 0;
+		}
+		mutex_exit(&slot->lock);
+	}
+
+	return ENOBUFS;
+}
+
+/*
+ * Mark a VCPU slot as unused and reset its last-host-CPU record to -1.
+ * The caller must hold the VCPU mutex (asserted); it is not released here.
+ */
+static void
+nvmm_vcpu_free(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+ KASSERT(mutex_owned(&vcpu->lock));
+ vcpu->present = false;
+ vcpu->hcpu_last = -1;
+}
+
+/*
+ * Look up VCPU 'cpuid' of 'mach' and lock it.
+ *
+ * Returns 0 with *ret set and the VCPU mutex held; release it with
+ * nvmm_vcpu_put(). Returns EINVAL if cpuid is out of range and ENOENT if
+ * the slot is unused. On error the mutex is not held.
+ */
+int
+nvmm_vcpu_get(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
+    struct nvmm_cpu **ret)
+{
+	struct nvmm_cpu *slot;
+
+	if (cpuid >= NVMM_MAX_VCPUS) {
+		return EINVAL;
+	}
+	slot = &mach->cpus[cpuid];
+
+	mutex_enter(&slot->lock);
+	if (slot->present) {
+		*ret = slot;
+		return 0;
+	}
+	mutex_exit(&slot->lock);
+
+	return ENOENT;
+}
+
+/*
+ * Release the VCPU mutex taken by nvmm_vcpu_get() or nvmm_vcpu_alloc().
+ */
+void
+nvmm_vcpu_put(struct nvmm_cpu *vcpu)
+{
+ mutex_exit(&vcpu->lock);
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Destroy every machine owned by process 'pid', together with all of its
+ * VCPUs. Called when the device is closed.
+ *
+ * Each machine is torn down under its write lock, in the same order as
+ * nvmm_machine_destroy(): VCPUs first, then the backend machine state, then
+ * the guest vmspace and the backing object reference, and finally the slot
+ * itself.
+ */
+static void
+nvmm_kill_machines(pid_t pid)
+{
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	size_t i, j;
+	int error;
+
+	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
+		mach = &machines[i];
+
+		rw_enter(&mach->lock, RW_WRITER);
+		if (!mach->present || mach->procid != pid) {
+			rw_exit(&mach->lock);
+			continue;
+		}
+
+		/* Kill it. */
+		for (j = 0; j < NVMM_MAX_VCPUS; j++) {
+			error = nvmm_vcpu_get(mach, j, &vcpu);
+			if (error)
+				continue;
+			(*nvmm_impl->vcpu_destroy)(mach, vcpu);
+			nvmm_vcpu_free(mach, vcpu);
+			nvmm_vcpu_put(vcpu);
+		}
+
+		/*
+		 * Let the backend release its per-machine state, exactly as
+		 * nvmm_machine_destroy() does; skipping this would leave
+		 * whatever machine_create() set up behind.
+		 */
+		(*nvmm_impl->machine_destroy)(mach);
+
+		/* Free the machine vmspace. */
+		uvmspace_free(mach->vm);
+		uao_detach(mach->uobj);
+
+		nvmm_machine_free(mach);
+
+		rw_exit(&mach->lock);
+	}
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * NVMM_IOC_CAPABILITY: report the driver limits, then let the backend fill
+ * in its own part of the capability structure. Always returns 0.
+ */
+static int
+nvmm_capability(struct nvmm_ioc_capability *args)
+{
+	struct nvmm_capability *cap = &args->cap;
+
+	/* Machine-independent part. */
+	cap->version = NVMM_CAPABILITY_VERSION;
+	cap->state_size = nvmm_impl->state_size;
+	cap->max_machines = NVMM_MAX_MACHINES;
+	cap->max_vcpus = NVMM_MAX_VCPUS;
+	cap->max_ram = NVMM_MAX_RAM;
+
+	/* Backend part. */
+	(*nvmm_impl->capability)(cap);
+
+	return 0;
+}
+
+/*
+ * NVMM_IOC_MACHINE_CREATE: allocate a machine slot for the calling process,
+ * create its guest address space and backing anonymous object, and let the
+ * backend initialize its per-machine state.
+ *
+ * On success the new machine id is returned in args->machid. Fails with
+ * the error from nvmm_machine_alloc() when no slot is available.
+ */
+static int
+nvmm_machine_create(struct nvmm_ioc_machine_create *args)
+{
+	struct nvmm_machine *mach;
+	gpaddr_t ramsize;
+	int error;
+
+	/* The slot comes back write-locked. */
+	error = nvmm_machine_alloc(&mach);
+	if (error)
+		return error;
+
+	/* Curproc owns the machine. */
+	mach->procid = curproc->p_pid;
+
+	/* Create the machine vmspace. */
+	mach->gpa_begin = 0;
+	mach->gpa_end = NVMM_MAX_RAM;
+	ramsize = mach->gpa_end - mach->gpa_begin;
+	mach->vm = uvmspace_alloc(0, ramsize, false);
+	mach->uobj = uao_create(ramsize, 0);
+
+	/* Grab a reference for the machine. */
+	uao_reference(mach->uobj);
+
+	(*nvmm_impl->machine_create)(mach);
+
+	args->machid = mach->machid;
+	nvmm_machine_put(mach);
+
+	return 0;
+}
+
+/*
+ * NVMM_IOC_MACHINE_DESTROY: tear down the machine args->machid.
+ *
+ * With the machine write-locked, every present VCPU is destroyed, the
+ * backend releases its per-machine state, the guest vmspace and one
+ * reference on the backing object are dropped, and the slot is freed.
+ * Returns the error from nvmm_machine_get() if the machine cannot be
+ * obtained, 0 otherwise.
+ */
+static int
+nvmm_machine_destroy(struct nvmm_ioc_machine_destroy *args)
+{
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	size_t cpuid;
+	int error;
+
+	error = nvmm_machine_get(args->machid, &mach, true);
+	if (error)
+		return error;
+
+	for (cpuid = 0; cpuid < NVMM_MAX_VCPUS; cpuid++) {
+		/* Slots that are not present are simply skipped. */
+		if (nvmm_vcpu_get(mach, cpuid, &vcpu) == 0) {
+			(*nvmm_impl->vcpu_destroy)(mach, vcpu);
+			nvmm_vcpu_free(mach, vcpu);
+			nvmm_vcpu_put(vcpu);
+		}
+	}
+
+	(*nvmm_impl->machine_destroy)(mach);
+
+	/* Free the machine vmspace. */
+	uvmspace_free(mach->vm);
+	uao_detach(mach->uobj);
+
+	nvmm_machine_free(mach);
+	nvmm_machine_put(mach);
+
+	return 0;
+}
+
+/*
+ * NVMM_IOC_MACHINE_CONFIGURE: apply configuration operation args->op to the
+ * machine args->machid.
+ *
+ * The size of the configuration blob is taken from the backend's
+ * conf_sizes[] table; the blob is copied in from args->conf and handed to
+ * the backend with the machine write-locked. Returns EINVAL if the
+ * operation is out of range, otherwise the first error from
+ * nvmm_machine_get(), copyin() or the backend.
+ */
+static int
+nvmm_machine_configure(struct nvmm_ioc_machine_configure *args)
+{
+	struct nvmm_machine *mach;
+	size_t confsz;
+	void *conf;
+	int error;
+
+	if (__predict_false(args->op >= nvmm_impl->conf_max)) {
+		return EINVAL;
+	}
+
+	confsz = nvmm_impl->conf_sizes[args->op];
+	conf = kmem_alloc(confsz, KM_SLEEP);
+
+	error = nvmm_machine_get(args->machid, &mach, true);
+	if (error == 0) {
+		error = copyin(args->conf, conf, confsz);
+		if (error == 0) {
+			error = (*nvmm_impl->machine_configure)(mach,
+			    args->op, conf);
+		}
+		nvmm_machine_put(mach);
+	}
+
+	kmem_free(conf, confsz);
+	return error;
+}
+
+/*
+ * NVMM_IOC_VCPU_CREATE: allocate a VCPU slot in the machine args->machid
+ * and let the backend initialize it. If the backend fails, the slot is
+ * given back. The machine is only read-locked; the new VCPU is protected
+ * by its own mutex while it is being set up.
+ */
+static int
+nvmm_vcpu_create(struct nvmm_ioc_vcpu_create *args)
+{
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	int error;
+
+	error = nvmm_machine_get(args->machid, &mach, false);
+	if (error)
+		return error;
+
+	error = nvmm_vcpu_alloc(mach, &vcpu);
+	if (error == 0) {
+		error = (*nvmm_impl->vcpu_create)(mach, vcpu);
+		if (error) {
+			/* Backend refused: release the slot again. */
+			nvmm_vcpu_free(mach, vcpu);
+		}
+		nvmm_vcpu_put(vcpu);
+	}
+
+	nvmm_machine_put(mach);
+	return error;
+}
+
+/*
+ * NVMM_IOC_VCPU_DESTROY: destroy VCPU args->cpuid of machine args->machid.
+ * The backend state is released first, then the slot is marked unused.
+ * Returns the error from nvmm_machine_get() or nvmm_vcpu_get() on lookup
+ * failure, 0 otherwise.
+ */
+static int
+nvmm_vcpu_destroy(struct nvmm_ioc_vcpu_destroy *args)
+{
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	int error;
+
+	error = nvmm_machine_get(args->machid, &mach, false);
+	if (error)
+		return error;
+
+	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
+	if (error == 0) {
+		(*nvmm_impl->vcpu_destroy)(mach, vcpu);
+		nvmm_vcpu_free(mach, vcpu);
+		nvmm_vcpu_put(vcpu);
+	}
+
+	nvmm_machine_put(mach);
+	return error;
+}
+
+/*
+ * NVMM_IOC_VCPU_SETSTATE: copy a state buffer of nvmm_impl->state_size
+ * bytes in from args->state and pass it, with args->flags, to the backend
+ * for VCPU args->cpuid of machine args->machid.
+ *
+ * Returns the first error from nvmm_machine_get(), nvmm_vcpu_get() or
+ * copyin(); the backend call itself cannot fail.
+ */
+static int
+nvmm_vcpu_setstate(struct nvmm_ioc_vcpu_setstate *args)
+{
+	const size_t statesz = nvmm_impl->state_size;
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	void *state;
+	int error;
+
+	state = kmem_alloc(statesz, KM_SLEEP);
+
+	error = nvmm_machine_get(args->machid, &mach, false);
+	if (error)
+		goto free;
+
+	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
+	if (error == 0) {
+		error = copyin(args->state, state, statesz);
+		if (error == 0) {
+			(*nvmm_impl->vcpu_setstate)(vcpu, state, args->flags);
+		}
+		nvmm_vcpu_put(vcpu);
+	}
+
+	nvmm_machine_put(mach);
+free:
+	kmem_free(state, statesz);
+	return error;
+}
+
+/*
+ * NVMM_IOC_VCPU_GETSTATE: ask the backend for the state of VCPU
+ * args->cpuid of machine args->machid, as selected by args->flags, and
+ * copy nvmm_impl->state_size bytes out to args->state.
+ *
+ * Returns the first error from nvmm_machine_get(), nvmm_vcpu_get() or
+ * copyout().
+ */
+static int
+nvmm_vcpu_getstate(struct nvmm_ioc_vcpu_getstate *args)
+{
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	void *data;
+	int error;
+
+	/*
+	 * The whole buffer is copied out to userland below, but the backend
+	 * is handed 'flags' and may well fill only the selected parts of it.
+	 * Allocate it zeroed so that no stale kernel heap contents can leak
+	 * through the untouched parts.
+	 */
+	data = kmem_zalloc(nvmm_impl->state_size, KM_SLEEP);
+
+	error = nvmm_machine_get(args->machid, &mach, false);
+	if (error) {
+		kmem_free(data, nvmm_impl->state_size);
+		return error;
+	}
+
+	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
+	if (error)
+		goto out;
+
+	(*nvmm_impl->vcpu_getstate)(vcpu, data, args->flags);
+	nvmm_vcpu_put(vcpu);
+
+	/* The VCPU mutex is dropped before touching user memory. */
+	error = copyout(data, args->state, nvmm_impl->state_size);
+
+out:
+	nvmm_machine_put(mach);
+	kmem_free(data, nvmm_impl->state_size);
+	return error;
+}
+
+/*
+ * NVMM_IOC_VCPU_INJECT: hand the event args->event to the backend for
+ * injection into VCPU args->cpuid of machine args->machid. Returns the
+ * lookup error, or otherwise whatever the backend returns.
+ */
+static int
+nvmm_vcpu_inject(struct nvmm_ioc_vcpu_inject *args)
+{
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	int error;
+
+	error = nvmm_machine_get(args->machid, &mach, false);
+	if (error)
+		return error;
+
+	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
+	if (error == 0) {
+		error = (*nvmm_impl->vcpu_inject)(mach, vcpu, &args->event);
+		nvmm_vcpu_put(vcpu);
+	}
+
+	nvmm_machine_put(mach);
+	return error;
+}
+
+/*
+ * NVMM_IOC_VCPU_RUN: run VCPU args->cpuid of machine args->machid through
+ * the backend, which fills args->exit.
+ *
+ * Returns the lookup error from nvmm_machine_get() or nvmm_vcpu_get(), or
+ * otherwise the value returned by the backend's vcpu_run hook.
+ */
+static int
+nvmm_vcpu_run(struct nvmm_ioc_vcpu_run *args)
+{
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	int error;
+
+	error = nvmm_machine_get(args->machid, &mach, false);
+	if (error)
+		return error;
+
+	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
+	if (error)
+		goto out;
+
+	/*
+	 * vcpu_run returns an int; propagate it rather than dropping it, so
+	 * that a backend failure is not reported to userland as success.
+	 */
+	error = (*nvmm_impl->vcpu_run)(mach, vcpu, &args->exit);
+	nvmm_vcpu_put(vcpu);
+
+out:
+	nvmm_machine_put(mach);
+	return error;
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * NVMM_IOC_GPA_MAP: map the guest-physical range [gpa, gpa + size) of
+ * machine args->machid, and make the same memory visible to the calling
+ * process at args->hva.
+ *
+ * The machine's anonymous object is mapped twice at object offset
+ * args->gpa: once into the machine vmspace at the guest-physical address,
+ * and once into the caller's vmspace at hva (replacing whatever was mapped
+ * there). Each mapping consumes one reference on the object.
+ *
+ * gpa, size and hva must be page-aligned, hva must be non-NULL, size must
+ * be non-zero, and the range must lie within [gpa_begin, gpa_end]; EINVAL
+ * is returned otherwise. Errors from uvm_map() are returned as-is.
+ */
+static int
+nvmm_gpa_map(struct nvmm_ioc_gpa_map *args)
+{
+	struct proc *p = curproc;
+	struct nvmm_machine *mach;
+	struct vmspace *vmspace;
+	gpaddr_t gpa;
+	vaddr_t uva;
+	int error;
+
+	error = nvmm_machine_get(args->machid, &mach, false);
+	if (error)
+		return error;
+
+	vmspace = p->p_vmspace;
+
+	if ((args->gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0 ||
+	    (args->hva % PAGE_SIZE) != 0) {
+		error = EINVAL;
+		goto out;
+	}
+	if (args->hva == 0) {
+		error = EINVAL;
+		goto out;
+	}
+	if (args->gpa < mach->gpa_begin || args->gpa >= mach->gpa_end) {
+		error = EINVAL;
+		goto out;
+	}
+	/* Rejects both a zero size and a wrap-around of gpa + size. */
+	if (args->gpa + args->size <= args->gpa) {
+		error = EINVAL;
+		goto out;
+	}
+	/*
+	 * gpa_end is an exclusive bound (gpa == gpa_end is rejected above),
+	 * so a range that ends exactly at gpa_end is valid.
+	 */
+	if (args->gpa + args->size > mach->gpa_end) {
+		error = EINVAL;
+		goto out;
+	}
+	gpa = args->gpa;
+
+	/* Take a reference for the kernel. */
+	uao_reference(mach->uobj);
+
+	/* Map the uobj into the machine address space, as pageable. */
+	error = uvm_map(&mach->vm->vm_map, &gpa, args->size, mach->uobj,
+	    args->gpa, 0, UVM_MAPFLAG(UVM_PROT_RWX, UVM_PROT_RWX,
+	    UVM_INH_NONE, UVM_ADV_NORMAL, UVM_FLAG_FIXED));
+	if (error) {
+		uao_detach(mach->uobj);
+		goto out;
+	}
+	if (gpa != args->gpa) {
+		uao_detach(mach->uobj);
+		printf("[!] uvm_map problem\n");
+		error = EINVAL;
+		goto out;
+	}
+
+	uva = (vaddr_t)args->hva;
+
+	/* Take a reference for the user. */
+	uao_reference(mach->uobj);
+
+	/* Map the uobj into the user address space, as pageable. */
+	error = uvm_map(&vmspace->vm_map, &uva, args->size, mach->uobj,
+	    args->gpa, 0, UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW,
+	    UVM_INH_SHARE, UVM_ADV_NORMAL, UVM_FLAG_FIXED|UVM_FLAG_UNMAP));
+	if (error) {
+		/* Drop the reference the failed user mapping did not take. */
+		uao_detach(mach->uobj);
+
+		/*
+		 * Roll back the machine-side mapping established above, so
+		 * that a failed call leaves no guest mapping behind. Removing
+		 * the map entry also releases the kernel reference it holds.
+		 */
+		uvm_unmap(&mach->vm->vm_map, gpa, gpa + args->size);
+		goto out;
+	}
+
+out:
+	nvmm_machine_put(mach);
+	return error;
+}
+
+/*
+ * NVMM_IOC_GPA_UNMAP: remove the guest-physical range [gpa, gpa + size)
+ * from the address space of machine args->machid.
+ *
+ * gpa and size must be page-aligned, size must be non-zero, and the range
+ * must lie within [gpa_begin, gpa_end]; EINVAL is returned otherwise. Only
+ * the machine-side mapping is removed here; the mapping in the calling
+ * process is left untouched.
+ */
+static int
+nvmm_gpa_unmap(struct nvmm_ioc_gpa_unmap *args)
+{
+	struct nvmm_machine *mach;
+	gpaddr_t gpa;
+	int error;
+
+	error = nvmm_machine_get(args->machid, &mach, false);
+	if (error)
+		return error;
+
+	if ((args->gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0) {
+		error = EINVAL;
+		goto out;
+	}
+	if (args->gpa < mach->gpa_begin || args->gpa >= mach->gpa_end) {
+		error = EINVAL;
+		goto out;
+	}
+	/* Rejects both a zero size and a wrap-around of gpa + size. */
+	if (args->gpa + args->size <= args->gpa) {
+		error = EINVAL;
+		goto out;
+	}
+	/*
+	 * gpa_end is an exclusive bound (gpa == gpa_end is rejected above),
+	 * so a range that ends exactly at gpa_end is valid.
+	 */
+	if (args->gpa + args->size > mach->gpa_end) {
+		error = EINVAL;
+		goto out;
+	}
+	gpa = args->gpa;
+
+	/* Unmap the memory from the machine. */
+	uvm_unmap(&mach->vm->vm_map, gpa, gpa + args->size);
+
+out:
+	nvmm_machine_put(mach);
+	return error;
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Driver initialization: select the first backend of nvmm_impl_list[]
+ * whose ident() hook accepts the host, initialize the lock of every
+ * machine and VCPU slot, and call the backend's init() hook.
+ *
+ * Returns ENOTSUP if no backend is selected, 0 otherwise.
+ */
+static int
+nvmm_init(void)
+{
+	struct nvmm_machine *mach;
+	struct nvmm_cpu *vcpu;
+	size_t i, n;
+
+	for (i = 0; i < __arraycount(nvmm_impl_list); i++) {
+		if ((*nvmm_impl_list[i]->ident)()) {
+			nvmm_impl = nvmm_impl_list[i];
+			break;
+		}
+	}
+	if (nvmm_impl == NULL) {
+		printf("[!] No implementation found\n");
+		return ENOTSUP;
+	}
+
+	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
+		mach = &machines[i];
+
+		/* A machine id is simply its index in the table. */
+		mach->machid = i;
+		rw_init(&mach->lock);
+
+		for (n = 0; n < NVMM_MAX_VCPUS; n++) {
+			vcpu = &mach->cpus[n];
+
+			mutex_init(&vcpu->lock, MUTEX_DEFAULT, IPL_NONE);
+			vcpu->hcpu_last = -1;
+		}
+	}
+
+	(*nvmm_impl->init)();
+
+	return 0;
+}
+
+/*
+ * Driver teardown: destroy the lock of every machine and VCPU slot, then
+ * call the backend's fini() hook. Machines that are still present are not
+ * torn down here (see the TODO below).
+ */
+static void
+nvmm_fini(void)
+{
+	struct nvmm_machine *mach;
+	size_t i, n;
+
+	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
+		mach = &machines[i];
+
+		rw_destroy(&mach->lock);
+		for (n = 0; n < NVMM_MAX_VCPUS; n++) {
+			mutex_destroy(&mach->cpus[n].lock);
+		}
+		/* TODO need to free stuff, etc */
+	}
+
+	(*nvmm_impl->fini)();
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Open entry point. Only minor 0 exists; any other minor yields EXDEV.
+ */
+static int
+nvmm_open(dev_t dev, int flags, int type, struct lwp *l)
+{
+	return (minor(dev) != 0) ? EXDEV : 0;
+}
+
+/*
+ * Close entry point: destroy every machine owned by the closing process.
+ * Always returns 0.
+ */
+static int
+nvmm_close(dev_t dev, int flags, int type, struct lwp *l)
+{
+	pid_t owner;
+
+	KASSERT(minor(dev) == 0);
+
+	owner = l->l_proc->p_pid;
+	nvmm_kill_machines(owner);
+
+	return 0;
+}
+
+/*
+ * Ioctl entry point: dispatch 'cmd' to the matching handler, passing the
+ * ioctl argument structure through unchanged. Unknown commands yield
+ * EINVAL.
+ */
+static int
+nvmm_ioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l)
+{
+	int error;
+
+	KASSERT(minor(dev) == 0);
+
+	switch (cmd) {
+	case NVMM_IOC_CAPABILITY:
+		error = nvmm_capability(data);
+		break;
+	case NVMM_IOC_MACHINE_CREATE:
+		error = nvmm_machine_create(data);
+		break;
+	case NVMM_IOC_MACHINE_DESTROY:
+		error = nvmm_machine_destroy(data);
+		break;
+	case NVMM_IOC_MACHINE_CONFIGURE:
+		error = nvmm_machine_configure(data);
+		break;
+	case NVMM_IOC_VCPU_CREATE:
+		error = nvmm_vcpu_create(data);
+		break;
+	case NVMM_IOC_VCPU_DESTROY:
+		error = nvmm_vcpu_destroy(data);
+		break;
+	case NVMM_IOC_VCPU_SETSTATE:
+		error = nvmm_vcpu_setstate(data);
+		break;
+	case NVMM_IOC_VCPU_GETSTATE:
+		error = nvmm_vcpu_getstate(data);
+		break;
+	case NVMM_IOC_VCPU_INJECT:
+		error = nvmm_vcpu_inject(data);
+		break;
+	case NVMM_IOC_VCPU_RUN:
+		error = nvmm_vcpu_run(data);
+		break;
+	case NVMM_IOC_GPA_MAP:
+		error = nvmm_gpa_map(data);
+		break;
+	case NVMM_IOC_GPA_UNMAP:
+		error = nvmm_gpa_unmap(data);
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	return error;
+}
+
+/*
+ * Character device switch for /dev/nvmm. Only open, close and ioctl are
+ * implemented; every other entry point uses the stock "no*" stub.
+ */
+const struct cdevsw nvmm_cdevsw = {
+ .d_open = nvmm_open,
+ .d_close = nvmm_close,
+ .d_read = noread,
+ .d_write = nowrite,
+ .d_ioctl = nvmm_ioctl,
+ .d_stop = nostop,
+ .d_tty = notty,
+ .d_poll = nopoll,
+ .d_mmap = nommap,
+ .d_kqfilter = nokqfilter,
+ .d_discard = nodiscard,
+ .d_flag = D_OTHER | D_MPSAFE
+};
+
+/*
+ * Attach routine for the "nvmm" pseudo-device (declared with defpseudo in
+ * files.nvmm). All initialization is done from nvmm_modcmd() instead.
+ */
+void
+nvmmattach(int nunits)
+{
+ /* nothing */
+}
+
+MODULE(MODULE_CLASS_DRIVER, nvmm, NULL);
+
+/*
+ * Module control entry point.
+ *
+ * MODULE_CMD_INIT initializes the driver and, when built as a loadable
+ * module, attaches the character device switch on major 345; if that
+ * fails the driver is finalized again. MODULE_CMD_FINI detaches the
+ * device switch (loadable module only) and finalizes the driver. Any
+ * other command yields ENOTTY.
+ */
+static int
+nvmm_modcmd(modcmd_t cmd, void *arg)
+{
+#if defined(_MODULE)
+	devmajor_t bmajor, cmajor;
+#endif
+	int error;
+
+	switch (cmd) {
+	case MODULE_CMD_INIT:
+		error = nvmm_init();
+		if (error)
+			return error;
+
+#if defined(_MODULE)
+		/* mknod /dev/nvmm c 345 0 */
+		bmajor = NODEVMAJOR;
+		cmajor = 345;
+		error = devsw_attach("nvmm", NULL, &bmajor,
+		    &nvmm_cdevsw, &cmajor);
+		if (error) {
+			nvmm_fini();
+			return error;
+		}
+#endif
+		return 0;
+
+	case MODULE_CMD_FINI:
+#if defined(_MODULE)
+		error = devsw_detach(NULL, &nvmm_cdevsw);
+		if (error) {
+			return error;
+		}
+#endif
+		nvmm_fini();
+		return 0;
+
+	default:
+		return ENOTTY;
+	}
+}
Index: src/sys/dev/nvmm/nvmm.h
diff -u /dev/null src/sys/dev/nvmm/nvmm.h:1.1
--- /dev/null Wed Nov 7 07:43:08 2018
+++ src/sys/dev/nvmm/nvmm.h Wed Nov 7 07:43:08 2018
@@ -0,0 +1,155 @@
+/* $NetBSD: nvmm.h,v 1.1 2018/11/07 07:43:08 maxv Exp $ */
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NVMM_H_
+#define _NVMM_H_
+
+#include <sys/types.h>
+
+#ifndef _KERNEL
+#include <stdbool.h>
+#endif
+
+typedef uint64_t gpaddr_t;
+typedef uint64_t gvaddr_t;
+
+typedef uint32_t nvmm_machid_t;
+typedef uint32_t nvmm_cpuid_t;
+
+enum nvmm_exit_reason {
+ NVMM_EXIT_NONE = 0x0000000000000000,
+
+ /* General. */
+ NVMM_EXIT_MEMORY = 0x0000000000000001,
+ NVMM_EXIT_IO = 0x0000000000000002,
+ NVMM_EXIT_MSR = 0x0000000000000003,
+ NVMM_EXIT_INT_READY = 0x0000000000000004,
+ NVMM_EXIT_NMI_READY = 0x0000000000000005,
+ NVMM_EXIT_SHUTDOWN = 0x0000000000000006,
+
+ /* Instructions (x86). */
+ NVMM_EXIT_HLT = 0x0000000000001000,
+ NVMM_EXIT_MONITOR = 0x0000000000001001,
+ NVMM_EXIT_MWAIT = 0x0000000000001002,
+ NVMM_EXIT_MWAIT_COND = 0x0000000000001003,
+
+ NVMM_EXIT_INVALID = 0xFFFFFFFFFFFFFFFF
+};
+
+enum nvmm_exit_memory_perm {
+ NVMM_EXIT_MEMORY_READ,
+ NVMM_EXIT_MEMORY_WRITE,
+ NVMM_EXIT_MEMORY_EXEC
+};
+
+struct nvmm_exit_memory {
+ enum nvmm_exit_memory_perm perm;
+ gpaddr_t gpa;
+ uint8_t inst_len;
+ uint8_t inst_bytes[15];
+ uint64_t npc;
+};
+
+enum nvmm_exit_io_type {
+ NVMM_EXIT_IO_IN,
+ NVMM_EXIT_IO_OUT
+};
+
+struct nvmm_exit_io {
+ enum nvmm_exit_io_type type;
+ uint16_t port;
+ int seg;
+ uint8_t address_size;
+ uint8_t operand_size;
+ bool rep;
+ bool str;
+ uint64_t npc;
+};
+
+enum nvmm_exit_msr_type {
+ NVMM_EXIT_MSR_RDMSR,
+ NVMM_EXIT_MSR_WRMSR
+};
+
+struct nvmm_exit_msr {
+ enum nvmm_exit_msr_type type;
+ uint64_t msr;
+ uint64_t val;
+ uint64_t npc;
+};
+
+struct nvmm_exit {
+ enum nvmm_exit_reason reason;
+ union {
+ struct nvmm_exit_memory mem;
+ struct nvmm_exit_io io;
+ struct nvmm_exit_msr msr;
+ } u;
+ uint64_t exitstate[8];
+};
+
+enum nvmm_event_type {
+ NVMM_EVENT_INTERRUPT_HW,
+ NVMM_EVENT_INTERRUPT_SW,
+ NVMM_EVENT_EXCEPTION
+};
+
+struct nvmm_event {
+ enum nvmm_event_type type;
+ uint64_t vector;
+ union {
+ /* NVMM_EVENT_INTERRUPT_HW */
+ uint8_t prio;
+
+ /* NVMM_EVENT_EXCEPTION */
+ uint64_t error;
+ } u;
+};
+
+#define NVMM_CAPABILITY_VERSION 1
+
+struct nvmm_capability {
+ uint64_t version; /* NVMM_CAPABILITY_VERSION */
+ uint64_t state_size; /* size in bytes of a VCPU state buffer */
+ uint64_t max_machines; /* maximum number of machines */
+ uint64_t max_vcpus; /* maximum number of VCPUs per machine */
+ uint64_t max_ram; /* upper bound of the guest-physical space */
+ union {
+ struct {
+ uint64_t xcr0_mask;
+ uint64_t mxcsr_mask;
+ uint64_t conf_cpuid_maxops;
+ } x86; /* filled in by the backend */
+ uint64_t rsvd[8];
+ } u;
+};
+
+#endif
Index: src/sys/dev/nvmm/nvmm_internal.h
diff -u /dev/null src/sys/dev/nvmm/nvmm_internal.h:1.1
--- /dev/null Wed Nov 7 07:43:08 2018
+++ src/sys/dev/nvmm/nvmm_internal.h Wed Nov 7 07:43:08 2018
@@ -0,0 +1,100 @@
+/* $NetBSD: nvmm_internal.h,v 1.1 2018/11/07 07:43:08 maxv Exp $ */
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NVMM_INTERNAL_H_
+#define _NVMM_INTERNAL_H_
+
+#define NVMM_MAX_MACHINES 128
+#define NVMM_MAX_VCPUS 256
+#define NVMM_MAX_RAM (4UL * (1 << 30))
+
+struct nvmm_cpu {
+ /* Shared. */
+ bool present; /* slot is allocated */
+ nvmm_cpuid_t cpuid; /* index of this slot in the machine's cpus[] */
+ kmutex_t lock; /* held from nvmm_vcpu_get() to nvmm_vcpu_put() */
+
+ /* Last host CPU on which the VCPU ran. */
+ int hcpu_last; /* -1 after init and after the slot is freed */
+
+ /* Implementation-specific. */
+ void *cpudata; /* presumably owned by the backend -- TODO confirm */
+};
+
+struct nvmm_machine {
+ bool present; /* slot is allocated */
+ nvmm_machid_t machid; /* index of this slot in the machine table */
+ pid_t procid; /* pid of the owning process */
+ krwlock_t lock; /* held from nvmm_machine_get() to nvmm_machine_put() */
+
+ /* Kernel */
+ struct vmspace *vm; /* guest-physical address space */
+ struct uvm_object *uobj; /* anonymous object backing guest memory */
+ gpaddr_t gpa_begin; /* first valid guest-physical address */
+ gpaddr_t gpa_end; /* end of the guest-physical space (exclusive) */
+
+ /* CPU */
+ struct nvmm_cpu cpus[NVMM_MAX_VCPUS];
+
+ /* Implementation-specific */
+ void *machdata; /* presumably owned by the backend -- TODO confirm */
+};
+
+struct nvmm_impl {
+ bool (*ident)(void); /* true if this backend is to be used */
+ void (*init)(void); /* called once from nvmm_init() */
+ void (*fini)(void); /* called once from nvmm_fini() */
+ void (*capability)(struct nvmm_capability *);
+
+ size_t conf_max; /* number of configure ops; valid ops are < conf_max */
+ const size_t *conf_sizes; /* per-op size of the configure argument */
+ size_t state_size; /* size in bytes of a VCPU state buffer */
+
+ void (*machine_create)(struct nvmm_machine *);
+ void (*machine_destroy)(struct nvmm_machine *);
+ int (*machine_configure)(struct nvmm_machine *, uint64_t, void *);
+
+ int (*vcpu_create)(struct nvmm_machine *, struct nvmm_cpu *);
+ void (*vcpu_destroy)(struct nvmm_machine *, struct nvmm_cpu *);
+ void (*vcpu_setstate)(struct nvmm_cpu *, void *, uint64_t);
+ void (*vcpu_getstate)(struct nvmm_cpu *, void *, uint64_t);
+ int (*vcpu_inject)(struct nvmm_machine *, struct nvmm_cpu *,
+ struct nvmm_event *);
+ int (*vcpu_run)(struct nvmm_machine *, struct nvmm_cpu *,
+ struct nvmm_exit *);
+};
+
+int nvmm_vcpu_get(struct nvmm_machine *, nvmm_cpuid_t, struct nvmm_cpu **);
+void nvmm_vcpu_put(struct nvmm_cpu *);
+
+extern const struct nvmm_impl nvmm_x86_svm;
+
+#endif /* _NVMM_INTERNAL_H_ */
Index: src/sys/dev/nvmm/nvmm_ioctl.h
diff -u /dev/null src/sys/dev/nvmm/nvmm_ioctl.h:1.1
--- /dev/null Wed Nov 7 07:43:08 2018
+++ src/sys/dev/nvmm/nvmm_ioctl.h Wed Nov 7 07:43:08 2018
@@ -0,0 +1,120 @@
+/* $NetBSD: nvmm_ioctl.h,v 1.1 2018/11/07 07:43:08 maxv Exp $ */
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NVMM_IOCTL_H_
+#define _NVMM_IOCTL_H_
+
+#include <dev/nvmm/nvmm.h>
+
+/*
+ * Argument structures of the NVMM ioctls: struct nvmm_ioc_<name> is the
+ * argument of the NVMM_IOC_<NAME> command defined at the end of this header.
+ */
+struct nvmm_ioc_capability {
+ struct nvmm_capability cap;
+};
+
+struct nvmm_ioc_machine_create {
+ nvmm_machid_t machid;
+};
+
+struct nvmm_ioc_machine_destroy {
+ nvmm_machid_t machid;
+};
+
+struct nvmm_ioc_machine_configure {
+ nvmm_machid_t machid;
+ uint64_t op;
+ void *conf; /* the object itself is referenced by pointer, not embedded */
+};
+
+struct nvmm_ioc_vcpu_create {
+ nvmm_machid_t machid;
+ nvmm_cpuid_t cpuid;
+};
+
+struct nvmm_ioc_vcpu_destroy {
+ nvmm_machid_t machid;
+ nvmm_cpuid_t cpuid;
+};
+
+struct nvmm_ioc_vcpu_setstate {
+ nvmm_machid_t machid;
+ nvmm_cpuid_t cpuid;
+ uint64_t flags; /* presumably a mask of NVMM_X64_STATE_* on x86 -- confirm */
+ void *state; /* presumably a struct nvmm_x64_state on x86 -- confirm */
+};
+
+struct nvmm_ioc_vcpu_getstate {
+ nvmm_machid_t machid;
+ nvmm_cpuid_t cpuid;
+ uint64_t flags; /* presumably a mask of NVMM_X64_STATE_* on x86 -- confirm */
+ void *state; /* presumably a struct nvmm_x64_state on x86 -- confirm */
+};
+
+struct nvmm_ioc_vcpu_inject {
+ nvmm_machid_t machid;
+ nvmm_cpuid_t cpuid;
+ struct nvmm_event event;
+};
+
+struct nvmm_ioc_vcpu_run {
+ /* input */
+ nvmm_machid_t machid;
+ nvmm_cpuid_t cpuid;
+ /* output */
+ struct nvmm_exit exit;
+};
+
+struct nvmm_ioc_gpa_map {
+ nvmm_machid_t machid;
+ uintptr_t hva;
+ gpaddr_t gpa;
+ size_t size;
+ int flags;
+};
+
+struct nvmm_ioc_gpa_unmap {
+ nvmm_machid_t machid;
+ gpaddr_t gpa;
+ size_t size;
+};
+
+/*
+ * ioctl command numbers, group 'N', numbered 0 to 11. SETSTATE and GETSTATE
+ * are both declared _IOW: the ioctl argument only carries the identifiers,
+ * the flags and a pointer, the state object itself is referenced through that
+ * pointer (presumably copied in/out by the driver -- confirm in nvmm.c).
+ */
+#define NVMM_IOC_CAPABILITY _IOR ('N', 0, struct nvmm_ioc_capability)
+#define NVMM_IOC_MACHINE_CREATE _IOWR('N', 1, struct nvmm_ioc_machine_create)
+#define NVMM_IOC_MACHINE_DESTROY _IOW ('N', 2, struct nvmm_ioc_machine_destroy)
+#define NVMM_IOC_MACHINE_CONFIGURE _IOW ('N', 3, struct nvmm_ioc_machine_configure)
+#define NVMM_IOC_VCPU_CREATE _IOW ('N', 4, struct nvmm_ioc_vcpu_create)
+#define NVMM_IOC_VCPU_DESTROY _IOW ('N', 5, struct nvmm_ioc_vcpu_destroy)
+#define NVMM_IOC_VCPU_SETSTATE _IOW ('N', 6, struct nvmm_ioc_vcpu_setstate)
+#define NVMM_IOC_VCPU_GETSTATE _IOW ('N', 7, struct nvmm_ioc_vcpu_getstate)
+#define NVMM_IOC_VCPU_INJECT _IOWR('N', 8, struct nvmm_ioc_vcpu_inject)
+#define NVMM_IOC_VCPU_RUN _IOWR('N', 9, struct nvmm_ioc_vcpu_run)
+#define NVMM_IOC_GPA_MAP _IOW ('N', 10, struct nvmm_ioc_gpa_map)
+#define NVMM_IOC_GPA_UNMAP _IOW ('N', 11, struct nvmm_ioc_gpa_unmap)
+
+#endif /* _NVMM_IOCTL_H_ */
Index: src/sys/dev/nvmm/x86/Makefile
diff -u /dev/null src/sys/dev/nvmm/x86/Makefile:1.1
--- /dev/null Wed Nov 7 07:43:08 2018
+++ src/sys/dev/nvmm/x86/Makefile Wed Nov 7 07:43:08 2018
@@ -0,0 +1,7 @@
+# $NetBSD: Makefile,v 1.1 2018/11/07 07:43:08 maxv Exp $
+
+INCSDIR= /usr/include/dev/nvmm/x86
+
+INCS= nvmm_x86.h
+
+.include <bsd.kinc.mk>
Index: src/sys/dev/nvmm/x86/nvmm_x86.h
diff -u /dev/null src/sys/dev/nvmm/x86/nvmm_x86.h:1.1
--- /dev/null Wed Nov 7 07:43:08 2018
+++ src/sys/dev/nvmm/x86/nvmm_x86.h Wed Nov 7 07:43:08 2018
@@ -0,0 +1,172 @@
+/* $NetBSD: nvmm_x86.h,v 1.1 2018/11/07 07:43:08 maxv Exp $ */
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NVMM_X86_H_
+#define _NVMM_X86_H_
+
+/* Segments. */
+#define NVMM_X64_SEG_CS 0
+#define NVMM_X64_SEG_DS 1
+#define NVMM_X64_SEG_ES 2
+#define NVMM_X64_SEG_FS 3
+#define NVMM_X64_SEG_GS 4
+#define NVMM_X64_SEG_SS 5
+#define NVMM_X64_SEG_GDT 6
+#define NVMM_X64_SEG_IDT 7
+#define NVMM_X64_SEG_LDT 8
+#define NVMM_X64_SEG_TR 9
+#define NVMM_X64_NSEG 10
+
+/* General Purpose Registers. */
+#define NVMM_X64_GPR_RAX 0
+#define NVMM_X64_GPR_RBX 1
+#define NVMM_X64_GPR_RCX 2
+#define NVMM_X64_GPR_RDX 3
+#define NVMM_X64_GPR_R8 4
+#define NVMM_X64_GPR_R9 5
+#define NVMM_X64_GPR_R10 6
+#define NVMM_X64_GPR_R11 7
+#define NVMM_X64_GPR_R12 8
+#define NVMM_X64_GPR_R13 9
+#define NVMM_X64_GPR_R14 10
+#define NVMM_X64_GPR_R15 11
+#define NVMM_X64_GPR_RDI 12
+#define NVMM_X64_GPR_RSI 13
+#define NVMM_X64_GPR_RBP 14
+#define NVMM_X64_GPR_RSP 15
+#define NVMM_X64_GPR_RIP 16
+#define NVMM_X64_GPR_RFLAGS 17
+#define NVMM_X64_NGPR 18
+
+/* Control Registers. */
+#define NVMM_X64_CR_CR0 0
+#define NVMM_X64_CR_CR2 1
+#define NVMM_X64_CR_CR3 2
+#define NVMM_X64_CR_CR4 3
+#define NVMM_X64_CR_CR8 4
+#define NVMM_X64_CR_XCR0 5
+#define NVMM_X64_NCR 6
+
+/* Debug Registers. */
+#define NVMM_X64_DR_DR0 0
+#define NVMM_X64_DR_DR1 1
+#define NVMM_X64_DR_DR2 2
+#define NVMM_X64_DR_DR3 3
+#define NVMM_X64_DR_DR6 4
+#define NVMM_X64_DR_DR7 5
+#define NVMM_X64_NDR 6
+
+/* MSRs. */
+#define NVMM_X64_MSR_EFER 0
+#define NVMM_X64_MSR_STAR 1
+#define NVMM_X64_MSR_LSTAR 2
+#define NVMM_X64_MSR_CSTAR 3
+#define NVMM_X64_MSR_SFMASK 4
+#define NVMM_X64_MSR_KERNELGSBASE 5
+#define NVMM_X64_MSR_SYSENTER_CS 6
+#define NVMM_X64_MSR_SYSENTER_ESP 7
+#define NVMM_X64_MSR_SYSENTER_EIP 8
+#define NVMM_X64_MSR_PAT 9
+#define NVMM_X64_NMSR 10
+
+/* Misc. */
+#define NVMM_X64_MISC_CPL 0
+#define NVMM_X64_NMISC 1
+
+#ifndef ASM_NVMM
+
+#include <sys/types.h>
+#include <x86/cpu_extended_state.h>
+
+/*
+ * One segment register as exposed through the NVMM state interface. Every
+ * member is 64 bits wide; the attrib bit-fields add up to exactly 64 bits
+ * (12 attribute bits followed by 52 reserved bits).
+ */
+struct nvmm_x64_state_seg {
+ uint64_t selector;
+ struct { /* hidden */
+ uint64_t type:5;
+ uint64_t dpl:2;
+ uint64_t p:1;
+ uint64_t avl:1;
+ uint64_t lng:1;
+ uint64_t def32:1;
+ uint64_t gran:1;
+ uint64_t rsvd:52;
+ } attrib;
+ uint64_t limit; /* hidden */
+ uint64_t base; /* hidden */
+};
+
+/* VM exit state indexes. */
+#define NVMM_X64_EXITSTATE_CR8 0
+
+/* Flags. */
+#define NVMM_X64_STATE_SEGS 0x01
+#define NVMM_X64_STATE_GPRS 0x02
+#define NVMM_X64_STATE_CRS 0x04
+#define NVMM_X64_STATE_DRS 0x08
+#define NVMM_X64_STATE_MSRS 0x10
+#define NVMM_X64_STATE_MISC 0x20
+#define NVMM_X64_STATE_FPU 0x40
+#define NVMM_X64_STATE_ALL \
+ (NVMM_X64_STATE_SEGS | NVMM_X64_STATE_GPRS | NVMM_X64_STATE_CRS | \
+ NVMM_X64_STATE_DRS | NVMM_X64_STATE_MSRS | NVMM_X64_STATE_MISC | \
+ NVMM_X64_STATE_FPU)
+
+/*
+ * Complete x86-64 VCPU state. Each array is dimensioned by the NVMM_X64_N*
+ * count of its register class and is indexed with the matching NVMM_X64_*
+ * constants defined at the top of this header (NVMM_X64_SEG_*, NVMM_X64_GPR_*,
+ * NVMM_X64_CR_*, NVMM_X64_DR_*, NVMM_X64_MSR_*, NVMM_X64_MISC_*).
+ */
+struct nvmm_x64_state {
+ struct nvmm_x64_state_seg segs[NVMM_X64_NSEG];
+ uint64_t gprs[NVMM_X64_NGPR];
+ uint64_t crs[NVMM_X64_NCR];
+ uint64_t drs[NVMM_X64_NDR];
+ uint64_t msrs[NVMM_X64_NMSR];
+ uint64_t misc[NVMM_X64_NMISC];
+ struct fxsave fpu;
+};
+
+#define NVMM_X86_CONF_CPUID 0
+#define NVMM_X86_NCONF 1
+
+/*
+ * Per-machine override of one CPUID leaf (configuration NVMM_X86_CONF_CPUID).
+ * When the guest executes CPUID with EAX equal to 'leaf', the SVM backend
+ * starts from the host's CPUID values, first clears the bits given in 'del',
+ * then sets the bits given in 'set'.
+ */
+struct nvmm_x86_conf_cpuid {
+ uint32_t leaf;
+ struct {
+ uint32_t eax;
+ uint32_t ebx;
+ uint32_t ecx;
+ uint32_t edx;
+ } set; /* bits forced to 1, applied after 'del' */
+ struct {
+ uint32_t eax;
+ uint32_t ebx;
+ uint32_t ecx;
+ uint32_t edx;
+ } del; /* bits forced to 0, applied before 'set' */
+};
+
+#endif /* ASM_NVMM */
+
+#endif /* _NVMM_X86_H_ */
Index: src/sys/dev/nvmm/x86/nvmm_x86_svm.c
diff -u /dev/null src/sys/dev/nvmm/x86/nvmm_x86_svm.c:1.1
--- /dev/null Wed Nov 7 07:43:08 2018
+++ src/sys/dev/nvmm/x86/nvmm_x86_svm.c Wed Nov 7 07:43:08 2018
@@ -0,0 +1,2088 @@
+/* $NetBSD: nvmm_x86_svm.c,v 1.1 2018/11/07 07:43:08 maxv Exp $ */
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: nvmm_x86_svm.c,v 1.1 2018/11/07 07:43:08 maxv Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/kmem.h>
+#include <sys/cpu.h>
+#include <sys/xcall.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_page.h>
+
+#include <x86/cputypes.h>
+#include <x86/cpu_msr.h>
+#include <x86/specialreg.h>
+#include <x86/pmap.h>
+#include <x86/dbregs.h>
+#include <machine/cpuvar.h>
+
+#include <dev/nvmm/nvmm.h>
+#include <dev/nvmm/nvmm_internal.h>
+#include <dev/nvmm/x86/nvmm_x86.h>
+
+int svm_vmrun(paddr_t, uint64_t *);
+
+#define MSR_VM_HSAVE_PA 0xC0010117
+
+/* -------------------------------------------------------------------------- */
+
+#define VMCB_EXITCODE_CR0_READ 0x0000
+#define VMCB_EXITCODE_CR1_READ 0x0001
+#define VMCB_EXITCODE_CR2_READ 0x0002
+#define VMCB_EXITCODE_CR3_READ 0x0003
+#define VMCB_EXITCODE_CR4_READ 0x0004
+#define VMCB_EXITCODE_CR5_READ 0x0005
+#define VMCB_EXITCODE_CR6_READ 0x0006
+#define VMCB_EXITCODE_CR7_READ 0x0007
+#define VMCB_EXITCODE_CR8_READ 0x0008
+#define VMCB_EXITCODE_CR9_READ 0x0009
+#define VMCB_EXITCODE_CR10_READ 0x000A
+#define VMCB_EXITCODE_CR11_READ 0x000B
+#define VMCB_EXITCODE_CR12_READ 0x000C
+#define VMCB_EXITCODE_CR13_READ 0x000D
+#define VMCB_EXITCODE_CR14_READ 0x000E
+#define VMCB_EXITCODE_CR15_READ 0x000F
+#define VMCB_EXITCODE_CR0_WRITE 0x0010
+#define VMCB_EXITCODE_CR1_WRITE 0x0011
+#define VMCB_EXITCODE_CR2_WRITE 0x0012
+#define VMCB_EXITCODE_CR3_WRITE 0x0013
+#define VMCB_EXITCODE_CR4_WRITE 0x0014
+#define VMCB_EXITCODE_CR5_WRITE 0x0015
+#define VMCB_EXITCODE_CR6_WRITE 0x0016
+#define VMCB_EXITCODE_CR7_WRITE 0x0017
+#define VMCB_EXITCODE_CR8_WRITE 0x0018
+#define VMCB_EXITCODE_CR9_WRITE 0x0019
+#define VMCB_EXITCODE_CR10_WRITE 0x001A
+#define VMCB_EXITCODE_CR11_WRITE 0x001B
+#define VMCB_EXITCODE_CR12_WRITE 0x001C
+#define VMCB_EXITCODE_CR13_WRITE 0x001D
+#define VMCB_EXITCODE_CR14_WRITE 0x001E
+#define VMCB_EXITCODE_CR15_WRITE 0x001F
+#define VMCB_EXITCODE_DR0_READ 0x0020
+#define VMCB_EXITCODE_DR1_READ 0x0021
+#define VMCB_EXITCODE_DR2_READ 0x0022
+#define VMCB_EXITCODE_DR3_READ 0x0023
+#define VMCB_EXITCODE_DR4_READ 0x0024
+#define VMCB_EXITCODE_DR5_READ 0x0025
+#define VMCB_EXITCODE_DR6_READ 0x0026
+#define VMCB_EXITCODE_DR7_READ 0x0027
+#define VMCB_EXITCODE_DR8_READ 0x0028
+#define VMCB_EXITCODE_DR9_READ 0x0029
+#define VMCB_EXITCODE_DR10_READ 0x002A
+#define VMCB_EXITCODE_DR11_READ 0x002B
+#define VMCB_EXITCODE_DR12_READ 0x002C
+#define VMCB_EXITCODE_DR13_READ 0x002D
+#define VMCB_EXITCODE_DR14_READ 0x002E
+#define VMCB_EXITCODE_DR15_READ 0x002F
+#define VMCB_EXITCODE_DR0_WRITE 0x0030
+#define VMCB_EXITCODE_DR1_WRITE 0x0031
+#define VMCB_EXITCODE_DR2_WRITE 0x0032
+#define VMCB_EXITCODE_DR3_WRITE 0x0033
+#define VMCB_EXITCODE_DR4_WRITE 0x0034
+#define VMCB_EXITCODE_DR5_WRITE 0x0035
+#define VMCB_EXITCODE_DR6_WRITE 0x0036
+#define VMCB_EXITCODE_DR7_WRITE 0x0037
+#define VMCB_EXITCODE_DR8_WRITE 0x0038
+#define VMCB_EXITCODE_DR9_WRITE 0x0039
+#define VMCB_EXITCODE_DR10_WRITE 0x003A
+#define VMCB_EXITCODE_DR11_WRITE 0x003B
+#define VMCB_EXITCODE_DR12_WRITE 0x003C
+#define VMCB_EXITCODE_DR13_WRITE 0x003D
+#define VMCB_EXITCODE_DR14_WRITE 0x003E
+#define VMCB_EXITCODE_DR15_WRITE 0x003F
+#define VMCB_EXITCODE_EXCP0 0x0040
+#define VMCB_EXITCODE_EXCP1 0x0041
+#define VMCB_EXITCODE_EXCP2 0x0042
+#define VMCB_EXITCODE_EXCP3 0x0043
+#define VMCB_EXITCODE_EXCP4 0x0044
+#define VMCB_EXITCODE_EXCP5 0x0045
+#define VMCB_EXITCODE_EXCP6 0x0046
+#define VMCB_EXITCODE_EXCP7 0x0047
+#define VMCB_EXITCODE_EXCP8 0x0048
+#define VMCB_EXITCODE_EXCP9 0x0049
+#define VMCB_EXITCODE_EXCP10 0x004A
+#define VMCB_EXITCODE_EXCP11 0x004B
+#define VMCB_EXITCODE_EXCP12 0x004C
+#define VMCB_EXITCODE_EXCP13 0x004D
+#define VMCB_EXITCODE_EXCP14 0x004E
+#define VMCB_EXITCODE_EXCP15 0x004F
+#define VMCB_EXITCODE_EXCP16 0x0050
+#define VMCB_EXITCODE_EXCP17 0x0051
+#define VMCB_EXITCODE_EXCP18 0x0052
+#define VMCB_EXITCODE_EXCP19 0x0053
+#define VMCB_EXITCODE_EXCP20 0x0054
+#define VMCB_EXITCODE_EXCP21 0x0055
+#define VMCB_EXITCODE_EXCP22 0x0056
+#define VMCB_EXITCODE_EXCP23 0x0057
+#define VMCB_EXITCODE_EXCP24 0x0058
+#define VMCB_EXITCODE_EXCP25 0x0059
+#define VMCB_EXITCODE_EXCP26 0x005A
+#define VMCB_EXITCODE_EXCP27 0x005B
+#define VMCB_EXITCODE_EXCP28 0x005C
+#define VMCB_EXITCODE_EXCP29 0x005D
+#define VMCB_EXITCODE_EXCP30 0x005E
+#define VMCB_EXITCODE_EXCP31 0x005F
+#define VMCB_EXITCODE_INTR 0x0060
+#define VMCB_EXITCODE_NMI 0x0061
+#define VMCB_EXITCODE_SMI 0x0062
+#define VMCB_EXITCODE_INIT 0x0063
+#define VMCB_EXITCODE_VINTR 0x0064
+#define VMCB_EXITCODE_CR0_SEL_WRITE 0x0065
+#define VMCB_EXITCODE_IDTR_READ 0x0066
+#define VMCB_EXITCODE_GDTR_READ 0x0067
+#define VMCB_EXITCODE_LDTR_READ 0x0068
+#define VMCB_EXITCODE_TR_READ 0x0069
+#define VMCB_EXITCODE_IDTR_WRITE 0x006A
+#define VMCB_EXITCODE_GDTR_WRITE 0x006B
+#define VMCB_EXITCODE_LDTR_WRITE 0x006C
+#define VMCB_EXITCODE_TR_WRITE 0x006D
+#define VMCB_EXITCODE_RDTSC 0x006E
+#define VMCB_EXITCODE_RDPMC 0x006F
+#define VMCB_EXITCODE_PUSHF 0x0070
+#define VMCB_EXITCODE_POPF 0x0071
+#define VMCB_EXITCODE_CPUID 0x0072
+#define VMCB_EXITCODE_RSM 0x0073
+#define VMCB_EXITCODE_IRET 0x0074
+#define VMCB_EXITCODE_SWINT 0x0075
+#define VMCB_EXITCODE_INVD 0x0076
+#define VMCB_EXITCODE_PAUSE 0x0077
+#define VMCB_EXITCODE_HLT 0x0078
+#define VMCB_EXITCODE_INVLPG 0x0079
+#define VMCB_EXITCODE_INVLPGA 0x007A
+#define VMCB_EXITCODE_IOIO 0x007B
+#define VMCB_EXITCODE_MSR 0x007C
+#define VMCB_EXITCODE_TASK_SWITCH 0x007D
+#define VMCB_EXITCODE_FERR_FREEZE 0x007E
+#define VMCB_EXITCODE_SHUTDOWN 0x007F
+#define VMCB_EXITCODE_VMRUN 0x0080
+#define VMCB_EXITCODE_VMMCALL 0x0081
+#define VMCB_EXITCODE_VMLOAD 0x0082
+#define VMCB_EXITCODE_VMSAVE 0x0083
+#define VMCB_EXITCODE_STGI 0x0084
+#define VMCB_EXITCODE_CLGI 0x0085
+#define VMCB_EXITCODE_SKINIT 0x0086
+#define VMCB_EXITCODE_RDTSCP 0x0087
+#define VMCB_EXITCODE_ICEBP 0x0088
+#define VMCB_EXITCODE_WBINVD 0x0089
+#define VMCB_EXITCODE_MONITOR 0x008A
+#define VMCB_EXITCODE_MWAIT 0x008B
+#define VMCB_EXITCODE_MWAIT_CONDITIONAL 0x008C
+#define VMCB_EXITCODE_XSETBV 0x008D
+#define VMCB_EXITCODE_EFER_WRITE_TRAP 0x008F
+#define VMCB_EXITCODE_CR0_WRITE_TRAP 0x0090
+#define VMCB_EXITCODE_CR1_WRITE_TRAP 0x0091
+#define VMCB_EXITCODE_CR2_WRITE_TRAP 0x0092
+#define VMCB_EXITCODE_CR3_WRITE_TRAP 0x0093
+#define VMCB_EXITCODE_CR4_WRITE_TRAP 0x0094
+#define VMCB_EXITCODE_CR5_WRITE_TRAP 0x0095
+#define VMCB_EXITCODE_CR6_WRITE_TRAP 0x0096
+#define VMCB_EXITCODE_CR7_WRITE_TRAP 0x0097
+#define VMCB_EXITCODE_CR8_WRITE_TRAP 0x0098
+#define VMCB_EXITCODE_CR9_WRITE_TRAP 0x0099
+#define VMCB_EXITCODE_CR10_WRITE_TRAP 0x009A
+#define VMCB_EXITCODE_CR11_WRITE_TRAP 0x009B
+#define VMCB_EXITCODE_CR12_WRITE_TRAP 0x009C
+#define VMCB_EXITCODE_CR13_WRITE_TRAP 0x009D
+#define VMCB_EXITCODE_CR14_WRITE_TRAP 0x009E
+#define VMCB_EXITCODE_CR15_WRITE_TRAP 0x009F
+#define VMCB_EXITCODE_NPF 0x0400
+#define VMCB_EXITCODE_AVIC_INCOMP_IPI 0x0401
+#define VMCB_EXITCODE_AVIC_NOACCEL 0x0402
+#define VMCB_EXITCODE_VMGEXIT 0x0403
+#define VMCB_EXITCODE_INVALID -1
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * VMCB control area: the first 1024 bytes of the VMCB (size enforced by the
+ * CTASSERT that follows). The #defines interleaved with the members name the
+ * bits of the member declared right above them.
+ *
+ * Function-like macros parenthesize their argument so that expressions such
+ * as VMCB_CTRL_INTERCEPT_WCR(a ? 0 : 4) expand as intended.
+ */
+struct vmcb_ctrl {
+	uint32_t intercept_cr;
+#define VMCB_CTRL_INTERCEPT_RCR(x)	__BIT( 0 + (x))
+#define VMCB_CTRL_INTERCEPT_WCR(x)	__BIT(16 + (x))
+
+	uint32_t intercept_dr;
+#define VMCB_CTRL_INTERCEPT_RDR(x)	__BIT( 0 + (x))
+#define VMCB_CTRL_INTERCEPT_WDR(x)	__BIT(16 + (x))
+
+	uint32_t intercept_vec;
+#define VMCB_CTRL_INTERCEPT_VEC(x)	__BIT(x)
+
+	uint32_t intercept_misc1;
+#define VMCB_CTRL_INTERCEPT_INTR	__BIT(0)
+#define VMCB_CTRL_INTERCEPT_NMI	__BIT(1)
+#define VMCB_CTRL_INTERCEPT_SMI	__BIT(2)
+#define VMCB_CTRL_INTERCEPT_INIT	__BIT(3)
+#define VMCB_CTRL_INTERCEPT_VINTR	__BIT(4)
+#define VMCB_CTRL_INTERCEPT_CR0_SPEC	__BIT(5)
+#define VMCB_CTRL_INTERCEPT_RIDTR	__BIT(6)
+#define VMCB_CTRL_INTERCEPT_RGDTR	__BIT(7)
+#define VMCB_CTRL_INTERCEPT_RLDTR	__BIT(8)
+#define VMCB_CTRL_INTERCEPT_RTR	__BIT(9)
+#define VMCB_CTRL_INTERCEPT_WIDTR	__BIT(10)
+#define VMCB_CTRL_INTERCEPT_WGDTR	__BIT(11)
+#define VMCB_CTRL_INTERCEPT_WLDTR	__BIT(12)
+#define VMCB_CTRL_INTERCEPT_WTR	__BIT(13)
+#define VMCB_CTRL_INTERCEPT_RDTSC	__BIT(14)
+#define VMCB_CTRL_INTERCEPT_RDPMC	__BIT(15)
+#define VMCB_CTRL_INTERCEPT_PUSHF	__BIT(16)
+#define VMCB_CTRL_INTERCEPT_POPF	__BIT(17)
+#define VMCB_CTRL_INTERCEPT_CPUID	__BIT(18)
+#define VMCB_CTRL_INTERCEPT_RSM	__BIT(19)
+#define VMCB_CTRL_INTERCEPT_IRET	__BIT(20)
+#define VMCB_CTRL_INTERCEPT_INTN	__BIT(21)
+#define VMCB_CTRL_INTERCEPT_INVD	__BIT(22)
+#define VMCB_CTRL_INTERCEPT_PAUSE	__BIT(23)
+#define VMCB_CTRL_INTERCEPT_HLT	__BIT(24)
+#define VMCB_CTRL_INTERCEPT_INVLPG	__BIT(25)
+#define VMCB_CTRL_INTERCEPT_INVLPGA	__BIT(26)
+#define VMCB_CTRL_INTERCEPT_IOIO_PROT	__BIT(27)
+#define VMCB_CTRL_INTERCEPT_MSR_PROT	__BIT(28)
+#define VMCB_CTRL_INTERCEPT_TASKSW	__BIT(29)
+#define VMCB_CTRL_INTERCEPT_FERR_FREEZE	__BIT(30)
+#define VMCB_CTRL_INTERCEPT_SHUTDOWN	__BIT(31)
+
+	uint32_t intercept_misc2;
+#define VMCB_CTRL_INTERCEPT_VMRUN	__BIT(0)
+#define VMCB_CTRL_INTERCEPT_VMMCALL	__BIT(1)
+#define VMCB_CTRL_INTERCEPT_VMLOAD	__BIT(2)
+#define VMCB_CTRL_INTERCEPT_VMSAVE	__BIT(3)
+#define VMCB_CTRL_INTERCEPT_STGI	__BIT(4)
+#define VMCB_CTRL_INTERCEPT_CLGI	__BIT(5)
+#define VMCB_CTRL_INTERCEPT_SKINIT	__BIT(6)
+#define VMCB_CTRL_INTERCEPT_RDTSCP	__BIT(7)
+#define VMCB_CTRL_INTERCEPT_ICEBP	__BIT(8)
+#define VMCB_CTRL_INTERCEPT_WBINVD	__BIT(9)
+#define VMCB_CTRL_INTERCEPT_MONITOR	__BIT(10)
+/*
+ * Bit 11 intercepts MWAIT unconditionally (exit code VMCB_EXITCODE_MWAIT);
+ * bit 12 intercepts it only when the monitor hardware is armed (exit code
+ * VMCB_EXITCODE_MWAIT_CONDITIONAL).
+ */
+#define VMCB_CTRL_INTERCEPT_MWAIT	__BIT(11)
+#define VMCB_CTRL_INTERCEPT_MWAIT_ARMED	__BIT(12)
+#define VMCB_CTRL_INTERCEPT_XSETBV	__BIT(13)
+#define VMCB_CTRL_INTERCEPT_EFER_SPEC	__BIT(15)
+#define VMCB_CTRL_INTERCEPT_WCR_SPEC(x)	__BIT(16 + (x))
+
+	uint8_t rsvd1[40];
+	uint16_t pause_filt_thresh;
+	uint16_t pause_filt_cnt;
+	uint64_t iopm_base_pa;
+	uint64_t msrpm_base_pa;
+	uint64_t tsc_offset;
+	uint32_t guest_asid;
+
+	uint32_t tlb_ctrl;
+#define VMCB_CTRL_TLB_CTRL_FLUSH_ALL	0x01
+#define VMCB_CTRL_TLB_CTRL_FLUSH_GUEST	0x03
+#define VMCB_CTRL_TLB_CTRL_FLUSH_GUEST_NONGLOBAL	0x07
+
+	uint64_t v;
+#define VMCB_CTRL_V_TPR	__BITS(7,0)
+#define VMCB_CTRL_V_IRQ	__BIT(8)
+#define VMCB_CTRL_V_VGIF	__BIT(9)
+#define VMCB_CTRL_V_INTR_PRIO	__BITS(19,16)
+#define VMCB_CTRL_V_IGN_TPR	__BIT(20)
+#define VMCB_CTRL_V_INTR_MASKING	__BIT(24)
+#define VMCB_CTRL_V_GUEST_VGIF	__BIT(25)
+#define VMCB_CTRL_V_AVIC_EN	__BIT(31)
+#define VMCB_CTRL_V_INTR_VECTOR	__BITS(39,32)
+
+	uint64_t intr;
+#define VMCB_CTRL_INTR_SHADOW	__BIT(0)
+#define VMCB_CTRL_GUEST_INTR_MASK	__BIT(1)
+
+	uint64_t exitcode;	/* one of VMCB_EXITCODE_* */
+	uint64_t exitinfo1;
+	uint64_t exitinfo2;
+
+	uint64_t exitintinfo;
+#define VMCB_CTRL_EXITINTINFO_VECTOR	__BITS(7,0)
+#define VMCB_CTRL_EXITINTINFO_TYPE	__BITS(10,8)
+#define VMCB_CTRL_EXITINTINFO_EV	__BIT(11)
+#define VMCB_CTRL_EXITINTINFO_V	__BIT(31)
+#define VMCB_CTRL_EXITINTINFO_ERRORCODE	__BITS(63,32)
+
+	uint64_t enable1;
+#define VMCB_CTRL_ENABLE_NP	__BIT(0)
+#define VMCB_CTRL_ENABLE_SEV	__BIT(1)
+#define VMCB_CTRL_ENABLE_ES_SEV	__BIT(2)
+
+	uint64_t avic;
+#define VMCB_CTRL_AVIC_APIC_BAR	__BITS(51,0)
+
+	uint64_t ghcb;
+
+	uint64_t eventinj;	/* written by svm_vcpu_inject() */
+#define VMCB_CTRL_EVENTINJ_VECTOR	__BITS(7,0)
+#define VMCB_CTRL_EVENTINJ_TYPE	__BITS(10,8)
+#define VMCB_CTRL_EVENTINJ_EV	__BIT(11)
+#define VMCB_CTRL_EVENTINJ_V	__BIT(31)
+#define VMCB_CTRL_EVENTINJ_ERRORCODE	__BITS(63,32)
+
+	uint64_t n_cr3;
+
+	uint64_t enable2;
+#define VMCB_CTRL_ENABLE_LBR	__BIT(0)
+#define VMCB_CTRL_ENABLE_VVMSAVE	__BIT(1)
+
+	uint32_t vmcb_clean;
+#define VMCB_CTRL_VMCB_CLEAN_I	__BIT(0)
+#define VMCB_CTRL_VMCB_CLEAN_IOPM	__BIT(1)
+#define VMCB_CTRL_VMCB_CLEAN_ASID	__BIT(2)
+#define VMCB_CTRL_VMCB_CLEAN_TPR	__BIT(3)
+#define VMCB_CTRL_VMCB_CLEAN_NP	__BIT(4)
+#define VMCB_CTRL_VMCB_CLEAN_CR	__BIT(5)
+#define VMCB_CTRL_VMCB_CLEAN_DR	__BIT(6)
+#define VMCB_CTRL_VMCB_CLEAN_DT	__BIT(7)
+#define VMCB_CTRL_VMCB_CLEAN_SEG	__BIT(8)
+#define VMCB_CTRL_VMCB_CLEAN_CR2	__BIT(9)
+#define VMCB_CTRL_VMCB_CLEAN_LBR	__BIT(10)
+#define VMCB_CTRL_VMCB_CLEAN_AVIC	__BIT(11)
+
+	uint32_t rsvd2;
+	uint64_t nrip;		/* next RIP; svm_exit_cpuid() resumes the guest there */
+	uint8_t inst_len;
+	uint8_t inst_bytes[15];
+	uint8_t pad[800];	/* fills the area up to 1024 bytes */
+} __packed;
+
+CTASSERT(sizeof(struct vmcb_ctrl) == 1024);
+
+/*
+ * Segment register as stored in the VMCB state save area: 16 bytes, enforced
+ * by the CTASSERT that follows. Narrower than struct nvmm_x64_state_seg, whose
+ * members are all 64 bits wide.
+ */
+struct vmcb_segment {
+ uint16_t selector;
+ uint16_t attrib; /* hidden */
+ uint32_t limit; /* hidden */
+ uint64_t base; /* hidden */
+} __packed;
+
+CTASSERT(sizeof(struct vmcb_segment) == 16);
+
+/*
+ * VMCB state save area: the part of the VMCB that follows the control area.
+ * CTASSERTs after the definitions enforce that it is 0xC00 bytes long and
+ * starts at offset 0x400 of struct vmcb. The rsvdN[] and pad[] members are
+ * fillers between the named fields (presumably keeping them at the offsets
+ * the hardware defines -- check against the AMD manual before changing any).
+ *
+ * Of the general purpose registers only rax, rsp, rip and rflags live here;
+ * the CPUID handler reads and writes the guest's RBX/RCX/RDX in
+ * svm_cpudata.state.gprs[] instead.
+ */
+struct vmcb_state {
+ struct vmcb_segment es;
+ struct vmcb_segment cs;
+ struct vmcb_segment ss;
+ struct vmcb_segment ds;
+ struct vmcb_segment fs;
+ struct vmcb_segment gs;
+ struct vmcb_segment gdt;
+ struct vmcb_segment ldt;
+ struct vmcb_segment idt;
+ struct vmcb_segment tr;
+ uint8_t rsvd1[43];
+ uint8_t cpl;
+ uint8_t rsvd2[4];
+ uint64_t efer;
+ uint8_t rsvd3[112];
+ uint64_t cr4;
+ uint64_t cr3;
+ uint64_t cr0;
+ uint64_t dr7;
+ uint64_t dr6;
+ uint64_t rflags;
+ uint64_t rip;
+ uint8_t rsvd4[88];
+ uint64_t rsp;
+ uint8_t rsvd5[24];
+ uint64_t rax;
+ uint64_t star;
+ uint64_t lstar;
+ uint64_t cstar;
+ uint64_t sfmask;
+ uint64_t kernelgsbase;
+ uint64_t sysenter_cs;
+ uint64_t sysenter_esp;
+ uint64_t sysenter_eip;
+ uint64_t cr2;
+ uint8_t rsvd6[32];
+ uint64_t g_pat;
+ uint64_t dbgctl;
+ uint64_t br_from;
+ uint64_t br_to;
+ uint64_t int_from;
+ uint64_t int_to;
+ uint8_t pad[2408];
+} __packed;
+
+CTASSERT(sizeof(struct vmcb_state) == 0xC00);
+
+/*
+ * Complete VMCB: control area followed by state save area. The CTASSERTs
+ * below enforce that it is exactly PAGE_SIZE bytes and that the state area
+ * starts at offset 0x400.
+ */
+struct vmcb {
+ struct vmcb_ctrl ctrl;
+ struct vmcb_state state;
+} __packed;
+
+CTASSERT(sizeof(struct vmcb) == PAGE_SIZE);
+CTASSERT(offsetof(struct vmcb, state) == 0x400);
+
+/* -------------------------------------------------------------------------- */
+
+struct svm_hsave {
+ paddr_t pa;
+};
+
+static struct svm_hsave hsave[MAXCPUS];
+
+static uint8_t *svm_asidmap __read_mostly;
+static uint32_t svm_maxasid __read_mostly;
+static kmutex_t svm_asidlock __cacheline_aligned;
+
+static bool svm_decode_assist __read_mostly;
+static uint32_t svm_ctrl_tlb_flush __read_mostly;
+
+#define SVM_XCR0_MASK_DEFAULT (XCR0_X87|XCR0_SSE)
+static uint64_t svm_xcr0_mask __read_mostly;
+
+#define SVM_NCPUIDS 32
+
+#define VMCB_NPAGES 1
+
+#define MSRBM_NPAGES 2
+#define MSRBM_SIZE (MSRBM_NPAGES * PAGE_SIZE)
+
+#define IOBM_NPAGES 3
+#define IOBM_SIZE (IOBM_NPAGES * PAGE_SIZE)
+
+/* Does not include EFER_LMSLE. */
+#define EFER_VALID \
+ (EFER_SCE|EFER_LME|EFER_LMA|EFER_NXE|EFER_SVME|EFER_FFXSR|EFER_TCE)
+
+#define EFER_TLB_FLUSH \
+ (EFER_NXE|EFER_LMA|EFER_LME)
+#define CR0_TLB_FLUSH \
+ (CR0_PG|CR0_WP|CR0_CD|CR0_NW)
+#define CR4_TLB_FLUSH \
+ (CR4_PGE|CR4_PAE|CR4_PSE)
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Per-machine data of the SVM backend (reached through nvmm_machine.machdata).
+ * Holds up to SVM_NCPUIDS CPUID overrides: cpuid[i] is only meaningful when
+ * cpuidpresent[i] is true. The CPUID exit handler scans the table linearly
+ * and applies the first present entry whose leaf matches.
+ */
+struct svm_machdata {
+ bool cpuidpresent[SVM_NCPUIDS];
+ struct nvmm_x86_conf_cpuid cpuid[SVM_NCPUIDS];
+};
+
+/* Size of each configuration object, indexed by NVMM_X86_CONF_*. */
+static const size_t svm_conf_sizes[NVMM_X86_NCONF] = {
+ [NVMM_X86_CONF_CPUID] = sizeof(struct nvmm_x86_conf_cpuid)
+};
+
+/*
+ * Per-VCPU data of the SVM backend (reached through nvmm_cpu.cpudata).
+ */
+struct svm_cpudata {
+ /* x64-specific */
+ /*
+ * Guest register state kept outside the VMCB. The CPUID handler uses
+ * state.gprs[] for RBX/RCX/RDX, whereas RAX is taken from vmcb->state.
+ */
+ struct nvmm_x64_state state;
+
+ /* General */
+ bool shared_asid;
+ bool tlb_want_flush;
+
+ /* VMCB */
+ struct vmcb *vmcb;
+ paddr_t vmcb_pa;
+
+ /* I/O bitmap */
+ uint8_t *iobm;
+ paddr_t iobm_pa;
+
+ /* MSR bitmap */
+ uint8_t *msrbm;
+ paddr_t msrbm_pa;
+
+ /* Host state */
+ uint64_t xcr0;
+ uint64_t star;
+ uint64_t lstar;
+ uint64_t cstar;
+ uint64_t sfmask;
+ uint64_t cr2;
+ bool ts_set;
+ struct xsave_header hfpu __aligned(16); /* presumably the host FPU save area -- confirm */
+
+ /* Guest state */
+ bool in_nmi; /* set when an NMI is injected; a second NMI gets EAGAIN while set */
+ uint64_t tsc_offset;
+ struct xsave_header gfpu __aligned(16); /* presumably the guest FPU save area -- confirm */
+};
+
+#define SVM_EVENT_TYPE_HW_INT 0
+#define SVM_EVENT_TYPE_NMI 2
+#define SVM_EVENT_TYPE_EXC 3
+#define SVM_EVENT_TYPE_SW_INT 4
+
+/*
+ * Arm the intercept used to wait for the moment an event can be injected.
+ * svm_vcpu_inject() calls this right before returning EAGAIN.
+ *
+ * nmi == true:  intercept IRET.
+ * nmi == false: intercept VINTR and raise V_IRQ. The V_INTR_VECTOR field is
+ *               not modified.
+ */
+static void
+svm_event_waitexit_enable(struct vmcb *vmcb, bool nmi)
+{
+	if (nmi) {
+		vmcb->ctrl.intercept_misc1 |= VMCB_CTRL_INTERCEPT_IRET;
+		return;
+	}
+
+	vmcb->ctrl.intercept_misc1 |= VMCB_CTRL_INTERCEPT_VINTR;
+	vmcb->ctrl.v |= VMCB_CTRL_V_IRQ;
+}
+
+/*
+ * Undo svm_event_waitexit_enable().
+ *
+ * nmi == true:  stop intercepting IRET.
+ * nmi == false: stop intercepting VINTR and clear V_IRQ. The V_INTR_VECTOR
+ *               field is not modified.
+ */
+static void
+svm_event_waitexit_disable(struct vmcb *vmcb, bool nmi)
+{
+	if (nmi) {
+		vmcb->ctrl.intercept_misc1 &= ~VMCB_CTRL_INTERCEPT_IRET;
+		return;
+	}
+
+	vmcb->ctrl.intercept_misc1 &= ~VMCB_CTRL_INTERCEPT_VINTR;
+	vmcb->ctrl.v &= ~VMCB_CTRL_V_IRQ;
+}
+
+/*
+ * Return 1 if the exception 'vector' is one of those listed below as pushing
+ * an error code, 0 for any other value (including values >= 32).
+ */
+static inline int
+svm_event_has_error(uint64_t vector)
+{
+	/* One bit per exception vector that comes with an error code. */
+	const uint32_t with_error =
+	    (UINT32_C(1) << 8) |	/* #DF */
+	    (UINT32_C(1) << 10) |	/* #TS */
+	    (UINT32_C(1) << 11) |	/* #NP */
+	    (UINT32_C(1) << 12) |	/* #SS */
+	    (UINT32_C(1) << 13) |	/* #GP */
+	    (UINT32_C(1) << 14) |	/* #PF */
+	    (UINT32_C(1) << 17) |	/* #AC */
+	    (UINT32_C(1) << 30);	/* #SX */
+
+	/* Range check first: shifting by 32 or more would be undefined. */
+	return vector < 32 && ((with_error >> vector) & 1) != 0;
+}
+
+/*
+ * Queue 'event' for injection into the guest by programming the EVENTINJ
+ * field of the VMCB.
+ *
+ * Returns 0 on success, EINVAL if the event is malformed (vector >= 256,
+ * unknown type, or an exception with vector 2 or >= 32), and EAGAIN if the
+ * event cannot be delivered right now. In the EAGAIN case the matching
+ * wait-exit intercept has been armed with svm_event_waitexit_enable().
+ */
+static int
+svm_vcpu_inject(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_event *event)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct vmcb *vmcb = cpudata->vmcb;
+	int evtype, has_error = 0;
+	uint64_t tpr;
+
+	if (event->vector >= 256)
+		return EINVAL;
+
+	switch (event->type) {
+	case NVMM_EVENT_INTERRUPT_HW:
+		if (event->vector == 2) {
+			/* Hardware interrupt 2 is injected as an NMI. */
+			if (cpudata->in_nmi) {
+				svm_event_waitexit_enable(vmcb, true);
+				return EAGAIN;
+			}
+			cpudata->in_nmi = true;
+			evtype = SVM_EVENT_TYPE_NMI;
+			break;
+		}
+
+		/*
+		 * Regular interrupt: refused while the guest has RFLAGS.IF
+		 * clear, or while its priority does not exceed the TPR.
+		 */
+		tpr = __SHIFTOUT(vmcb->ctrl.v, VMCB_CTRL_V_TPR);
+		if (!(vmcb->state.rflags & PSL_I) || event->u.prio <= tpr) {
+			svm_event_waitexit_enable(vmcb, false);
+			return EAGAIN;
+		}
+		evtype = SVM_EVENT_TYPE_HW_INT;
+		break;
+
+	case NVMM_EVENT_INTERRUPT_SW:
+		evtype = SVM_EVENT_TYPE_SW_INT;
+		break;
+
+	case NVMM_EVENT_EXCEPTION:
+		if (event->vector == 2 || event->vector >= 32)
+			return EINVAL;
+		evtype = SVM_EVENT_TYPE_EXC;
+		has_error = svm_event_has_error(event->vector);
+		break;
+
+	default:
+		return EINVAL;
+	}
+
+	vmcb->ctrl.eventinj =
+	    __SHIFTIN(event->vector, VMCB_CTRL_EVENTINJ_VECTOR) |
+	    __SHIFTIN(evtype, VMCB_CTRL_EVENTINJ_TYPE) |
+	    __SHIFTIN(has_error, VMCB_CTRL_EVENTINJ_EV) |
+	    __SHIFTIN(1, VMCB_CTRL_EVENTINJ_V) |
+	    __SHIFTIN(event->u.error, VMCB_CTRL_EVENTINJ_ERRORCODE);
+
+	return 0;
+}
+
+/*
+ * Inject exception 'vector' with a zero error code into the guest. Shared by
+ * the svm_inject_*() wrappers below. The vectors they pass are always
+ * accepted by svm_vcpu_inject(), hence the KASSERT on its return value.
+ */
+static void
+svm_inject_excp(struct nvmm_machine *mach, struct nvmm_cpu *vcpu, int vector)
+{
+	struct nvmm_event event;
+	int ret __diagused;
+
+	/* Do not hand uninitialized stack bytes to the injection code. */
+	memset(&event, 0, sizeof(event));
+	event.type = NVMM_EVENT_EXCEPTION;
+	event.vector = vector;
+	event.u.error = 0;
+
+	ret = svm_vcpu_inject(mach, vcpu, &event);
+	KASSERT(ret == 0);
+}
+
+/* Inject #UD (vector 6). */
+static void
+svm_inject_ud(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+	svm_inject_excp(mach, vcpu, 6);
+}
+
+/* Inject #DB (vector 1). */
+static void
+svm_inject_db(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+	svm_inject_excp(mach, vcpu, 1);
+}
+
+/* Inject #GP (vector 13) with error code 0. */
+static void
+svm_inject_gp(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+	svm_inject_excp(mach, vcpu, 13);
+}
+
+/*
+ * Overwrite the CPUID results of the leaves the driver controls itself.
+ * Called by svm_exit_cpuid() after the host values and the per-machine
+ * set/del masks have been stored, so what is written here wins.
+ *
+ * eax, ecx: leaf and sub-leaf requested by the guest.
+ * Results go to vmcb->state.rax and to state->gprs[RBX/RCX/RDX].
+ */
+static void
+svm_inkernel_handle_cpuid(struct nvmm_cpu *vcpu, uint64_t eax, uint64_t ecx)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct nvmm_x64_state *state = &cpudata->state;
+
+	switch (eax) {
+	case 0x00000001: /* APIC number in RBX. The rest is tunable. */
+		state->gprs[NVMM_X64_GPR_RBX] &= ~CPUID_LOCAL_APIC_ID;
+		state->gprs[NVMM_X64_GPR_RBX] |= __SHIFTIN(vcpu->cpuid,
+		    CPUID_LOCAL_APIC_ID);
+		break;
+	case 0x0000000D: /* FPU description. Not tunable. */
+		if (ecx != 0 || svm_xcr0_mask == 0) {
+			break;
+		}
+		/* EDX:EAX = XCR0 bits the guest is allowed to set. */
+		cpudata->vmcb->state.rax = svm_xcr0_mask & 0xFFFFFFFF;
+		state->gprs[NVMM_X64_GPR_RDX] = svm_xcr0_mask >> 32;
+
+		/*
+		 * EBX = size of the XSAVE area for the features currently
+		 * enabled in the guest XCR0.
+		 * NOTE(review): the XSAVE legacy region is 512 bytes even
+		 * when only x87 is enabled; the save87 case looks too small
+		 * -- confirm against the CPUID Fn0000_000D documentation.
+		 */
+		if (state->crs[NVMM_X64_CR_XCR0] & XCR0_SSE) {
+			state->gprs[NVMM_X64_GPR_RBX] = sizeof(struct fxsave);
+		} else {
+			state->gprs[NVMM_X64_GPR_RBX] = sizeof(struct save87);
+		}
+		state->gprs[NVMM_X64_GPR_RBX] += 64; /* XSAVE header */
+
+		/*
+		 * ECX = size of the XSAVE area for all supported features.
+		 * Like EBX it is measured from the start of the area, so it
+		 * must include the 64-byte XSAVE header too; otherwise the
+		 * advertised maximum is smaller than the current size.
+		 */
+		state->gprs[NVMM_X64_GPR_RCX] = sizeof(struct fxsave) + 64;
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Handle a CPUID VM exit entirely in the kernel.
+ *
+ * The result is built in three steps: run CPUID on the host, apply the
+ * per-machine override for that leaf if one is configured (clear the 'del'
+ * bits, then set the 'set' bits), and finally let
+ * svm_inkernel_handle_cpuid() overwrite the non-tunable leaves. RAX is
+ * stored in the VMCB, RBX/RCX/RDX in cpudata->state.gprs[].
+ *
+ * The guest RIP is then advanced to ctrl.nrip and exit->reason is set to
+ * NVMM_EXIT_NONE.
+ */
+static void
+svm_exit_cpuid(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_exit *exit)
+{
+	struct svm_machdata *machdata = mach->machdata;
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct nvmm_x64_state *state = &cpudata->state;
+	struct nvmm_x86_conf_cpuid *cpuid;
+	uint32_t eax, ecx;
+	u_int descs[4];
+	size_t i;
+
+	/*
+	 * CPUID only consumes EAX and ECX. Drop the upper halves of the
+	 * guest's RAX/RCX up front: with the 64-bit values, a guest that
+	 * leaves junk in the upper bits would fail every leaf comparison
+	 * below and thereby escape both the configured masks and the
+	 * in-kernel overrides.
+	 */
+	eax = (uint32_t)(cpudata->vmcb->state.rax & 0xFFFFFFFF);
+	ecx = (uint32_t)(state->gprs[NVMM_X64_GPR_RCX] & 0xFFFFFFFF);
+	x86_cpuid2(eax, ecx, descs);
+
+	cpudata->vmcb->state.rax = descs[0];
+	state->gprs[NVMM_X64_GPR_RBX] = descs[1];
+	state->gprs[NVMM_X64_GPR_RCX] = descs[2];
+	state->gprs[NVMM_X64_GPR_RDX] = descs[3];
+
+	/* Apply the first configured override matching this leaf, if any. */
+	for (i = 0; i < SVM_NCPUIDS; i++) {
+		cpuid = &machdata->cpuid[i];
+		if (!machdata->cpuidpresent[i]) {
+			continue;
+		}
+		if (cpuid->leaf != eax) {
+			continue;
+		}
+
+		/* del */
+		cpudata->vmcb->state.rax &= ~cpuid->del.eax;
+		state->gprs[NVMM_X64_GPR_RBX] &= ~cpuid->del.ebx;
+		state->gprs[NVMM_X64_GPR_RCX] &= ~cpuid->del.ecx;
+		state->gprs[NVMM_X64_GPR_RDX] &= ~cpuid->del.edx;
+
+		/* set */
+		cpudata->vmcb->state.rax |= cpuid->set.eax;
+		state->gprs[NVMM_X64_GPR_RBX] |= cpuid->set.ebx;
+		state->gprs[NVMM_X64_GPR_RCX] |= cpuid->set.ecx;
+		state->gprs[NVMM_X64_GPR_RDX] |= cpuid->set.edx;
+
+		break;
+	}
+
+	/* Overwrite non-tunable leaves. */
+	svm_inkernel_handle_cpuid(vcpu, eax, ecx);
+
+	/* For now we omit DBREGS. */
+	if (__predict_false(cpudata->vmcb->state.rflags & PSL_T)) {
+		svm_inject_db(mach, vcpu);
+	}
+
+	cpudata->vmcb->state.rip = cpudata->vmcb->ctrl.nrip;
+	exit->reason = NVMM_EXIT_NONE;
+}
+
+/*
+ * Bit fields of the IOIO exit information, as decoded by svm_exit_io() from
+ * vmcb->ctrl.exitinfo1: port number, segment, address size (A64/A32/A16),
+ * operand size (SZ32/SZ16/SZ8), REP prefix, string instruction, and
+ * direction (TYPE set means IN).
+ */
+#define SVM_EXIT_IO_PORT __BITS(31,16)
+#define SVM_EXIT_IO_SEG __BITS(12,10)
+#define SVM_EXIT_IO_A64 __BIT(9)
+#define SVM_EXIT_IO_A32 __BIT(8)
+#define SVM_EXIT_IO_A16 __BIT(7)
+#define SVM_EXIT_IO_SZ32 __BIT(6)
+#define SVM_EXIT_IO_SZ16 __BIT(5)
+#define SVM_EXIT_IO_SZ8 __BIT(4)
+#define SVM_EXIT_IO_REP __BIT(3)
+#define SVM_EXIT_IO_STR __BIT(2)
+#define SVM_EXIT_IO_TYPE __BIT(0)
+
+/*
+ * Translation from the value of the SVM_EXIT_IO_SEG field (0..5) to the
+ * corresponding NVMM segment index. Indexed by svm_exit_io().
+ */
+static const int seg_to_nvmm[] = {
+ [0] = NVMM_X64_SEG_ES,
+ [1] = NVMM_X64_SEG_CS,
+ [2] = NVMM_X64_SEG_SS,
+ [3] = NVMM_X64_SEG_DS,
+ [4] = NVMM_X64_SEG_FS,
+ [5] = NVMM_X64_SEG_GS
+};
+
+/*
+ * Decode an IOIO #VMEXIT into an NVMM_EXIT_IO event for userland.
+ *
+ * Everything comes from exitinfo1 (direction, port, segment, sizes, REP and
+ * string flags); exitinfo2 is reported as the next program counter.
+ */
+static void
+svm_exit_io(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_exit *exit)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	const uint64_t ioinfo = cpudata->vmcb->ctrl.exitinfo1;
+	const bool is_in = (ioinfo & SVM_EXIT_IO_TYPE) != 0;
+
+	exit->reason = NVMM_EXIT_IO;
+	exit->u.io.type = is_in ? NVMM_EXIT_IO_IN : NVMM_EXIT_IO_OUT;
+	exit->u.io.port = __SHIFTOUT(ioinfo, SVM_EXIT_IO_PORT);
+
+	if (!svm_decode_assist) {
+		/* No decode assist: default to ES for IN, DS for OUT. */
+		exit->u.io.seg = is_in ? NVMM_X64_SEG_ES : NVMM_X64_SEG_DS;
+	} else {
+		KASSERT(__SHIFTOUT(ioinfo, SVM_EXIT_IO_SEG) < 6);
+		exit->u.io.seg =
+		    seg_to_nvmm[__SHIFTOUT(ioinfo, SVM_EXIT_IO_SEG)];
+	}
+
+	/* Address size, largest first. Left untouched if no bit is set. */
+	if (ioinfo & SVM_EXIT_IO_A64)
+		exit->u.io.address_size = 8;
+	else if (ioinfo & SVM_EXIT_IO_A32)
+		exit->u.io.address_size = 4;
+	else if (ioinfo & SVM_EXIT_IO_A16)
+		exit->u.io.address_size = 2;
+
+	/* Operand size, largest first. Left untouched if no bit is set. */
+	if (ioinfo & SVM_EXIT_IO_SZ32)
+		exit->u.io.operand_size = 4;
+	else if (ioinfo & SVM_EXIT_IO_SZ16)
+		exit->u.io.operand_size = 2;
+	else if (ioinfo & SVM_EXIT_IO_SZ8)
+		exit->u.io.operand_size = 1;
+
+	exit->u.io.rep = (ioinfo & SVM_EXIT_IO_REP) != 0;
+	exit->u.io.str = (ioinfo & SVM_EXIT_IO_STR) != 0;
+	exit->u.io.npc = cpudata->vmcb->ctrl.exitinfo2;
+}
+
+/*
+ * Try to emulate an intercepted RDMSR/WRMSR in the kernel.
+ *
+ * Handled here: reads and writes of PAT, and writes of EFER. Returns true
+ * if the access was consumed (either emulated, with RIP advanced, or
+ * refused by injecting #GP, with RIP left on the faulting instruction).
+ * Returns false if the access must be forwarded to userland.
+ */
+static bool
+svm_inkernel_handle_msr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_exit *exit)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct nvmm_x64_state *state = &cpudata->state;
+	struct vmcb *vmcb = cpudata->vmcb;
+	uint64_t pat;
+
+	switch (exit->u.msr.type) {
+	case NVMM_EXIT_MSR_RDMSR:
+		if (exit->u.msr.msr == MSR_CR_PAT) {
+			pat = vmcb->state.g_pat;
+			/*
+			 * Guest RAX lives in the VMCB, not in the GPR
+			 * array; the latter is only synced on get/setstate.
+			 */
+			vmcb->state.rax = (pat & 0xFFFFFFFF);
+			state->gprs[NVMM_X64_GPR_RDX] = (pat >> 32);
+			goto handled;
+		}
+		break;
+	case NVMM_EXIT_MSR_WRMSR:
+		if (exit->u.msr.msr == MSR_EFER) {
+			if (__predict_false(exit->u.msr.val & ~EFER_VALID)) {
+				/*
+				 * Faults are reported with RIP pointing to
+				 * the faulting instruction: do not advance.
+				 */
+				svm_inject_gp(mach, vcpu);
+				return true;
+			}
+			if ((vmcb->state.efer ^ exit->u.msr.val) &
+			    EFER_TLB_FLUSH) {
+				cpudata->tlb_want_flush = true;
+			}
+			/* EFER.SVME must stay set while running the guest. */
+			vmcb->state.efer = exit->u.msr.val | EFER_SVME;
+			goto handled;
+		}
+		if (exit->u.msr.msr == MSR_CR_PAT) {
+			vmcb->state.g_pat = exit->u.msr.val;
+			goto handled;
+		}
+		break;
+	}
+
+	return false;
+
+handled:
+	/* Skip the emulated instruction. */
+	vmcb->state.rip = vmcb->ctrl.nrip;
+	return true;
+}
+
+/*
+ * Handle an MSR #VMEXIT. exitinfo1 tells the direction (0 = RDMSR, anything
+ * else is reported as WRMSR). The access is first offered to the in-kernel
+ * handler; if not consumed there it is forwarded to userland as
+ * NVMM_EXIT_MSR.
+ */
+static void
+svm_exit_msr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_exit *exit)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct nvmm_x64_state *state = &cpudata->state;
+	const uint64_t direction = cpudata->vmcb->ctrl.exitinfo1;
+	uint64_t hi, lo;
+
+	exit->u.msr.type = (direction == 0) ?
+	    NVMM_EXIT_MSR_RDMSR : NVMM_EXIT_MSR_WRMSR;
+	exit->u.msr.msr = state->gprs[NVMM_X64_GPR_RCX];
+
+	/* The written value is EDX:EAX; guest RAX is held in the VMCB. */
+	if (direction != 1) {
+		exit->u.msr.val = 0;
+	} else {
+		hi = state->gprs[NVMM_X64_GPR_RDX];
+		lo = cpudata->vmcb->state.rax;
+		exit->u.msr.val = (hi << 32) | (lo & 0xFFFFFFFF);
+	}
+
+	if (!svm_inkernel_handle_msr(mach, vcpu, exit)) {
+		exit->reason = NVMM_EXIT_MSR;
+		exit->u.msr.npc = cpudata->vmcb->ctrl.nrip;
+		return;
+	}
+
+	exit->reason = NVMM_EXIT_NONE;
+}
+
+/*
+ * Handle a nested page fault. The faulting guest-physical address
+ * (exitinfo2) is first submitted to uvm_fault() on the machine's map; if
+ * that succeeds the guest simply resumes. Otherwise the fault is reported
+ * to userland as NVMM_EXIT_MEMORY, with the access type derived from
+ * exitinfo1 and the instruction bytes copied from the VMCB.
+ */
+static void
+svm_exit_npf(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_exit *exit)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct vmcb *vmcb = cpudata->vmcb;
+	const gpaddr_t gpa = vmcb->ctrl.exitinfo2;
+
+	if (uvm_fault(&mach->vm->vm_map, gpa, VM_PROT_ALL) == 0) {
+		exit->reason = NVMM_EXIT_NONE;
+		return;
+	}
+
+	exit->reason = NVMM_EXIT_MEMORY;
+
+	/* Write takes precedence over exec, which takes precedence over read. */
+	if (vmcb->ctrl.exitinfo1 & PGEX_W) {
+		exit->u.mem.perm = NVMM_EXIT_MEMORY_WRITE;
+	} else if (vmcb->ctrl.exitinfo1 & PGEX_X) {
+		exit->u.mem.perm = NVMM_EXIT_MEMORY_EXEC;
+	} else {
+		exit->u.mem.perm = NVMM_EXIT_MEMORY_READ;
+	}
+
+	exit->u.mem.gpa = gpa;
+	exit->u.mem.inst_len = vmcb->ctrl.inst_len;
+	memcpy(exit->u.mem.inst_bytes, vmcb->ctrl.inst_bytes,
+	    sizeof(exit->u.mem.inst_bytes));
+	exit->u.mem.npc = vmcb->ctrl.nrip;
+}
+
+/*
+ * Emulate an intercepted XSETBV.
+ *
+ * The new XCR0 value is EDX:EAX. It is accepted only if ECX selects XCR0,
+ * the guest runs at CPL 0, no bit outside svm_xcr0_mask is set and the
+ * mandatory X87 bit is set; otherwise #GP is injected. On success the value
+ * is stored in the cached guest state and the instruction is skipped. The
+ * exit is always handled in the kernel (NVMM_EXIT_NONE).
+ */
+static void
+svm_exit_xsetbv(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_exit *exit)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct nvmm_x64_state *state = &cpudata->state;
+	struct vmcb *vmcb = cpudata->vmcb;
+	uint64_t val;
+
+	exit->reason = NVMM_EXIT_NONE;
+
+	/*
+	 * Guest RAX lives in the VMCB; the GPR array entry is only synced
+	 * on get/setstate and would be stale here.
+	 */
+	val = (state->gprs[NVMM_X64_GPR_RDX] << 32) |
+	    (vmcb->state.rax & 0xFFFFFFFF);
+
+	if (__predict_false(state->gprs[NVMM_X64_GPR_RCX] != 0)) {
+		goto error;
+	} else if (__predict_false(vmcb->state.cpl != 0)) {
+		goto error;
+	} else if (__predict_false((val & ~svm_xcr0_mask) != 0)) {
+		goto error;
+	} else if (__predict_false((val & XCR0_X87) == 0)) {
+		goto error;
+	}
+
+	state->crs[NVMM_X64_CR_XCR0] = val;
+
+	/*
+	 * Skip the emulated instruction, as the CPUID and MSR handlers do.
+	 * Without this the guest would re-execute XSETBV forever.
+	 */
+	vmcb->state.rip = vmcb->ctrl.nrip;
+
+	return;
+
+error:
+	/* RIP is left on the faulting instruction. */
+	svm_inject_gp(mach, vcpu);
+}
+
+/*
+ * Set the VMCB clean field to its default value: the I, IOPM, ASID, LBR
+ * and AVIC bits are set, every other bit is cleared. Called by
+ * svm_vcpu_run() after each VMRUN.
+ * NOTE(review): presumably the set bits tell the CPU it may reuse its
+ * cached copy of the corresponding VMCB areas -- confirm against the AMD
+ * manual.
+ */
+static void
+svm_vmcb_cache_default(struct vmcb *vmcb)
+{
+ vmcb->ctrl.vmcb_clean =
+ VMCB_CTRL_VMCB_CLEAN_I |
+ VMCB_CTRL_VMCB_CLEAN_IOPM |
+ VMCB_CTRL_VMCB_CLEAN_ASID |
+ VMCB_CTRL_VMCB_CLEAN_LBR |
+ VMCB_CTRL_VMCB_CLEAN_AVIC;
+}
+
+/*
+ * Clear every bit of the VMCB clean field. Called by svm_vcpu_run() when
+ * the VCPU is about to run on a different host CPU than last time.
+ */
+static void
+svm_vmcb_cache_flush(struct vmcb *vmcb)
+{
+ vmcb->ctrl.vmcb_clean = 0;
+}
+
+/*
+ * Switch the FPU context from host to guest, right before VMRUN.
+ *
+ * When XSAVE features are available, the current XCR0 is saved in
+ * cpudata->xcr0 and the guest's XCR0 is loaded. The state of CR0.TS is
+ * recorded in cpudata->ts_set, then the current FPU area is saved into
+ * cpudata->hfpu and the one in cpudata->gfpu is restored.
+ */
+static void
+svm_vcpu_guest_fpu_enter(struct nvmm_cpu *vcpu)
+{
+ struct svm_cpudata *cpudata = vcpu->cpudata;
+
+ /* Swap XCR0 first, so that the FPU save/restore below runs with the guest's XCR0. */
+ if (x86_xsave_features != 0) {
+ cpudata->xcr0 = rdxcr(0);
+ wrxcr(0, cpudata->state.crs[NVMM_X64_CR_XCR0]);
+ }
+
+ /* Remember CR0.TS; svm_vcpu_guest_fpu_leave() sets it again if needed. */
+ cpudata->ts_set = (rcr0() & CR0_TS) != 0;
+
+ fpu_area_save(&cpudata->hfpu);
+ fpu_area_restore(&cpudata->gfpu);
+}
+
+/*
+ * Switch the FPU context from guest back to host, right after VMRUN.
+ * Mirror of svm_vcpu_guest_fpu_enter(): the FPU area is saved into
+ * cpudata->gfpu and cpudata->hfpu is restored, CR0.TS is set again if it
+ * was set on entry, and XCR0 is swapped back (the current value is stored
+ * as the guest's XCR0).
+ */
+static void
+svm_vcpu_guest_fpu_leave(struct nvmm_cpu *vcpu)
+{
+ struct svm_cpudata *cpudata = vcpu->cpudata;
+
+ /* Done while XCR0 still holds the guest value. */
+ fpu_area_save(&cpudata->gfpu);
+ fpu_area_restore(&cpudata->hfpu);
+
+ if (cpudata->ts_set) {
+ stts();
+ }
+
+ if (x86_xsave_features != 0) {
+ cpudata->state.crs[NVMM_X64_CR_XCR0] = rdxcr(0);
+ wrxcr(0, cpudata->xcr0);
+ }
+}
+
+/*
+ * Save the debug registers of the current LWP with x86_dbregs_save(), then
+ * load DR0-DR3 from the cached guest state. DR6 and DR7 are not loaded
+ * here; svm_vcpu_setstate() stores them in the VMCB state area instead.
+ */
+static void
+svm_vcpu_guest_dbregs_enter(struct nvmm_cpu *vcpu)
+{
+ struct svm_cpudata *cpudata = vcpu->cpudata;
+ struct nvmm_x64_state *state = &cpudata->state;
+
+ x86_dbregs_save(curlwp);
+
+ ldr0(state->drs[NVMM_X64_DR_DR0]);
+ ldr1(state->drs[NVMM_X64_DR_DR1]);
+ ldr2(state->drs[NVMM_X64_DR_DR2]);
+ ldr3(state->drs[NVMM_X64_DR_DR3]);
+}
+
+/*
+ * Mirror of svm_vcpu_guest_dbregs_enter(): read DR0-DR3 back into the
+ * cached guest state, then restore the debug registers of the current LWP
+ * with x86_dbregs_restore().
+ */
+static void
+svm_vcpu_guest_dbregs_leave(struct nvmm_cpu *vcpu)
+{
+ struct svm_cpudata *cpudata = vcpu->cpudata;
+ struct nvmm_x64_state *state = &cpudata->state;
+
+ /* Capture the guest values before the host ones are put back. */
+ state->drs[NVMM_X64_DR_DR0] = rdr0();
+ state->drs[NVMM_X64_DR_DR1] = rdr1();
+ state->drs[NVMM_X64_DR_DR2] = rdr2();
+ state->drs[NVMM_X64_DR_DR3] = rdr3();
+
+ x86_dbregs_restore(curlwp);
+}
+
+/*
+ * Save the host values of the STAR, LSTAR, CSTAR and SFMASK MSRs and of
+ * CR2 into cpudata, so that svm_vcpu_guest_misc_leave() can restore them
+ * once the guest has stopped running.
+ */
+static void
+svm_vcpu_guest_misc_enter(struct nvmm_cpu *vcpu)
+{
+ struct svm_cpudata *cpudata = vcpu->cpudata;
+
+ /* Save the fixed Host MSRs. */
+ cpudata->star = rdmsr(MSR_STAR);
+ cpudata->lstar = rdmsr(MSR_LSTAR);
+ cpudata->cstar = rdmsr(MSR_CSTAR);
+ cpudata->sfmask = rdmsr(MSR_SFMASK);
+
+ /* Save the Host CR2. */
+ cpudata->cr2 = rcr2();
+}
+
+/*
+ * Restore the host STAR, LSTAR, CSTAR and SFMASK MSRs and the host CR2
+ * from the values saved by svm_vcpu_guest_misc_enter().
+ */
+static void
+svm_vcpu_guest_misc_leave(struct nvmm_cpu *vcpu)
+{
+ struct svm_cpudata *cpudata = vcpu->cpudata;
+
+ /* Restore the fixed Host MSRs. */
+ wrmsr(MSR_STAR, cpudata->star);
+ wrmsr(MSR_LSTAR, cpudata->lstar);
+ wrmsr(MSR_CSTAR, cpudata->cstar);
+ wrmsr(MSR_SFMASK, cpudata->sfmask);
+
+ /* Restore the Host CR2. */
+ lcr2(cpudata->cr2);
+}
+
+/*
+ * Run the guest VCPU until an event must be reported to userland (or the
+ * scheduler wants the host CPU back).
+ *
+ * Preemption is disabled for the whole run. Each iteration executes one
+ * VMRUN at splhigh with the guest FPU context loaded, then dispatches on
+ * the VMCB exit code. Exits that are fully handled in the kernel set
+ * exit->reason to NVMM_EXIT_NONE and the loop continues.
+ *
+ * Always returns 0; the outcome is described by *exit. The current virtual
+ * TPR is reported in exit->exitstate[NVMM_X64_EXITSTATE_CR8].
+ */
+static int
+svm_vcpu_run(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_exit *exit)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct vmcb *vmcb = cpudata->vmcb;
+	int hcpu, s;
+
+	kpreempt_disable();
+	hcpu = cpu_number();
+
+	/* Host CPU changed: fix up the TSC offset, drop the VMCB cache. */
+	if (vcpu->hcpu_last != hcpu) {
+		vmcb->ctrl.tsc_offset = cpudata->tsc_offset +
+		    curcpu()->ci_data.cpu_cc_skew;
+		svm_vmcb_cache_flush(vmcb);
+	}
+
+	svm_vcpu_guest_dbregs_enter(vcpu);
+	svm_vcpu_guest_misc_enter(vcpu);
+
+	while (1) {
+		/*
+		 * Decide on the TLB command before every VMRUN, not just
+		 * once before the loop: an exit handled in the kernel (e.g.
+		 * a WRMSR to EFER) may have requested a flush that must be
+		 * applied on the very next entry. A flush is also needed
+		 * when the ASID is shared, or when we have not yet run on
+		 * this host CPU.
+		 */
+		if (cpudata->tlb_want_flush || cpudata->shared_asid ||
+		    vcpu->hcpu_last != hcpu) {
+			vmcb->ctrl.tlb_ctrl = svm_ctrl_tlb_flush;
+		} else {
+			vmcb->ctrl.tlb_ctrl = 0;
+		}
+
+		s = splhigh();
+		svm_vcpu_guest_fpu_enter(vcpu);
+		svm_vmrun(cpudata->vmcb_pa, cpudata->state.gprs);
+		svm_vcpu_guest_fpu_leave(vcpu);
+		splx(s);
+
+		svm_vmcb_cache_default(vmcb);
+
+		/*
+		 * The guest was actually entered: the flush (if any) took
+		 * place, and this is now the last host CPU we ran on.
+		 */
+		if (vmcb->ctrl.exitcode != VMCB_EXITCODE_INVALID) {
+			if (cpudata->tlb_want_flush) {
+				cpudata->tlb_want_flush = false;
+			}
+			vcpu->hcpu_last = hcpu;
+		}
+
+		switch (vmcb->ctrl.exitcode) {
+		case VMCB_EXITCODE_INTR:
+		case VMCB_EXITCODE_NMI:
+			exit->reason = NVMM_EXIT_NONE;
+			break;
+		case VMCB_EXITCODE_VINTR:
+			svm_event_waitexit_disable(vmcb, false);
+			exit->reason = NVMM_EXIT_INT_READY;
+			break;
+		case VMCB_EXITCODE_IRET:
+			svm_event_waitexit_disable(vmcb, true);
+			cpudata->in_nmi = false;
+			exit->reason = NVMM_EXIT_NMI_READY;
+			break;
+		case VMCB_EXITCODE_CPUID:
+			svm_exit_cpuid(mach, vcpu, exit);
+			break;
+		case VMCB_EXITCODE_HLT:
+			exit->reason = NVMM_EXIT_HLT;
+			break;
+		case VMCB_EXITCODE_IOIO:
+			svm_exit_io(mach, vcpu, exit);
+			break;
+		case VMCB_EXITCODE_MSR:
+			svm_exit_msr(mach, vcpu, exit);
+			break;
+		case VMCB_EXITCODE_SHUTDOWN:
+			exit->reason = NVMM_EXIT_SHUTDOWN;
+			break;
+		/* Instructions the guest is not allowed to use: #UD. */
+		case VMCB_EXITCODE_RDPMC:
+		case VMCB_EXITCODE_RSM:
+		case VMCB_EXITCODE_INVLPGA:
+		case VMCB_EXITCODE_VMRUN:
+		case VMCB_EXITCODE_VMMCALL:
+		case VMCB_EXITCODE_VMLOAD:
+		case VMCB_EXITCODE_VMSAVE:
+		case VMCB_EXITCODE_STGI:
+		case VMCB_EXITCODE_CLGI:
+		case VMCB_EXITCODE_SKINIT:
+		case VMCB_EXITCODE_RDTSCP:
+			svm_inject_ud(mach, vcpu);
+			exit->reason = NVMM_EXIT_NONE;
+			break;
+		case VMCB_EXITCODE_MONITOR:
+			exit->reason = NVMM_EXIT_MONITOR;
+			break;
+		case VMCB_EXITCODE_MWAIT:
+			exit->reason = NVMM_EXIT_MWAIT;
+			break;
+		case VMCB_EXITCODE_MWAIT_CONDITIONAL:
+			exit->reason = NVMM_EXIT_MWAIT_COND;
+			break;
+		case VMCB_EXITCODE_XSETBV:
+			svm_exit_xsetbv(mach, vcpu, exit);
+			break;
+		case VMCB_EXITCODE_NPF:
+			svm_exit_npf(mach, vcpu, exit);
+			break;
+		case VMCB_EXITCODE_FERR_FREEZE: /* ? */
+		default:
+			exit->reason = NVMM_EXIT_INVALID;
+			break;
+		}
+
+		/* If no reason to return to userland, keep rolling. */
+		if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) {
+			break;
+		}
+		if (exit->reason != NVMM_EXIT_NONE) {
+			break;
+		}
+	}
+
+	svm_vcpu_guest_misc_leave(vcpu);
+	svm_vcpu_guest_dbregs_leave(vcpu);
+
+	kpreempt_enable();
+
+	exit->exitstate[NVMM_X64_EXITSTATE_CR8] = __SHIFTOUT(vmcb->ctrl.v,
+	    VMCB_CTRL_V_TPR);
+
+	return 0;
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Allocate npages of zeroed, physically contiguous, page-aligned memory and
+ * map it read/write in the kernel.
+ *
+ * On success returns 0 and stores the physical address in *pa and the
+ * kernel virtual address in *va. On failure returns ENOMEM, leaves *pa and
+ * *va untouched and releases whatever was allocated.
+ */
+static int
+svm_memalloc(paddr_t *pa, vaddr_t *va, size_t npages)
+{
+	struct pglist pglist;
+	paddr_t _pa;
+	vaddr_t _va;
+	size_t i;
+	int ret;
+
+	/* A single segment is requested, so the pages are contiguous. */
+	ret = uvm_pglistalloc(npages * PAGE_SIZE, 0, ~0UL, PAGE_SIZE, 0,
+	    &pglist, 1, 0);
+	if (ret != 0)
+		return ENOMEM;
+	_pa = TAILQ_FIRST(&pglist)->phys_addr;
+	_va = uvm_km_alloc(kernel_map, npages * PAGE_SIZE, 0,
+	    UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
+	if (_va == 0)
+		goto error;
+
+	for (i = 0; i < npages; i++) {
+		pmap_kenter_pa(_va + i * PAGE_SIZE, _pa + i * PAGE_SIZE,
+		    VM_PROT_READ | VM_PROT_WRITE, PMAP_WRITE_BACK);
+	}
+	/* Make the new mappings effective before touching the memory. */
+	pmap_update(pmap_kernel());
+
+	memset((void *)_va, 0, npages * PAGE_SIZE);
+
+	*pa = _pa;
+	*va = _va;
+	return 0;
+
+error:
+	for (i = 0; i < npages; i++) {
+		uvm_pagefree(PHYS_TO_VM_PAGE(_pa + i * PAGE_SIZE));
+	}
+	return ENOMEM;
+}
+
+/*
+ * Release memory obtained from svm_memalloc(): remove the kernel mappings,
+ * give the virtual range back to kernel_map, then free each physical page.
+ */
+static void
+svm_memfree(paddr_t pa, vaddr_t va, size_t npages)
+{
+	const size_t len = npages * PAGE_SIZE;
+	size_t off;
+
+	pmap_kremove(va, len);
+	pmap_update(pmap_kernel());
+	uvm_km_free(kernel_map, va, len, UVM_KMF_VAONLY);
+
+	for (off = 0; off < len; off += PAGE_SIZE) {
+		uvm_pagefree(PHYS_TO_VM_PAGE(pa + off));
+	}
+}
+
+/* -------------------------------------------------------------------------- */
+
+#define SVM_MSRBM_READ __BIT(0)
+#define SVM_MSRBM_WRITE __BIT(1)
+
+/*
+ * Stop intercepting reads and/or writes of the given MSR by clearing its
+ * bits in the MSR permission bitmap.
+ *
+ * Each MSR owns two consecutive bits (read, then write), i.e. four MSRs per
+ * byte. Three MSR ranges are supported, each mapped at its own byte offset
+ * in the bitmap. Panics if the MSR is outside these ranges.
+ */
+static void
+svm_vcpu_msr_allow(uint8_t *bitmap, uint64_t msr, bool read, bool write)
+{
+	uint64_t first, slot;
+	uint8_t shift;
+
+	if (msr < 0x00002000) {
+		/* Range 1 */
+		first = 0x00000000;
+		slot = 0x0000;
+	} else if (msr >= 0xC0000000 && msr < 0xC0002000) {
+		/* Range 2 */
+		first = 0xC0000000;
+		slot = 0x0800;
+	} else if (msr >= 0xC0010000 && msr < 0xC0012000) {
+		/* Range 3 */
+		first = 0xC0010000;
+		slot = 0x1000;
+	} else {
+		panic("%s: wrong range", __func__);
+	}
+
+	/* Four MSRs per byte, two bits per MSR. */
+	slot += (msr - first) >> 2UL;
+	shift = (msr & 0x3) << 1;
+
+	if (read) {
+		bitmap[slot] &= ~(SVM_MSRBM_READ << shift);
+	}
+	if (write) {
+		bitmap[slot] &= ~(SVM_MSRBM_WRITE << shift);
+	}
+}
+
+/*
+ * Pick an ASID for the VCPU and store it in the VMCB.
+ *
+ * The first free entry of the global ASID bitmap is taken. If every ASID is
+ * in use, the VCPU falls back to the last ASID (svm_maxasid - 1) and is
+ * flagged as sharing it. The bitmap is protected by svm_asidlock.
+ */
+static void
+svm_asid_alloc(struct nvmm_cpu *vcpu)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct vmcb *vmcb = cpudata->vmcb;
+	size_t asid;
+
+	mutex_enter(&svm_asidlock);
+
+	for (asid = 0; asid < svm_maxasid; asid++) {
+		if ((svm_asidmap[asid / 8] & __BIT(asid % 8)) == 0) {
+			svm_asidmap[asid / 8] |= __BIT(asid % 8);
+			vmcb->ctrl.guest_asid = asid;
+			goto out;
+		}
+	}
+
+	/*
+	 * No free ASID. Use the last one, which is shared and requires
+	 * special TLB handling.
+	 */
+	cpudata->shared_asid = true;
+	vmcb->ctrl.guest_asid = svm_maxasid - 1;
+
+out:
+	mutex_exit(&svm_asidlock);
+}
+
+/*
+ * Give the VCPU's ASID back to the global bitmap. A VCPU running on the
+ * shared ASID owns nothing, so there is nothing to release in that case.
+ */
+static void
+svm_asid_free(struct nvmm_cpu *vcpu)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct vmcb *vmcb = cpudata->vmcb;
+	size_t asid;
+
+	if (!cpudata->shared_asid) {
+		asid = vmcb->ctrl.guest_asid;
+
+		mutex_enter(&svm_asidlock);
+		svm_asidmap[asid / 8] &= ~__BIT(asid % 8);
+		mutex_exit(&svm_asidlock);
+	}
+}
+
+/*
+ * Initialize a freshly allocated VCPU: program the VMCB intercepts, fill
+ * the I/O and MSR permission bitmaps, allocate an ASID, enable nested
+ * paging on the machine's pmap, and set up the initial EFER, XSAVE header
+ * and TSC offset. Expects cpudata->vmcb, iobm and msrbm (and their physical
+ * addresses) to be already allocated, as done by svm_vcpu_create().
+ */
+static void
+svm_vcpu_init(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+ struct svm_cpudata *cpudata = vcpu->cpudata;
+ struct vmcb *vmcb = cpudata->vmcb;
+
+ /* Allow reads/writes of Control Registers. */
+ vmcb->ctrl.intercept_cr = 0;
+
+ /* Allow reads/writes of Debug Registers. */
+ vmcb->ctrl.intercept_dr = 0;
+
+ /* Allow exceptions 0 to 31. */
+ vmcb->ctrl.intercept_vec = 0;
+
+ /*
+ * Allow:
+ * - SMI [smm interrupts]
+ * - VINTR [virtual interrupts]
+ * - CR0_SPEC [CR0 writes changing other fields than CR0.TS or CR0.MP]
+ * - RIDTR [reads of IDTR]
+ * - RGDTR [reads of GDTR]
+ * - RLDTR [reads of LDTR]
+ * - RTR [reads of TR]
+ * - WIDTR [writes of IDTR]
+ * - WGDTR [writes of GDTR]
+ * - WLDTR [writes of LDTR]
+ * - WTR [writes of TR]
+ * - RDTSC [rdtsc instruction]
+ * - PUSHF [pushf instruction]
+ * - POPF [popf instruction]
+ * - IRET [iret instruction]
+ * - INTN [int $n instructions]
+ * - INVD [invd instruction]
+ * - PAUSE [pause instruction]
+ * - INVLPG [invlpg instruction]
+ * - TASKSW [task switches]
+ *
+ * Intercept the rest below.
+ */
+ vmcb->ctrl.intercept_misc1 =
+ VMCB_CTRL_INTERCEPT_INTR |
+ VMCB_CTRL_INTERCEPT_NMI |
+ VMCB_CTRL_INTERCEPT_INIT |
+ VMCB_CTRL_INTERCEPT_RDPMC |
+ VMCB_CTRL_INTERCEPT_CPUID |
+ VMCB_CTRL_INTERCEPT_RSM |
+ VMCB_CTRL_INTERCEPT_HLT |
+ VMCB_CTRL_INTERCEPT_INVLPGA |
+ VMCB_CTRL_INTERCEPT_IOIO_PROT |
+ VMCB_CTRL_INTERCEPT_MSR_PROT |
+ VMCB_CTRL_INTERCEPT_FERR_FREEZE |
+ VMCB_CTRL_INTERCEPT_SHUTDOWN;
+
+ /*
+ * Allow:
+ * - ICEBP [icebp instruction]
+ * - WBINVD [wbinvd instruction]
+ * - WCR_SPEC(0..15) [writes of CR0-15, received after instruction]
+ *
+ * Intercept the rest below.
+ */
+ vmcb->ctrl.intercept_misc2 =
+ VMCB_CTRL_INTERCEPT_VMRUN |
+ VMCB_CTRL_INTERCEPT_VMMCALL |
+ VMCB_CTRL_INTERCEPT_VMLOAD |
+ VMCB_CTRL_INTERCEPT_VMSAVE |
+ VMCB_CTRL_INTERCEPT_STGI |
+ VMCB_CTRL_INTERCEPT_CLGI |
+ VMCB_CTRL_INTERCEPT_SKINIT |
+ VMCB_CTRL_INTERCEPT_RDTSCP |
+ VMCB_CTRL_INTERCEPT_MONITOR |
+ VMCB_CTRL_INTERCEPT_MWAIT |
+ VMCB_CTRL_INTERCEPT_XSETBV;
+
+ /* Intercept all I/O accesses. */
+ memset(cpudata->iobm, 0xFF, IOBM_SIZE);
+ vmcb->ctrl.iopm_base_pa = cpudata->iobm_pa;
+
+ /*
+ * Allow:
+ * - EFER [read]
+ * - STAR [read, write]
+ * - LSTAR [read, write]
+ * - CSTAR [read, write]
+ * - SFMASK [read, write]
+ * - KERNELGSBASE [read, write]
+ * - SYSENTER_CS [read, write]
+ * - SYSENTER_ESP [read, write]
+ * - SYSENTER_EIP [read, write]
+ * - FSBASE [read, write]
+ * - GSBASE [read, write]
+ *
+ * Intercept the rest.
+ */
+ /* All bits set means everything intercepted; svm_vcpu_msr_allow() clears bits. */
+ memset(cpudata->msrbm, 0xFF, MSRBM_SIZE);
+ svm_vcpu_msr_allow(cpudata->msrbm, MSR_EFER, true, false);
+ svm_vcpu_msr_allow(cpudata->msrbm, MSR_STAR, true, true);
+ svm_vcpu_msr_allow(cpudata->msrbm, MSR_LSTAR, true, true);
+ svm_vcpu_msr_allow(cpudata->msrbm, MSR_CSTAR, true, true);
+ svm_vcpu_msr_allow(cpudata->msrbm, MSR_SFMASK, true, true);
+ svm_vcpu_msr_allow(cpudata->msrbm, MSR_KERNELGSBASE, true, true);
+ svm_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_CS, true, true);
+ svm_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_ESP, true, true);
+ svm_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_EIP, true, true);
+ svm_vcpu_msr_allow(cpudata->msrbm, MSR_FSBASE, true, true);
+ svm_vcpu_msr_allow(cpudata->msrbm, MSR_GSBASE, true, true);
+ vmcb->ctrl.msrpm_base_pa = cpudata->msrbm_pa;
+
+ /* Generate ASID. */
+ svm_asid_alloc(vcpu);
+
+ /* Virtual TPR. */
+ vmcb->ctrl.v = VMCB_CTRL_V_INTR_MASKING;
+
+ /* Enable Nested Paging. */
+ vmcb->ctrl.enable1 = VMCB_CTRL_ENABLE_NP;
+ vmcb->ctrl.n_cr3 = mach->vm->vm_map.pmap->pm_pdirpa[0];
+
+ /* Must always be set. */
+ vmcb->state.efer = EFER_SVME;
+
+ /* Init XSAVE header. */
+ cpudata->gfpu.xsh_xstate_bv = svm_xcr0_mask;
+ cpudata->gfpu.xsh_xcomp_bv = 0;
+
+ /* Bluntly hide the host TSC. */
+ cpudata->tsc_offset = rdtsc();
+}
+
+/*
+ * Allocate and initialize the SVM-specific part of a VCPU: the cpudata
+ * structure itself, the VMCB, the I/O permission bitmap and the MSR
+ * permission bitmap.
+ *
+ * Returns 0 on success. On failure returns an errno value (ENOMEM),
+ * releases everything that was allocated and leaves vcpu->cpudata NULL.
+ */
+static int
+svm_vcpu_create(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+	struct svm_cpudata *cpudata;
+	int error;
+
+	/* Allocate the SVM cpudata. */
+	cpudata = (struct svm_cpudata *)uvm_km_alloc(kernel_map,
+	    roundup(sizeof(*cpudata), PAGE_SIZE), 0,
+	    UVM_KMF_WIRED|UVM_KMF_ZERO);
+	if (cpudata == NULL) {
+		/* uvm_km_alloc() returns 0 when the mapping fails. */
+		return ENOMEM;
+	}
+	vcpu->cpudata = cpudata;
+
+	/* VMCB */
+	error = svm_memalloc(&cpudata->vmcb_pa, (vaddr_t *)&cpudata->vmcb,
+	    VMCB_NPAGES);
+	if (error)
+		goto error;
+
+	/* I/O Bitmap */
+	error = svm_memalloc(&cpudata->iobm_pa, (vaddr_t *)&cpudata->iobm,
+	    IOBM_NPAGES);
+	if (error)
+		goto error;
+
+	/* MSR Bitmap */
+	error = svm_memalloc(&cpudata->msrbm_pa, (vaddr_t *)&cpudata->msrbm,
+	    MSRBM_NPAGES);
+	if (error)
+		goto error;
+
+	/* Init the VCPU info. */
+	svm_vcpu_init(mach, vcpu);
+
+	return 0;
+
+error:
+	/*
+	 * cpudata was zeroed at allocation and svm_memalloc() only writes
+	 * the addresses on success, so a non-zero pa means "allocated".
+	 */
+	if (cpudata->vmcb_pa) {
+		svm_memfree(cpudata->vmcb_pa, (vaddr_t)cpudata->vmcb,
+		    VMCB_NPAGES);
+	}
+	if (cpudata->iobm_pa) {
+		svm_memfree(cpudata->iobm_pa, (vaddr_t)cpudata->iobm,
+		    IOBM_NPAGES);
+	}
+	if (cpudata->msrbm_pa) {
+		svm_memfree(cpudata->msrbm_pa, (vaddr_t)cpudata->msrbm,
+		    MSRBM_NPAGES);
+	}
+	uvm_km_free(kernel_map, (vaddr_t)cpudata,
+	    roundup(sizeof(*cpudata), PAGE_SIZE), UVM_KMF_WIRED);
+	/* Do not leave a dangling pointer behind. */
+	vcpu->cpudata = NULL;
+	return error;
+}
+
+/*
+ * Tear down the SVM-specific part of a VCPU: release its ASID, then free
+ * the VMCB, both permission bitmaps and finally the cpudata structure.
+ */
+static void
+svm_vcpu_destroy(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+	struct svm_cpudata *cd = vcpu->cpudata;
+	const vaddr_t cd_va = (vaddr_t)cd;
+	const size_t cd_size = roundup(sizeof(*cd), PAGE_SIZE);
+
+	/* Must come first: the ASID is read from the VMCB. */
+	svm_asid_free(vcpu);
+
+	svm_memfree(cd->vmcb_pa, (vaddr_t)cd->vmcb, VMCB_NPAGES);
+	svm_memfree(cd->iobm_pa, (vaddr_t)cd->iobm, IOBM_NPAGES);
+	svm_memfree(cd->msrbm_pa, (vaddr_t)cd->msrbm, MSRBM_NPAGES);
+
+	uvm_km_free(kernel_map, cd_va, cd_size, UVM_KMF_WIRED);
+}
+
+#define SVM_SEG_ATTRIB_TYPE __BITS(4,0)
+#define SVM_SEG_ATTRIB_DPL __BITS(6,5)
+#define SVM_SEG_ATTRIB_P __BIT(7)
+#define SVM_SEG_ATTRIB_AVL __BIT(8)
+#define SVM_SEG_ATTRIB_LONG __BIT(9)
+#define SVM_SEG_ATTRIB_DEF32 __BIT(10)
+#define SVM_SEG_ATTRIB_GRAN __BIT(11)
+
+/*
+ * Convert an NVMM segment descriptor into its VMCB representation: the
+ * selector, limit and base are copied as-is, the individual attribute
+ * fields are packed into the VMCB attribute word.
+ */
+static void
+svm_vcpu_setstate_seg(struct nvmm_x64_state_seg *seg, struct vmcb_segment *vseg)
+{
+	vseg->base = seg->base;
+	vseg->limit = seg->limit;
+	vseg->selector = seg->selector;
+
+	/* Pack the attributes one field at a time. */
+	vseg->attrib = __SHIFTIN(seg->attrib.type, SVM_SEG_ATTRIB_TYPE);
+	vseg->attrib |= __SHIFTIN(seg->attrib.dpl, SVM_SEG_ATTRIB_DPL);
+	vseg->attrib |= __SHIFTIN(seg->attrib.p, SVM_SEG_ATTRIB_P);
+	vseg->attrib |= __SHIFTIN(seg->attrib.avl, SVM_SEG_ATTRIB_AVL);
+	vseg->attrib |= __SHIFTIN(seg->attrib.lng, SVM_SEG_ATTRIB_LONG);
+	vseg->attrib |= __SHIFTIN(seg->attrib.def32, SVM_SEG_ATTRIB_DEF32);
+	vseg->attrib |= __SHIFTIN(seg->attrib.gran, SVM_SEG_ATTRIB_GRAN);
+}
+
+/*
+ * Convert a VMCB segment into an NVMM segment descriptor: the inverse of
+ * svm_vcpu_setstate_seg(). The packed VMCB attribute word is split back
+ * into its individual fields.
+ */
+static void
+svm_vcpu_getstate_seg(struct nvmm_x64_state_seg *seg, struct vmcb_segment *vseg)
+{
+	const uint64_t packed = vseg->attrib;
+
+	seg->base = vseg->base;
+	seg->limit = vseg->limit;
+	seg->selector = vseg->selector;
+
+	seg->attrib.type = __SHIFTOUT(packed, SVM_SEG_ATTRIB_TYPE);
+	seg->attrib.dpl = __SHIFTOUT(packed, SVM_SEG_ATTRIB_DPL);
+	seg->attrib.p = __SHIFTOUT(packed, SVM_SEG_ATTRIB_P);
+	seg->attrib.avl = __SHIFTOUT(packed, SVM_SEG_ATTRIB_AVL);
+	seg->attrib.lng = __SHIFTOUT(packed, SVM_SEG_ATTRIB_LONG);
+	seg->attrib.def32 = __SHIFTOUT(packed, SVM_SEG_ATTRIB_DEF32);
+	seg->attrib.gran = __SHIFTOUT(packed, SVM_SEG_ATTRIB_GRAN);
+}
+
+/*
+ * Tell whether replacing the current state (cstate) with the new one
+ * (nstate) requires a guest TLB flush. Only the state categories selected
+ * in flags are compared: for the control registers, a change in the
+ * TLB-relevant bits of CR0/CR4 or any change of CR3; for the MSRs, a change
+ * in the TLB-relevant bits of EFER.
+ */
+static bool
+svm_state_tlb_flush(struct nvmm_x64_state *cstate,
+    struct nvmm_x64_state *nstate, uint64_t flags)
+{
+	uint64_t changed;
+
+	if (flags & NVMM_X64_STATE_CRS) {
+		changed = cstate->crs[NVMM_X64_CR_CR0] ^
+		    nstate->crs[NVMM_X64_CR_CR0];
+		if (changed & CR0_TLB_FLUSH)
+			return true;
+
+		changed = cstate->crs[NVMM_X64_CR_CR3] ^
+		    nstate->crs[NVMM_X64_CR_CR3];
+		if (changed != 0)
+			return true;
+
+		changed = cstate->crs[NVMM_X64_CR_CR4] ^
+		    nstate->crs[NVMM_X64_CR_CR4];
+		if (changed & CR4_TLB_FLUSH)
+			return true;
+	}
+
+	if (flags & NVMM_X64_STATE_MSRS) {
+		changed = cstate->msrs[NVMM_X64_MSR_EFER] ^
+		    nstate->msrs[NVMM_X64_MSR_EFER];
+		if (changed & EFER_TLB_FLUSH)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Load (part of) a VCPU state provided by the caller.
+ *
+ * data points to a struct nvmm_x64_state; flags selects which categories
+ * (NVMM_X64_STATE_*) are taken from it. Each selected category is copied
+ * into the cached state (cpudata->state) and the fields that live in the
+ * VMCB are then propagated there. A guest TLB flush is requested when the
+ * new state differs from the cached one in a TLB-relevant way.
+ */
+static void
+svm_vcpu_setstate(struct nvmm_cpu *vcpu, void *data, uint64_t flags)
+{
+ struct svm_cpudata *cpudata = vcpu->cpudata;
+ struct nvmm_x64_state *cstate = &cpudata->state;
+ struct nvmm_x64_state *nstate = (struct nvmm_x64_state *)data;
+ struct vmcb *vmcb = cpudata->vmcb;
+ struct fxsave *fpustate;
+
+ /* Must be done before the copies below overwrite cstate. */
+ if (svm_state_tlb_flush(cstate, nstate, flags)) {
+ cpudata->tlb_want_flush = true;
+ }
+
+ if (flags & NVMM_X64_STATE_SEGS) {
+ memcpy(cstate->segs, nstate->segs, sizeof(nstate->segs));
+
+ svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_CS],
+ &vmcb->state.cs);
+ svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_DS],
+ &vmcb->state.ds);
+ svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_ES],
+ &vmcb->state.es);
+ svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_FS],
+ &vmcb->state.fs);
+ svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_GS],
+ &vmcb->state.gs);
+ svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_SS],
+ &vmcb->state.ss);
+ svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_GDT],
+ &vmcb->state.gdt);
+ svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_IDT],
+ &vmcb->state.idt);
+ svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_LDT],
+ &vmcb->state.ldt);
+ svm_vcpu_setstate_seg(&cstate->segs[NVMM_X64_SEG_TR],
+ &vmcb->state.tr);
+ }
+
+ if (flags & NVMM_X64_STATE_GPRS) {
+ memcpy(cstate->gprs, nstate->gprs, sizeof(nstate->gprs));
+
+ /* RIP, RSP, RAX and RFLAGS are held in the VMCB state area. */
+ vmcb->state.rip = cstate->gprs[NVMM_X64_GPR_RIP];
+ vmcb->state.rsp = cstate->gprs[NVMM_X64_GPR_RSP];
+ vmcb->state.rax = cstate->gprs[NVMM_X64_GPR_RAX];
+ vmcb->state.rflags = cstate->gprs[NVMM_X64_GPR_RFLAGS];
+ }
+
+ if (flags & NVMM_X64_STATE_CRS) {
+ memcpy(cstate->crs, nstate->crs, sizeof(nstate->crs));
+
+ vmcb->state.cr0 = cstate->crs[NVMM_X64_CR_CR0];
+ vmcb->state.cr2 = cstate->crs[NVMM_X64_CR_CR2];
+ vmcb->state.cr3 = cstate->crs[NVMM_X64_CR_CR3];
+ vmcb->state.cr4 = cstate->crs[NVMM_X64_CR_CR4];
+
+ /* CR8 is stored in the TPR field of the VMCB's 'v' control word. */
+ vmcb->ctrl.v &= ~VMCB_CTRL_V_TPR;
+ vmcb->ctrl.v |= __SHIFTIN(cstate->crs[NVMM_X64_CR_CR8],
+ VMCB_CTRL_V_TPR);
+
+ /* Clear unsupported XCR0 bits, set mandatory X87 bit. */
+ if (svm_xcr0_mask != 0) {
+ cstate->crs[NVMM_X64_CR_XCR0] &= svm_xcr0_mask;
+ cstate->crs[NVMM_X64_CR_XCR0] |= XCR0_X87;
+ } else {
+ cstate->crs[NVMM_X64_CR_XCR0] = 0;
+ }
+ }
+
+ if (flags & NVMM_X64_STATE_DRS) {
+ memcpy(cstate->drs, nstate->drs, sizeof(nstate->drs));
+
+ /* Only DR6 and DR7 go to the VMCB; DR0-DR3 are loaded by svm_vcpu_guest_dbregs_enter(). */
+ vmcb->state.dr6 = cstate->drs[NVMM_X64_DR_DR6];
+ vmcb->state.dr7 = cstate->drs[NVMM_X64_DR_DR7];
+ }
+
+ if (flags & NVMM_X64_STATE_MSRS) {
+ memcpy(cstate->msrs, nstate->msrs, sizeof(nstate->msrs));
+
+ /* Bit EFER_SVME is mandatory. */
+ cstate->msrs[NVMM_X64_MSR_EFER] |= EFER_SVME;
+
+ vmcb->state.efer = cstate->msrs[NVMM_X64_MSR_EFER];
+ vmcb->state.star = cstate->msrs[NVMM_X64_MSR_STAR];
+ vmcb->state.lstar = cstate->msrs[NVMM_X64_MSR_LSTAR];
+ vmcb->state.cstar = cstate->msrs[NVMM_X64_MSR_CSTAR];
+ vmcb->state.sfmask = cstate->msrs[NVMM_X64_MSR_SFMASK];
+ vmcb->state.kernelgsbase =
+ cstate->msrs[NVMM_X64_MSR_KERNELGSBASE];
+ vmcb->state.sysenter_cs =
+ cstate->msrs[NVMM_X64_MSR_SYSENTER_CS];
+ vmcb->state.sysenter_esp =
+ cstate->msrs[NVMM_X64_MSR_SYSENTER_ESP];
+ vmcb->state.sysenter_eip =
+ cstate->msrs[NVMM_X64_MSR_SYSENTER_EIP];
+ vmcb->state.g_pat = cstate->msrs[NVMM_X64_MSR_PAT];
+ }
+
+ if (flags & NVMM_X64_STATE_MISC) {
+ memcpy(cstate->misc, nstate->misc, sizeof(nstate->misc));
+
+ vmcb->state.cpl = cstate->misc[NVMM_X64_MISC_CPL];
+ }
+
+ CTASSERT(sizeof(cpudata->gfpu.xsh_fxsave) == sizeof(cstate->fpu));
+ if (flags & NVMM_X64_STATE_FPU) {
+ memcpy(&cstate->fpu, &nstate->fpu, sizeof(nstate->fpu));
+
+ /* The guest FPU area is what gets restored before VMRUN. */
+ memcpy(cpudata->gfpu.xsh_fxsave, &cstate->fpu,
+ sizeof(cstate->fpu));
+
+ /* Restrict the MXCSR mask to x86_fpu_mxcsr_mask, then MXCSR to that mask. */
+ fpustate = (struct fxsave *)cpudata->gfpu.xsh_fxsave;
+ fpustate->fx_mxcsr_mask &= x86_fpu_mxcsr_mask;
+ fpustate->fx_mxcsr &= fpustate->fx_mxcsr_mask;
+ }
+}
+
+/*
+ * Return (part of) the VCPU state to the caller.
+ *
+ * data points to a struct nvmm_x64_state to fill; flags selects which
+ * categories (NVMM_X64_STATE_*) are wanted. For each selected category the
+ * cached state (cpudata->state) is first refreshed from the VMCB (or from
+ * the guest FPU area), then copied out. EFER.SVME is hidden from the
+ * returned MSR state.
+ */
+static void
+svm_vcpu_getstate(struct nvmm_cpu *vcpu, void *data, uint64_t flags)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct nvmm_x64_state *cstate = &cpudata->state;
+	struct nvmm_x64_state *nstate = (struct nvmm_x64_state *)data;
+	struct vmcb *vmcb = cpudata->vmcb;
+
+	if (flags & NVMM_X64_STATE_SEGS) {
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_CS],
+		    &vmcb->state.cs);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_DS],
+		    &vmcb->state.ds);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_ES],
+		    &vmcb->state.es);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_FS],
+		    &vmcb->state.fs);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_GS],
+		    &vmcb->state.gs);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_SS],
+		    &vmcb->state.ss);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_GDT],
+		    &vmcb->state.gdt);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_IDT],
+		    &vmcb->state.idt);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_LDT],
+		    &vmcb->state.ldt);
+		svm_vcpu_getstate_seg(&cstate->segs[NVMM_X64_SEG_TR],
+		    &vmcb->state.tr);
+
+		memcpy(nstate->segs, cstate->segs, sizeof(cstate->segs));
+	}
+
+	if (flags & NVMM_X64_STATE_GPRS) {
+		/* RIP, RSP, RAX and RFLAGS are held in the VMCB state area. */
+		cstate->gprs[NVMM_X64_GPR_RIP] = vmcb->state.rip;
+		cstate->gprs[NVMM_X64_GPR_RSP] = vmcb->state.rsp;
+		cstate->gprs[NVMM_X64_GPR_RAX] = vmcb->state.rax;
+		cstate->gprs[NVMM_X64_GPR_RFLAGS] = vmcb->state.rflags;
+
+		memcpy(nstate->gprs, cstate->gprs, sizeof(cstate->gprs));
+	}
+
+	if (flags & NVMM_X64_STATE_CRS) {
+		cstate->crs[NVMM_X64_CR_CR0] = vmcb->state.cr0;
+		cstate->crs[NVMM_X64_CR_CR2] = vmcb->state.cr2;
+		cstate->crs[NVMM_X64_CR_CR3] = vmcb->state.cr3;
+		cstate->crs[NVMM_X64_CR_CR4] = vmcb->state.cr4;
+		cstate->crs[NVMM_X64_CR_CR8] = __SHIFTOUT(vmcb->ctrl.v,
+		    VMCB_CTRL_V_TPR);
+
+		memcpy(nstate->crs, cstate->crs, sizeof(cstate->crs));
+	}
+
+	if (flags & NVMM_X64_STATE_DRS) {
+		cstate->drs[NVMM_X64_DR_DR6] = vmcb->state.dr6;
+		cstate->drs[NVMM_X64_DR_DR7] = vmcb->state.dr7;
+
+		memcpy(nstate->drs, cstate->drs, sizeof(cstate->drs));
+	}
+
+	if (flags & NVMM_X64_STATE_MSRS) {
+		cstate->msrs[NVMM_X64_MSR_EFER] = vmcb->state.efer;
+		cstate->msrs[NVMM_X64_MSR_STAR] = vmcb->state.star;
+		cstate->msrs[NVMM_X64_MSR_LSTAR] = vmcb->state.lstar;
+		cstate->msrs[NVMM_X64_MSR_CSTAR] = vmcb->state.cstar;
+		cstate->msrs[NVMM_X64_MSR_SFMASK] = vmcb->state.sfmask;
+		cstate->msrs[NVMM_X64_MSR_KERNELGSBASE] =
+		    vmcb->state.kernelgsbase;
+		cstate->msrs[NVMM_X64_MSR_SYSENTER_CS] =
+		    vmcb->state.sysenter_cs;
+		cstate->msrs[NVMM_X64_MSR_SYSENTER_ESP] =
+		    vmcb->state.sysenter_esp;
+		cstate->msrs[NVMM_X64_MSR_SYSENTER_EIP] =
+		    vmcb->state.sysenter_eip;
+		cstate->msrs[NVMM_X64_MSR_PAT] = vmcb->state.g_pat;
+
+		memcpy(nstate->msrs, cstate->msrs, sizeof(cstate->msrs));
+
+		/* Hide SVME. */
+		nstate->msrs[NVMM_X64_MSR_EFER] &= ~EFER_SVME;
+	}
+
+	if (flags & NVMM_X64_STATE_MISC) {
+		cstate->misc[NVMM_X64_MISC_CPL] = vmcb->state.cpl;
+
+		memcpy(nstate->misc, cstate->misc, sizeof(cstate->misc));
+	}
+
+	CTASSERT(sizeof(cpudata->gfpu.xsh_fxsave) == sizeof(cstate->fpu));
+	if (flags & NVMM_X64_STATE_FPU) {
+		/* Refresh the cache from the guest FPU area... */
+		memcpy(&cstate->fpu, cpudata->gfpu.xsh_fxsave,
+		    sizeof(cstate->fpu));
+
+		/*
+		 * ...and hand it out. The copy must go from the cache to the
+		 * caller's buffer, like every other category above; the
+		 * reverse would clobber the cache with the caller's data and
+		 * return nothing.
+		 */
+		memcpy(&nstate->fpu, &cstate->fpu, sizeof(cstate->fpu));
+	}
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * pmap callback (installed by svm_machine_create()): ask every existing
+ * VCPU of the machine that owns this pmap to flush its guest TLB. VCPU
+ * slots that cannot be acquired are skipped.
+ */
+static void
+svm_tlb_flush(struct pmap *pm)
+{
+	struct nvmm_machine *mach = pm->pm_data;
+	struct svm_cpudata *cpudata;
+	struct nvmm_cpu *vcpu;
+	size_t cpuid;
+
+	/* Request TLB flushes. */
+	for (cpuid = 0; cpuid < NVMM_MAX_VCPUS; cpuid++) {
+		if (nvmm_vcpu_get(mach, cpuid, &vcpu) == 0) {
+			cpudata = vcpu->cpudata;
+			cpudata->tlb_want_flush = true;
+			nvmm_vcpu_put(vcpu);
+		}
+	}
+}
+
+/*
+ * Set up the SVM-specific part of a machine: hook the machine into its
+ * pmap (back-pointer and TLB flush callback) and allocate the zeroed
+ * per-machine data. The allocation uses KM_SLEEP.
+ */
+static void
+svm_machine_create(struct nvmm_machine *mach)
+{
+ /* Fill in pmap info. */
+ mach->vm->vm_map.pmap->pm_data = (void *)mach;
+ mach->vm->vm_map.pmap->pm_tlb_flush = svm_tlb_flush;
+
+ mach->machdata = kmem_zalloc(sizeof(struct svm_machdata), KM_SLEEP);
+}
+
+/*
+ * Free the per-machine data allocated by svm_machine_create().
+ */
+static void
+svm_machine_destroy(struct nvmm_machine *mach)
+{
+ kmem_free(mach->machdata, sizeof(struct svm_machdata));
+}
+
+/*
+ * Apply a machine configuration operation. Only NVMM_X86_CONF_CPUID is
+ * supported: data points to a struct nvmm_x86_conf_cpuid describing, for
+ * one CPUID leaf, the bits to set and the bits to delete.
+ *
+ * Returns EINVAL for an unknown operation or if a bit is both set and
+ * deleted, ENOBUFS if the leaf is new and no slot is free, 0 otherwise. An
+ * existing entry for the same leaf is replaced; else the entry goes into
+ * the first free slot.
+ */
+static int
+svm_machine_configure(struct nvmm_machine *mach, uint64_t op, void *data)
+{
+	struct nvmm_x86_conf_cpuid *cpuid = data;
+	struct svm_machdata *machdata = (struct svm_machdata *)mach->machdata;
+	size_t i, slot = SVM_NCPUIDS;
+
+	if (__predict_false(op != NVMM_X86_CONF_CPUID)) {
+		return EINVAL;
+	}
+
+	if (__predict_false((cpuid->set.eax & cpuid->del.eax) ||
+	    (cpuid->set.ebx & cpuid->del.ebx) ||
+	    (cpuid->set.ecx & cpuid->del.ecx) ||
+	    (cpuid->set.edx & cpuid->del.edx))) {
+		return EINVAL;
+	}
+
+	/*
+	 * One pass: look for an entry with the same leaf, remembering the
+	 * first free slot on the way in case there is none.
+	 */
+	for (i = 0; i < SVM_NCPUIDS; i++) {
+		if (!machdata->cpuidpresent[i]) {
+			if (slot == SVM_NCPUIDS) {
+				slot = i;
+			}
+			continue;
+		}
+		if (machdata->cpuid[i].leaf == cpuid->leaf) {
+			/* If already here, replace. */
+			memcpy(&machdata->cpuid[i], cpuid,
+			    sizeof(struct nvmm_x86_conf_cpuid));
+			return 0;
+		}
+	}
+
+	if (slot == SVM_NCPUIDS) {
+		return ENOBUFS;
+	}
+
+	/* Not here, insert. */
+	machdata->cpuidpresent[slot] = true;
+	memcpy(&machdata->cpuid[slot], cpuid,
+	    sizeof(struct nvmm_x86_conf_cpuid));
+	return 0;
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Tell whether this backend can be used on the current host: an AMD CPU
+ * advertising SVM, with CPUID leaf 0x8000000a available and reporting both
+ * Nested Paging and nRIP, and with SVM not disabled-and-locked in the VMCR
+ * MSR. As a side effect, records whether Decode Assist is available.
+ */
+static bool
+svm_ident(void)
+{
+	u_int descs[4];
+	uint64_t vmcr;
+
+	if (cpu_vendor != CPUVENDOR_AMD ||
+	    (cpu_feature[3] & CPUID_SVM) == 0) {
+		return false;
+	}
+
+	if (curcpu()->ci_max_ext_cpuid < 0x8000000a) {
+		return false;
+	}
+	x86_cpuid(0x8000000a, descs);
+
+	/* Want Nested Paging and nRIP. */
+	if ((descs[3] & CPUID_AMD_SVM_NP) == 0 ||
+	    (descs[3] & CPUID_AMD_SVM_NRIPS) == 0) {
+		return false;
+	}
+
+	svm_decode_assist = (descs[3] & CPUID_AMD_SVM_DecodeAssist) != 0;
+
+	/* SVM disabled and the setting locked: nothing we can do. */
+	vmcr = rdmsr(MSR_VMCR);
+	return !((vmcr & VMCR_SVMED) && (vmcr & VMCR_LOCK));
+}
+
+/*
+ * Initialize the global ASID allocator: the lock, the number of usable
+ * ASIDs (capped at 8192) and the allocation bitmap, in which ASID 0 and
+ * the last ASID are marked as taken from the start.
+ */
+static void
+svm_init_asid(uint32_t maxasid)
+{
+	size_t last;
+
+	mutex_init(&svm_asidlock, MUTEX_DEFAULT, IPL_NONE);
+
+	/* Arbitrarily limit. */
+	svm_maxasid = maxasid = uimin(maxasid, 8192);
+
+	/* One bit per ASID, rounded up to whole bytes. */
+	svm_asidmap = kmem_zalloc(roundup(maxasid, 8) / 8, KM_SLEEP);
+
+	/* ASID 0 is reserved for the host. */
+	svm_asidmap[0] |= __BIT(0);
+
+	/* ASID n-1 is special, we share it. */
+	last = maxasid - 1;
+	svm_asidmap[last / 8] |= __BIT(last % 8);
+}
+
+/*
+ * Cross-call handler run on each CPU by svm_init()/svm_fini() to turn SVM
+ * on or off. arg1 carries a bool (true = enable), arg2 is unused.
+ *
+ * The VMCR.SVMED bit is cleared if set. On enable, EFER.SVME is set and
+ * the host save area MSR is pointed at this CPU's hsave page; on disable
+ * the host save area MSR is zeroed and EFER.SVME is cleared.
+ */
+static void
+svm_change_cpu(void *arg1, void *arg2)
+{
+ bool enable = (bool)arg1;
+ uint64_t msr;
+
+ msr = rdmsr(MSR_VMCR);
+ if (msr & VMCR_SVMED) {
+ wrmsr(MSR_VMCR, msr & ~VMCR_SVMED);
+ }
+
+ /* On disable, VM_HSAVE_PA is written before EFER.SVME is cleared. */
+ if (!enable) {
+ wrmsr(MSR_VM_HSAVE_PA, 0);
+ }
+
+ msr = rdmsr(MSR_EFER);
+ if (enable) {
+ msr |= EFER_SVME;
+ } else {
+ msr &= ~EFER_SVME;
+ }
+ wrmsr(MSR_EFER, msr);
+
+ /* On enable, VM_HSAVE_PA is written after EFER.SVME is set. */
+ if (enable) {
+ wrmsr(MSR_VM_HSAVE_PA, hsave[cpu_index(curcpu())].pa);
+ }
+}
+
+/*
+ * Global initialization of the SVM backend: choose the TLB flush command,
+ * set up the ASID allocator and the XCR0 mask, allocate one zeroed host
+ * save page per CPU, then enable SVM on every CPU through a cross-call.
+ */
+static void
+svm_init(void)
+{
+ CPU_INFO_ITERATOR cii;
+ struct cpu_info *ci;
+ struct vm_page *pg;
+ u_int descs[4];
+ uint64_t xc;
+
+ x86_cpuid(0x8000000a, descs);
+
+ /* The guest TLB flush command. */
+ if (descs[3] & CPUID_AMD_SVM_FlushByASID) {
+ svm_ctrl_tlb_flush = VMCB_CTRL_TLB_CTRL_FLUSH_GUEST;
+ } else {
+ svm_ctrl_tlb_flush = VMCB_CTRL_TLB_CTRL_FLUSH_ALL;
+ }
+
+ /* Init the ASID. */
+ /* descs[1] (EBX of leaf 0x8000000a) is used as the number of ASIDs. */
+ svm_init_asid(descs[1]);
+
+ /* Init the XCR0 mask. */
+ svm_xcr0_mask = SVM_XCR0_MASK_DEFAULT & x86_xsave_features;
+
+ /* One host save page per CPU; a zero pa means "no page" in svm_fini(). */
+ memset(hsave, 0, sizeof(hsave));
+ for (CPU_INFO_FOREACH(cii, ci)) {
+ /* NOTE(review): the result of uvm_pagealloc() is used without a NULL check. */
+ pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+ hsave[cpu_index(ci)].pa = VM_PAGE_TO_PHYS(pg);
+ }
+
+ /* Enable SVM on all CPUs and wait for completion. */
+ xc = xc_broadcast(0, svm_change_cpu, (void *)true, NULL);
+ xc_wait(xc);
+}
+
+/*
+ * Release the ASID allocator set up by svm_init_asid(): free the bitmap
+ * (same size computation as at allocation time) and destroy the lock.
+ */
+static void
+svm_fini_asid(void)
+{
+	const size_t mapsize = roundup(svm_maxasid, 8) / 8;
+
+	kmem_free(svm_asidmap, mapsize);
+	mutex_destroy(&svm_asidlock);
+}
+
+/*
+ * Global teardown of the SVM backend: disable SVM on every CPU through a
+ * cross-call, free the per-CPU host save pages that were allocated, then
+ * release the ASID allocator.
+ */
+static void
+svm_fini(void)
+{
+	uint64_t ticket;
+	size_t cpu;
+
+	ticket = xc_broadcast(0, svm_change_cpu, (void *)false, NULL);
+	xc_wait(ticket);
+
+	for (cpu = 0; cpu < MAXCPUS; cpu++) {
+		/* A zero pa means no page was allocated for this slot. */
+		if (hsave[cpu].pa == 0) {
+			continue;
+		}
+		uvm_pagefree(PHYS_TO_VM_PAGE(hsave[cpu].pa));
+	}
+
+	svm_fini_asid();
+}
+
+/*
+ * Fill in the x86-specific capability fields: the supported XCR0 mask, the
+ * MXCSR mask, and the maximum number of CPUID configuration entries.
+ */
+static void
+svm_capability(struct nvmm_capability *cap)
+{
+ cap->u.x86.xcr0_mask = svm_xcr0_mask;
+ cap->u.x86.mxcsr_mask = x86_fpu_mxcsr_mask;
+ cap->u.x86.conf_cpuid_maxops = SVM_NCPUIDS;
+}
+
+/*
+ * The x86-SVM backend descriptor: the table of entry points and sizes
+ * through which this backend's functions are exposed.
+ */
+const struct nvmm_impl nvmm_x86_svm = {
+ .ident = svm_ident,
+ .init = svm_init,
+ .fini = svm_fini,
+ .capability = svm_capability,
+ .conf_max = NVMM_X86_NCONF,
+ .conf_sizes = svm_conf_sizes,
+ .state_size = sizeof(struct nvmm_x64_state),
+ .machine_create = svm_machine_create,
+ .machine_destroy = svm_machine_destroy,
+ .machine_configure = svm_machine_configure,
+ .vcpu_create = svm_vcpu_create,
+ .vcpu_destroy = svm_vcpu_destroy,
+ .vcpu_setstate = svm_vcpu_setstate,
+ .vcpu_getstate = svm_vcpu_getstate,
+ .vcpu_inject = svm_vcpu_inject,
+ .vcpu_run = svm_vcpu_run
+};
Index: src/sys/dev/nvmm/x86/nvmm_x86_svmfunc.S
diff -u /dev/null src/sys/dev/nvmm/x86/nvmm_x86_svmfunc.S:1.1
--- /dev/null Wed Nov 7 07:43:08 2018
+++ src/sys/dev/nvmm/x86/nvmm_x86_svmfunc.S Wed Nov 7 07:43:08 2018
@@ -0,0 +1,218 @@
+/* $NetBSD: nvmm_x86_svmfunc.S,v 1.1 2018/11/07 07:43:08 maxv Exp $ */
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Override user-land alignment before including asm.h */
+#define ALIGN_DATA .align 8
+#define ALIGN_TEXT .align 16,0x90
+#define _ALIGN_TEXT ALIGN_TEXT
+
+#define _LOCORE
+#include "assym.h"
+#include <machine/asm.h>
+#include <machine/segments.h>
+#include <x86/specialreg.h>
+
+#define ASM_NVMM
+#include <dev/nvmm/x86/nvmm_x86.h>
+
+ .text
+
+#define HOST_SAVE_GPRS /* push the six callee-saved GPRs (SysV AMD64) */ \
+ pushq %rbx ;\
+ pushq %rbp ;\
+ pushq %r12 ;\
+ pushq %r13 ;\
+ pushq %r14 ;\
+ pushq %r15
+
+#define HOST_RESTORE_GPRS /* pop them in the reverse of the push order */ \
+ popq %r15 ;\
+ popq %r14 ;\
+ popq %r13 ;\
+ popq %r12 ;\
+ popq %rbp ;\
+ popq %rbx
+
+#define HOST_SAVE_MSR(msr) /* rdmsr, push %rdx then %rax; clobbers %rcx */ \
+ movq $msr,%rcx ;\
+ rdmsr ;\
+ pushq %rdx ;\
+ pushq %rax
+
+#define HOST_RESTORE_MSR(msr) /* pop %rax then %rdx, write back with wrmsr */ \
+ popq %rax ;\
+ popq %rdx ;\
+ movq $msr,%rcx ;\
+ wrmsr
+
+#define HOST_SAVE_SEGREG(sreg) /* push the 16-bit selector in sreg via %ax */ \
+ movw sreg,%ax ;\
+ pushw %ax
+
+#define HOST_RESTORE_SEGREG(sreg) /* pop 16 bits and load them into sreg */ \
+ popw %ax ;\
+ movw %ax,sreg
+
+#define HOST_SAVE_TR /* push the 16-bit task register selector (str) */ \
+ strw %ax ;\
+ pushw %ax
+
+#define HOST_RESTORE_TR /* pop the selector, un-busy its GDT entry, then ltr */ \
+ popw %ax ;\
+ movzwq %ax,%rdx ;\
+ movq CPUVAR(GDT),%rax ;\
+ andq $~0x0200,4(%rax,%rdx, 1) /* clear the TSS busy bit so ltr accepts it */ ;\
+ ltrw %dx
+
+#define HOST_SAVE_LDT /* push the 16-bit LDT selector (sldt) */ \
+ sldtw %ax ;\
+ pushw %ax
+
+#define HOST_RESTORE_LDT /* pop the selector and reload it with lldt */ \
+ popw %ax ;\
+ lldtw %ax
+
+/*
+ * All guest GPRs except RAX and RSP, which are taken care of in the VMCB.
+ */
+
+#define GUEST_SAVE_GPRS(reg) /* store into 8-byte slots NVMM_X64_GPR_* at reg */ \
+ movq %rbx,(NVMM_X64_GPR_RBX * 8)(reg) ;\
+ movq %rcx,(NVMM_X64_GPR_RCX * 8)(reg) ;\
+ movq %rdx,(NVMM_X64_GPR_RDX * 8)(reg) ;\
+ movq %r8,(NVMM_X64_GPR_R8 * 8)(reg) ;\
+ movq %r9,(NVMM_X64_GPR_R9 * 8)(reg) ;\
+ movq %r10,(NVMM_X64_GPR_R10 * 8)(reg) ;\
+ movq %r11,(NVMM_X64_GPR_R11 * 8)(reg) ;\
+ movq %r12,(NVMM_X64_GPR_R12 * 8)(reg) ;\
+ movq %r13,(NVMM_X64_GPR_R13 * 8)(reg) ;\
+ movq %r14,(NVMM_X64_GPR_R14 * 8)(reg) ;\
+ movq %r15,(NVMM_X64_GPR_R15 * 8)(reg) ;\
+ movq %rbp,(NVMM_X64_GPR_RBP * 8)(reg) ;\
+ movq %rdi,(NVMM_X64_GPR_RDI * 8)(reg) ;\
+ movq %rsi,(NVMM_X64_GPR_RSI * 8)(reg)
+
+#define GUEST_RESTORE_GPRS(reg) /* load them; reg must not be loaded here */ \
+ movq (NVMM_X64_GPR_RBX * 8)(reg),%rbx ;\
+ movq (NVMM_X64_GPR_RCX * 8)(reg),%rcx ;\
+ movq (NVMM_X64_GPR_RDX * 8)(reg),%rdx ;\
+ movq (NVMM_X64_GPR_R8 * 8)(reg),%r8 ;\
+ movq (NVMM_X64_GPR_R9 * 8)(reg),%r9 ;\
+ movq (NVMM_X64_GPR_R10 * 8)(reg),%r10 ;\
+ movq (NVMM_X64_GPR_R11 * 8)(reg),%r11 ;\
+ movq (NVMM_X64_GPR_R12 * 8)(reg),%r12 ;\
+ movq (NVMM_X64_GPR_R13 * 8)(reg),%r13 ;\
+ movq (NVMM_X64_GPR_R14 * 8)(reg),%r14 ;\
+ movq (NVMM_X64_GPR_R15 * 8)(reg),%r15 ;\
+ movq (NVMM_X64_GPR_RBP * 8)(reg),%rbp ;\
+ movq (NVMM_X64_GPR_RDI * 8)(reg),%rdi ;\
+ movq (NVMM_X64_GPR_RSI * 8)(reg),%rsi
+
+/*
+ * In: %rdi = PA of VMCB, %rsi = VA of guest GPR state.  Out: %rax = 0.
+ * Enters the guest once with VMRUN while the global interrupt flag is clear.
+ */
+ENTRY(svm_vmrun)
+ /* Save the Host callee-saved GPRs; the Guest GPR load overwrites them. */
+ HOST_SAVE_GPRS
+
+ /* Disable Host interrupts: clear the global interrupt flag (CLGI). */
+ clgi
+
+ /* Save the Host TR selector on the stack. */
+ HOST_SAVE_TR
+
+ /* Save the variable Host MSRs; they are popped in reverse order below. */
+ HOST_SAVE_MSR(MSR_KERNELGSBASE)
+ HOST_SAVE_MSR(MSR_GSBASE)
+ HOST_SAVE_MSR(MSR_FSBASE)
+
+ /* Reset the Host Segregs: %ds/%es = GUDATA_SEL selector, %fs/%gs = 0. */
+ movq $GSEL(GUDATA_SEL, SEL_UPL),%rax
+ movw %ax,%ds
+ movw %ax,%es
+ xorq %rax,%rax
+ movw %ax,%fs
+ movw %ax,%gs
+
+ /* Save %fs and %gs; both were zeroed just above, so 0 is what is saved. */
+ HOST_SAVE_SEGREG(%fs)
+ HOST_SAVE_SEGREG(%gs)
+
+ /* Save the Host LDT selector. */
+ HOST_SAVE_LDT
+
+ /* Stack the GPR-state VA, then the VMCB PA; each is popped to RAX later. */
+ pushq %rsi
+ pushq %rdi
+
+ /* Load the Guest GPRs via RAX; this overwrites %rdi and %rsi. */
+ movq %rsi,%rax
+ GUEST_RESTORE_GPRS(%rax)
+
+ /* RAX = PA of VMCB, the operand of the three instructions below. */
+ popq %rax
+
+ /* Run the VM; vmload/vmsave move extra Guest state from/to the VMCB. */
+ vmload %rax
+ vmrun %rax
+ vmsave %rax
+
+ /* RAX = VA of guest GPR state (pushed from %rsi above). */
+ popq %rax
+
+ /* Save the Guest GPRs into that state area. */
+ GUEST_SAVE_GPRS(%rax)
+
+ /* Restore the Host LDT. */
+ HOST_RESTORE_LDT
+
+ /* Restore the Host Segregs, in the reverse of the save order. */
+ HOST_RESTORE_SEGREG(%gs)
+ HOST_RESTORE_SEGREG(%fs)
+
+ /* Restore the variable Host MSRs, in the reverse of the save order. */
+ HOST_RESTORE_MSR(MSR_FSBASE)
+ HOST_RESTORE_MSR(MSR_GSBASE)
+ HOST_RESTORE_MSR(MSR_KERNELGSBASE)
+
+ /* Restore the Host TR (clears the TSS busy bit before ltr). */
+ HOST_RESTORE_TR
+
+ /* Enable Host interrupts: set the global interrupt flag again (STGI). */
+ stgi
+
+ /* Restore the Host callee-saved GPRs. */
+ HOST_RESTORE_GPRS
+
+ xorq %rax,%rax /* return value is always 0 */
+ retq
+END(svm_vmrun)
Index: src/sys/modules/nvmm/Makefile
diff -u /dev/null src/sys/modules/nvmm/Makefile:1.1
--- /dev/null Wed Nov 7 07:43:08 2018
+++ src/sys/modules/nvmm/Makefile Wed Nov 7 07:43:08 2018
@@ -0,0 +1,19 @@
+# $NetBSD: Makefile,v 1.1 2018/11/07 07:43:08 maxv Exp $
+
+.include "../Makefile.inc"
+.include "../Makefile.assym"
+
+CPPFLAGS+=
+# Directories searched for the source files listed in SRCS.
+.PATH: ${S}/dev/nvmm
+.PATH: ${S}/dev/nvmm/x86
+
+KMOD= nvmm
+IOCONF= nvmm.ioconf
+SRCS= nvmm.c
+# The SVM backend (C and assembly) is only added when building for x86_64.
+.if ${MACHINE_ARCH} == "x86_64"
+SRCS+= nvmm_x86_svm.c nvmm_x86_svmfunc.S
+.endif
+
+.include <bsd.kmodule.mk>
Index: src/sys/modules/nvmm/nvmm.ioconf
diff -u /dev/null src/sys/modules/nvmm/nvmm.ioconf:1.1
--- /dev/null Wed Nov 7 07:43:08 2018
+++ src/sys/modules/nvmm/nvmm.ioconf Wed Nov 7 07:43:08 2018
@@ -0,0 +1,7 @@
+# $NetBSD: nvmm.ioconf,v 1.1 2018/11/07 07:43:08 maxv Exp $
+
+ioconf nvmm
+
+include "conf/files"
+
+pseudo-device nvmm