Module Name: src
Committed By: maxv
Date: Sun Jan 6 16:10:51 UTC 2019
Modified Files:
src/lib/libnvmm: libnvmm.3 libnvmm_x86.c nvmm.h
src/sys/dev/nvmm: nvmm.c nvmm.h
src/sys/dev/nvmm/x86: nvmm_x86.h nvmm_x86_svm.c
Log Message:
Improvements and fixes in NVMM.
Kernel driver:
* Don't take an extra (unneeded) reference to the UAO.
* Provide npc for HLT. I'm not really happy with it right now; it will
likely be revisited.
* Add the INT_SHADOW, INT_WINDOW_EXIT and NMI_WINDOW_EXIT states. Provide
them in the exitstate too.
* Don't take the TPR into account when processing INTs. The virtualizer
can do that itself (Qemu already does).
* Provide a hypervisor signature in CPUID, and hide SVM.
* Ignore certain MSRs. One special case is MSR_NB_CFG, for which reads
return NB_CFG_INITAPICCPUIDLO. Allow reads of MSR_TSC.
* If the LWP has pending signals or softints, leave, rather than waiting
for a rescheduling to happen later. This reduces interrupt processing
time in the guest (Qemu sends a signal to the thread, and now we leave
right away). This could be improved even more by sending an actual IPI
to the CPU, but I'll see later.
Libnvmm:
* Fix the MMU translation of large pages: we need to add the lower bits
of the GVA too.
* Change the IO and Mem structures to take a pointer rather than a
static array. This provides more flexibility.
* Batch together the str+rep IO transactions. We do one big memory
read/write, and then send the IO commands to the hypervisor all at
once. This considerably increases performance.
* Decode MOVZX.
With these changes in place, Qemu+NVMM works. I can install NetBSD 8.0
in a VM with multiple VCPUs, connect to the network, etc.
To generate a diff of this commit:
cvs rdiff -u -r1.6 -r1.7 src/lib/libnvmm/libnvmm.3
cvs rdiff -u -r1.9 -r1.10 src/lib/libnvmm/libnvmm_x86.c
cvs rdiff -u -r1.4 -r1.5 src/lib/libnvmm/nvmm.h
cvs rdiff -u -r1.4 -r1.5 src/sys/dev/nvmm/nvmm.c
cvs rdiff -u -r1.1 -r1.2 src/sys/dev/nvmm/nvmm.h
cvs rdiff -u -r1.2 -r1.3 src/sys/dev/nvmm/x86/nvmm_x86.h
cvs rdiff -u -r1.9 -r1.10 src/sys/dev/nvmm/x86/nvmm_x86_svm.c
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/lib/libnvmm/libnvmm.3
diff -u src/lib/libnvmm/libnvmm.3:1.6 src/lib/libnvmm/libnvmm.3:1.7
--- src/lib/libnvmm/libnvmm.3:1.6 Thu Dec 27 07:22:31 2018
+++ src/lib/libnvmm/libnvmm.3 Sun Jan 6 16:10:51 2019
@@ -1,4 +1,4 @@
-.\" $NetBSD: libnvmm.3,v 1.6 2018/12/27 07:22:31 maxv Exp $
+.\" $NetBSD: libnvmm.3,v 1.7 2019/01/06 16:10:51 maxv Exp $
.\"
.\" Copyright (c) 2018 The NetBSD Foundation, Inc.
.\" All rights reserved.
@@ -27,7 +27,7 @@
.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
.\" POSSIBILITY OF SUCH DAMAGE.
.\"
-.Dd December 26, 2018
+.Dd January 06, 2019
.Dt LIBNVMM 3
.Os
.Sh NAME
@@ -242,8 +242,6 @@ on CPU
.Fa cpuid
from machine
.Fa mach .
-.Fa cb
-will be called to handle the transaction.
See
.Sx I/O Assist
below for details.
@@ -255,8 +253,6 @@ on CPU
.Fa cpuid
from machine
.Fa mach .
-.Fa cb
-will be called to handle the transaction.
See
.Sx Mem Assist
below for details.
@@ -415,7 +411,7 @@ struct nvmm_io {
uint64_t port;
bool in;
size_t size;
- uint8_t data[8];
+ uint8_t *data;
};
.Ed
.Pp
@@ -463,7 +459,7 @@ struct nvmm_mem {
gpaddr_t gpa;
bool write;
size_t size;
- uint8_t data[8];
+ uint8_t *data;
};
.Ed
.Pp
Index: src/lib/libnvmm/libnvmm_x86.c
diff -u src/lib/libnvmm/libnvmm_x86.c:1.9 src/lib/libnvmm/libnvmm_x86.c:1.10
--- src/lib/libnvmm/libnvmm_x86.c:1.9 Fri Jan 4 10:25:39 2019
+++ src/lib/libnvmm/libnvmm_x86.c Sun Jan 6 16:10:51 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: libnvmm_x86.c,v 1.9 2019/01/04 10:25:39 maxv Exp $ */
+/* $NetBSD: libnvmm_x86.c,v 1.10 2019/01/06 16:10:51 maxv Exp $ */
/*
* Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -45,6 +45,8 @@
#include "nvmm.h"
+#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
+
#include <x86/specialreg.h>
extern struct nvmm_callbacks __callbacks;
@@ -83,6 +85,11 @@ nvmm_vcpu_dump(struct nvmm_machine *mach
(void *)state.segs[i].limit,
state.segs[i].attrib.p, state.segs[i].attrib.def32);
}
+ printf("| -> MSR_EFER=%p\n", (void *)state.msrs[NVMM_X64_MSR_EFER]);
+ printf("| -> CR0=%p\n", (void *)state.crs[NVMM_X64_CR_CR0]);
+ printf("| -> CR3=%p\n", (void *)state.crs[NVMM_X64_CR_CR3]);
+ printf("| -> CR4=%p\n", (void *)state.crs[NVMM_X64_CR_CR4]);
+ printf("| -> CR8=%p\n", (void *)state.crs[NVMM_X64_CR_CR8]);
printf("| -> CPL=%p\n", (void *)state.misc[NVMM_X64_MISC_CPL]);
return 0;
@@ -131,6 +138,7 @@ x86_gva_to_gpa_32bit(struct nvmm_machine
return -1;
if (pte & PG_PS) {
*gpa = (pte & PTE32_L2_FRAME);
+ *gpa = *gpa + (gva & PTE32_L1_MASK);
return 0;
}
@@ -215,6 +223,7 @@ x86_gva_to_gpa_32bit_pae(struct nvmm_mac
return -1;
if (pte & PG_PS) {
*gpa = (pte & PTE32_PAE_L2_FRAME);
+ *gpa = *gpa + (gva & PTE32_PAE_L1_MASK);
return 0;
}
@@ -320,6 +329,7 @@ x86_gva_to_gpa_64bit(struct nvmm_machine
return -1;
if (pte & PG_PS) {
*gpa = (pte & PTE64_L3_FRAME);
+ *gpa = *gpa + (gva & (PTE64_L2_MASK|PTE64_L1_MASK));
return 0;
}
@@ -341,6 +351,7 @@ x86_gva_to_gpa_64bit(struct nvmm_machine
return -1;
if (pte & PG_PS) {
*gpa = (pte & PTE64_L2_FRAME);
+ *gpa = *gpa + (gva & PTE64_L1_MASK);
return 0;
}
@@ -500,13 +511,34 @@ mask_from_adsize(size_t adsize)
}
static uint64_t
+rep_get_cnt(struct nvmm_x64_state *state, size_t adsize)
+{
+ uint64_t mask, cnt;
+
+ mask = mask_from_adsize(adsize);
+ cnt = state->gprs[NVMM_X64_GPR_RCX] & mask;
+
+ return cnt;
+}
+
+static void
+rep_set_cnt(struct nvmm_x64_state *state, size_t adsize, uint64_t cnt)
+{
+ uint64_t mask;
+
+ mask = mask_from_adsize(adsize);
+ state->gprs[NVMM_X64_GPR_RCX] &= ~mask;
+ state->gprs[NVMM_X64_GPR_RCX] |= cnt;
+}
+
+static uint64_t
rep_dec_apply(struct nvmm_x64_state *state, size_t adsize)
{
uint64_t mask, cnt;
mask = mask_from_adsize(adsize);
- cnt = state->gprs[NVMM_X64_GPR_RCX] & mask;
+ cnt = state->gprs[NVMM_X64_GPR_RCX] & mask;
cnt -= 1;
cnt &= mask;
@@ -521,6 +553,7 @@ read_guest_memory(struct nvmm_machine *m
gvaddr_t gva, uint8_t *data, size_t size)
{
struct nvmm_mem mem;
+ uint8_t membuf[8];
nvmm_prot_t prot;
gpaddr_t gpa;
uintptr_t hva;
@@ -547,6 +580,7 @@ read_guest_memory(struct nvmm_machine *m
is_mmio = (ret == -1);
if (is_mmio) {
+ mem.data = membuf;
mem.gva = gva;
mem.gpa = gpa;
mem.write = false;
@@ -572,6 +606,7 @@ write_guest_memory(struct nvmm_machine *
gvaddr_t gva, uint8_t *data, size_t size)
{
struct nvmm_mem mem;
+ uint8_t membuf[8];
nvmm_prot_t prot;
gpaddr_t gpa;
uintptr_t hva;
@@ -598,6 +633,7 @@ write_guest_memory(struct nvmm_machine *
is_mmio = (ret == -1);
if (is_mmio) {
+ mem.data = membuf;
mem.gva = gva;
mem.gpa = gpa;
mem.write = true;
@@ -622,16 +658,55 @@ write_guest_memory(struct nvmm_machine *
static int fetch_segment(struct nvmm_machine *, struct nvmm_x64_state *);
+#define NVMM_IO_BATCH_SIZE 32
+
+static int
+assist_io_batch(struct nvmm_machine *mach, struct nvmm_x64_state *state,
+ struct nvmm_io *io, gvaddr_t gva, uint64_t cnt)
+{
+ uint8_t iobuf[NVMM_IO_BATCH_SIZE];
+ size_t i, iosize, iocnt;
+ int ret;
+
+ cnt = MIN(cnt, NVMM_IO_BATCH_SIZE);
+ iosize = MIN(io->size * cnt, NVMM_IO_BATCH_SIZE);
+ iocnt = iosize / io->size;
+
+ io->data = iobuf;
+
+ if (!io->in) {
+ ret = read_guest_memory(mach, state, gva, iobuf, iosize);
+ if (ret == -1)
+ return -1;
+ }
+
+ for (i = 0; i < iocnt; i++) {
+ (*__callbacks.io)(io);
+ io->data += io->size;
+ }
+
+ if (io->in) {
+ ret = write_guest_memory(mach, state, gva, iobuf, iosize);
+ if (ret == -1)
+ return -1;
+ }
+
+ return iocnt;
+}
+
int
nvmm_assist_io(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
struct nvmm_exit *exit)
{
struct nvmm_x64_state state;
struct nvmm_io io;
- uint64_t cnt;
+ uint64_t cnt = 0; /* GCC */
+ uint8_t iobuf[8];
+ int iocnt = 1;
gvaddr_t gva;
int reg = 0; /* GCC */
int ret, seg;
+ bool psld = false;
if (__predict_false(exit->reason != NVMM_EXIT_IO)) {
errno = EINVAL;
@@ -641,6 +716,7 @@ nvmm_assist_io(struct nvmm_machine *mach
io.port = exit->u.io.port;
io.in = (exit->u.io.type == NVMM_EXIT_IO_IN);
io.size = exit->u.io.operand_size;
+ io.data = iobuf;
ret = nvmm_vcpu_getstate(mach, cpuid, &state,
NVMM_X64_STATE_GPRS | NVMM_X64_STATE_SEGS |
@@ -648,6 +724,17 @@ nvmm_assist_io(struct nvmm_machine *mach
if (ret == -1)
return -1;
+ if (exit->u.io.rep) {
+ cnt = rep_get_cnt(&state, exit->u.io.address_size);
+ if (__predict_false(cnt == 0)) {
+ return 0;
+ }
+ }
+
+ if (__predict_false(state.gprs[NVMM_X64_GPR_RFLAGS] & PSL_D)) {
+ psld = true;
+ }
+
/*
* Determine GVA.
*/
@@ -678,6 +765,13 @@ nvmm_assist_io(struct nvmm_machine *mach
if (ret == -1)
return -1;
}
+
+ if (exit->u.io.rep && !psld) {
+ iocnt = assist_io_batch(mach, &state, &io, gva, cnt);
+ if (iocnt == -1)
+ return -1;
+ goto done;
+ }
}
if (!io.in) {
@@ -704,16 +798,18 @@ nvmm_assist_io(struct nvmm_machine *mach
}
}
+done:
if (exit->u.io.str) {
- if (state.gprs[NVMM_X64_GPR_RFLAGS] & PSL_D) {
- state.gprs[reg] -= io.size;
+ if (__predict_false(psld)) {
+ state.gprs[reg] -= iocnt * io.size;
} else {
- state.gprs[reg] += io.size;
+ state.gprs[reg] += iocnt * io.size;
}
}
if (exit->u.io.rep) {
- cnt = rep_dec_apply(&state, exit->u.io.address_size);
+ cnt -= iocnt;
+ rep_set_cnt(&state, exit->u.io.address_size, cnt);
if (cnt == 0) {
state.gprs[NVMM_X64_GPR_RIP] = exit->u.io.npc;
}
@@ -858,6 +954,7 @@ struct x86_instr {
struct x86_rexpref rexpref;
size_t operand_size;
size_t address_size;
+ uint64_t zeroextend_mask;
struct x86_regmodrm regmodrm;
@@ -912,6 +1009,7 @@ struct x86_group_entry {
#define OPSIZE_QUAD 0x08 /* 8 bytes */
#define FLAG_z 0x02
+#define FLAG_e 0x10
static const struct x86_group_entry group11[8] = {
[0] = { .emul = x86_emul_mov }
@@ -1230,6 +1328,34 @@ static const struct x86_opcode primary_o
},
};
+static const struct x86_opcode secondary_opcode_table[] = {
+ /*
+ * MOVZX
+ */
+ {
+ /* Gv, Eb */
+ .byte = 0xB6,
+ .regmodrm = true,
+ .regtorm = false,
+ .szoverride = true,
+ .defsize = OPSIZE_BYTE,
+ .allsize = OPSIZE_WORD|OPSIZE_DOUB|OPSIZE_QUAD,
+ .flags = FLAG_e,
+ .emul = x86_emul_mov
+ },
+ {
+ /* Gv, Ew */
+ .byte = 0xB7,
+ .regmodrm = true,
+ .regtorm = false,
+ .szoverride = true,
+ .defsize = OPSIZE_WORD,
+ .allsize = OPSIZE_WORD|OPSIZE_DOUB|OPSIZE_QUAD,
+ .flags = FLAG_e,
+ .emul = x86_emul_mov
+ },
+};
+
static const struct x86_reg gpr_map__rip = { NVMM_X64_GPR_RIP, 0xFFFFFFFFFFFFFFFF };
/* [REX-present][enc][opsize] */
@@ -2059,6 +2185,67 @@ node_primary_opcode(struct x86_decode_fs
return 0;
}
+static uint64_t
+size_to_mask(size_t size)
+{
+ switch (size) {
+ case 1:
+ return 0x00000000000000FF;
+ case 2:
+ return 0x000000000000FFFF;
+ case 4:
+ return 0x00000000FFFFFFFF;
+ case 8:
+ default:
+ return 0xFFFFFFFFFFFFFFFF;
+ }
+}
+
+static int
+node_secondary_opcode(struct x86_decode_fsm *fsm, struct x86_instr *instr)
+{
+ const struct x86_opcode *opcode;
+ uint8_t byte;
+ size_t i, n;
+
+ if (fsm_read(fsm, &byte, sizeof(byte)) == -1) {
+ return -1;
+ }
+
+ n = sizeof(secondary_opcode_table) / sizeof(secondary_opcode_table[0]);
+ for (i = 0; i < n; i++) {
+ if (secondary_opcode_table[i].byte == byte)
+ break;
+ }
+ if (i == n) {
+ return -1;
+ }
+ opcode = &secondary_opcode_table[i];
+
+ instr->opcode = opcode;
+ instr->emul = opcode->emul;
+ instr->operand_size = get_operand_size(fsm, instr);
+ instr->address_size = get_address_size(fsm, instr);
+
+ if (opcode->flags & FLAG_e) {
+ /*
+ * Compute the mask for zero-extend. Update the operand size,
+ * we move fewer bytes.
+ */
+ instr->zeroextend_mask = size_to_mask(instr->operand_size);
+ instr->zeroextend_mask &= ~size_to_mask(opcode->defsize);
+ instr->operand_size = opcode->defsize;
+ }
+
+ if (opcode->regmodrm) {
+ fsm_advance(fsm, 1, node_regmodrm);
+ } else {
+ return -1;
+ }
+
+ return 0;
+}
+
static int
node_main(struct x86_decode_fsm *fsm, struct x86_instr *instr)
{
@@ -2078,7 +2265,7 @@ node_main(struct x86_decode_fsm *fsm, st
* after being introduced.
*/
if (byte == ESCAPE) {
- return -1;
+ fsm_advance(fsm, 1, node_secondary_opcode);
} else if (!instr->rexpref.present) {
if (byte == VEX_1) {
return -1;
@@ -2600,10 +2787,12 @@ assist_mem_single(struct nvmm_machine *m
struct x86_instr *instr)
{
struct nvmm_mem mem;
+ uint8_t membuf[8];
uint64_t val;
int ret;
memset(&mem, 0, sizeof(mem));
+ mem.data = membuf;
switch (instr->src.type) {
case STORE_REG:
@@ -2703,6 +2892,7 @@ assist_mem_single(struct nvmm_machine *m
val = __SHIFTIN(val, instr->dst.u.reg->mask);
state->gprs[instr->dst.u.reg->num] &= ~instr->dst.u.reg->mask;
state->gprs[instr->dst.u.reg->num] |= val;
+ state->gprs[instr->dst.u.reg->num] &= ~instr->zeroextend_mask;
}
return 0;
Index: src/lib/libnvmm/nvmm.h
diff -u src/lib/libnvmm/nvmm.h:1.4 src/lib/libnvmm/nvmm.h:1.5
--- src/lib/libnvmm/nvmm.h:1.4 Thu Dec 27 07:22:31 2018
+++ src/lib/libnvmm/nvmm.h Sun Jan 6 16:10:51 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: nvmm.h,v 1.4 2018/12/27 07:22:31 maxv Exp $ */
+/* $NetBSD: nvmm.h,v 1.5 2019/01/06 16:10:51 maxv Exp $ */
/*
* Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -50,7 +50,7 @@ struct nvmm_io {
uint64_t port;
bool in;
size_t size;
- uint8_t data[8];
+ uint8_t *data;
};
struct nvmm_mem {
@@ -58,7 +58,7 @@ struct nvmm_mem {
gpaddr_t gpa;
bool write;
size_t size;
- uint8_t data[8];
+ uint8_t *data;
};
struct nvmm_callbacks {
Index: src/sys/dev/nvmm/nvmm.c
diff -u src/sys/dev/nvmm/nvmm.c:1.4 src/sys/dev/nvmm/nvmm.c:1.5
--- src/sys/dev/nvmm/nvmm.c:1.4 Sat Dec 15 13:39:43 2018
+++ src/sys/dev/nvmm/nvmm.c Sun Jan 6 16:10:51 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: nvmm.c,v 1.4 2018/12/15 13:39:43 maxv Exp $ */
+/* $NetBSD: nvmm.c,v 1.5 2019/01/06 16:10:51 maxv Exp $ */
/*
* Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.4 2018/12/15 13:39:43 maxv Exp $");
+__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.5 2019/01/06 16:10:51 maxv Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -644,9 +644,6 @@ nvmm_hva_map(struct nvmm_ioc_hva_map *ar
seg->uobj = uao_create(seg->size, 0);
uva = seg->hva;
- /* Take a reference for the kernel. */
- uao_reference(seg->uobj);
-
/* Take a reference for the user. */
uao_reference(seg->uobj);
Index: src/sys/dev/nvmm/nvmm.h
diff -u src/sys/dev/nvmm/nvmm.h:1.1 src/sys/dev/nvmm/nvmm.h:1.2
--- src/sys/dev/nvmm/nvmm.h:1.1 Wed Nov 7 07:43:08 2018
+++ src/sys/dev/nvmm/nvmm.h Sun Jan 6 16:10:51 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: nvmm.h,v 1.1 2018/11/07 07:43:08 maxv Exp $ */
+/* $NetBSD: nvmm.h,v 1.2 2019/01/06 16:10:51 maxv Exp $ */
/*
* Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -106,12 +106,17 @@ struct nvmm_exit_msr {
uint64_t npc;
};
+struct nvmm_exit_hlt {
+ uint64_t npc;
+};
+
struct nvmm_exit {
enum nvmm_exit_reason reason;
union {
struct nvmm_exit_memory mem;
struct nvmm_exit_io io;
struct nvmm_exit_msr msr;
+ struct nvmm_exit_hlt hlt;
} u;
uint64_t exitstate[8];
};
Index: src/sys/dev/nvmm/x86/nvmm_x86.h
diff -u src/sys/dev/nvmm/x86/nvmm_x86.h:1.2 src/sys/dev/nvmm/x86/nvmm_x86.h:1.3
--- src/sys/dev/nvmm/x86/nvmm_x86.h:1.2 Sun Nov 25 14:09:57 2018
+++ src/sys/dev/nvmm/x86/nvmm_x86.h Sun Jan 6 16:10:51 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: nvmm_x86.h,v 1.2 2018/11/25 14:09:57 maxv Exp $ */
+/* $NetBSD: nvmm_x86.h,v 1.3 2019/01/06 16:10:51 maxv Exp $ */
/*
* Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -99,7 +99,10 @@
/* Misc. */
#define NVMM_X64_MISC_CPL 0
-#define NVMM_X64_NMISC 1
+#define NVMM_X64_MISC_INT_SHADOW 1
+#define NVMM_X64_MISC_INT_WINDOW_EXIT 2
+#define NVMM_X64_MISC_NMI_WINDOW_EXIT 3
+#define NVMM_X64_NMISC 4
#ifndef ASM_NVMM
@@ -123,8 +126,11 @@ struct nvmm_x64_state_seg {
};
/* VM exit state indexes. */
-#define NVMM_X64_EXITSTATE_CR8 0
-#define NVMM_X64_EXITSTATE_RFLAGS 1
+#define NVMM_X64_EXITSTATE_CR8 0
+#define NVMM_X64_EXITSTATE_RFLAGS 1
+#define NVMM_X64_EXITSTATE_INT_SHADOW 2
+#define NVMM_X64_EXITSTATE_INT_WINDOW_EXIT 3
+#define NVMM_X64_EXITSTATE_NMI_WINDOW_EXIT 4
/* Flags. */
#define NVMM_X64_STATE_SEGS 0x01
Index: src/sys/dev/nvmm/x86/nvmm_x86_svm.c
diff -u src/sys/dev/nvmm/x86/nvmm_x86_svm.c:1.9 src/sys/dev/nvmm/x86/nvmm_x86_svm.c:1.10
--- src/sys/dev/nvmm/x86/nvmm_x86_svm.c:1.9 Thu Jan 3 08:02:49 2019
+++ src/sys/dev/nvmm/x86/nvmm_x86_svm.c Sun Jan 6 16:10:51 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: nvmm_x86_svm.c,v 1.9 2019/01/03 08:02:49 maxv Exp $ */
+/* $NetBSD: nvmm_x86_svm.c,v 1.10 2019/01/06 16:10:51 maxv Exp $ */
/*
* Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: nvmm_x86_svm.c,v 1.9 2019/01/03 08:02:49 maxv Exp $");
+__KERNEL_RCSID(0, "$NetBSD: nvmm_x86_svm.c,v 1.10 2019/01/06 16:10:51 maxv Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -518,8 +518,11 @@ struct svm_cpudata {
bool ts_set;
struct xsave_header hfpu __aligned(16);
+ /* Event state */
+ bool int_window_exit;
+ bool nmi_window_exit;
+
/* Guest state */
- bool in_nmi;
uint64_t tsc_offset;
struct xsave_header gfpu __aligned(16);
};
@@ -530,26 +533,34 @@ struct svm_cpudata {
#define SVM_EVENT_TYPE_SW_INT 4
static void
-svm_event_waitexit_enable(struct vmcb *vmcb, bool nmi)
+svm_event_waitexit_enable(struct nvmm_cpu *vcpu, bool nmi)
{
+ struct svm_cpudata *cpudata = vcpu->cpudata;
+ struct vmcb *vmcb = cpudata->vmcb;
+
if (nmi) {
vmcb->ctrl.intercept_misc1 |= VMCB_CTRL_INTERCEPT_IRET;
+ cpudata->nmi_window_exit = true;
} else {
vmcb->ctrl.intercept_misc1 |= VMCB_CTRL_INTERCEPT_VINTR;
- vmcb->ctrl.v |= (VMCB_CTRL_V_IRQ |
- __SHIFTIN(0, VMCB_CTRL_V_INTR_VECTOR));
+ vmcb->ctrl.v |= (VMCB_CTRL_V_IRQ | VMCB_CTRL_V_IGN_TPR);
+ cpudata->int_window_exit = true;
}
}
static void
-svm_event_waitexit_disable(struct vmcb *vmcb, bool nmi)
+svm_event_waitexit_disable(struct nvmm_cpu *vcpu, bool nmi)
{
+ struct svm_cpudata *cpudata = vcpu->cpudata;
+ struct vmcb *vmcb = cpudata->vmcb;
+
if (nmi) {
vmcb->ctrl.intercept_misc1 &= ~VMCB_CTRL_INTERCEPT_IRET;
+ cpudata->nmi_window_exit = false;
} else {
vmcb->ctrl.intercept_misc1 &= ~VMCB_CTRL_INTERCEPT_VINTR;
- vmcb->ctrl.v &= ~(VMCB_CTRL_V_IRQ |
- __SHIFTIN(0, VMCB_CTRL_V_INTR_VECTOR));
+ vmcb->ctrl.v &= ~(VMCB_CTRL_V_IRQ | VMCB_CTRL_V_IGN_TPR);
+ cpudata->int_window_exit = false;
}
}
@@ -577,9 +588,7 @@ svm_vcpu_inject(struct nvmm_machine *mac
{
struct svm_cpudata *cpudata = vcpu->cpudata;
struct vmcb *vmcb = cpudata->vmcb;
- uint64_t rflags = vmcb->state.rflags;
int type = 0, err = 0;
- uint64_t tpr;
if (event->vector >= 256) {
return EINVAL;
@@ -592,15 +601,14 @@ svm_vcpu_inject(struct nvmm_machine *mac
type = SVM_EVENT_TYPE_NMI;
}
if (type == SVM_EVENT_TYPE_NMI) {
- if (cpudata->in_nmi) {
- svm_event_waitexit_enable(vmcb, true);
+ if (cpudata->nmi_window_exit) {
return EAGAIN;
}
- cpudata->in_nmi = true;
+ svm_event_waitexit_enable(vcpu, true);
} else {
- tpr = __SHIFTOUT(vmcb->ctrl.v, VMCB_CTRL_V_TPR);
- if ((rflags & PSL_I) == 0 || event->u.prio <= tpr) {
- svm_event_waitexit_enable(vmcb, false);
+ if (((vmcb->state.rflags & PSL_I) == 0) ||
+ ((vmcb->ctrl.intr & VMCB_CTRL_INTR_SHADOW) != 0)) {
+ svm_event_waitexit_enable(vcpu, false);
return EAGAIN;
}
}
@@ -698,6 +706,14 @@ svm_inkernel_handle_cpuid(struct nvmm_cp
state->gprs[NVMM_X64_GPR_RCX] = sizeof(struct fxsave);
state->gprs[NVMM_X64_GPR_RDX] = svm_xcr0_mask >> 32;
break;
+ case 0x40000000:
+ memcpy(&state->gprs[NVMM_X64_GPR_RBX], "___ ", 4);
+ memcpy(&state->gprs[NVMM_X64_GPR_RCX], "NVMM", 4);
+ memcpy(&state->gprs[NVMM_X64_GPR_RDX], " ___", 4);
+ break;
+ case 0x80000001: /* No SVM in ECX. The rest is tunable. */
+ state->gprs[NVMM_X64_GPR_RCX] &= ~CPUID_SVM;
+ break;
default:
break;
}
@@ -760,6 +776,16 @@ svm_exit_cpuid(struct nvmm_machine *mach
exit->reason = NVMM_EXIT_NONE;
}
+static void
+svm_exit_hlt(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+ struct nvmm_exit *exit)
+{
+ struct svm_cpudata *cpudata = vcpu->cpudata;
+
+ exit->reason = NVMM_EXIT_HLT;
+ exit->u.hlt.npc = cpudata->vmcb->ctrl.nrip;
+}
+
#define SVM_EXIT_IO_PORT __BITS(31,16)
#define SVM_EXIT_IO_SEG __BITS(12,10)
#define SVM_EXIT_IO_A64 __BIT(9)
@@ -827,20 +853,42 @@ svm_exit_io(struct nvmm_machine *mach, s
exit->u.io.npc = nextpc;
}
+static const uint64_t msr_ignore_list[] = {
+ 0xc0010055, /* MSR_CMPHALT */
+ MSR_DE_CFG,
+ MSR_IC_CFG,
+ MSR_UCODE_AMD_PATCHLEVEL
+};
+
static bool
svm_inkernel_handle_msr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
struct nvmm_exit *exit)
{
struct svm_cpudata *cpudata = vcpu->cpudata;
struct nvmm_x64_state *state = &cpudata->state;
- uint64_t pat;
+ uint64_t val;
+ size_t i;
switch (exit->u.msr.type) {
case NVMM_EXIT_MSR_RDMSR:
if (exit->u.msr.msr == MSR_CR_PAT) {
- pat = cpudata->vmcb->state.g_pat;
- cpudata->vmcb->state.rax = (pat & 0xFFFFFFFF);
- state->gprs[NVMM_X64_GPR_RDX] = (pat >> 32);
+ val = cpudata->vmcb->state.g_pat;
+ cpudata->vmcb->state.rax = (val & 0xFFFFFFFF);
+ state->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
+ goto handled;
+ }
+ if (exit->u.msr.msr == MSR_NB_CFG) {
+ val = NB_CFG_INITAPICCPUIDLO;
+ cpudata->vmcb->state.rax = (val & 0xFFFFFFFF);
+ state->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
+ goto handled;
+ }
+ for (i = 0; i < __arraycount(msr_ignore_list); i++) {
+ if (msr_ignore_list[i] != exit->u.msr.msr)
+ continue;
+ val = 0;
+ cpudata->vmcb->state.rax = (val & 0xFFFFFFFF);
+ state->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
goto handled;
}
break;
@@ -861,6 +909,11 @@ svm_inkernel_handle_msr(struct nvmm_mach
cpudata->vmcb->state.g_pat = exit->u.msr.val;
goto handled;
}
+ for (i = 0; i < __arraycount(msr_ignore_list); i++) {
+ if (msr_ignore_list[i] != exit->u.msr.msr)
+ continue;
+ goto handled;
+ }
break;
}
@@ -1128,19 +1181,18 @@ svm_vcpu_run(struct nvmm_machine *mach,
exit->reason = NVMM_EXIT_NONE;
break;
case VMCB_EXITCODE_VINTR:
- svm_event_waitexit_disable(vmcb, false);
+ svm_event_waitexit_disable(vcpu, false);
exit->reason = NVMM_EXIT_INT_READY;
break;
case VMCB_EXITCODE_IRET:
- svm_event_waitexit_disable(vmcb, true);
- cpudata->in_nmi = false;
+ svm_event_waitexit_disable(vcpu, true);
exit->reason = NVMM_EXIT_NMI_READY;
break;
case VMCB_EXITCODE_CPUID:
svm_exit_cpuid(mach, vcpu, exit);
break;
case VMCB_EXITCODE_HLT:
- exit->reason = NVMM_EXIT_HLT;
+ svm_exit_hlt(mach, vcpu, exit);
break;
case VMCB_EXITCODE_IOIO:
svm_exit_io(mach, vcpu, exit);
@@ -1186,10 +1238,20 @@ svm_vcpu_run(struct nvmm_machine *mach,
break;
}
+ if (vmcb->ctrl.exitintinfo & VMCB_CTRL_EXITINTINFO_V) {
+ printf("WAS PROCESSING!\n");
+ }
+
/* If no reason to return to userland, keep rolling. */
if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) {
break;
}
+ if (curcpu()->ci_data.cpu_softints != 0) {
+ break;
+ }
+ if (curlwp->l_flag & LW_USERRET) {
+ break;
+ }
if (exit->reason != NVMM_EXIT_NONE) {
break;
}
@@ -1204,6 +1266,13 @@ svm_vcpu_run(struct nvmm_machine *mach,
VMCB_CTRL_V_TPR);
exit->exitstate[NVMM_X64_EXITSTATE_RFLAGS] = vmcb->state.rflags;
+ exit->exitstate[NVMM_X64_EXITSTATE_INT_SHADOW] =
+ ((vmcb->ctrl.intr & VMCB_CTRL_INTR_SHADOW) != 0);
+ exit->exitstate[NVMM_X64_EXITSTATE_INT_WINDOW_EXIT] =
+ cpudata->int_window_exit;
+ exit->exitstate[NVMM_X64_EXITSTATE_NMI_WINDOW_EXIT] =
+ cpudata->nmi_window_exit;
+
return 0;
}
@@ -1437,6 +1506,7 @@ svm_vcpu_init(struct nvmm_machine *mach,
* - SYSENTER_EIP [read, write]
* - FSBASE [read, write]
* - GSBASE [read, write]
+ * - TSC [read]
*
* Intercept the rest.
*/
@@ -1452,6 +1522,7 @@ svm_vcpu_init(struct nvmm_machine *mach,
svm_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_EIP, true, true);
svm_vcpu_msr_allow(cpudata->msrbm, MSR_FSBASE, true, true);
svm_vcpu_msr_allow(cpudata->msrbm, MSR_GSBASE, true, true);
+ svm_vcpu_msr_allow(cpudata->msrbm, MSR_TSC, true, false);
vmcb->ctrl.msrpm_base_pa = cpudata->msrbm_pa;
/* Generate ASID. */
@@ -1712,6 +1783,24 @@ svm_vcpu_setstate(struct nvmm_cpu *vcpu,
memcpy(cstate->misc, nstate->misc, sizeof(nstate->misc));
vmcb->state.cpl = cstate->misc[NVMM_X64_MISC_CPL];
+
+ if (cstate->misc[NVMM_X64_MISC_INT_SHADOW]) {
+ vmcb->ctrl.intr |= VMCB_CTRL_INTR_SHADOW;
+ } else {
+ vmcb->ctrl.intr &= ~VMCB_CTRL_INTR_SHADOW;
+ }
+
+ if (cstate->misc[NVMM_X64_MISC_INT_WINDOW_EXIT]) {
+ svm_event_waitexit_enable(vcpu, false);
+ } else {
+ svm_event_waitexit_disable(vcpu, false);
+ }
+
+ if (cstate->misc[NVMM_X64_MISC_NMI_WINDOW_EXIT]) {
+ svm_event_waitexit_enable(vcpu, true);
+ } else {
+ svm_event_waitexit_disable(vcpu, true);
+ }
}
CTASSERT(sizeof(cpudata->gfpu.xsh_fxsave) == sizeof(cstate->fpu));
@@ -1812,6 +1901,13 @@ svm_vcpu_getstate(struct nvmm_cpu *vcpu,
if (flags & NVMM_X64_STATE_MISC) {
cstate->misc[NVMM_X64_MISC_CPL] = vmcb->state.cpl;
+ cstate->misc[NVMM_X64_MISC_INT_SHADOW] =
+ (vmcb->ctrl.intr & VMCB_CTRL_INTR_SHADOW) != 0;
+ cstate->misc[NVMM_X64_MISC_INT_WINDOW_EXIT] =
+ cpudata->int_window_exit;
+ cstate->misc[NVMM_X64_MISC_NMI_WINDOW_EXIT] =
+ cpudata->nmi_window_exit;
+
memcpy(nstate->misc, cstate->misc, sizeof(cstate->misc));
}