Author: neel
Date: Wed Jul 16 21:26:26 2014
New Revision: 268777
URL: http://svnweb.freebsd.org/changeset/base/268777

Log:
  Add emulation for legacy x86 task switching mechanism.
  
  FreeBSD/i386 uses task switching to handle double fault exceptions and this
  change enables that to work.
  
  Reported by:  glebius

Added:
  head/usr.sbin/bhyve/task_switch.c   (contents, props changed)
Modified:
  head/sys/amd64/include/vmm.h
  head/sys/amd64/vmm/intel/vmcs.c
  head/sys/amd64/vmm/intel/vmcs.h
  head/sys/amd64/vmm/intel/vmx.c
  head/usr.sbin/bhyve/Makefile
  head/usr.sbin/bhyve/bhyverun.c
  head/usr.sbin/bhyve/bhyverun.h

Modified: head/sys/amd64/include/vmm.h
==============================================================================
--- head/sys/amd64/include/vmm.h        Wed Jul 16 21:06:43 2014        (r268776)
+++ head/sys/amd64/include/vmm.h        Wed Jul 16 21:26:26 2014        (r268777)
@@ -75,6 +75,10 @@ enum vm_reg_name {
        VM_REG_GUEST_GDTR,
        VM_REG_GUEST_EFER,
        VM_REG_GUEST_CR2,
+       VM_REG_GUEST_PDPTE0,
+       VM_REG_GUEST_PDPTE1,
+       VM_REG_GUEST_PDPTE2,
+       VM_REG_GUEST_PDPTE3,
        VM_REG_LAST
 };
 
@@ -323,6 +327,7 @@ struct seg_desc {
        uint32_t        access;
 };
 #define        SEG_DESC_TYPE(access)           ((access) & 0x001f)
+#define        SEG_DESC_DPL(access)            (((access) >> 5) & 0x3)
 #define        SEG_DESC_PRESENT(access)        (((access) & 0x0080) ? 1 : 0)
 #define        SEG_DESC_DEF32(access)          (((access) & 0x4000) ? 1 : 0)
 #define        SEG_DESC_GRANULARITY(access)    (((access) & 0x8000) ? 1 : 0)
@@ -415,6 +420,7 @@ enum vm_exitcode {
        VM_EXITCODE_IOAPIC_EOI,
        VM_EXITCODE_SUSPENDED,
        VM_EXITCODE_INOUT_STR,
+       VM_EXITCODE_TASK_SWITCH,
        VM_EXITCODE_MAX
 };
 
@@ -439,6 +445,22 @@ struct vm_inout_str {
        struct seg_desc seg_desc;
 };
 
+enum task_switch_reason {
+       TSR_CALL,
+       TSR_IRET,
+       TSR_JMP,
+       TSR_IDT_GATE,   /* task gate in IDT */
+};
+
+struct vm_task_switch {
+       uint16_t        tsssel;         /* new TSS selector */
+       int             ext;            /* task switch due to external event */
+       uint32_t        errcode;
+       int             errcode_valid;  /* push 'errcode' on the new stack */
+       enum task_switch_reason reason;
+       struct vm_guest_paging paging;
+};
+
 struct vm_exit {
        enum vm_exitcode        exitcode;
        int                     inst_length;    /* 0 means unknown */
@@ -493,6 +515,7 @@ struct vm_exit {
                struct {
                        enum vm_suspend_how how;
                } suspended;
+               struct vm_task_switch task_switch;
        } u;
 };
 

Modified: head/sys/amd64/vmm/intel/vmcs.c
==============================================================================
--- head/sys/amd64/vmm/intel/vmcs.c     Wed Jul 16 21:06:43 2014        (r268776)
+++ head/sys/amd64/vmm/intel/vmcs.c     Wed Jul 16 21:26:26 2014        (r268777)
@@ -103,6 +103,14 @@ vmcs_field_encoding(int ident)
                return (VMCS_GUEST_LDTR_SELECTOR);
        case VM_REG_GUEST_EFER:
                return (VMCS_GUEST_IA32_EFER);
+       case VM_REG_GUEST_PDPTE0:
+               return (VMCS_GUEST_PDPTE0);
+       case VM_REG_GUEST_PDPTE1:
+               return (VMCS_GUEST_PDPTE1);
+       case VM_REG_GUEST_PDPTE2:
+               return (VMCS_GUEST_PDPTE2);
+       case VM_REG_GUEST_PDPTE3:
+               return (VMCS_GUEST_PDPTE3);
        default:
                return (-1);
        }

Modified: head/sys/amd64/vmm/intel/vmcs.h
==============================================================================
--- head/sys/amd64/vmm/intel/vmcs.h     Wed Jul 16 21:06:43 2014        (r268776)
+++ head/sys/amd64/vmm/intel/vmcs.h     Wed Jul 16 21:26:26 2014        (r268777)
@@ -346,6 +346,9 @@ vmcs_write(uint32_t encoding, uint64_t v
 #define        VMCS_INTR_T_HWINTR      (0 << 8)
 #define        VMCS_INTR_T_NMI         (2 << 8)
 #define        VMCS_INTR_T_HWEXCEPTION (3 << 8)
+#define        VMCS_INTR_T_SWINTR      (4 << 8)
+#define        VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8)
+#define        VMCS_INTR_T_SWEXCEPTION (6 << 8)
 #define        VMCS_INTR_DEL_ERRCODE   (1 << 11)
 
 /*

Modified: head/sys/amd64/vmm/intel/vmx.c
==============================================================================
--- head/sys/amd64/vmm/intel/vmx.c      Wed Jul 16 21:06:43 2014        (r268776)
+++ head/sys/amd64/vmm/intel/vmx.c      Wed Jul 16 21:26:26 2014        (r268777)
@@ -2020,6 +2020,26 @@ vmx_handle_apic_access(struct vmx *vmx, 
        return (UNHANDLED);
 }
 
+static enum task_switch_reason
+vmx_task_switch_reason(uint64_t qual)
+{
+       int reason;
+
+       reason = (qual >> 30) & 0x3;
+       switch (reason) {
+       case 0:
+               return (TSR_CALL);
+       case 1:
+               return (TSR_IRET);
+       case 2:
+               return (TSR_JMP);
+       case 3:
+               return (TSR_IDT_GATE);
+       default:
+               panic("%s: invalid reason %d", __func__, reason);
+       }
+}
+
 static int
 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 {
@@ -2027,8 +2047,9 @@ vmx_exit_process(struct vmx *vmx, int vc
        struct vmxctx *vmxctx;
        struct vlapic *vlapic;
        struct vm_inout_str *vis;
+       struct vm_task_switch *ts;
        uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
-       uint32_t reason;
+       uint32_t intr_type, reason;
        uint64_t qual, gpa;
        bool retu;
 
@@ -2045,9 +2066,13 @@ vmx_exit_process(struct vmx *vmx, int vc
        vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
 
        /*
-        * VM exits that could be triggered during event injection on the
-        * previous VM entry need to be handled specially by re-injecting
-        * the event.
+        * VM exits that can be triggered during event delivery need to
+        * be handled specially by re-injecting the event if the IDT
+        * vectoring information field's valid bit is set.
+        *
+        * If the VM-exit is due to a task gate in the IDT then we don't
+        * reinject the event because emulating the task switch also
+        * completes the event delivery.
         *
         * See "Information for VM Exits During Event Delivery" in Intel SDM
         * for details.
@@ -2059,7 +2084,10 @@ vmx_exit_process(struct vmx *vmx, int vc
        case EXIT_REASON_TASK_SWITCH:
        case EXIT_REASON_EXCEPTION:
                idtvec_info = vmcs_idt_vectoring_info();
-               if (idtvec_info & VMCS_IDT_VEC_VALID) {
+               VCPU_CTR2(vmx->vm, vcpu, "vm exit %s: idtvec_info 0x%08x",
+                   exit_reason_to_str(reason), idtvec_info);   
+               if ((idtvec_info & VMCS_IDT_VEC_VALID) &&
+                   (reason != EXIT_REASON_TASK_SWITCH)) {
                        idtvec_info &= ~(1 << 12); /* clear undefined bit */
                        vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info);
                        if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
@@ -2079,12 +2107,56 @@ vmx_exit_process(struct vmx *vmx, int vc
                        }
                        vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
                }
+               break;
        default:
                idtvec_info = 0;
                break;
        }
 
        switch (reason) {
+       case EXIT_REASON_TASK_SWITCH:
+               ts = &vmexit->u.task_switch;
+               ts->tsssel = qual & 0xffff;
+               ts->reason = vmx_task_switch_reason(qual);
+               ts->ext = 0;
+               ts->errcode_valid = 0;
+               vmx_paging_info(&ts->paging);
+               /*
+                * If the task switch was due to a CALL, JMP, IRET, software
+                * interrupt (INT n) or software exception (INT3, INTO),
+                * then the saved %rip references the instruction that caused
+                * the task switch. The instruction length field in the VMCS
+                * is valid in this case.
+                *
+                * In all other cases (e.g., NMI, hardware exception) the
+                * saved %rip is one that would have been saved in the old TSS
+                * had the task switch completed normally so the instruction
+                * length field is not needed in this case and is explicitly
+                * set to 0.
+                */
+               if (ts->reason == TSR_IDT_GATE) {
+                       KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
+                           ("invalid idtvec_info %x for IDT task switch",
+                           idtvec_info));
+                       intr_type = idtvec_info & VMCS_INTR_T_MASK;
+                       if (intr_type != VMCS_INTR_T_SWINTR &&
+                           intr_type != VMCS_INTR_T_SWEXCEPTION &&
+                           intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
+                               /* Task switch triggered by external event */
+                               ts->ext = 1;
+                               vmexit->inst_length = 0;
+                               if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
+                                       ts->errcode_valid = 1;
+                                       ts->errcode = vmcs_idt_vectoring_err();
+                               }
+                       }
+               }
+               vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
+               VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, "
+                   "%s errcode 0x%016lx", ts->reason, ts->tsssel,
+                   ts->ext ? "external" : "internal",
+                   ((uint64_t)ts->errcode << 32) | ts->errcode_valid);
+               break;
        case EXIT_REASON_CR_ACCESS:
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
                switch (qual & 0xf) {

Modified: head/usr.sbin/bhyve/Makefile
==============================================================================
--- head/usr.sbin/bhyve/Makefile        Wed Jul 16 21:06:43 2014        (r268776)
+++ head/usr.sbin/bhyve/Makefile        Wed Jul 16 21:26:26 2014        (r268777)
@@ -35,6 +35,7 @@ SRCS= \
        post.c                  \
        rtc.c                   \
        smbiostbl.c             \
+       task_switch.c           \
        uart_emul.c             \
        virtio.c                \
        xmsr.c                  \

Modified: head/usr.sbin/bhyve/bhyverun.c
==============================================================================
--- head/usr.sbin/bhyve/bhyverun.c      Wed Jul 16 21:06:43 2014        (r268776)
+++ head/usr.sbin/bhyve/bhyverun.c      Wed Jul 16 21:26:26 2014        (r268777)
@@ -69,16 +69,11 @@ __FBSDID("$FreeBSD$");
 
 #define GUEST_NIO_PORT         0x488   /* guest upcalls via i/o port */
 
-#define        VMEXIT_CONTINUE         1       /* continue from next instruction */
-#define        VMEXIT_RESTART          2       /* restart current instruction */
-#define        VMEXIT_ABORT            3       /* abort the vm run loop */
-#define        VMEXIT_RESET            4       /* guest machine has reset */
-#define        VMEXIT_POWEROFF         5       /* guest machine has powered off */
-
 #define MB             (1024UL * 1024)
 #define GB             (1024UL * MB)
 
 typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
+extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);
 
 char *vmname;
 
@@ -556,7 +551,8 @@ static vmexit_handler_t handler[VM_EXITC
        [VM_EXITCODE_MTRAP]  = vmexit_mtrap,
        [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
        [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
-       [VM_EXITCODE_SUSPENDED] = vmexit_suspend
+       [VM_EXITCODE_SUSPENDED] = vmexit_suspend,
+       [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
 };
 
 static void

Modified: head/usr.sbin/bhyve/bhyverun.h
==============================================================================
--- head/usr.sbin/bhyve/bhyverun.h      Wed Jul 16 21:06:43 2014        (r268776)
+++ head/usr.sbin/bhyve/bhyverun.h      Wed Jul 16 21:26:26 2014        (r268777)
@@ -35,6 +35,12 @@
 #define        __CTASSERT(x, y)        typedef char __assert ## y[(x) ? 1 : -1]
 #endif
 
+#define        VMEXIT_CONTINUE         1       /* continue from next instruction */
+#define        VMEXIT_RESTART          2       /* restart current instruction */
+#define        VMEXIT_ABORT            3       /* abort the vm run loop */
+#define        VMEXIT_RESET            4       /* guest machine has reset */
+#define        VMEXIT_POWEROFF         5       /* guest machine has powered off */
+
 struct vmctx;
 extern int guest_ncpus;
 extern char *guest_uuid_str;

Added: head/usr.sbin/bhyve/task_switch.c
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/usr.sbin/bhyve/task_switch.c   Wed Jul 16 21:26:26 2014        (r268777)
@@ -0,0 +1,916 @@
+/*-
+ * Copyright (c) 2014 Neel Natu <n...@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/_iovec.h>
+#include <sys/mman.h>
+
+#include <x86/psl.h>
+#include <x86/segments.h>
+#include <x86/specialreg.h>
+#include <machine/vmm.h>
+#include <machine/vmm_instruction_emul.h>
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <errno.h>
+
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+
+/*
+ * Various functions in this file use 0 to denote success and VMEXIT_ABORT
+ * or VMEXIT_RESTART to denote failure. This assumes that the VMEXIT_xyz
+ * macros expand to non-zero values. Enforce this with a compile-time
+ * assertion.
+ */
+CTASSERT(VMEXIT_ABORT != 0);
+CTASSERT(VMEXIT_RESTART != 0);
+
+/*
+ * Using 'struct i386tss' is tempting but causes myriad sign extension
+ * issues because all of its fields are defined as signed integers.
+ */
+struct tss32 {
+       uint16_t        tss_link;
+       uint16_t        rsvd1;
+       uint32_t        tss_esp0;
+       uint16_t        tss_ss0;
+       uint16_t        rsvd2;
+       uint32_t        tss_esp1;
+       uint16_t        tss_ss1;
+       uint16_t        rsvd3;
+       uint32_t        tss_esp2;
+       uint16_t        tss_ss2;
+       uint16_t        rsvd4;
+       uint32_t        tss_cr3;
+       uint32_t        tss_eip;
+       uint32_t        tss_eflags;
+       uint32_t        tss_eax;
+       uint32_t        tss_ecx;
+       uint32_t        tss_edx;
+       uint32_t        tss_ebx;
+       uint32_t        tss_esp;
+       uint32_t        tss_ebp;
+       uint32_t        tss_esi;
+       uint32_t        tss_edi;
+       uint16_t        tss_es;
+       uint16_t        rsvd5;
+       uint16_t        tss_cs;
+       uint16_t        rsvd6;
+       uint16_t        tss_ss;
+       uint16_t        rsvd7;
+       uint16_t        tss_ds;
+       uint16_t        rsvd8;
+       uint16_t        tss_fs;
+       uint16_t        rsvd9;
+       uint16_t        tss_gs;
+       uint16_t        rsvd10;
+       uint16_t        tss_ldt;
+       uint16_t        rsvd11;
+       uint16_t        tss_trap;
+       uint16_t        tss_iomap;
+};
+CTASSERT(sizeof(struct tss32) == 104);
+
+#define        SEL_START(sel)  (((sel) & ~0x7))
+#define        SEL_LIMIT(sel)  (((sel) | 0x7))
+#define        TSS_BUSY(type)  (((type) & 0x2) != 0)
+
+static uint64_t
+GETREG(struct vmctx *ctx, int vcpu, int reg)
+{
+       uint64_t val;
+       int error;
+
+       error = vm_get_register(ctx, vcpu, reg, &val);
+       assert(error == 0);
+       return (val);
+}
+
+static void
+SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
+{
+       int error;
+
+       error = vm_set_register(ctx, vcpu, reg, val);
+       assert(error == 0);
+}
+
+static struct seg_desc
+usd_to_seg_desc(struct user_segment_descriptor *usd)
+{
+       struct seg_desc seg_desc;
+
+       seg_desc.base = (u_int)USD_GETBASE(usd);
+       if (usd->sd_gran)
+               seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
+       else
+               seg_desc.limit = (u_int)USD_GETLIMIT(usd);
+       seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
+       seg_desc.access |= usd->sd_xx << 12;
+       seg_desc.access |= usd->sd_def32 << 14;
+       seg_desc.access |= usd->sd_gran << 15;
+
+       return (seg_desc);
+}
+
+/*
+ * Inject an exception with an error code that is a segment selector.
+ * The format of the error code is described in section 6.13, "Error Code",
+ * Intel SDM volume 3.
+ *
+ * Bit 0 (EXT) denotes whether the exception occurred during delivery
+ * of an external event like an interrupt.
+ *
+ * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
+ * in the IDT.
+ *
+ * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
+ */
+static void
+sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
+{
+       int error;
+
+       /*
+        * Bit 2 from the selector is retained as-is in the error code.
+        *
+        * Bit 1 can be safely cleared because none of the selectors
+        * encountered during task switch emulation refer to a task
+        * gate in the IDT.
+        *
+        * Bit 0 is set depending on the value of 'ext'.
+        */
+       sel &= ~0x3;
+       if (ext)
+               sel |= 0x1;
+       error = vm_inject_exception2(ctx, vcpu, vector, sel);
+       assert(error == 0);
+}
+
+static int
+desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
+{
+       uint64_t base;
+       uint32_t limit, access;
+       int error, reg;
+
+       reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
+       error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
+       assert(error == 0);
+
+       if (reg == VM_REG_GUEST_LDTR) {
+               if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
+                       return (-1);
+       }
+
+       if (limit < SEL_LIMIT(sel))
+               return (-1);
+       else
+               return (0);
+}
+
+static int
+desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+    uint16_t sel, struct user_segment_descriptor *desc, bool doread)
+{
+       struct iovec iov[2];
+       uint64_t base;
+       uint32_t limit, access;
+       int error, reg;
+
+       reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
+       error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
+       assert(error == 0);
+       assert(limit >= SEL_LIMIT(sel));
+
+       error = vm_gla2gpa(ctx, vcpu, paging, base + SEL_START(sel),
+           sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov));
+       if (error == 0) {
+               if (doread)
+                       vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
+               else
+                       vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
+       }
+       return (error);
+}
+
+static int
+desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+    uint16_t sel, struct user_segment_descriptor *desc)
+{
+       return (desc_table_rw(ctx, vcpu, paging, sel, desc, true));
+}
+
+static int
+desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+    uint16_t sel, struct user_segment_descriptor *desc)
+{
+       return (desc_table_rw(ctx, vcpu, paging, sel, desc, false));
+}
+
+static int
+read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
+    uint16_t sel, struct user_segment_descriptor *desc)
+{
+       struct vm_guest_paging sup_paging;
+       int error;
+
+       assert(!ISLDT(sel));
+       assert(IDXSEL(sel) != 0);
+
+       /* Fetch the new TSS descriptor */
+       if (desc_table_limit_check(ctx, vcpu, sel)) {
+               if (ts->reason == TSR_IRET)
+                       sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+               else
+                       sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
+               return (VMEXIT_RESTART);
+       }
+
+       sup_paging = ts->paging;
+       sup_paging.cpl = 0;             /* implicit supervisor mode */
+       error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc);
+       if (error < 0)
+               return (VMEXIT_ABORT);
+       else if (error > 0)
+               return (VMEXIT_RESTART);
+       else
+               return (0);
+}
+
+static bool
+code_desc(int sd_type)
+{
+       /* code descriptor */
+       return ((sd_type & 0x18) == 0x18);
+}
+
+static bool
+stack_desc(int sd_type)
+{
+       /* writable data descriptor */
+       return ((sd_type & 0x1A) == 0x12);
+}
+
+static bool
+data_desc(int sd_type)
+{
+       /* data descriptor or a readable code descriptor */
+       return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
+}
+
+static bool
+ldt_desc(int sd_type)
+{
+
+       return (sd_type == SDT_SYSLDT);
+}
+
+static int
+validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
+    int segment, struct seg_desc *seg_desc)
+{
+       struct vm_guest_paging sup_paging;
+       struct user_segment_descriptor usd;
+       int error, idtvec;
+       int cpl, dpl, rpl;
+       uint16_t sel, cs;
+       bool ldtseg, codeseg, stackseg, dataseg, conforming;
+
+       ldtseg = codeseg = stackseg = dataseg = false;
+       switch (segment) {
+       case VM_REG_GUEST_LDTR:
+               ldtseg = true;
+               break;
+       case VM_REG_GUEST_CS:
+               codeseg = true;
+               break;
+       case VM_REG_GUEST_SS:
+               stackseg = true;
+               break;
+       case VM_REG_GUEST_DS:
+       case VM_REG_GUEST_ES:
+       case VM_REG_GUEST_FS:
+       case VM_REG_GUEST_GS:
+               dataseg = true;
+               break;
+       default:
+               assert(0);
+       }
+
+       /* Get the segment selector */
+       sel = GETREG(ctx, vcpu, segment);
+
+       /* LDT selector must point into the GDT */
+       if (ldtseg && ISLDT(sel)) {
+               sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+               return (VMEXIT_RESTART);
+       }
+
+       /* Descriptor table limit check */
+       if (desc_table_limit_check(ctx, vcpu, sel)) {
+               sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+               return (VMEXIT_RESTART);
+       }
+
+       /* NULL selector */
+       if (IDXSEL(sel) == 0) {
+               /* Code and stack segment selectors cannot be NULL */
+               if (codeseg || stackseg) {
+                       sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+                       return (VMEXIT_RESTART);
+               }
+               seg_desc->base = 0;
+               seg_desc->limit = 0;
+               seg_desc->access = 0x10000;     /* unusable */
+               return (0);
+       }
+
+       /* Read the descriptor from the GDT/LDT */
+       sup_paging = ts->paging;
+       sup_paging.cpl = 0;     /* implicit supervisor mode */
+       error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd);
+       if (error < 0)
+               return (VMEXIT_ABORT);
+       else if (error > 0)
+               return (VMEXIT_RESTART);
+
+       /* Verify that the descriptor type is compatible with the segment */
+       if ((ldtseg && !ldt_desc(usd.sd_type)) ||
+           (codeseg && !code_desc(usd.sd_type)) ||
+           (dataseg && !data_desc(usd.sd_type)) ||
+           (stackseg && !stack_desc(usd.sd_type))) {
+               sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+               return (VMEXIT_RESTART);
+       }
+
+       /* Segment must be marked present */
+       if (!usd.sd_p) {
+               if (ldtseg)
+                       idtvec = IDT_TS;
+               else if (stackseg)
+                       idtvec = IDT_SS;
+               else
+                       idtvec = IDT_NP;
+               sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
+               return (VMEXIT_RESTART);
+       }
+
+       cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
+       cpl = cs & SEL_RPL_MASK;
+       rpl = sel & SEL_RPL_MASK;
+       dpl = usd.sd_dpl;
+
+       if (stackseg && (rpl != cpl || dpl != cpl)) {
+               sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+               return (VMEXIT_RESTART);
+       }
+
+       if (codeseg) {
+               conforming = (usd.sd_type & 0x4) ? true : false;
+               if ((conforming && (cpl < dpl)) ||
+                   (!conforming && (cpl != dpl))) {
+                       sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+                       return (VMEXIT_RESTART);
+               }
+       }
+
+       if (dataseg) {
+               /*
+                * A data segment is always non-conforming except when its
+                * descriptor is a readable, conforming code segment.
+                */
+               if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
+                       conforming = true;
+               else
+                       conforming = false;
+
+               if (!conforming && (rpl > dpl || cpl > dpl)) {
+                       sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+                       return (VMEXIT_RESTART);
+               }
+       }
+       *seg_desc = usd_to_seg_desc(&usd);
+       return (0);
+}
+
+static void
+tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
+    uint32_t eip, struct tss32 *tss, struct iovec *iov)
+{
+
+       /* General purpose registers */
+       tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
+       tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
+       tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
+       tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
+       tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
+       tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
+       tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
+       tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);
+
+       /* Segment selectors */
+       tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
+       tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
+       tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
+       tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
+       tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
+       tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);
+
+       /* eflags and eip */
+       tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
+       if (task_switch->reason == TSR_IRET)
+               tss->tss_eflags &= ~PSL_NT;
+       tss->tss_eip = eip;
+
+       /* Copy updated old TSS into guest memory */
+       vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
+}
+
+static void
+update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
+{
+       int error;
+
+       error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
+       assert(error == 0);
+}
+
+static int
+tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
+    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov)
+{
+       struct seg_desc seg_desc, seg_desc2;
+       uint64_t *pdpte, maxphyaddr, reserved;
+       uint32_t eflags;
+       int error, i;
+       bool nested;
+
+       nested = false;
+       if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
+               tss->tss_link = ot_sel;
+               nested = true;
+       }
+
+       eflags = tss->tss_eflags;
+       if (nested)
+               eflags |= PSL_NT;
+
+       /* LDTR */
+       SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);
+
+       /* PBDR */
+       if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
+               if (ts->paging.paging_mode == PAGING_MODE_PAE) {
+                       /*
+                        * XXX Assuming 36-bit MAXPHYADDR.
+                        */
+                       maxphyaddr = (1UL << 36) - 1;
+                       pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
+                       for (i = 0; i < 4; i++) {
+                               /* Check reserved bits if the PDPTE is valid */
+                               if (!(pdpte[i] & 0x1))
+                                       continue;
+                               /*
+                                * Bits 2:1, 8:5 and bits above the processor's
+                                * maximum physical address are reserved.
+                                */
+                               reserved = ~maxphyaddr | 0x1E6;
+                               if (pdpte[i] & reserved) {
+                                       error = vm_inject_exception2(ctx, vcpu,
+                                           IDT_GP, 0);
+                                       assert(error == 0);
+                                       return (VMEXIT_RESTART);
+                               }
+                       }
+                       SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
+                       SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
+                       SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
+                       SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
+               }
+               SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
+               ts->paging.cr3 = tss->tss_cr3;
+       }
+
+       /* eflags and eip */
+       SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
+       SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);
+
+       /* General purpose registers */
+       SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
+       SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
+       SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
+       SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
+       SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
+       SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
+       SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
+       SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);
+
+       /* Segment selectors */
+       SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
+       SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
+       SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
+       SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
+       SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
+       SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);
+
+       /*
+        * If this is a nested task then write out the new TSS to update
+        * the previous link field.
+        */
+       if (nested)
+               vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));
+
+       /* Validate segment descriptors */
+       error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc);
+       if (error)
+               return (error);
+       update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);
+
+       /*
+        * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
+        *
+        * The SS and CS attribute checks on VM-entry are inter-dependent so
+        * we need to make sure that both segments are valid before updating
+        * either of them. This ensures that the VMCS state can pass the
+        * VM-entry checks so the guest can handle any exception injected
+        * during task switch emulation.
+        */
+       error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc);
+       if (error)
+               return (error);
+       error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2);
+       if (error)
+               return (error);
+       update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
+       update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
+       ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;
+
+       error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc);
+       if (error)
+               return (error);
+       update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);
+
+       error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc);
+       if (error)
+               return (error);
+       update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);
+
+       error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc);
+       if (error)
+               return (error);
+       update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);
+
+       error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc);
+       if (error)
+               return (error);
+       update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);
+
+       return (0);
+}
+
+/*
+ * Push 'errcode' on the stack of the task being switched to.
+ *
+ * Returns 0 on success.  If an exception was injected into the guest
+ * instead (#SS on a stack-limit violation, #AC on an alignment fault,
+ * or a fault raised inside vm_gla2gpa()) the return value is
+ * VMEXIT_RESTART so the caller re-enters the guest to deliver it;
+ * VMEXIT_ABORT is returned on an unrecoverable translation error.
+ */
+static int
+push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+    int task_type, uint32_t errcode)
+{
+       struct iovec iov[2];
+       struct seg_desc seg_desc;
+       int stacksize, bytes, error;
+       uint64_t gla, cr0, rflags;
+       uint32_t esp;
+       uint16_t stacksel;
+
+       cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
+       rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
+       stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
+
+       error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
+           &seg_desc.limit, &seg_desc.access);
+       assert(error == 0);
+
+       /*
+        * Section "Error Code" in the Intel SDM vol 3: the error code is
+        * pushed on the stack as a doubleword or word (depending on the
+        * default interrupt, trap or task gate size).
+        */
+       if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
+               bytes = 4;
+       else
+               bytes = 2;
+
+       /*
+        * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
+        * stack-segment descriptor determines the size of the stack
+        * pointer outside of 64-bit mode.
+        */
+       if (SEG_DESC_DEF32(seg_desc.access))
+               stacksize = 4;
+       else
+               stacksize = 2;
+
+       esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
+       esp -= bytes;
+
+       /*
+        * A push outside the stack-segment limit raises #SS with the
+        * stack selector as the error code.
+        */
+       if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
+           &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
+               sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
+               return (VMEXIT_RESTART);
+       }
+
+       if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
+               error = vm_inject_exception2(ctx, vcpu, IDT_AC, 1);
+               assert(error == 0);
+               return (VMEXIT_RESTART);
+       }
+
+       /*
+        * error == 1 means a fault was injected into the guest (restart
+        * so the guest can handle it); error == -1 is an unrecoverable
+        * translation failure.
+        */
+       error = vm_gla2gpa(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
+           iov, nitems(iov));
+       assert(error == 0 || error == 1 || error == -1);
+       if (error) {
+               return ((error == 1) ? VMEXIT_RESTART : VMEXIT_ABORT);
+       }
+
+       /* Commit the push: write the error code and update the guest RSP. */
+       vm_copyout(ctx, vcpu, &errcode, iov, bytes);
+       SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
+       return (0);
+}
+
+int
+vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+       struct seg_desc nt;
+       struct tss32 oldtss, newtss;
+       struct vm_task_switch *task_switch;
+       struct vm_guest_paging *paging, sup_paging;
+       struct user_segment_descriptor nt_desc, ot_desc;
+       struct iovec nt_iov[2], ot_iov[2];
+       uint64_t cr0, ot_base;
+       uint32_t eip, ot_lim, access;
+       int error, ext, minlimit, nt_type, ot_type, vcpu;
+       enum task_switch_reason reason;
+       uint16_t nt_sel, ot_sel;
+
+       task_switch = &vmexit->u.task_switch;
+       nt_sel = task_switch->tsssel;
+       ext = vmexit->u.task_switch.ext;
+       reason = vmexit->u.task_switch.reason;
+       paging = &vmexit->u.task_switch.paging;
+       vcpu = *pvcpu;
+
+       assert(paging->cpu_mode == CPU_MODE_PROTECTED);
+
+       /*
+        * Section 4.6, "Access Rights" in Intel SDM Vol 3.
+        * The following page table accesses are implicitly supervisor mode:
+        * - accesses to GDT or LDT to load segment descriptors
+        * - accesses to the task state segment during task switch
+        */
+       sup_paging = *paging;
+       sup_paging.cpl = 0;     /* implicit supervisor mode */
+
+       /* Fetch the new TSS descriptor */
+       error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc);
+       if (error)
+               return (error);
+
+       nt = usd_to_seg_desc(&nt_desc);
+
+       /* Verify the type of the new TSS */
+       nt_type = SEG_DESC_TYPE(nt.access);
+       if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
+           nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
+               sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
+               return (VMEXIT_RESTART);
+       }
+
+       /* TSS descriptor must have present bit set */
+       if (!SEG_DESC_PRESENT(nt.access)) {
+               sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
+               return (VMEXIT_RESTART);
+       }
+
+       /*
+        * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
+        * 44 bytes for a 16-bit TSS.
+        */
+       if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscribe@freebsd.org"

Reply via email to