Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]> --- drivers/kvm/Makefile | 2 drivers/kvm/kernint.c | 149 +++++ drivers/kvm/kvm.h | 35 + drivers/kvm/kvm_main.c | 198 ++++++- drivers/kvm/lapic.c | 1418 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/kvm/svm.c | 13 drivers/kvm/userint.c | 8 drivers/kvm/vmx.c | 16 - include/linux/kvm.h | 15 + 9 files changed, 1809 insertions(+), 45 deletions(-)
diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile index 540afbc..1aad737 100644 --- a/drivers/kvm/Makefile +++ b/drivers/kvm/Makefile @@ -2,7 +2,7 @@ # Makefile for Kernel-based Virtual Machine module # -kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o +kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o lapic.o kernint.o obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/drivers/kvm/kernint.c b/drivers/kvm/kernint.c
new file mode 100644
index 0000000..b5cbcae
--- /dev/null
+++ b/drivers/kvm/kernint.c
@@ -0,0 +1,202 @@
+/*
+ * Kernel Interrupt IRQ device
+ *
+ * Provides a model for connecting in-kernel interrupt resources to a VCPU.
+ *
+ * A typical modern x86 processor has the concept of an internal Local-APIC
+ * and some external signal pins. The way in which interrupts are injected is
+ * dependent on whether software enables the LAPIC or not. When enabled,
+ * interrupts are acknowledged through the LAPIC. Otherwise they are through
+ * an externally connected PIC (typically an i8259 on the BSP)
+ *
+ * Copyright (C) 2007 Novell
+ *
+ * Authors:
+ *   Gregory Haskins <[EMAIL PROTECTED]>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "kvm.h"
+
+/*
+ * Per-VCPU state of the in-kernel interrupt model.
+ *
+ * self_irq is the VCPU's own irqdevice, ext_irq is the VM-wide ISA irqdevice
+ * (only set on VCPU slot 0) and apic_irq is the irqdevice given to the LAPIC.
+ */
+struct kvm_kernint {
+	struct kvm_vcpu *vcpu;
+	struct kvm_irqdevice *self_irq;
+	struct kvm_irqdevice *ext_irq;
+	struct kvm_irqdevice apic_irq;
+
+};
+
+/*
+ * Pick the device that acknowledges interrupts for this VCPU: the LAPIC
+ * irqdevice when kvm_lapic_enabled() is true, the external one otherwise.
+ *
+ * Returns NULL, after calling kvm_crash_guest(), when the selected device
+ * is missing (LAPIC disabled on a VCPU without ext_irq).
+ */
+static struct kvm_irqdevice *get_irq_dev(struct kvm_kernint *s)
+{
+	struct kvm_irqdevice *dev;
+
+	if (kvm_lapic_enabled(s->vcpu))
+		dev = &s->apic_irq;
+	else
+		dev = s->ext_irq;
+
+	if (!dev)
+		kvm_crash_guest(s->vcpu->kvm);
+
+	return dev;
+}
+
+/*
+ * irqdevice ->ack hook: forward the acknowledge to the device chosen by
+ * get_irq_dev().  Returns -ENODEV when there is no such device.
+ */
+static int kernint_irqdev_ack(struct kvm_irqdevice *this, int flags,
+			      struct kvm_irqack_data *data)
+{
+	struct kvm_kernint *s = (struct kvm_kernint*)this->private;
+	struct kvm_irqdevice *dev = get_irq_dev(s);
+
+	/* Never hand a NULL device to kvm_irqdevice_ack() */
+	if (!dev)
+		return -ENODEV;
+
+	return kvm_irqdevice_ack(dev, flags, data);
+}
+
+/* irqdevice ->set_pin hook: a deliberate no-op that always returns 0 */
+static int kernint_irqdev_set_pin(struct kvm_irqdevice *this,
+				  int irq, int level)
+{
+	/* no-op */
+	return 0;
+}
+
+/*
+ * irqdevice ->destructor hook: destroy the LAPIC irqdevice and the LAPIC
+ * itself, then free the kvm_kernint allocated by kvm_kernint_init().
+ */
+static void kernint_irqdev_destructor(struct kvm_irqdevice *this)
+{
+	struct kvm_kernint *s = (struct kvm_kernint*)this->private;
+
+	kvm_irqdevice_destructor(&s->apic_irq);
+	kvm_lapic_destroy(s->vcpu);
+	kfree(s);
+}
+
+/* Sink registered on the LAPIC irqdevice (apic_irq) */
+static void kvm_apic_intr(struct kvm_irqsink *this,
+			  struct kvm_irqdevice *dev,
+			  kvm_irqpin_t pin)
+{
+	struct kvm_kernint *s = (struct kvm_kernint*)this->private;
+
+	/*
+	 * If the LAPIC sent us an interrupt it *must* be enabled,
+	 * just forward it on to the CPU
+	 */
+	kvm_irqdevice_set_intr(s->self_irq, pin);
+}
+
+/* Sink registered on the external irqdevice (ext_irq) */
+static void kvm_ext_intr(struct kvm_irqsink *this,
+			 struct kvm_irqdevice *dev,
+			 kvm_irqpin_t pin)
+{
+	struct kvm_kernint *s = (struct kvm_kernint*)this->private;
+
+	/*
+	 * If the EXTINT device sent us an interrupt, forward it to the LINT0
+	 * pin of the LAPIC
+	 */
+	if (pin != kvm_irqpin_localint)
+		return;
+
+	/*
+	 * "irq 0" = LINT0, 1 = LINT1
+	 */
+	kvm_irqdevice_set_pin(&s->apic_irq, 0, 1);
+}
+
+/*
+ * Install the in-kernel interrupt model on @vcpu.
+ *
+ * Points the VCPU's irqdevice at a freshly allocated kvm_kernint, attaches
+ * the VM-wide ISA irqdevice as EXTINT source when this is VCPU slot 0, and
+ * initializes the LAPIC behind apic_irq.
+ *
+ * Returns 0 on success, -ENOMEM if the state cannot be allocated, or the
+ * negative value returned by kvm_lapic_init().
+ *
+ * NOTE(review): both sinks are stack locals, so kvm_irqdevice_register_sink()
+ * presumably copies them - confirm against its definition.
+ */
+int kvm_kernint_init(struct kvm_vcpu *vcpu)
+{
+	struct kvm_irqdevice *irqdev = &vcpu->irq.dev;
+	struct kvm_kernint *s;
+	/* Zero every field we do not set, exactly like extsink below */
+	struct kvm_irqsink apicsink = {
+		.set_intr = kvm_apic_intr,
+	};
+	int r;
+
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	s->vcpu = vcpu;
+
+	/*
+	 * Configure the irqdevice interface
+	 */
+	irqdev->ack = kernint_irqdev_ack;
+	irqdev->set_pin = kernint_irqdev_set_pin;
+	irqdev->destructor = kernint_irqdev_destructor;
+
+	irqdev->private = s;
+	s->self_irq = irqdev;
+
+	/*
+	 * Configure the EXTINT device if this is the BSP processor
+	 */
+	if (!vcpu_slot(vcpu)) {
+		struct kvm_irqsink extsink = {
+			.set_intr = kvm_ext_intr,
+			.private = s
+		};
+		s->ext_irq = &vcpu->kvm->isa_irq;
+		kvm_irqdevice_register_sink(s->ext_irq, &extsink);
+	}
+
+	/*
+	 * Configure the LAPIC device
+	 */
+	apicsink.private = s;
+
+	kvm_irqdevice_init(&s->apic_irq);
+	kvm_irqdevice_register_sink(&s->apic_irq, &apicsink);
+
+	/*
+	 * Report a failed LAPIC setup instead of claiming success.  s is not
+	 * freed here: it is already published through irqdev->private and
+	 * kernint_irqdev_destructor() is what releases it.
+	 */
+	r = kvm_lapic_init(vcpu, &s->apic_irq, 0);
+	if (r < 0)
+		return r;
+
+	return 0;
+}
+
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index f84950c..1f30274 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -167,6 +167,21 @@ int kvm_user_irqdev_init(struct kvm_irqdevice *dev); int kvm_user_irqdev_save(struct kvm_irqdevice *this, void *data); int kvm_user_irqdev_restore(struct kvm_irqdevice *this, void *data); int kvm_userint_init(struct kvm_vcpu *vcpu); +int kvm_kernint_init(struct kvm_vcpu *vcpu); + +#define KVM_LAPIC_OPTION_USERMODE (1 << 0) + +int kvm_lapic_init(struct kvm_vcpu *vcpu, struct kvm_irqdevice *dev, + int flags); +void kvm_lapic_destroy(struct kvm_vcpu *vcpu); +void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, u64 cr8); +u64 kvm_lapic_get_tpr(struct kvm_vcpu *vcpu); +void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 base); +u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); +void kvm_lapic_save(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +void kvm_lapic_restore(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +void kvm_lapic_reset(struct kvm_vcpu *vcpu); +int kvm_lapic_enabled(struct kvm_vcpu *vcpu); /* * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level @@ -337,6 +352,11 @@ struct kvm_vcpu_irq { int guest_cpu; }; +struct kvm_lapic { + void *dev; + struct kvm_io_device 
*mmio; +}; + struct kvm_vcpu { struct kvm *kvm; union { @@ -350,6 +370,7 @@ struct kvm_vcpu { struct kvm_run *run; int interrupt_window_open; struct kvm_vcpu_irq irq; + struct kvm_lapic apic; unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ unsigned long rip; /* needs vcpu_load_rsp_rip() */ @@ -360,10 +381,8 @@ struct kvm_vcpu { struct page *para_state_page; gpa_t hypercall_gpa; unsigned long cr4; - unsigned long cr8; u64 pdptrs[4]; /* pae */ u64 shadow_efer; - u64 apic_base; u64 ia32_misc_enable_msr; int nmsrs; int save_nmsrs; @@ -532,6 +551,8 @@ struct kvm { struct list_head vm_list; struct file *filp; struct kvm_io_bus mmio_bus; + int enable_kernel_pic; + struct kvm_irqdevice isa_irq; }; struct descriptor_table { @@ -606,6 +627,9 @@ void kvm_exit_arch(void); int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); +int kvm_apicbus_send(struct kvm *kvm, int dest, int trig_mode, int level, + int dest_mode, int delivery_mode, int vector); + void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_setup(struct kvm_vcpu *vcpu); @@ -739,6 +763,13 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) return (struct kvm_mmu_page *)page_private(page); } +static inline int vcpu_slot(struct kvm_vcpu *vcpu) +{ + return vcpu - vcpu->kvm->vcpus; +} + +void kvm_crash_guest(struct kvm *kvm); + static inline u16 read_fs(void) { u16 seg; diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 2957023..2a1b376 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -319,6 +319,7 @@ static struct kvm *kvm_create_vm(void) spin_lock_init(&kvm->lock); INIT_LIST_HEAD(&kvm->active_mmu_pages); kvm_io_bus_init(&kvm->mmio_bus); + kvm_irqdevice_init(&kvm->isa_irq); for (i = 0; i < KVM_MAX_VCPUS; ++i) { struct kvm_vcpu *vcpu = &kvm->vcpus[i]; @@ -411,6 +412,23 @@ static void kvm_free_vcpus(struct kvm *kvm) kvm_free_vcpu(&kvm->vcpus[i]); } +/* + * The function kills a guest while there 
still is a user space processes + * with a descriptor to it + */ +void kvm_crash_guest(struct kvm *kvm) +{ + unsigned int i; + + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + /* + * FIXME: in the future it should send IPI to gracefully + * stop the other vCPUs + */ + kvm_free_vcpu(&kvm->vcpus[i]); + } +} + static int kvm_dev_release(struct inode *inode, struct file *filp) { return 0; @@ -422,6 +440,8 @@ static void kvm_destroy_vm(struct kvm *kvm) list_del(&kvm->vm_list); spin_unlock(&kvm_lock); kvm_io_bus_destroy(&kvm->mmio_bus); + if (kvm->enable_kernel_pic) + kvm_irqdevice_destructor(&kvm->isa_irq); kvm_free_vcpus(kvm); kvm_free_physmem(kvm); kfree(kvm); @@ -627,7 +647,7 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) inject_gp(vcpu); return; } - vcpu->cr8 = cr8; + kvm_lapic_set_tpr(vcpu, cr8); } EXPORT_SYMBOL_GPL(set_cr8); @@ -928,6 +948,69 @@ out: return r; } +static int kvm_vm_ioctl_enable_kernel_pic(struct kvm *kvm, __u32 val) +{ + /* + * FIXME: We should not allow this if VCPUs have already been created + */ + if (kvm->enable_kernel_pic) + return -EINVAL; + + /* + * Someday we may offer two levels of in-kernel PIC support: + * + * level 0 = (default) compatiblity mode (everything in userspace) + * level 1 = LAPIC in kernel, IOAPIC/i8259 in userspace + * level 2 = All three in kernel + * + * For now we only support level 0 and 1. However, you cant set + * level 0 + */ + if (val != 1) + return -EINVAL; + + kvm->enable_kernel_pic = val; + + printk(KERN_INFO "KVM: Setting in-kernel PIC level to %d\n", val); + + /* + * installing a user_irqdev model to the kvm->isa_irq device + * creates a level-1 environment, where the userspace completely + * controls the ISA domain interrupts in the IOAPIC/i8259. 
+ * Interrupts come down to the VCPU either as an ISA vector to + * this controller, or as an APIC bus message (or both) + */ + kvm_user_irqdev_init(&kvm->isa_irq); + + return 0; +} + +static int kvm_vm_ioctl_isa_interrupt(struct kvm *kvm, + struct kvm_interrupt *irq) +{ + if (irq->irq < 0 || irq->irq >= 256) + return -EINVAL; + + if (!kvm->enable_kernel_pic) + return -EINVAL; + + return kvm_irqdevice_set_pin(&kvm->isa_irq, irq->irq, 1); +} + +static int kvm_vm_ioctl_apic_msg(struct kvm *kvm, + struct kvm_apic_msg *msg) +{ + if (!kvm->enable_kernel_pic) + return -EINVAL; + + msg->delivery_mode = (msg->delivery_mode << 8) & 0xF00; + + kvm_apicbus_send(kvm, msg->dest, msg->trig_mode, 1, msg->dest_mode, + msg->delivery_mode, msg->vector); + + return 0; +} + static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) { int i; @@ -1048,10 +1131,16 @@ static int emulator_write_std(unsigned long addr, static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, gpa_t addr) { + struct kvm_io_device *dev = vcpu->apic.mmio; + + /* + * First check if the LAPIC will snarf this request + */ + if (dev && dev->in_range(dev, addr)) + return dev; + /* - * Note that its important to have this wrapper function because - * in the very near future we will be checking for MMIOs against - * the LAPIC as well as the general MMIO bus + * And then fallback to allow any device to participate */ return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); } @@ -1518,7 +1607,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = 3; break; case MSR_IA32_APICBASE: - data = vcpu->apic_base; + data = kvm_lapic_get_base(vcpu); break; case MSR_IA32_MISC_ENABLE: data = vcpu->ia32_misc_enable_msr; @@ -1596,7 +1685,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case 0x200 ... 
0x2ff: /* MTRRs */ break; case MSR_IA32_APICBASE: - vcpu->apic_base = data; + kvm_lapic_set_base(vcpu, data); break; case MSR_IA32_MISC_ENABLE: vcpu->ia32_misc_enable_msr = data; @@ -1860,8 +1949,9 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); - /* re-sync apic's tpr */ - vcpu->cr8 = kvm_run->cr8; + if (!vcpu->kvm->enable_kernel_pic) + /* re-sync apic's tpr if the APIC is in userspace */ + kvm_lapic_set_tpr(vcpu, kvm_run->cr8); if (vcpu->pio.cur_count) { r = complete_pio(vcpu); @@ -2010,11 +2100,12 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->cr2 = vcpu->cr2; sregs->cr3 = vcpu->cr3; sregs->cr4 = vcpu->cr4; - sregs->cr8 = vcpu->cr8; sregs->efer = vcpu->shadow_efer; - sregs->apic_base = vcpu->apic_base; - kvm_user_irqdev_save(&vcpu->irq.dev, &sregs->interrupt_bitmap); + kvm_lapic_save(vcpu, sregs); + + if (!vcpu->kvm->enable_kernel_pic) + kvm_user_irqdev_save(&vcpu->irq.dev, &sregs->interrupt_bitmap); vcpu_put(vcpu); @@ -2046,14 +2137,10 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= vcpu->cr3 != sregs->cr3; vcpu->cr3 = sregs->cr3; - vcpu->cr8 = sregs->cr8; - mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; #ifdef CONFIG_X86_64 kvm_arch_ops->set_efer(vcpu, sregs->efer); #endif - vcpu->apic_base = sregs->apic_base; - kvm_arch_ops->decache_cr4_guest_bits(vcpu); mmu_reset_needed |= vcpu->cr0 != sregs->cr0; @@ -2067,8 +2154,11 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); - kvm_user_irqdev_restore(&vcpu->irq.dev, - &sregs->interrupt_bitmap[0]); + kvm_lapic_restore(vcpu, sregs); + + if (!vcpu->kvm->enable_kernel_pic) + kvm_user_irqdev_restore(&vcpu->irq.dev, + &sregs->interrupt_bitmap[0]); set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); @@ -2457,7 +2547,12 @@ static int 
kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) kvm_irqdevice_init(&vcpu->irq.dev); kvm_vcpu_irqsink_init(vcpu); - r = kvm_userint_init(vcpu); + + if (kvm->enable_kernel_pic) + r = kvm_kernint_init(vcpu); + else + r = kvm_userint_init(vcpu); + if (r < 0) goto out_free_vcpus; @@ -2601,6 +2696,12 @@ static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } +static int kvm_vcpu_ioctl_apic_reset(struct kvm_vcpu *vcpu) +{ + kvm_lapic_reset(vcpu); + return 0; +} + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2770,6 +2871,13 @@ static long kvm_vcpu_ioctl(struct file *filp, r = 0; break; } + case KVM_APIC_RESET: { + r = kvm_vcpu_ioctl_apic_reset(vcpu); + if (r) + goto out; + r = 0; + break; + } default: ; } @@ -2823,6 +2931,41 @@ static long kvm_vm_ioctl(struct file *filp, goto out; break; } + case KVM_ENABLE_KERNEL_PIC: { + __u32 val; + + r = -EFAULT; + if (copy_from_user(&val, argp, sizeof val)) + goto out; + r = kvm_vm_ioctl_enable_kernel_pic(kvm, val); + if (r) + goto out; + break; + } + case KVM_ISA_INTERRUPT: { + struct kvm_interrupt irq; + + r = -EFAULT; + if (copy_from_user(&irq, argp, sizeof irq)) + goto out; + r = kvm_vm_ioctl_isa_interrupt(kvm, &irq); + if (r) + goto out; + r = 0; + break; + } + case KVM_APIC_MSG: { + struct kvm_apic_msg msg; + + r = -EFAULT; + if (copy_from_user(&msg, argp, sizeof msg)) + goto out; + r = kvm_vm_ioctl_apic_msg(kvm, &msg); + if (r) + goto out; + r = 0; + break; + } default: ; } @@ -2954,12 +3097,21 @@ static long kvm_dev_ioctl(struct file *filp, r = 0; break; } - case KVM_CHECK_EXTENSION: - /* - * No extensions defined at present. 
- */ - r = 0; + case KVM_CHECK_EXTENSION: { + int ext = (long)argp; + + switch (ext) { + case KVM_ISA_INTERRUPT: + case KVM_APIC_MSG: + case KVM_APIC_RESET: + r = 1; + break; + default: + r = 0; + break; + } break; + } case KVM_GET_VCPU_MMAP_SIZE: r = -EINVAL; if (arg) diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c new file mode 100644 index 0000000..9f0ab7e --- /dev/null +++ b/drivers/kvm/lapic.c @@ -0,0 +1,1418 @@ +/* + * Local APIC virtualization + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2007 Novell + * + * Authors: + * Dor Laor <[EMAIL PROTECTED]> + * Gregory Haskins <[EMAIL PROTECTED]> + * + * Based on Xen 3.0 code, Copyright (c) 2004, Intel Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include "kvm.h" +#include <linux/kvm.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/smp.h> +#include <linux/hrtimer.h> +#include <asm/processor.h> +#include <asm/io.h> +#include <asm/msr.h> +#include <asm/page.h> +#include <asm/current.h> + +/*XXX remove this definition after GFW enabled */ +#define APIC_NO_BIOS + +#define PRId64 "d" +#define PRIx64 "llx" +#define PRIu64 "u" +#define PRIo64 "o" + +#define APIC_BUS_CYCLE_NS 1 + +/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ +#define apic_debug(fmt,arg...) 
+
+/*
+ * State of one in-kernel local APIC.  A VCPU reaches it through
+ * vcpu->apic.dev.
+ *
+ * lock is taken with spin_lock_bh() around register accesses.  regs is the
+ * base of the register block; registers are read and written as u32 values
+ * at byte offsets from it.
+ */
+struct kvm_kern_apic {
+	spinlock_t lock;
+	atomic_t ref_count;
+	int usermode;
+	u32 status;
+	u32 vcpu_id;
+	u64 base_msr;
+	unsigned long base_address;
+	struct kvm_io_device mmio_dev;
+	struct {
+		unsigned long pending;
+		u32 divide_count;
+		ktime_t last_update;
+		struct hrtimer dev;
+	} timer;
+	u32 err_status;
+	u32 err_write_count;
+	struct kvm_vcpu *vcpu;
+	struct kvm_irqdevice *irq_dev;
+	struct page *regs_page;
+	void *regs;
+};
+
+/*
+ * Return the index of the most significant set bit in the nr_bits wide
+ * bitmap at @data, or 0 when no bit is set.
+ *
+ * The users of this helper treat 0 as "nothing found", so an empty bitmap
+ * is reported as 0 instead of feeding a zero word to a bit-scan primitive
+ * whose result is undefined for 0.
+ */
+static __inline__ int find_highest_bit(unsigned long *data, int nr_bits)
+{
+	int length = BITS_TO_LONGS(nr_bits);
+
+	/* Walk down from the top word to the first non-zero one */
+	while (length && !data[--length])
+		continue;
+
+	if (!data[length])
+		return 0;
+
+	/*
+	 * The *highest* bit of that word is wanted, so scan from the top
+	 * (fls64() is 1-based, hence the -1).  Scanning from the bottom
+	 * would return the wrong bit whenever two bits share a word.
+	 */
+	return fls64(data[length]) - 1 + (length * BITS_PER_LONG);
+}
+
+#define APIC_LVT_NUM 6
+/* 14 is the version for Xeon and Pentium 8.4.8*/
+#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16))
+#define VLOCAL_APIC_MEM_LENGTH (1 << 12)
+/*
+ * The following defines are not in apicdef.h.
+ *
+ * The names with a leading underscore are bit numbers within ->status (they
+ * are used with test_bit()/set_bit()/clear_bit()); the *_MASK names next to
+ * them are the corresponding masks.
+ */
+#define APIC_SHORT_MASK 0xc0000
+#define APIC_DEST_NOSHORT 0x0
+#define APIC_DEST_MASK 0x800
+#define _APIC_GLOB_DISABLE 0x0
+#define APIC_GLOB_DISABLE_MASK 0x1
+#define APIC_SOFTWARE_DISABLE_MASK 0x2
+#define _APIC_BSP_ACCEPT_PIC 0x3
+#define MAX_APIC_INT_VECTOR 256
+
+/*
+ * NOTE: this expands with its own trailing ';', so it must not be used as
+ * the unbraced body of an if that has an else.
+ */
+#define inject_gp(vcpu) kvm_arch_ops->inject_gp(vcpu, 0);
+
+#define apic_enabled(apic) \
+	(!((apic)->status & \
+	   (APIC_GLOB_DISABLE_MASK | APIC_SOFTWARE_DISABLE_MASK)))
+
+#define apic_global_enabled(apic) \
+	(!(test_bit(_APIC_GLOB_DISABLE, &(apic)->status)))
+
+/* Parenthesized so the masks stay intact next to '&' or '~' at the use site */
+#define LVT_MASK \
+	(APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
+
+#define LINT_MASK \
+	(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY |\
+	 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+
+#define KVM_APIC_ID(apic) \
+	(GET_APIC_ID(apic_get_reg(apic, APIC_ID)))
+
+#define apic_lvt_enabled(apic, lvt_type) \
+	(!(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED))
+
+#define apic_lvt_vector(apic, lvt_type) \
+	(apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK)
+
+#define apic_lvt_dm(apic, lvt_type) \
+	(apic_get_reg(apic, lvt_type) & APIC_MODE_MASK)
+
+#define apic_lvtt_period(apic) \
+	(apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC)
+
+/* Read the 32-bit register at byte offset @reg of the register block */
+static inline u32 apic_get_reg(struct kvm_kern_apic *apic, u32 reg)
+{
+	return *((u32 *)(apic->regs + reg));
+}
+
+/* Store @val into the 32-bit register at byte offset @reg */
+static inline void apic_set_reg(struct kvm_kern_apic *apic,
+				u32 reg, u32 val)
+{
+	*((u32 *)(apic->regs + reg)) = val;
+}
+
+/* Writable bits of each LVT entry, indexed by (offset - APIC_LVTT) >> 4 */
+static unsigned int apic_lvt_mask[APIC_LVT_NUM] =
+{
+	LVT_MASK | APIC_LVT_TIMER_PERIODIC,	/* LVTT */
+	LVT_MASK | APIC_MODE_MASK,		/* LVTTHMR */
+	LVT_MASK | APIC_MODE_MASK,		/* LVTPC */
+	LINT_MASK, LINT_MASK,			/* LVT0-1 */
+	LVT_MASK				/* LVTERR */
+};
+
+/*
+ * Wrapped in do { } while (0) so that ASSERT(x); is a single statement and
+ * cannot capture the else of a surrounding if.
+ */
+#define ASSERT(x) \
+	do { \
+		if (!(x)) { \
+			printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
+			       __FILE__, __LINE__, #x); \
+			BUG(); \
+		} \
+	} while (0)
+
+/*
+ * Highest vector pending in the IRR, as reported by find_highest_bit().
+ * Asserts that the result is either 0 or at least 16.
+ */
+static int apic_find_highest_irr(struct kvm_kern_apic *apic)
+{
+	int result;
+
+	result = find_highest_bit((unsigned long *)(apic->regs + APIC_IRR),
+				  MAX_APIC_INT_VECTOR);
+
+	ASSERT( result == 0 || result >= 16);
+
+	return result;
+}
+
+
+/*
+ * Highest vector marked in the ISR, as reported by find_highest_bit().
+ * Asserts that the result is either 0 or at least 16.
+ */
+static int apic_find_highest_isr(struct kvm_kern_apic *apic)
+{
+	int result;
+
+	result = find_highest_bit((unsigned long *)(apic->regs + APIC_ISR),
+				  MAX_APIC_INT_VECTOR);
+
+	ASSERT( result == 0 || result >= 16);
+
+	return result;
+}
+
+/*
+ * Drop one reference.  The last reference cancels the timer, releases the
+ * register page and frees the structure.
+ *
+ * NOTE(review): hrtimer_cancel() is called with apic->lock held; confirm
+ * that the timer callback never takes the same lock.
+ */
+static void apic_dropref(struct kvm_kern_apic *apic)
+{
+	if (atomic_dec_and_test(&apic->ref_count)) {
+
+		spin_lock_bh(&apic->lock);
+
+		hrtimer_cancel(&apic->timer.dev);
+
+		if (apic->regs_page) {
+			__free_page(apic->regs_page);
+			apic->regs_page = NULL;
+		}
+
+		spin_unlock_bh(&apic->lock);
+
+		kfree(apic);
+	}
+}
+
+#if 0
+static void apic_dump_state(struct kvm_kern_apic *apic)
+{
+	u64 *tmp;
+
+	printk(KERN_INFO "%s begin\n", __FUNCTION__);
+
+	printk(KERN_INFO "status = 0x%08x\n", apic->status);
+	printk(KERN_INFO "base_msr=0x%016llx, apicbase = 0x%08lx\n",
+	       apic->base_msr, apic->base_address);
+
+	tmp = (u64*)(apic->regs + APIC_IRR);
+	printk(KERN_INFO "IRR = 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n",
+	       tmp[3], tmp[2], tmp[1], tmp[0]);
+	tmp = (u64*)(apic->regs + APIC_ISR);
+	printk(KERN_INFO "ISR = 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n",
+	       tmp[3], tmp[2], tmp[1], tmp[0]);
+	tmp = (u64*)(apic->regs + APIC_TMR);
+	printk(KERN_INFO "TMR = 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n",
+	       tmp[3], tmp[2], tmp[1], tmp[0]);
+
+	printk(KERN_INFO "APIC_ID=0x%08x\n", apic_get_reg(apic, APIC_ID));
+	printk(KERN_INFO "APIC_TASKPRI=0x%08x\n",
+	       apic_get_reg(apic, APIC_TASKPRI) & 0xff);
+	printk(KERN_INFO "APIC_PROCPRI=0x%08x\n",
+	       apic_get_reg(apic, APIC_PROCPRI));
+
+	printk(KERN_INFO "APIC_DFR=0x%08x\n",
+	       apic_get_reg(apic, APIC_DFR) | 0x0FFFFFFF);
+	printk(KERN_INFO "APIC_LDR=0x%08x\n",
+	       apic_get_reg(apic, APIC_LDR) & APIC_LDR_MASK);
+	printk(KERN_INFO "APIC_SPIV=0x%08x\n",
+	       apic_get_reg(apic, APIC_SPIV) & 0x3ff);
+	printk(KERN_INFO "APIC_ESR=0x%08x\n",
+	       apic_get_reg(apic, APIC_ESR));
+	printk(KERN_INFO "APIC_ICR=0x%08x\n",
+	       apic_get_reg(apic, APIC_ICR) & ~(1 << 12));
+	printk(KERN_INFO "APIC_ICR2=0x%08x\n",
+	       apic_get_reg(apic, APIC_ICR2) & 0xff000000);
+
+	printk(KERN_INFO "APIC_LVTERR=0x%08x\n",
+	       apic_get_reg(apic, APIC_LVTERR));
+	printk(KERN_INFO "APIC_LVT1=0x%08x\n",
+	       apic_get_reg(apic, APIC_LVT1));
+	printk(KERN_INFO "APIC_LVT0=0x%08x\n",
+	       apic_get_reg(apic, APIC_LVT0));
+	printk(KERN_INFO "APIC_LVTPC=0x%08x\n",
+	       apic_get_reg(apic, APIC_LVTPC));
+	printk(KERN_INFO "APIC_LVTTHMR=0x%08x\n",
+	       apic_get_reg(apic, APIC_LVTTHMR));
+	printk(KERN_INFO "APIC_LVTT=0x%08x\n",
+	       apic_get_reg(apic, APIC_LVTT));
+
+	printk(KERN_INFO "APIC_TMICT=0x%08x\n",
+	       apic_get_reg(apic, APIC_TMICT));
+	printk(KERN_INFO "APIC_TDCR=0x%08x\n",
+	       apic_get_reg(apic, APIC_TDCR));
+
+	printk(KERN_INFO "%s end\n", __FUNCTION__);
+}
+#endif
+
+
+/*
+ * Recompute the processor priority register from TPR and the highest ISR
+ * vector and store it in APIC_PROCPRI.
+ *
+ * Returns 1 when the highest pending vector was masked by the old PPR and
+ * is above the new one, i.e. the caller should forward an interrupt;
+ * returns 0 otherwise.
+ */
+static int apic_update_ppr(struct kvm_kern_apic *apic)
+{
+	u32 tpr, isrv, ppr, orig_ppr;
+	int irq;
+	int masked = 0;
+	int forward = 0;
+
+	ppr = apic_get_reg(apic, APIC_PROCPRI);
+	orig_ppr = ppr;
+
+	/*
+	 * Before we change anything, see if the only pending vectors we have
+	 * are anything masked by PPR
+	 */
+	irq = apic_find_highest_irr(apic);
+	if (irq && ((irq & 0xf0) <= ppr))
+		masked = 1;
+
+	/*
+	 * Compute the PPR value based on the current settings of TPR/ISR
+	 */
+	tpr = apic_get_reg(apic, APIC_TASKPRI);
+	irq = apic_find_highest_isr(apic);
+	isrv = (irq >> 4) & 0xf;
+
+	if ((tpr >> 4) >= isrv)
+		ppr = tpr & 0xff;
+	else
+		ppr = isrv << 4; /* low 4 bits of PPR have to be cleared */
+
+	apic_set_reg(apic, APIC_PROCPRI, ppr);
+
+	if (masked) {
+		/*
+		 * If we get here it is because there were vectors that
+		 * were masked by PPR. Check again to see if anything is
+		 * now available
+		 */
+		irq = apic_find_highest_irr(apic);
+		if ((irq & 0xf0) > ppr)
+			forward = 1;
+	}
+
+	apic_debug("%s: ppr 0x%x (old) 0x%x (new), isr 0x%x, isrv 0x%x\n",
+		   __FUNCTION__, orig_ppr, ppr, irq, isrv);
+
+	return forward;
+}
+
+/*
+ * Store a new task priority and refresh PPR.  If that unmasks a pending
+ * vector, apic->lock is dropped while the interrupt is signalled and taken
+ * again afterwards.
+ */
+static void apic_set_tpr(struct kvm_kern_apic *apic, u32 tpr)
+{
+	int forward = 0;
+
+	apic_debug("new value = %x\n", tpr);
+
+	apic_set_reg(apic, APIC_TASKPRI, tpr);
+	forward = apic_update_ppr(apic);
+
+	if (forward) {
+		spin_unlock_bh(&apic->lock);
+		kvm_irqdevice_set_intr(apic->irq_dev, kvm_irqpin_localint);
+		spin_lock_bh(&apic->lock);
+	}
+}
+
+/*
+ * Decide whether @target is addressed by @dest.  Takes target->lock itself.
+ *
+ * Returns non-zero on a match and 0 otherwise.  A lowest-priority message
+ * with dest 0xff to a target whose DFR is not APIC_DFR_FLAT crashes the
+ * guest and is reported as no match.
+ */
+static int apic_match_dest(struct kvm_kern_apic *target,
+			   int dest,
+			   int dest_mode,
+			   int delivery_mode)
+{
+	int result = 0;
+
+	spin_lock_bh(&target->lock);
+
+	if (!dest_mode) /* Physical */
+		result = (GET_APIC_ID(apic_get_reg(target, APIC_ID)) == dest);
+	else { /* Logical */
+		u32 ldr = apic_get_reg(target, APIC_LDR);
+
+		/* Flat mode */
+		if (apic_get_reg(target, APIC_DFR) == APIC_DFR_FLAT)
+			result = GET_APIC_LOGICAL_ID(ldr) & dest;
+		else {
+			if ((delivery_mode == APIC_DM_LOWEST) &&
+			    (dest == 0xff)) {
+				printk(KERN_ALERT "Broadcast IPI " \
+				       "with lowest priority "
+				       "delivery mode\n");
+				spin_unlock_bh(&target->lock);
+				kvm_crash_guest(target->vcpu->kvm);
+				return 0;
+			}
+			if (GET_APIC_LOGICAL_ID(ldr) == (dest & 0xf))
+				result = (GET_APIC_LOGICAL_ID(ldr) >> 4) &
+					(dest >> 4);
+			else
+				result = 0;
+		}
+	}
+
+	spin_unlock_bh(&target->lock);
+
+	return result;
+}
+
+/*
+ * Add a pending IRQ into lapic.
+ * Return 1 if successfully added and 0 if discarded.
+ *
+ * The lock/unlock pairs below imply the caller holds apic->lock; it is
+ * dropped temporarily around calls that leave this file.
+ */
+static int __apic_accept_irq(struct kvm_kern_apic *apic,
+			     int delivery_mode,
+			     int vector,
+			     int level,
+			     int trig_mode)
+{
+	kvm_irqpin_t pin = kvm_irqpin_invalid;
+
+	switch (delivery_mode) {
+	case APIC_DM_FIXED:
+	case APIC_DM_LOWEST:
+		if (unlikely(!apic_enabled(apic)))
+			break;
+
+		if (test_and_set_bit(vector, apic->regs + APIC_IRR)
+		    && trig_mode) {
+			apic_debug("level trig mode repeatedly for vector " \
+				   "%d\n", vector);
+			break;
+		}
+
+		if (trig_mode) {
+			apic_debug("level trig mode for vector %d\n", vector);
+			set_bit(vector, apic->regs + APIC_TMR);
+		}
+
+		apic_debug("FIXED/LOWEST interrupt for vector %d\n", vector);
+		pin = kvm_irqpin_localint;
+		break;
+	case APIC_DM_REMRD:
+		printk(KERN_WARNING "%s: Ignore deliver mode %d\n",
+		       __FUNCTION__, delivery_mode);
+		break;
+	case APIC_DM_EXTINT:
+		apic_debug("EXTINT interrupt\n");
+		pin = kvm_irqpin_extint;
+		break;
+	case APIC_DM_SMI:
+		apic_debug("SMI interrupt\n");
+		pin = kvm_irqpin_smi;
+		break;
+	case APIC_DM_NMI:
+		apic_debug("NMI interrupt\n");
+		pin = kvm_irqpin_nmi;
+		break;
+	case APIC_DM_INIT:
+		apic_debug("INIT interrupt\n");
+		if (level) {
+			spin_unlock_bh(&apic->lock);
+			kvm_lapic_reset(apic->vcpu);
+			spin_lock_bh(&apic->lock);
+		}
+		break;
+	case APIC_DM_STARTUP: /* FIXME: currently no support for SMP */
+	default:
+		printk(KERN_ALERT "TODO: support interrupt type %x\n",
+		       delivery_mode);
+		spin_unlock_bh(&apic->lock);
+		kvm_crash_guest(apic->vcpu->kvm);
+		spin_lock_bh(&apic->lock);
+		break;
+	}
+
+	if (likely(pin != kvm_irqpin_invalid)) {
+		/*
+		 * temp release of the lock to transmit
+		 */
+		spin_unlock_bh(&apic->lock);
+		kvm_irqdevice_set_intr(apic->irq_dev, pin);
+		spin_lock_bh(&apic->lock);
+
+		return 1;
+	} else
+		return 0;
+}
+
+/* Locked wrapper: takes apic->lock around __apic_accept_irq() */
+static int apic_accept_irq(struct kvm_kern_apic *apic,
+			   int delivery_mode,
+			   int vector,
+			   int level,
+			   int trig_mode)
+{
+	int ret;
+
+	spin_lock_bh(&apic->lock);
+	ret = __apic_accept_irq(apic, delivery_mode, vector,
+				level, trig_mode);
+	spin_unlock_bh(&apic->lock);
+
+	return ret;
+}
+
+/*
+ * Handle an EOI: clear the highest ISR vector (and its TMR bit), refresh
+ * PPR and, if that unmasks a pending vector, signal it with apic->lock
+ * temporarily dropped.
+ */
+static void apic_set_eoi(struct kvm_kern_apic *apic)
+{
+	int vector = apic_find_highest_isr(apic);
+	int forward;
+
+	/*
+	 * Not every EOI write has a corresponding ISR bit; one example
+	 * is when the kernel checks the timer in setup_IO_APIC
+	 */
+	if (!vector)
+		return;
+
+	__clear_bit(vector, apic->regs + APIC_ISR);
+	forward = apic_update_ppr(apic);
+
+	__clear_bit(vector, apic->regs + APIC_TMR);
+
+	if (forward) {
+		spin_unlock_bh(&apic->lock);
+		kvm_irqdevice_set_intr(apic->irq_dev, kvm_irqpin_localint);
+		spin_lock_bh(&apic->lock);
+	}
+}
+
+/*
+ * Reject a fixed-mode vector below 16: record 0x40 in err_status, raise the
+ * LVTERR vector and return 0.  Returns 1 when the vector is acceptable.
+ */
+static int apic_check_vector(struct kvm_kern_apic *apic,u32 dm, u32 vector)
+{
+	if ((dm == APIC_DM_FIXED) && (vector < 16)) {
+		apic->err_status |= 0x40;
+		__apic_accept_irq(apic, APIC_DM_FIXED,
+				  apic_lvt_vector(apic, APIC_LVTERR), 0, 0);
+		apic_debug("%s: check failed "
+			   " dm %x vector %x\n", __FUNCTION__, dm, vector);
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * Route a message over the APIC bus of @kvm.
+ *
+ * Every VCPU that has a LAPIC and matches @dest/@dest_mode receives the
+ * message directly, except for lowest-priority delivery: matches are only
+ * recorded in lpr_map and the message is then handed to VCPU 0's LAPIC.
+ * Always returns 0.
+ *
+ * NOTE(review): lpr_map is filled in but never consulted, so a
+ * lowest-priority message reaches VCPU 0 even when it did not match.
+ */
+int kvm_apicbus_send(struct kvm *kvm, int dest, int trig_mode, int level,
+		     int dest_mode, int delivery_mode, int vector)
+{
+	int i;
+	/*
+	 * __set_bit() operates on unsigned long words; a u32 here made it
+	 * read and write past the variable on 64-bit hosts.
+	 * Assumes vcpu_id < BITS_PER_LONG - TODO confirm.
+	 */
+	unsigned long lpr_map = 0;
+
+	apic_debug("%s: %d %d %d %d %d %d\n", __FUNCTION__,
+		   dest, trig_mode, level, dest_mode, delivery_mode, vector);
+
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		struct kvm_kern_apic *target;
+		target = kvm->vcpus[i].apic.dev;
+
+		if (!target)
+			continue;
+
+		if (apic_match_dest(target, dest, dest_mode, delivery_mode)) {
+			if (delivery_mode == APIC_DM_LOWEST)
+				__set_bit(target->vcpu_id, &lpr_map);
+			else
+				apic_accept_irq(target, delivery_mode,
+						vector, level, trig_mode);
+		}
+	}
+
+	if (delivery_mode == APIC_DM_LOWEST) {
+		struct kvm_kern_apic *target;
+
+		/* Currently only UP is supported */
+		target = kvm->vcpus[0].apic.dev;
+
+		if (target)
+			apic_accept_irq(target, delivery_mode,
+					vector, level, trig_mode);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_apicbus_send);
+
+/*
+ * Send the IPI currently programmed into ICR/ICR2.
+ *
+ * Entered with apic->lock held.  The lock is dropped while the message is
+ * delivered and is taken again before returning - on every path, including
+ * the crash path for an unexpected shorthand.
+ */
+static void apic_send_ipi(struct kvm_kern_apic *apic)
+{
+	u32 icr_low = apic_get_reg(apic, APIC_ICR);
+	u32 icr_high = apic_get_reg(apic, APIC_ICR2);
+
+	unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
+	unsigned int short_hand = icr_low & APIC_SHORT_MASK;
+	unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
+	unsigned int level = icr_low & APIC_INT_ASSERT;
+	unsigned int dest_mode = icr_low & APIC_DEST_MASK;
+	unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
+	unsigned int vector = icr_low & APIC_VECTOR_MASK;
+
+	apic_debug("icr_high 0x%x, icr_low 0x%x, "
+		   "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
+		   "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
+		   icr_high, icr_low, short_hand, dest,
+		   trig_mode, level, dest_mode, delivery_mode, vector);
+
+	/*
+	 * We unlock here because we would enter this function in a lock
+	 * state and we dont want to remain this way while we transmit
+	 */
+	spin_unlock_bh(&apic->lock);
+
+	switch (short_hand) {
+	case APIC_DEST_NOSHORT:
+		/*
+		 * If no short-hand notation is in use, just forward the
+		 * message onto the apicbus and let the bus handle the routing.
+		 */
+		kvm_apicbus_send(apic->vcpu->kvm, dest, trig_mode, level,
+				 dest_mode, delivery_mode, vector);
+		break;
+	case APIC_DEST_SELF:
+		apic_accept_irq(apic, delivery_mode, vector, level, trig_mode);
+		break;
+	default: {
+		/*
+		 * Otherwise we need to consider the short-hand to find the
+		 * correct targets.
+		 */
+		unsigned int i;
+
+		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+			struct kvm_kern_apic *target;
+			int result = 0;
+
+			target = apic->vcpu->kvm->vcpus[i].apic.dev;
+
+			if (!target)
+				continue;
+
+			switch (short_hand) {
+			case APIC_DEST_ALLINC:
+				result = 1;
+				break;
+
+			case APIC_DEST_ALLBUT:
+				if (target != apic)
+					result = 1;
+				break;
+			default:
+				kvm_crash_guest(apic->vcpu->kvm);
+				/* Do not return with the lock dropped */
+				goto relock;
+			}
+
+			if (result)
+				apic_accept_irq(target, delivery_mode,
+						vector, level, trig_mode);
+		}
+	}
+	}
+
+relock:
+	/*
+	 * Relock before returning
+	 */
+	spin_lock_bh(&apic->lock);
+
+}
+
+/*
+ * Bring the timer current-count register up to date.
+ *
+ * Subtracts the bus cycles elapsed since timer.last_update from APIC_TMCCT,
+ * stores the result back together with the new timestamp and returns it.
+ * Once the count has run out, a one-shot timer reads 0 and a periodic timer
+ * restarts from APIC_TMICT.
+ */
+static u32 apic_get_tmcct(struct kvm_kern_apic *apic)
+{
+	u32 counter_passed;
+	ktime_t passed, now;
+	u32 tmcct;
+
+	/* Check the pointer before it is dereferenced, not after */
+	ASSERT(apic != NULL);
+
+	now = apic->timer.dev.base->get_time();
+	tmcct = apic_get_reg(apic, APIC_TMCCT);
+
+	if (unlikely(ktime_to_ns(now) <=
+		     ktime_to_ns(apic->timer.last_update))) {
+		/* Wrap around */
+		passed = ktime_add(
+			({ (ktime_t){
+				.tv64 = KTIME_MAX -
+				(apic->timer.last_update).tv64 };
+			}), now);
+		apic_debug("time elapsed\n");
+	} else
+		passed = ktime_sub(now, apic->timer.last_update);
+
+	counter_passed = ktime_to_ns(passed) /
+		(APIC_BUS_CYCLE_NS * apic->timer.divide_count);
+
+	/*
+	 * tmcct is unsigned: subtracting first and testing "<= 0" afterwards
+	 * can never see an expired counter, it just wraps to a huge value.
+	 * Compare before subtracting instead.
+	 */
+	if (counter_passed < tmcct) {
+		tmcct -= counter_passed;
+	} else if (unlikely(!apic_lvtt_period(apic))) {
+		/* One-shot: the counter stays at zero once it has expired */
+		tmcct = 0;
+	} else {
+		u32 period = apic_get_reg(apic, APIC_TMICT);
+
+		/*
+		 * Periodic: fold the overshoot back into (0, period].  A
+		 * zero initial count cannot make progress, so report 0
+		 * rather than spinning.
+		 */
+		if (period)
+			tmcct = period - ((counter_passed - tmcct) % period);
+		else
+			tmcct = 0;
+	}
+
+	apic->timer.last_update = now;
+	apic_set_reg(apic, APIC_TMCCT, tmcct);
+
+	return tmcct;
+}
+
+/*
+ *----------------------------------------------------------------------
+ * MMIO
+ *----------------------------------------------------------------------
+ */
+
+/* Round val down to a multiple of len; len must be a power of two */
+#define align(val, len) ((val) & ~((len) - 1))
+
+static int validate_mmio(struct kvm_kern_apic *apic, gpa_t address, int len)
+{
+	/*
+	 * According to IA 32 Manual, all registers should be accessed with
+	 * 32 bits alignment.
+ */ + if (align(address, 4) != align(address+(len-1), 4)) { + printk(KERN_WARNING "KVM: MMIO request for %d bytes at " \ + "0x%lx is not 32 bit aligned. Injecting #GP\n", + len, address); + inject_gp(apic->vcpu); + return 0; + } + + return 1; +} + +static u32 __apic_read(struct kvm_kern_apic *apic, + unsigned int offset) +{ + u32 val = 0; + + if (offset > APIC_TDCR) + return 0; + + switch (offset) { + case APIC_ARBPRI: + printk(KERN_WARNING "access local APIC ARBPRI register " \ + "which is for P6\n"); + break; + + case APIC_TMCCT: /* Timer CCR */ + val = apic_get_tmcct(apic); + break; + + case APIC_ESR: + apic->err_write_count = 0; + /* fall through */ + default: + val = apic_get_reg(apic, offset); + break; + } + + return val; +} + +static void apic_mmio_read(struct kvm_io_device *this, + gpa_t address, + int len, + void *data) +{ + struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private; + unsigned int offset = address - apic->base_address; + unsigned char alignment = offset & 0x3; + u32 val; + + if (!validate_mmio(apic, address, len)) + return; + + spin_lock_bh(&apic->lock); + val = __apic_read(apic, offset & ~0x3); + spin_unlock_bh(&apic->lock); + + switch (len) { + case 1: + case 2: + case 4: + memcpy(data, (char*)((char*)&val + alignment), len); + break; + default: + printk(KERN_ALERT "Local APIC read with len = %x, " \ + "should be 1,2, or 4 instead\n", len); + inject_gp(apic->vcpu); + break; + } +} + +static void apic_mmio_write(struct kvm_io_device *this, + gpa_t address, + int len, + const void *data) +{ + struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private; + unsigned int offset = address - apic->base_address; + unsigned char alignment = offset & 0x3; + u32 val; + + if (!validate_mmio(apic, address, len)) + return; + + spin_lock_bh(&apic->lock); + + switch (len) { + case 1: + case 2: { + unsigned int tmp; + + /* + * Some kernels will access with byte/word alignment + */ + apic_debug("Notice: Local APIC write with len = %x\n", 
len); + tmp = __apic_read(apic, offset & ~0x3); + switch (len) { + case 1: + val = *(u8*)data; + + val = (tmp & ~(0xff << (8*alignment))) | + ((val & 0xff) << (8*alignment)); + break; + + case 2: + if (alignment != 0x0 && alignment != 0x2) { + printk(KERN_ALERT "alignment error for apic " \ + "with len == 2\n"); + inject_gp(apic->vcpu); + } + + /* + * assumes 16 bit alignment on the pointer. + * Mis-alignment is a host-side issue, however, so + * we crash + */ + BUG_ON(((long)data & 0x1)); + + val = *(u16*)data; + + val = (tmp & ~(0xffff << (8*alignment))) | + ((val & 0xffff) << (8*alignment)); + break; + } + + break; + } + case 4: + memcpy(&val, data, 4); + break; + default: + printk(KERN_ALERT "Local APIC write with len = %x, " \ + "should be 1,2, or 4 instead\n", len); + inject_gp(apic->vcpu); + break; + } + + /* too common printing */ + if (offset != APIC_EOI) + apic_debug("%s: offset 0x%x with length 0x%x, and value is " \ + "0x%lx\n", + __FUNCTION__, offset, len, val); + + offset &= 0xff0; + + switch (offset) { + case APIC_ID: /* Local APIC ID */ + apic_set_reg(apic, APIC_ID, val); + break; + + case APIC_TASKPRI: + apic_set_tpr(apic, val & 0xff); + break; + + case APIC_EOI: + apic_set_eoi(apic); + break; + + case APIC_LDR: + apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); + break; + + case APIC_DFR: + apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); + break; + + case APIC_SPIV: + apic_set_reg(apic, APIC_SPIV, val & 0x3ff); + if (!(val & APIC_SPIV_APIC_ENABLED)) { + int i; + u32 lvt_val; + + apic->status |= APIC_SOFTWARE_DISABLE_MASK; + for (i = 0; i < APIC_LVT_NUM; i++) { + lvt_val = apic_get_reg(apic, + APIC_LVTT + + 0x10 * i); + apic_set_reg(apic, APIC_LVTT + 0x10 * i, + lvt_val | APIC_LVT_MASKED); + } + + if ((apic_get_reg(apic, APIC_LVT0) & + APIC_MODE_MASK) == APIC_DM_EXTINT) + clear_bit(_APIC_BSP_ACCEPT_PIC, &apic->status); + } else { + apic->status &= ~APIC_SOFTWARE_DISABLE_MASK; + if ((apic_get_reg(apic, APIC_LVT0) & + APIC_MODE_MASK) == 
APIC_DM_EXTINT) + set_bit(_APIC_BSP_ACCEPT_PIC, &apic->status); + } + break; + + case APIC_ESR: + apic->err_write_count = !apic->err_write_count; + if (!apic->err_write_count) + apic->err_status = 0; + break; + + case APIC_ICR: + /* No delay here, so we always clear the pending bit*/ + apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); + apic_send_ipi(apic); + break; + + case APIC_ICR2: + apic_set_reg(apic, APIC_ICR2, val & 0xff000000); + break; + + case APIC_LVTT: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTERR: + { + if (apic->status & APIC_SOFTWARE_DISABLE_MASK) + val |= APIC_LVT_MASKED; + + val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4]; + apic_set_reg(apic, offset, val); + + /* On hardware, when write vector less than 0x20 will error */ + if (!(val & APIC_LVT_MASKED)) + apic_check_vector(apic, apic_lvt_dm(apic, offset), + apic_lvt_vector(apic, offset)); + if (!apic->vcpu_id && (offset == APIC_LVT0)) { + if ((val & APIC_MODE_MASK) == APIC_DM_EXTINT) + if (val & APIC_LVT_MASKED) + clear_bit(_APIC_BSP_ACCEPT_PIC, + &apic->status); + else + set_bit(_APIC_BSP_ACCEPT_PIC, + &apic->status); + else + clear_bit(_APIC_BSP_ACCEPT_PIC, + &apic->status); + } + } + break; + + case APIC_TMICT: + { + ktime_t now = apic->timer.dev.base->get_time(); + u32 offset; + + apic_set_reg(apic, APIC_TMICT, val); + apic_set_reg(apic, APIC_TMCCT, val); + apic->timer.last_update = now; + offset = APIC_BUS_CYCLE_NS * apic->timer.divide_count * val; + + /* Make sure the lock ordering is coherent */ + spin_unlock_bh(&apic->lock); + hrtimer_cancel(&apic->timer.dev); + hrtimer_start(&apic->timer.dev, + ktime_add_ns(now, offset), + HRTIMER_MODE_ABS); + + apic_debug("%s: bus cycle is %"PRId64"ns, now 0x%016"PRIx64", " + "timer initial count 0x%x, offset 0x%x, " + "expire @ 0x%016"PRIx64".\n", __FUNCTION__, + APIC_BUS_CYCLE_NS, ktime_to_ns(now), + apic_get_reg(apic, APIC_TMICT), + offset, ktime_to_ns(ktime_add_ns(now, offset))); + } + return; + + case 
APIC_TDCR: + { + unsigned int tmp1, tmp2; + + tmp1 = val & 0xf; + tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; + apic->timer.divide_count = 0x1 << (tmp2 & 0x7); + + apic_set_reg(apic, APIC_TDCR, val); + + apic_debug("timer divide count is 0x%x\n", + apic->timer.divide_count); + } + break; + + default: + printk(KERN_WARNING "Local APIC Write to read-only register\n"); + break; + } + + spin_unlock_bh(&apic->lock); +} + +static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) +{ + struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private; + int ret = 0; + + spin_lock_bh(&apic->lock); + + if (apic_global_enabled(apic) && + (addr >= apic->base_address) && + (addr < (apic->base_address + VLOCAL_APIC_MEM_LENGTH))) + ret = 1; + + spin_unlock_bh(&apic->lock); + + return ret; +} + +static void apic_mmio_destructor(struct kvm_io_device *this) +{ + struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private; + + apic_dropref(apic); +} + +static void apic_mmio_register(struct kvm_kern_apic *apic) +{ + /* Register ourselves with the MMIO subsystem */ + struct kvm_io_device *dev = &apic->mmio_dev; + + dev->read = apic_mmio_read; + dev->write = apic_mmio_write; + dev->in_range = apic_mmio_range; + dev->destructor = apic_mmio_destructor; + + dev->private = apic; + atomic_inc(&apic->ref_count); + + apic->vcpu->apic.mmio = dev; +} + +/* + *---------------------------------------------------------------------- + * LAPIC interface + *---------------------------------------------------------------------- + */ + +void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, u64 cr8) +{ + struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev; + + spin_lock_bh(&apic->lock); + apic_set_tpr(apic, ((cr8 & 0x0f) << 4)); + spin_unlock_bh(&apic->lock); +} + +u64 kvm_lapic_get_tpr(struct kvm_vcpu *vcpu) +{ + struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev; + u64 tpr; + + spin_lock_bh(&apic->lock); + tpr = (u64)apic_get_reg(apic, APIC_TASKPRI); + 
spin_unlock_bh(&apic->lock); + + return (tpr & 0xf0) >> 4; +} +EXPORT_SYMBOL_GPL(kvm_lapic_get_tpr); + +void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) +{ + struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev; + + spin_lock_bh(&apic->lock); + if (apic->vcpu_id) + value &= ~MSR_IA32_APICBASE_BSP; + + apic->base_msr = value; + apic->base_address = apic->base_msr & MSR_IA32_APICBASE_BASE; + + /* with FSB delivery interrupt, we can restart APIC functionality */ + if (!(value & MSR_IA32_APICBASE_ENABLE)) + set_bit(_APIC_GLOB_DISABLE, &apic->status); + else + clear_bit(_APIC_GLOB_DISABLE, &apic->status); + + apic_debug("apic base msr is 0x%016"PRIx64", and base address is " \ + "0x%lx.\n", apic->base_msr, apic->base_address); + + spin_unlock_bh(&apic->lock); +} + +u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) +{ + struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev; + u64 base; + + spin_lock_bh(&apic->lock); + base = apic->base_msr; + spin_unlock_bh(&apic->lock); + + return base; +} +EXPORT_SYMBOL_GPL(kvm_lapic_get_base); + +void kvm_lapic_save(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + /* + * FIXME: This needs to support the entire register set when + * enabled + */ + sregs->cr8 = kvm_lapic_get_tpr(vcpu); + sregs->apic_base = kvm_lapic_get_base(vcpu); +} + +void kvm_lapic_restore(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + /* + * FIXME: This needs to support the entire register set when + * enabled + */ + kvm_lapic_set_tpr(vcpu, sregs->cr8); + kvm_lapic_set_base(vcpu, sregs->apic_base); +} + +void kvm_lapic_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_kern_apic *apic; + int i; + + apic_debug("%s\n", __FUNCTION__); + + ASSERT(vcpu); + apic = vcpu->apic.dev; + ASSERT(apic != NULL); + + /* Stop the timer in case it's a reset to an active apic */ + hrtimer_cancel(&apic->timer.dev); + + spin_lock_bh(&apic->lock); + + apic_set_reg(apic, APIC_ID, vcpu_slot(vcpu) << 24); + apic_set_reg(apic, APIC_LVR, APIC_VERSION); + + 
for (i = 0; i < APIC_LVT_NUM; i++) + apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); + + apic_set_reg(apic, APIC_DFR, 0xffffffffU); + apic_set_reg(apic, APIC_SPIV, 0xff); + apic_set_reg(apic, APIC_TASKPRI, 0); + apic_set_reg(apic, APIC_LDR, 0); + apic_set_reg(apic, APIC_ESR, 0); + apic_set_reg(apic, APIC_ICR, 0); + apic_set_reg(apic, APIC_ICR2, 0); + apic_set_reg(apic, APIC_TDCR, 0); + apic_set_reg(apic, APIC_TMICT, 0); + /* The timer was cancelled above; do not leave a stale current count */ + apic_set_reg(apic, APIC_TMCCT, 0); + memset((void*)(apic->regs + APIC_IRR), 0, KVM_IRQ_BITMAP_SIZE(u8)); + memset((void*)(apic->regs + APIC_ISR), 0, KVM_IRQ_BITMAP_SIZE(u8)); + memset((void*)(apic->regs + APIC_TMR), 0, KVM_IRQ_BITMAP_SIZE(u8)); + + apic->base_msr = + MSR_IA32_APICBASE_ENABLE | + APIC_DEFAULT_PHYS_BASE; + if (vcpu_slot(vcpu) == 0) + apic->base_msr |= MSR_IA32_APICBASE_BSP; + apic->base_address = apic->base_msr & MSR_IA32_APICBASE_BASE; + + /* + * TDCR is reset to 0 above, which the TDCR write handler decodes as + * divide-by-2; keep the cached divider consistent with it (it is + * used as a divisor and must never be 0) + */ + apic->timer.divide_count = 2; + apic->timer.pending = 0; + apic->status = 0; + +#ifdef APIC_NO_BIOS + /* + * XXX According to mp specification, BIOS will enable LVT0/1, + * remove it after BIOS enabled + */ + if (!vcpu_slot(vcpu)) { + apic_set_reg(apic, APIC_LVT0, APIC_MODE_EXTINT << 8); + apic_set_reg(apic, APIC_LVT1, APIC_MODE_NMI << 8); + set_bit(_APIC_BSP_ACCEPT_PIC, &apic->status); + } +#endif + + spin_unlock_bh(&apic->lock); + + printk(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" \ + "0x%016"PRIx64", base_address=0x%0lx.\n", __FUNCTION__, vcpu, + GET_APIC_ID(apic_get_reg(apic, APIC_ID)), + apic->base_msr, apic->base_address); +} + +int kvm_lapic_enabled(struct kvm_vcpu *vcpu) +{ + struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev; + int ret = 0; + + spin_lock_bh(&apic->lock); + if (!apic->usermode) + ret = apic_enabled(apic); + spin_unlock_bh(&apic->lock); + + return ret; +} + +/* + *---------------------------------------------------------------------- + * timer interface + *---------------------------------------------------------------------- + */ +static int __apic_timer_fn(struct kvm_kern_apic
*apic) +{ + u32 vector; + ktime_t now; + int result = 0; + + if (unlikely(!apic_enabled(apic) || + !apic_lvt_enabled(apic, APIC_LVTT))) { + apic_debug("%s: time interrupt although apic is down\n", + __FUNCTION__); + return 0; + } + + vector = apic_lvt_vector(apic, APIC_LVTT); + now = apic->timer.dev.base->get_time(); + apic->timer.last_update = now; + apic->timer.pending++; + + __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0); + + if (apic_lvtt_period(apic)) { + u64 period_ns; + u32 tmict = apic_get_reg(apic, APIC_TMICT); + + apic_set_reg(apic, APIC_TMCCT, tmict); + /* 64 bit math: the product overflows a u32 for large counts */ + period_ns = (u64)APIC_BUS_CYCLE_NS * apic->timer.divide_count * tmict; + + /* + * Only re-arm for a non-zero period; a zero period would make + * the timer expire again immediately, forever + */ + if (period_ns) { + result = 1; + apic->timer.dev.expires = ktime_add_ns(now, period_ns); + } + + apic_debug("%s: now 0x%016"PRIx64", expire @ 0x%016"PRIx64", " + "timer initial count 0x%x, timer current count 0x%x.\n", + __FUNCTION__, + ktime_to_ns(now), + ktime_to_ns(ktime_add_ns(now, period_ns)), + apic_get_reg(apic, APIC_TMICT), + apic_get_reg(apic, APIC_TMCCT)); + } else { + apic_set_reg(apic, APIC_TMCCT, 0); + apic_debug("%s: now 0x%016"PRIx64", " + "timer initial count 0x%x, timer current count 0x%x.\n", + __FUNCTION__, + ktime_to_ns(now), apic_get_reg(apic, APIC_TMICT), + apic_get_reg(apic, APIC_TMCCT)); + } + + return result; +} + +static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) +{ + struct kvm_kern_apic *apic; + int restart_timer = 0; + + apic = container_of(data, struct kvm_kern_apic, timer.dev); + + spin_lock_bh(&apic->lock); + restart_timer = __apic_timer_fn(apic); + spin_unlock_bh(&apic->lock); + + if (restart_timer) + return HRTIMER_RESTART; + else + return HRTIMER_NORESTART; +} + +/* + *---------------------------------------------------------------------- + * IRQDEVICE interface + *---------------------------------------------------------------------- + */ + +static int apic_irqdev_ack(struct kvm_irqdevice *this, int flags, + struct kvm_irqack_data *data) +{ + struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private; + int irq; + +
apic_debug("LAPIC ACK attempt\n"); + + spin_lock_bh(&apic->lock); + + if (!apic_enabled(apic)) + goto out; + + if (!(flags & KVM_IRQACK_FLAG_PEEK)) { + irq = apic_find_highest_irr(apic); + if ((irq & 0xf0) > apic_get_reg(apic, APIC_PROCPRI)) { + BUG_ON (irq < 0x10); + + __set_bit(irq, apic->regs + APIC_ISR); + __clear_bit(irq, apic->regs + APIC_IRR); + apic_update_ppr(apic); + + /* + * We have to special case the timer interrupt + * because we want the vector to stay pending + * for each tick of the clock, even for a backlog. + * Therefore, if this was a timer vector and we + * still have ticks pending, keep IRR set + */ + if (irq == apic_lvt_vector(apic, APIC_LVTT)) { + BUG_ON(!apic->timer.pending); + apic->timer.pending--; + if (apic->timer.pending) + __set_bit(irq, apic->regs + APIC_IRR); + } + + data->flags |= KVM_IRQACKDATA_VECTOR_VALID; + data->vector = irq; + } + else + data->vector = -1; + + apic_debug("ACK for vector %d\n", data->vector); + } + + /* + * See if there is anything still pending. Don't forget that we may + * have entered this function with PEEK just to check pending + * status. 
This is really the only way we could ever find something + * still eligible, since otherwise we would have just injected + * the highest priority vector above + */ + irq = apic_find_highest_irr(apic); + if ((irq & 0xf0) > apic_get_reg(apic, APIC_PROCPRI)) + data->flags |= KVM_IRQACKDATA_VECTOR_PENDING; + + out: + spin_unlock_bh(&apic->lock); + + return 0; +} + +static int apic_irqdev_set_pin(struct kvm_irqdevice *this, int irq, int level) +{ + struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private; + int lvt = 0; + + spin_lock_bh(&apic->lock); + + if (!apic_enabled(apic)) { + /* + * If the LAPIC is disabled, we simply forward the interrupt + * on to the output line + */ + __apic_accept_irq(apic, APIC_DM_EXTINT, 0, level, 1); + goto out; + } + + /* + * pin "0" is LINT0, and "1" is LINT1 + */ + BUG_ON(irq > 1); + + switch(irq) { + case 0: + lvt = APIC_LVT0; + break; + case 1: + lvt = APIC_LVT1; + break; + } + + if (apic_lvt_enabled(apic, lvt)) + __apic_accept_irq(apic, + apic_lvt_dm(apic, lvt), + apic_lvt_vector(apic, lvt), + level, + 1); + + + out: + spin_unlock_bh(&apic->lock); + + return 0; +} + +static void apic_irqdev_destructor(struct kvm_irqdevice *this) +{ + struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private; + + apic_dropref(apic); +} + +static void apic_irqdev_register(struct kvm_kern_apic *apic, + struct kvm_irqdevice *dev) +{ + dev->ack = apic_irqdev_ack; + dev->set_pin = apic_irqdev_set_pin; + dev->destructor = apic_irqdev_destructor; + + dev->private = apic; + atomic_inc(&apic->ref_count); + + apic->irq_dev = dev; +} + +int kvm_lapic_init(struct kvm_vcpu *vcpu, + struct kvm_irqdevice *irq_dev, int flags) +{ + struct kvm_kern_apic *apic = NULL; + struct kvm_io_device *mmio_dev = NULL; + + ASSERT(vcpu != NULL); + apic_debug("apic_init %d\n", vcpu_slot(vcpu)); + + apic = kzalloc(sizeof(*apic), GFP_KERNEL); + if (!apic) + goto nomem; + + spin_lock_init(&apic->lock); + atomic_inc(&apic->ref_count); + apic->vcpu_id = vcpu_slot(vcpu); 
+ + apic->regs_page = alloc_page(GFP_KERNEL); + if ( apic->regs_page == NULL ) { + printk(KERN_ALERT "malloc apic regs error for vcpu %x\n", + vcpu_slot(vcpu)); + goto nomem; + } + apic->regs = page_address(apic->regs_page); + memset(apic->regs, 0, PAGE_SIZE); + + apic->vcpu = vcpu; + vcpu->apic.dev = apic; + + if (!(flags & KVM_LAPIC_OPTION_USERMODE)) { + apic_irqdev_register(apic, irq_dev); + apic_mmio_register(apic); + } else + apic->usermode = 1; + + hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + apic->timer.dev.function = apic_timer_fn; + + kvm_lapic_reset(vcpu); + return 0; + + nomem: + if (mmio_dev) + kfree(mmio_dev); + + if (apic) + apic_dropref(apic); + + return -ENOMEM; +} + +void kvm_lapic_destroy(struct kvm_vcpu *vcpu) +{ + struct kvm_kern_apic *apic = vcpu->apic.dev; + + if (vcpu->apic.mmio) + kvm_iodevice_destructor(vcpu->apic.mmio); + + apic_dropref(apic); +} diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 1b9d633..ccc5856 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -570,9 +570,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) fx_init(vcpu); vcpu->fpu_active = 1; - vcpu->apic_base = 0xfee00000 | - /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | - MSR_IA32_APICBASE_ENABLE; return 0; @@ -1410,9 +1407,9 @@ static void do_intr_requests(struct kvm_vcpu *vcpu, r = kvm_vcpu_irq_pop(vcpu, &ack); break; case kvm_irqpin_extint: - printk(KERN_WARNING "KVM: external-interrupts not " \ - "handled yet\n"); - __clear_bit(pin, &vcpu->irq.pending); + r = kvm_irqdevice_ack(&vcpu->kvm->isa_irq, 0, &ack); + if (!(ack.flags & KVM_IRQACKDATA_VECTOR_PENDING)) + __clear_bit(pin, &vcpu->irq.pending); break; case kvm_irqpin_nmi: /* @@ -1509,8 +1506,8 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu, !kvm_vcpu_irq_pending(vcpu) && !(ack.flags & KVM_IRQACKDATA_NEXT_VALID)); kvm_run->if_flag = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF) != 0; - kvm_run->cr8 = vcpu->cr8; - kvm_run->apic_base = vcpu->apic_base; + kvm_run->cr8 = 
kvm_lapic_get_tpr(vcpu); + kvm_run->apic_base = kvm_lapic_get_base(vcpu); } /* diff --git a/drivers/kvm/userint.c b/drivers/kvm/userint.c index a60707d..c6118b0 100644 --- a/drivers/kvm/userint.c +++ b/drivers/kvm/userint.c @@ -218,6 +218,12 @@ int kvm_user_irqdev_restore(struct kvm_irqdevice *this, void *data) int kvm_userint_init(struct kvm_vcpu *vcpu) { - return kvm_user_irqdev_init(&vcpu->irq.dev); + int ret; + + ret = kvm_user_irqdev_init(&vcpu->irq.dev); + if (ret < 0) + return ret; + + return kvm_lapic_init(vcpu, NULL, KVM_LAPIC_OPTION_USERMODE); } diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 7f2af92..82e40c9 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1236,10 +1236,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) memset(vcpu->regs, 0, sizeof(vcpu->regs)); vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val(); - vcpu->cr8 = 0; - vcpu->apic_base = 0xfee00000 | - /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | - MSR_IA32_APICBASE_ENABLE; fx_init(vcpu); @@ -1480,9 +1476,9 @@ static void do_intr_requests(struct kvm_vcpu *vcpu, r = kvm_vcpu_irq_pop(vcpu, &ack); break; case kvm_irqpin_extint: - printk(KERN_WARNING "KVM: external-interrupts not " \ - "handled yet\n"); - __clear_bit(pin, &vcpu->irq.pending); + r = kvm_irqdevice_ack(&vcpu->kvm->isa_irq, 0, &ack); + if (!(ack.flags & KVM_IRQACKDATA_VECTOR_PENDING)) + __clear_bit(pin, &vcpu->irq.pending); break; case kvm_irqpin_nmi: /* @@ -1849,7 +1845,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; case 8: vcpu_load_rsp_rip(vcpu); - vcpu->regs[reg] = vcpu->cr8; + vcpu->regs[reg] = kvm_lapic_get_tpr(vcpu); vcpu_put_rsp_rip(vcpu); skip_emulated_instruction(vcpu); return 1; @@ -1949,8 +1945,8 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu, kvm_irqdevice_ack(&vcpu->irq.dev, KVM_IRQACK_FLAG_PEEK, &ack); kvm_run->if_flag = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) != 0; - kvm_run->cr8 = vcpu->cr8; - kvm_run->apic_base = vcpu->apic_base; + kvm_run->cr8 = 
kvm_lapic_get_tpr(vcpu); + kvm_run->apic_base = kvm_lapic_get_base(vcpu); kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open && !kvm_vcpu_irq_pending(vcpu) && diff --git a/include/linux/kvm.h b/include/linux/kvm.h index e6edca8..aaa826e 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -231,6 +231,17 @@ struct kvm_dirty_log { }; }; +/* for KVM_APIC */ +struct kvm_apic_msg { + /* in */ + __u32 dest; + __u32 trig_mode; + __u32 dest_mode; + __u32 delivery_mode; + __u32 vector; + __u32 padding; +}; + struct kvm_cpuid_entry { __u32 function; __u32 eax; @@ -282,6 +293,9 @@ struct kvm_signal_mask { #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) #define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) +#define KVM_ENABLE_KERNEL_PIC _IOW(KVMIO, 0x44, __u32) +#define KVM_ISA_INTERRUPT _IOW(KVMIO, 0x45, struct kvm_interrupt) +#define KVM_APIC_MSG _IOW(KVMIO, 0x46, struct kvm_apic_msg) /* * ioctls for vcpu fds @@ -300,5 +314,6 @@ struct kvm_signal_mask { #define KVM_SET_SIGNAL_MASK _IOW(KVMIO, 0x8b, struct kvm_signal_mask) #define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu) #define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) +#define KVM_APIC_RESET _IO(KVMIO, 0x8e) #endif ------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ _______________________________________________ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel