Hi All, Attached are the first three patches in my queue. The first two you are likely familiar with at this point (though I have made some more of the requested changes to 02-irqdevice.patch). The last item (03-preemptible-cpu.patch) adds an implementation to the previously unused kvm_vcpu_intr() callback. This acts as a functional example of the INTR callback mechanism as Avi requested. Note that the work related to IF/NMI/TPR classification of interrupts happens later in my queue and is not mature enough to share yet, but hopefully soon.
Thoughts? -Greg
KVM: Adds support for in-kernel mmio handlers From: <> Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]> --- drivers/kvm/kvm.h | 31 ++++++++++++++++++ drivers/kvm/kvm_main.c | 82 +++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 101 insertions(+), 12 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index fceeb84..181099f 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -236,6 +236,36 @@ struct kvm_pio_request { int rep; }; +struct kvm_io_device { + unsigned long (*read)(struct kvm_io_device *this, + gpa_t addr, + int length); + void (*write)(struct kvm_io_device *this, + gpa_t addr, + int length, + unsigned long val); + int (*in_range)(struct kvm_io_device *this, gpa_t addr); + + void *private; +}; + +/* + * It would be nice to use something smarter than a linear search, TBD... + * Thankfully we dont expect many devices to register (famous last words :), + * so until then it will suffice. At least its abstracted so we can change + * in one place. 
+ */ +struct kvm_io_bus { + int dev_count; +#define NR_IOBUS_DEVS 6 + struct kvm_io_device *devs[NR_IOBUS_DEVS]; +}; + +void kvm_io_bus_init(struct kvm_io_bus *bus); +struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); +void kvm_io_bus_register_dev(struct kvm_io_bus *bus, + struct kvm_io_device *dev); + struct kvm_vcpu { struct kvm *kvm; union { @@ -345,6 +375,7 @@ struct kvm { unsigned long rmap_overflow; struct list_head vm_list; struct file *filp; + struct kvm_io_bus mmio_bus; }; struct kvm_stat { diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 4473174..c3c0059 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -294,6 +294,7 @@ static struct kvm *kvm_create_vm(void) spin_lock_init(&kvm->lock); INIT_LIST_HEAD(&kvm->active_mmu_pages); + kvm_io_bus_init(&kvm->mmio_bus); for (i = 0; i < KVM_MAX_VCPUS; ++i) { struct kvm_vcpu *vcpu = &kvm->vcpus[i]; @@ -1015,12 +1016,25 @@ static int emulator_write_std(unsigned long addr, return X86EMUL_UNHANDLEABLE; } +static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, + gpa_t addr) +{ + /* + * Note that its important to have this wrapper function because + * in the very near future we will be checking for MMIOs against + * the LAPIC as well as the general MMIO bus + */ + return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); +} + static int emulator_read_emulated(unsigned long addr, unsigned long *val, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { - struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_io_device *mmio_dev; + gpa_t gpa; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -1029,18 +1043,26 @@ static int emulator_read_emulated(unsigned long addr, } else if (emulator_read_std(addr, val, bytes, ctxt) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - else { - gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; - 
vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 0; + gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; - return X86EMUL_UNHANDLEABLE; + /* + * Is this MMIO handled locally? + */ + mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); + if (mmio_dev) { + *val = mmio_dev->read(mmio_dev, gpa, bytes); + return X86EMUL_CONTINUE; } + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 0; + + return X86EMUL_UNHANDLEABLE; } static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -1068,8 +1090,9 @@ static int emulator_write_emulated(unsigned long addr, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { - struct kvm_vcpu *vcpu = ctxt->vcpu; - gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_io_device *mmio_dev; + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; @@ -1077,6 +1100,15 @@ static int emulator_write_emulated(unsigned long addr, if (emulator_write_phys(vcpu, gpa, val, bytes)) return X86EMUL_CONTINUE; + /* + * Is this MMIO handled locally? 
+ */ + mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); + if (mmio_dev) { + mmio_dev->write(mmio_dev, gpa, bytes, val); + return X86EMUL_CONTINUE; + } + vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; vcpu->mmio_size = bytes; @@ -2911,6 +2943,32 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, return NOTIFY_OK; } +void kvm_io_bus_init(struct kvm_io_bus *bus) +{ + memset(bus, 0, sizeof(*bus)); +} + +struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr) +{ + int i; + + for (i = 0; i < bus->dev_count; i++) { + struct kvm_io_device *pos = bus->devs[i]; + + if (pos->in_range(pos, addr)) + return pos; + } + + return NULL; +} + +void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) +{ + BUG_ON(bus->dev_count >= (NR_IOBUS_DEVS-1)); + + bus->devs[bus->dev_count++] = dev; +} + static struct notifier_block kvm_cpu_notifier = { .notifier_call = kvm_cpu_hotplug, .priority = 20, /* must be > scheduler priority */
KVM: Add irqdevice object From: <> The current code is geared towards using a user-mode (A)PIC. This patch adds an "irqdevice" abstraction, and implements a "userint" model to handle the duties of the original code. Later, we can develop other irqdevice models to handle objects like LAPIC, IOAPIC, i8259, etc, as appropriate Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]> --- drivers/kvm/Makefile | 2 drivers/kvm/irqdevice.h | 170 ++++++++++++++++++++++++++++++++++++++++ drivers/kvm/kvm.h | 9 +- drivers/kvm/kvm_main.c | 57 ++++++++++--- drivers/kvm/svm.c | 33 ++++---- drivers/kvm/userint.c | 202 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/kvm/vmx.c | 29 +++---- 7 files changed, 449 insertions(+), 53 deletions(-) diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile index c0a789f..540afbc 100644 --- a/drivers/kvm/Makefile +++ b/drivers/kvm/Makefile @@ -2,7 +2,7 @@ # Makefile for Kernel-based Virtual Machine module # -kvm-objs := kvm_main.o mmu.o x86_emulate.o +kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/drivers/kvm/irqdevice.h b/drivers/kvm/irqdevice.h new file mode 100644 index 0000000..fe284bc --- /dev/null +++ b/drivers/kvm/irqdevice.h @@ -0,0 +1,170 @@ +/* + * Defines an interface for an abstract interrupt controller. The model + * consists of a unit with an arbitrary number of input lines (IRQ0-N), an + * output line (INTR), and methods for completing an interrupt-acknowledge + * cycle (INTA). A particular implementation of this model will define + * various policies, such as irq-to-vector translation, INTA/auto-EOI policy, + * etc. + * + * In addition, the INTR callback mechanism allows the unit to be "wired" to + * an interruptible source in a very flexible manner. 
For instance, an + * irqdevice could have its INTR wired to a VCPU (ala LAPIC), or another + * interrupt controller (ala cascaded i8259s) + * + * Copyright (C) 2007 Novell + * + * Authors: + * Gregory Haskins <[EMAIL PROTECTED]> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef __IRQDEVICE_H +#define __IRQDEVICE_H + +#define KVM_IRQFLAGS_NMI (1 << 0) +#define KVM_IRQFLAGS_PEEK (1 << 1) + +struct kvm_irqdevice; + +struct kvm_irqsink { + void (*raise_intr)(struct kvm_irqsink *this, + struct kvm_irqdevice *dev); + + void *private; +}; + +struct kvm_irqdevice { + int (*pending)(struct kvm_irqdevice *this, int flags); + int (*read_vector)(struct kvm_irqdevice *this, int flags); + int (*set_pin)(struct kvm_irqdevice *this, int pin, int level); + int (*summary)(struct kvm_irqdevice *this, void *data); + void (*destructor)(struct kvm_irqdevice *this); + + void *private; + struct kvm_irqsink sink; +}; + +/** + * kvm_irqdevice_init - initialize the kvm_irqdevice for use + * @dev: The device + * + * Description: Initialize the kvm_irqdevice for use. 
Should be called before + * calling any derived implementation init functions + * + * Returns: (void) + */ +static inline void kvm_irqdevice_init(struct kvm_irqdevice *dev) +{ + memset(dev, 0, sizeof(*dev)); +} + +/** + * kvm_irqdevice_pending - efficiently determines if an interrupt is pending + * @dev: The device + * @flags: Modifies the behavior as follows: + * [+ KVM_IRQFLAGS_NMI: Mask everything but NMIs] + * + * Description: Efficiently determines if an interrupt is pending on an + * irqdevice + * + * Returns: (int) + * [0 = no iterrupts pending (per "flags" criteria)] + * [1 = one or more interrupts are pending] + */ +static inline int kvm_irqdevice_pending(struct kvm_irqdevice *dev, int flags) +{ + return dev->pending(dev, flags); +} + +/** + * kvm_irqdevice_read_vector - read the highest priority vector from the device + * @dev: The device + * @flags: Modifies the behavior as follows: + * [+ KVM_IRQFLAGS_NMI: Mask everything but NMIs] + * [+ KVM_IRQFLAGS_PEEK: Do not auto-acknowledge interrupt] + * + * Description: Read the highest priority pending vector from the device, + * potentially invoking auto-EOI depending on device policy + * + * Returns: (int) + * [ -1 = no interrupts pending (per "flags" criteria)] + * [>=0 = the highest pending vector] + */ +static inline int kvm_irqdevice_read_vector(struct kvm_irqdevice *dev, + int flags) +{ + return dev->read_vector(dev, flags); +} + +/** + * kvm_irqdevice_set_pin - allows the caller to assert/deassert an IRQ + * @dev: The device + * @pin: The input pin to alter + * @level: The value to set (1 = assert, 0 = deassert) + * + * Description: Allows the caller to assert/deassert an IRQ input pin to the + * device according to device policy. 
+ * + * Returns: (int) + * [-1 = failure] + * [ 0 = success] + */ +static inline int kvm_irqdevice_set_pin(struct kvm_irqdevice *dev, int pin, + int level) +{ + return dev->set_pin(dev, pin, level); +} + +/** + * kvm_irqdevice_summary - loads a summary bitmask + * @dev: The device + * @data: A pointer to a region capable of holding a 256 bit bitmap + * + * Description: Loads a summary bitmask of all pending vectors (0-255) + * + * Returns: (int) + * [-1 = failure] + * [ 0 = success] + */ +static inline int kvm_irqdevice_summary(struct kvm_irqdevice *dev, void *data) +{ + return dev->summary(dev, data); +} + +/** + * kvm_irqdevice_register_sink - registers an kvm_irqsink object + * @dev: The device + * @sink: The sink to register. Data will be copied so building object from + * transient storage is ok. + * + * Description: Registers an kvm_irqsink object as an INTR callback + * + * Returns: (void) + */ +static inline void kvm_irqdevice_register_sink(struct kvm_irqdevice *dev, + const struct kvm_irqsink *sink) +{ + dev->sink = *sink; +} + +/* + * kvm_irqdevice_raise_intr - invokes a registered INTR callback + * @dev: The device + * + * Description: Invokes a registered INTR callback (if present). This + * function is meant to be used privately by a irqdevice + * implementation. + * + * Returns: (void) + */ +static inline void kvm_irqdevice_raise_intr(struct kvm_irqdevice *dev) +{ + struct kvm_irqsink *sink = &dev->sink; + if (sink->raise_intr) + sink->raise_intr(sink, dev); +} + +#endif /* __IRQDEVICE_H */ diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 181099f..58966d9 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -13,6 +13,7 @@ #include <linux/mm.h> #include "vmx.h" +#include "irqdevice.h" #include <linux/kvm.h> #include <linux/kvm_para.h> @@ -157,6 +158,8 @@ struct vmcs { struct kvm_vcpu; +int kvm_userint_init(struct kvm_irqdevice *dev); + /* * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level * 32-bit). 
The kvm_mmu structure abstracts the details of the current mmu @@ -266,6 +269,8 @@ struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev); +#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) + struct kvm_vcpu { struct kvm *kvm; union { @@ -278,9 +283,7 @@ struct kvm_vcpu { u64 host_tsc; struct kvm_run *run; int interrupt_window_open; - unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ -#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) - unsigned long irq_pending[NR_IRQ_WORDS]; + struct kvm_irqdevice irq_dev; unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ unsigned long rip; /* needs vcpu_load_rsp_rip() */ diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index c3c0059..7e00412 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1989,8 +1989,7 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->efer = vcpu->shadow_efer; sregs->apic_base = vcpu->apic_base; - memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, - sizeof sregs->interrupt_bitmap); + kvm_irqdevice_summary(&vcpu->irq_dev, &sregs->interrupt_bitmap); vcpu_put(vcpu); @@ -2044,13 +2043,11 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); - memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, - sizeof vcpu->irq_pending); - vcpu->irq_summary = 0; - for (i = 0; i < NR_IRQ_WORDS; ++i) - if (vcpu->irq_pending[i]) - __set_bit(i, &vcpu->irq_summary); - + /* walk the interrupt-bitmap and inject an IRQ for each bit found */ + for (i = 0; i < 256; ++i) + if (test_bit(i, &sregs->interrupt_bitmap[0])) + kvm_irqdevice_set_pin(&vcpu->irq_dev, i, 1); + set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); set_segment(vcpu, &sregs->es, VCPU_SREG_ES); @@ -2210,14 +2207,8 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, { 
if (irq->irq < 0 || irq->irq >= 256) return -EINVAL; - vcpu_load(vcpu); - - set_bit(irq->irq, vcpu->irq_pending); - set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary); - vcpu_put(vcpu); - - return 0; + return kvm_irqdevice_set_pin(&vcpu->irq_dev, irq->irq, 1); } static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, @@ -2319,6 +2310,36 @@ out1: } /* + * This function will be invoked whenever the vcpu->irq_dev raises its INTR + * line + */ +static void kvm_vcpu_intr(struct kvm_irqsink *this, + struct kvm_irqdevice *dev) +{ + /* + * Our irq device is requesting to interrupt the vcpu. If it is + * currently running, we should inject a host IPI to force a VMEXIT + */ + + /* + * FIXME: Implement this or the CPU wont notice the interrupt until + * the next natural VMEXIT. Note that this is how the system + * has always worked, so nothing is broken here. This is a future + * enhancement + */ +} + +static void kvm_vcpu_irqsink_init(struct kvm_vcpu *vcpu) +{ + struct kvm_irqsink sink = { + .raise_intr = kvm_vcpu_intr, + .private = vcpu + }; + + kvm_irqdevice_register_sink(&vcpu->irq_dev, &sink); +} + +/* * Creates some virtual cpus. Good luck creating more than one. 
*/ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) @@ -2364,6 +2385,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) if (r < 0) goto out_free_vcpus; + kvm_irqdevice_init(&vcpu->irq_dev); + kvm_vcpu_irqsink_init(vcpu); + kvm_userint_init(&vcpu->irq_dev); + kvm_arch_ops->vcpu_load(vcpu); r = kvm_mmu_setup(vcpu); if (r >= 0) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index b7e1410..e59a548 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -108,20 +108,16 @@ static unsigned get_addr_size(struct kvm_vcpu *vcpu) static inline u8 pop_irq(struct kvm_vcpu *vcpu) { - int word_index = __ffs(vcpu->irq_summary); - int bit_index = __ffs(vcpu->irq_pending[word_index]); - int irq = word_index * BITS_PER_LONG + bit_index; - - clear_bit(bit_index, &vcpu->irq_pending[word_index]); - if (!vcpu->irq_pending[word_index]) - clear_bit(word_index, &vcpu->irq_summary); - return irq; + return kvm_irqdevice_read_vector(&vcpu->irq_dev, 0); } static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) { - set_bit(irq, vcpu->irq_pending); - set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); + /* FIXME: We probably want to reserve the "set_pin" verb for + * actual interrupt requests, not for putting back something + * previously pending. 
Lets revisit this + */ + kvm_irqdevice_set_pin(&vcpu->irq_dev, irq, 1); } static inline void clgi(void) @@ -1092,7 +1088,7 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1; skip_emulated_instruction(vcpu); - if (vcpu->irq_summary) + if (kvm_irqdevice_pending(&vcpu->irq_dev, 0)) return 1; kvm_run->exit_reason = KVM_EXIT_HLT; @@ -1263,7 +1259,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu, * possible */ if (kvm_run->request_interrupt_window && - !vcpu->irq_summary) { + !kvm_irqdevice_pending(&vcpu->irq_dev, 0)) { ++kvm_stat.irq_window_exits; kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; @@ -1399,7 +1395,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)); - if (vcpu->interrupt_window_open && vcpu->irq_summary) + if (vcpu->interrupt_window_open && + kvm_irqdevice_pending(&vcpu->irq_dev, 0)) /* * If interrupts enabled, and not blocked by sti or mov ss. Good. */ @@ -1409,7 +1406,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, * Interrupts blocked. Wait for unblock. 
*/ if (!vcpu->interrupt_window_open && - (vcpu->irq_summary || kvm_run->request_interrupt_window)) { + (kvm_irqdevice_pending(&vcpu->irq_dev, 0) || + kvm_run->request_interrupt_window)) { control->intercept |= 1ULL << INTERCEPT_VINTR; } else control->intercept &= ~(1ULL << INTERCEPT_VINTR); @@ -1418,8 +1416,9 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, static void post_kvm_run_save(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open && - vcpu->irq_summary == 0); + kvm_run->ready_for_interrupt_injection = + (vcpu->interrupt_window_open && + !kvm_irqdevice_pending(&vcpu->irq_dev, 0)); kvm_run->if_flag = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF) != 0; kvm_run->cr8 = vcpu->cr8; kvm_run->apic_base = vcpu->apic_base; @@ -1434,7 +1433,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu, static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - return (!vcpu->irq_summary && + return (!kvm_irqdevice_pending(&vcpu->irq_dev, 0) && kvm_run->request_interrupt_window && vcpu->interrupt_window_open && (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)); diff --git a/drivers/kvm/userint.c b/drivers/kvm/userint.c new file mode 100644 index 0000000..8363060 --- /dev/null +++ b/drivers/kvm/userint.c @@ -0,0 +1,202 @@ +/* + * User Interrupts IRQ device + * + * This acts as an extention of an interrupt controller that exists elsewhere + * (typically in userspace/QEMU). Because this PIC is a pseudo device that + * is downstream from a real emulated PIC, the "IRQ-to-vector" mapping has + * already occured. Therefore, this PIC has the following unusal properties: + * + * 1) It has 256 "pins" which are literal vectors (i.e. 
no translation) + * 2) It only supports "auto-EOI" behavior since it is expected that the + * upstream emulated PIC will handle the real EOIs (if applicable) + * 3) It only listens to "asserts" on the pins (deasserts are dropped) + * because its an auto-EOI device anyway. + * + * Copyright (C) 2007 Novell + * + * bitarray code based on original vcpu->irq_pending code, + * Copyright (C) 2007 Qumranet + * + * Authors: + * Gregory Haskins <[EMAIL PROTECTED]> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "kvm.h" + +/* + *---------------------------------------------------------------------- + * optimized bitarray object - works like bitarrays in bitops, but uses + * a summary field to accelerate lookups. Assumes external locking + *--------------------------------------------------------------------- + */ + +struct bitarray { + unsigned long summary; /* 1 per word in pending */ + unsigned long pending[NR_IRQ_WORDS]; +}; + +static inline int bitarray_pending(struct bitarray *this) +{ + return this->summary ? 
1 : 0; +} + +static inline int bitarray_findhighest(struct bitarray *this) +{ + if (!this->summary) + return -1; + else { + int word_index = __fls(this->summary); + int bit_index = __fls(this->pending[word_index]); + + return word_index * BITS_PER_LONG + bit_index; + } +} + +static inline void bitarray_set(struct bitarray *this, int nr) +{ + __set_bit(nr, &this->pending); + __set_bit(nr / BITS_PER_LONG, &this->summary); +} + +static inline void bitarray_clear(struct bitarray *this, int nr) +{ + int word = nr / BITS_PER_LONG; + + __clear_bit(nr, &this->pending); + if (!this->pending[word]) + __clear_bit(word, &this->summary); +} + +static inline int bitarray_test(struct bitarray *this, int nr) +{ + return test_bit(nr, &this->pending); +} + +/* + *---------------------------------------------------------------------- + * userint interface - provides the actual kvm_irqdevice implementation + *--------------------------------------------------------------------- + */ + +struct kvm_userint { + spinlock_t lock; + struct bitarray irq_pending; + int nmi_pending; +}; + +static int userint_pending(struct kvm_irqdevice *this, int flags) +{ + struct kvm_userint *s = (struct kvm_userint*)this->private; + int ret; + + spin_lock_irq(&s->lock); + + if (flags & KVM_IRQFLAGS_NMI) + ret = s->nmi_pending; + else + ret = bitarray_pending(&s->irq_pending); + + spin_unlock_irq(&s->lock); + + return ret; +} + +static int userint_read_vector(struct kvm_irqdevice *this, int flags) +{ + struct kvm_userint *s = (struct kvm_userint*)this->private; + int irq; + + spin_lock_irq(&s->lock); + + /* + * NMIs take priority, so if there is an NMI pending, or + * if we are filtering out NMIs, only consider them + */ + if (s->nmi_pending || (flags & KVM_IRQFLAGS_NMI)) + irq = s->nmi_pending ? 
2 : -1; + else + irq = bitarray_findhighest(&s->irq_pending); + + if ((irq > -1) && !(flags & KVM_IRQFLAGS_PEEK)) { + /* + * If the "peek" flag is not set, automatically clear the + * interrupt as the EOI mechanism (if any) will take place + * in userspace + */ + bitarray_clear(&s->irq_pending, irq); + if (irq == 2) + s->nmi_pending = 0; + } + + spin_unlock_irq(&s->lock); + + return irq; +} + +static int userint_set_pin(struct kvm_irqdevice* this, int irq, int level) +{ + struct kvm_userint *s = (struct kvm_userint*)this->private; + + if (!level) + return 0; /* We dont care about deasserts */ + + spin_lock_irq(&s->lock); + + /* + * Update the local state + */ + bitarray_set(&s->irq_pending, irq); + if (irq == 2) + s->nmi_pending = 1; + + spin_unlock_irq(&s->lock); + + /* + * And then alert the higher layer software we have changes + */ + kvm_irqdevice_raise_intr(this); + + return 0; +} + +static int userint_summary(struct kvm_irqdevice* this, void *data) +{ + struct kvm_userint *s = (struct kvm_userint*)this->private; + + spin_lock_irq(&s->lock); + memcpy(data, s->irq_pending.pending, sizeof s->irq_pending.pending); + spin_unlock_irq(&s->lock); + + return 0; +} + +static void userint_destructor(struct kvm_irqdevice *this) +{ + kfree(this->private); +} + +int kvm_userint_init(struct kvm_irqdevice *dev) +{ + struct kvm_userint *s; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + spin_lock_init(&s->lock); + + dev->pending = userint_pending; + dev->read_vector = userint_read_vector; + dev->set_pin = userint_set_pin; + dev->summary = userint_summary; + dev->destructor = userint_destructor; + + dev->private = s; + + return 0; +} + diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 61a6116..a0fdf02 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1219,13 +1219,8 @@ static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) { - int word_index = __ffs(vcpu->irq_summary); - int 
bit_index = __ffs(vcpu->irq_pending[word_index]); - int irq = word_index * BITS_PER_LONG + bit_index; - - clear_bit(bit_index, &vcpu->irq_pending[word_index]); - if (!vcpu->irq_pending[word_index]) - clear_bit(word_index, &vcpu->irq_summary); + int irq = kvm_irqdevice_read_vector(&vcpu->irq_dev, 0); + BUG_ON(irq < 0); if (vcpu->rmode.active) { inject_rmode_irq(vcpu, irq); @@ -1246,7 +1241,7 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); if (vcpu->interrupt_window_open && - vcpu->irq_summary && + kvm_irqdevice_pending(&vcpu->irq_dev, 0) && !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) /* * If interrupts enabled, and not blocked by sti or mov ss. Good. @@ -1255,7 +1250,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); if (!vcpu->interrupt_window_open && - (vcpu->irq_summary || kvm_run->request_interrupt_window)) + (kvm_irqdevice_pending(&vcpu->irq_dev, 0) || + kvm_run->request_interrupt_window)) /* * Interrupts blocked. Wait for unblock. */ @@ -1314,8 +1310,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (is_external_interrupt(vect_info)) { int irq = vect_info & VECTORING_INFO_VECTOR_MASK; - set_bit(irq, vcpu->irq_pending); - set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); + /* FIXME: Is this right? 
*/ + kvm_irqdevice_set_pin(&vcpu->irq_dev, irq, 1); } if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ @@ -1619,8 +1615,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu, kvm_run->if_flag = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) != 0; kvm_run->cr8 = vcpu->cr8; kvm_run->apic_base = vcpu->apic_base; - kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open && - vcpu->irq_summary == 0); + kvm_run->ready_for_interrupt_injection = + (vcpu->interrupt_window_open && + !kvm_irqdevice_pending(&vcpu->irq_dev, 0)); } static int handle_interrupt_window(struct kvm_vcpu *vcpu, @@ -1631,7 +1628,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, * possible */ if (kvm_run->request_interrupt_window && - !vcpu->irq_summary) { + !kvm_irqdevice_pending(&vcpu->irq_dev, 0)) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; ++kvm_stat.irq_window_exits; return 0; @@ -1642,7 +1639,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { skip_emulated_instruction(vcpu); - if (vcpu->irq_summary) + if (kvm_irqdevice_pending(&vcpu->irq_dev, 0)) return 1; kvm_run->exit_reason = KVM_EXIT_HLT; @@ -1713,7 +1710,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - return (!vcpu->irq_summary && + return (!kvm_irqdevice_pending(&vcpu->irq_dev, 0) && kvm_run->request_interrupt_window && vcpu->interrupt_window_open && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
KVM: Preemptible VCPU From: <> This adds support for interrupting an executing CPU Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]> --- drivers/kvm/Makefile | 2 - drivers/kvm/condvar.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/kvm/condvar.h | 36 ++++++++++++++++ drivers/kvm/kvm.h | 12 +++++ drivers/kvm/kvm_main.c | 47 ++++++++++++++++++--- drivers/kvm/svm.c | 35 +++++++++++++++ drivers/kvm/vmx.c | 35 +++++++++++++++ 7 files changed, 270 insertions(+), 6 deletions(-) diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile index 540afbc..b3bef0e 100644 --- a/drivers/kvm/Makefile +++ b/drivers/kvm/Makefile @@ -2,7 +2,7 @@ # Makefile for Kernel-based Virtual Machine module # -kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o +kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o condvar.o obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/drivers/kvm/condvar.c b/drivers/kvm/condvar.c new file mode 100644 index 0000000..87e464a --- /dev/null +++ b/drivers/kvm/condvar.c @@ -0,0 +1,109 @@ +/* + * Condition Variable + * + * Copyright (C) 2007, Novell + * + * Authors: + * Gregory Haskins <[EMAIL PROTECTED]> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +#include "condvar.h" + +void condvar_init(struct condvar *cv) +{ + wait_queue_head_t __head = __WAIT_QUEUE_HEAD_INITIALIZER(cv->queue); + + memset(cv, 0, sizeof(*cv)); + cv->queue = __head; +} +EXPORT_SYMBOL_GPL(condvar_init); + +/* + * Assumes the lock is already held + */ +int condvar_wait(struct condvar *cv, void *l, long timeout) +{ + DEFINE_WAIT(__wait); + int _ret = 0; + + BUG_ON(!cv->lock_ops); + + /* + * first place ourselves on the waitqueue before releasing the lock + */ + prepare_to_wait(&cv->queue, &__wait, TASK_UNINTERRUPTIBLE); + + /* + * now actually release the lock to unblock any potential signalers + */ + cv->lock_ops->unlock(l); + + /* + * finally, reschedule until we are re-awoken + */ + if (timeout > -1) + schedule_timeout(timeout); + else + schedule(); + finish_wait(&cv->queue, &__wait); + + /* + * if we get here, its because someone signaled us. + * reaquire the lock + */ + cv->lock_ops->lock(l); + + return _ret; +} +EXPORT_SYMBOL_GPL(condvar_wait); + +/* + * Assumes the lock is already held + */ +int condvar_signal(struct condvar *cv) +{ + wake_up(&cv->queue); + return 0; +} +EXPORT_SYMBOL_GPL(condvar_signal); + +/* + *------------------------------------------------------------------------ + * spinlock_condvar + * + * spinlock_lock/unlock can sometimes be implemented as macros, so + * assigning them as function pointers directly is probably not going to + * work. 
Therefore we need these lightweight wrappers + *------------------------------------------------------------------------ + */ + +static void spinlock_condvar_lock(void *l) +{ + spinlock_t *lock = (spinlock_t*)l; + + spin_lock(lock); +} + +static void spinlock_condvar_unlock(void *l) +{ + spinlock_t *lock = (spinlock_t*)l; + + spin_unlock(lock); +} + +static struct cv_lock_ops spinlock_ops = { + .lock = spinlock_condvar_lock, + .unlock = spinlock_condvar_unlock +}; + +void spinlock_condvar_init(struct condvar *cv) +{ + condvar_init(cv); + + cv->lock_ops = &spinlock_ops; +} + diff --git a/drivers/kvm/condvar.h b/drivers/kvm/condvar.h new file mode 100644 index 0000000..58ed523 --- /dev/null +++ b/drivers/kvm/condvar.h @@ -0,0 +1,36 @@ +/* + * Condition Variable + * + * Copyright (C) 2007, Novell + * + * Authors: + * Gregory Haskins <[EMAIL PROTECTED]> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +struct cv_lock_ops { + void (*lock)(void *); + void (*unlock)(void *); +}; + +struct condvar { + wait_queue_head_t queue; + struct cv_lock_ops *lock_ops; +}; + +void condvar_init(struct condvar *cv); +int condvar_wait(struct condvar *cv, void *l, long timeout); +int condvar_signal(struct condvar *cv); + +/* + *------------------------------------------------------------------------ + * spinlock_condvar + *------------------------------------------------------------------------ + */ + +void spinlock_condvar_init(struct condvar *cv); + + diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 58966d9..703ffe0 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -14,6 +14,7 @@ #include "vmx.h" #include "irqdevice.h" +#include "condvar.h" #include <linux/kvm.h> #include <linux/kvm_para.h> @@ -271,6 +272,16 @@ void kvm_io_bus_register_dev(struct kvm_io_bus *bus, #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) +/* + * structure for maintaining info for interrupting an 
executing VCPU + */ +struct kvm_vcpu_irq { + spinlock_t lock; + struct condvar cv; + struct task_struct *task; + int pending; +}; + struct kvm_vcpu { struct kvm *kvm; union { @@ -284,6 +295,7 @@ struct kvm_vcpu { struct kvm_run *run; int interrupt_window_open; struct kvm_irqdevice irq_dev; + struct kvm_vcpu_irq irq; unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ unsigned long rip; /* needs vcpu_load_rsp_rip() */ diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 7e00412..ea3609e 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -299,6 +299,11 @@ static struct kvm *kvm_create_vm(void) struct kvm_vcpu *vcpu = &kvm->vcpus[i]; mutex_init(&vcpu->mutex); + + memset(&vcpu->irq, 0, sizeof(vcpu->irq)); + spin_lock_init(&vcpu->irq.lock); + spinlock_condvar_init(&vcpu->irq.cv); + vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->mmu.root_hpa = INVALID_PAGE; @@ -2320,13 +2325,45 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this, * Our irq device is requesting to interrupt the vcpu. If it is * currently running, we should inject a host IPI to force a VMEXIT */ - + struct kvm_vcpu *vcpu = (struct kvm_vcpu*)this->private; + /* - * FIXME: Implement this or the CPU wont notice the interrupt until - * the next natural VMEXIT. Note that this is how the system - * has always worked, so nothing is broken here. This is a future - * enhancement + * HACK ALERT! + * + * We want to send a virtual interrupt signal to the task that owns + * the guest. However, the signal will only force a VMEXIT (via + * a reschedule IPI) if the task is currently in GUEST mode. There + * is a race condition between the time that we mark the vcpu as + * running and the time the system actually enters guest mode. Since + * there doesn't appear to be any way to help with this situation from + * the VT hardware, we are forced to wait to make sure the guest + * actually gets interrupted in a reasonable amount of time. 
If it + * does not, we assume that the IPI failed because it was too early + * and must try again until it does. + * + * This condvar/spinlock/timeout/retry eliminates the race in a safe + * manner, at the expense of making the INTR delivery synchronous */ + spin_lock(&vcpu->irq.lock); + + if (vcpu->irq.task) { + struct timespec tmo = { + .tv_sec = 0, + .tv_nsec = 100000 /* 100us */ + }; + + BUG_ON(vcpu->irq.task == current); + + while (vcpu->irq.task) { + send_sig(SIGSTOP, vcpu->irq.task, 0); + condvar_wait(&vcpu->irq.cv, &vcpu->irq.lock, + timespec_to_jiffies(&tmo)); + } + + vcpu->irq.pending = 1; + } + + spin_unlock(&vcpu->irq.lock); } static void kvm_vcpu_irqsink_init(struct kvm_vcpu *vcpu) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index e59a548..6bc2fb1 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1463,9 +1463,25 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; again: + spin_lock(&vcpu->irq.lock); + + /* + * Setting vcpu->task signals to outsiders that the VMCS is + * effectively in GUEST mode, and therefore must be signalled + * to transition the task back to HOST mode if any new interrupts + * arrive. + */ + vcpu->irq.task = current; + + /* + * We also must inject interrupts (if any) while the irq_lock + * is held + */ if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); + spin_unlock(&vcpu->irq.lock); + clgi(); pre_svm_run(vcpu); @@ -1617,6 +1633,25 @@ again: reload_tss(vcpu); /* + * Signal that we have transitioned back to host mode + */ + spin_lock(&vcpu->irq.lock); + + vcpu->irq.task = NULL; + condvar_signal(&vcpu->irq.cv); + + /* + * If irqpending is asserted someone undoubtedly has sent us a SIGSTOP + * signal. 
Counter it with a SIGCONT + */ + if(vcpu->irq.pending) { + send_sig(SIGCONT, current, 0); + vcpu->irq.pending = 0; + } + + spin_unlock(&vcpu->irq.lock); + + /* * Profile KVM exit RIPs: */ if (unlikely(prof_on == KVM_PROFILING)) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index a0fdf02..f7b716b 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1748,9 +1748,25 @@ again: vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); #endif + spin_lock(&vcpu->irq.lock); + + /* + * Setting vcpu->task signals to outsiders that the VMCS is + * effectively in GUEST mode, and therefore must be signalled + * to transition the task back to HOST mode if any new interrupts + * arrive. + */ + vcpu->irq.task = current; + + /* + * We also must inject interrupts (if any) while the irq_lock + * is held + */ if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); + spin_unlock(&vcpu->irq.lock); + if (vcpu->guest_debug.enabled) kvm_guest_debug_pre(vcpu); @@ -1911,6 +1927,25 @@ again: asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); + /* + * Signal that we have transitioned back to host mode + */ + spin_lock(&vcpu->irq.lock); + + vcpu->irq.task = NULL; + condvar_signal(&vcpu->irq.cv); + + /* + * If irqpending is asserted someone undoubtedly has sent us a SIGSTOP + * signal. Counter it with a SIGCONT + */ + if(vcpu->irq.pending) { + send_sig(SIGCONT, current, 0); + vcpu->irq.pending = 0; + } + + spin_unlock(&vcpu->irq.lock); + if (fail) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason
series
Description: Binary data
------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________ kvm-devel mailing list [EMAIL PROTECTED] https://lists.sourceforge.net/lists/listinfo/kvm-devel