Subject: [patch] KVM: paravirtual guest support
From: Ingo Molnar <[EMAIL PROTECTED]>

this enables a CONFIG_PARAVIRT Linux guest kernel to establish a 
hypercall API to a KVM host. If successfully registered, then the Linux 
guest will optimize a few things like its interrupt controller, io-delay 
and it also registers its cr3-cache structures with the host. (but the 
host will not touch those, just yet)

(this is fully backwards compatible - if the WRMSR fails then the Linux
guest continues to execute as a native kernel.)

Signed-off-by: Ingo Molnar <[EMAIL PROTECTED]>
---
 arch/i386/kernel/paravirt.c |  275 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/serial/8250.c       |    3 
 include/linux/paravirt.h    |   12 +
 init/main.c                 |    6 
 4 files changed, 294 insertions(+), 2 deletions(-)

Index: linux/arch/i386/kernel/paravirt.c
===================================================================
--- linux.orig/arch/i386/kernel/paravirt.c
+++ linux/arch/i386/kernel/paravirt.c
@@ -20,6 +20,7 @@
 #include <linux/efi.h>
 #include <linux/bcd.h>
 #include <linux/start_kernel.h>
+#include <linux/kvm_para.h>
 
 #include <asm/bug.h>
 #include <asm/paravirt.h>
@@ -33,6 +34,9 @@
 #include <asm/apic.h>
 #include <asm/tlbflush.h>
 
+#include <asm/i8259.h>
+#include <io_ports.h>
+
 /* nop stub */
 static void native_nop(void)
 {
@@ -683,3 +687,274 @@ struct paravirt_ops paravirt_ops = {
        .irq_enable_sysexit = native_irq_enable_sysexit,
        .iret = native_iret,
 };
+
+/*
+ * KVM paravirtualization optimizations:
+ */
+int kvm_paravirt;
+
+/*
+ * No need for any "IO delay" on KVM:
+ */
+static void kvm_io_delay(void)
+{
+}
+
+static DEFINE_PER_CPU(struct kvm_vcpu_para_state, para_state);
+
+/*
+ * Special, register-to-cr3 instruction based hypercall API
+ * variant to the KVM host. This utilizes the cr3 filter capability
+ * of the hardware - if this works out then no VM exit happens,
+ * if a VM exit happens then KVM will get the virtual address too.
+ */
+static void kvm_write_cr3(unsigned long guest_cr3)
+{
+       struct kvm_vcpu_para_state *para_state = &get_cpu_var(para_state);
+       struct kvm_cr3_cache *cache = &para_state->cr3_cache;
+       int idx;
+
+       /*
+        * Check the cache (maintained by the host) for a matching
+        * guest_cr3 => host_cr3 mapping. Use it if found:
+        */
+       for (idx = 0; idx < cache->max_idx; idx++) {
+               if (cache->entry[idx].guest_cr3 == guest_cr3) {
+                       /*
+                        * Cache-hit: we load the cached host-CR3 value.
+                        * This never causes any VM exit. (if it does then the
+                        * hypervisor could do nothing with this instruction
+                        * and the guest OS would be aborted)
+                        */
+                       asm volatile("movl %0, %%cr3"
+                               : : "r" (cache->entry[idx].host_cr3));
+                       goto out;
+               }
+       }
+
+       /*
+        * Cache-miss. Load the guest-cr3 value into cr3, which will
+        * cause a VM exit to the hypervisor, which then loads the
+        * host cr3 value and updates the cr3_cache.
+        */
+       asm volatile("movl %0, %%cr3" : : "r" (guest_cr3));
+out:
+       put_cpu_var(para_state);
+}
+
+/*
+ * Avoid the VM exit upon cr3 load by using the cached
+ * ->active_mm->pgd value:
+ */
+static void kvm_flush_tlb_user(void)
+{
+       kvm_write_cr3(__pa(current->active_mm->pgd));
+}
+
+static void kvm_flush_tlb_single(u32 addr)
+{
+       __native_flush_tlb_single(addr);
+}
+/*
+ * Disable global pages, do a flush, then enable global pages:
+ */
+static fastcall void kvm_flush_tlb_kernel(void)
+{
+       unsigned long orig_cr4 = read_cr4();
+
+       write_cr4(orig_cr4 & ~X86_CR4_PGE);
+       kvm_flush_tlb_user();
+       write_cr4(orig_cr4);
+}
+
+/*
+ * Simplified i8259A controller handling:
+ */
+static void mask_and_ack_kvm(unsigned int irq)
+{
+       unsigned int irqmask = 1 << irq;
+       unsigned long flags;
+
+       spin_lock_irqsave(&i8259A_lock, flags);
+       cached_irq_mask |= irqmask;
+
+       if (irq & 8) {
+               outb(cached_slave_mask, PIC_SLAVE_IMR);
+               outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
+               outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
+       } else {
+               outb(cached_master_mask, PIC_MASTER_IMR);
+               /* 'Specific EOI' to master: */
+               outb(0x60+irq, PIC_MASTER_CMD);
+       }
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static void disable_kvm_irq(unsigned int irq)
+{
+       unsigned int mask = 1 << irq;
+       unsigned long flags;
+
+       spin_lock_irqsave(&i8259A_lock, flags);
+       cached_irq_mask |= mask;
+       if (irq & 8)
+               outb(cached_slave_mask, PIC_SLAVE_IMR);
+       else
+               outb(cached_master_mask, PIC_MASTER_IMR);
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static void enable_kvm_irq(unsigned int irq)
+{
+       unsigned int mask = ~(1 << irq);
+       unsigned long flags;
+
+       spin_lock_irqsave(&i8259A_lock, flags);
+       cached_irq_mask &= mask;
+       if (irq & 8)
+               outb(cached_slave_mask, PIC_SLAVE_IMR);
+       else
+               outb(cached_master_mask, PIC_MASTER_IMR);
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static struct irq_chip kvm_chip = {
+       .name           = "XT-PIC",
+       .mask           = disable_kvm_irq,
+       .disable        = disable_kvm_irq,
+       .unmask         = enable_kvm_irq,
+       .mask_ack       = mask_and_ack_kvm,
+};
+
+static void __init kvm_init_IRQ(void)
+{
+       int i;
+
+       printk("init KVM IRQ controller\n");
+#ifdef CONFIG_X86_LOCAL_APIC
+       init_bsp_APIC();
+#endif
+       init_8259A(0);
+
+       for (i = 0; i < NR_IRQS; i++) {
+               irq_desc[i].status = IRQ_DISABLED;
+               irq_desc[i].action = NULL;
+               irq_desc[i].depth = 1;
+
+               if (i < 16) {
+                       /*
+                        * 16 old-style INTA-cycle interrupts:
+                        */
+                       set_irq_chip_and_handler_name(i, &kvm_chip,
+                                                     handle_level_irq, "XT");
+               } else {
+                       /*
+                        * 'high' PCI IRQs filled in on demand
+                        */
+                       irq_desc[i].chip = &no_irq_chip;
+               }
+       }
+
+       /*
+        * Cover the whole vector space, no vector can escape
+        * us. (some of these will be overridden and become
+        * 'special' SMP interrupts)
+        */
+       for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+               int vector = FIRST_EXTERNAL_VECTOR + i;
+               if (i >= NR_IRQS)
+                       break;
+               if (vector != SYSCALL_VECTOR)
+                       set_intr_gate(vector, interrupt[i]);
+       }
+
+       /* setup after call gates are initialised (usually add in
+        * the architecture specific gates)
+        */
+       intr_init_hook();
+
+       irq_ctx_init(smp_processor_id());
+}
+
+/*
+ * This is the vm-syscall address - to be patched by the host to
+ * VMCALL (Intel) or VMMCALL (AMD), depending on the CPU model:
+ */
+asm (
+       "       .globl vm_syscall_addr          \n"
+       "       .align 4                        \n"
+       "       vm_syscall_addr:                \n"
+       "               nop                     \n"
+       "               nop                     \n"
+       "               nop                     \n"
+       "               ret                     \n"
+);
+
+extern unsigned char vm_syscall_addr[4];
+
+int kvm_guest_register_para(int cpu)
+{
+       struct kvm_vcpu_para_state *para_state = &per_cpu(para_state, cpu);
+
+       printk("kvm guest on VCPU#%d: trying to register para_state %p\n",
+               cpu, para_state);
+       /*
+        * Move a magic (and otherwise invalid) value to
+        * cr3, and thus signal to KVM that we are entering
+        * paravirtualized mode:
+        */
+       para_state->guest_version = KVM_PARA_API_VERSION;
+       para_state->host_version = -1;
+       para_state->size = sizeof(*para_state);
+       para_state->ret = 0;
+       para_state->vm_syscall_addr = __pa(vm_syscall_addr);
+
+       if (wrmsr_safe(MSR_KVM_API_MAGIC, __pa(para_state), 0)) {
+               printk("KVM guest: WRMSR probe failed.\n");
+               return 0;
+       }
+
+       printk("kvm guest: host returned %d\n", para_state->ret);
+       printk("kvm guest: host version: %d\n", para_state->host_version);
+       printk("kvm guest: cr3 cache size: %d\n",
+                               para_state->cr3_cache.max_idx);
+       printk("kvm guest: syscall entry: %02x %02x %02x %02x\n",
+                       vm_syscall_addr[0], vm_syscall_addr[1],
+                       vm_syscall_addr[2], vm_syscall_addr[3]);
+       if (para_state->ret) {
+               printk("kvm guest: host refused registration.\n");
+               return 0;
+       }
+
+       return 1;
+}
+
+static int __init kvm_paravirt_setup(char *s)
+{
+       printk("KVM paravirtualization setup\n");
+	if (sscanf(s, "%u", &kvm_paravirt) <= 0)
+               return 1;
+       if (!kvm_paravirt)
+               return 1;
+
+       kvm_paravirt = kvm_guest_register_para(smp_processor_id());
+       if (!kvm_paravirt)
+               return 1;
+
+       printk("KVM paravirtualized: OK\n");
+
+       paravirt_ops.name = "KVM";
+       paravirt_ops.io_delay = kvm_io_delay;
+       paravirt_ops.init_IRQ = kvm_init_IRQ;
+       paravirt_ops.flush_tlb_user = kvm_flush_tlb_user;
+       paravirt_ops.flush_tlb_kernel = kvm_flush_tlb_kernel;
+       paravirt_ops.flush_tlb_single = kvm_flush_tlb_single;
+       paravirt_ops.write_cr3 = kvm_write_cr3;
+       paravirt_ops.paravirt_enabled = 1;
+
+       return 1;
+}
+__setup("kvm_paravirt=", kvm_paravirt_setup);
+
+EXPORT_SYMBOL_GPL(paravirt_ops);
+
Index: linux/drivers/serial/8250.c
===================================================================
--- linux.orig/drivers/serial/8250.c
+++ linux/drivers/serial/8250.c
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/ioport.h>
+#include <linux/paravirt.h>
 #include <linux/init.h>
 #include <linux/console.h>
 #include <linux/sysrq.h>
@@ -1371,7 +1372,7 @@ static irqreturn_t serial8250_interrupt(
 
                l = l->next;
 
-               if (l == i->head && pass_counter++ > PASS_LIMIT) {
+               if (!paravirt_enabled() && l == i->head && pass_counter++ > PASS_LIMIT) {
                        /* If we hit this, we're dead. */
                        printk(KERN_ERR "serial8250: too much work for "
                                "irq%d\n", irq);
Index: linux/include/linux/paravirt.h
===================================================================
--- /dev/null
+++ linux/include/linux/paravirt.h
@@ -0,0 +1,12 @@
+#ifndef __LINUX_PARAVIRT_H
+#define __LINUX_PARAVIRT_H
+
+/*
+ * Paravirtualization support
+ */
+
+#ifndef CONFIG_PARAVIRT
+# define paravirt_enabled()    0
+#endif
+
+#endif
Index: linux/init/main.c
===================================================================
--- linux.orig/init/main.c
+++ linux/init/main.c
@@ -374,7 +374,11 @@ static void __init setup_per_cpu_areas(v
        if (size < PERCPU_ENOUGH_ROOM)
                size = PERCPU_ENOUGH_ROOM;
 #endif
-       ptr = alloc_bootmem(size * nr_possible_cpus);
+       /*
+        * Align them to page size - just in case someone aligns
+        * the per-CPU data to page that alignment should be preserved:
+        */
+       ptr = alloc_bootmem_pages(size * nr_possible_cpus);
 
        for_each_possible_cpu(i) {
                __per_cpu_offset[i] = ptr - __per_cpu_start;

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel

Reply via email to