Subject: [patch] KVM: paravirtual guest support From: Ingo Molnar <[EMAIL PROTECTED]>
this enables a CONFIG_PARAVIRT Linux guest kernel to establish a hypercall API to a KVM host. If successfully registered, then the Linux guest will optimize a few things like its interrupt controller, io-delay and it also registers its cr3-cache structures with the host. (but the host will not touch those, just yet) (this is fully backwards compatible - if the WRMSR fails then the Linux guest continues to execute as a native kernel.) Signed-off-by: Ingo Molnar <[EMAIL PROTECTED]> --- arch/i386/kernel/paravirt.c | 275 ++++++++++++++++++++++++++++++++++++++++++++ drivers/serial/8250.c | 3 include/linux/paravirt.h | 12 + init/main.c | 6 4 files changed, 294 insertions(+), 2 deletions(-) Index: linux/arch/i386/kernel/paravirt.c =================================================================== --- linux.orig/arch/i386/kernel/paravirt.c +++ linux/arch/i386/kernel/paravirt.c @@ -20,6 +20,7 @@ #include <linux/efi.h> #include <linux/bcd.h> #include <linux/start_kernel.h> +#include <linux/kvm_para.h> #include <asm/bug.h> #include <asm/paravirt.h> @@ -33,6 +34,9 @@ #include <asm/apic.h> #include <asm/tlbflush.h> +#include <asm/i8259.h> +#include <io_ports.h> + /* nop stub */ static void native_nop(void) { @@ -683,3 +687,274 @@ struct paravirt_ops paravirt_ops = { .irq_enable_sysexit = native_irq_enable_sysexit, .iret = native_iret, }; + +/* + * KVM paravirtualization optimizations: + */ +int kvm_paravirt; + +/* + * No need for any "IO delay" on KVM: + */ +static void kvm_io_delay(void) +{ +} + +static DEFINE_PER_CPU(struct kvm_vcpu_para_state, para_state); + +/* + * Special, register-to-cr3 instruction based hypercall API + * variant to the KVM host. This utilizes the cr3 filter capability + * of the hardware - if this works out then no VM exit happens, + * if a VM exit happens then KVM will get the virtual address too. 
+ */ +static void kvm_write_cr3(unsigned long guest_cr3) +{ + struct kvm_vcpu_para_state *para_state = &get_cpu_var(para_state); + struct kvm_cr3_cache *cache = &para_state->cr3_cache; + int idx; + + /* + * Check the cache (maintained by the host) for a matching + * guest_cr3 => host_cr3 mapping. Use it if found: + */ + for (idx = 0; idx < cache->max_idx; idx++) { + if (cache->entry[idx].guest_cr3 == guest_cr3) { + /* + * Cache-hit: we load the cached host-CR3 value. + * This never causes any VM exit. (if it does then the + * hypervisor could do nothing with this instruction + * and the guest OS would be aborted) + */ + asm volatile("movl %0, %%cr3" + : : "r" (cache->entry[idx].host_cr3)); + goto out; + } + } + + /* + * Cache-miss. Load the guest-cr3 value into cr3, which will + * cause a VM exit to the hypervisor, which then loads the + * host cr3 value and updates the cr3_cache. + */ + asm volatile("movl %0, %%cr3" : : "r" (guest_cr3)); +out: + put_cpu_var(para_state); +} + +/* + * Avoid the VM exit upon cr3 load by using the cached + * ->active_mm->pgd value: + */ +static void kvm_flush_tlb_user(void) +{ + kvm_write_cr3(__pa(current->active_mm->pgd)); +} + +static void kvm_flush_tlb_single(u32 addr) +{ + __native_flush_tlb_single(addr); +} +/* + * Disable global pages, do a flush, then enable global pages: + */ +static fastcall void kvm_flush_tlb_kernel(void) +{ + unsigned long orig_cr4 = read_cr4(); + + write_cr4(orig_cr4 & ~X86_CR4_PGE); + kvm_flush_tlb_user(); + write_cr4(orig_cr4); +} + +/* + * Simplified i8259A controller handling: + */ +static void mask_and_ack_kvm(unsigned int irq) +{ + unsigned int irqmask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= irqmask; + + if (irq & 8) { + outb(cached_slave_mask, PIC_SLAVE_IMR); + outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ + } else { + outb(cached_master_mask, 
PIC_MASTER_IMR); + /* 'Specific EOI' to master: */ + outb(0x60+irq, PIC_MASTER_CMD); + } + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static void disable_kvm_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static void enable_kvm_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static struct irq_chip kvm_chip = { + .name = "XT-PIC", + .mask = disable_kvm_irq, + .disable = disable_kvm_irq, + .unmask = enable_kvm_irq, + .mask_ack = mask_and_ack_kvm, +}; + +static void __init kvm_init_IRQ(void) +{ + int i; + + printk("init KVM IRQ controller\n"); +#ifdef CONFIG_X86_LOCAL_APIC + init_bsp_APIC(); +#endif + init_8259A(0); + + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + + if (i < 16) { + /* + * 16 old-style INTA-cycle interrupts: + */ + set_irq_chip_and_handler_name(i, &kvm_chip, + handle_level_irq, "XT"); + } else { + /* + * 'high' PCI IRQs filled in on demand + */ + irq_desc[i].chip = &no_irq_chip; + } + } + + /* + * Cover the whole vector space, no vector can escape + * us. 
(some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (i >= NR_IRQS) + break; + if (vector != SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); + } + + /* setup after call gates are initialised (usually add in + * the architecture specific gates) + */ + intr_init_hook(); + + irq_ctx_init(smp_processor_id()); +} + +/* + * This is the vm-syscall address - to be patched by the host to + * VMCALL (Intel) or VMMCALL (AMD), depending on the CPU model: + */ +asm ( + " .globl vm_syscall_addr \n" + " .align 4 \n" + " vm_syscall_addr: \n" + " nop \n" + " nop \n" + " nop \n" + " ret \n" +); + +extern unsigned char vm_syscall_addr[4]; + +int kvm_guest_register_para(int cpu) +{ + struct kvm_vcpu_para_state *para_state = &per_cpu(para_state, cpu); + + printk("kvm guest on VCPU#%d: trying to register para_state %p\n", + cpu, para_state); + /* + * Move a magic (and otherwise invalid) value to + * cr3, and thus signal to KVM that we are entering + * paravirtualized mode: + */ + para_state->guest_version = KVM_PARA_API_VERSION; + para_state->host_version = -1; + para_state->size = sizeof(*para_state); + para_state->ret = 0; + para_state->vm_syscall_addr = __pa(vm_syscall_addr); + + if (wrmsr_safe(MSR_KVM_API_MAGIC, __pa(para_state), 0)) { + printk("KVM guest: WRMSR probe failed.\n"); + return 0; + } + + printk("kvm guest: host returned %d\n", para_state->ret); + printk("kvm guest: host version: %d\n", para_state->host_version); + printk("kvm guest: cr3 cache size: %d\n", + para_state->cr3_cache.max_idx); + printk("kvm guest: syscall entry: %02x %02x %02x %02x\n", + vm_syscall_addr[0], vm_syscall_addr[1], + vm_syscall_addr[2], vm_syscall_addr[3]); + if (para_state->ret) { + printk("kvm guest: host refused registration.\n"); + return 0; + } + + return 1; +} + +static int __init kvm_paravirt_setup(char *s) +{ + printk("KVM paravirtualization 
setup\n"); + if (sscanf(s, "%u", &kvm_paravirt) <= 0) + return 1; + if (!kvm_paravirt) + return 1; + + kvm_paravirt = kvm_guest_register_para(smp_processor_id()); + if (!kvm_paravirt) + return 1; + + printk("KVM paravirtualized: OK\n"); + + paravirt_ops.name = "KVM"; + paravirt_ops.io_delay = kvm_io_delay; + paravirt_ops.init_IRQ = kvm_init_IRQ; + paravirt_ops.flush_tlb_user = kvm_flush_tlb_user; + paravirt_ops.flush_tlb_kernel = kvm_flush_tlb_kernel; + paravirt_ops.flush_tlb_single = kvm_flush_tlb_single; + paravirt_ops.write_cr3 = kvm_write_cr3; + paravirt_ops.paravirt_enabled = 1; + + return 1; +} +__setup("kvm_paravirt=", kvm_paravirt_setup); + +EXPORT_SYMBOL_GPL(paravirt_ops); + Index: linux/drivers/serial/8250.c =================================================================== --- linux.orig/drivers/serial/8250.c +++ linux/drivers/serial/8250.c @@ -27,6 +27,7 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/ioport.h> +#include <linux/paravirt.h> #include <linux/init.h> #include <linux/console.h> #include <linux/sysrq.h> @@ -1371,7 +1372,7 @@ static irqreturn_t serial8250_interrupt( l = l->next; - if (l == i->head && pass_counter++ > PASS_LIMIT) { + if (!paravirt_enabled() && l == i->head && pass_counter++ > PASS_LIMIT) { /* If we hit this, we're dead. 
*/ printk(KERN_ERR "serial8250: too much work for " "irq%d\n", irq); Index: linux/include/linux/paravirt.h =================================================================== --- /dev/null +++ linux/include/linux/paravirt.h @@ -0,0 +1,12 @@ +#ifndef __LINUX_PARAVIRT_H +#define __LINUX_PARAVIRT_H + +/* + * Paravirtualization support + */ + +#ifndef CONFIG_PARAVIRT +# define paravirt_enabled() 0 +#endif + +#endif Index: linux/init/main.c =================================================================== --- linux.orig/init/main.c +++ linux/init/main.c @@ -374,7 +374,11 @@ static void __init setup_per_cpu_areas(v if (size < PERCPU_ENOUGH_ROOM) size = PERCPU_ENOUGH_ROOM; #endif - ptr = alloc_bootmem(size * nr_possible_cpus); + /* + * Align them to page size - just in case someone aligns + * the per-CPU data to page that alignment should be preserved: + */ + ptr = alloc_bootmem_pages(size * nr_possible_cpus); for_each_possible_cpu(i) { __per_cpu_offset[i] = ptr - __per_cpu_start; ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys - and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel