[PATCH 3/3] KVM: x86: Use FPU API
Convert KVM to use generic FPU API. Signed-off-by: Sheng Yang --- arch/x86/include/asm/kvm_host.h | 18 +- arch/x86/kvm/x86.c | 73 --- 2 files changed, 23 insertions(+), 68 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ed48904..beba6f5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -300,8 +300,7 @@ struct kvm_vcpu_arch { unsigned long mmu_seq; } update_pte; - struct i387_fxsave_struct host_fx_image; - struct i387_fxsave_struct guest_fx_image; + struct fpu host_fpu, guest_fpu; gva_t mmio_fault_cr2; struct kvm_pio_request pio; @@ -709,21 +708,6 @@ static inline unsigned long read_msr(unsigned long msr) } #endif -static inline void kvm_fx_save(struct i387_fxsave_struct *image) -{ - asm("fxsave (%0)":: "r" (image)); -} - -static inline void kvm_fx_restore(struct i387_fxsave_struct *image) -{ - asm("fxrstor (%0)":: "r" (image)); -} - -static inline void kvm_fx_finit(void) -{ - asm("finit"); -} - static inline u32 get_rdx_init_val(void) { return 0x600; /* P6 family */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cd8a606..2313f76 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -52,6 +52,8 @@ #include #include #include +#include +#include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -5069,27 +5071,6 @@ unlock_out: } /* - * fxsave fpu state. Taken from x86_64/processor.h. To be killed when - * we have asm/x86/processor.h - */ -struct fxsave { - u16 cwd; - u16 swd; - u16 twd; - u16 fop; - u64 rip; - u64 rdp; - u32 mxcsr; - u32 mxcsr_mask; - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ -#ifdef CONFIG_X86_64 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ -#else - u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ -#endif -}; - -/* * Translate a guest virtual address to a guest physical address. */ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, @@ -5114,7 +5095,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; + struct i387_fxsave_struct *fxsave = + &vcpu->arch.guest_fpu.state->fxsave; vcpu_load(vcpu); @@ -5134,7 +5116,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; + struct i387_fxsave_struct *fxsave = + &vcpu->arch.guest_fpu.state->fxsave; vcpu_load(vcpu); @@ -5154,41 +5137,30 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) void fx_init(struct kvm_vcpu *vcpu) { - unsigned after_mxcsr_mask; - - /* -* Touch the fpu the first time in non atomic context as if -* this is the first fpu instruction the exception handler -* will fire before the instruction returns and it'll have to -* allocate ram with GFP_KERNEL. -*/ - if (!used_math()) - kvm_fx_save(&vcpu->arch.host_fx_image); + fpu_alloc(&vcpu->arch.host_fpu); + fpu_alloc(&vcpu->arch.guest_fpu); - /* Initialize guest FPU by resetting ours and saving into guest's */ - preempt_disable(); - kvm_fx_save(&vcpu->arch.host_fx_image); - kvm_fx_finit(); - kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); - preempt_enable(); + fpu_save(&vcpu->arch.host_fpu); + fpu_finit(&vcpu->arch.guest_fpu); vcpu->arch.cr0 |= X86_CR0_ET; - after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); - vcpu->arch.guest_fx_image.mxcsr = 0x1f80; - memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, - 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); } EXPORT_SYMBOL_GPL(fx_init); +static void fx_free(struct kvm_vcpu *vcpu) +{ + fpu_free(&vcpu->arch.host_fpu); + fpu_free(&vcpu->arch.guest_fpu); +} + void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { if (vcpu->guest_fpu_loaded) return; vcpu->guest_fpu_loaded = 1; - kvm_fx_save(&vcpu->arch.host_fx_image); - kvm_fx_restore(&vcpu->arch.guest_fx_image); + fpu_save(&vcpu->arch.host_fpu); + fpu_restore_checking(&vcpu->arch.guest_fpu); trace_kvm_fpu(1); } @@ -5198,8 +5170,8 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) return;
[PATCH 0/3] Convert KVM to use FPU API
This patchset based on Avi's FPU API patchset. Sheng Yang (3): x86: Split fpu_save_init() to fpu_save() and fpu_clear() x86: Export FPU API for KVM use KVM: x86: Use FPU API arch/x86/include/asm/i387.h | 73 --- arch/x86/include/asm/kvm_host.h | 18 +- arch/x86/include/asm/xsave.h|3 ++ arch/x86/kernel/i387.c |3 +- arch/x86/kernel/process.c |1 + arch/x86/kvm/x86.c | 73 --- 6 files changed, 74 insertions(+), 97 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] x86: Split fpu_save_init() to fpu_save() and fpu_clear()
fpu_save() would be used later by KVM. Signed-off-by: Sheng Yang --- arch/x86/include/asm/i387.h | 71 ++- 1 files changed, 43 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 1a8cca3..989d3b7 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -180,13 +180,17 @@ static inline void fpu_fxsave(struct fpu *fpu) #endif } -static inline void fpu_save_init(struct fpu *fpu) +static inline void fpu_save(struct fpu *fpu) { if (use_xsave()) fpu_xsave(fpu); else fpu_fxsave(fpu); +} +static inline void fpu_save_init(struct fpu *fpu) +{ + fpu_save(fpu); fpu_clear(fpu); } @@ -237,19 +241,41 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) /* * These must be called with preempt disabled */ -static inline void fpu_save_init(struct fpu *fpu) +static inline void fpu_fxsave(struct fpu *fpu) { - if (use_xsave()) { - struct xsave_struct *xstate = &fpu->state->xsave; - struct i387_fxsave_struct *fx = &fpu->state->fxsave; + /* Use more nops than strictly needed in case the compiler + varies code */ + alternative_input( + "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4, + "fxsave %[fx]\n" + "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:", + X86_FEATURE_FXSR, + [fx] "m" (fpu->state->fxsave), + [fsw] "m" (fpu->state->fxsave.swd) : "memory"); +} +static inline void fpu_save(struct fpu *fpu) +{ + if (use_xsave()) fpu_xsave(fpu); + else + fpu_fxsave(fpu); +} + +/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception + is pending. Clear the x87 state here by setting it to fixed + values. safe_address is a random variable that should be in L1 */ +static inline void fpu_clear(struct fpu *fpu) +{ + struct xsave_struct *xstate = &fpu->state->xsave; + struct i387_fxsave_struct *fx = &fpu->state->fxsave; + if (use_xsave()) { /* * xsave header may indicate the init state of the FP. */ if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) - goto end; + return; if (unlikely(fx->swd & X87_FSW_ES)) asm volatile("fnclex"); @@ -257,30 +283,19 @@ static inline void fpu_save_init(struct fpu *fpu) /* * we can do a simple return here or be paranoid :) */ - goto clear_state; } + alternative_input( + GENERIC_NOP8 GENERIC_NOP2, + "emms\n\t" /* clear stack tags */ + "fildl %[addr]",/* set F?P to defined value */ + X86_FEATURE_FXSAVE_LEAK, + [addr] "m" (safe_address)); +} - /* Use more nops than strictly needed in case the compiler - varies code */ - alternative_input( - "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4, - "fxsave %[fx]\n" - "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:", - X86_FEATURE_FXSR, - [fx] "m" (fpu->state->fxsave), - [fsw] "m" (fpu->state->fxsave.swd) : "memory"); -clear_state: - /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. safe_address is a random variable that should be in L1 */ - alternative_input( - GENERIC_NOP8 GENERIC_NOP2, - "emms\n\t" /* clear stack tags */ - "fildl %[addr]",/* set F?P to defined value */ - X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (safe_address)); -end: - ; +static inline void fpu_save_init(struct fpu *fpu) +{ + fpu_save(fpu); + fpu_clear(fpu); } static inline void __save_init_fpu(struct task_struct *tsk) -- 1.7.0.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] x86: Export FPU API for KVM use
Also add some more constants. Signed-off-by: Sheng Yang --- Do we need to rename "task_xstate_cachep"? It's not only for task struct now. arch/x86/include/asm/i387.h |2 ++ arch/x86/include/asm/xsave.h |3 +++ arch/x86/kernel/i387.c |3 ++- arch/x86/kernel/process.c|1 + 4 files changed, 8 insertions(+), 1 deletions(-) diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 989d3b7..f19bdf9 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -503,6 +503,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src) memcpy(dst->state, src->state, xstate_size); } +extern void fpu_finit(struct fpu *fpu); + #endif /* __ASSEMBLY__ */ #define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 2c4390c..29ee4e4 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -13,6 +13,9 @@ #define FXSAVE_SIZE512 +#define XSTATE_YMM_SIZE 256 +#define XSTATE_YMM_OFFSET (512 + 64) + /* * These are the features that the OS can handle currently. */ diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 86cef6b..cbc 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -107,7 +107,7 @@ void __cpuinit fpu_init(void) } #endif /* CONFIG_X86_64 */ -static void fpu_finit(struct fpu *fpu) +void fpu_finit(struct fpu *fpu) { #ifdef CONFIG_X86_32 if (!HAVE_HWFP) { @@ -132,6 +132,7 @@ static void fpu_finit(struct fpu *fpu) fp->fos = 0xu; } } +EXPORT_SYMBOL_GPL(fpu_finit); /* * The _current_ task is using the FPU for the first time diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 8bcc21f..373fec9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -28,6 +28,7 @@ unsigned long idle_nomwait; EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +EXPORT_SYMBOL(task_xstate_cachep); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { -- 1.7.0.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH -v2] KVM, Fix QEMU-KVM is killed by guest SRAO MCE
On Fri, 2010-05-14 at 05:43 +0800, Marcelo Tosatti wrote: > On Wed, May 12, 2010 at 02:44:03PM +0800, Huang Ying wrote: > > @@ -1975,6 +1976,27 @@ static int __direct_map(struct kvm_vcpu > > return pt_write; > > } > > > > +static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) > > +{ > > + char buf[1]; > > + void __user *hva; > > + int r; > > + > > + /* Touch the page, so send SIGBUS */ > > + hva = (void __user *)gfn_to_hva(kvm, gfn); > > + r = copy_from_user(buf, hva, 1); > > +} > > A SIGBUS signal has been raised by memory poisoning already, so i don't > see why this is needed? > > To avoid the MMIO processing in userspace before the MCE is sent to the > guest you can just return -EAGAIN from the page fault handlers back to > kvm_mmu_page_fault. The SIGBUS signal is necessary here because this SIGBUS is SRAR (Action Requirement) while the previously sent one is SRAO (Action Optional). They have different semantics and processed differently in QEMU-KVM and guest OS. For example the guest Linux kernel may ignore SRAO MCE (raised by QEMU-KVM after receiving SRAO SIGBUS), because it is optional, but for SRAR MCE (raised by QEMU-KVM after receiving SRAR SIGBUS) the guest Linux kernel must kill corresponding application or go panic. > > +int is_hwpoison_address(unsigned long addr) > > +{ > > + pgd_t *pgdp; > > + pud_t *pudp; > > + pmd_t *pmdp; > > + pte_t pte, *ptep; > > + swp_entry_t entry; > > + > > + pgdp = pgd_offset(current->mm, addr); > > + if (!pgd_present(*pgdp)) > > + return 0; > > + pudp = pud_offset(pgdp, addr); > > + if (!pud_present(*pudp)) > > + return 0; > > + pmdp = pmd_offset(pudp, addr); > > + if (!pmd_present(*pmdp)) > > + return 0; > > Need to bail out if pmd is huge. Yes. I will change this. Best Regards, Huang Ying -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm: remove CAP_SYS_RAWIO requirement from kvm_vm_ioctl_assign_irq
On Thu, 2010-05-13 at 15:06 -0700, Chris Wright wrote: > * Alex Williamson (alex.william...@redhat.com) wrote: > > Remove this check in an effort to allow kvm guests to run without > > root privileges. This capability check doesn't seem to add any > > security since the device needs to have already been added via the > > assign device ioctl and the io actually occurs through the pci > > sysfs interface. > > Right, so irq resource allocation can't be directly abused (by calling > this ioctl out of order, for example) because the device must be assigned > first via kvm_vm_ioctl_assign_device(). My only concern isn't with this > patch per-se, but the exising kvm_vm_ioctl_assign_device(). Yep, I agree. How do we decide the caller of the assign device ioctl has sufficient rights to the device without introducing another capabilities check? We won't get far if the resources are already claimed by a running driver, but we don't necessarily want anyone with /dev/kvm rights to be able to grab the remaining devices in the system. Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net-next] ixgbevf: Enable GRO by default
On Thu, May 13, 2010 at 12:51, Shirley Ma wrote: > Enable GRO by default for performance. > > Signed-off-by: Shirley Ma > --- > > drivers/net/ixgbevf/ixgbevf_main.c | 1 + > 1 files changed, 1 insertions(+), 0 deletions(-) > > diff --git a/drivers/net/ixgbevf/ixgbevf_main.c > b/drivers/net/ixgbevf/ixgbevf_main.c > index 40f47b8..1bbb05e 100644 > --- a/drivers/net/ixgbevf/ixgbevf_main.c > +++ b/drivers/net/ixgbevf/ixgbevf_main.c > @@ -3415,6 +3415,7 @@ static int __devinit ixgbevf_probe(struct pci_dev *pdev, > netdev->features |= NETIF_F_IPV6_CSUM; > netdev->features |= NETIF_F_TSO; > netdev->features |= NETIF_F_TSO6; > + netdev->features |= NETIF_F_GRO; > netdev->vlan_features |= NETIF_F_TSO; > netdev->vlan_features |= NETIF_F_TSO6; > netdev->vlan_features |= NETIF_F_IP_CSUM; > > Thanks, I have added the patch to my queue. -- Cheers, Jeff -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm: remove CAP_SYS_RAWIO requirement from kvm_vm_ioctl_assign_irq
* Alex Williamson (alex.william...@redhat.com) wrote: > Remove this check in an effort to allow kvm guests to run without > root privileges. This capability check doesn't seem to add any > security since the device needs to have already been added via the > assign device ioctl and the io actually occurs through the pci > sysfs interface. Right, so irq resource allocation can't be directly abused (by calling this ioctl out of order, for example) because the device must be assigned first via kvm_vm_ioctl_assign_device(). My only concern isn't with this patch per-se, but the exising kvm_vm_ioctl_assign_device(). thanks, -chris -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: pci device passthrough: Failed to assign irq
> I'm trying to pass a pcie-device into a kvm guest. qemu-kvm is started > by libvirt with this command: [...] > Failed to assign irq for "hostdev0": Operation not permitted > Perhaps you are assigning a device that shares an IRQ with another device? Found out what was causing the issue: Current libvirt seems to drop CAP_SYS_RAWIO before forking qemu-kvm. When I patch libvirt to not drop the capabilities, everything works as expected. So no problem in kvm, sorry for the noise. Kind regards, Gerd -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH -v2] KVM, Fix QEMU-KVM is killed by guest SRAO MCE
On Wed, May 12, 2010 at 02:44:03PM +0800, Huang Ying wrote: > In common cases, guest SRAO MCE will cause corresponding poisoned page > be un-mapped and SIGBUS be sent to QEMU-KVM, then QEMU-KVM will relay > the MCE to guest OS. > > But it is reported that if the poisoned page is accessed in guest > after un-mapped and before MCE is relayed to guest OS, QEMU-KVM will > be killed. > > The reason is as follow. Because poisoned page has been un-mapped, > guest access will cause guest exit and kvm_mmu_page_fault will be > called. kvm_mmu_page_fault can not get the poisoned page for fault > address, so kernel and user space MMIO processing is tried in turn. In > user MMIO processing, poisoned page is accessed again, then QEMU-KVM > is killed by force_sig_info. > > To fix the bug, kvm_mmu_page_fault send HWPOISON signal to QEMU-KVM > and do not try kernel and user space MMIO processing for poisoned > page. > > > Changelog: > > v2: > > - Use page table walker to determine whether the virtual address is > poisoned to avoid change user space interface (via changing > get_user_pages). > > - Wrap bad page processing into kvm_handle_bad_page to avoid code > duplicating. > > Reported-by: Max Asbock > Signed-off-by: Huang Ying > --- > arch/x86/kvm/mmu.c | 34 ++ > arch/x86/kvm/paging_tmpl.h |7 ++- > include/linux/kvm_host.h |1 + > include/linux/mm.h |8 > mm/memory-failure.c| 28 > virt/kvm/kvm_main.c| 30 -- > 6 files changed, 93 insertions(+), 15 deletions(-) > > --- a/arch/x86/kvm/mmu.c > +++ b/arch/x86/kvm/mmu.c > @@ -32,6 +32,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -1975,6 +1976,27 @@ static int __direct_map(struct kvm_vcpu > return pt_write; > } > > +static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) > +{ > + char buf[1]; > + void __user *hva; > + int r; > + > + /* Touch the page, so send SIGBUS */ > + hva = (void __user *)gfn_to_hva(kvm, gfn); > + r = copy_from_user(buf, hva, 1); > +} A SIGBUS signal has been raised by memory poisoning already, so i don't see why this is needed? To avoid the MMIO processing in userspace before the MCE is sent to the guest you can just return -EAGAIN from the page fault handlers back to kvm_mmu_page_fault. > +int is_hwpoison_pfn(pfn_t pfn) > +{ > + return pfn == hwpoison_pfn; > +} > +EXPORT_SYMBOL_GPL(is_hwpoison_pfn); > + > static inline unsigned long bad_hva(void) > { > return PAGE_OFFSET; > @@ -939,6 +948,11 @@ static pfn_t hva_to_pfn(struct kvm *kvm, > if (unlikely(npages != 1)) { > struct vm_area_struct *vma; > > + if (is_hwpoison_address(addr)) { > + get_page(hwpoison_page); > + return page_to_pfn(hwpoison_page); > + } > + > down_read(¤t->mm->mmap_sem); > vma = find_vma(current->mm, addr); > > @@ -2198,6 +2212,15 @@ int kvm_init(void *opaque, unsigned int > > bad_pfn = page_to_pfn(bad_page); > > + hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO); > + > + if (hwpoison_page == NULL) { > + r = -ENOMEM; > + goto out_free_0; > + } > + > + hwpoison_pfn = page_to_pfn(hwpoison_page); > + > if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { > r = -ENOMEM; > goto out_free_0; > @@ -2269,6 +2292,8 @@ out_free_1: > out_free_0a: > free_cpumask_var(cpus_hardware_enabled); > out_free_0: > + if (hwpoison_page) > + __free_page(hwpoison_page); > __free_page(bad_page); > out: > kvm_arch_exit(); > @@ -2291,6 +2316,7 @@ void kvm_exit(void) > kvm_arch_hardware_unsetup(); > kvm_arch_exit(); > free_cpumask_var(cpus_hardware_enabled); > + __free_page(hwpoison_page); > __free_page(bad_page); > } > EXPORT_SYMBOL_GPL(kvm_exit); > --- a/mm/memory-failure.c > +++ b/mm/memory-failure.c > @@ -45,6 +45,7 @@ > #include > #include > #include > +#include > #include "internal.h" > > int sysctl_memory_failure_early_kill __read_mostly = 0; > @@ -1296,3 +1297,30 @@ done: > /* keep elevated page count for bad page */ > return ret; > } > + > +int is_hwpoison_address(unsigned long addr) > +{ > + pgd_t *pgdp; > + pud_t *pudp; > + pmd_t *pmdp; > + pte_t pte, *ptep; > + swp_entry_t entry; > + > + pgdp = pgd_offset(current->mm, addr); > + if (!pgd_present(*pgdp)) > + return 0; > + pudp = pud_offset(pgdp, addr); > + if (!pud_present(*pudp)) > + return 0; > + pmdp = pmd_offset(pudp, addr); > + if (!pmd_present(*pmdp)) > + return 0; Need to bail out if pmd is huge. > + ptep = pte_offset_map(pmdp, addr); > + pte = *ptep; > + pte_unmap(ptep); > + i
[QEMU-KVM]: Megasas + TCM_Loop + SG_IO into Windows XP guests
Greetings Hannes and co, I have been spending a bit of time trying Megasas HBA emulation + TCM_Loop + SG_IO with a Windows XP SP2 KVM guests.. So far, I noticed that hw/scsi-generic.c:execute_command_run() using bdev_aio_ioctl() appears to be broken for XP guests, which causes the first 36-byte INQUIRY sent via SG_IO to never make it back to QEMU and results in the win32 LSI drive taking the LUN offline, et al. Note that everything does appear to be functioning as expected in kernel space for the first INQUIRY with the TCM_Loop LLD and Linux/SCSI code (AFAICT) and Linux KVM guests using megasas emulation are still working. So, I ended up needing requiring the following quick hack for hw/scsi-generic.c:execute_command_run() to make SG_IO function synchronously using bdrv_ioctl(), which at least gets LUN registration and basic control path CDBs working for the XP guest. Here is how it looks in action on a v2.6.34-rc7 host so far: http://www.linux-iscsi.org/images/TCM-KVM-megasas-XP-05132010.png diff --git a/hw/scsi-generic.c b/hw/scsi-generic.c index 6c58742..aa1eb83 100644 --- a/hw/scsi-generic.c +++ b/hw/scsi-generic.c @@ -140,6 +140,7 @@ static int execute_command_run(SCSIGenericReq *r, { BlockDriverState *bdrv = r->req.dev->conf.dinfo->bdrv; SCSIGenericState *s = DO_UPCAST(SCSIGenericState, qdev, r->req.dev); +int ret; r->io_header.interface_id = 'S'; r->io_header.dxfer_direction = sgdir[r->req.cmd.mode]; @@ -161,11 +162,16 @@ static int execute_command_run(SCSIGenericReq *r, printf("\n"); } #endif +#if 0 r->req.aiocb = bdrv_aio_ioctl(bdrv, SG_IO, &r->io_header, complete, r); if (r->req.aiocb == NULL) { BADF("execute_command: read failed !\n"); return -1; } +#else +ret = bdrv_ioctl(bdrv, SG_IO, &r->io_header); +complete((void *)r, ret); +#endif * return 0; } Beyond the initial LUN registration and control CDB parts, doing bulk DATA_SG_IO traffic is completing successfully (and everything looks sane with TCM_Loop and Linux/SCSI) but it appears that the correct blocks are not actually getting written/read by megasas. This appears to be the case with both hw/scsi-generic.c and hw/scsi-disk.c modes of operation for megasas with the win32 XP guest. So I was wondering if anyone aware of known issues with QEMU asynchronous SG_IO into MSFT KVM guests with virtio or hw/lsi53c895a.c, or would this be something specific to megasas HBA emulation and XP guests..? Hannes, which MSFT guest + driver did you get work stable with bulk DATA_SG_IO and hw/scsi-disk.c..? Best, --nab -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 4/5] Inter-VM shared memory PCI device
On Mon, May 10, 2010 at 5:59 AM, Avi Kivity wrote: > On 04/21/2010 08:53 PM, Cam Macdonell wrote: >> + >> + /* allocate/initialize space for interrupt handling */ >> + s->eventfds = qemu_mallocz(s->nr_alloc_guests * sizeof(int *)); >> + s->eventfd_table = qemu_mallocz(s->vectors * >> sizeof(EventfdEntry)); >> + s->eventfds_posn_count = qemu_mallocz(s->nr_alloc_guests * >> sizeof(int)); >> + >> + pci_conf[PCI_INTERRUPT_PIN] = 1; /* we are going to support >> interrupts */ >> > > This is done by the guest BIOS. > > If I remove that line, my driver crashes when it falls back to pin-based interrupts (when MSI is turned off). Is there something in the device driver that I need to set in place of this? A number of other devices (mostly network cards) set the interrupt pin this way, so I'm a little confused. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] x86 emulator: Add missing decoder flags for sub instruction
On Thu, May 13, 2010 at 9:50 PM, Marcelo Tosatti wrote: > On Wed, May 12, 2010 at 01:39:21AM +0300, Mohammed Gamal wrote: >> This adds missing decoder flags for sub instructions (opcodes 0x2c - 0x2d) >> >> Signed-off-by: Mohammed Gamal >> --- >> arch/x86/kvm/emulate.c | 2 +- >> 1 files changed, 1 insertions(+), 1 deletions(-) > > Applied both, thanks. > > Thanks Marcelo, please also take a look at the test cases for these instructions that I posted to the mailing list. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] VMX: Fix and improve guest state validity checks
On Thu, May 13, 2010 at 9:24 AM, Avi Kivity wrote: > On 05/11/2010 07:52 PM, Mohammed Gamal wrote: >> >> - Add 's' and 'g' field checks on segment registers >> - Correct SS checks for request and descriptor privilege levels >> >> Signed-off-by: Mohammed Gamal >> --- >> arch/x86/kvm/vmx.c | 73 >> +++ >> 1 files changed, 67 insertions(+), 6 deletions(-) >> >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >> index 777e00d..9805c2a 100644 >> --- a/arch/x86/kvm/vmx.c >> +++ b/arch/x86/kvm/vmx.c >> @@ -2121,16 +2121,30 @@ static bool stack_segment_valid(struct kvm_vcpu >> *vcpu) >> vmx_get_segment(vcpu,&ss, VCPU_SREG_SS); >> ss_rpl = ss.selector& SELECTOR_RPL_MASK; >> >> - if (ss.unusable) >> + if (ss.dpl != ss_rpl) /* DPL != RPL */ >> + return false; >> + >> + if (ss.unusable) /* Short-circuit */ >> return true; >> > > If ss.unusable, do the dpl and rpl have any meaning? The idea is that dpl and rpl are checked on vmentry regardless of whether ss is usable or not. While the other checks are performed only if ss is usable. > >> if (!ss.present) >> return false; >> + if (ss.limit& 0xfff0) { >> + if ((ss.limit& 0xfff)< 0xfff) >> + return false; >> + if (!ss.g) >> + return false; >> + } else { >> + if ((ss.limit& 0xfff) == 0xfff) >> + return false; >> + if (ss.g) >> + return false; >> + } >> > > There is no architectural way to break this. That is, without > virtualization, there is no way a real cpu will ever have a limit of > 0x12345678. > > We need to distinguish between big real mode and real mode that can be > virtualized using vm86, but we don't need to consider impossible setups. I didn't realize this is architecturally impossible. I was simply implementing the checks specified in the Intel manual. Now that we know this is redunant, we can just drop these checks. > > >> @@ -2143,8 +2157,15 @@ static bool data_segment_valid(struct kvm_vcpu >> *vcpu, int seg) >> vmx_get_segment(vcpu,&var, seg); >> rpl = var.selector& SELECTOR_RPL_MASK; >> >> - if (var.unusable) >> + if (var.unusable) /* Short-circuit */ >> return true; >> + if (!(var.type& AR_TYPE_ACCESSES_MASK)) >> + return false; >> > > Again, there is no architectural way for a segment not to have the accessed > bit set. > >> + if (var.type& AR_TYPE_CODE_MASK) { >> + if (!(var.type& AR_TYPE_READABLE_MASK)) >> + return false; >> + } >> > > About this, I'm not sure. > >> + >> if (!var.s) >> return false; >> if (!var.present) >> @@ -2154,6 +2175,18 @@ static bool data_segment_valid(struct kvm_vcpu >> *vcpu, int seg) >> return false; >> } >> >> + if (var.limit& 0xfff0) { >> + if ((var.limit& 0xfff)< 0xfff) >> + return false; >> + if (!var.g) >> + return false; >> + } else { >> + if ((var.limit& 0xfff) == 0xfff) >> + return false; >> + if (var.g) >> + return false; >> + } >> > > Even disregarding the incorrectness, you shouldn't duplicate code like this. I was intending to consolidate it into a single function eventually, I just wasn't sure that this was correct and I needed some comments on it. It's not needed now anyhow. > >> @@ -2192,6 +2240,20 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu) >> return false; >> if (!ldtr.present) >> return false; >> + if (ldtr.s) >> + return false; >> > > Architecturally impossible. > > -- > Do not meddle in the internals of kernels, for they are subtle and quick to > panic. > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[GIT PULL] KVM updates for 2.6.34-rc7
Linus, please pull from git://git.kernel.org/pub/scm/virt/kvm/kvm.git kvm-updates/2.6.34 To receive the following updates: Dongxiao Xu (1): KVM: x86: Call vcpu_load and vcpu_put in cpuid_update Jan Kiszka (1): KVM: VMX: blocked-by-sti must not defer NMI injections Joerg Roedel (1): KVM: SVM: Fix wrong intercept masks on 32 bit Marcelo Tosatti (1): KVM: convert ioapic lock to spinlock Roel Kluin (1): KVM: PPC: Keep index within boundaries in kvmppc_44x_emul_tlbwe() arch/powerpc/kvm/44x_tlb.c |2 +- arch/x86/kvm/svm.c |8 arch/x86/kvm/vmx.c |3 +-- arch/x86/kvm/x86.c |4 virt/kvm/ioapic.c | 30 +++--- virt/kvm/ioapic.h |2 +- 6 files changed, 26 insertions(+), 23 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] x86 emulator: Add missing decoder flags for sub instruction
On Wed, May 12, 2010 at 01:39:21AM +0300, Mohammed Gamal wrote: > This adds missing decoder flags for sub instructions (opcodes 0x2c - 0x2d) > > Signed-off-by: Mohammed Gamal > --- > arch/x86/kvm/emulate.c |2 +- > 1 files changed, 1 insertions(+), 1 deletions(-) Applied both, thanks. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: network between host and guest
on 05/13/2010 10:48 PM dry...@sky-haven.net wrote the following: > Ar 10.05.10 10:46, scríobh Thanasis: >> I have installed kvm (app-emulation/qemu-kvm-0.12.3-r1) on linux (a >> laptop with gentoo linux) and MS Windows 7 as guest. >> Here is what info I get about the network on each one: >> 1) on linux (host): > > [snip] > > Hi there: > > The precise KVM command would be useful in determining what's going on. > > Since I'm guessing, I may as well presume you used "-net > nic,(options...) -net user". If you did, the magick is in the qemu > binary which is both providing a DHCP service to the guest and > performing the necessary outbound source NAT. > > But as others have mentioned, more information on how you started KVM > might be useful to determine what's going on. The reason is that there > are several ways to set up networking in KVM. > The command is (and running as a user, not root): $ kvm win7kvm.img -m 1024 -boot c -usb -usbdevice tablet $ which kvm /usr/bin/kvm $ ls -l /usr/bin/kvm lrwxrwxrwx 1 root root 17 Apr 21 15:10 /usr/bin/kvm -> /usr/bin/qemu-kvm $ file /usr/bin/qemu-kvm /usr/bin/qemu-kvm: POSIX shell script text executable $ cat /usr/bin/qemu-kvm #!/bin/sh exec /usr/bin/qemu-system-x86_64 --enable-kvm "$@" $ file /usr/bin/qemu-system-x86_64 /usr/bin/qemu-system-x86_64: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.9, stripped $ -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next] ixgbevf: Enable GRO by default
Enable GRO by default for performance. Signed-off-by: Shirley Ma --- drivers/net/ixgbevf/ixgbevf_main.c |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/drivers/net/ixgbevf/ixgbevf_main.c b/drivers/net/ixgbevf/ixgbevf_main.c index 40f47b8..1bbb05e 100644 --- a/drivers/net/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ixgbevf/ixgbevf_main.c @@ -3415,6 +3415,7 @@ static int __devinit ixgbevf_probe(struct pci_dev *pdev, netdev->features |= NETIF_F_IPV6_CSUM; netdev->features |= NETIF_F_TSO; netdev->features |= NETIF_F_TSO6; + netdev->features |= NETIF_F_GRO; netdev->vlan_features |= NETIF_F_TSO; netdev->vlan_features |= NETIF_F_TSO6; netdev->vlan_features |= NETIF_F_IP_CSUM; -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/7] Consolidate vcpu ioctl locking
Am 13.05.2010 um 14:29 schrieb Avi Kivity : On 05/13/2010 03:18 PM, Alexander Graf wrote: [PATCH 0/7] Consolidate vcpu ioctl locking In general, all vcpu ioctls need to take the vcpu mutex, but each one does it (or not) individually. This is cumbersome and error prone. This patchset moves all locking to a central place. This is complicated by the fact that ppc's KVM_INTERRUPT and s390's KVM_S390_INTERRUPT break the convention and need to run unlocked. Why is the x86 non-kernel-pic path different? Userspace issues the ioctl from a vcpu thread. It has to, btw, since whether an interrupt can be injected or not depends on vcpu-synchronous registers: eflags.if and tpr/cr8. On ppc we don't have a tpr, but eflags.if is basically the same as msr.ee. The major difference apparently is that on ppc we KVM_INTERRUPT pulls the interrupt line. On vcpu_run we then check whether msr.ee is set and if so, trigger the interrupt. I wonder why we don't do the same for x86. The current limitation on userspace checking eflags and the tpr seems cumbersome. Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2 v2] pci: check caps from sysfs file open to read device dependent config space
* Greg KH (g...@kroah.com) wrote: > On Thu, May 13, 2010 at 10:43:07AM -0700, Chris Wright wrote: > > * Alan Cox (a...@lxorguk.ukuu.org.uk) wrote: > > > I agree with the problem - but IMHO the fix is to require opening the file > > > checks CAP_SYS_something instead: not to hack the read method and make it > > > even weirder and more un-Linux than it is now. > > > > This patch does that. Not as convenient from the KVM/libvirt point of view > > because it is not prepared to do this setup before dropping privileges > > and launching the VM. > > So does that mean that this patch doesn't solve your original problem > here? Right, it means we have to change how we create a guest with a directly assigned PCI device. Currently KVM/libvirt is assuming that sysfs file ownership is sufficient to read a sysfs file. It chowns all relevant sysfs files and updates security labels such that only that guest can access the files, then drops privileges and launches the guest. With the v2 patch we'll have to open the config space sysfs file in the privileged context and pass it into the unprivileged one. It is awkward, but it should be doable. thanks, -chris -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2 v2] pci: check caps from sysfs file open to read device dependent config space
On Thu, May 13, 2010 at 10:43:07AM -0700, Chris Wright wrote: > * Alan Cox (a...@lxorguk.ukuu.org.uk) wrote: > > I agree with the problem - but IMHO the fix is to require opening the file > > checks CAP_SYS_something instead: not to hack the read method and make it > > even weirder and more un-Linux than it is now. > > This patch does that. Not as convenient from the KVM/libvirt point of view > because it is not prepared to do this setup before dropping privileges > and launching the VM. So does that mean that this patch doesn't solve your original problem here? thanks, greg k-h -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM and the OOM-Killer
>> Not sure I like the idea of running a 64bit user space kernel on top >> of a 32bit host, prefer to re-install. >> >> Can I just replace my kernel with a 64bit one, or do I have to >> re-install the host O/S ? > > You can run 32-bit userspace with a 64-bit kernel, or reinstall, > whichever you prefer. > > I once upgraded a 32-bit Fedora install to 64-bit, but that takes some > tinkering. > You can just install a 64-bit kernel. For rpm based systems you have to "unpack" the rpm using rpm2cpio. The modules.dep file cannot be updated -- need to generate that elsewhere -- and mkinitrd needs to be modified to not try to strip modules (s,strip,true,). That's all I had to do to plop a 64-bit kernel onto a 32-bit install. David -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: Another SIGFPE in display code, now in cirrus
On Thu, 13 May 2010, Michael Tokarev wrote: > Stefano Stabellini wrote: > [] > > diff --git a/hw/cirrus_vga.c b/hw/cirrus_vga.c > > index 9f61a01..81c443b 100644 > > --- a/hw/cirrus_vga.c > > +++ b/hw/cirrus_vga.c > > The same as with previous patch: Yellow screen > (instead of crashing), and two lines on the > stderr: > > BUG: kvm_dirty_pages_log_enable_slot: invalid parameters > BUG: kvm_dirty_pages_log_enable_slot: invalid parameters > I tried to do the same thing on WinNT with qemu (thanks to Michael for providing me the image) and it works OK with the patch applied. I think there must be another bug in the kvm dirty page handling... -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2 v2] pci: check caps from sysfs file open to read device dependent config space
* Alan Cox (a...@lxorguk.ukuu.org.uk) wrote: > I agree with the problem - but IMHO the fix is to require opening the file > checks CAP_SYS_something instead: not to hack the read method and make it > even weirder and more un-Linux than it is now. This patch does that. Not as convenient from the KVM/libvirt point of view because it is not prepared to do this setup before dropping privileges and launching the VM. thanks, -chris -- From: Chris Wright Subject: [PATCH 2/2 v2] pci: check caps from sysfs file open to read device dependent config space The PCI config space bin_attr read handler has a hardcoded CAP_SYS_ADMIN check to verify privileges before allowing a user to read device dependent config space. This is meant to protect from an unprivileged user potentially locking up the box. When assigning a PCI device directly to a guest with libvirt and KVM, the sysfs config space file is chown'd to the unprivileged user that the KVM guest will run as. The guest needs to have full access to the device's config space since it's responsible for driving the device. However, despite being the owner of the sysfs file, the CAP_SYS_ADMIN check will not allow read access beyond the config header. With this patch we check privileges against the capabilities used when openining the sysfs file. The allows a privileged process to open the file and hand it to an unprivileged process, and the unprivileged process can still read all of the config space. Signed-off-by: Chris Wright --- drivers/pci/pci-sysfs.c |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index ad44557..6309c5a 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -367,7 +368,7 @@ pci_read_config(struct file *filp, struct kobject *kobj, u8 *data = (u8*) buf; /* Several chips lock up trying to read undefined config space */ - if (capable(CAP_SYS_ADMIN)) { + if (cap_raised(filp->f_cred->cap_effective, CAP_SYS_ADMIN)) { size = dev->cfg_size; } else if (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) { size = 128; -- 1.6.5.2 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Prevent IRQ sharing for PCI passthrough
* Avi Kivity (a...@redhat.com) wrote: > On 05/13/2010 07:57 PM, Dirk Gouders wrote: > >Hello List, > > > >please don't shout at me if the answer to my question is too obvious, > > We won't shout at you. > > >but in the German Linux Magazine I read an article about passing PCI > >devices to KVM guests and the authors said that "shared interrupt > >funcionality has to be disabled" which I read as "it is possible to > >disable shared interrupt functionality". > > It would be more correct to say "no go with shared host interrupts". > > >I currently have the problem with a PCI ISDN card that is not MSI > >capable and even if I change the PCI slot and disable all other slots, > >USB etc. in the BIOS, that card still shares its interrupt with three > >other devices. > > > >Even after extensive searching/reading, I am not sure if it possible to > >prevent a PCI card from sharing an interrupt with other devices or > >probably manually assign a free interrupt to that card via some kernel > >parameter and would be glad if someone could give me an answer to that > >question. > > There is an ACPI _SRS method which can be used to move interrupts > around. However Linux doesn't appear to expose it. Even if it did, > the interrupt may be shared on the motherboard, in which case > nothing would help (though you might be able to share it with unused > devices). The only way to work around this is to unbind the other drivers that are sharing that interrupt. This may not work if the other devices are critical to you system (it's typically sharing w/ USB Host Controller). Otherwise, we can't support this mode easily (there is some work in progress to allow this to work if your ISDN device is PCI 2.3 compliant). thanks, -chris -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Prevent IRQ sharing for PCI passthrough
On 05/13/2010 07:57 PM, Dirk Gouders wrote: Hello List, please don't shout at me if the answer to my question is too obvious, We won't shout at you. but in the German Linux Magazine I read an article about passing PCI devices to KVM guests and the authors said that "shared interrupt funcionality has to be disabled" which I read as "it is possible to disable shared interrupt functionality". It would be more correct to say "no go with shared host interrupts". I currently have the problem with a PCI ISDN card that is not MSI capable and even if I change the PCI slot and disable all other slots, USB etc. in the BIOS, that card still shares its interrupt with three other devices. Even after extensive searching/reading, I am not sure if it possible to prevent a PCI card from sharing an interrupt with other devices or probably manually assign a free interrupt to that card via some kernel parameter and would be glad if someone could give me an answer to that question. There is an ACPI _SRS method which can be used to move interrupts around. However Linux doesn't appear to expose it. Even if it did, the interrupt may be shared on the motherboard, in which case nothing would help (though you might be able to share it with unused devices). Chris? -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
pci device passthrough: Failed to assign irq
Hi, I'm trying to pass a pcie-device into a kvm guest. qemu-kvm is started by libvirt with this command: LC_ALL=C PATH=/sbin:/usr/sbin:/bin:/usr/bin HOME=/root USER=root LOGNAME=root QEMU_AUDIO_DRV=none /usr/bin/qemu-kvm -S -M pc-0.12 -enable-kvm -m 2048 -smp 2,sockets=2,cores=1,threads=1 -name test -uuid cefc7515-c0c2-27fc-8e7e-e0a65f9cb95b -nodefaults -chardev socket,id=monitor,path=/var/lib/libvirt/qemu/test.monitor,server,nowait -mon chardev=monitor,mode=readline -rtc base=utc -boot c -drive file=/dev/vg_main/test,if=none,id=drive-virtio-disk0,boot=on,format=raw -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -drive if=none,media=cdrom,id=drive-ide0-1-0,readonly=on -device ide-drive,bus=ide.1,unit=0,drive=drive-ide0-1-0,id=ide0-1-0 -device virtio-net-pci,vlan=0,id=net0,mac=52:54:00:d0:84:c7,bus=pci.0,addr=0x7 -net tap,fd=59,vlan=0,name=hostnet0 -chardev pty,id=serial0 -device isa-serial,chardev=serial0 -usb -vnc 127.0.0.1:0 -k de -vga cirrus -device pci-assign,host=05:00.0,id=hostdev0,bus=pci.0,addr=0x8 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x3 It failes with: Failed to assign irq for "hostdev0": Operation not permitted Perhaps you are assigning a device that shares an IRQ with another device? The irq of the device does not seem to be shared: 05:00.0 Ethernet controller: Intel Corporation 82574L Gigabit Network Connection Subsystem: ASUSTeK Computer Inc. Device 8369 Flags: bus master, fast devsel, latency 0, IRQ 18 Memory at faee (32-bit, non-prefetchable) [size=128K] I/O ports at cc00 [size=32] Memory at faedc000 (32-bit, non-prefetchable) [size=16K] Capabilities: [c8] Power Management version 2 Capabilities: [d0] MSI: Enable- Count=1/1 Maskable- 64bit+ Capabilities: [e0] Express Endpoint, MSI 00 Capabilities: [a0] MSI-X: Enable- Count=5 Masked- Capabilities: [100] Advanced Error Reporting Capabilities: [140] Device Serial Number 90-e6-ba-ff-ff-7c-c6-ef Kernel driver in use: pci-stub Kernel modules: e1000e when kvm is not running and the device used as eth0 on host: # cat /proc/interrupts [...] 17: 0 0 0 0 0 0 0 0 IO-APIC-fasteoi sata_sil24 23: 69 0 0 0 0 0 0 0 IO-APIC-fasteoi ehci_hcd:usb2 [...] 37: 2 0 0 0 0 0 0 0 PCI-MSI-edge eth0-rx-0 38: 4 0 0 0 0 0 0 0 PCI-MSI-edge eth0-tx-0 39: 1 0 0 0 0 0 0 0 PCI-MSI-edge eth0 [...] qemu-kvm is running as root, selinux is running in permissive mode. So the rights shouldn't be an issue too. The processor is an Intel Core i7-860 on an Asus P7F-E board, chipset is an Intel 3420. Bios is ver 0607, vt-d is enabled in bios setup. This is logged in dmesg, so to me it seems like the system is indeed vt-d capable: [...] ACPI: Core revision 20091214 DMAR: Host address width 36 DMAR: DRHD base: 0x00fed9 flags: 0x1 IOMMU fed9: ver 1:0 cap c90780106f0462 ecap f020e3 DMAR: RMRR base: 0x0e4000 end: 0x0e7fff DMAR: RMRR base: 0x00bf7ec000 end: 0x00bf7f DMAR: No ATSR found Setting APIC routing to physical flat ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1 CPU0: Intel(R) Core(TM) i7 CPU 860 @ 2.80GHz stepping 05 [...] IOMMU 0xfed9: using Queued invalidation IOMMU: Setting RMRR: IOMMU: Setting identity map for device :00:1d.0 [0xbf7ec000 - 0xbf80] IOMMU: Setting identity map for device :00:1a.0 [0xbf7ec000 - 0xbf80] IOMMU: Setting identity map for device :00:1d.0 [0xe4000 - 0xe8000] IOMMU: Setting identity map for device :00:1a.0 [0xe4000 - 0xe8000] IOMMU: Prepare 0-16MiB unity mapping for LPC IOMMU: Setting identity map for device :00:1f.0 [0x0 - 0x100] alloc irq_desc for 29 on node 0 alloc kstat_irqs on node 0 PCI-DMA: Intel(R) Virtualization Technology for Directed I/O Intel PCLMULQDQ-NI instructions are not detected. [...] Kernel is 2.6.33.3-85.fc13.x86_64. I tried qemu-0.12.3-8.fc13 as it came with Fedora 13 beta and qemu-kvm-0.12.4, same results. Any idea whats going wrong or what else I could try? Kind regards, Gerd -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Prevent IRQ sharing for PCI passthrough
Hello List, please don't shout at me if the answer to my question is too obvious, but in the German Linux Magazine I read an article about passing PCI devices to KVM guests and the authors said that "shared interrupt funcionality has to be disabled" which I read as "it is possible to disable shared interrupt functionality". I currently have the problem with a PCI ISDN card that is not MSI capable and even if I change the PCI slot and disable all other slots, USB etc. in the BIOS, that card still shares its interrupt with three other devices. Even after extensive searching/reading, I am not sure if it possible to prevent a PCI card from sharing an interrupt with other devices or probably manually assign a free interrupt to that card via some kernel parameter and would be glad if someone could give me an answer to that question. Best regards, Dirk -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [SeaBIOS] [PATCHv2] Support for booting from virtio disks
On 05/10/2010 06:58 PM, Anthony Liguori wrote: Isn't this problem unrelated to this patch? I mean if I start qemu with two ide devices can I specify from qemu command line which one I want to boot from? That's sort of what I'm asking. If you compare this approach to extboot, extboot provided a capability to select a disk. I think it can be argued though that this isn't a necessary feature to carry over and I'm looking for additional opinions on that. I'd say it's a necessary feature, but not one to carry over from the extboot implementation. We have the seabios boot menu (how to reach it?), we need to store the nvram persistently, and we need to extend the selection menu to qemu, but that's unrelated to this patch. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: Another SIGFPE in display code, now in cirrus
Stefano Stabellini wrote: > > I think we need to consider only dstpitch for a full invalidate. We > > might be copying an offscreen bitmap into the screen, and srcpitch is > > likely to be the bitmap width instead of the screen pitch. > > Agreed. Even when copying on-screen (or partially on-screen), the srcpitch does not affect the invalidated area. The source area might be strange (parallelogram, single line repeated), but srcpitch should only affect whether qemu_console_copy can be used, not the invalidation. -- Jamie -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] pci: allow sysfs file owner to read device dependent config space
* Alan Cox (a...@lxorguk.ukuu.org.uk) wrote: > On Wed, 12 May 2010 18:29:57 -0700 > Chris Wright wrote: > > > The PCI config space bin_attr read handler has a hardcoded CAP_SYS_ADMIN > > check to verify privileges before allowing a user to read device > > dependent config space. This is meant to protect from an unprivileged > > user potentially locking up the box. > > Or hacking it. With read access? You mean from read side-effect on a device register? > You could argue the correct requirement is to change it to > require CAP_SYS_RAWIO in fact ! It's also pervasive through sysfs. I found 22 CAP_SYS_ADMIN checks via sysfs bin_attrs, most are firmware, config space, and similar; most of which have could be abused to hack box in one way or another. > > With this patch the sysfs file owner is also considered privileged enough > > to read all of the config space. > > Which breaks the main reason the check was there in the first place. To > stop accidents of the form > > find / -exec grep {} "wibble" > > blowing up in people's faces. Right, this won't change w/out sysadmin intervention. Currently, any random user doing that won't trigger an ill-behaving device dependent config space read. It requires an admin to chown the file to the user, at which point you've given the device to the user. This is typically only done in the privileged context of libvirt launching a KVM guest that has a host PCI device directly assigned to it, and the chown is typically to a non-shell user. > I agree with the problem - but IMHO the fix is to require opening the file > checks CAP_SYS_something instead: We can't require CAP_SYS_something on open as that will break basic tools like lspci. We could note the privileges when opened and check them later. > not to hack the read method and make it > even weirder and more un-Linux than it is now. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Wiki docs on counting and tracing KVM perf events
On 05/13/2010 05:35 PM, Stefan Hajnoczi wrote: How to count and trace KVM perf events: http://www.linux-kvm.org/page/Perf_events I want to draw attention to this because traditional kvm_stat and kvm_trace use has been moving over to the debugfs based tracing mechanisms. Perhaps we can flesh out documentation and examples of common perf event usage. Two things are missing to make this really useful: - a continuously updating difference mode like kvm_stat - subevents; for example kvm:kvm_exit is an aggregate of all exit types that can be split using filters to show individual exit reason statistics The plan is to eventually remove kvm_stat based performance monitoring in favour of perf events. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Wiki docs on counting and tracing KVM perf events
How to count and trace KVM perf events: http://www.linux-kvm.org/page/Perf_events I want to draw attention to this because traditional kvm_stat and kvm_trace use has been moving over to the debugfs based tracing mechanisms. Perhaps we can flesh out documentation and examples of common perf event usage. Stefan -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: Another SIGFPE in display code, now in cirrus
Stefano Stabellini wrote: [] > diff --git a/hw/cirrus_vga.c b/hw/cirrus_vga.c > index 9f61a01..81c443b 100644 > --- a/hw/cirrus_vga.c > +++ b/hw/cirrus_vga.c The same as with previous patch: Yellow screen (instead of crashing), and two lines on the stderr: BUG: kvm_dirty_pages_log_enable_slot: invalid parameters BUG: kvm_dirty_pages_log_enable_slot: invalid parameters Thanks! /mjt -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] fix "info cpus" halted state display
Gleb Natapov wrote: > When in-kernel irqchip is used env->halted is never used for anything > except "info cpus" command. In fact, it's used in a few more places, namely cpu_dump_state and the gdbstub. > Halted state is synced in > kvm_arch_save_mpstate() and showed by do_info_cpus() but otherwise never > looked at. Zeroing it here breaks "info cpus" since before > do_info_cpus() outputs env->halted in io thread it is zeroed here when > vcpu thread reenters kernel. Looks good for current qemu-kvm. Execution of kvm_cpu_exec once depended on env->halted, even for in-kernel irqchip, right? Anyway, there are not such traces left here. We will just need to look at it again when pushing in-kernel irqchips upstream as its kvm loop looks different. Jan > > Signed-off-by: Gleb Natapov > diff --git a/qemu-kvm-x86.c b/qemu-kvm-x86.c > index 61d9331..0ec2881 100644 > --- a/qemu-kvm-x86.c > +++ b/qemu-kvm-x86.c > @@ -922,10 +922,6 @@ void kvm_arch_load_regs(CPUState *env, int level) > if (env->kvm_vcpu_update_vapic) > kvm_tpr_enable_vapic(env); > } > -if (kvm_irqchip_in_kernel()) { > -/* Avoid deadlock: no user space IRQ will ever clear it. */ > -env->halted = 0; > -} > > kvm_put_vcpu_events(env, level); > kvm_put_debugregs(env); > -- > Gleb. signature.asc Description: OpenPGP digital signature
Re: KVM and the OOM-Killer
On 05/13/2010 04:39 PM, James Stevens wrote: I'd go with 64-bit at 2GB and above. It's both faster and safer. safer, how? (apart from no lowmem exhaust). Nothing apart from that (well, if you run nonpae you lose NX protection). On a different subject, the qemu documentation says a guest VM can only have 2Gb of memory - does this still apply when using a 64bit host O/S ? The documentation is outdated. Since you can run a 64-bit kernel with your existing userspace, at least you have a simple upgrade path. Not sure I like the idea of running a 64bit user space kernel on top of a 32bit host, prefer to re-install. Can I just replace my kernel with a 64bit one, or do I have to re-install the host O/S ? You can run 32-bit userspace with a 64-bit kernel, or reinstall, whichever you prefer. I once upgraded a 32-bit Fedora install to 64-bit, but that takes some tinkering. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: Another SIGFPE in display code, now in cirrus
On Thu, 13 May 2010, Avi Kivity wrote: > > /* extra x, y */ > > -sx = (src % ABS(s->cirrus_blt_srcpitch)) / depth; > > -sy = (src / ABS(s->cirrus_blt_srcpitch)); > > +sx = (src % line_offset) / depth; > > +sy = (src / line_offset); > > > > Does anything prevent the guest from programming the CRTC display pitch > to 0? > Nope, I'll use line_offset there too to prevent possible divisions by 0. > > dx = (dst % ABS(s->cirrus_blt_dstpitch)) / depth; > > dy = (dst / ABS(s->cirrus_blt_dstpitch)); > > > > @@ -725,18 +727,23 @@ static void cirrus_do_copy(CirrusVGAState *s, int > > dst, int src, int w, int h) > > s->cirrus_blt_dstpitch, s->cirrus_blt_srcpitch, > > s->cirrus_blt_width, s->cirrus_blt_height); > > > > -if (notify) > > - qemu_console_copy(s->vga.ds, > > - sx, sy, dx, dy, > > - s->cirrus_blt_width / depth, > > - s->cirrus_blt_height); > > - > > -/* we don't have to notify the display that this portion has > > - changed since qemu_console_copy implies this */ > > - > > -cirrus_invalidate_region(s, s->cirrus_blt_dstaddr, > > - s->cirrus_blt_dstpitch, s->cirrus_blt_width, > > - s->cirrus_blt_height); > > + if (ABS(s->cirrus_blt_dstpitch) != line_offset || > > + ABS(s->cirrus_blt_srcpitch) != line_offset) { > > + /* this is not going to happen very often */ > > + vga_hw_invalidate(); > > > > I think we need to consider only dstpitch for a full invalidate. We > might be copying an offscreen bitmap into the screen, and srcpitch is > likely to be the bitmap width instead of the screen pitch. > Agreed. > > > + } else { > > + if (notify) > > + /* we don't have to notify the display that this portion has > > +changed since qemu_console_copy implies this */ > > + qemu_console_copy(s->vga.ds, > > + sx, sy, dx, dy, > > + s->cirrus_blt_width / depth, > > + s->cirrus_blt_height); > > + else > > + cirrus_invalidate_region(s, s->cirrus_blt_dstaddr, > > + s->cirrus_blt_dstpitch, > > s->cirrus_blt_width, > > + s->cirrus_blt_height); > > + } > > } > > > > static int cirrus_bitblt_videotovideo_copy(CirrusVGAState * s) > > diff --git a/hw/cirrus_vga_rop.h b/hw/cirrus_vga_rop.h > > index 39a7b72..80f135b 100644 > > --- a/hw/cirrus_vga_rop.h > > +++ b/hw/cirrus_vga_rop.h > > @@ -32,10 +32,10 @@ glue(cirrus_bitblt_rop_fwd_, ROP_NAME)(CirrusVGAState > > *s, > > dstpitch -= bltwidth; > > srcpitch -= bltwidth; > > > > -if (dstpitch< 0 || srcpitch< 0) { > > -/* is 0 valid? srcpitch == 0 could be useful */ > > +if (dstpitch< 0) > > return; > > -} > > +if (srcpitch< 0) > > +srcpitch = 0; > > > > Why? I wouldn't want an operation that is supposed to be a forward copy to become a backward copy instead. With the way the code is currently written in cirrus_vga it shouldn't be possible, but it might be a good idea to have the checks anyway. Actually the limit I wrote is too strict, I'll fix it. --- diff --git a/hw/cirrus_vga.c b/hw/cirrus_vga.c index 9f61a01..81c443b 100644 --- a/hw/cirrus_vga.c +++ b/hw/cirrus_vga.c @@ -676,17 +676,19 @@ static void cirrus_do_copy(CirrusVGAState *s, int dst, int src, int w, int h) int sx, sy; int dx, dy; int width, height; +uint32_t start_addr, line_offset, line_compare; int depth; int notify = 0; depth = s->vga.get_bpp(&s->vga) / 8; s->vga.get_resolution(&s->vga, &width, &height); +s->vga.get_offsets(&s->vga, &line_offset, &start_addr, &line_compare); /* extra x, y */ -sx = (src % ABS(s->cirrus_blt_srcpitch)) / depth; -sy = (src / ABS(s->cirrus_blt_srcpitch)); -dx = (dst % ABS(s->cirrus_blt_dstpitch)) / depth; -dy = (dst / ABS(s->cirrus_blt_dstpitch)); +sx = (src % line_offset) / depth; +sy = (src / line_offset); +dx = (dst % line_offset) / depth; +dy = (dst / line_offset); /* normalize width */ w /= depth; @@ -725,18 +727,22 @@ static void cirrus_do_copy(CirrusVGAState *s, int dst, int src, int w, int h) s->cirrus_blt_dstpitch, s->cirrus_blt_srcpitch, s->cirrus_blt_width, s->cirrus_blt_height); -if (notify) - qemu_console_copy(s->vga.ds, - sx, sy, dx, dy, - s->cirrus_blt_width / depth, - s->cirrus_blt_height); - -/* we don't have to notify the display that this portion has - changed since qemu_console_copy implies this */ - -cirrus_invalidate_region(s, s->cirrus_blt_dstaddr, -
Re: [Qemu-devel] [RFC PATCH 0/2] Sheepdog: distributed storage system for QEMU
At Thu, 13 May 2010 04:46:46 +0900, MORITA Kazutaka wrote: > > On 2010/05/12 20:38, Kevin Wolf wrote: > > I'll have a closer look at your code later, but one thing I noticed is > > that the new block driver is something in between a protocol and a > > format driver (just like vvfat, which should stop doing so, too). I > > think it ought to be a real protocol with the raw format driver on top > > (or any other format - I don't see a reason why this should be > > restricted to raw). > > > > The one thing that is unusual about it as a protocol driver is that it > > supports snapshots. However, while it is the first one, supporting > > snapshots in protocols is a thing that could be generally useful to > > support (for example thinking of a LVM protocol, which was discussed in > > the past). > > > > I agreed. I'll modify the sheepdog driver patch as a protocol driver one, > and remove unnecessary format check from my patch. > > > So in block.c we could check if the format driver supports snapshots, > > and if it doesn't we try again with the underlying protocol. Not sure > > yet what we would do when both format and protocol do support snapshots > > (qcow2 on sheepdog/LVM/...), but that's a detail. > > > To support snapshot in a protocol, I'd like to call the hander of the protocol driver in the following functions in block.c: bdrv_snapshot_create bdrv_snapshot_goto bdrv_snapshot_delete bdrv_snapshot_list bdrv_save_vmstate bdrv_load_vmstate Is it okay? In the case both format and protocol drivers support snapshots, I think it is better to call the format driver handler. Because qcow2 is well known as a snapshot support format, so when users use qcow2, they expect to get snapshot with qcow2. There is another problem to make the sheepdog driver be a protocol; how to deal with protocol specific create_options? For example, sheepdog supports cloning images as a format driver: $ qemu-img create -f sheepdog dst -b sheepdog:src But if the sheepdog driver is a protocol, error will occur. $ qemu-img create sheepdog:dst -b sheepdog:src Unknown option 'backing_file' qemu-img: Backing file not supported for file format 'raw' It is because the raw format doesn't support a backing_file option. To support the protocol specific create_options, if the format driver cannot parse some of the arguments, the protocol driver need to parse them. If my suggestions are okay, I'd like to prepare the patches. Regards, Kazutaka -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM and the OOM-Killer
On Thu, May 13, 2010 at 03:39:20PM +0300, Avi Kivity wrote: > On 05/13/2010 03:20 PM, James Stevens wrote: > > > >General advice seems to be, if you have more than 16Gb RAM then > >you should run the VM host 64bit. > > > >We didn't see this issue on a server with 32Gb running the same > >set of VMs. > > > > I'd go with 64-bit at 2GB and above. It's both faster and safer. There's an interesting posting from Linus which explains details: http://www.realworldtech.com/forums/index.cfm?action=detail&id=78966&threadid=78766&roomid=2 HTH, Johannes -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM and the OOM-Killer
I'd go with 64-bit at 2GB and above. It's both faster and safer. safer, how? (apart from no lowmem exhaust). On a different subject, the qemu documentation says a guest VM can only have 2Gb of memory - does this still apply when using a 64bit host O/S ? The lowmem load is about 0.5% of guest memory, so 48GB means 240MB lowmem allocated. Thin ice. We currently only have about 12Gb used by the VM guests, so not hitting that issue - the rest of HIGHMEM is host disk cache. Our experience is that the bottle-neck on number of VM guests is disk i/o - with loads of memory we've pretty much eliminated reads, so that means disk writes. Since you can run a 64-bit kernel with your existing userspace, at least you have a simple upgrade path. Not sure I like the idea of running a 64bit user space kernel on top of a 32bit host, prefer to re-install. Can I just replace my kernel with a 64bit one, or do I have to re-install the host O/S ? James -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] fix "info cpus" halted state display
When in-kernel irqchip is used env->halted is never used for anything except "info cpus" command. Halted state is synced in kvm_arch_save_mpstate() and showed by do_info_cpus() but otherwise never looked at. Zeroing it here breaks "info cpus" since before do_info_cpus() outputs env->halted in io thread it is zeroed here when vcpu thread reenters kernel. Signed-off-by: Gleb Natapov diff --git a/qemu-kvm-x86.c b/qemu-kvm-x86.c index 61d9331..0ec2881 100644 --- a/qemu-kvm-x86.c +++ b/qemu-kvm-x86.c @@ -922,10 +922,6 @@ void kvm_arch_load_regs(CPUState *env, int level) if (env->kvm_vcpu_update_vapic) kvm_tpr_enable_vapic(env); } -if (kvm_irqchip_in_kernel()) { -/* Avoid deadlock: no user space IRQ will ever clear it. */ -env->halted = 0; -} kvm_put_vcpu_events(env, level); kvm_put_debugregs(env); -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM and the OOM-Killer
On 05/13/2010 03:20 PM, James Stevens wrote: This is *NOT* a KVM issue, but may be worth adding into the FAQ... We have a KVM host with 48Gb of RAM and run about 20 KVM clients on it. After some time - different time depending on the kernel version - the VM host kernel will start OOM-Killing the VM clients, even when there is lots of free RAM (>10Gb) and free SWAP (>34Gb). This seems to be caused by the kernel running out of LOWMEM (memory below 1Gb) - because of the large amount of RAM a lot of LOWMEM (~400Mb) is used by the memory map (32 bytes per 4Kb page), add in the kernel itself and that leaves "only" about 460Mb of LOWMEM for kernel alloc. This may not have been a problem, except Linux may also put cache blocks and user processes in LOWMEM - it seems this can then lead to a LOWMEM exhaust situation which triggers OOM-Killing even when there is LOADS of SWAP and HIGHMEM free. Sadly, killing userland processes is not a good way to try and free LOWMEM, so what happens is a killing spree where by every process on the VM host gets killed (inc all the VMs, sysklogd, klogd, sshd, udevd etc). This is very bad in 2.6.32.6, quite bad in 2.6.32.9, better (but still bad in) 2.6.31.12 - currently testing 2.6.33.3 See https://bugzilla.kernel.org/show_bug.cgi?id=15058 General advice seems to be, if you have more than 16Gb RAM then you should run the VM host 64bit. We didn't see this issue on a server with 32Gb running the same set of VMs. I'd go with 64-bit at 2GB and above. It's both faster and safer. The lowmem load is about 0.5% of guest memory, so 48GB means 240MB lowmem allocated. Thin ice. Since you can run a 64-bit kernel with your existing userspace, at least you have a simple upgrade path. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
KVM and the OOM-Killer
This is *NOT* a KVM issue, but may be worth adding into the FAQ... We have a KVM host with 48Gb of RAM and run about 20 KVM clients on it. After some time - different time depending on the kernel version - the VM host kernel will start OOM-Killing the VM clients, even when there is lots of free RAM (>10Gb) and free SWAP (>34Gb). This seems to be caused by the kernel running out of LOWMEM (memory below 1Gb) - because of the large amount of RAM a lot of LOWMEM (~400Mb) is used by the memory map (32 bytes per 4Kb page), add in the kernel itself and that leaves "only" about 460Mb of LOWMEM for kernel alloc. This may not have been a problem, except Linux may also put cache blocks and user processes in LOWMEM - it seems this can then lead to a LOWMEM exhaust situation which triggers OOM-Killing even when there is LOADS of SWAP and HIGHMEM free. Sadly, killing userland processes is not a good way to try and free LOWMEM, so what happens is a killing spree where by every process on the VM host gets killed (inc all the VMs, sysklogd, klogd, sshd, udevd etc). This is very bad in 2.6.32.6, quite bad in 2.6.32.9, better (but still bad in) 2.6.31.12 - currently testing 2.6.33.3 See https://bugzilla.kernel.org/show_bug.cgi?id=15058 General advice seems to be, if you have more than 16Gb RAM then you should run the VM host 64bit. We didn't see this issue on a server with 32Gb running the same set of VMs. James -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/7] Consolidate vcpu ioctl locking
On 05/13/2010 03:18 PM, Alexander Graf wrote: [PATCH 0/7] Consolidate vcpu ioctl locking In general, all vcpu ioctls need to take the vcpu mutex, but each one does it (or not) individually. This is cumbersome and error prone. This patchset moves all locking to a central place. This is complicated by the fact that ppc's KVM_INTERRUPT and s390's KVM_S390_INTERRUPT break the convention and need to run unlocked. Why is the x86 non-kernel-pic path different? Userspace issues the ioctl from a vcpu thread. It has to, btw, since whether an interrupt can be injected or not depends on vcpu-synchronous registers: eflags.if and tpr/cr8. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/7] Consolidate vcpu ioctl locking
On 13.05.2010, at 14:03, Avi Kivity wrote: > On 05/13/2010 03:03 PM, Avi Kivity wrote: >> On 05/13/2010 03:01 PM, Avi Kivity wrote: >>> On 05/13/2010 02:57 PM, Alexander Graf wrote: Mind to give a high level overview on where you're moving which locks? >>> >>> Um, looks like I forgot to fill in the patchset header. Sorry. >> >> Gar, I actually wrote it but forgot to save the file. >> > > And it had some useful info: > > [PATCH 0/7] Consolidate vcpu ioctl locking > > In general, all vcpu ioctls need to take the vcpu mutex, but each one does it > (or not) individually. This is cumbersome and error prone. > > This patchset moves all locking to a central place. This is complicated > by the fact that ppc's KVM_INTERRUPT and s390's KVM_S390_INTERRUPT break > the convention and need to run unlocked. Why is the x86 non-kernel-pic path different? Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/7] Consolidate vcpu ioctl locking
On 05/13/2010 03:03 PM, Avi Kivity wrote: On 05/13/2010 03:01 PM, Avi Kivity wrote: On 05/13/2010 02:57 PM, Alexander Graf wrote: Mind to give a high level overview on where you're moving which locks? Um, looks like I forgot to fill in the patchset header. Sorry. Gar, I actually wrote it but forgot to save the file. And it had some useful info: [PATCH 0/7] Consolidate vcpu ioctl locking In general, all vcpu ioctls need to take the vcpu mutex, but each one does it (or not) individually. This is cumbersome and error prone. This patchset moves all locking to a central place. This is complicated by the fact that ppc's KVM_INTERRUPT and s390's KVM_S390_INTERRUPT break the convention and need to run unlocked. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/7] Consolidate vcpu ioctl locking
On 05/13/2010 03:01 PM, Avi Kivity wrote: On 05/13/2010 02:57 PM, Alexander Graf wrote: Mind to give a high level overview on where you're moving which locks? Um, looks like I forgot to fill in the patchset header. Sorry. Gar, I actually wrote it but forgot to save the file. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/7] Consolidate vcpu ioctl locking
On 05/13/2010 02:57 PM, Alexander Graf wrote: Mind to give a high level overview on where you're moving which locks? Um, looks like I forgot to fill in the patchset header. Sorry. The patches move all vcpu ioctl locking from the individual ioctl handlers (e.g. kvm_vcpu_ioctl_set_cpuid()) to the top-level vcpu ioctl handler (kvm_vcpu_ioctl()). So tons of vcpu_load()/vcpu_put() pairs (some of the missing) get replaced by a single pair. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/7] Consolidate vcpu ioctl locking
On 13.05.2010, at 13:17, Avi Kivity wrote: > > > Avi Kivity (7): > KVM: PPC: Add missing vcpu_load()/vcpu_put() in vcpu ioctls > KVM: x86: Add missing locking to arch specific vcpu ioctls > KVM: move vcpu locking to dispatcher for generic vcpu ioctls > KVM: x86: Lock arch specific vcpu ioctls centrally > KVM: s390: Centrally lock arch specific vcpu ioctls > KVM: PPC: Centralize locking of arch specific vcpu ioctls > KVM: Consolidate arch specific vcpu ioctl locking Mind to give a high level overview on where you're moving which locks? Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH 0/12] KVM, x86, ppc, asm-generic: moving dirty bitmaps to user space
On 05/10/2010 03:26 PM, Takuya Yoshikawa wrote: No doubt get.org -> get.opt is measurable, but get.opt->switch.opt is problematic. Have you tried profiling to see where the time is spent (well I can guess, clearing the write access from the sptes). Sorry but no, and I agree with your guess. Anyway, I want to do some profiling to confirm this guess. BTW, If we only think about performance improvement of time, optimized get(get.opt) may be enough at this stage. But if we consider the future expansion like using user allocated bitmaps, new API's introduced for switch.opt won't become waste, I think, because we need a structure to get and export bitmap addresses. User allocated bitmaps have the advantage of reducing pinned memory. However we have plenty more pinned memory allocated in memory slots, so by itself, user allocated bitmaps don't justify this change. Perhaps if we optimize memory slot write protection (I have some ideas about this) we can make the performance improvement more pronounced. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/7] KVM: Consolidate arch specific vcpu ioctl locking
Now that all arch specific ioctls have centralized locking, it is easy to move it to the central dispatcher. Signed-off-by: Avi Kivity --- arch/powerpc/kvm/powerpc.c | 11 --- arch/s390/kvm/kvm-s390.c | 13 ++--- arch/x86/kvm/x86.c |2 -- virt/kvm/kvm_main.c|2 -- 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index caeed7b..a1d8750 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -512,17 +512,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; long r; - if (ioctl == KVM_INTERRUPT) { + switch (ioctl) { + case KVM_INTERRUPT: { struct kvm_interrupt irq; r = -EFAULT; if (copy_from_user(&irq, argp, sizeof(irq))) - goto out_nolock; + goto out; r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); - goto out_nolock; + goto out; } - vcpu_load(vcpu); - switch (ioctl) { case KVM_ENABLE_CAP: { struct kvm_enable_cap cap; @@ -537,8 +536,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } out: - vcpu_put(vcpu); -out_nolock: return r; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 28cd8fd..fad1024 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -638,16 +638,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; long r; - if (ioctl == KVM_S390_INTERRUPT) { + switch (ioctl) { + case KVM_S390_INTERRUPT: { struct kvm_s390_interrupt s390int; + r = -EFAULT; if (copy_from_user(&s390int, argp, sizeof(s390int))) - return -EFAULT; - return kvm_s390_inject_vcpu(vcpu, &s390int); + break; + r = kvm_s390_inject_vcpu(vcpu, &s390int); + break; } - - vcpu_load(vcpu); - switch (ioctl) { case KVM_S390_STORE_STATUS: r = kvm_s390_vcpu_store_status(vcpu, arg); break; @@ -666,7 +666,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, default: r = -EINVAL; } - vcpu_put(vcpu); return r; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8b9e5ec..3a763de 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2288,7 +2288,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, int r; struct kvm_lapic_state *lapic = NULL; - vcpu_load(vcpu); switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; @@ -2486,7 +2485,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; } out: - vcpu_put(vcpu); kfree(lapic); return r; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 08b2ccd..5ee558c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1564,9 +1564,7 @@ out_free2: break; } default: - vcpu_put(vcpu); r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); - vcpu_load(vcpu); } out: vcpu_put(vcpu); -- 1.7.0.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/7] KVM: s390: Centrally lock arch specific vcpu ioctls
Signed-off-by: Avi Kivity --- arch/s390/kvm/kvm-s390.c | 40 +--- 1 files changed, 17 insertions(+), 23 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e80f55e..28cd8fd 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -363,9 +363,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) { - vcpu_load(vcpu); kvm_s390_vcpu_initial_reset(vcpu); - vcpu_put(vcpu); return 0; } @@ -415,14 +413,12 @@ static int kvm_arch_vcpu_ioctl_set_initial_psw(struct kvm_vcpu *vcpu, psw_t psw) { int rc = 0; - vcpu_load(vcpu); if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING) rc = -EBUSY; else { vcpu->run->psw_mask = psw.mask; vcpu->run->psw_addr = psw.addr; } - vcpu_put(vcpu); return rc; } @@ -573,7 +569,7 @@ static int __guestcopy(struct kvm_vcpu *vcpu, u64 guestdest, const void *from, * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit * KVM_S390_STORE_STATUS_PREFIXED: -> prefix */ -int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) +static int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) { const unsigned char archmode = 1; int prefix; @@ -635,45 +631,43 @@ int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) return 0; } -static int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) -{ - int rc; - - vcpu_load(vcpu); - rc = __kvm_s390_vcpu_store_status(vcpu, addr); - vcpu_put(vcpu); - return rc; -} - long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; + long r; - switch (ioctl) { - case KVM_S390_INTERRUPT: { + if (ioctl == KVM_S390_INTERRUPT) { struct kvm_s390_interrupt s390int; if (copy_from_user(&s390int, argp, sizeof(s390int))) return -EFAULT; return kvm_s390_inject_vcpu(vcpu, &s390int); } + + vcpu_load(vcpu); + switch (ioctl) { case KVM_S390_STORE_STATUS: - return kvm_s390_vcpu_store_status(vcpu, arg); + r = kvm_s390_vcpu_store_status(vcpu, arg); + break; case KVM_S390_SET_INITIAL_PSW: { psw_t psw; + r = -EFAULT; if (copy_from_user(&psw, argp, sizeof(psw))) - return -EFAULT; - return kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw); + break; + r = kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw); + break; } case KVM_S390_INITIAL_RESET: - return kvm_arch_vcpu_ioctl_initial_reset(vcpu); + r = kvm_arch_vcpu_ioctl_initial_reset(vcpu); + break; default: - ; + r = -EINVAL; } - return -EINVAL; + vcpu_put(vcpu); + return r; } /* Section: memory related */ -- 1.7.0.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/7] KVM: move vcpu locking to dispatcher for generic vcpu ioctls
All vcpu ioctls need to be locked, so instead of locking each one specifically we lock at the generic dispatcher. This patch only updates generic ioctls and leaves arch specific ioctls alone. Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 11 --- arch/powerpc/kvm/book3s.c | 16 arch/powerpc/kvm/booke.c | 10 -- arch/powerpc/kvm/powerpc.c |4 arch/s390/kvm/kvm-s390.c | 16 arch/x86/kvm/x86.c | 36 ++-- virt/kvm/kvm_main.c| 15 +++ 7 files changed, 17 insertions(+), 91 deletions(-) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index d5f4e91..29f6075 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -724,8 +724,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - vcpu_load(vcpu); - if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -747,7 +745,6 @@ out: if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); - vcpu_put(vcpu); return r; } @@ -882,8 +879,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd); int i; - vcpu_load(vcpu); - for (i = 0; i < 16; i++) { vpd->vgr[i] = regs->vpd.vgr[i]; vpd->vbgr[i] = regs->vpd.vbgr[i]; @@ -930,8 +925,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.itc_offset = regs->saved_itc - kvm_get_itc(vcpu); set_bit(KVM_REQ_RESUME, &vcpu->requests); - vcpu_put(vcpu); - return 0; } @@ -1966,9 +1959,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - vcpu_load(vcpu); mp_state->mp_state = vcpu->arch.mp_state; - vcpu_put(vcpu); return 0; } @@ -1999,10 +1990,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, { int r = 0; - vcpu_load(vcpu); vcpu->arch.mp_state = mp_state->mp_state; if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) r = vcpu_reset(vcpu); - vcpu_put(vcpu); return r; } diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index b998abf..f6eac2f 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -1047,8 +1047,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; - vcpu_load(vcpu); - regs->pc = kvmppc_get_pc(vcpu); regs->cr = kvmppc_get_cr(vcpu); regs->ctr = kvmppc_get_ctr(vcpu); @@ -1069,8 +1067,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); - vcpu_put(vcpu); - return 0; } @@ -1078,8 +1074,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; - vcpu_load(vcpu); - kvmppc_set_pc(vcpu, regs->pc); kvmppc_set_cr(vcpu, regs->cr); kvmppc_set_ctr(vcpu, regs->ctr); @@ -1099,8 +1093,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); - vcpu_put(vcpu); - return 0; } @@ -1110,8 +1102,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int i; - vcpu_load(vcpu); - sregs->pvr = vcpu->arch.pvr; sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1; @@ -1131,8 +1121,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, } } - vcpu_put(vcpu); - return 0; } @@ -1142,8 +1130,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int i; - vcpu_load(vcpu); - kvmppc_set_pvr(vcpu, sregs->pvr); vcpu3s->sdr1 = sregs->u.s.sdr1; @@ -1171,8 +1157,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Flush the MMU after messing with the segments */ kvmppc_mmu_pte_flush(vcpu, 0, 0); - vcpu_put(vcpu); - return 0; } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index a33ab8c..b687f43 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -485,8 +485,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; - vcpu_load(vcpu); - regs->pc = vcpu->arch.pc; regs->cr
[PATCH 6/7] KVM: PPC: Centralize locking of arch specific vcpu ioctls
Signed-off-by: Avi Kivity --- arch/powerpc/kvm/powerpc.c | 12 1 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index e0fae7a..caeed7b 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -512,15 +512,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; long r; - switch (ioctl) { - case KVM_INTERRUPT: { + if (ioctl == KVM_INTERRUPT) { struct kvm_interrupt irq; r = -EFAULT; if (copy_from_user(&irq, argp, sizeof(irq))) - goto out; + goto out_nolock; r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); - break; + goto out_nolock; } + + vcpu_load(vcpu); + switch (ioctl) { case KVM_ENABLE_CAP: { struct kvm_enable_cap cap; @@ -535,6 +537,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } out: + vcpu_put(vcpu); +out_nolock: return r; } -- 1.7.0.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/7] Consolidate vcpu ioctl locking
Avi Kivity (7): KVM: PPC: Add missing vcpu_load()/vcpu_put() in vcpu ioctls KVM: x86: Add missing locking to arch specific vcpu ioctls KVM: move vcpu locking to dispatcher for generic vcpu ioctls KVM: x86: Lock arch specific vcpu ioctls centrally KVM: s390: Centrally lock arch specific vcpu ioctls KVM: PPC: Centralize locking of arch specific vcpu ioctls KVM: Consolidate arch specific vcpu ioctl locking arch/ia64/kvm/kvm-ia64.c | 11 --- arch/powerpc/kvm/book3s.c | 10 +- arch/powerpc/kvm/booke.c |5 ++- arch/powerpc/kvm/powerpc.c |7 +--- arch/s390/kvm/kvm-s390.c | 55 ++- arch/x86/kvm/x86.c | 69 +-- virt/kvm/kvm_main.c| 13 7 files changed, 39 insertions(+), 131 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/7] KVM: x86: Lock arch specific vcpu ioctls centrally
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 41 ++--- 1 files changed, 2 insertions(+), 39 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eedb23b..8b9e5ec 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1531,16 +1531,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, { int i, idx; - vcpu_load(vcpu); - idx = srcu_read_lock(&vcpu->kvm->srcu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; srcu_read_unlock(&vcpu->kvm->srcu, idx); - vcpu_put(vcpu); - return i; } @@ -1788,7 +1784,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, if (copy_from_user(cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry))) goto out_free; - vcpu_load(vcpu); for (i = 0; i < cpuid->nent; i++) { vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; @@ -1806,7 +1801,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, r = 0; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - vcpu_put(vcpu); out_free: vfree(cpuid_entries); @@ -1827,11 +1821,9 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, if (copy_from_user(&vcpu->arch.cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; - vcpu_load(vcpu); vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - vcpu_put(vcpu); return 0; out: @@ -1844,7 +1836,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, { int r; - vcpu_load(vcpu); r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; @@ -1856,7 +1847,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, out: cpuid->nent = vcpu->arch.cpuid_nent; - vcpu_put(vcpu); return r; } @@ -2088,9 +2078,7 @@ out: static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - vcpu_load(vcpu); memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); - vcpu_put(vcpu); return 0; } @@ -2098,11 +2086,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - vcpu_load(vcpu); memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); kvm_apic_post_state_restore(vcpu); update_cr8_intercept(vcpu); - vcpu_put(vcpu); return 0; } @@ -2114,20 +2100,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -EINVAL; if (irqchip_in_kernel(vcpu->kvm)) return -ENXIO; - vcpu_load(vcpu); kvm_queue_interrupt(vcpu, irq->irq, false); - vcpu_put(vcpu); - return 0; } static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) { - vcpu_load(vcpu); kvm_inject_nmi(vcpu); - vcpu_put(vcpu); return 0; } @@ -2147,7 +2128,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, int r; unsigned bank_num = mcg_cap & 0xff, bank; - vcpu_load(vcpu); r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; @@ -2162,7 +2142,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; out: - vcpu_put(vcpu); return r; } @@ -2220,8 +2199,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { - vcpu_load(vcpu); - events->exception.injected = vcpu->arch.exception.pending && !kvm_exception_is_soft(vcpu->arch.exception.nr); @@ -2246,8 +2223,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW); - - vcpu_put(vcpu); } static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, @@ -2258,8 +2233,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SHADOW)) return -EINVAL; - vcpu_load(vcpu); - vcpu->arch.exception.
[PATCH 2/7] KVM: x86: Add missing locking to arch specific vcpu ioctls
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c |6 ++ 1 files changed, 6 insertions(+), 0 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4b1433f..f54ec24 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1844,6 +1844,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, { int r; + vcpu_load(vcpu); r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; @@ -1855,6 +1856,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, out: cpuid->nent = vcpu->arch.cpuid_nent; + vcpu_put(vcpu); return r; } @@ -2145,6 +2147,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, int r; unsigned bank_num = mcg_cap & 0xff, bank; + vcpu_load(vcpu); r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; @@ -2159,6 +2162,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; out: + vcpu_put(vcpu); return r; } @@ -2467,7 +2471,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&mce, argp, sizeof mce)) goto out; + vcpu_load(vcpu); r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); + vcpu_put(vcpu); break; } case KVM_GET_VCPU_EVENTS: { -- 1.7.0.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/7] KVM: PPC: Add missing vcpu_load()/vcpu_put() in vcpu ioctls
Signed-off-by: Avi Kivity --- arch/powerpc/kvm/book3s.c | 10 ++ arch/powerpc/kvm/booke.c | 15 ++- 2 files changed, 24 insertions(+), 1 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 11f226f..b998abf 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -1110,6 +1110,8 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int i; + vcpu_load(vcpu); + sregs->pvr = vcpu->arch.pvr; sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1; @@ -1128,6 +1130,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw; } } + + vcpu_put(vcpu); + return 0; } @@ -1137,6 +1142,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int i; + vcpu_load(vcpu); + kvmppc_set_pvr(vcpu, sregs->pvr); vcpu3s->sdr1 = sregs->u.s.sdr1; @@ -1163,6 +1170,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Flush the MMU after messing with the segments */ kvmppc_mmu_pte_flush(vcpu, 0, 0); + + vcpu_put(vcpu); + return 0; } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index c922240..a33ab8c 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -485,6 +485,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; + vcpu_load(vcpu); + regs->pc = vcpu->arch.pc; regs->cr = kvmppc_get_cr(vcpu); regs->ctr = vcpu->arch.ctr; @@ -505,6 +507,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); + vcpu_put(vcpu); + return 0; } @@ -512,6 +516,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; + vcpu_load(vcpu); + vcpu->arch.pc = regs->pc; kvmppc_set_cr(vcpu, regs->cr); vcpu->arch.ctr = regs->ctr; @@ -531,6 +537,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); + vcpu_put(vcpu); + return 0; } @@ -559,7 +567,12 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { - return kvmppc_core_vcpu_translate(vcpu, tr); + int r; + + vcpu_load(vcpu); + r = kvmppc_core_vcpu_translate(vcpu, tr); + vcpu_put(vcpu); + return r; } int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -- 1.7.0.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] pci: allow sysfs file owner to read device dependent config space
On Thu, May 13, 2010 at 01:56:53PM +0300, Avi Kivity wrote: > On 05/13/2010 04:29 AM, Chris Wright wrote: > >The PCI config space bin_attr read handler has a hardcoded CAP_SYS_ADMIN > >check to verify privileges before allowing a user to read device > >dependent config space. This is meant to protect from an unprivileged > >user potentially locking up the box. > > > >When assigning a PCI device directly to a guest with libvirt and KVM, > >the sysfs config space file is chown'd to the unprivileged user that > >the KVM guest will run as. The guest needs to have full access to the > >device's config space since it's responsible for driving the device. > >However, despite being the owner of the sysfs file, the CAP_SYS_ADMIN > >check will not allow read access beyond the config header. > > > >With this patch the sysfs file owner is also considered privileged enough > >to read all of the config space. > > > > > > Related questions: > > - does sysfs support selinux labels? With a recent enough kernel + selinux policy it does. Regards, Daniel -- |: Red Hat, Engineering, London-o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org-o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :| -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH RFC 10/12] KVM: move dirty bitmaps to user space
mark_page_dirty is called with the mmu_lock spinlock held in set_spte. Must find a way to move it outside of the spinlock section. Oh, it's a serious problem. I have to consider it. Avi, Marcelo, Sorry but I have to say that mmu_lock spin_lock problem was completely out of my mind. Although I looked through the code, it seems not easy to move the set_bit_user to outside of spinlock section without breaking the semantics of its protection. So this may take some time to solve. But personally, I want to do something for x86's "vmallc() every time" problem even though moving dirty bitmaps to user space cannot be achieved soon. In that sense, do you mind if we do double buffering without moving dirty bitmaps to user space? I know that the resource for vmalloc() is precious for x86 but even now, at the timing of get_dirty_log, we use the same amount of memory as double buffering. Thanks, Takuya -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] pci: allow sysfs file owner to read device dependent config space
On 05/13/2010 04:29 AM, Chris Wright wrote: The PCI config space bin_attr read handler has a hardcoded CAP_SYS_ADMIN check to verify privileges before allowing a user to read device dependent config space. This is meant to protect from an unprivileged user potentially locking up the box. When assigning a PCI device directly to a guest with libvirt and KVM, the sysfs config space file is chown'd to the unprivileged user that the KVM guest will run as. The guest needs to have full access to the device's config space since it's responsible for driving the device. However, despite being the owner of the sysfs file, the CAP_SYS_ADMIN check will not allow read access beyond the config header. With this patch the sysfs file owner is also considered privileged enough to read all of the config space. Related questions: - does sysfs support selinux labels? - what about iommu? So we give a user privileges to access a device. But if we don't enforce iommu protection, you're giving the user access to the entire system. With kvm, qemu wraps the device with an iommu, but this is less secure than it might be. Better to have a privileged process open the device, irrevocably wrap it with an iommu, and pass the wrapped fd to qemu. This is probably best done with uio, but it means all device access needs to be available through the uio fd, including pci config space access. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] pci: allow sysfs file owner to read device dependent config space
On Wed, 12 May 2010 18:29:57 -0700 Chris Wright wrote: > The PCI config space bin_attr read handler has a hardcoded CAP_SYS_ADMIN > check to verify privileges before allowing a user to read device > dependent config space. This is meant to protect from an unprivileged > user potentially locking up the box. Or hacking it. You could argue the correct requirement is to change it to require CAP_SYS_RAWIO in fact ! > With this patch the sysfs file owner is also considered privileged enough > to read all of the config space. Which breaks the main reason the check was there in the first place. To stop accidents of the form find / -exec grep {} "wibble" blowing up in people's faces. I agree with the problem - but IMHO the fix is to require opening the file checks CAP_SYS_something instead: not to hack the read method and make it even weirder and more un-Linux than it is now. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[GIT PULL] last minute vhost-net fix
David, if it's not too late, please pull the following last minute fix into 2.6.34. Thanks! The following changes since commit de02d72bb3cc5b3d4c873db4ca8291723dd48479: Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6 (2010-05-10 22:53:41 -0700) are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git net-2.6 Michael S. Tsirkin (1): vhost: fix barrier pairing drivers/vhost/vhost.c |7 ++- 1 files changed, 6 insertions(+), 1 deletions(-) -- MST -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Another SIGFPE in display code, now in cirrus
On 05/12/2010 05:57 PM, Stefano Stabellini wrote: I guess even a src blt pitch of 0 could be useful there, however in practice I think the only rop function that was written with this case in mind has: dstpitch -= bltwidth; srcpitch -= bltwidth; if (dstpitch< 0 || srcpitch< 0) { /* is 0 valid? srcpitch == 0 could be useful */ return; } Note that here srcpitch == 0 is actually srcpitch == bltwidth, which _is_ obviously useful. The "real" srcpitch == 0 case would result in srcpitch == -bltwidth, and it is actually quite useful if you want to stretch a Nx1 bitmap to NxN. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html