[PATCH 3/3] KVM: x86: Use FPU API

2010-05-13 Thread Sheng Yang
Convert KVM to use generic FPU API.

Signed-off-by: Sheng Yang 
---
 arch/x86/include/asm/kvm_host.h |   18 +-
 arch/x86/kvm/x86.c  |   73 ---
 2 files changed, 23 insertions(+), 68 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ed48904..beba6f5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -300,8 +300,7 @@ struct kvm_vcpu_arch {
unsigned long mmu_seq;
} update_pte;
 
-   struct i387_fxsave_struct host_fx_image;
-   struct i387_fxsave_struct guest_fx_image;
+   struct fpu host_fpu, guest_fpu;
 
gva_t mmio_fault_cr2;
struct kvm_pio_request pio;
@@ -709,21 +708,6 @@ static inline unsigned long read_msr(unsigned long msr)
 }
 #endif
 
-static inline void kvm_fx_save(struct i387_fxsave_struct *image)
-{
-   asm("fxsave (%0)":: "r" (image));
-}
-
-static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
-{
-   asm("fxrstor (%0)":: "r" (image));
-}
-
-static inline void kvm_fx_finit(void)
-{
-   asm("finit");
-}
-
 static inline u32 get_rdx_init_val(void)
 {
return 0x600; /* P6 family */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cd8a606..2313f76 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -52,6 +52,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS  \
@@ -5069,27 +5071,6 @@ unlock_out:
 }
 
 /*
- * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
- * we have asm/x86/processor.h
- */
-struct fxsave {
-   u16 cwd;
-   u16 swd;
-   u16 twd;
-   u16 fop;
-   u64 rip;
-   u64 rdp;
-   u32 mxcsr;
-   u32 mxcsr_mask;
-   u32 st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
-#ifdef CONFIG_X86_64
-   u32 xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
-#else
-   u32 xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
-#endif
-};
-
-/*
  * Translate a guest virtual address to a guest physical address.
  */
 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
@@ -5114,7 +5095,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-   struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+   struct i387_fxsave_struct *fxsave =
+   &vcpu->arch.guest_fpu.state->fxsave;
 
vcpu_load(vcpu);
 
@@ -5134,7 +5116,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, 
struct kvm_fpu *fpu)
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-   struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+   struct i387_fxsave_struct *fxsave =
+   &vcpu->arch.guest_fpu.state->fxsave;
 
vcpu_load(vcpu);
 
@@ -5154,41 +5137,30 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, 
struct kvm_fpu *fpu)
 
 void fx_init(struct kvm_vcpu *vcpu)
 {
-   unsigned after_mxcsr_mask;
-
-   /*
-* Touch the fpu the first time in non atomic context as if
-* this is the first fpu instruction the exception handler
-* will fire before the instruction returns and it'll have to
-* allocate ram with GFP_KERNEL.
-*/
-   if (!used_math())
-   kvm_fx_save(&vcpu->arch.host_fx_image);
+   fpu_alloc(&vcpu->arch.host_fpu);
+   fpu_alloc(&vcpu->arch.guest_fpu);
 
-   /* Initialize guest FPU by resetting ours and saving into guest's */
-   preempt_disable();
-   kvm_fx_save(&vcpu->arch.host_fx_image);
-   kvm_fx_finit();
-   kvm_fx_save(&vcpu->arch.guest_fx_image);
-   kvm_fx_restore(&vcpu->arch.host_fx_image);
-   preempt_enable();
+   fpu_save(&vcpu->arch.host_fpu);
+   fpu_finit(&vcpu->arch.guest_fpu);
 
vcpu->arch.cr0 |= X86_CR0_ET;
-   after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
-   vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
-   memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
-  0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
 }
 EXPORT_SYMBOL_GPL(fx_init);
 
+static void fx_free(struct kvm_vcpu *vcpu)
+{
+   fpu_free(&vcpu->arch.host_fpu);
+   fpu_free(&vcpu->arch.guest_fpu);
+}
+
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
if (vcpu->guest_fpu_loaded)
return;
 
vcpu->guest_fpu_loaded = 1;
-   kvm_fx_save(&vcpu->arch.host_fx_image);
-   kvm_fx_restore(&vcpu->arch.guest_fx_image);
+   fpu_save(&vcpu->arch.host_fpu);
+   fpu_restore_checking(&vcpu->arch.guest_fpu);
trace_kvm_fpu(1);
 }
 
@@ -5198,8 +5170,8 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
return;
 

[PATCH 0/3] Convert KVM to use FPU API

2010-05-13 Thread Sheng Yang
This patchset based on Avi's FPU API patchset.

Sheng Yang (3):
  x86: Split fpu_save_init() to fpu_save() and fpu_clear()
  x86: Export FPU API for KVM use
  KVM: x86: Use FPU API

 arch/x86/include/asm/i387.h |   73 ---
 arch/x86/include/asm/kvm_host.h |   18 +-
 arch/x86/include/asm/xsave.h|3 ++
 arch/x86/kernel/i387.c  |3 +-
 arch/x86/kernel/process.c   |1 +
 arch/x86/kvm/x86.c  |   73 ---
 6 files changed, 74 insertions(+), 97 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] x86: Split fpu_save_init() to fpu_save() and fpu_clear()

2010-05-13 Thread Sheng Yang
fpu_save() would be used later by KVM.

Signed-off-by: Sheng Yang 
---
 arch/x86/include/asm/i387.h |   71 ++-
 1 files changed, 43 insertions(+), 28 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 1a8cca3..989d3b7 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -180,13 +180,17 @@ static inline void fpu_fxsave(struct fpu *fpu)
 #endif
 }
 
-static inline void fpu_save_init(struct fpu *fpu)
+static inline void fpu_save(struct fpu *fpu)
 {
if (use_xsave())
fpu_xsave(fpu);
else
fpu_fxsave(fpu);
+}
 
+static inline void fpu_save_init(struct fpu *fpu)
+{
+   fpu_save(fpu);
fpu_clear(fpu);
 }
 
@@ -237,19 +241,41 @@ static inline int fxrstor_checking(struct 
i387_fxsave_struct *fx)
 /*
  * These must be called with preempt disabled
  */
-static inline void fpu_save_init(struct fpu *fpu)
+static inline void fpu_fxsave(struct fpu *fpu)
 {
-   if (use_xsave()) {
-   struct xsave_struct *xstate = &fpu->state->xsave;
-   struct i387_fxsave_struct *fx = &fpu->state->fxsave;
+   /* Use more nops than strictly needed in case the compiler
+  varies code */
+   alternative_input(
+   "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
+   "fxsave %[fx]\n"
+   "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
+   X86_FEATURE_FXSR,
+   [fx] "m" (fpu->state->fxsave),
+   [fsw] "m" (fpu->state->fxsave.swd) : "memory");
+}
 
+static inline void fpu_save(struct fpu *fpu)
+{
+   if (use_xsave())
fpu_xsave(fpu);
+   else
+   fpu_fxsave(fpu);
+}
+
+/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
+   is pending.  Clear the x87 state here by setting it to fixed
+   values. safe_address is a random variable that should be in L1 */
+static inline void fpu_clear(struct fpu *fpu)
+{
+   struct xsave_struct *xstate = &fpu->state->xsave;
+   struct i387_fxsave_struct *fx = &fpu->state->fxsave;
 
+   if (use_xsave()) {
/*
 * xsave header may indicate the init state of the FP.
 */
if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
-   goto end;
+   return;
 
if (unlikely(fx->swd & X87_FSW_ES))
asm volatile("fnclex");
@@ -257,30 +283,19 @@ static inline void fpu_save_init(struct fpu *fpu)
/*
 * we can do a simple return here or be paranoid :)
 */
-   goto clear_state;
}
+   alternative_input(
+   GENERIC_NOP8 GENERIC_NOP2,
+   "emms\n\t"  /* clear stack tags */
+   "fildl %[addr]",/* set F?P to defined value */
+   X86_FEATURE_FXSAVE_LEAK,
+   [addr] "m" (safe_address));
+}
 
-   /* Use more nops than strictly needed in case the compiler
-  varies code */
-   alternative_input(
-   "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
-   "fxsave %[fx]\n"
-   "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
-   X86_FEATURE_FXSR,
-   [fx] "m" (fpu->state->fxsave),
-   [fsw] "m" (fpu->state->fxsave.swd) : "memory");
-clear_state:
-   /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
-  is pending.  Clear the x87 state here by setting it to fixed
-  values. safe_address is a random variable that should be in L1 */
-   alternative_input(
-   GENERIC_NOP8 GENERIC_NOP2,
-   "emms\n\t"  /* clear stack tags */
-   "fildl %[addr]",/* set F?P to defined value */
-   X86_FEATURE_FXSAVE_LEAK,
-   [addr] "m" (safe_address));
-end:
-   ;
+static inline void fpu_save_init(struct fpu *fpu)
+{
+   fpu_save(fpu);
+   fpu_clear(fpu);
 }
 
 static inline void __save_init_fpu(struct task_struct *tsk)
-- 
1.7.0.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] x86: Export FPU API for KVM use

2010-05-13 Thread Sheng Yang
Also add some more constants.

Signed-off-by: Sheng Yang 
---

Do we need to rename "task_xstate_cachep"? It's not only for task struct now.

 arch/x86/include/asm/i387.h  |2 ++
 arch/x86/include/asm/xsave.h |3 +++
 arch/x86/kernel/i387.c   |3 ++-
 arch/x86/kernel/process.c|1 +
 4 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 989d3b7..f19bdf9 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -503,6 +503,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu 
*src)
memcpy(dst->state, src->state, xstate_size);
 }
 
+extern void fpu_finit(struct fpu *fpu);
+
 #endif /* __ASSEMBLY__ */
 
 #define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 2c4390c..29ee4e4 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -13,6 +13,9 @@
 
 #define FXSAVE_SIZE512
 
+#define XSTATE_YMM_SIZE 256
+#define XSTATE_YMM_OFFSET (512 + 64)
+
 /*
  * These are the features that the OS can handle currently.
  */
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 86cef6b..cbc 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -107,7 +107,7 @@ void __cpuinit fpu_init(void)
 }
 #endif /* CONFIG_X86_64 */
 
-static void fpu_finit(struct fpu *fpu)
+void fpu_finit(struct fpu *fpu)
 {
 #ifdef CONFIG_X86_32
if (!HAVE_HWFP) {
@@ -132,6 +132,7 @@ static void fpu_finit(struct fpu *fpu)
fp->fos = 0xu;
}
 }
+EXPORT_SYMBOL_GPL(fpu_finit);
 
 /*
  * The _current_ task is using the FPU for the first time
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 8bcc21f..373fec9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@ unsigned long idle_nomwait;
 EXPORT_SYMBOL(idle_nomwait);
 
 struct kmem_cache *task_xstate_cachep;
+EXPORT_SYMBOL(task_xstate_cachep);
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-- 
1.7.0.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH -v2] KVM, Fix QEMU-KVM is killed by guest SRAO MCE

2010-05-13 Thread Huang Ying
On Fri, 2010-05-14 at 05:43 +0800, Marcelo Tosatti wrote:
> On Wed, May 12, 2010 at 02:44:03PM +0800, Huang Ying wrote:
> > @@ -1975,6 +1976,27 @@ static int __direct_map(struct kvm_vcpu
> > return pt_write;
> >  }
> >  
> > +static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
> > +{
> > +   char buf[1];
> > +   void __user *hva;
> > +   int r;
> > +
> > +   /* Touch the page, so send SIGBUS */
> > +   hva = (void __user *)gfn_to_hva(kvm, gfn);
> > +   r = copy_from_user(buf, hva, 1);
> > +}
> 
> A SIGBUS signal has been raised by memory poisoning already, so i don't
> see why this is needed?
> 
> To avoid the MMIO processing in userspace before the MCE is sent to the
> guest you can just return -EAGAIN from the page fault handlers back to
> kvm_mmu_page_fault.

The SIGBUS signal is necessary here because this SIGBUS is SRAR (Action
Requirement) while the previously sent one is SRAO (Action Optional).
They have different semantics and processed differently in QEMU-KVM and
guest OS.

For example the guest Linux kernel may ignore SRAO MCE (raised by
QEMU-KVM after receiving SRAO SIGBUS), because it is optional, but for
SRAR MCE (raised by QEMU-KVM after receiving SRAR SIGBUS) the guest
Linux kernel must kill corresponding application or go panic.

> > +int is_hwpoison_address(unsigned long addr)
> > +{
> > +   pgd_t *pgdp;
> > +   pud_t *pudp;
> > +   pmd_t *pmdp;
> > +   pte_t pte, *ptep;
> > +   swp_entry_t entry;
> > +
> > +   pgdp = pgd_offset(current->mm, addr);
> > +   if (!pgd_present(*pgdp))
> > +   return 0;
> > +   pudp = pud_offset(pgdp, addr);
> > +   if (!pud_present(*pudp))
> > +   return 0;
> > +   pmdp = pmd_offset(pudp, addr);
> > +   if (!pmd_present(*pmdp))
> > +   return 0;
> 
> Need to bail out if pmd is huge.

Yes. I will change this.

Best Regards,
Huang Ying


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm: remove CAP_SYS_RAWIO requirement from kvm_vm_ioctl_assign_irq

2010-05-13 Thread Alex Williamson
On Thu, 2010-05-13 at 15:06 -0700, Chris Wright wrote:
> * Alex Williamson (alex.william...@redhat.com) wrote:
> > Remove this check in an effort to allow kvm guests to run without
> > root privileges.  This capability check doesn't seem to add any
> > security since the device needs to have already been added via the
> > assign device ioctl and the io actually occurs through the pci
> > sysfs interface.
> 
> Right, so irq resource allocation can't be directly abused (by calling
> this ioctl out of order, for example) because the device must be assigned
> first via kvm_vm_ioctl_assign_device().  My only concern isn't with this
> patch per-se, but the exising kvm_vm_ioctl_assign_device().

Yep, I agree.  How do we decide the caller of the assign device ioctl
has sufficient rights to the device without introducing another
capabilities check?  We won't get far if the resources are already
claimed by a running driver, but we don't necessarily want anyone
with /dev/kvm rights to be able to grab the remaining devices in the
system.

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] ixgbevf: Enable GRO by default

2010-05-13 Thread Jeff Kirsher
On Thu, May 13, 2010 at 12:51, Shirley Ma  wrote:
> Enable GRO by default for performance.
>
> Signed-off-by: Shirley Ma 
> ---
>
>  drivers/net/ixgbevf/ixgbevf_main.c |    1 +
>  1 files changed, 1 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/net/ixgbevf/ixgbevf_main.c 
> b/drivers/net/ixgbevf/ixgbevf_main.c
> index 40f47b8..1bbb05e 100644
> --- a/drivers/net/ixgbevf/ixgbevf_main.c
> +++ b/drivers/net/ixgbevf/ixgbevf_main.c
> @@ -3415,6 +3415,7 @@ static int __devinit ixgbevf_probe(struct pci_dev *pdev,
>        netdev->features |= NETIF_F_IPV6_CSUM;
>        netdev->features |= NETIF_F_TSO;
>        netdev->features |= NETIF_F_TSO6;
> +       netdev->features |= NETIF_F_GRO;
>        netdev->vlan_features |= NETIF_F_TSO;
>        netdev->vlan_features |= NETIF_F_TSO6;
>        netdev->vlan_features |= NETIF_F_IP_CSUM;
>
>

Thanks, I have added the patch to my queue.

-- 
Cheers,
Jeff
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm: remove CAP_SYS_RAWIO requirement from kvm_vm_ioctl_assign_irq

2010-05-13 Thread Chris Wright
* Alex Williamson (alex.william...@redhat.com) wrote:
> Remove this check in an effort to allow kvm guests to run without
> root privileges.  This capability check doesn't seem to add any
> security since the device needs to have already been added via the
> assign device ioctl and the io actually occurs through the pci
> sysfs interface.

Right, so irq resource allocation can't be directly abused (by calling
this ioctl out of order, for example) because the device must be assigned
first via kvm_vm_ioctl_assign_device().  My only concern isn't with this
patch per-se, but the exising kvm_vm_ioctl_assign_device().

thanks,
-chris
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: pci device passthrough: Failed to assign irq

2010-05-13 Thread Gerd v. Egidy
> I'm trying to pass a pcie-device into a kvm guest. qemu-kvm is started
> by libvirt with this command:
[...]
> Failed to assign irq for "hostdev0": Operation not permitted
> Perhaps you are assigning a device that shares an IRQ with another device?

Found out what was causing the issue:

Current libvirt seems to drop CAP_SYS_RAWIO before forking qemu-kvm.

When I patch libvirt to not drop the capabilities, everything works as 
expected. 

So no problem in kvm, sorry for the noise.

Kind regards,

Gerd
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH -v2] KVM, Fix QEMU-KVM is killed by guest SRAO MCE

2010-05-13 Thread Marcelo Tosatti
On Wed, May 12, 2010 at 02:44:03PM +0800, Huang Ying wrote:
> In common cases, guest SRAO MCE will cause corresponding poisoned page
> be un-mapped and SIGBUS be sent to QEMU-KVM, then QEMU-KVM will relay
> the MCE to guest OS.
> 
> But it is reported that if the poisoned page is accessed in guest
> after un-mapped and before MCE is relayed to guest OS, QEMU-KVM will
> be killed.
> 
> The reason is as follow. Because poisoned page has been un-mapped,
> guest access will cause guest exit and kvm_mmu_page_fault will be
> called. kvm_mmu_page_fault can not get the poisoned page for fault
> address, so kernel and user space MMIO processing is tried in turn. In
> user MMIO processing, poisoned page is accessed again, then QEMU-KVM
> is killed by force_sig_info.
> 
> To fix the bug, kvm_mmu_page_fault send HWPOISON signal to QEMU-KVM
> and do not try kernel and user space MMIO processing for poisoned
> page.
> 
> 
> Changelog:
> 
> v2:
> 
> - Use page table walker to determine whether the virtual address is
>   poisoned to avoid change user space interface (via changing
>   get_user_pages).
> 
> - Wrap bad page processing into kvm_handle_bad_page to avoid code
>   duplicating.
> 
> Reported-by: Max Asbock 
> Signed-off-by: Huang Ying 
> ---
>  arch/x86/kvm/mmu.c |   34 ++
>  arch/x86/kvm/paging_tmpl.h |7 ++-
>  include/linux/kvm_host.h   |1 +
>  include/linux/mm.h |8 
>  mm/memory-failure.c|   28 
>  virt/kvm/kvm_main.c|   30 --
>  6 files changed, 93 insertions(+), 15 deletions(-)
> 
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -32,6 +32,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -1975,6 +1976,27 @@ static int __direct_map(struct kvm_vcpu
>   return pt_write;
>  }
>  
> +static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
> +{
> + char buf[1];
> + void __user *hva;
> + int r;
> +
> + /* Touch the page, so send SIGBUS */
> + hva = (void __user *)gfn_to_hva(kvm, gfn);
> + r = copy_from_user(buf, hva, 1);
> +}

A SIGBUS signal has been raised by memory poisoning already, so i don't
see why this is needed?

To avoid the MMIO processing in userspace before the MCE is sent to the
guest you can just return -EAGAIN from the page fault handlers back to
kvm_mmu_page_fault.

> +int is_hwpoison_pfn(pfn_t pfn)
> +{
> + return pfn == hwpoison_pfn;
> +}
> +EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
> +
>  static inline unsigned long bad_hva(void)
>  {
>   return PAGE_OFFSET;
> @@ -939,6 +948,11 @@ static pfn_t hva_to_pfn(struct kvm *kvm,
>   if (unlikely(npages != 1)) {
>   struct vm_area_struct *vma;
>  
> + if (is_hwpoison_address(addr)) {
> + get_page(hwpoison_page);
> + return page_to_pfn(hwpoison_page);
> + }
> +
>   down_read(¤t->mm->mmap_sem);
>   vma = find_vma(current->mm, addr);
>  
> @@ -2198,6 +2212,15 @@ int kvm_init(void *opaque, unsigned int
>  
>   bad_pfn = page_to_pfn(bad_page);
>  
> + hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +
> + if (hwpoison_page == NULL) {
> + r = -ENOMEM;
> + goto out_free_0;
> + }
> +
> + hwpoison_pfn = page_to_pfn(hwpoison_page);
> +
>   if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
>   r = -ENOMEM;
>   goto out_free_0;
> @@ -2269,6 +2292,8 @@ out_free_1:
>  out_free_0a:
>   free_cpumask_var(cpus_hardware_enabled);
>  out_free_0:
> + if (hwpoison_page)
> + __free_page(hwpoison_page);
>   __free_page(bad_page);
>  out:
>   kvm_arch_exit();
> @@ -2291,6 +2316,7 @@ void kvm_exit(void)
>   kvm_arch_hardware_unsetup();
>   kvm_arch_exit();
>   free_cpumask_var(cpus_hardware_enabled);
> + __free_page(hwpoison_page);
>   __free_page(bad_page);
>  }
>  EXPORT_SYMBOL_GPL(kvm_exit);
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -45,6 +45,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include "internal.h"
>  
>  int sysctl_memory_failure_early_kill __read_mostly = 0;
> @@ -1296,3 +1297,30 @@ done:
>   /* keep elevated page count for bad page */
>   return ret;
>  }
> +
> +int is_hwpoison_address(unsigned long addr)
> +{
> + pgd_t *pgdp;
> + pud_t *pudp;
> + pmd_t *pmdp;
> + pte_t pte, *ptep;
> + swp_entry_t entry;
> +
> + pgdp = pgd_offset(current->mm, addr);
> + if (!pgd_present(*pgdp))
> + return 0;
> + pudp = pud_offset(pgdp, addr);
> + if (!pud_present(*pudp))
> + return 0;
> + pmdp = pmd_offset(pudp, addr);
> + if (!pmd_present(*pmdp))
> + return 0;

Need to bail out if pmd is huge.

> + ptep = pte_offset_map(pmdp, addr);
> + pte = *ptep;
> + pte_unmap(ptep);
> + i

[QEMU-KVM]: Megasas + TCM_Loop + SG_IO into Windows XP guests

2010-05-13 Thread Nicholas A. Bellinger
Greetings Hannes and co,

I have been spending a bit of time trying Megasas HBA emulation +
TCM_Loop + SG_IO with a Windows XP SP2 KVM guests..  So far, I noticed
that hw/scsi-generic.c:execute_command_run() using bdev_aio_ioctl()
appears to be broken for XP guests, which causes the first 36-byte
INQUIRY sent via SG_IO to never make it back to QEMU and results in the
win32 LSI drive taking the LUN offline, et al.  Note that everything
does appear to be functioning as expected in kernel space for the first
INQUIRY with the TCM_Loop LLD and Linux/SCSI code (AFAICT) and Linux KVM
guests using megasas emulation are still working.

So, I ended up needing requiring the following quick hack for
hw/scsi-generic.c:execute_command_run() to make SG_IO function
synchronously using bdrv_ioctl(), which at least gets LUN registration
and basic control path CDBs working for the XP guest.

Here is how it looks in action on a v2.6.34-rc7 host so far:

http://www.linux-iscsi.org/images/TCM-KVM-megasas-XP-05132010.png


diff --git a/hw/scsi-generic.c b/hw/scsi-generic.c
index 6c58742..aa1eb83 100644
--- a/hw/scsi-generic.c
+++ b/hw/scsi-generic.c
@@ -140,6 +140,7 @@ static int execute_command_run(SCSIGenericReq *r,
 {
 BlockDriverState *bdrv = r->req.dev->conf.dinfo->bdrv;
 SCSIGenericState *s = DO_UPCAST(SCSIGenericState, qdev, r->req.dev);
+int ret;
 
 r->io_header.interface_id = 'S';
 r->io_header.dxfer_direction = sgdir[r->req.cmd.mode];
@@ -161,11 +162,16 @@ static int execute_command_run(SCSIGenericReq *r,
 printf("\n");
 }
 #endif
+#if 0
 r->req.aiocb = bdrv_aio_ioctl(bdrv, SG_IO, &r->io_header, complete, r);
 if (r->req.aiocb == NULL) {
 BADF("execute_command: read failed !\n");
 return -1;
 }
+#else
+ret = bdrv_ioctl(bdrv, SG_IO, &r->io_header);
+complete((void *)r, ret);
+#endif
 
  *  return 0;
 }


Beyond the initial LUN registration and control CDB parts, doing bulk
DATA_SG_IO traffic is completing successfully (and everything looks sane
with TCM_Loop and Linux/SCSI) but it appears that the correct blocks are
not actually getting written/read by megasas.  This appears to be the
case with both hw/scsi-generic.c and hw/scsi-disk.c modes of operation
for megasas with the win32 XP guest.

So I was wondering if anyone aware of known issues with QEMU
asynchronous SG_IO into MSFT KVM guests with virtio or hw/lsi53c895a.c,
or would this be something specific to megasas HBA emulation and XP
guests..?

Hannes, which MSFT guest + driver did you get work stable with bulk
DATA_SG_IO and hw/scsi-disk.c..?

Best,

--nab

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 4/5] Inter-VM shared memory PCI device

2010-05-13 Thread Cam Macdonell
On Mon, May 10, 2010 at 5:59 AM, Avi Kivity  wrote:
> On 04/21/2010 08:53 PM, Cam Macdonell wrote:

>> +
>> +        /* allocate/initialize space for interrupt handling */
>> +        s->eventfds = qemu_mallocz(s->nr_alloc_guests * sizeof(int *));
>> +        s->eventfd_table = qemu_mallocz(s->vectors *
>> sizeof(EventfdEntry));
>> +        s->eventfds_posn_count = qemu_mallocz(s->nr_alloc_guests *
>> sizeof(int));
>> +
>> +        pci_conf[PCI_INTERRUPT_PIN] = 1; /* we are going to support
>> interrupts */
>>
>
> This is done by the guest BIOS.
>
>

If I remove that line, my driver crashes when it falls back to
pin-based interrupts (when MSI is turned off).  Is there something in
the device driver that I need to set in place of this?  A number of
other devices (mostly network cards) set the interrupt pin this way,
so I'm a little confused.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] x86 emulator: Add missing decoder flags for sub instruction

2010-05-13 Thread Mohammed Gamal
On Thu, May 13, 2010 at 9:50 PM, Marcelo Tosatti  wrote:
> On Wed, May 12, 2010 at 01:39:21AM +0300, Mohammed Gamal wrote:
>> This adds missing decoder flags for sub instructions (opcodes 0x2c - 0x2d)
>>
>> Signed-off-by: Mohammed Gamal 
>> ---
>>  arch/x86/kvm/emulate.c |    2 +-
>>  1 files changed, 1 insertions(+), 1 deletions(-)
>
> Applied both, thanks.
>
>

Thanks Marcelo, please also take a look at the test cases for these
instructions that I posted to the mailing list.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] VMX: Fix and improve guest state validity checks

2010-05-13 Thread Mohammed Gamal
On Thu, May 13, 2010 at 9:24 AM, Avi Kivity  wrote:
> On 05/11/2010 07:52 PM, Mohammed Gamal wrote:
>>
>> - Add 's' and 'g' field checks on segment registers
>> - Correct SS checks for request and descriptor privilege levels
>>
>> Signed-off-by: Mohammed Gamal
>> ---
>>  arch/x86/kvm/vmx.c |   73
>> +++
>>  1 files changed, 67 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 777e00d..9805c2a 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -2121,16 +2121,30 @@ static bool stack_segment_valid(struct kvm_vcpu
>> *vcpu)
>>        vmx_get_segment(vcpu,&ss, VCPU_SREG_SS);
>>        ss_rpl = ss.selector&  SELECTOR_RPL_MASK;
>>
>> -       if (ss.unusable)
>> +       if (ss.dpl != ss_rpl) /* DPL != RPL */
>> +               return false;
>> +
>> +       if (ss.unusable) /* Short-circuit */
>>                return true;
>>
>
> If ss.unusable, do the dpl and rpl have any meaning?

The idea is that dpl and rpl are checked on vmentry regardless of
whether ss is usable or not. While the other checks are performed only
if ss is usable.

>
>>        if (!ss.present)
>>                return false;
>> +       if (ss.limit&  0xfff0) {
>> +                if ((ss.limit&  0xfff)<  0xfff)
>> +                        return false;
>> +                if (!ss.g)
>> +                        return false;
>> +        } else {
>> +                if ((ss.limit&  0xfff) == 0xfff)
>> +                        return false;
>> +                if (ss.g)
>> +                        return false;
>> +        }
>>
>
> There is no architectural way to break this.  That is, without
> virtualization, there is no way a real cpu will ever have a limit of
> 0x12345678.
>
> We need to distinguish between big real mode and real mode that can be
> virtualized using vm86, but we don't need to consider impossible setups.

I didn't realize this is architecturally impossible. I was simply
implementing the checks specified in the Intel manual. Now that we
know this is redunant, we can just drop these checks.

>
>
>> @@ -2143,8 +2157,15 @@ static bool data_segment_valid(struct kvm_vcpu
>> *vcpu, int seg)
>>        vmx_get_segment(vcpu,&var, seg);
>>        rpl = var.selector&  SELECTOR_RPL_MASK;
>>
>> -       if (var.unusable)
>> +       if (var.unusable)  /* Short-circuit */
>>                return true;
>> +       if (!(var.type&  AR_TYPE_ACCESSES_MASK))
>> +               return false;
>>
>
> Again, there is no architectural way for a segment not to have the accessed
> bit set.
>
>> +       if (var.type&  AR_TYPE_CODE_MASK) {
>> +               if (!(var.type&  AR_TYPE_READABLE_MASK))
>> +                       return false;
>> +       }
>>
>
> About this, I'm not sure.
>
>> +
>>        if (!var.s)
>>                return false;
>>        if (!var.present)
>> @@ -2154,6 +2175,18 @@ static bool data_segment_valid(struct kvm_vcpu
>> *vcpu, int seg)
>>                        return false;
>>        }
>>
>> +       if (var.limit&  0xfff0) {
>> +               if ((var.limit&  0xfff)<  0xfff)
>> +                       return false;
>> +               if (!var.g)
>> +                       return false;
>> +       } else {
>> +               if ((var.limit&  0xfff) == 0xfff)
>> +                       return false;
>> +               if (var.g)
>> +                       return false;
>> +       }
>>
>
> Even disregarding the incorrectness, you shouldn't duplicate code like this.
I was intending to consolidate it into a single function eventually, I
just wasn't sure that this was correct and I needed some comments on
it. It's not needed now anyhow.

>
>> @@ -2192,6 +2240,20 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu)
>>                return false;
>>        if (!ldtr.present)
>>                return false;
>> +       if (ldtr.s)
>> +               return false;
>>
>
> Architecturally impossible.
>
> --
> Do not meddle in the internals of kernels, for they are subtle and quick to
> panic.
>
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[GIT PULL] KVM updates for 2.6.34-rc7

2010-05-13 Thread Marcelo Tosatti

Linus,

please pull from

git://git.kernel.org/pub/scm/virt/kvm/kvm.git kvm-updates/2.6.34

To receive the following updates:

Dongxiao Xu (1):
  KVM: x86: Call vcpu_load and vcpu_put in cpuid_update

Jan Kiszka (1):
  KVM: VMX: blocked-by-sti must not defer NMI injections

Joerg Roedel (1):
  KVM: SVM: Fix wrong intercept masks on 32 bit

Marcelo Tosatti (1):
  KVM: convert ioapic lock to spinlock

Roel Kluin (1):
  KVM: PPC: Keep index within boundaries in kvmppc_44x_emul_tlbwe()


 arch/powerpc/kvm/44x_tlb.c |2 +-
 arch/x86/kvm/svm.c |8 
 arch/x86/kvm/vmx.c |3 +--
 arch/x86/kvm/x86.c |4 
 virt/kvm/ioapic.c  |   30 +++---
 virt/kvm/ioapic.h  |2 +-
 6 files changed, 26 insertions(+), 23 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] x86 emulator: Add missing decoder flags for sub instruction

2010-05-13 Thread Marcelo Tosatti
On Wed, May 12, 2010 at 01:39:21AM +0300, Mohammed Gamal wrote:
> This adds missing decoder flags for sub instructions (opcodes 0x2c - 0x2d)
> 
> Signed-off-by: Mohammed Gamal 
> ---
>  arch/x86/kvm/emulate.c |2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)

Applied both, thanks.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: network between host and guest

2010-05-13 Thread Thanasis
on 05/13/2010 10:48 PM dry...@sky-haven.net wrote the following:
> Ar 10.05.10 10:46, scríobh Thanasis:
>> I have installed kvm (app-emulation/qemu-kvm-0.12.3-r1) on linux (a
>> laptop with gentoo linux) and MS Windows 7 as guest.
>> Here is what info I get about the network on each one:
>> 1) on linux (host):
>
> [snip]
>
> Hi there:
>
> The precise KVM command would be useful in determining what's going on.
>
> Since I'm guessing, I may as well presume you used "-net
> nic,(options...) -net user".  If you did, the magick is in the qemu
> binary which is both providing a DHCP service to the guest and
> performing the necessary outbound source NAT.
>
> But as others have mentioned, more information on how you started KVM
> might be useful to determine what's going on.  The reason is that there
> are several ways to set up networking in KVM.
>
The command is (and running as a user, not root):

$ kvm win7kvm.img -m 1024 -boot c -usb -usbdevice tablet

$ which kvm
/usr/bin/kvm
$ ls -l /usr/bin/kvm
lrwxrwxrwx 1 root root 17 Apr 21 15:10 /usr/bin/kvm -> /usr/bin/qemu-kvm
$ file /usr/bin/qemu-kvm
/usr/bin/qemu-kvm: POSIX shell script text executable
$ cat /usr/bin/qemu-kvm
#!/bin/sh
exec /usr/bin/qemu-system-x86_64 --enable-kvm "$@"
$ file /usr/bin/qemu-system-x86_64
/usr/bin/qemu-system-x86_64: ELF 64-bit LSB executable, x86-64, version
1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.9,
stripped
$



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next] ixgbevf: Enable GRO by default

2010-05-13 Thread Shirley Ma
Enable GRO by default for performance. 

Signed-off-by: Shirley Ma 
---
 
 drivers/net/ixgbevf/ixgbevf_main.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ixgbevf/ixgbevf_main.c 
b/drivers/net/ixgbevf/ixgbevf_main.c
index 40f47b8..1bbb05e 100644
--- a/drivers/net/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ixgbevf/ixgbevf_main.c
@@ -3415,6 +3415,7 @@ static int __devinit ixgbevf_probe(struct pci_dev *pdev,
netdev->features |= NETIF_F_IPV6_CSUM;
netdev->features |= NETIF_F_TSO;
netdev->features |= NETIF_F_TSO6;
+   netdev->features |= NETIF_F_GRO;
netdev->vlan_features |= NETIF_F_TSO;
netdev->vlan_features |= NETIF_F_TSO6;
netdev->vlan_features |= NETIF_F_IP_CSUM;


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/7] Consolidate vcpu ioctl locking

2010-05-13 Thread Alexander Graf


Am 13.05.2010 um 14:29 schrieb Avi Kivity :


On 05/13/2010 03:18 PM, Alexander Graf wrote:



[PATCH 0/7] Consolidate vcpu ioctl locking

In general, all vcpu ioctls need to take the vcpu mutex, but each  
one does it

(or not) individually.  This is cumbersome and error prone.

This patchset moves all locking to a central place.  This is  
complicated
by the fact that ppc's KVM_INTERRUPT and s390's KVM_S390_INTERRUPT  
break

the convention and need to run unlocked.


Why is the x86 non-kernel-pic path different?



Userspace issues the ioctl from a vcpu thread.

It has to, btw, since whether an interrupt can be injected or not  
depends on vcpu-synchronous registers: eflags.if and tpr/cr8.


On ppc we don't have a tpr, but eflags.if is basically the same as  
msr.ee.


The major difference apparently is that on ppc we KVM_INTERRUPT pulls  
the interrupt line. On vcpu_run we then check whether msr.ee is set  
and if so, trigger the interrupt.


I wonder why we don't do the same for x86. The current limitation on  
userspace checking eflags and the tpr seems cumbersome.


Alex




--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2 v2] pci: check caps from sysfs file open to read device dependent config space

2010-05-13 Thread Chris Wright
* Greg KH (g...@kroah.com) wrote:
> On Thu, May 13, 2010 at 10:43:07AM -0700, Chris Wright wrote:
> > * Alan Cox (a...@lxorguk.ukuu.org.uk) wrote:
> > > I agree with the problem - but IMHO the fix is to require opening the file
> > > checks CAP_SYS_something instead: not to hack the read method and make it
> > > even weirder and more un-Linux than it is now.
> > 
> > This patch does that.  Not as convenient from the KVM/libvirt point of view
> > because it is not prepared to do this setup before dropping privileges
> > and launching the VM.
> 
> So does that mean that this patch doesn't solve your original problem
> here?

Right, it means we have to change how we create a guest with a directly
assigned PCI device.

Currently KVM/libvirt is assuming that sysfs file ownership is sufficient
to read a sysfs file.  It chowns all relevant sysfs files and updates
security labels such that only that guest can access the files, then
drops privileges and launches the guest.

With the v2 patch we'll have to open the config space sysfs file in the
privileged context and pass it into the unprivileged one.  It is awkward,
but it should be doable.

thanks,
-chris
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2 v2] pci: check caps from sysfs file open to read device dependent config space

2010-05-13 Thread Greg KH
On Thu, May 13, 2010 at 10:43:07AM -0700, Chris Wright wrote:
> * Alan Cox (a...@lxorguk.ukuu.org.uk) wrote:
> > I agree with the problem - but IMHO the fix is to require opening the file
> > checks CAP_SYS_something instead: not to hack the read method and make it
> > even weirder and more un-Linux than it is now.
> 
> This patch does that.  Not as convenient from the KVM/libvirt point of view
> because it is not prepared to do this setup before dropping privileges
> and launching the VM.

So does that mean that this patch doesn't solve your original problem
here?

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM and the OOM-Killer

2010-05-13 Thread David S. Ahern

>> Not sure I like the idea of running a 64bit user space kernel on top
>> of a 32bit host, prefer to re-install.
>>
>> Can I just replace my kernel with a 64bit one, or do I have to
>> re-install the host O/S ?
> 
> You can run 32-bit userspace with a 64-bit kernel, or reinstall,
> whichever you prefer.
> 
> I once upgraded a 32-bit Fedora install to 64-bit, but that takes some
> tinkering.
> 

You can just install a 64-bit kernel. For rpm based systems you have to
"unpack" the rpm using rpm2cpio. The modules.dep file cannot be updated
-- need to generate that elsewhere -- and mkinitrd needs to be modified
to not try to strip modules (s,strip,true,).

That's all I had to do to plop a 64-bit kernel onto a 32-bit install.

David
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Re: Another SIGFPE in display code, now in cirrus

2010-05-13 Thread Stefano Stabellini
On Thu, 13 May 2010, Michael Tokarev wrote:
> Stefano Stabellini wrote:
> []
> > diff --git a/hw/cirrus_vga.c b/hw/cirrus_vga.c
> > index 9f61a01..81c443b 100644
> > --- a/hw/cirrus_vga.c
> > +++ b/hw/cirrus_vga.c
> 
> The same as with previous patch: Yellow screen
> (instead of crashing), and two lines on the
> stderr:
> 
> BUG: kvm_dirty_pages_log_enable_slot: invalid parameters
> BUG: kvm_dirty_pages_log_enable_slot: invalid parameters
> 

I tried to do the same thing on WinNT with qemu (thanks to
Michael for providing me the image) and it works OK with the
patch applied.

I think there must be another bug in the kvm dirty page handling...

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2 v2] pci: check caps from sysfs file open to read device dependent config space

2010-05-13 Thread Chris Wright
* Alan Cox (a...@lxorguk.ukuu.org.uk) wrote:
> I agree with the problem - but IMHO the fix is to require opening the file
> checks CAP_SYS_something instead: not to hack the read method and make it
> even weirder and more un-Linux than it is now.

This patch does that.  Not as convenient from the KVM/libvirt point of view
because it is not prepared to do this setup before dropping privileges
and launching the VM.

thanks,
-chris
--

From: Chris Wright 
Subject: [PATCH 2/2 v2] pci: check caps from sysfs file open to read device 
dependent config space

The PCI config space bin_attr read handler has a hardcoded CAP_SYS_ADMIN
check to verify privileges before allowing a user to read device
dependent config space.  This is meant to protect from an unprivileged
user potentially locking up the box.

When assigning a PCI device directly to a guest with libvirt and KVM,
the sysfs config space file is chown'd to the unprivileged user that
the KVM guest will run as.  The guest needs to have full access to the
device's config space since it's responsible for driving the device.
However, despite being the owner of the sysfs file, the CAP_SYS_ADMIN
check will not allow read access beyond the config header.

With this patch we check privileges against the capabilities used when
openining the sysfs file.  The allows a privileged process to open the
file and hand it to an unprivileged process, and the unprivileged process
can still read all of the config space.

Signed-off-by: Chris Wright 
---
 drivers/pci/pci-sysfs.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index ad44557..6309c5a 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -367,7 +368,7 @@ pci_read_config(struct file *filp, struct kobject *kobj,
u8 *data = (u8*) buf;
 
/* Several chips lock up trying to read undefined config space */
-   if (capable(CAP_SYS_ADMIN)) {
+   if (cap_raised(filp->f_cred->cap_effective, CAP_SYS_ADMIN)) {
size = dev->cfg_size;
} else if (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) {
size = 128;
-- 
1.6.5.2
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Prevent IRQ sharing for PCI passthrough

2010-05-13 Thread Chris Wright
* Avi Kivity (a...@redhat.com) wrote:
> On 05/13/2010 07:57 PM, Dirk Gouders wrote:
> >Hello List,
> >
> >please don't shout at me if the answer to my question is too obvious,
> 
> We won't shout at you.
> 
> >but in the German Linux Magazine I read an article about passing PCI
> >devices to KVM guests and the authors said that "shared interrupt
> >funcionality has to be disabled" which I read as "it is possible to
> >disable shared interrupt functionality".
> 
> It would be more correct to say "no go with shared host interrupts".
> 
> >I currently have the problem with a PCI ISDN card that is not MSI
> >capable and even if I change the PCI slot and disable all other slots,
> >USB etc. in the BIOS, that card still shares its interrupt with three
> >other devices.
> >
> >Even after extensive searching/reading, I am not sure if it possible to
> >prevent a PCI card from sharing an interrupt with other devices or
> >probably manually assign a free interrupt to that card via some kernel
> >parameter and would be glad if someone could give me an answer to that
> >question.
> 
> There is an ACPI _SRS method which can be used to move interrupts
> around.  However Linux doesn't appear to expose it.  Even if it did,
> the interrupt may be shared on the motherboard, in which case
> nothing would help (though you might be able to share it with unused
> devices).

The only way to work around this is to unbind the other drivers that
are sharing that interrupt.  This may not work if the other devices are
critical to you system (it's typically sharing w/ USB Host Controller).

Otherwise, we can't support this mode easily (there is some work in
progress to allow this to work if your ISDN device is PCI 2.3
compliant).

thanks,
-chris
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Prevent IRQ sharing for PCI passthrough

2010-05-13 Thread Avi Kivity

On 05/13/2010 07:57 PM, Dirk Gouders wrote:

Hello List,

please don't shout at me if the answer to my question is too obvious,
   


We won't shout at you.


but in the German Linux Magazine I read an article about passing PCI
devices to KVM guests and the authors said that "shared interrupt
funcionality has to be disabled" which I read as "it is possible to
disable shared interrupt functionality".
   


It would be more correct to say "no go with shared host interrupts".


I currently have the problem with a PCI ISDN card that is not MSI
capable and even if I change the PCI slot and disable all other slots,
USB etc. in the BIOS, that card still shares its interrupt with three
other devices.

Even after extensive searching/reading, I am not sure if it possible to
prevent a PCI card from sharing an interrupt with other devices or
probably manually assign a free interrupt to that card via some kernel
parameter and would be glad if someone could give me an answer to that
question.
   


There is an ACPI _SRS method which can be used to move interrupts 
around.  However Linux doesn't appear to expose it.  Even if it did, the 
interrupt may be shared on the motherboard, in which case nothing would 
help (though you might be able to share it with unused devices).


Chris?

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


pci device passthrough: Failed to assign irq

2010-05-13 Thread Gerd v. Egidy
Hi,

I'm trying to pass a pcie-device into a kvm guest. qemu-kvm is started
by libvirt with this command:

LC_ALL=C PATH=/sbin:/usr/sbin:/bin:/usr/bin HOME=/root USER=root LOGNAME=root 
QEMU_AUDIO_DRV=none 
/usr/bin/qemu-kvm -S -M pc-0.12 -enable-kvm -m 2048 -smp 
2,sockets=2,cores=1,threads=1 -name test
-uuid cefc7515-c0c2-27fc-8e7e-e0a65f9cb95b -nodefaults 
-chardev 
socket,id=monitor,path=/var/lib/libvirt/qemu/test.monitor,server,nowait 
-mon chardev=monitor,mode=readline -rtc base=utc -boot c 
-drive file=/dev/vg_main/test,if=none,id=drive-virtio-disk0,boot=on,format=raw 
-device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 
-drive if=none,media=cdrom,id=drive-ide0-1-0,readonly=on 
-device ide-drive,bus=ide.1,unit=0,drive=drive-ide0-1-0,id=ide0-1-0 
-device virtio-net-pci,vlan=0,id=net0,mac=52:54:00:d0:84:c7,bus=pci.0,addr=0x7 
-net tap,fd=59,vlan=0,name=hostnet0 
-chardev pty,id=serial0 
-device isa-serial,chardev=serial0 -usb -vnc 127.0.0.1:0 -k de 
-vga cirrus -device pci-assign,host=05:00.0,id=hostdev0,bus=pci.0,addr=0x8 
-device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x3 

It failes with:

Failed to assign irq for "hostdev0": Operation not permitted
Perhaps you are assigning a device that shares an IRQ with another device?

The irq of the device does not seem to be shared:

05:00.0 Ethernet controller: Intel Corporation 82574L Gigabit Network Connection
Subsystem: ASUSTeK Computer Inc. Device 8369
Flags: bus master, fast devsel, latency 0, IRQ 18
Memory at faee (32-bit, non-prefetchable) [size=128K]
I/O ports at cc00 [size=32]
Memory at faedc000 (32-bit, non-prefetchable) [size=16K]
Capabilities: [c8] Power Management version 2
Capabilities: [d0] MSI: Enable- Count=1/1 Maskable- 64bit+
Capabilities: [e0] Express Endpoint, MSI 00
Capabilities: [a0] MSI-X: Enable- Count=5 Masked-
Capabilities: [100] Advanced Error Reporting
Capabilities: [140] Device Serial Number 90-e6-ba-ff-ff-7c-c6-ef
Kernel driver in use: pci-stub
Kernel modules: e1000e

when kvm is not running and the device used as eth0 on host:

# cat /proc/interrupts 
[...]
 17:  0  0  0  0  0  0  
0  0   IO-APIC-fasteoi   sata_sil24
 23: 69  0  0  0  0  0  
0  0   IO-APIC-fasteoi   ehci_hcd:usb2
[...]
 37:  2  0  0  0  0  0  
0  0   PCI-MSI-edge  eth0-rx-0
 38:  4  0  0  0  0  0  
0  0   PCI-MSI-edge  eth0-tx-0
 39:  1  0  0  0  0  0  
0  0   PCI-MSI-edge  eth0
[...]

qemu-kvm is running as root, selinux is running in permissive mode. So the 
rights shouldn't be
an issue too.

The processor is an Intel Core i7-860 on an Asus P7F-E board, chipset is an 
Intel 3420.
Bios is ver 0607, vt-d is enabled in bios setup.

This is logged in dmesg, so to me it seems like the system is indeed vt-d 
capable:

[...]
ACPI: Core revision 20091214
DMAR: Host address width 36
DMAR: DRHD base: 0x00fed9 flags: 0x1
IOMMU fed9: ver 1:0 cap c90780106f0462 ecap f020e3
DMAR: RMRR base: 0x0e4000 end: 0x0e7fff
DMAR: RMRR base: 0x00bf7ec000 end: 0x00bf7f
DMAR: No ATSR found
Setting APIC routing to physical flat
..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
CPU0: Intel(R) Core(TM) i7 CPU 860  @ 2.80GHz stepping 05
[...]
IOMMU 0xfed9: using Queued invalidation
IOMMU: Setting RMRR:
IOMMU: Setting identity map for device :00:1d.0 [0xbf7ec000 - 0xbf80]
IOMMU: Setting identity map for device :00:1a.0 [0xbf7ec000 - 0xbf80]
IOMMU: Setting identity map for device :00:1d.0 [0xe4000 - 0xe8000]
IOMMU: Setting identity map for device :00:1a.0 [0xe4000 - 0xe8000]
IOMMU: Prepare 0-16MiB unity mapping for LPC
IOMMU: Setting identity map for device :00:1f.0 [0x0 - 0x100]
  alloc irq_desc for 29 on node 0
  alloc kstat_irqs on node 0
PCI-DMA: Intel(R) Virtualization Technology for Directed I/O
Intel PCLMULQDQ-NI instructions are not detected.
[...]

Kernel is 2.6.33.3-85.fc13.x86_64. I tried qemu-0.12.3-8.fc13 as it came with
Fedora 13 beta and qemu-kvm-0.12.4, same results.

Any idea whats going wrong or what else I could try?

Kind regards,

Gerd
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Prevent IRQ sharing for PCI passthrough

2010-05-13 Thread Dirk Gouders
Hello List,

please don't shout at me if the answer to my question is too obvious,
but in the German Linux Magazine I read an article about passing PCI
devices to KVM guests and the authors said that "shared interrupt
funcionality has to be disabled" which I read as "it is possible to
disable shared interrupt functionality".

I currently have the problem with a PCI ISDN card that is not MSI
capable and even if I change the PCI slot and disable all other slots,
USB etc. in the BIOS, that card still shares its interrupt with three
other devices.

Even after extensive searching/reading, I am not sure if it possible to
prevent a PCI card from sharing an interrupt with other devices or
probably manually assign a free interrupt to that card via some kernel
parameter and would be glad if someone could give me an answer to that
question.

Best regards,

Dirk
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [SeaBIOS] [PATCHv2] Support for booting from virtio disks

2010-05-13 Thread Avi Kivity

On 05/10/2010 06:58 PM, Anthony Liguori wrote:

Isn't this problem unrelated to this patch?  I mean if I start qemu with
two ide devices can I specify from qemu command line which one I want to
boot from?


That's sort of what I'm asking.  If you compare this approach to 
extboot, extboot provided a capability to select a disk.  I think it 
can be argued though that this isn't a necessary feature to carry over 
and I'm looking for additional opinions on that.


I'd say it's a necessary feature, but not one to carry over from the 
extboot implementation.  We have the seabios boot menu (how to reach 
it?), we need to store the nvram persistently,  and we need to extend 
the selection menu to qemu, but that's unrelated to this patch.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Re: Another SIGFPE in display code, now in cirrus

2010-05-13 Thread Jamie Lokier
Stefano Stabellini wrote:
> > I think we need to consider only dstpitch for a full invalidate.  We 
> > might be copying an offscreen bitmap into the screen, and srcpitch is 
> > likely to be the bitmap width instead of the screen pitch.
> 
> Agreed.

Even when copying on-screen (or partially on-screen), the srcpitch
does not affect the invalidated area.  The source area might be
strange (parallelogram, single line repeated), but srcpitch should
only affect whether qemu_console_copy can be used, not the
invalidation.

-- Jamie
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] pci: allow sysfs file owner to read device dependent config space

2010-05-13 Thread Chris Wright
* Alan Cox (a...@lxorguk.ukuu.org.uk) wrote:
> On Wed, 12 May 2010 18:29:57 -0700
> Chris Wright  wrote:
> 
> > The PCI config space bin_attr read handler has a hardcoded CAP_SYS_ADMIN
> > check to verify privileges before allowing a user to read device
> > dependent config space.  This is meant to protect from an unprivileged
> > user potentially locking up the box.
> 
> Or hacking it.

With read access?  You mean from read side-effect on a device register?

>  You could argue the correct requirement is to change it to
> require CAP_SYS_RAWIO in fact !

It's also pervasive through sysfs.  I found 22 CAP_SYS_ADMIN checks via
sysfs bin_attrs, most are firmware, config space, and similar; most of
which have could be abused to hack box in one way or another.

> > With this patch the sysfs file owner is also considered privileged enough
> > to read all of the config space.
> 
> Which breaks the main reason the check was there in the first place. To
> stop accidents of the form
> 
>   find / -exec grep {} "wibble"
> 
> blowing up in people's faces.

Right, this won't change w/out sysadmin intervention.

Currently, any random user doing that won't trigger an ill-behaving
device dependent config space read.  It requires an admin to chown the
file to the user, at which point you've given the device to the user.
This is typically only done in the privileged context of libvirt launching
a KVM guest that has a host PCI device directly assigned to it, and the
chown is typically to a non-shell user.

> I agree with the problem - but IMHO the fix is to require opening the file
> checks CAP_SYS_something instead:

We can't require CAP_SYS_something on open as that will break basic tools
like lspci.  We could note the privileges when opened and check them
later.

>  not to hack the read method and make it
> even weirder and more un-Linux than it is now.


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Wiki docs on counting and tracing KVM perf events

2010-05-13 Thread Avi Kivity

On 05/13/2010 05:35 PM, Stefan Hajnoczi wrote:

How to count and trace KVM perf events:

http://www.linux-kvm.org/page/Perf_events

I want to draw attention to this because traditional kvm_stat and
kvm_trace use has been moving over to the debugfs based tracing
mechanisms.  Perhaps we can flesh out documentation and examples of
common perf event usage.

   


Two things are missing to make this really useful:

- a continuously updating difference mode like kvm_stat
- subevents; for example kvm:kvm_exit is an aggregate of all exit types 
that can be split using filters to show individual exit reason statistics


The plan is to eventually remove kvm_stat based performance monitoring 
in favour of perf events.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Wiki docs on counting and tracing KVM perf events

2010-05-13 Thread Stefan Hajnoczi
How to count and trace KVM perf events:

http://www.linux-kvm.org/page/Perf_events

I want to draw attention to this because traditional kvm_stat and
kvm_trace use has been moving over to the debugfs based tracing
mechanisms.  Perhaps we can flesh out documentation and examples of
common perf event usage.

Stefan
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Re: Another SIGFPE in display code, now in cirrus

2010-05-13 Thread Michael Tokarev
Stefano Stabellini wrote:
[]
> diff --git a/hw/cirrus_vga.c b/hw/cirrus_vga.c
> index 9f61a01..81c443b 100644
> --- a/hw/cirrus_vga.c
> +++ b/hw/cirrus_vga.c

The same as with previous patch: Yellow screen
(instead of crashing), and two lines on the
stderr:

BUG: kvm_dirty_pages_log_enable_slot: invalid parameters
BUG: kvm_dirty_pages_log_enable_slot: invalid parameters

Thanks!

/mjt
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] fix "info cpus" halted state display

2010-05-13 Thread Jan Kiszka
Gleb Natapov wrote:
> When in-kernel irqchip is used env->halted is never used for anything
> except "info cpus" command.

In fact, it's used in a few more places, namely cpu_dump_state and the
gdbstub.

> Halted state is synced in
> kvm_arch_save_mpstate() and showed by do_info_cpus() but otherwise never
> looked at. Zeroing it here breaks "info cpus" since before
> do_info_cpus() outputs env->halted in io thread it is zeroed here when
> vcpu thread reenters kernel.

Looks good for current qemu-kvm.

Execution of kvm_cpu_exec once depended on env->halted, even for
in-kernel irqchip, right? Anyway, there are not such traces left here.
We will just need to look at it again when pushing in-kernel irqchips
upstream as its kvm loop looks different.

Jan

> 
> Signed-off-by: Gleb Natapov 
> diff --git a/qemu-kvm-x86.c b/qemu-kvm-x86.c
> index 61d9331..0ec2881 100644
> --- a/qemu-kvm-x86.c
> +++ b/qemu-kvm-x86.c
> @@ -922,10 +922,6 @@ void kvm_arch_load_regs(CPUState *env, int level)
>  if (env->kvm_vcpu_update_vapic)
>  kvm_tpr_enable_vapic(env);
>  }
> -if (kvm_irqchip_in_kernel()) {
> -/* Avoid deadlock: no user space IRQ will ever clear it. */
> -env->halted = 0;
> -}
>  
>  kvm_put_vcpu_events(env, level);
>  kvm_put_debugregs(env);
> --
>   Gleb.




signature.asc
Description: OpenPGP digital signature


Re: KVM and the OOM-Killer

2010-05-13 Thread Avi Kivity

On 05/13/2010 04:39 PM, James Stevens wrote:

I'd go with 64-bit at 2GB and above. It's both faster and safer.


safer, how? (apart from no lowmem exhaust).


Nothing apart from that (well, if you run nonpae you lose NX protection).



On a different subject, the qemu documentation says a guest VM can 
only have 2Gb of memory - does this still apply when using a 64bit 
host O/S ?


The documentation is outdated.




Since you can run a 64-bit kernel with your existing userspace, at least
you have a simple upgrade path.


Not sure I like the idea of running a 64bit user space kernel on top 
of a 32bit host, prefer to re-install.


Can I just replace my kernel with a 64bit one, or do I have to 
re-install the host O/S ?


You can run 32-bit userspace with a 64-bit kernel, or reinstall, 
whichever you prefer.


I once upgraded a 32-bit Fedora install to 64-bit, but that takes some 
tinkering.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Re: Another SIGFPE in display code, now in cirrus

2010-05-13 Thread Stefano Stabellini
On Thu, 13 May 2010, Avi Kivity wrote:
> >   /* extra x, y */
> > -sx = (src % ABS(s->cirrus_blt_srcpitch)) / depth;
> > -sy = (src / ABS(s->cirrus_blt_srcpitch));
> > +sx = (src % line_offset) / depth;
> > +sy = (src / line_offset);
> >
> 
> Does anything prevent the guest from programming the CRTC display pitch 
> to 0?
> 

Nope, I'll use line_offset there too to prevent possible divisions by 0.


> >   dx = (dst % ABS(s->cirrus_blt_dstpitch)) / depth;
> >   dy = (dst / ABS(s->cirrus_blt_dstpitch));
> >
> > @@ -725,18 +727,23 @@ static void cirrus_do_copy(CirrusVGAState *s, int 
> > dst, int src, int w, int h)
> >   s->cirrus_blt_dstpitch, s->cirrus_blt_srcpitch,
> >   s->cirrus_blt_width, s->cirrus_blt_height);
> >
> > -if (notify)
> > -   qemu_console_copy(s->vga.ds,
> > - sx, sy, dx, dy,
> > - s->cirrus_blt_width / depth,
> > - s->cirrus_blt_height);
> > -
> > -/* we don't have to notify the display that this portion has
> > -   changed since qemu_console_copy implies this */
> > -
> > -cirrus_invalidate_region(s, s->cirrus_blt_dstaddr,
> > -   s->cirrus_blt_dstpitch, s->cirrus_blt_width,
> > -   s->cirrus_blt_height);
> > + if (ABS(s->cirrus_blt_dstpitch) != line_offset ||
> > + ABS(s->cirrus_blt_srcpitch) != line_offset) {
> > + /* this is not going to happen very often */
> > + vga_hw_invalidate();
> >
> 
> I think we need to consider only dstpitch for a full invalidate.  We 
> might be copying an offscreen bitmap into the screen, and srcpitch is 
> likely to be the bitmap width instead of the screen pitch.
> 

Agreed.

> 
> > + } else {
> > + if (notify)
> > + /* we don't have to notify the display that this portion has
> > +changed since qemu_console_copy implies this */
> > + qemu_console_copy(s->vga.ds,
> > +   sx, sy, dx, dy,
> > +   s->cirrus_blt_width / depth,
> > +   s->cirrus_blt_height);
> > + else
> > + cirrus_invalidate_region(s, s->cirrus_blt_dstaddr,
> > +  s->cirrus_blt_dstpitch, 
> > s->cirrus_blt_width,
> > +  s->cirrus_blt_height);
> > + }
> >   }
> >
> >   static int cirrus_bitblt_videotovideo_copy(CirrusVGAState * s)
> > diff --git a/hw/cirrus_vga_rop.h b/hw/cirrus_vga_rop.h
> > index 39a7b72..80f135b 100644
> > --- a/hw/cirrus_vga_rop.h
> > +++ b/hw/cirrus_vga_rop.h
> > @@ -32,10 +32,10 @@ glue(cirrus_bitblt_rop_fwd_, ROP_NAME)(CirrusVGAState 
> > *s,
> >   dstpitch -= bltwidth;
> >   srcpitch -= bltwidth;
> >
> > -if (dstpitch<  0 || srcpitch<  0) {
> > -/* is 0 valid? srcpitch == 0 could be useful */
> > +if (dstpitch<  0)
> >   return;
> > -}
> > +if (srcpitch<  0)
> > +srcpitch = 0;
> >
> 
> Why?

I wouldn't want an operation that is supposed to be a forward copy
to become a backward copy instead.
With the way the code is currently written in cirrus_vga it shouldn't be
possible, but it might be a good idea to have the checks anyway.
Actually the limit I wrote is too strict, I'll fix it.
 

---


diff --git a/hw/cirrus_vga.c b/hw/cirrus_vga.c
index 9f61a01..81c443b 100644
--- a/hw/cirrus_vga.c
+++ b/hw/cirrus_vga.c
@@ -676,17 +676,19 @@ static void cirrus_do_copy(CirrusVGAState *s, int dst, 
int src, int w, int h)
 int sx, sy;
 int dx, dy;
 int width, height;
+uint32_t start_addr, line_offset, line_compare;
 int depth;
 int notify = 0;
 
 depth = s->vga.get_bpp(&s->vga) / 8;
 s->vga.get_resolution(&s->vga, &width, &height);
+s->vga.get_offsets(&s->vga, &line_offset, &start_addr, &line_compare);
 
 /* extra x, y */
-sx = (src % ABS(s->cirrus_blt_srcpitch)) / depth;
-sy = (src / ABS(s->cirrus_blt_srcpitch));
-dx = (dst % ABS(s->cirrus_blt_dstpitch)) / depth;
-dy = (dst / ABS(s->cirrus_blt_dstpitch));
+sx = (src % line_offset) / depth;
+sy = (src / line_offset);
+dx = (dst % line_offset) / depth;
+dy = (dst / line_offset);
 
 /* normalize width */
 w /= depth;
@@ -725,18 +727,22 @@ static void cirrus_do_copy(CirrusVGAState *s, int dst, 
int src, int w, int h)
  s->cirrus_blt_dstpitch, s->cirrus_blt_srcpitch,
  s->cirrus_blt_width, s->cirrus_blt_height);
 
-if (notify)
-   qemu_console_copy(s->vga.ds,
- sx, sy, dx, dy,
- s->cirrus_blt_width / depth,
- s->cirrus_blt_height);
-
-/* we don't have to notify the display that this portion has
-   changed since qemu_console_copy implies this */
-
-cirrus_invalidate_region(s, s->cirrus_blt_dstaddr,
-   

Re: [Qemu-devel] [RFC PATCH 0/2] Sheepdog: distributed storage system for QEMU

2010-05-13 Thread MORITA Kazutaka
At Thu, 13 May 2010 04:46:46 +0900,
MORITA Kazutaka wrote:
> 
> On 2010/05/12 20:38, Kevin Wolf wrote:
> > I'll have a closer look at your code later, but one thing I noticed is
> > that the new block driver is something in between a protocol and a
> > format driver (just like vvfat, which should stop doing so, too). I
> > think it ought to be a real protocol with the raw format driver on top
> > (or any other format - I don't see a reason why this should be
> > restricted to raw).
> > 
> > The one thing that is unusual about it as a protocol driver is that it
> > supports snapshots. However, while it is the first one, supporting
> > snapshots in protocols is a thing that could be generally useful to
> > support (for example thinking of a LVM protocol, which was discussed in
> > the past).
> > 
> 
> I agreed.  I'll modify the sheepdog driver patch as a protocol driver one,
> and remove unnecessary format check from my patch.
> 
> > So in block.c we could check if the format driver supports snapshots,
> > and if it doesn't we try again with the underlying protocol. Not sure
> > yet what we would do when both format and protocol do support snapshots
> > (qcow2 on sheepdog/LVM/...), but that's a detail.
> > 
> 

To support snapshot in a protocol, I'd like to call the hander of the
protocol driver in the following functions in block.c:

bdrv_snapshot_create
bdrv_snapshot_goto
bdrv_snapshot_delete
bdrv_snapshot_list
bdrv_save_vmstate
bdrv_load_vmstate

Is it okay?

In the case both format and protocol drivers support snapshots, I
think it is better to call the format driver handler.  Because qcow2
is well known as a snapshot support format, so when users use qcow2,
they expect to get snapshot with qcow2.

There is another problem to make the sheepdog driver be a protocol;
how to deal with protocol specific create_options?

For example, sheepdog supports cloning images as a format driver:

  $ qemu-img create -f sheepdog dst -b sheepdog:src

But if the sheepdog driver is a protocol, error will occur.

  $ qemu-img create sheepdog:dst -b sheepdog:src
  Unknown option 'backing_file'
  qemu-img: Backing file not supported for file format 'raw'

It is because the raw format doesn't support a backing_file option.
To support the protocol specific create_options, if the format driver
cannot parse some of the arguments, the protocol driver need to parse
them.

If my suggestions are okay, I'd like to prepare the patches.

Regards,

Kazutaka
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM and the OOM-Killer

2010-05-13 Thread Johannes Stezenbach
On Thu, May 13, 2010 at 03:39:20PM +0300, Avi Kivity wrote:
> On 05/13/2010 03:20 PM, James Stevens wrote:
> >
> >General advice seems to be, if you have more than 16Gb RAM then
> >you should run the VM host 64bit.
> >
> >We didn't see this issue on a server with 32Gb running the same
> >set of VMs.
> >
> 
> I'd go with 64-bit at 2GB and above.  It's both faster and safer.

There's an interesting posting from Linus which explains details:
http://www.realworldtech.com/forums/index.cfm?action=detail&id=78966&threadid=78766&roomid=2


HTH,
Johannes
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM and the OOM-Killer

2010-05-13 Thread James Stevens

I'd go with 64-bit at 2GB and above. It's both faster and safer.


safer, how? (apart from no lowmem exhaust).

On a different subject, the qemu documentation says a guest VM can only 
have 2Gb of memory - does this still apply when using a 64bit host O/S ?



The lowmem load is about 0.5% of guest memory, so 48GB means 240MB
lowmem allocated. Thin ice.


We currently only have about 12Gb used by the VM guests, so not hitting 
that issue - the rest of HIGHMEM is host disk cache.


Our experience is that the bottle-neck on number of VM guests is disk 
i/o - with loads of memory we've pretty much eliminated reads, so that 
means disk writes.



Since you can run a 64-bit kernel with your existing userspace, at least
you have a simple upgrade path.


Not sure I like the idea of running a 64bit user space kernel on top of 
a 32bit host, prefer to re-install.


Can I just replace my kernel with a 64bit one, or do I have to 
re-install the host O/S ?






James
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] fix "info cpus" halted state display

2010-05-13 Thread Gleb Natapov
When in-kernel irqchip is used env->halted is never used for anything
except "info cpus" command. Halted state is synced in
kvm_arch_save_mpstate() and showed by do_info_cpus() but otherwise never
looked at. Zeroing it here breaks "info cpus" since before
do_info_cpus() outputs env->halted in io thread it is zeroed here when
vcpu thread reenters kernel.

Signed-off-by: Gleb Natapov 
diff --git a/qemu-kvm-x86.c b/qemu-kvm-x86.c
index 61d9331..0ec2881 100644
--- a/qemu-kvm-x86.c
+++ b/qemu-kvm-x86.c
@@ -922,10 +922,6 @@ void kvm_arch_load_regs(CPUState *env, int level)
 if (env->kvm_vcpu_update_vapic)
 kvm_tpr_enable_vapic(env);
 }
-if (kvm_irqchip_in_kernel()) {
-/* Avoid deadlock: no user space IRQ will ever clear it. */
-env->halted = 0;
-}
 
 kvm_put_vcpu_events(env, level);
 kvm_put_debugregs(env);
--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM and the OOM-Killer

2010-05-13 Thread Avi Kivity

On 05/13/2010 03:20 PM, James Stevens wrote:

This is *NOT* a KVM issue, but may be worth adding into the FAQ...


We have a KVM host with 48Gb of RAM and run about 20 KVM clients on 
it. After some time - different time depending on the kernel version - 
the VM host kernel will start OOM-Killing the VM clients, even when 
there is lots of free RAM (>10Gb) and free SWAP (>34Gb).


This seems to be caused by the kernel running out of LOWMEM (memory 
below 1Gb) - because of the large amount of RAM a lot of LOWMEM 
(~400Mb) is used by the memory map (32 bytes per 4Kb page), add in the 
kernel itself and that leaves "only" about 460Mb of LOWMEM for kernel 
alloc.


This may not have been a problem, except Linux may also put cache 
blocks and user processes in LOWMEM - it seems this can then lead to a 
LOWMEM exhaust situation which triggers OOM-Killing even when there is 
LOADS of SWAP and HIGHMEM free.


Sadly, killing userland processes is not a good way to try and free 
LOWMEM, so what happens is a killing spree where by every process on 
the VM host gets killed (inc all the VMs, sysklogd, klogd, sshd, udevd 
etc).


This is very bad in 2.6.32.6, quite bad in 2.6.32.9,  better (but 
still bad in) 2.6.31.12 - currently testing 2.6.33.3


See https://bugzilla.kernel.org/show_bug.cgi?id=15058


General advice seems to be, if you have more than 16Gb RAM then you 
should run the VM host 64bit.


We didn't see this issue on a server with 32Gb running the same set of 
VMs.




I'd go with 64-bit at 2GB and above.  It's both faster and safer.

The lowmem load is about 0.5% of guest memory, so  48GB means 240MB 
lowmem allocated.  Thin ice.


Since you can run a 64-bit kernel with your existing userspace, at least 
you have a simple upgrade path.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


KVM and the OOM-Killer

2010-05-13 Thread James Stevens

This is *NOT* a KVM issue, but may be worth adding into the FAQ...


We have a KVM host with 48Gb of RAM and run about 20 KVM clients on it. 
After some time - different time depending on the kernel version - the 
VM host kernel will start OOM-Killing the VM clients, even when there is 
lots of free RAM (>10Gb) and free SWAP (>34Gb).


This seems to be caused by the kernel running out of LOWMEM (memory 
below 1Gb) - because of the large amount of RAM a lot of LOWMEM (~400Mb) 
is used by the memory map (32 bytes per 4Kb page), add in the kernel 
itself and that leaves "only" about 460Mb of LOWMEM for kernel alloc.


This may not have been a problem, except Linux may also put cache blocks 
and user processes in LOWMEM - it seems this can then lead to a LOWMEM 
exhaust situation which triggers OOM-Killing even when there is LOADS of 
SWAP and HIGHMEM free.


Sadly, killing userland processes is not a good way to try and free 
LOWMEM, so what happens is a killing spree where by every process on the 
VM host gets killed (inc all the VMs, sysklogd, klogd, sshd, udevd etc).


This is very bad in 2.6.32.6, quite bad in 2.6.32.9,  better (but still 
bad in) 2.6.31.12 - currently testing 2.6.33.3


See https://bugzilla.kernel.org/show_bug.cgi?id=15058


General advice seems to be, if you have more than 16Gb RAM then you 
should run the VM host 64bit.


We didn't see this issue on a server with 32Gb running the same set of VMs.



James
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/7] Consolidate vcpu ioctl locking

2010-05-13 Thread Avi Kivity

On 05/13/2010 03:18 PM, Alexander Graf wrote:



[PATCH 0/7] Consolidate vcpu ioctl locking

In general, all vcpu ioctls need to take the vcpu mutex, but each one does it
(or not) individually.  This is cumbersome and error prone.

This patchset moves all locking to a central place.  This is complicated
by the fact that ppc's KVM_INTERRUPT and s390's KVM_S390_INTERRUPT break
the convention and need to run unlocked.
 

Why is the x86 non-kernel-pic path different?
   


Userspace issues the ioctl from a vcpu thread.

It has to, btw, since whether an interrupt can be injected or not 
depends on vcpu-synchronous registers: eflags.if and tpr/cr8.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/7] Consolidate vcpu ioctl locking

2010-05-13 Thread Alexander Graf

On 13.05.2010, at 14:03, Avi Kivity wrote:

> On 05/13/2010 03:03 PM, Avi Kivity wrote:
>> On 05/13/2010 03:01 PM, Avi Kivity wrote:
>>> On 05/13/2010 02:57 PM, Alexander Graf wrote:
 
 Mind to give a high level overview on where you're moving which locks?
 
>>> 
>>> Um, looks like I forgot to fill in the patchset header.  Sorry.
>> 
>> Gar, I actually wrote it but forgot to save the file.
>> 
> 
> And it had some useful info:
> 
> [PATCH 0/7] Consolidate vcpu ioctl locking
> 
> In general, all vcpu ioctls need to take the vcpu mutex, but each one does it
> (or not) individually.  This is cumbersome and error prone.
> 
> This patchset moves all locking to a central place.  This is complicated
> by the fact that ppc's KVM_INTERRUPT and s390's KVM_S390_INTERRUPT break
> the convention and need to run unlocked.

Why is the x86 non-kernel-pic path different?

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/7] Consolidate vcpu ioctl locking

2010-05-13 Thread Avi Kivity

On 05/13/2010 03:03 PM, Avi Kivity wrote:

On 05/13/2010 03:01 PM, Avi Kivity wrote:

On 05/13/2010 02:57 PM, Alexander Graf wrote:


Mind to give a high level overview on where you're moving which locks?



Um, looks like I forgot to fill in the patchset header.  Sorry.


Gar, I actually wrote it but forgot to save the file.



And it had some useful info:

[PATCH 0/7] Consolidate vcpu ioctl locking

In general, all vcpu ioctls need to take the vcpu mutex, but each one 
does it

(or not) individually.  This is cumbersome and error prone.

This patchset moves all locking to a central place.  This is complicated
by the fact that ppc's KVM_INTERRUPT and s390's KVM_S390_INTERRUPT break
the convention and need to run unlocked.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/7] Consolidate vcpu ioctl locking

2010-05-13 Thread Avi Kivity

On 05/13/2010 03:01 PM, Avi Kivity wrote:

On 05/13/2010 02:57 PM, Alexander Graf wrote:


Mind to give a high level overview on where you're moving which locks?



Um, looks like I forgot to fill in the patchset header.  Sorry.


Gar, I actually wrote it but forgot to save the file.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/7] Consolidate vcpu ioctl locking

2010-05-13 Thread Avi Kivity

On 05/13/2010 02:57 PM, Alexander Graf wrote:


Mind to give a high level overview on where you're moving which locks?

   


Um, looks like I forgot to fill in the patchset header.  Sorry.

The patches move all vcpu ioctl locking from the individual ioctl 
handlers (e.g. kvm_vcpu_ioctl_set_cpuid()) to the top-level vcpu ioctl 
handler (kvm_vcpu_ioctl()).  So tons of vcpu_load()/vcpu_put() pairs 
(some of the missing) get replaced by a single pair.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/7] Consolidate vcpu ioctl locking

2010-05-13 Thread Alexander Graf

On 13.05.2010, at 13:17, Avi Kivity wrote:

> 
> 
> Avi Kivity (7):
>  KVM: PPC: Add missing vcpu_load()/vcpu_put() in vcpu ioctls
>  KVM: x86: Add missing locking to arch specific vcpu ioctls
>  KVM: move vcpu locking to dispatcher for generic vcpu ioctls
>  KVM: x86: Lock arch specific vcpu ioctls centrally
>  KVM: s390: Centrally lock arch specific vcpu ioctls
>  KVM: PPC: Centralize locking of arch specific vcpu ioctls
>  KVM: Consolidate arch specific vcpu ioctl locking

Mind to give a high level overview on where you're moving which locks?

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH 0/12] KVM, x86, ppc, asm-generic: moving dirty bitmaps to user space

2010-05-13 Thread Avi Kivity

On 05/10/2010 03:26 PM, Takuya Yoshikawa wrote:

No doubt get.org -> get.opt is measurable, but get.opt->switch.opt is
problematic. Have you tried profiling to see where the time is spent
(well I can guess, clearing the write access from the sptes).



Sorry but no, and I agree with your guess.
Anyway, I want to do some profiling to confirm this guess.


BTW, If we only think about performance improvement of time, optimized
get(get.opt) may be enough at this stage.

But if we consider the future expansion like using user allocated 
bitmaps,
new API's introduced for switch.opt won't become waste, I think, 
because we

need a structure to get and export bitmap addresses.


User allocated bitmaps have the advantage of reducing pinned memory.  
However we have plenty more pinned memory allocated in memory slots, so 
by itself, user allocated bitmaps don't justify this change.


Perhaps if we optimize memory slot write protection (I have some ideas 
about this) we can make the performance improvement more pronounced.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/7] KVM: Consolidate arch specific vcpu ioctl locking

2010-05-13 Thread Avi Kivity
Now that all arch specific ioctls have centralized locking, it is easy to
move it to the central dispatcher.

Signed-off-by: Avi Kivity 
---
 arch/powerpc/kvm/powerpc.c |   11 ---
 arch/s390/kvm/kvm-s390.c   |   13 ++---
 arch/x86/kvm/x86.c |2 --
 virt/kvm/kvm_main.c|2 --
 4 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index caeed7b..a1d8750 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -512,17 +512,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
void __user *argp = (void __user *)arg;
long r;
 
-   if (ioctl == KVM_INTERRUPT) {
+   switch (ioctl) {
+   case KVM_INTERRUPT: {
struct kvm_interrupt irq;
r = -EFAULT;
if (copy_from_user(&irq, argp, sizeof(irq)))
-   goto out_nolock;
+   goto out;
r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
-   goto out_nolock;
+   goto out;
}
 
-   vcpu_load(vcpu);
-   switch (ioctl) {
case KVM_ENABLE_CAP:
{
struct kvm_enable_cap cap;
@@ -537,8 +536,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
 
 out:
-   vcpu_put(vcpu);
-out_nolock:
return r;
 }
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 28cd8fd..fad1024 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -638,16 +638,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
void __user *argp = (void __user *)arg;
long r;
 
-   if (ioctl == KVM_S390_INTERRUPT) {
+   switch (ioctl) {
+   case KVM_S390_INTERRUPT: {
struct kvm_s390_interrupt s390int;
 
+   r = -EFAULT;
if (copy_from_user(&s390int, argp, sizeof(s390int)))
-   return -EFAULT;
-   return kvm_s390_inject_vcpu(vcpu, &s390int);
+   break;
+   r = kvm_s390_inject_vcpu(vcpu, &s390int);
+   break;
}
-
-   vcpu_load(vcpu);
-   switch (ioctl) {
case KVM_S390_STORE_STATUS:
r = kvm_s390_vcpu_store_status(vcpu, arg);
break;
@@ -666,7 +666,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
default:
r = -EINVAL;
}
-   vcpu_put(vcpu);
return r;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8b9e5ec..3a763de 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2288,7 +2288,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
int r;
struct kvm_lapic_state *lapic = NULL;
 
-   vcpu_load(vcpu);
switch (ioctl) {
case KVM_GET_LAPIC: {
r = -EINVAL;
@@ -2486,7 +2485,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
}
 out:
-   vcpu_put(vcpu);
kfree(lapic);
return r;
 }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 08b2ccd..5ee558c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1564,9 +1564,7 @@ out_free2:
break;
}
default:
-   vcpu_put(vcpu);
r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
-   vcpu_load(vcpu);
}
 out:
vcpu_put(vcpu);
-- 
1.7.0.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/7] KVM: s390: Centrally lock arch specific vcpu ioctls

2010-05-13 Thread Avi Kivity
Signed-off-by: Avi Kivity 
---
 arch/s390/kvm/kvm-s390.c |   40 +---
 1 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index e80f55e..28cd8fd 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -363,9 +363,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 
 static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
 {
-   vcpu_load(vcpu);
kvm_s390_vcpu_initial_reset(vcpu);
-   vcpu_put(vcpu);
return 0;
 }
 
@@ -415,14 +413,12 @@ static int kvm_arch_vcpu_ioctl_set_initial_psw(struct 
kvm_vcpu *vcpu, psw_t psw)
 {
int rc = 0;
 
-   vcpu_load(vcpu);
if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING)
rc = -EBUSY;
else {
vcpu->run->psw_mask = psw.mask;
vcpu->run->psw_addr = psw.addr;
}
-   vcpu_put(vcpu);
return rc;
 }
 
@@ -573,7 +569,7 @@ static int __guestcopy(struct kvm_vcpu *vcpu, u64 
guestdest, const void *from,
  * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit
  * KVM_S390_STORE_STATUS_PREFIXED: -> prefix
  */
-int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
+static int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long 
addr)
 {
const unsigned char archmode = 1;
int prefix;
@@ -635,45 +631,43 @@ int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, 
unsigned long addr)
return 0;
 }
 
-static int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long 
addr)
-{
-   int rc;
-
-   vcpu_load(vcpu);
-   rc = __kvm_s390_vcpu_store_status(vcpu, addr);
-   vcpu_put(vcpu);
-   return rc;
-}
-
 long kvm_arch_vcpu_ioctl(struct file *filp,
 unsigned int ioctl, unsigned long arg)
 {
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
+   long r;
 
-   switch (ioctl) {
-   case KVM_S390_INTERRUPT: {
+   if (ioctl == KVM_S390_INTERRUPT) {
struct kvm_s390_interrupt s390int;
 
if (copy_from_user(&s390int, argp, sizeof(s390int)))
return -EFAULT;
return kvm_s390_inject_vcpu(vcpu, &s390int);
}
+
+   vcpu_load(vcpu);
+   switch (ioctl) {
case KVM_S390_STORE_STATUS:
-   return kvm_s390_vcpu_store_status(vcpu, arg);
+   r = kvm_s390_vcpu_store_status(vcpu, arg);
+   break;
case KVM_S390_SET_INITIAL_PSW: {
psw_t psw;
 
+   r = -EFAULT;
if (copy_from_user(&psw, argp, sizeof(psw)))
-   return -EFAULT;
-   return kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw);
+   break;
+   r = kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw);
+   break;
}
case KVM_S390_INITIAL_RESET:
-   return kvm_arch_vcpu_ioctl_initial_reset(vcpu);
+   r = kvm_arch_vcpu_ioctl_initial_reset(vcpu);
+   break;
default:
-   ;
+   r = -EINVAL;
}
-   return -EINVAL;
+   vcpu_put(vcpu);
+   return r;
 }
 
 /* Section: memory related */
-- 
1.7.0.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/7] KVM: move vcpu locking to dispatcher for generic vcpu ioctls

2010-05-13 Thread Avi Kivity
All vcpu ioctls need to be locked, so instead of locking each one specifically
we lock at the generic dispatcher.

This patch only updates generic ioctls and leaves arch specific ioctls alone.

Signed-off-by: Avi Kivity 
---
 arch/ia64/kvm/kvm-ia64.c   |   11 ---
 arch/powerpc/kvm/book3s.c  |   16 
 arch/powerpc/kvm/booke.c   |   10 --
 arch/powerpc/kvm/powerpc.c |4 
 arch/s390/kvm/kvm-s390.c   |   16 
 arch/x86/kvm/x86.c |   36 ++--
 virt/kvm/kvm_main.c|   15 +++
 7 files changed, 17 insertions(+), 91 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index d5f4e91..29f6075 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -724,8 +724,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run)
int r;
sigset_t sigsaved;
 
-   vcpu_load(vcpu);
-
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
@@ -747,7 +745,6 @@ out:
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-   vcpu_put(vcpu);
return r;
 }
 
@@ -882,8 +879,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
int i;
 
-   vcpu_load(vcpu);
-
for (i = 0; i < 16; i++) {
vpd->vgr[i] = regs->vpd.vgr[i];
vpd->vbgr[i] = regs->vpd.vbgr[i];
@@ -930,8 +925,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
vcpu->arch.itc_offset = regs->saved_itc - kvm_get_itc(vcpu);
set_bit(KVM_REQ_RESUME, &vcpu->requests);
 
-   vcpu_put(vcpu);
-
return 0;
 }
 
@@ -1966,9 +1959,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
 {
-   vcpu_load(vcpu);
mp_state->mp_state = vcpu->arch.mp_state;
-   vcpu_put(vcpu);
return 0;
 }
 
@@ -1999,10 +1990,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu 
*vcpu,
 {
int r = 0;
 
-   vcpu_load(vcpu);
vcpu->arch.mp_state = mp_state->mp_state;
if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
r = vcpu_reset(vcpu);
-   vcpu_put(vcpu);
return r;
 }
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index b998abf..f6eac2f 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1047,8 +1047,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
 {
int i;
 
-   vcpu_load(vcpu);
-
regs->pc = kvmppc_get_pc(vcpu);
regs->cr = kvmppc_get_cr(vcpu);
regs->ctr = kvmppc_get_ctr(vcpu);
@@ -1069,8 +1067,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 
-   vcpu_put(vcpu);
-
return 0;
 }
 
@@ -1078,8 +1074,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
 {
int i;
 
-   vcpu_load(vcpu);
-
kvmppc_set_pc(vcpu, regs->pc);
kvmppc_set_cr(vcpu, regs->cr);
kvmppc_set_ctr(vcpu, regs->ctr);
@@ -1099,8 +1093,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 
-   vcpu_put(vcpu);
-
return 0;
 }
 
@@ -1110,8 +1102,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
int i;
 
-   vcpu_load(vcpu);
-
sregs->pvr = vcpu->arch.pvr;
 
sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
@@ -1131,8 +1121,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
}
}
 
-   vcpu_put(vcpu);
-
return 0;
 }
 
@@ -1142,8 +1130,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
int i;
 
-   vcpu_load(vcpu);
-
kvmppc_set_pvr(vcpu, sregs->pvr);
 
vcpu3s->sdr1 = sregs->u.s.sdr1;
@@ -1171,8 +1157,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
/* Flush the MMU after messing with the segments */
kvmppc_mmu_pte_flush(vcpu, 0, 0);
 
-   vcpu_put(vcpu);
-
return 0;
 }
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index a33ab8c..b687f43 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -485,8 +485,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
 {
int i;
 
-   vcpu_load(vcpu);
-
regs->pc = vcpu->arch.pc;
regs->cr 

[PATCH 6/7] KVM: PPC: Centralize locking of arch specific vcpu ioctls

2010-05-13 Thread Avi Kivity
Signed-off-by: Avi Kivity 
---
 arch/powerpc/kvm/powerpc.c |   12 
 1 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index e0fae7a..caeed7b 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -512,15 +512,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
void __user *argp = (void __user *)arg;
long r;
 
-   switch (ioctl) {
-   case KVM_INTERRUPT: {
+   if (ioctl == KVM_INTERRUPT) {
struct kvm_interrupt irq;
r = -EFAULT;
if (copy_from_user(&irq, argp, sizeof(irq)))
-   goto out;
+   goto out_nolock;
r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
-   break;
+   goto out_nolock;
}
+
+   vcpu_load(vcpu);
+   switch (ioctl) {
case KVM_ENABLE_CAP:
{
struct kvm_enable_cap cap;
@@ -535,6 +537,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
 
 out:
+   vcpu_put(vcpu);
+out_nolock:
return r;
 }
 
-- 
1.7.0.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/7] Consolidate vcpu ioctl locking

2010-05-13 Thread Avi Kivity


Avi Kivity (7):
  KVM: PPC: Add missing vcpu_load()/vcpu_put() in vcpu ioctls
  KVM: x86: Add missing locking to arch specific vcpu ioctls
  KVM: move vcpu locking to dispatcher for generic vcpu ioctls
  KVM: x86: Lock arch specific vcpu ioctls centrally
  KVM: s390: Centrally lock arch specific vcpu ioctls
  KVM: PPC: Centralize locking of arch specific vcpu ioctls
  KVM: Consolidate arch specific vcpu ioctl locking

 arch/ia64/kvm/kvm-ia64.c   |   11 ---
 arch/powerpc/kvm/book3s.c  |   10 +-
 arch/powerpc/kvm/booke.c   |5 ++-
 arch/powerpc/kvm/powerpc.c |7 +---
 arch/s390/kvm/kvm-s390.c   |   55 ++-
 arch/x86/kvm/x86.c |   69 +--
 virt/kvm/kvm_main.c|   13 
 7 files changed, 39 insertions(+), 131 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/7] KVM: x86: Lock arch specific vcpu ioctls centrally

2010-05-13 Thread Avi Kivity
Signed-off-by: Avi Kivity 
---
 arch/x86/kvm/x86.c |   41 ++---
 1 files changed, 2 insertions(+), 39 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eedb23b..8b9e5ec 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1531,16 +1531,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct 
kvm_msrs *msrs,
 {
int i, idx;
 
-   vcpu_load(vcpu);
-
idx = srcu_read_lock(&vcpu->kvm->srcu);
for (i = 0; i < msrs->nmsrs; ++i)
if (do_msr(vcpu, entries[i].index, &entries[i].data))
break;
srcu_read_unlock(&vcpu->kvm->srcu, idx);
 
-   vcpu_put(vcpu);
-
return i;
 }
 
@@ -1788,7 +1784,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
if (copy_from_user(cpuid_entries, entries,
   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
goto out_free;
-   vcpu_load(vcpu);
for (i = 0; i < cpuid->nent; i++) {
vcpu->arch.cpuid_entries[i].function = 
cpuid_entries[i].function;
vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1806,7 +1801,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
r = 0;
kvm_apic_set_version(vcpu);
kvm_x86_ops->cpuid_update(vcpu);
-   vcpu_put(vcpu);
 
 out_free:
vfree(cpuid_entries);
@@ -1827,11 +1821,9 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu 
*vcpu,
if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
goto out;
-   vcpu_load(vcpu);
vcpu->arch.cpuid_nent = cpuid->nent;
kvm_apic_set_version(vcpu);
kvm_x86_ops->cpuid_update(vcpu);
-   vcpu_put(vcpu);
return 0;
 
 out:
@@ -1844,7 +1836,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu 
*vcpu,
 {
int r;
 
-   vcpu_load(vcpu);
r = -E2BIG;
if (cpuid->nent < vcpu->arch.cpuid_nent)
goto out;
@@ -1856,7 +1847,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu 
*vcpu,
 
 out:
cpuid->nent = vcpu->arch.cpuid_nent;
-   vcpu_put(vcpu);
return r;
 }
 
@@ -2088,9 +2078,7 @@ out:
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
 {
-   vcpu_load(vcpu);
memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
-   vcpu_put(vcpu);
 
return 0;
 }
@@ -2098,11 +2086,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu 
*vcpu,
 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
 {
-   vcpu_load(vcpu);
memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
kvm_apic_post_state_restore(vcpu);
update_cr8_intercept(vcpu);
-   vcpu_put(vcpu);
 
return 0;
 }
@@ -2114,20 +2100,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu 
*vcpu,
return -EINVAL;
if (irqchip_in_kernel(vcpu->kvm))
return -ENXIO;
-   vcpu_load(vcpu);
 
kvm_queue_interrupt(vcpu, irq->irq, false);
 
-   vcpu_put(vcpu);
-
return 0;
 }
 
 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
 {
-   vcpu_load(vcpu);
kvm_inject_nmi(vcpu);
-   vcpu_put(vcpu);
 
return 0;
 }
@@ -2147,7 +2128,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu 
*vcpu,
int r;
unsigned bank_num = mcg_cap & 0xff, bank;
 
-   vcpu_load(vcpu);
r = -EINVAL;
if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
goto out;
@@ -2162,7 +2142,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu 
*vcpu,
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
 out:
-   vcpu_put(vcpu);
return r;
 }
 
@@ -2220,8 +2199,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu 
*vcpu,
 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
   struct kvm_vcpu_events *events)
 {
-   vcpu_load(vcpu);
-
events->exception.injected =
vcpu->arch.exception.pending &&
!kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2246,8 +2223,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct 
kvm_vcpu *vcpu,
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
 | KVM_VCPUEVENT_VALID_SHADOW);
-
-   vcpu_put(vcpu);
 }
 
 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
@@ -2258,8 +2233,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct 
kvm_vcpu *vcpu,
  | KVM_VCPUEVENT_VALID_SHADOW))
return -EINVAL;
 
-   vcpu_load(vcpu);
-
vcpu->arch.exception.

[PATCH 2/7] KVM: x86: Add missing locking to arch specific vcpu ioctls

2010-05-13 Thread Avi Kivity
Signed-off-by: Avi Kivity 
---
 arch/x86/kvm/x86.c |6 ++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4b1433f..f54ec24 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1844,6 +1844,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu 
*vcpu,
 {
int r;
 
+   vcpu_load(vcpu);
r = -E2BIG;
if (cpuid->nent < vcpu->arch.cpuid_nent)
goto out;
@@ -1855,6 +1856,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu 
*vcpu,
 
 out:
cpuid->nent = vcpu->arch.cpuid_nent;
+   vcpu_put(vcpu);
return r;
 }
 
@@ -2145,6 +2147,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu 
*vcpu,
int r;
unsigned bank_num = mcg_cap & 0xff, bank;
 
+   vcpu_load(vcpu);
r = -EINVAL;
if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
goto out;
@@ -2159,6 +2162,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu 
*vcpu,
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
 out:
+   vcpu_put(vcpu);
return r;
 }
 
@@ -2467,7 +2471,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&mce, argp, sizeof mce))
goto out;
+   vcpu_load(vcpu);
r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+   vcpu_put(vcpu);
break;
}
case KVM_GET_VCPU_EVENTS: {
-- 
1.7.0.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/7] KVM: PPC: Add missing vcpu_load()/vcpu_put() in vcpu ioctls

2010-05-13 Thread Avi Kivity
Signed-off-by: Avi Kivity 
---
 arch/powerpc/kvm/book3s.c |   10 ++
 arch/powerpc/kvm/booke.c  |   15 ++-
 2 files changed, 24 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 11f226f..b998abf 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1110,6 +1110,8 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
int i;
 
+   vcpu_load(vcpu);
+
sregs->pvr = vcpu->arch.pvr;
 
sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
@@ -1128,6 +1130,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
}
}
+
+   vcpu_put(vcpu);
+
return 0;
 }
 
@@ -1137,6 +1142,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
int i;
 
+   vcpu_load(vcpu);
+
kvmppc_set_pvr(vcpu, sregs->pvr);
 
vcpu3s->sdr1 = sregs->u.s.sdr1;
@@ -1163,6 +1170,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
/* Flush the MMU after messing with the segments */
kvmppc_mmu_pte_flush(vcpu, 0, 0);
+
+   vcpu_put(vcpu);
+
return 0;
 }
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index c922240..a33ab8c 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -485,6 +485,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
 {
int i;
 
+   vcpu_load(vcpu);
+
regs->pc = vcpu->arch.pc;
regs->cr = kvmppc_get_cr(vcpu);
regs->ctr = vcpu->arch.ctr;
@@ -505,6 +507,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 
+   vcpu_put(vcpu);
+
return 0;
 }
 
@@ -512,6 +516,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
 {
int i;
 
+   vcpu_load(vcpu);
+
vcpu->arch.pc = regs->pc;
kvmppc_set_cr(vcpu, regs->cr);
vcpu->arch.ctr = regs->ctr;
@@ -531,6 +537,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 
+   vcpu_put(vcpu);
+
return 0;
 }
 
@@ -559,7 +567,12 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, 
struct kvm_fpu *fpu)
 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
   struct kvm_translation *tr)
 {
-   return kvmppc_core_vcpu_translate(vcpu, tr);
+   int r;
+
+   vcpu_load(vcpu);
+   r = kvmppc_core_vcpu_translate(vcpu, tr);
+   vcpu_put(vcpu);
+   return r;
 }
 
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
-- 
1.7.0.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] pci: allow sysfs file owner to read device dependent config space

2010-05-13 Thread Daniel P. Berrange
On Thu, May 13, 2010 at 01:56:53PM +0300, Avi Kivity wrote:
> On 05/13/2010 04:29 AM, Chris Wright wrote:
> >The PCI config space bin_attr read handler has a hardcoded CAP_SYS_ADMIN
> >check to verify privileges before allowing a user to read device
> >dependent config space.  This is meant to protect from an unprivileged
> >user potentially locking up the box.
> >
> >When assigning a PCI device directly to a guest with libvirt and KVM,
> >the sysfs config space file is chown'd to the unprivileged user that
> >the KVM guest will run as.  The guest needs to have full access to the
> >device's config space since it's responsible for driving the device.
> >However, despite being the owner of the sysfs file, the CAP_SYS_ADMIN
> >check will not allow read access beyond the config header.
> >
> >With this patch the sysfs file owner is also considered privileged enough
> >to read all of the config space.
> >
> >   
> 
> Related questions:
> 
> - does sysfs support selinux labels?

With a recent enough kernel + selinux policy it does.

Regards,
Daniel
-- 
|: Red Hat, Engineering, London-o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :|
|: http://autobuild.org-o- http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-   F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH RFC 10/12] KVM: move dirty bitmaps to user space

2010-05-13 Thread Takuya Yoshikawa




mark_page_dirty is called with the mmu_lock spinlock held in set_spte.
Must find a way to move it outside of the spinlock section.



Oh, it's a serious problem. I have to consider it.


Avi, Marcelo,

Sorry but I have to say that mmu_lock spin_lock problem was completely out of
my mind. Although I looked through the code, it seems not easy to move the
set_bit_user to outside of spinlock section without breaking the semantics of
its protection.

So this may take some time to solve.

But personally, I want to do something for x86's "vmallc() every time" problem
even though moving dirty bitmaps to user space cannot be achieved soon.

In that sense, do you mind if we do double buffering without moving dirty 
bitmaps to
user space?

I know that the resource for vmalloc() is precious for x86 but even now, at the 
timing
of get_dirty_log, we use the same amount of memory as double buffering.


Thanks,
  Takuya

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] pci: allow sysfs file owner to read device dependent config space

2010-05-13 Thread Avi Kivity

On 05/13/2010 04:29 AM, Chris Wright wrote:

The PCI config space bin_attr read handler has a hardcoded CAP_SYS_ADMIN
check to verify privileges before allowing a user to read device
dependent config space.  This is meant to protect from an unprivileged
user potentially locking up the box.

When assigning a PCI device directly to a guest with libvirt and KVM,
the sysfs config space file is chown'd to the unprivileged user that
the KVM guest will run as.  The guest needs to have full access to the
device's config space since it's responsible for driving the device.
However, despite being the owner of the sysfs file, the CAP_SYS_ADMIN
check will not allow read access beyond the config header.

With this patch the sysfs file owner is also considered privileged enough
to read all of the config space.

   


Related questions:

- does sysfs support selinux labels?
- what about iommu?

So we give a user privileges to access a device.  But if we don't 
enforce iommu protection, you're giving the user access to the entire 
system.


With kvm, qemu wraps the device with an iommu, but this is less secure 
than it might be.  Better to have a privileged process open the device, 
irrevocably wrap it with an iommu, and pass the wrapped fd to qemu.


This is probably best done with uio, but it means all device access 
needs to be available through the uio fd, including pci config space access.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] pci: allow sysfs file owner to read device dependent config space

2010-05-13 Thread Alan Cox
On Wed, 12 May 2010 18:29:57 -0700
Chris Wright  wrote:

> The PCI config space bin_attr read handler has a hardcoded CAP_SYS_ADMIN
> check to verify privileges before allowing a user to read device
> dependent config space.  This is meant to protect from an unprivileged
> user potentially locking up the box.

Or hacking it. You could argue the correct requirement is to change it to
require CAP_SYS_RAWIO in fact !

> With this patch the sysfs file owner is also considered privileged enough
> to read all of the config space.

Which breaks the main reason the check was there in the first place. To
stop accidents of the form

find / -exec grep {} "wibble"

blowing up in people's faces.


I agree with the problem - but IMHO the fix is to require opening the file
checks CAP_SYS_something instead: not to hack the read method and make it
even weirder and more un-Linux than it is now.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[GIT PULL] last minute vhost-net fix

2010-05-13 Thread Michael S. Tsirkin
David, if it's not too late, please pull the following
last minute fix into 2.6.34.
Thanks!

The following changes since commit de02d72bb3cc5b3d4c873db4ca8291723dd48479:

  Merge branch 'master' of 
git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6 (2010-05-10 
22:53:41 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git net-2.6

Michael S. Tsirkin (1):
  vhost: fix barrier pairing

 drivers/vhost/vhost.c |7 ++-
 1 files changed, 6 insertions(+), 1 deletions(-)

-- 
MST
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Another SIGFPE in display code, now in cirrus

2010-05-13 Thread Paolo Bonzini

On 05/12/2010 05:57 PM, Stefano Stabellini wrote:

I guess even a src blt pitch of 0 could be useful there, however in
practice I think the only rop function that was written with this case in
mind has:

dstpitch -= bltwidth;
srcpitch -= bltwidth;

if (dstpitch<  0 || srcpitch<  0) {
 /* is 0 valid? srcpitch == 0 could be useful */
 return;
}


Note that here srcpitch == 0 is actually srcpitch == bltwidth, which 
_is_ obviously useful.


The "real" srcpitch == 0 case would result in srcpitch == -bltwidth, and 
it is actually quite useful if you want to stretch a Nx1 bitmap to NxN.


Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html