[COMMIT master] KVM: MMU: invalidate and flush on spte small-large page size change

2010-05-31 Thread Avi Kivity
From: Marcelo Tosatti mtosa...@redhat.com

Always invalidate spte and flush TLBs when changing page size, to make
sure different sized translations for the same address are never cached
in a CPU's TLB.

Currently the only case where this occurs is when a non-leaf spte pointer is
overwritten by a leaf, large spte entry. This can happen after dirty
logging is disabled on a memslot, for example.

Noticed by Andrea.

KVM-Stable-Tag
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 56f8c3c..43ec285 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1952,6 +1952,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 
*sptep,
 
child = page_header(pte  PT64_BASE_ADDR_MASK);
mmu_page_remove_parent_pte(child, sptep);
+   __set_spte(sptep, shadow_trap_nonpresent_pte);
+   kvm_flush_remote_tlbs(vcpu-kvm);
} else if (pfn != spte_to_pfn(*sptep)) {
pgprintk(hfn old %lx new %lx\n,
 spte_to_pfn(*sptep), pfn);
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH -v3] KVM, Fix QEMU-KVM is killed by guest SRAO MCE (resend)

2010-05-31 Thread Huang Ying
In common cases, guest SRAO MCE will cause corresponding poisoned page
be un-mapped and SIGBUS be sent to QEMU-KVM, then QEMU-KVM will relay
the MCE to guest OS.

But it is reported that if the poisoned page is accessed in guest
after un-mapped and before MCE is relayed to guest OS, QEMU-KVM will
be killed.

The reason is as follow. Because poisoned page has been un-mapped,
guest access will cause guest exit and kvm_mmu_page_fault will be
called. kvm_mmu_page_fault can not get the poisoned page for fault
address, so kernel and user space MMIO processing is tried in turn. In
user MMIO processing, poisoned page is accessed again, then QEMU-KVM
is killed by force_sig_info.

To fix the bug, kvm_mmu_page_fault send HWPOISON signal to QEMU-KVM
and do not try kernel and user space MMIO processing for poisoned
page.


Changelog:

v3:

- Make is_hwpoison_address workable for pud_large or pmd_large address.

v2:

- Use page table walker to determine whether the virtual address is
  poisoned to avoid change user space interface (via changing
  get_user_pages).

- Wrap bad page processing into kvm_handle_bad_page to avoid code
  duplicating.

Reported-by: Max Asbock masb...@linux.vnet.ibm.com
Signed-off-by: Huang Ying ying.hu...@intel.com
---
 arch/x86/kvm/mmu.c |   34 ++
 arch/x86/kvm/paging_tmpl.h |7 ++-
 include/linux/kvm_host.h   |1 +
 include/linux/mm.h |8 
 mm/memory-failure.c|   30 ++
 virt/kvm/kvm_main.c|   30 --
 6 files changed, 95 insertions(+), 15 deletions(-)

--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -32,6 +32,7 @@
 #include linux/compiler.h
 #include linux/srcu.h
 #include linux/slab.h
+#include linux/uaccess.h
 
 #include asm/page.h
 #include asm/cmpxchg.h
@@ -1953,6 +1954,27 @@ static int __direct_map(struct kvm_vcpu
return pt_write;
 }
 
+static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
+{
+   char buf[1];
+   void __user *hva;
+   int r;
+
+   /* Touch the page, so send SIGBUS */
+   hva = (void __user *)gfn_to_hva(kvm, gfn);
+   r = copy_from_user(buf, hva, 1);
+}
+
+static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+{
+   kvm_release_pfn_clean(pfn);
+   if (is_hwpoison_pfn(pfn)) {
+   kvm_send_hwpoison_signal(kvm, gfn);
+   return 0;
+   }
+   return 1;
+}
+
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 {
int r;
@@ -1976,10 +1998,8 @@ static int nonpaging_map(struct kvm_vcpu
pfn = gfn_to_pfn(vcpu-kvm, gfn);
 
/* mmio */
-   if (is_error_pfn(pfn)) {
-   kvm_release_pfn_clean(pfn);
-   return 1;
-   }
+   if (is_error_pfn(pfn))
+   return kvm_handle_bad_page(vcpu-kvm, gfn, pfn);
 
spin_lock(vcpu-kvm-mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2191,10 +2211,8 @@ static int tdp_page_fault(struct kvm_vcp
mmu_seq = vcpu-kvm-mmu_notifier_seq;
smp_rmb();
pfn = gfn_to_pfn(vcpu-kvm, gfn);
-   if (is_error_pfn(pfn)) {
-   kvm_release_pfn_clean(pfn);
-   return 1;
-   }
+   if (is_error_pfn(pfn))
+   return kvm_handle_bad_page(vcpu-kvm, gfn, pfn);
spin_lock(vcpu-kvm-mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -430,11 +430,8 @@ static int FNAME(page_fault)(struct kvm_
pfn = gfn_to_pfn(vcpu-kvm, walker.gfn);
 
/* mmio */
-   if (is_error_pfn(pfn)) {
-   pgprintk(gfn %lx is mmio\n, walker.gfn);
-   kvm_release_pfn_clean(pfn);
-   return 1;
-   }
+   if (is_error_pfn(pfn))
+   return kvm_handle_bad_page(vcpu-kvm, walker.gfn, pfn);
 
spin_lock(vcpu-kvm-mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -266,6 +266,7 @@ extern pfn_t bad_pfn;
 
 int is_error_page(struct page *page);
 int is_error_pfn(pfn_t pfn);
+int is_hwpoison_pfn(pfn_t pfn);
 int kvm_is_error_hva(unsigned long addr);
 int kvm_set_memory_region(struct kvm *kvm,
  struct kvm_userspace_memory_region *mem,
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1465,6 +1465,14 @@ extern int sysctl_memory_failure_recover
 extern void shake_page(struct page *p, int access);
 extern atomic_long_t mce_bad_pages;
 extern int soft_offline_page(struct page *page, int flags);
+#ifdef CONFIG_MEMORY_FAILURE
+int is_hwpoison_address(unsigned long addr);
+#else
+static inline int is_hwpoison_address(unsigned long addr)
+{
+   return 0;
+}
+#endif
 
 extern void dump_page(struct page *page);
 
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -45,6 +45,7 @@
 #include linux/page-isolation.h
 

[PATCH UPDATED 3/3] vhost: apply cpumask and cgroup to vhost pollers

2010-05-31 Thread Tejun Heo
Apply the cpumask and cgroup of the initializing task to the created
vhost poller.

Based on Sridhar Samudrala's patch.  Li Zefan spotted a bug in error
path, fixed.

Cc: Michael S. Tsirkin m...@redhat.com
Cc: Sridhar Samudrala samudrala.srid...@gmail.com
Cc: Li Zefan l...@cn.fujitsu.com
---
Updated accordingly.  Thanks.

 drivers/vhost/vhost.c |   36 +++-
 1 file changed, 31 insertions(+), 5 deletions(-)

Index: work/drivers/vhost/vhost.c
===
--- work.orig/drivers/vhost/vhost.c
+++ work/drivers/vhost/vhost.c
@@ -23,6 +23,7 @@
 #include linux/highmem.h
 #include linux/slab.h
 #include linux/kthread.h
+#include linux/cgroup.h

 #include linux/net.h
 #include linux/if_packet.h
@@ -176,12 +177,30 @@ repeat:
 long vhost_dev_init(struct vhost_dev *dev,
struct vhost_virtqueue *vqs, int nvqs)
 {
-   struct task_struct *poller;
-   int i;
+   struct task_struct *poller = NULL;
+   cpumask_var_t mask;
+   int i, ret = -ENOMEM;
+
+   if (!alloc_cpumask_var(mask, GFP_KERNEL))
+   goto out;

poller = kthread_create(vhost_poller, dev, vhost-%d, current-pid);
-   if (IS_ERR(poller))
-   return PTR_ERR(poller);
+   if (IS_ERR(poller)) {
+   ret = PTR_ERR(poller);
+   goto out;
+   }
+
+   ret = sched_getaffinity(current-pid, mask);
+   if (ret)
+   goto out;
+
+   ret = sched_setaffinity(poller-pid, mask);
+   if (ret)
+   goto out;
+
+   ret = cgroup_attach_task_current_cg(poller);
+   if (ret)
+   goto out;

dev-vqs = vqs;
dev-nvqs = nvqs;
@@ -202,7 +221,14 @@ long vhost_dev_init(struct vhost_dev *de
vhost_poll_init(dev-vqs[i].poll,
dev-vqs[i].handle_kick, POLLIN, dev);
}
-   return 0;
+
+   wake_up_process(poller);/* avoid contributing to loadavg */
+   ret = 0;
+out:
+   if (ret  poller)
+   kthread_stop(poller);
+   free_cpumask_var(mask);
+   return ret;
 }

 /* Caller should have device mutex */
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] cgroups: Add an API to attach a task to current task's cgroup

2010-05-31 Thread Tejun Heo
On 05/31/2010 03:07 AM, Li Zefan wrote:
 04:24, Tejun Heo wrote:
 From: Sridhar Samudrala samudrala.srid...@gmail.com

 Add a new kernel API to attach a task to current task's cgroup
 in all the active hierarchies.

 Signed-off-by: Sridhar Samudrala s...@us.ibm.com
 
 Acked-by: Li Zefan l...@cn.fujitsu.com
 
 btw: you lost the reviewed-by tag given by Paul Menage.

I only got bounced the original posting.  Michael, can you please add
it if/when you commit these?

Thank you.

-- 
tejun
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2-RFC 0/2] virtio: put last seen used index into ring itself

2010-05-31 Thread Jes Sorensen
On 05/30/10 13:22, Michael S. Tsirkin wrote:
 On Fri, May 28, 2010 at 11:56:54AM +0200, Jes Sorensen wrote:
 It looks pretty good to me, however one thing I have been thinking of
 while reading through it:

 Rather than storing a pointer within the ring struct, pointing into a
 position within the same struct. How about storing a byte offset instead
 and using a cast to get to the pointer position? That would avoid the
 pointer dereference, which is less effective cache wise and harder for
 the CPU to predict.

 Not sure whether it really matters performance wise, just a thought.
 
 I think this won't work: when PUBLISH_USED_IDX is negotiated,
 the pointer is to within the ring.

Hmmm shame, it would be a nice optimization.

Maybe it's time to introduce the v2 ring format, rather than having
adding more kludges to the existing one?

Cheers,
Jes
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH UPDATED 3/3] vhost: apply cpumask and cgroup to vhost pollers

2010-05-31 Thread Li Zefan
   poller = kthread_create(vhost_poller, dev, vhost-%d, current-pid);
 - if (IS_ERR(poller))
 - return PTR_ERR(poller);
 + if (IS_ERR(poller)) {
 + ret = PTR_ERR(poller);
 + goto out;

here...

 + }
 +
 + ret = sched_getaffinity(current-pid, mask);
 + if (ret)
 + goto out;
 +
 + ret = sched_setaffinity(poller-pid, mask);
 + if (ret)
 + goto out;
 +
 + ret = cgroup_attach_task_current_cg(poller);
 + if (ret)
 + goto out;
 
   dev-vqs = vqs;
   dev-nvqs = nvqs;
 @@ -202,7 +221,14 @@ long vhost_dev_init(struct vhost_dev *de
   vhost_poll_init(dev-vqs[i].poll,
   dev-vqs[i].handle_kick, POLLIN, dev);
   }
 - return 0;
 +
 + wake_up_process(poller);/* avoid contributing to loadavg */
 + ret = 0;
 +out:
 + if (ret  poller)

It's still buggy..

 + kthread_stop(poller);
 + free_cpumask_var(mask);
 + return ret;
  }
 
  /* Caller should have device mutex */
 
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2-RFC 0/2] virtio: put last seen used index into ring itself

2010-05-31 Thread Rusty Russell
On Thu, 27 May 2010 05:20:35 am Michael S. Tsirkin wrote:
 Here's a rewrite of the original patch with a new layout.
 I haven't tested it yet so no idea how this performs, but
 I think this addresses the cache bounce issue raised by Avi.
 Posting for early flames/comments.

Sorry, not without some evidence that it'll actually reduce cacheline
bouncing.  I *think* it will, but it's not obvious: the host may keep
looking at avail_idx as we're updating last_seen.  Or does qemu always
look at both together anyway?

Can someone convince me this is a win?
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] arch/s390/kvm: Use GFP_ATOMIC when a lock is held

2010-05-31 Thread Christian Borntraeger
 The containing function is called from several places.  At one of them, in
 the function __sigp_stop, the spin lock fi->lock is held.
[...]
 Signed-off-by: Julia Lawall ju...@diku.dk

Acked-by: Christian Borntraeger borntrae...@de.ibm.com

[...]
 --- a/arch/s390/kvm/sigp.c
 +++ b/arch/s390/kvm/sigp.c
 @@ -113,7 +113,7 @@ static int __inject_sigp_stop(struct kvm
  {
   struct kvm_s390_interrupt_info *inti;
 
 - inti = kzalloc(sizeof(*inti), GFP_KERNEL);
 + inti = kzalloc(sizeof(*inti), GFP_ATOMIC);
   if (!inti)
   return -ENOMEM;
   inti-type = KVM_S390_SIGP_STOP;

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm mmu: Don't calculate quadrant if tdp_enabled.

2010-05-31 Thread Gui Jianfeng
There's no need to calculate quadrant if tdp is enabled.

Signed-off-by: Gui Jianfeng guijianf...@cn.fujitsu.com
---
 arch/x86/kvm/mmu.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0bb9f17..431863b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1346,7 +1346,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
kvm_vcpu *vcpu,
if (role.direct)
role.cr4_pae = 0;
role.access = access;
-   if (vcpu-arch.mmu.root_level = PT32_ROOT_LEVEL) {
+   if (!tdp_enabled  vcpu-arch.mmu.root_level = PT32_ROOT_LEVEL) {
quadrant = gaddr  (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant = (1  ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
-- 
1.6.5.2
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [QEMU-KVM]: Megasas + TCM_Loop + SG_IO into Windows XP guests

2010-05-31 Thread Gerd Hoffmann

  Hi,


Also it's worth mentioning that I am still running C: on QEMU IDE
emulation, as I could not quite figure out how to boot from a megasas LD
with option-rom.  What QEMU CLI options were required to make this work
again..?


-option-rom $file

or

-device megasas,romfile=$file

cheers,
  Gerd

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/2] Setup scsi-bus xfer and xfer_mode for PR IN/OUT and Maintenance IN/OUT

2010-05-31 Thread Gerd Hoffmann

On 05/31/10 03:42, Nicholas A. Bellinger wrote:

From: Nicholas Bellingern...@linux-iscsi.org

Greetings Gerd, Kevin and Co,

Attached are two patches to add the necessary CDB parsing to determine 
SCSIRequest-cmd.xfer
(length) and SCSIRequest-cmd.mode (direction) for Persistent Reservation IN/OUT
CDBs and for Maintenance IN/OUT CDBs used for Asymmetric Logical Unit Access, 
et al.
There is a special case for the latter Maintenance CDBs with TYPE_ROM that has 
been
included in scsi_req_length().

Also, I should mention this is a temporary measure in order to ensure that we 
can actually
do passthrough of these CDBs into KVM Guest for lsi and megaraid HBA emulation. 
 What will
need to eventually happen is to get rid of scsi_req_xfer_mode() all together 
and just setup
SCSIRequest-cmd.mode based on CDB type in scsi_req_length(), instead of having 
to have another
switch(cdb[0]) statement for every SCSI WRITE CDB on the planet to set 
SCSI_XFER_TO_DEV.

Anyways, I will look at doing this conversion in scsi_req_length() at some 
point, but please
apply these for the moment so folks can get access to their SPC-4 Port LUNs 
with QEMU.  ;)


Patches look fine to me.

Acked-by: Gerd Hoffmann kra...@redhat.com

cheers,
  Gerd
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH UPDATED2 3/3] vhost: apply cpumask and cgroup to vhost pollers

2010-05-31 Thread Tejun Heo
Apply the cpumask and cgroup of the initializing task to the created
vhost poller.

Based on Sridhar Samudrala's patch.  Li Zefan spotted a bug in error
path (twice), fixed (twice).

Cc: Michael S. Tsirkin m...@redhat.com
Cc: Sridhar Samudrala samudrala.srid...@gmail.com
Cc: Li Zefan l...@cn.fujitsu.com
---
Heh... that's embarrassing.  Let's see if I can get it right the third
time.

Thank you.

 drivers/vhost/vhost.c |   36 
 1 file changed, 32 insertions(+), 4 deletions(-)

Index: work/drivers/vhost/vhost.c
===
--- work.orig/drivers/vhost/vhost.c
+++ work/drivers/vhost/vhost.c
@@ -23,6 +23,7 @@
 #include linux/highmem.h
 #include linux/slab.h
 #include linux/kthread.h
+#include linux/cgroup.h

 #include linux/net.h
 #include linux/if_packet.h
@@ -177,11 +178,29 @@ long vhost_dev_init(struct vhost_dev *de
struct vhost_virtqueue *vqs, int nvqs)
 {
struct task_struct *poller;
-   int i;
+   cpumask_var_t mask;
+   int i, ret = -ENOMEM;
+
+   if (!alloc_cpumask_var(mask, GFP_KERNEL))
+   goto out_free_mask;

poller = kthread_create(vhost_poller, dev, vhost-%d, current-pid);
-   if (IS_ERR(poller))
-   return PTR_ERR(poller);
+   if (IS_ERR(poller)) {
+   ret = PTR_ERR(poller);
+   goto out_free_mask;
+   }
+
+   ret = sched_getaffinity(current-pid, mask);
+   if (ret)
+   goto out_stop_poller;
+
+   ret = sched_setaffinity(poller-pid, mask);
+   if (ret)
+   goto out_stop_poller;
+
+   ret = cgroup_attach_task_current_cg(poller);
+   if (ret)
+   goto out_stop_poller;

dev-vqs = vqs;
dev-nvqs = nvqs;
@@ -202,7 +221,16 @@ long vhost_dev_init(struct vhost_dev *de
vhost_poll_init(dev-vqs[i].poll,
dev-vqs[i].handle_kick, POLLIN, dev);
}
-   return 0;
+
+   wake_up_process(poller);/* avoid contributing to loadavg */
+   ret = 0;
+   goto out_free_mask;
+
+out_stop_poller:
+   kthread_stop(poller);
+out_free_mask:
+   free_cpumask_var(mask);
+   return ret;
 }

 /* Caller should have device mutex */
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH -v3] KVM, Fix QEMU-KVM is killed by guest SRAO MCE (resend)

2010-05-31 Thread Avi Kivity

On 05/31/2010 09:28 AM, Huang Ying wrote:

In common cases, guest SRAO MCE will cause corresponding poisoned page
be un-mapped and SIGBUS be sent to QEMU-KVM, then QEMU-KVM will relay
the MCE to guest OS.

But it is reported that if the poisoned page is accessed in guest
after un-mapped and before MCE is relayed to guest OS, QEMU-KVM will
be killed.

The reason is as follow. Because poisoned page has been un-mapped,
guest access will cause guest exit and kvm_mmu_page_fault will be
called. kvm_mmu_page_fault can not get the poisoned page for fault
address, so kernel and user space MMIO processing is tried in turn. In
user MMIO processing, poisoned page is accessed again, then QEMU-KVM
is killed by force_sig_info.

To fix the bug, kvm_mmu_page_fault send HWPOISON signal to QEMU-KVM
and do not try kernel and user space MMIO processing for poisoned
page.
   


Applied, thanks.  Sorry about the delay.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/5] KVM: MMU: split the operations of kvm_mmu_zap_page()

2010-05-31 Thread Avi Kivity

On 05/31/2010 05:13 AM, Xiao Guangrong wrote:


Avi Kivity wrote:
   

On 05/30/2010 03:37 PM, Xiao Guangrong wrote:
 

Using kvm_mmu_prepare_zap_page() and kvm_mmu_commit_zap_page() to
split kvm_mmu_zap_page() function, then we can:

- traverse hlist safely
- easily to gather remote tlb flush which occurs during page zapped


+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct
kvm_mmu_page *sp)
+{
+int ret;
+
+trace_kvm_mmu_zap_page(sp);
+++kvm-stat.mmu_shadow_zapped;
+ret = mmu_zap_unsync_children(kvm, sp);
+kvm_mmu_page_unlink_children(kvm, sp);
+kvm_mmu_unlink_parents(kvm, sp);
+if (!sp-role.invalid   !sp-role.direct)
+unaccount_shadowed(kvm, sp-gfn);
+if (sp-unsync)
+kvm_unlink_unsync_page(kvm, sp);
+if (!sp-root_count)
+/* Count self */
+ret++;
+else
+kvm_reload_remote_mmus(kvm);
+
+sp-role.invalid = 1;
+list_move(sp-link,kvm-arch.invalid_mmu_pages);
+kvm_mmu_reset_last_pte_updated(kvm);
+return ret;
+}
+
+static void kvm_mmu_commit_zap_page(struct kvm *kvm)
+{
+struct kvm_mmu_page *sp, *n;
+
+if (list_empty(kvm-arch.invalid_mmu_pages))
+return;
+
+kvm_flush_remote_tlbs(kvm);
+list_for_each_entry_safe(sp, n,kvm-arch.invalid_mmu_pages, link) {
+WARN_ON(!sp-role.invalid);
+if (!sp-root_count)
+kvm_mmu_free_page(kvm, sp);
+}
+}
+

   

You're adding two new functions but not using them here?  Possibly in
the old kvm_mmu_zap_page()?
 

I use those in the next patch, it's not in kvm_mmu_zap_page(), it's used like:

hold mmu spin lock

kvm_mmu_prepare_zap_page page A
kvm_mmu_prepare_zap_page page B
kvm_mmu_prepare_zap_page page C
..
kvm_mmu_commit_zap_page
   


It would be better to rewrite kvm_mmu_zap_page() in terms of 
prepare/commit in the patch so we don't have two copies of the same 
thing (also easier to review).






This is a good idea, but belongs in a separate patch?  We can use it to
reclaim invalid pages before allocating new ones.

 

This patch is very simple and the kvm_mmu_commit_zap_page() function should depend 
on
kvm->arch.invalid_mmu_pages, so I think we don't need to separate this patch — your 
opinion? :-)

   


How about passing the list as a parameter to prepare() and commit()?  If 
the lifetime of the list is just prepare/commit, it shouldn't be a global.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/5] KVM: MMU: introduce some macros to cleanup hlist traverseing

2010-05-31 Thread Avi Kivity

On 05/31/2010 05:00 AM, Xiao Guangrong wrote:


 

+
+#define for_each_gfn_indirect_sp(kvm, sp, gfn, pos, n)\
+  hlist_for_each_entry_safe(sp, pos, n,\
+kvm-arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)\
+if (sp-gfn == gfn   !sp-role.direct)
+
+#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos, n)\
+  hlist_for_each_entry_safe(sp, pos, n,\
+kvm-arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)\
+if (sp-gfn == gfn   !sp-role.direct \
+!sp-role.invalid)

   

Shouldn't we always skip invalid gfns?
 

Actually, in kvm_mmu_unprotect_page() function, it need find out
invalid shadow pages:

|   hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
|   if (sp-gfn == gfn  !sp-role.direct) {
|   pgprintk(%s: gfn %lx role %x\n, __func__, gfn,
|sp-role.word);
|   r = 1;
|   if (kvm_mmu_zap_page(kvm, sp))
|   goto restart;
|   }

I'm not sure whether we can skip invalid sp here, since it can change this
function's return value. :-(
   


Hm.  Invalid pages don't need to be write protected.  So I think you can 
patch unprotect_page() to ignore invalid pages, and then you can convert 
it to the new macros which ignore invalid pages as well.


The invariant is: if an sp exists with !role.invalid and !unsync, then 
the page must be write protected.




What about providing both gfn and role to the macro?

 

In current code, no code simply use role and gfn to find sp,
in kvm_mmu_get_page(), we need do other work for
'sp-gfn == gfn  sp-role != role' sp, and other functions only need compare
some members in role, but not all members.
   


How about just gfn?  I think everything compares against that!

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] test: Add XSAVE unit test

2010-05-31 Thread Sheng Yang
Only test legal action so far, we can extend it later.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 kvm/test/config-x86-common.mak |5 +-
 kvm/test/x86/xsave.c   |  173 
 2 files changed, 177 insertions(+), 1 deletions(-)
 create mode 100644 kvm/test/x86/xsave.c

diff --git a/kvm/test/config-x86-common.mak b/kvm/test/config-x86-common.mak
index 9084e2d..2110e4e 100644
--- a/kvm/test/config-x86-common.mak
+++ b/kvm/test/config-x86-common.mak
@@ -24,7 +24,8 @@ FLATLIBS = lib/libcflat.a $(libgcc)
 
 tests-common = $(TEST_DIR)/vmexit.flat $(TEST_DIR)/tsc.flat \
$(TEST_DIR)/smptest.flat  $(TEST_DIR)/port80.flat \
-   $(TEST_DIR)/realmode.flat $(TEST_DIR)/msr.flat
+   $(TEST_DIR)/realmode.flat $(TEST_DIR)/msr.flat \
+   $(TEST_DIR)/xsave.flat
 
 test_cases: $(tests-common) $(tests)
 
@@ -58,6 +59,8 @@ $(TEST_DIR)/realmode.o: bits = 32
 
 $(TEST_DIR)/msr.flat: $(cstart.o) $(TEST_DIR)/msr.o
 
+$(TEST_DIR)/xsave.flat: $(cstart.o) $(TEST_DIR)/xsave.o
+
 arch_clean:
$(RM) $(TEST_DIR)/*.o $(TEST_DIR)/*.flat \
$(TEST_DIR)/.*.d $(TEST_DIR)/lib/.*.d $(TEST_DIR)/lib/*.o
diff --git a/kvm/test/x86/xsave.c b/kvm/test/x86/xsave.c
new file mode 100644
index 000..2d6ea07
--- /dev/null
+++ b/kvm/test/x86/xsave.c
@@ -0,0 +1,173 @@
+#include libcflat.h
+
+#ifdef __x86_64__
+#define uint64_t unsigned long
+#else
+#define uint64_t unsigned long long
+#endif
+
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+   unsigned int *ecx, unsigned int *edx)
+{
+   /* ecx is often an input as well as an output. */
+   asm volatile(cpuid
+   : =a (*eax),
+ =b (*ebx),
+ =c (*ecx),
+ =d (*edx)
+   : 0 (*eax), 2 (*ecx));
+}
+
+/*
+ * Generic CPUID function
+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+ * resulting in stale register contents being returned.
+ */
+void cpuid(unsigned int op,
+unsigned int *eax, unsigned int *ebx,
+unsigned int *ecx, unsigned int *edx)
+{
+   *eax = op;
+   *ecx = 0;
+   __cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+void cpuid_count(unsigned int op, int count,
+  unsigned int *eax, unsigned int *ebx,
+  unsigned int *ecx, unsigned int *edx)
+{
+   *eax = op;
+   *ecx = count;
+   __cpuid(eax, ebx, ecx, edx);
+}
+
+u64 xgetbv(u32 index)
+{
+   u32 eax, edx;
+
+   asm volatile(.byte 0x0f,0x01,0xd0 /* xgetbv */
+: =a (eax), =d (edx)
+: c (index));
+   return eax + ((u64)edx  32);
+}
+
+void xsetbv(u32 index, u64 value)
+{
+   u32 eax = value;
+   u32 edx = value  32;
+
+   asm volatile(.byte 0x0f,0x01,0xd1 /* xsetbv */
+: : a (eax), d (edx), c (index));
+}
+
+unsigned long read_cr4(void)
+{
+   unsigned long val;
+   asm volatile(mov %%cr4,%0 : =r (val));
+   return val;
+}
+
+void write_cr4(unsigned long val)
+{
+   asm volatile(mov %0,%%cr4: : r (val));
+}
+
+#define CPUID_1_ECX_XSAVE  (1  26)
+int check_xsave()
+{
+   unsigned int eax, ebx, ecx, edx;
+   cpuid(1, eax, ebx, ecx, edx);
+   if (ecx  CPUID_1_ECX_XSAVE)
+   return 1;
+   return 0;
+}
+
+uint64_t get_supported_xcr0()
+{
+   unsigned int eax, ebx, ecx, edx;
+   cpuid_count(0xd, 0, eax, ebx, ecx, edx);
+   printf(eax %x, ebx %x, ecx %x, edx %x\n,
+   eax, ebx, ecx, edx);
+   return eax + ((u64)edx  32);
+}
+
+#define X86_CR4_OSXSAVE0x0004
+#define XCR_XFEATURE_ENABLED_MASK   0x
+
+#define XSTATE_FP   0x1
+#define XSTATE_SSE  0x2
+#define XSTATE_YMM  0x4
+
+static unsigned int fault_mask;
+
+void pass_if(int condition)
+{
+   if (condition)
+   printf(Pass!\n);
+   else
+   printf(Fail!\n);
+}
+
+void pass_if_no_fault(void)
+{
+   pass_if(!fault_mask);
+   fault_mask = 0;
+}
+
+void pass_if_fault(unsigned int fault_bit)
+{
+   pass_if(fault_mask  fault_bit);
+   fault_mask = 0;
+}
+
+#define UD_VECTOR_MASK (1  6)
+#define GP_VECTOR_MASK (1  13)
+
+void test_xsave()
+{
+   unsigned int cr4;
+   uint64_t supported_xcr0;
+   uint64_t test_bits;
+
+   supported_xcr0 = get_supported_xcr0();
+   printf(Supported XCR0 bits: 0x%x\n, supported_xcr0);
+
+   printf(Check minimal XSAVE required bits: );
+   test_bits = XSTATE_FP | XSTATE_SSE | XSTATE_YMM;
+   pass_if((supported_xcr0  test_bits) == test_bits);
+
+   printf(Check CR4 OSXSAVE: );
+   cr4 = read_cr4();
+   write_cr4(cr4 | X86_CR4_OSXSAVE);
+   pass_if_no_fault();
+
+   printf(Check XSETBV for XCR0 bits\n);
+   printf(Legal tests\n);
+   printf(XSTATE_FP: );
+   test_bits = 

Re: [PATCH] KVM: x86: XSAVE/XRSTOR live migration support

2010-05-31 Thread Sheng Yang
On Thursday 27 May 2010 18:02:31 Avi Kivity wrote:
 On 05/27/2010 12:48 PM, Sheng Yang wrote:
  This patch enable save/restore of xsave state.
  
  Signed-off-by: Sheng Yangsh...@linux.intel.com
  ---
  
arch/x86/include/asm/kvm.h |   29 
arch/x86/kvm/x86.c |   79
 include/linux/kvm.h  
 |6 +++
 
 Documentation/kvm/api.txt +
 
  +/* for KVM_CAP_XSAVE */
  +struct kvm_xsave {
  +   struct {
  +   __u16 cwd;
  +   __u16 swd;
  +   __u16 twd;
  +   __u16 fop;
  +   __u64 rip;
  +   __u64 rdp;
  +   __u32 mxcsr;
  +   __u32 mxcsr_mask;
  +   __u32 st_space[32];
  +   __u32 xmm_space[64];
  +   __u32 padding[12];
  +   __u32 sw_reserved[12];
  +   } i387;
  +   struct {
  +   __u64 xstate_bv;
  +   __u64 reserved1[2];
  +   __u64 reserved2[5];
  +   } xsave_hdr;
  +   struct {
  +   __u32 ymmh_space[64];
  +   } ymmh;
  +   __u64 xcr0;
  +   __u32 padding[256];
  +};
 
 Need to reserve way more space here for future xsave growth.  I think at
 least 4K.  LRB wa 32x512bit = 1K (though it probably isn't a candidate
 for vmx).  Would be good to get an opinion from your processor architects.
 
 I don't think we need to detail the contents of the structures since
 they're described by the SDM; so we can have just a large array that is
 1:1 with the xsave as saved by the fpu.

I think we can reserve one page here. But one big array makes it harder to work 
with the QEmu CPUState. Do we need lots of macros in QEmu to parse the array? Also 
it's easier to convert get/set_fpu to a get/set_xsave interface using the current 
structure, I think.

--
regards
Yang, Sheng

 
 If we do that then xcr0 needs to be in a separate structure, say
 kvm_xcr, with a flags field and reserved space of its own for future xcr
 growth.
 
  @@ -2363,6 +2366,59 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct
  kvm_vcpu *vcpu,
  
  return 0;

}
  
  +static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
  +   struct kvm_xsave *guest_xsave)
  +{
  +   struct xsave_struct *xsave =vcpu-arch.guest_fpu.state-xsave;
  +
  +   if (!cpu_has_xsave)
  +   return;
 
 Hm, it would be nice to make it backward compatible and return the
 legacy fpu instead.  I think the layouts are compatible?
 
  +
  +   guest_xsave-i387.cwd = xsave-i387.cwd;
  +   guest_xsave-i387.swd = xsave-i387.swd;
  +   guest_xsave-i387.twd = xsave-i387.twd;
  +   guest_xsave-i387.fop = xsave-i387.fop;
  +   guest_xsave-i387.rip = xsave-i387.rip;
  +   guest_xsave-i387.rdp = xsave-i387.rdp;
  +   memcpy(guest_xsave-i387.st_space, xsave-i387.st_space, 128);
  +   memcpy(guest_xsave-i387.xmm_space, xsave-i387.xmm_space,
  +   sizeof guest_xsave-i387.xmm_space);
  +
  +   guest_xsave-xsave_hdr.xstate_bv = xsave-xsave_hdr.xstate_bv;
  +   memcpy(guest_xsave-ymmh.ymmh_space, xsave-ymmh.ymmh_space,
  +   sizeof xsave-ymmh.ymmh_space);
 
 And we can do a big memcpy here.  But we need to limit it to what the
 host actually allocated.
 
  +
  +   guest_xsave-xcr0 = vcpu-arch.xcr0;
  +}
  +
  
long kvm_arch_vcpu_ioctl(struct file *filp,

   unsigned int ioctl, unsigned long arg)

{
  
  @@ -2564,6 +2620,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
  
  r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu,dbgregs);
  break;
  
  }
  
  +   case KVM_GET_XSAVE: {
  +   struct kvm_xsave xsave;
 
 Too big for stack (especially if we reserve room for growth).
 
  diff --git a/include/linux/kvm.h b/include/linux/kvm.h
  index 23ea022..5006761 100644
  --- a/include/linux/kvm.h
  +++ b/include/linux/kvm.h
  @@ -524,6 +524,9 @@ struct kvm_enable_cap {
  
#define KVM_CAP_PPC_OSI 52
#define KVM_CAP_PPC_UNSET_IRQ 53
#define KVM_CAP_ENABLE_CAP 54
  
  +#ifdef __KVM_HAVE_XSAVE
  +#define KVM_CAP_XSAVE 55
  +#endif
 
 Might make sense to have a separate KVM_CAP_XCR, just for consistency.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: x86: XSAVE/XRSTOR live migration support

2010-05-31 Thread Avi Kivity

On 05/31/2010 02:21 PM, Sheng Yang wrote:



Need to reserve way more space here for future xsave growth.  I think at
least 4K.  LRB was 32x512bit = 1K (though it probably isn't a candidate
for vmx).  Would be good to get an opinion from your processor architects.

I don't think we need to detail the contents of the structures since
they're described by the SDM; so we can have just a large array that is
1:1 with the xsave as saved by the fpu.
 

I think we can reserve one page here. But one big array makes it harder to work
with QEmu CPUState. Do we need lots of marcos in QEmu to parse the array? Also
it's easier to transfer get/set_fpu to get/set_xsave interface using current
structure I think.
   


We'll need that code somewhere, so we aren't losing anything by putting 
it in userspace (in fact, qemu already has to have most of this code 
since it supports fxsave/fxrstor emulation).


What we gain is that if we make a bug, it is easier to fix it in 
userspace than in the kernel.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM: MMU: always invalidate and flush on spte page size change

2010-05-31 Thread Avi Kivity

On 05/30/2010 06:19 PM, Marcelo Tosatti wrote:

On Sun, May 30, 2010 at 01:28:19PM +0300, Avi Kivity wrote:
   

On 05/28/2010 03:44 PM, Marcelo Tosatti wrote:
 

Always invalidate spte and flush TLBs when changing page size, to make
sure different sized translations for the same address are never cached
in a CPU's TLB.

The first case where this occurs is when a non-leaf spte pointer is
overwritten by a leaf, large spte entry. This can happen after dirty
logging is disabled on a memslot, for example.

The second case is a leaf, large spte entry is overwritten with a
non-leaf spte pointer, in __direct_map. Note this cannot happen now
because the only potential source of such overwrite is dirty logging
being enabled, which zaps all MMU pages. But this might change
in the future, so better be robust against it.

Noticed by Andrea.

KVM-Stable-Tag
Signed-off-by: Marcelo Tosattimtosa...@redhat.com

Index: kvm/arch/x86/kvm/mmu.c
===
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -1952,6 +1952,8 @@ static void mmu_set_spte(struct kvm_vcpu

child = page_header(pte   PT64_BASE_ADDR_MASK);
mmu_page_remove_parent_pte(child, sptep);
+   __set_spte(sptep, shadow_trap_nonpresent_pte);
+   kvm_flush_remote_tlbs(vcpu-kvm);
} else if (pfn != spte_to_pfn(*sptep)) {
pgprintk(hfn old %lx new %lx\n,
 spte_to_pfn(*sptep), pfn);
   

Applied this bit.

 

@@ -2015,6 +2017,16 @@ static int __direct_map(struct kvm_vcpu
break;
}

+   if (is_shadow_present_pte(*iterator.sptep)
+   !is_large_pte(*iterator.sptep))
+   continue;
+
+   if (is_large_pte(*iterator.sptep)) {
+   rmap_remove(vcpu-kvm, iterator.sptep);
+   __set_spte(iterator.sptep, shadow_trap_nonpresent_pte);
+   kvm_flush_remote_tlbs(vcpu-kvm);
+   }
+
   

Don't we have exactly the same issue in FNAME(fetch)()?
 

Yes and its already handled there:

 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
 continue;

 if (is_large_pte(*sptep)) {
 rmap_remove(vcpu-kvm, sptep);
 __set_spte(sptep, shadow_trap_nonpresent_pte);
 kvm_flush_remote_tlbs(vcpu-kvm);
 }

   


Well that bit of code wants deduplication under a good name.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] VFIO driver: Non-privileged user level PCI drivers

2010-05-31 Thread Avi Kivity

On 05/30/2010 05:53 PM, Michael S. Tsirkin wrote:


So what I suggested is failing any kind of access until iommu
is assigned.
   


So, the kernel driver must be aware of the iommu.  In which case it may 
as well program it.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7] KVM: VMX: Enable XSAVE/XRSTOR for guest

2010-05-31 Thread Sheng Yang
From: Dexuan Cui dexuan@intel.com

This patch enable guest to use XSAVE/XRSTOR instructions.

We assume that host_xcr0 would use all possible bits that OS supported.

And we loaded xcr0 in the same way we handled fpu - do it as late as we can.

Signed-off-by: Dexuan Cui dexuan@intel.com
Signed-off-by: Sheng Yang sh...@linux.intel.com
---

Change from v6:

Make kvm_set_xcr() generic.

 arch/x86/include/asm/kvm_host.h |2 +
 arch/x86/include/asm/vmx.h  |1 +
 arch/x86/kvm/kvm_cache_regs.h   |6 ++
 arch/x86/kvm/vmx.c  |   13 
 arch/x86/kvm/x86.c  |  126 --
 include/linux/kvm_host.h|2 +-
 6 files changed, 142 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d08bb4a..271487b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -302,6 +302,7 @@ struct kvm_vcpu_arch {
} update_pte;
 
struct fpu guest_fpu;
+   u64 xcr0;
 
gva_t mmio_fault_cr2;
struct kvm_pio_request pio;
@@ -605,6 +606,7 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long 
*val);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+void kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9e6779f..346ea66 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -266,6 +266,7 @@ enum vmcs_field {
 #define EXIT_REASON_EPT_VIOLATION   48
 #define EXIT_REASON_EPT_MISCONFIG   49
 #define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
 
 /*
  * Interruption-information format
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index d2a98f8..6491ac8 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -71,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
return kvm_read_cr4_bits(vcpu, ~0UL);
 }
 
+static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
+{
+   return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
+   | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
+}
+
 #endif
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 99ae513..8649627 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -36,6 +36,8 @@
 #include asm/vmx.h
 #include asm/virtext.h
 #include asm/mce.h
+#include asm/i387.h
+#include asm/xcr.h
 
 #include trace.h
 
@@ -3354,6 +3356,16 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu)
return 1;
 }
 
+static int handle_xsetbv(struct kvm_vcpu *vcpu)
+{
+   u64 new_bv = kvm_read_edx_eax(vcpu);
+   u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+
+   kvm_set_xcr(vcpu, index, new_bv);
+   skip_emulated_instruction(vcpu);
+   return 1;
+}
+
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
@@ -3632,6 +3644,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu 
*vcpu) = {
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_WBINVD]  = handle_wbinvd,
+   [EXIT_REASON_XSETBV]  = handle_xsetbv,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
[EXIT_REASON_MCE_DURING_VMENTRY]  = handle_machine_check,
[EXIT_REASON_EPT_VIOLATION]   = handle_ept_violation,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7be1d36..0fcd8de 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -64,6 +64,7 @@
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
+ | X86_CR4_OSXSAVE \
  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -149,6 +150,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ NULL }
 };
 
+u64 __read_mostly host_xcr0;
+
+static inline u32 bit(int bitno)
+{
+   return 1 << (bitno & 31);
+}
+
 static void kvm_on_user_return(struct user_return_notifier *urn)
 {
unsigned slot;
@@ -473,6 +481,58 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
+int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+   u64 xcr0;
+
+   /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
+   if (index != XCR_XFEATURE_ENABLED_MASK)
+

Re: [PATCHv2-RFC 0/2] virtio: put last seen used index into ring itself

2010-05-31 Thread Michael S. Tsirkin
On Mon, May 31, 2010 at 05:16:42PM +0930, Rusty Russell wrote:
 On Thu, 27 May 2010 05:20:35 am Michael S. Tsirkin wrote:
  Here's a rewrite of the original patch with a new layout.
  I haven't tested it yet so no idea how this performs, but
  I think this addresses the cache bounce issue raised by Avi.
  Posting for early flames/comments.
 
 Sorry, not without some evidence that it'll actually reduce cacheline
 bouncing.  I *think* it will, but it's not obvious: the host may keep
 looking at avail_idx as we're updating last_seen.  Or does qemu always
 look at both together anyway?
 Can someone convince me this is a win?
 Rusty.

What really happens is host looks at flags and last_seen together.
And flags happens to be in the same cache line with avail idx.
So to get an obvious win, we should put flags and last_seen
in a separate cache line from avail, which is easy - just add some padding.

And I'll relax the requirement from guest to only require it to update
last_seen when interrupts are enabled. This way flags and
last_seen are written together and read together.

Makes sense?
-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2-RFC 0/2] virtio: put last seen used index into ring itself

2010-05-31 Thread Michael S. Tsirkin
On Mon, May 31, 2010 at 09:36:00AM +0200, Jes Sorensen wrote:
 On 05/30/10 13:22, Michael S. Tsirkin wrote:
  On Fri, May 28, 2010 at 11:56:54AM +0200, Jes Sorensen wrote:
  It looks pretty good to me, however one thing I have been thinking of
  while reading through it:
 
  Rather than storing a pointer within the ring struct, pointing into a
  position within the same struct. How about storing a byte offset instead
  and using a cast to get to the pointer position? That would avoid the
  pointer dereference, which is less effective cache wise and harder for
  the CPU to predict.
 
  Not sure whether it really matters performance wise, just a thought.
  
  I think this won't work: when PUBLISH_USED_IDX is negotiated,
  the pointer is to within the ring.
 
 Hmmm shame, it would be a nice optimization.
 
 Maybe it's time to introduce the v2 ring format, rather than having
 adding more kludges to the existing one?
 
 Cheers,
 Jes

There has been discussion about a ring format that does not
use indexes at all. My guess is that would be a good point
for v2 ring format. But making that a product
and tuning might take a while. So definitely something to
keep in mind but I would not want that to block this optimization.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Add Documentation/kvm/msr.txt

2010-05-31 Thread Glauber Costa
On Sun, May 30, 2010 at 12:14:21PM +0300, Avi Kivity wrote:
 On 05/27/2010 07:02 PM, Glauber Costa wrote:
 
 +
 +Custom MSR list
 +
 +
 +The current supported Custom MSR list is:
 +
 +MSR_KVM_WALL_CLOCK:  0x11
 +
 +  data: physical address of a memory area.
 Which must be in guest RAM (i.e., don't point it somewhere random
 and expect the hypervisor to allocate it for you).
 
 Must be aligned to 4 bytes (we don't enforce it though).
 I don't see the reason for it.
 
 Insurance.
 
 If this is a requirement, our own implementation
 is failing to meet it.
 
 How are we failing to meet it?  All guests align to at least four bytes.
static struct pvclock_wall_clock wall_clock;

Unless there is something that the compiler does that I should take for granted,
there is no alignment directive in there, and it could be anywhere.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Add Documentation/kvm/msr.txt

2010-05-31 Thread Avi Kivity

On 05/31/2010 04:49 PM, Glauber Costa wrote:



How are we failing to meet it?  All guests align to at least four bytes.
 

static struct pvclock_wall_clock wall_clock;

Unless there is something that the compiler does that I should take for granted,
there is no alignment directive in there, and it could be anywhere.
   


The compiler will align u32 and larger to 4 bytes on i386 (and align u64 
to 8 bytes on x86_64).


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread

2010-05-31 Thread Oleg Nesterov
On 05/30, Tejun Heo wrote:

 This conversion is to make each vhost use a dedicated kthread so that
 resource control via cgroup can be applied.

Personally, I agree. I think this is better than playing with the workqueue thread.

A couple of simple questions after the quick glance at the unapplied patch...

  void vhost_poll_flush(struct vhost_poll *poll)
  {
 - flush_work(poll-work);
 + int seq = poll-queue_seq;
 +
 + if (seq - poll-done_seq  0)
 + wait_event(poll-done, seq - poll-done_seq = 0);

The check before wait_event() is not needed, please note that wait_event()
checks the condition before __wait_event().

What I can't understand is why we do have -queue_seq and -done_seq.

Isn't the single bool poll-active enough? vhost_poll_queue() sets
-active == T, vhost_poller() clears it before wake_up_all(poll-done).

 +static int vhost_poller(void *data)
 +{
 + struct vhost_dev *dev = data;
 + struct vhost_poll *poll;
 +
 +repeat:
 + set_current_state(TASK_INTERRUPTIBLE);  /* mb paired w/ kthread_stop */

I don't understand the comment... why do we need this barrier?

 + if (kthread_should_stop()) {
 + __set_current_state(TASK_RUNNING);
 + return 0;
 + }
 +
 + poll = NULL;
 + spin_lock(dev-poller_lock);
 + if (!list_empty(dev-poll_list)) {
 + poll = list_first_entry(dev-poll_list,
 + struct vhost_poll, node);
 + list_del_init(poll-node);
 + }
 + spin_unlock(dev-poller_lock);
 +
 + if (poll) {
 + __set_current_state(TASK_RUNNING);
 + poll-fn(poll);
 + smp_wmb();  /* paired with rmb in vhost_poll_flush() */
 + poll-done_seq = poll-queue_seq;
 + wake_up_all(poll-done);
 + } else
 + schedule();
 +
 + goto repeat;
 +}

Given that vhost_poll_queue() does list_add() and wake_up_process() under
-poller_lock, I don't think we need any barriers to change -state.

IOW, can't vhost_poller() simply do

while(!kthread_should_stop()) {

poll = NULL;
spin_lock(dev-poller_lock);
if (!list_empty(dev-poll_list)) {
...
} else
 __set_current_state(TASK_INTERRUPTIBLE);
spin_unlock(dev-poller_lock);

if (poll) {
...
} else
schedule();
}

?

Oleg.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread

2010-05-31 Thread Tejun Heo
Hello,

On 05/31/2010 04:39 PM, Oleg Nesterov wrote:
 On 05/30, Tejun Heo wrote:

 This conversion is to make each vhost use a dedicated kthread so that
 resource control via cgroup can be applied.
 
 Personally, I agree. I think this is better than playing with the workqueue thread.

Yeap, I think so too.  In vhost's case tho, as it exports a lot of
workqueue characteristics to vhost users, it's a bit more complex than
I wish it were.  It can probably be simplified more if someone who
knows the code better takes a look or maybe we need to make this kind
of things easier by providing a generic helpers if more cases like
this spring up, but if that happens probably the RTTD would be somehow
teaching workqueue how to deal with cgroups.  As this is the first
case, I guess open coding is okay for now.

 A couple of simple questions after the quick glance at the unapplied patch...
 
  void vhost_poll_flush(struct vhost_poll *poll)
  {
 -flush_work(poll-work);
 +int seq = poll-queue_seq;
 +
 +if (seq - poll-done_seq  0)
 +wait_event(poll-done, seq - poll-done_seq = 0);
 
 The check before wait_event() is not needed, please note that wait_event()
 checks the condition before __wait_event().

Heh... right, I was looking at __wait_event() and thinking ooh... we
can skip lock in the fast path.  :-)

 What I can't understand is why we do have -queue_seq and -done_seq.
 
 Isn't the single bool poll-active enough? vhost_poll_queue() sets
 -active == T, vhost_poller() clears it before wake_up_all(poll-done).

I might have slightly over engineered this part not knowing the
expected workload.  -queue_seq/-done_seq pair is to guarantee that
flushers never get starved.  Without sequencing queueings and
executions, flushers should wait for !pending && !active which can
take some time to come if the poll in question is very busy.

 +static int vhost_poller(void *data)
 +{
 +struct vhost_dev *dev = data;
 +struct vhost_poll *poll;
 +
 +repeat:
 +set_current_state(TASK_INTERRUPTIBLE);  /* mb paired w/ kthread_stop */
 
 I don't understand the comment... why do we need this barrier?

So that either kthread_stop()'s should_stop = 1 in kthread_stop() is
visible to kthread_should_stop() or task state is set to RUNNING.

 +if (kthread_should_stop()) {
 +__set_current_state(TASK_RUNNING);
 +return 0;
 +}
 +
 +poll = NULL;
 +spin_lock(dev-poller_lock);
 +if (!list_empty(dev-poll_list)) {
 +poll = list_first_entry(dev-poll_list,
 +struct vhost_poll, node);
 +list_del_init(poll-node);
 +}
 +spin_unlock(dev-poller_lock);
 +
 +if (poll) {
 +__set_current_state(TASK_RUNNING);
 +poll-fn(poll);
 +smp_wmb();  /* paired with rmb in vhost_poll_flush() */
 +poll-done_seq = poll-queue_seq;
 +wake_up_all(poll-done);
 +} else
 +schedule();
 +
 +goto repeat;
 +}
 
 Given that vhost_poll_queue() does list_add() and wake_up_process() under
 -poller_lock, I don't think we need any barriers to change -state.
 
 IOW, can't vhost_poller() simply do
 
   while(!kthread_should_stop()) {
 
   poll = NULL;
   spin_lock(dev-poller_lock);
   if (!list_empty(dev-poll_list)) {
   ...
   } else
__set_current_state(TASK_INTERRUPTIBLE);
   spin_unlock(dev-poller_lock);
 
   if (poll) {
   ...
   } else
   schedule();
   }
 ?

But then kthread_stop() can happen between kthread_should_stop() and
__set_current_state(TASK_INTERRUPTIBLE) and poller can just sleep in
schedule() not knowing that.

Thank you.

-- 
tejun
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread

2010-05-31 Thread Michael S. Tsirkin
On Sun, May 30, 2010 at 10:24:01PM +0200, Tejun Heo wrote:
 Replace vhost_workqueue with per-vhost kthread.  Other than callback
 argument change from struct work_struct * to struct vhost_poll *,
 there's no visible change to vhost_poll_*() interface.

I would prefer a substructure vhost_work, even just to make
the code easier to review and compare to workqueue.c.

 This conversion is to make each vhost use a dedicated kthread so that
 resource control via cgroup can be applied.
 
 Partially based on Sridhar Samudrala's patch.
 
 Cc: Michael S. Tsirkin m...@redhat.com
 Cc: Sridhar Samudrala samudrala.srid...@gmail.com
 ---
 Okay, here is three patch series to convert vhost to use per-vhost
 kthread, add cgroup_attach_task_current_cg() and apply it to the vhost
 kthreads.  The conversion is mostly straight forward although flush is
 slightly tricky.
 
 The problem is that I have no idea how to test this.

It's a 3 step process:

1. 
Install qemu-kvm under fc13, or build recent one from source,
get it from here:
git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git

2. install guest under it:
qemu-img create -f qcow2 disk.qcow2 100G
Now get some image (e.g. fedora 13 in fc13.iso)
and install guest:
qemu-kvm -enable-kvm -m 1G -cdrom fc13.iso -drive file=disk.qcow2


3. set up networking. I usually simply do host to guest 
on a special subnet for testing purposes:

Set up a bridge named mstbr0:

ifconfig mstbr0 down
brctl delbr mstbr0
brctl addbr mstbr0
brctl setfd mstbr0 0
ifconfig mstbr0 11.0.0.1

cat  ifup  EOF
#!/bin/sh -x
/sbin/ifconfig msttap0 0.0.0.0 up
brctl addif mstbr0 msttap0
EOF


qemu-kvm -enable-kvm -m 1G -cdrom fc13.iso -drive file=disk.qcow2
 -net nic,model=virtio,netdev=foo -netdev
tap,id=foo,ifname=msttap0,script=/home/mst/ifup,downscript=no,vhost=on

after you set up the guest, log into it and
ifconfig eth0 11.0.0.2

You should now be able to ping guest to host and back.
Use something like netperf to stress the connection.
Close qemu with kill -9 and unload module to test flushing code.



 Index: work/drivers/vhost/vhost.c
 ===
 --- work.orig/drivers/vhost/vhost.c
 +++ work/drivers/vhost/vhost.c

...

 @@ -125,10 +139,50 @@ static void vhost_vq_reset(struct vhost_
   vq-log_ctx = NULL;
  }
 
 +static int vhost_poller(void *data)
 +{
 + struct vhost_dev *dev = data;
 + struct vhost_poll *poll;
 +
 +repeat:
 + set_current_state(TASK_INTERRUPTIBLE);  /* mb paired w/ kthread_stop */
 +
 + if (kthread_should_stop()) {
 + __set_current_state(TASK_RUNNING);
 + return 0;
 + }
 +
 + poll = NULL;
 + spin_lock(dev-poller_lock);
 + if (!list_empty(dev-poll_list)) {
 + poll = list_first_entry(dev-poll_list,
 + struct vhost_poll, node);
 + list_del_init(poll-node);
 + }
 + spin_unlock(dev-poller_lock);
 +
 + if (poll) {
 + __set_current_state(TASK_RUNNING);
 + poll-fn(poll);
 + smp_wmb();  /* paired with rmb in vhost_poll_flush() */
 + poll-done_seq = poll-queue_seq;
 + wake_up_all(poll-done);


This seems to add wakeups on data path, which uses spinlocks etc.
OTOH workqueue.c adds a special barrier
entry which only does a wakeup when needed.
Right?

 + } else
 + schedule();
 +
 + goto repeat;
 +}
 +
  long vhost_dev_init(struct vhost_dev *dev,
   struct vhost_virtqueue *vqs, int nvqs)
  {
 + struct task_struct *poller;
   int i;
 +
 + poller = kthread_create(vhost_poller, dev, vhost-%d, current-pid);
 + if (IS_ERR(poller))
 + return PTR_ERR(poller);
 +
   dev-vqs = vqs;
   dev-nvqs = nvqs;
   mutex_init(dev-mutex);
 @@ -136,6 +190,9 @@ long vhost_dev_init(struct vhost_dev *de
   dev-log_file = NULL;
   dev-memory = NULL;
   dev-mm = NULL;
 + spin_lock_init(dev-poller_lock);
 + INIT_LIST_HEAD(dev-poll_list);
 + dev-poller = poller;
 
   for (i = 0; i  dev-nvqs; ++i) {
   dev-vqs[i].dev = dev;
 @@ -143,8 +200,7 @@ long vhost_dev_init(struct vhost_dev *de
   vhost_vq_reset(dev, dev-vqs + i);
   if (dev-vqs[i].handle_kick)
   vhost_poll_init(dev-vqs[i].poll,
 - dev-vqs[i].handle_kick,
 - POLLIN);
 + dev-vqs[i].handle_kick, POLLIN, dev);
   }
   return 0;
  }
 @@ -217,6 +273,8 @@ void vhost_dev_cleanup(struct vhost_dev
   if (dev-mm)
   mmput(dev-mm);
   dev-mm = NULL;
 +
 + kthread_stop(dev-poller);
  }
 
  static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
 @@ -1113,16 +1171,3 @@ void vhost_disable_notify(struct vhost_v
   vq_err(vq, Failed to enable notification at %p: %d\n,
  vq-used-flags, r);
  }
 -

Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread

2010-05-31 Thread Oleg Nesterov
On 05/31, Tejun Heo wrote:

 On 05/31/2010 04:39 PM, Oleg Nesterov wrote:

  What I can't understand is why we do have -queue_seq and -done_seq.
 
  Isn't the single bool poll-active enough? vhost_poll_queue() sets
  -active == T, vhost_poller() clears it before wake_up_all(poll-done).

 I might have slightly over engineered this part not knowing the
 expected workload.  -queue_seq/-done_seq pair is to guarantee that
 flushers never get starved.

Ah, indeed.

Well, afaics we do not need 2 counters anyway, both vhost_poll_queue()
and vhost_poller() could increment the single counter and the flusher
can take bit 0 into account. But I agree 2 counters are much more clean.

  +static int vhost_poller(void *data)
  +{
  +  struct vhost_dev *dev = data;
  +  struct vhost_poll *poll;
  +
  +repeat:
  +  set_current_state(TASK_INTERRUPTIBLE);  /* mb paired w/ kthread_stop */
 
  I don't understand the comment... why do we need this barrier?

 So that either kthread_stop()'s should_stop = 1 in kthread_stop() is
 visible to kthread_should_stop() or task state is set to RUNNING.

Of course, you are right. I am really surprized I asked this question ;)

Thanks,

Oleg.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread

2010-05-31 Thread Tejun Heo
Hello,

On 05/31/2010 05:31 PM, Oleg Nesterov wrote:
 I might have slightly over engineered this part not knowing the
 expected workload.  -queue_seq/-done_seq pair is to guarantee that
 flushers never get starved.
 
 Ah, indeed.
 
 Well, afaics we do not need 2 counters anyway, both vhost_poll_queue()
 and vhost_poller() could increment the single counter and the flusher
 can take bit 0 into account. But I agree 2 counters are much more clean.

Right, we can do that too.  Cool. :-)

 +static int vhost_poller(void *data)
 +{
 +  struct vhost_dev *dev = data;
 +  struct vhost_poll *poll;
 +
 +repeat:
 +  set_current_state(TASK_INTERRUPTIBLE);  /* mb paired w/ kthread_stop */

 I don't understand the comment... why do we need this barrier?

 So that either kthread_stop()'s should_stop = 1 in kthread_stop() is
 visible to kthread_should_stop() or task state is set to RUNNING.
 
 Of course, you are right. I am really surprized I asked this question ;)

This part is always a bit tricky tho.  Maybe it would be a good idea
to make kthread_stop() do periodic wakeups.  It's already injecting
one rather unexpected wake up into the mix anyway so there isn't much
point in avoiding multiple and it would make designing kthread loops
easier.  Or maybe what we need is something like kthread_idle() which
encapsulates the check and sleep part.

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread

2010-05-31 Thread Tejun Heo
Hello,

On 05/31/2010 05:22 PM, Michael S. Tsirkin wrote:
 On Sun, May 30, 2010 at 10:24:01PM +0200, Tejun Heo wrote:
 Replace vhost_workqueue with per-vhost kthread.  Other than callback
 argument change from struct work_struct * to struct vhost_poll *,
 there's no visible change to vhost_poll_*() interface.
 
 I would prefer a substructure vhost_work, even just to make
 the code easier to review and compare to workqueue.c.

Yeap, sure.

 The problem is that I have no idea how to test this.
 
 It's a 3 step process:
...
 You should now be able to ping guest to host and back.
 Use something like netperf to stress the connection.
 Close qemu with kill -9 and unload module to test flushing code.

Thanks for the instruction.  I'll see if there's a way to do it
without building qemu myself on opensuse.  But please feel free to go
ahead and test it.  It might just work!  :-)

 +if (poll) {
 +__set_current_state(TASK_RUNNING);
 +poll-fn(poll);
 +smp_wmb();  /* paired with rmb in vhost_poll_flush() */
 +poll-done_seq = poll-queue_seq;
 +wake_up_all(poll-done);
 

 This seems to add wakeups on data path, which uses spinlocks etc.
 OTOH workqueue.c adds a special barrier entry which only does a
 wakeup when needed.  Right?

Yeah, well, if it's a really hot path sure we can avoid wake_up_all()
in most cases.  Do you think that would be necessary?

 -void vhost_cleanup(void)
 -{
 -destroy_workqueue(vhost_workqueue);
 
 I note that destroy_workqueue does a flush, kthread_stop
 doesn't. Right? Sure we don't need to check nothing is in one of
 the lists? Maybe add a BUG_ON?

There were a bunch of flushes before kthread_stop() and they seemed to
stop and flush everything.  Aren't they enough?  We can definitely add
BUG_ON() after kthread_should_stop() check succeeds either way tho.

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: host panic on kernel 2.6.34

2010-05-31 Thread Maciej Rutecki
On poniedziałek, 24 maja 2010 o 10:23:11 Hao, Xudong wrote:
 Hi all
 I build latest kvm 37dec075a7854f0f550540bf3b9bbeef37c11e2a, based on
  kernel 2.6.34, after kvm and kvm_intel module loaded, then /etc/init.d/kvm
  start, a few minutes later, the system will panic.
 

I created a Bugzilla entry at 
https://bugzilla.kernel.org/show_bug.cgi?id=16082
for your bug report, please add your address to the CC list in there, thanks!


-- 
Maciej Rutecki
http://www.maciek.unixy.pl
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread

2010-05-31 Thread Michael S. Tsirkin
On Mon, May 31, 2010 at 05:45:07PM +0200, Tejun Heo wrote:
 Hello,
 
 On 05/31/2010 05:22 PM, Michael S. Tsirkin wrote:
  On Sun, May 30, 2010 at 10:24:01PM +0200, Tejun Heo wrote:
  Replace vhost_workqueue with per-vhost kthread.  Other than callback
  argument change from struct work_struct * to struct vhost_poll *,
  there's no visible change to vhost_poll_*() interface.
  
  I would prefer a substructure vhost_work, even just to make
  the code easier to review and compare to workqueue.c.
 
 Yeap, sure.
 
  The problem is that I have no idea how to test this.
  
  It's a 3 step process:
 ...
  You should now be able to ping guest to host and back.
  Use something like netperf to stress the connection.
  Close qemu with kill -9 and unload module to test flushing code.
 
 Thanks for the instruction.  I'll see if there's a way to do it
 without building qemu myself on opensuse.

My guess is no, there was no stable qemu release with vhost net support
yet.  Building it is mostly configure/make/make install,
as far as I remember you only need devel versions of X libraries,
SDL and curses installed.

  But please feel free to go
 ahead and test it.  It might just work!  :-)
 
  +  if (poll) {
  +  __set_current_state(TASK_RUNNING);
  +  poll-fn(poll);
  +  smp_wmb();  /* paired with rmb in vhost_poll_flush() */
  +  poll-done_seq = poll-queue_seq;
  +  wake_up_all(poll-done);
  
 
  This seems to add wakeups on data path, which uses spinlocks etc.
  OTOH workqueue.c adds a special barrier entry which only does a
  wakeup when needed.  Right?
 
 Yeah, well, if it's a really hot path sure we can avoid wake_up_all()
 in most cases.  Do you think that would be necessary?

My guess is yes. This is really very hot path in code, and we are
close to 100% CPU in some benchmarks.

  -void vhost_cleanup(void)
  -{
  -  destroy_workqueue(vhost_workqueue);
  
  I note that destroy_workqueue does a flush, kthread_stop
  doesn't. Right? Sure we don't need to check nothing is in one of
  the lists? Maybe add a BUG_ON?
 
 There were a bunch of flushes before kthread_stop() and they seemed to
 stop and flush everything.  Aren't they enough?

I was just asking, I'll need to review the code in depth.

 We can definitely add
 BUG_ON() after kthread_should_stop() check succeeds either way tho.
 
 Thanks.
 
 -- 
 tejun
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] VFIO driver: Non-privileged user level PCI drivers

2010-05-31 Thread Alan Cox

 +/*
 + * Map usr buffer at specific IO virtual address
 + */
 +static int vfio_dma_map_iova(

 + mlp = kzalloc(sizeof *mlp, GFP_KERNEL);

Not good at that point. I think you need to allocate it first, error if
it can't be allocated and then do the work and free it on error ?


 + mlp = kzalloc(sizeof *mlp, GFP_KERNEL);
 + mlp-pages = pages;

Ditto


 +int vfio_enable_msix(struct vfio_dev *vdev, int nvec, void __user *uarg)
 +{
 + struct pci_dev *pdev = vdev-pdev;
 + struct eventfd_ctx *ctx;
 + int ret = 0;
 + int i;
 + int fd;
 +
 + vdev-msix = kzalloc(nvec * sizeof(struct msix_entry),
 + GFP_KERNEL);
 + vdev-ev_msix = kzalloc(nvec * sizeof(struct eventfd_ctx *),
 + GFP_KERNEL);

These don't seem to get freed on the error path - or indeed protected
against being allocated twice (eg two parallel ioctls ?)



 + case VFIO_DMA_MAP_ANYWHERE:
 + case VFIO_DMA_MAP_IOVA:
 + if (copy_from_user(dm, uarg, sizeof dm))
 + return -EFAULT;
 + ret = vfio_dma_map_common(listener, cmd, dm);
 + if (!ret  copy_to_user(uarg, dm, sizeof dm))

So the vfio_dma_map is untrusted. That seems to be checked ok later but
the dma_map_common code then plays in current->mm without apparently
holding any locks to stop the values getting corrupted by a parallel
mlock ?

Actually no I take that back

dmp-size is 64bit

So npage can end up with values like 0x and cause 32bit
boxes to go kerblam

 +
 + case VFIO_EVENTFD_IRQ:
 + if (copy_from_user(fd, uarg, sizeof fd))
 + return -EFAULT;
 + if (vdev-ev_irq)
 + eventfd_ctx_put(vdev-ev_irq);

These paths need locking - suppose two EVENTFD irq ioctls occur at once
(in general these paths seem not to be covered)


 + case VFIO_BAR_LEN:
 + if (copy_from_user(bar, uarg, sizeof bar))
 + return -EFAULT;
 + if (bar  0 || bar  PCI_ROM_RESOURCE)
 + return -EINVAL;
 + bar = pci_resource_len(pdev, bar);
 + if (copy_to_user(uarg, bar, sizeof bar))
 + return -EFAULT;

How does this all work out if the device is a bridge ?

 + pci_read_config_byte(pdev, PCI_INTERRUPT_LINE, line);
 + if (line == 0)
 + goto out;

That may produce some interestingly wrong answers. Firstly the platform
has interrupt abstraction so dev-irq may not match PCI_INTERRUPT_LINE,
secondly you have devices that report their IRQ via other paths as per
spec (notably IDE class devices in non-native mode)

So that would also want extra checks.


 + pci_read_config_word(pdev, PCI_COMMAND, orig);
 + ret = orig  PCI_COMMAND_MASTER;
 + if (!ret) {
 + new = orig | PCI_COMMAND_MASTER;
 + pci_write_config_word(pdev, PCI_COMMAND, new);
 + pci_read_config_word(pdev, PCI_COMMAND, new);
 + ret = new  PCI_COMMAND_MASTER;
 + pci_write_config_word(pdev, PCI_COMMAND, orig);

The master bit on some devices can be turned on but not off. Not sure it
matters here.

 + vdev-pdev = pdev;

Probably best to take/drop a reference. Not needed if you can prove your
last use is before the end of the remove path though.


Does look like it needs a locking audit, some memory and error checks
reviewing and some further review of the ioctl security and
overflows/trusted values.

Rather a nice way of attacking the user space PCI problem.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] VFIO driver: Non-privileged user level PCI drivers

2010-05-31 Thread Michael S. Tsirkin
On Mon, May 31, 2010 at 02:50:29PM +0300, Avi Kivity wrote:
 On 05/30/2010 05:53 PM, Michael S. Tsirkin wrote:

 So what I suggested is failing any kind of access until iommu
 is assigned.


 So, the kernel driver must be aware of the iommu.  In which case it may  
 as well program it.

It's a kernel driver anyway. Point is that
the *device* driver is better off not programming iommu,
this way we do not need to reprogram it for each device.

 -- 
 I have a truly marvellous patch that fixes the bug which this
 signature is too narrow to contain.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] make-release: misc fixes

2010-05-31 Thread Michael S. Tsirkin
On Mon, May 24, 2010 at 04:12:14PM +0300, Michael S. Tsirkin wrote:
 This fixes /tmp usage in make-release script for security.
 Also, create output directory if it does not exist.
 This also adds a 'tarball' option to specify output file name.
 Finally, remove output file before gzip to avoid prompt
 'do you want to overwrite'.
 
 Signed-off-by: Michael S. Tsirkin m...@redhat.com

Ping. Please review/apply.

 ---
  kvm/scripts/make-release |   13 ++---
  1 files changed, 10 insertions(+), 3 deletions(-)
 
 diff --git a/kvm/scripts/make-release b/kvm/scripts/make-release
 index 11d9c27..fdc402b 100755
 --- a/kvm/scripts/make-release
 +++ b/kvm/scripts/make-release
 @@ -1,7 +1,7 @@
  #!/bin/bash -e
  
  usage() {
 -echo usage: $0 [--upload] [--formal] commit [name]
 +echo usage: $0 [--upload] [--formal] commit [name] [tarball]
  exit 1
  }
  
 @@ -12,7 +12,7 @@ formal=
  
  releasedir=~/sf-release
  [[ -z $TMP ]]  TMP=/tmp
 -tmpdir=$TMP/qemu-kvm-make-release.$$
 +tmpdir=`mktemp -d --tmpdir=$TMP qemu-kvm-make-release.XX`
  while [[ $1 = -* ]]; do
  opt=$1
  shift
 @@ -40,9 +40,15 @@ if [[ -z $name ]]; then
  name=$commit
  fi
  
 -tarball=$releasedir/$name.tar
 +tarball=$3
 +if [[ -z $tarball ]]; then
 +tarball=$releasedir/$name.tar.gz
 +fi
 +#strip trailing .gz if any
 +tarball=${tarball/%.gz/}
  
  cd $(dirname $0)/../..
 +mkdir -p $(dirname $tarball)
  git archive --prefix=$name/ --format=tar $commit  $tarball
  
  mkdir -p $tmpdir
 @@ -59,6 +65,7 @@ if [[ -n $formal ]]; then
  rm -rf $tmpdir
  fi
  
 +rm -f $tarball.gz
  gzip -9 $tarball
  tarball=$tarball.gz
  
 -- 
 1.7.1.12.g42b7f
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [QEMU-KVM]: Megasas + TCM_Loop + SG_IO into Windows XP guests

2010-05-31 Thread Alexander Graf


Am 31.05.2010 um 11:52 schrieb Gerd Hoffmann kra...@redhat.com:


 Hi,


Also it's worth mentioning that I am still running C: on QEMU IDE
emulation, as I could not quite figure out how to boot for a  
megasas LD

with option-rom.  What QEMU CLI ops were required to make this work
again..?


-option-rom $file

or

-device megasas,romfile=$file


Or -drive ...,boot=on if you're using qemu-kvm.

Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] ceph/rbd block driver for qemu-kvm (v3)

2010-05-31 Thread Christian Brunner
Hi Kevin,

here is an updated patch for the ceph/rbd driver. I hope that everything 
is fine now.

Regards,
Christian


This is a block driver for the distributed file system Ceph
(http://ceph.newdream.net/). This driver uses librados (which
is part of the Ceph server) for direct access to the Ceph object
store and is running entirely in userspace. Therefore it is
called rbd - rados block device.

To compile the driver a recent version of ceph (unstable/testing git
head or 0.20.3 once it is released) is needed.

Additional information is available on the Ceph-Wiki:

http://ceph.newdream.net/wiki/Kvm-rbd

The patch is based on git://repo.or.cz/qemu/kevin.git block


Signed-off-by: Christian Brunner c...@muc.de
---
 Makefile.objs |1 +
 block/rbd.c   |  600 +
 block/rbd_types.h |   64 ++
 configure |   31 +++
 4 files changed, 696 insertions(+), 0 deletions(-)
 create mode 100644 block/rbd.c
 create mode 100644 block/rbd_types.h

diff --git a/Makefile.objs b/Makefile.objs
index 1a942e5..08dc11f 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -18,6 +18,7 @@ block-nested-y += parallels.o nbd.o blkdebug.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
+block-nested-$(CONFIG_RBD) += rbd.o
 
 block-obj-y +=  $(addprefix block/, $(block-nested-y))
 
diff --git a/block/rbd.c b/block/rbd.c
new file mode 100644
index 000..4a60dda
--- /dev/null
+++ b/block/rbd.c
@@ -0,0 +1,600 @@
+/*
+ * QEMU Block driver for RADOS (Ceph)
+ *
+ * Copyright (C) 2010 Christian Brunner c...@muc.de
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include qemu-common.h
+#include qemu-error.h
+#include sys/types.h
+#include stdbool.h
+
+#include qemu-common.h
+
+#include rbd_types.h
+#include module.h
+#include block_int.h
+
+#include stdio.h
+#include stdlib.h
+#include rados/librados.h
+
+#include signal.h
+
+/*
+ * When specifying the image filename use:
+ *
+ * rbd:poolname/devicename
+ *
+ * poolname must be the name of an existing rados pool
+ *
+ * devicename is the basename for all objects used to
+ * emulate the raw device.
+ *
+ * Metadata information (image size, ...) is stored in an
+ * object with the name devicename.rbd.
+ *
+ * The raw device is split into 4MB sized objects by default.
+ * The sequencenumber is encoded in a 12 byte long hex-string,
+ * and is attached to the devicename, separated by a dot.
+ * e.g. devicename.1234567890ab
+ *
+ */
+
+#define OBJ_MAX_SIZE (1UL  OBJ_DEFAULT_OBJ_ORDER)
+
+typedef struct RBDAIOCB {
+BlockDriverAIOCB common;
+QEMUBH *bh;
+int ret;
+QEMUIOVector *qiov;
+char *bounce;
+int write;
+int64_t sector_num;
+int aiocnt;
+int error;
+} RBDAIOCB;
+
+typedef struct RADOSCB {
+int rcbid;
+RBDAIOCB *acb;
+int done;
+int64_t segsize;
+char *buf;
+} RADOSCB;
+
+typedef struct BDRVRBDState {
+rados_pool_t pool;
+char name[RBD_MAX_OBJ_NAME_SIZE];
+uint64_t size;
+uint64_t objsize;
+} BDRVRBDState;
+
+typedef struct rbd_obj_header_ondisk RbdHeader1;
+
+static int rbd_parsename(const char *filename, char *pool, char *name)
+{
+const char *rbdname;
+char *p;
+int l;
+
+if (!strstart(filename, rbd:, rbdname)) {
+return -EINVAL;
+}
+
+pstrcpy(pool, 2 * RBD_MAX_SEG_NAME_SIZE, rbdname);
+p = strchr(pool, '/');
+if (p == NULL) {
+return -EINVAL;
+}
+
+*p = '\0';
+
+l = strlen(pool);
+if(l = RBD_MAX_SEG_NAME_SIZE) {
+error_report(pool name to long);
+return -EINVAL;
+} else if (l = 0) {
+error_report(pool name to short);
+return -EINVAL;
+}
+
+l = strlen(++p);
+if (l = RBD_MAX_OBJ_NAME_SIZE) {
+error_report(object name to long);
+return -EINVAL;
+} else if (l = 0) {
+error_report(object name to short);
+return -EINVAL;
+}
+
+strcpy(name, p);
+
+return l;
+}
+
+static int create_tmap_op(uint8_t op, const char *name, char **tmap_desc)
+{
+uint32_t len = strlen(name);
+/* total_len = encoding op + name + empty buffer */
+uint32_t total_len = 1 + (sizeof(uint32_t) + len) + sizeof(uint32_t);
+char *desc = NULL;
+
+qemu_malloc(total_len);
+
+*tmap_desc = desc;
+
+*desc = op;
+desc++;
+memcpy(desc, len, sizeof(len));
+desc += sizeof(len);
+memcpy(desc, name, len);
+desc += len;
+len = 0;
+memcpy(desc, len, sizeof(len));
+desc += sizeof(len);
+
+return desc - *tmap_desc;
+}
+
+static void free_tmap_op(char *tmap_desc)
+{
+qemu_free(tmap_desc);
+}
+
+static int rbd_register_image(rados_pool_t pool, const char *name)
+{
+char *tmap_desc;
+const char *dir = RBD_DIRECTORY;
+int ret;
+
+ret = create_tmap_op(CEPH_OSD_TMAP_SET, name, tmap_desc);
+if (ret  0) 

[PATCH][RESEND] Print a user-friendly message on failed vmentry

2010-05-31 Thread Mohammed Gamal
This patch address bug report in https://bugs.launchpad.net/qemu/+bug/530077.

Failed vmentries were handled with handle_unhandled() which prints a rather
unfriendly message to the user. This patch separates handling vmentry failures
from unknown exit reasons and prints a friendly message to the user.

Signed-off-by: Mohammed Gamal m.gamal...@gmail.com
---
 qemu-kvm.c |   27 ++-
 1 files changed, 26 insertions(+), 1 deletions(-)

diff --git a/qemu-kvm.c b/qemu-kvm.c
index 9534b31..b8a9278 100644
--- a/qemu-kvm.c
+++ b/qemu-kvm.c
@@ -106,6 +106,31 @@ static int handle_unhandled(uint64_t reason)
 return -EINVAL;
 }
 
+#define VMX_INVALID_GUEST_STATE 0x8021
+
+static int handle_failed_vmentry(uint64_t reason)
+{
+fprintf(stderr, kvm: vm entry failed with error 0x% PRIx64 \n\n, 
reason);
+
+/* Perhaps we will need to check if this machine is intel since exit 
reason 0x21 
+   has a different interpretation on SVM */
+if (reason == VMX_INVALID_GUEST_STATE) {
+fprintf(stderr, If you're runnning a guest on an Intel machine 
without\n);
+fprintf(stderr, unrestricted mode support, the failure can be most 
likely\n);
+fprintf(stderr, due to the guest entering an invalid state for Intel 
VT.\n);
+fprintf(stderr, For example, the guest maybe running in big real 
mode\n);
+fprintf(stderr, which is not supported on less recent Intel 
processors.\n\n);
+fprintf(stderr, You may want to try enabling KVM real mode emulation. 
To\n);
+fprintf(stderr, enable it, you can run the following commands as 
root:\n);
+fprintf(stderr, # rmmod kvm_intel\n);
+fprintf(stderr, # rmmod kvm\n);
+fprintf(stderr, # modprobe kvm_intel 
emulate_invalid_guest_state=1\n\n);
+fprintf(stderr, WARNING: Real mode emulation is still 
work-in-progress\n);
+fprintf(stderr, and thus it is not always guaranteed to work.\n\n);
+}
+
+return -EINVAL;
+}
 
 static inline void set_gsi(kvm_context_t kvm, unsigned int gsi)
 {
@@ -586,7 +611,7 @@ int kvm_run(CPUState *env)
 r = handle_unhandled(run-hw.hardware_exit_reason);
 break;
 case KVM_EXIT_FAIL_ENTRY:
-r = 
handle_unhandled(run-fail_entry.hardware_entry_failure_reason);
+r = 
handle_failed_vmentry(run-fail_entry.hardware_entry_failure_reason);
 break;
 case KVM_EXIT_EXCEPTION:
 fprintf(stderr, exception %d (%x)\n, run-ex.exception,
-- 
1.7.0.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH][RESEND] VMX: Properly return error to userspace on vmentry failure

2010-05-31 Thread Mohammed Gamal
The vmexit handler returns KVM_EXIT_UNKNOWN since there is no handler
for vmentry failures. This intercepts vmentry failures and returns
KVM_FAIL_ENTRY to userspace instead.

Signed-off-by: Mohammed Gamal m.gamal...@gmail.com
---
 arch/x86/kvm/vmx.c |7 +++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0e561a5..1b6a3be 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3668,6 +3668,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
if (enable_ept  is_paging(vcpu))
vcpu-arch.cr3 = vmcs_readl(GUEST_CR3);
 
+   if (exit_reason  VMX_EXIT_REASONS_FAILED_VMENTRY) {
+   vcpu-run-exit_reason = KVM_EXIT_FAIL_ENTRY;
+   vcpu-run-fail_entry.hardware_entry_failure_reason
+   = exit_reason;
+   return 0;
+   }
+
if (unlikely(vmx-fail)) {
vcpu-run-exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu-run-fail_entry.hardware_entry_failure_reason
-- 
1.7.0.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] KVM: PPC: elide struct thread_struct instances from stack

2010-05-31 Thread Andreas Schwab
Instead of instantiating a whole thread_struct on the stack use only the
required parts of it.

Signed-off-by: Andreas Schwab sch...@linux-m68k.org
Tested-by: Alexander Graf ag...@suse.de
---
 arch/powerpc/include/asm/kvm_fpu.h   |   27 +
 arch/powerpc/kernel/ppc_ksyms.c  |4 -
 arch/powerpc/kvm/book3s.c|   49 +---
 arch/powerpc/kvm/book3s_paired_singles.c |   94 --
 arch/powerpc/kvm/fpu.S   |   18 ++
 5 files changed, 97 insertions(+), 95 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_fpu.h 
b/arch/powerpc/include/asm/kvm_fpu.h
index 94f05de..c3d4f05 100644
--- a/arch/powerpc/include/asm/kvm_fpu.h
+++ b/arch/powerpc/include/asm/kvm_fpu.h
@@ -22,24 +22,24 @@
 
 #include linux/types.h
 
-extern void fps_fres(struct thread_struct *t, u32 *dst, u32 *src1);
-extern void fps_frsqrte(struct thread_struct *t, u32 *dst, u32 *src1);
-extern void fps_fsqrts(struct thread_struct *t, u32 *dst, u32 *src1);
+extern void fps_fres(u64 *fpscr, u32 *dst, u32 *src1);
+extern void fps_frsqrte(u64 *fpscr, u32 *dst, u32 *src1);
+extern void fps_fsqrts(u64 *fpscr, u32 *dst, u32 *src1);
 
-extern void fps_fadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fdivs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fmuls(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fdivs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fmuls(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
 
-extern void fps_fmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fmadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
   u32 *src3);
-extern void fps_fmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fmsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
   u32 *src3);
-extern void fps_fnmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 
*src2,
+extern void fps_fnmadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
u32 *src3);
-extern void fps_fnmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 
*src2,
+extern void fps_fnmsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
u32 *src3);
-extern void fps_fsel(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fsel(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
 u32 *src3);
 
 #define FPD_ONE_IN(name) extern void fpd_ ## name(u64 *fpscr, u32 *cr, \
@@ -82,4 +82,7 @@ FPD_THREE_IN(fmadd)
 FPD_THREE_IN(fnmsub)
 FPD_THREE_IN(fnmadd)
 
+extern void kvm_cvt_fd(u32 *from, u64 *to, u64 *fpscr);
+extern void kvm_cvt_df(u64 *from, u32 *to, u64 *fpscr);
+
 #endif
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index bc9f39d..ab3e392 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -101,10 +101,6 @@ EXPORT_SYMBOL(pci_dram_offset);
 EXPORT_SYMBOL(start_thread);
 EXPORT_SYMBOL(kernel_thread);
 
-#ifndef CONFIG_BOOKE
-EXPORT_SYMBOL_GPL(cvt_df);
-EXPORT_SYMBOL_GPL(cvt_fd);
-#endif
 EXPORT_SYMBOL(giveup_fpu);
 #ifdef CONFIG_ALTIVEC
 EXPORT_SYMBOL(giveup_altivec);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index b998abf..3fea19d 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1309,12 +1309,17 @@ extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, 
struct kvm_vcpu *vcpu);
 int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
int ret;
-   struct thread_struct ext_bkp;
+   double fpr[32][TS_FPRWIDTH];
+   unsigned int fpscr;
+   int fpexc_mode;
 #ifdef CONFIG_ALTIVEC
-   bool save_vec = current-thread.used_vr;
+   vector128 vr[32];
+   vector128 vscr;
+   unsigned long uninitialized_var(vrsave);
+   int used_vr;
 #endif
 #ifdef CONFIG_VSX
-   bool save_vsx = current-thread.used_vsr;
+   int used_vsr;
 #endif
ulong ext_msr;
 
@@ -1327,27 +1332,27 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct 
kvm_vcpu *vcpu)
/* Save FPU state in stack */
if (current-thread.regs-msr  MSR_FP)
giveup_fpu(current);
-   memcpy(ext_bkp.fpr, current-thread.fpr, sizeof(current-thread.fpr));
-   ext_bkp.fpscr = current-thread.fpscr;
-   ext_bkp.fpexc_mode = current-thread.fpexc_mode;
+   memcpy(fpr, current-thread.fpr, sizeof(current-thread.fpr));
+   fpscr = current-thread.fpscr.val;
+   fpexc_mode = current-thread.fpexc_mode;
 
 #ifdef CONFIG_ALTIVEC
/* Save Altivec state in stack */
-   if (save_vec) {
+   used_vr = current-thread.used_vr;
+   if (used_vr) {
if 

Re: [PATCH] KVM: PPC: elide struct thread_struct instances from stack

2010-05-31 Thread Alexander Graf

On 31.05.2010, at 21:59, Andreas Schwab wrote:

 Instead of instantiating a whole thread_struct on the stack use only the
 required parts of it.
 
 Signed-off-by: Andreas Schwab sch...@linux-m68k.org
 Tested-by: Alexander Graf ag...@suse.de

Avi or Marcelo, please pull this in through kvm.git.

Signed-off-by: Alexander Graf ag...@suse.de


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] make-release: misc fixes

2010-05-31 Thread Marcelo Tosatti
On Mon, May 24, 2010 at 04:12:14PM +0300, Michael S. Tsirkin wrote:
 This fixes /tmp usage in make-release script for security.
 Also, create output directory if it does not exist.
 This also adds a 'tarball' option to specify output file name.
 Finally, remove output file before gzip to avoid prompt
 'do you want to overwrite'.
 
 Signed-off-by: Michael S. Tsirkin m...@redhat.com

Applied, thanks.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH qemu-kvm] device-assignment: add config fd qdev property

2010-05-31 Thread Marcelo Tosatti
On Mon, May 24, 2010 at 10:53:49AM -0600, Alex Williamson wrote:
 On Wed, 2010-05-19 at 12:00 -0700, Chris Wright wrote:
  When libvirt launches a guest it first chowns the relevenat
  /sys/bus/pci/.../config file for an assigned device then drops privileges.
  
  This causes an issue for device assignment because despite being file
  owner, the sysfs config space file checks for CAP_SYS_ADMIN before
  allowing access to device dependent config space.
  
  This adds a new qdev configfd property which allows libvirt to open the
  sysfs config space file and give qemu an already opened file descriptor.
  Along with a change pending for the 2.6.35 kernel, this allows the
  capability check to compare against privileges from when the file was
  opened.
 
 We need to make configfd be a string option so that we can pass a
 descriptor from libvirt for the hotplug case.  Here's a rework.
 
 Signed-off-by: Chris Wright chr...@redhat.com
 Signed-off-by: Alex Williamson alex.william...@redhat.com

Applied, thanks.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: PPC: elide struct thread_struct instances from stack

2010-05-31 Thread Paul Mackerras
On Mon, May 31, 2010 at 09:59:13PM +0200, Andreas Schwab wrote:

 Instead of instantiating a whole thread_struct on the stack use only the
 required parts of it.

...

 +_GLOBAL(kvm_cvt_fd)
 + lfd 0,0(r5) /* load up fpscr value */
 + MTFSF_L(0)
 + lfs 0,0(r3)
 + stfd0,0(r4)
 + mffs0
 + stfd0,0(r5) /* save new fpscr value */
 + blr
 +
 +_GLOBAL(kvm_cvt_df)
 + lfd 0,0(r5) /* load up fpscr value */
 + MTFSF_L(0)
 + lfd 0,0(r3)
 + stfs0,0(r4)
 + mffs0
 + stfd0,0(r5) /* save new fpscr value */
 + blr

I re-read the relevant part of the PowerPC architecture spec
yesterday, and it seems pretty clear that the FPSCR doesn't affect the
behaviour of lfs and stfs, and is not affected by them.  So in fact 4
out of the 7 instructions in each of those procedures are unnecessary
(and similarly for the cvt_fd/df used in the alignment fixup code).

Paul.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: PPC: elide struct thread_struct instances from stack

2010-05-31 Thread Alexander Graf

On 01.06.2010, at 00:40, Paul Mackerras wrote:

 On Mon, May 31, 2010 at 09:59:13PM +0200, Andreas Schwab wrote:
 
 Instead of instantiating a whole thread_struct on the stack use only the
 required parts of it.
 
 ...
 
 +_GLOBAL(kvm_cvt_fd)
 +lfd 0,0(r5) /* load up fpscr value */
 +MTFSF_L(0)
 +lfs 0,0(r3)
 +stfd0,0(r4)
 +mffs0
 +stfd0,0(r5) /* save new fpscr value */
 +blr
 +
 +_GLOBAL(kvm_cvt_df)
 +lfd 0,0(r5) /* load up fpscr value */
 +MTFSF_L(0)
 +lfd 0,0(r3)
 +stfs0,0(r4)
 +mffs0
 +stfd0,0(r5) /* save new fpscr value */
 +blr
 
 I re-read the relevant part of the PowerPC architecture spec
 yesterday, and it seems pretty clear that the FPSCR doesn't affect the
 behaviour of lfs and stfs, and is not affected by them.  So in fact 4
 out of the 7 instructions in each of those procedures are unnecessary
 (and similarly for the cvt_fd/df used in the alignment fixup code).

So the rounding control field is not used on lfs? Interesting. I couldn't find 
a reference to it being used or modified either though.

Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/5] KVM: MMU: split the operations of kvm_mmu_zap_page()

2010-05-31 Thread Xiao Guangrong


Avi Kivity wrote:
  
 
 It would be better to rewrite kvm_mmu_zap_page() in terms of
 prepare/commit in the patch so we don't have two copies of the same
 thing (also easier to review).

OK, i'll do it in the next version.

 
 
 
 
 This is a good idea, but belongs in a separate patch?  We can use it to
 reclaim invalid pages before allocating new ones.

  
 This patch is very simple and kvm_mmu_commit_zap_page() function
 should depend on
 kvm-arch.invalid_mmu_pages, so i think we on need separate this
 patch, your opinion? :-)


 
 How about passing the list as a parameter to prepare() and commit()?  If
 the lifetime of the list is just prepare/commit, it shouldn't be a global.
 

Does the example code below show your meaning correctly?

+   struct list_head free_list = LIST_HEAD_INIT(free_list);

hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
if (sp-gfn == gfn  !sp-role.direct
 !sp-role.invalid) {
pgprintk(%s: zap %lx %x\n,
 __func__, gfn, sp-role.word);
+   kvm_mmu_prepare_zap_page(kvm, sp, free_list);
}
}
+   kvm_mmu_commit_zap_page(kvm, free_list);


Thanks,
Xiao


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] qemu: kvm: Extend kvm_arch_get_supported_cpuid() to support index

2010-05-31 Thread Sheng Yang
Would use it later for XSAVE related CPUID.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 kvm.h |2 +-
 qemu-kvm-x86.c|8 
 target-i386/kvm.c |   19 +++
 3 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/kvm.h b/kvm.h
index aab5118..16b06a4 100644
--- a/kvm.h
+++ b/kvm.h
@@ -152,7 +152,7 @@ bool kvm_arch_stop_on_emulation_error(CPUState *env);
 int kvm_check_extension(KVMState *s, unsigned int extension);
 
 uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function,
-  int reg);
+  uint32_t index, int reg);
 void kvm_cpu_synchronize_state(CPUState *env);
 void kvm_cpu_synchronize_post_reset(CPUState *env);
 void kvm_cpu_synchronize_post_init(CPUState *env);
diff --git a/qemu-kvm-x86.c b/qemu-kvm-x86.c
index 95b7aa5..1eb8768 100644
--- a/qemu-kvm-x86.c
+++ b/qemu-kvm-x86.c
@@ -1188,18 +1188,18 @@ int kvm_arch_init_vcpu(CPUState *cenv)
 #endif
 
 kvm_trim_features(cenv-cpuid_features,
-  kvm_arch_get_supported_cpuid(cenv, 1, R_EDX));
+  kvm_arch_get_supported_cpuid(cenv, 1, 0, R_EDX));
 
 /* prevent the hypervisor bit from being cleared by the kernel */
 i = cenv-cpuid_ext_features  CPUID_EXT_HYPERVISOR;
 kvm_trim_features(cenv-cpuid_ext_features,
-  kvm_arch_get_supported_cpuid(cenv, 1, R_ECX));
+  kvm_arch_get_supported_cpuid(cenv, 1, 0, R_ECX));
 cenv-cpuid_ext_features |= i;
 
 kvm_trim_features(cenv-cpuid_ext2_features,
-  kvm_arch_get_supported_cpuid(cenv, 0x8001, R_EDX));
+  kvm_arch_get_supported_cpuid(cenv, 0x8001, 0, 
R_EDX));
 kvm_trim_features(cenv-cpuid_ext3_features,
-  kvm_arch_get_supported_cpuid(cenv, 0x8001, R_ECX));
+  kvm_arch_get_supported_cpuid(cenv, 0x8001, 0, 
R_ECX));
 
 copy = *cenv;
 
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 87c1133..626cac6 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -71,7 +71,8 @@ static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
 return cpuid;
 }
 
-uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function, int 
reg)
+uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function,
+  uint32_t index, int reg)
 {
 struct kvm_cpuid2 *cpuid;
 int i, max;
@@ -88,7 +89,8 @@ uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t 
function, int reg)
 }
 
 for (i = 0; i  cpuid-nent; ++i) {
-if (cpuid-entries[i].function == function) {
+if (cpuid-entries[i].function == function 
+cpuid-entries[i].index == index) {
 switch (reg) {
 case R_EAX:
 ret = cpuid-entries[i].eax;
@@ -110,7 +112,7 @@ uint32_t kvm_arch_get_supported_cpuid(CPUState *env, 
uint32_t function, int reg)
 /* On Intel, kvm returns cpuid according to the Intel spec,
  * so add missing bits according to the AMD spec:
  */
-cpuid_1_edx = kvm_arch_get_supported_cpuid(env, 1, R_EDX);
+cpuid_1_edx = kvm_arch_get_supported_cpuid(env, 1, 0, 
R_EDX);
 ret |= cpuid_1_edx  0x183f7ff;
 break;
 }
@@ -126,7 +128,8 @@ uint32_t kvm_arch_get_supported_cpuid(CPUState *env, 
uint32_t function, int reg)
 
 #else
 
-uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function, int 
reg)
+uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function,
+  uint32_t index, int reg)
 {
 return -1U;
 }
@@ -180,16 +183,16 @@ int kvm_arch_init_vcpu(CPUState *env)
 
 env-mp_state = KVM_MP_STATE_RUNNABLE;
 
-env-cpuid_features = kvm_arch_get_supported_cpuid(env, 1, R_EDX);
+env-cpuid_features = kvm_arch_get_supported_cpuid(env, 1, 0, R_EDX);
 
 i = env-cpuid_ext_features  CPUID_EXT_HYPERVISOR;
-env-cpuid_ext_features = kvm_arch_get_supported_cpuid(env, 1, R_ECX);
+env-cpuid_ext_features = kvm_arch_get_supported_cpuid(env, 1, 0, R_ECX);
 env-cpuid_ext_features |= i;
 
 env-cpuid_ext2_features = kvm_arch_get_supported_cpuid(env, 0x8001,
- R_EDX);
+ 0, R_EDX);
 env-cpuid_ext3_features = kvm_arch_get_supported_cpuid(env, 0x8001,
- R_ECX);
+ 0, R_ECX);
 
 cpuid_i = 0;
 
-- 
1.7.0.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] qemu: Enable XSAVE related CPUID

2010-05-31 Thread Sheng Yang
We can support it in KVM now. The 0xd leaf is queried from KVM.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 target-i386/cpuid.c |   21 +
 1 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/target-i386/cpuid.c b/target-i386/cpuid.c
index eebf038..ec47950 100644
--- a/target-i386/cpuid.c
+++ b/target-i386/cpuid.c
@@ -1067,6 +1067,27 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 *ecx = 0;
 *edx = 0;
 break;
+case 0xD:
+/* Processor Extended State */
+if (!(env-cpuid_ext_features  CPUID_EXT_XSAVE)) {
+*eax = 0;
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
+break;
+}
+if (kvm_enabled()) {
+*eax = kvm_arch_get_supported_cpuid(env, 0xd, count, R_EAX);
+*ebx = kvm_arch_get_supported_cpuid(env, 0xd, count, R_EBX);
+*ecx = kvm_arch_get_supported_cpuid(env, 0xd, count, R_ECX);
+*edx = kvm_arch_get_supported_cpuid(env, 0xd, count, R_EDX);
+} else {
+*eax = 0;
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
+}
+break;
 case 0x8000:
 *eax = env-cpuid_xlevel;
 *ebx = env-cpuid_vendor1;
-- 
1.7.0.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/5] KVM: MMU: introduce some macros to cleanup hlist traverseing

2010-05-31 Thread Xiao Guangrong


Avi Kivity wrote:
 On 05/31/2010 05:00 AM, Xiao Guangrong wrote:

 
 +
 +#define for_each_gfn_indirect_sp(kvm, sp, gfn, pos, n)\
 +  hlist_for_each_entry_safe(sp, pos, n,\
 +kvm-arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)\
 +if (sp-gfn == gfn   !sp-role.direct)
 +
 +#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos, n)\
 +  hlist_for_each_entry_safe(sp, pos, n,\
 +kvm-arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)\
 +if (sp-gfn == gfn   !sp-role.direct \
 +!sp-role.invalid)


 Shouldn't we always skip invalid gfns?
  
 Actually, in kvm_mmu_unprotect_page() function, it need find out
 invalid shadow pages:

 |hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
 |if (sp-gfn == gfn  !sp-role.direct) {
 |pgprintk(%s: gfn %lx role %x\n, __func__, gfn,
 | sp-role.word);
 |r = 1;
 |if (kvm_mmu_zap_page(kvm, sp))
 |goto restart;
 |}

 I'm not sure whether we can skip invalid sp here, since it can change
 this
 function's return value. :-(

 
 Hm.  Invalid pages don't need to be write protected.  So I think you can
 patch unprotect_page() to ignore invalid pages, and then you can convert
 it to the new macros which ignore invalid pages as well.
 
 The invariant is: if an sp exists with !role.invalid and !unsync, then
 the page must be write protected.

OK, will fix it in the next version.

 
 
 What about providing both gfn and role to the macro?

  
 In current code, no code simply use role and gfn to find sp,
 in kvm_mmu_get_page(), we need do other work for
 'sp-gfn == gfn  sp-role != role' sp, and other functions only need
 compare
 some members in role, but not all members.

 
 How about just gfn?  I think everything compares against that!
 

In this patch, it already introduced a macro to only compares 'gfn', that is:

+#define for_each_gfn_sp(kvm, sp, gfn, pos, n)  \
+  hlist_for_each_entry_safe(sp, pos, n,\
+   kvm-arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)\
+   if (sp-gfn == gfn)

Sorry if i misunderstand your meaning.

Thanks,
Xiao
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/21] arch/powerpc: Remove unnecessary kmalloc casts

2010-05-31 Thread Joe Perches
Signed-off-by: Joe Perches j...@perches.com
---
 arch/powerpc/kernel/nvram_64.c |3 +--
 arch/powerpc/kvm/book3s.c  |4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 9cf197f..99c6dab 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -502,8 +502,7 @@ static int __init nvram_scan_partitions(void)
   detected: 0-length partition\n);
goto out;
}
-   tmp_part = (struct nvram_partition *)
-   kmalloc(sizeof(struct nvram_partition), GFP_KERNEL);
+   tmp_part = kmalloc(sizeof(struct nvram_partition), GFP_KERNEL);
err = -ENOMEM;
if (!tmp_part) {
printk(KERN_ERR nvram_scan_partitions: kmalloc 
failed\n);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index b998abf..2a8b9b6 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1248,8 +1248,8 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, 
unsigned int id)
 
memset(vcpu_book3s, 0, sizeof(struct kvmppc_vcpu_book3s));
 
-   vcpu_book3s-shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *)
-   kzalloc(sizeof(*vcpu_book3s-shadow_vcpu), GFP_KERNEL);
+   vcpu_book3s-shadow_vcpu = kzalloc(sizeof(*vcpu_book3s-shadow_vcpu),
+  GFP_KERNEL);
if (!vcpu_book3s-shadow_vcpu)
goto free_vcpu;
 
-- 
1.7.0.3.311.g6a6955

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


KVM call agenda for June 1

2010-05-31 Thread Chris Wright
Please send in any agenda items you are interested in covering.

If we have a lack of agenda items I'll cancel the week's call.

thanks,
-chris
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: host panic on kernel 2.6.34

2010-05-31 Thread Hao, Xudong
Maciej Rutecki wrote:
 On poniedziałek, 24 maja 2010 o 10:23:11 Hao, Xudong wrote:
 Hi all
 I build latest kvm 37dec075a7854f0f550540bf3b9bbeef37c11e2a, based on
  kernel 2.6.34, after kvm and kvm_intel module loaded, then
  /etc/init.d/kvm start, a few minutes later, the system will panic.
 
 
 I created a Bugzilla entry at
 https://bugzilla.kernel.org/show_bug.cgi?id=16082
 for your bug report, please add your address to the CC list in there,
 thanks! 

Thanks, Maciej. I register a account and CC myself in there.

-Xudong
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Issues with getting PCI pass-through to work

2010-05-31 Thread Adhyas Avasthi
Tried working on it a bit more. Now it complains that it cannot find IOMMU.
This is an Intel Montevina chipset with DMA remapping support (VT-d).
Any pointers as to why it is not finding the support?
The message is coming from hw/device-assignment.c after the call to
kvm_check_extension for IOMMU fails.
The corresponding ioctl code in kvm-kmod lands in iommu_found(), which
I believe is reporting false.

I am wondering if kvm-kmod driver has support for Intel VT-d (Intel
Cantiga chipsets etc) ? Can PCI pass-through work without IOMMU?

Thanks,
Adhyas


On Sun, May 30, 2010 at 6:19 PM, Adhyas Avasthi adh...@gmail.com wrote:
 I have a PCI WLAN controller that I wish to pass-through to the
 standard linux-img VM (downloaded from qemu website).
 When I try qemu-kvm -pcidevice host=0c:00.0,name=abcd linux-0.2.img
 The VM does boot up but does not see the device.

 On the command prompt, I see the following message:
 Failed to assign irq for abcd: Operation not supported
 Perhaps you are assigning a device that shares an IRQ with another device?

 I want to know what does this message exactly mean? Is PCI
 pass-through having the requirement of reconfiguring the IRQ of my
 host devices before I can pass them through? Can I configure qemu-kvm
 somehow to emulate the IRQ (and other registers) instead of trying to
 pass-through the IRQ etc?

 BTW, my box has VT and VT-d enabled in BIOS.

 --
 Adhyas
 
 Two types have compatible type if their types are the same.
    — ANSI C Standard, 3.1.2.6.
 




-- 
Adhyas

Two types have compatible type if their types are the same.
— ANSI C Standard, 3.1.2.6.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] KVM: PPC: elide struct thread_struct instances from stack

2010-05-31 Thread Andreas Schwab
Instead of instantiating a whole thread_struct on the stack use only the
required parts of it.

Signed-off-by: Andreas Schwab sch...@linux-m68k.org
Tested-by: Alexander Graf ag...@suse.de
---
 arch/powerpc/include/asm/kvm_fpu.h   |   27 +
 arch/powerpc/kernel/ppc_ksyms.c  |4 -
 arch/powerpc/kvm/book3s.c|   49 +---
 arch/powerpc/kvm/book3s_paired_singles.c |   94 --
 arch/powerpc/kvm/fpu.S   |   18 ++
 5 files changed, 97 insertions(+), 95 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_fpu.h 
b/arch/powerpc/include/asm/kvm_fpu.h
index 94f05de..c3d4f05 100644
--- a/arch/powerpc/include/asm/kvm_fpu.h
+++ b/arch/powerpc/include/asm/kvm_fpu.h
@@ -22,24 +22,24 @@
 
 #include linux/types.h
 
-extern void fps_fres(struct thread_struct *t, u32 *dst, u32 *src1);
-extern void fps_frsqrte(struct thread_struct *t, u32 *dst, u32 *src1);
-extern void fps_fsqrts(struct thread_struct *t, u32 *dst, u32 *src1);
+extern void fps_fres(u64 *fpscr, u32 *dst, u32 *src1);
+extern void fps_frsqrte(u64 *fpscr, u32 *dst, u32 *src1);
+extern void fps_fsqrts(u64 *fpscr, u32 *dst, u32 *src1);
 
-extern void fps_fadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fdivs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fmuls(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fdivs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fmuls(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
 
-extern void fps_fmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fmadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
   u32 *src3);
-extern void fps_fmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fmsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
   u32 *src3);
-extern void fps_fnmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 
*src2,
+extern void fps_fnmadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
u32 *src3);
-extern void fps_fnmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 
*src2,
+extern void fps_fnmsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
u32 *src3);
-extern void fps_fsel(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fsel(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
 u32 *src3);
 
 #define FPD_ONE_IN(name) extern void fpd_ ## name(u64 *fpscr, u32 *cr, \
@@ -82,4 +82,7 @@ FPD_THREE_IN(fmadd)
 FPD_THREE_IN(fnmsub)
 FPD_THREE_IN(fnmadd)
 
+extern void kvm_cvt_fd(u32 *from, u64 *to, u64 *fpscr);
+extern void kvm_cvt_df(u64 *from, u32 *to, u64 *fpscr);
+
 #endif
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index bc9f39d..ab3e392 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -101,10 +101,6 @@ EXPORT_SYMBOL(pci_dram_offset);
 EXPORT_SYMBOL(start_thread);
 EXPORT_SYMBOL(kernel_thread);
 
-#ifndef CONFIG_BOOKE
-EXPORT_SYMBOL_GPL(cvt_df);
-EXPORT_SYMBOL_GPL(cvt_fd);
-#endif
 EXPORT_SYMBOL(giveup_fpu);
 #ifdef CONFIG_ALTIVEC
 EXPORT_SYMBOL(giveup_altivec);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index b998abf..3fea19d 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1309,12 +1309,17 @@ extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, 
struct kvm_vcpu *vcpu);
 int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
int ret;
-   struct thread_struct ext_bkp;
+   double fpr[32][TS_FPRWIDTH];
+   unsigned int fpscr;
+   int fpexc_mode;
 #ifdef CONFIG_ALTIVEC
-   bool save_vec = current-thread.used_vr;
+   vector128 vr[32];
+   vector128 vscr;
+   unsigned long uninitialized_var(vrsave);
+   int used_vr;
 #endif
 #ifdef CONFIG_VSX
-   bool save_vsx = current-thread.used_vsr;
+   int used_vsr;
 #endif
ulong ext_msr;
 
@@ -1327,27 +1332,27 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct 
kvm_vcpu *vcpu)
/* Save FPU state in stack */
if (current-thread.regs-msr  MSR_FP)
giveup_fpu(current);
-   memcpy(ext_bkp.fpr, current-thread.fpr, sizeof(current-thread.fpr));
-   ext_bkp.fpscr = current-thread.fpscr;
-   ext_bkp.fpexc_mode = current-thread.fpexc_mode;
+   memcpy(fpr, current-thread.fpr, sizeof(current-thread.fpr));
+   fpscr = current-thread.fpscr.val;
+   fpexc_mode = current-thread.fpexc_mode;
 
 #ifdef CONFIG_ALTIVEC
/* Save Altivec state in stack */
-   if (save_vec) {
+   used_vr = current-thread.used_vr;
+   if (used_vr) {
if 

Re: [PATCH] KVM: PPC: elide struct thread_struct instances from stack

2010-05-31 Thread Alexander Graf

On 31.05.2010, at 21:59, Andreas Schwab wrote:

 Instead of instantiating a whole thread_struct on the stack use only the
 required parts of it.
 
 Signed-off-by: Andreas Schwab sch...@linux-m68k.org
 Tested-by: Alexander Graf ag...@suse.de

Avi or Marcelo, please pull this in through kvm.git.

Signed-off-by: Alexander Graf ag...@suse.de


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: PPC: elide struct thread_struct instances from stack

2010-05-31 Thread Paul Mackerras
On Mon, May 31, 2010 at 09:59:13PM +0200, Andreas Schwab wrote:

 Instead of instantiating a whole thread_struct on the stack use only the
 required parts of it.

...

 +_GLOBAL(kvm_cvt_fd)
 + lfd 0,0(r5) /* load up fpscr value */
 + MTFSF_L(0)
 + lfs 0,0(r3)
 + stfd0,0(r4)
 + mffs0
 + stfd0,0(r5) /* save new fpscr value */
 + blr
 +
 +_GLOBAL(kvm_cvt_df)
 + lfd 0,0(r5) /* load up fpscr value */
 + MTFSF_L(0)
 + lfd 0,0(r3)
 + stfs0,0(r4)
 + mffs0
 + stfd0,0(r5) /* save new fpscr value */
 + blr

I re-read the relevant part of the PowerPC architecture spec
yesterday, and it seems pretty clear that the FPSCR doesn't affect the
behaviour of lfs and stfs, and is not affected by them.  So in fact 4
out of the 7 instructions in each of those procedures are unnecessary
(and similarly for the cvt_fd/df used in the alignment fixup code).

Paul.
--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: PPC: elide struct thread_struct instances from stack

2010-05-31 Thread Alexander Graf

On 01.06.2010, at 00:40, Paul Mackerras wrote:

 On Mon, May 31, 2010 at 09:59:13PM +0200, Andreas Schwab wrote:
 
 Instead of instantiating a whole thread_struct on the stack use only the
 required parts of it.
 
 ...
 
 +_GLOBAL(kvm_cvt_fd)
 +lfd 0,0(r5) /* load up fpscr value */
 +MTFSF_L(0)
 +lfs 0,0(r3)
 +stfd0,0(r4)
 +mffs0
 +stfd0,0(r5) /* save new fpscr value */
 +blr
 +
 +_GLOBAL(kvm_cvt_df)
 +lfd 0,0(r5) /* load up fpscr value */
 +MTFSF_L(0)
 +lfd 0,0(r3)
 +stfs0,0(r4)
 +mffs0
 +stfd0,0(r5) /* save new fpscr value */
 +blr
 
 I re-read the relevant part of the PowerPC architecture spec
 yesterday, and it seems pretty clear that the FPSCR doesn't affect the
 behaviour of lfs and stfs, and is not affected by them.  So in fact 4
 out of the 7 instructions in each of those procedures are unnecessary
 (and similarly for the cvt_fd/df used in the alignment fixup code).

So the rounding control field is not used on lfs? Interesting. I couldn't find 
a reference to it being used or modified either though.

Alex

--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/21] arch/powerpc: Remove unnecessary kmalloc casts

2010-05-31 Thread Joe Perches
Signed-off-by: Joe Perches j...@perches.com
---
 arch/powerpc/kernel/nvram_64.c |3 +--
 arch/powerpc/kvm/book3s.c  |4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 9cf197f..99c6dab 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -502,8 +502,7 @@ static int __init nvram_scan_partitions(void)
   detected: 0-length partition\n);
goto out;
}
-   tmp_part = (struct nvram_partition *)
-   kmalloc(sizeof(struct nvram_partition), GFP_KERNEL);
+   tmp_part = kmalloc(sizeof(struct nvram_partition), GFP_KERNEL);
err = -ENOMEM;
if (!tmp_part) {
printk(KERN_ERR nvram_scan_partitions: kmalloc 
failed\n);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index b998abf..2a8b9b6 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1248,8 +1248,8 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, 
unsigned int id)
 
memset(vcpu_book3s, 0, sizeof(struct kvmppc_vcpu_book3s));
 
-   vcpu_book3s-shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *)
-   kzalloc(sizeof(*vcpu_book3s-shadow_vcpu), GFP_KERNEL);
+   vcpu_book3s-shadow_vcpu = kzalloc(sizeof(*vcpu_book3s-shadow_vcpu),
+  GFP_KERNEL);
if (!vcpu_book3s-shadow_vcpu)
goto free_vcpu;
 
-- 
1.7.0.3.311.g6a6955

--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html