[PATCH net v2] vhost: fix ref cnt checking deadlock

2014-02-13 Thread Michael S. Tsirkin
vhost checked the counter within the refcnt before decrementing.  It
really wanted to know that it is the one that has the last reference, as
a way to batch freeing resources a bit more efficiently.

Note: we only let refcount go to 0 on device release.

This works well but we now access the ref counter twice so there's a
race: all users might see a high count and decide to defer freeing
resources.
In the end no one initiates freeing resources until the last reference
is gone (which is on VM shutdown, so it might happen after a long time).

Let's do what we probably should have done straight away:
switch from kref to plain atomic, documenting the
semantics, return the refcount value atomically after decrement,
then use that to avoid the deadlock.
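
To make the race concrete, here is a small userspace sketch of the two put()
styles (plain C11 atomics standing in for the kernel's atomic_t; this is an
editor's illustration, not the patch below):

/*
 * With the old scheme each caller reads the counter and then decrements it
 * separately, so two callers can both observe a "high" value and both decide
 * to defer cleanup.  Returning the post-decrement value from the put makes
 * the observation and the decrement one atomic step.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int refcount = 3;			/* device ref + 2 outstanding ubufs */

/* old pattern: racy read followed by an independent decrement */
static int put_racy(void)
{
	int seen = atomic_load(&refcount);	/* two callers may both see 3 */
	atomic_fetch_sub(&refcount, 1);
	return seen;				/* decision uses a stale value */
}

/* new pattern: the decrement itself reports the remaining count */
static int put_atomic(void)
{
	return atomic_fetch_sub(&refcount, 1) - 1;
}

int main(void)
{
	printf("racy put saw %d; atomic put reports %d left\n",
	       put_racy(), put_atomic());
	return 0;
}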

Reported-by: Qin Chuanyu qinchua...@huawei.com
Signed-off-by: Michael S. Tsirkin m...@redhat.com
---

This patch is needed for 3.14 and -stable.

 drivers/vhost/net.c | 41 -
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 831eb4f..b12176f 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -70,7 +70,12 @@ enum {
 };
 
 struct vhost_net_ubuf_ref {
-   struct kref kref;
+   /* refcount follows semantics similar to kref:
+*  0: object is released
+*  1: no outstanding ubufs
+* >1: outstanding ubufs
+*/
+   atomic_t refcount;
wait_queue_head_t wait;
struct vhost_virtqueue *vq;
 };
@@ -116,14 +121,6 @@ static void vhost_net_enable_zcopy(int vq)
	vhost_net_zcopy_mask |= 0x1 << vq;
 }
 
-static void vhost_net_zerocopy_done_signal(struct kref *kref)
-{
-   struct vhost_net_ubuf_ref *ubufs;
-
-   ubufs = container_of(kref, struct vhost_net_ubuf_ref, kref);
-   wake_up(&ubufs->wait);
-}
-
 static struct vhost_net_ubuf_ref *
 vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
 {
@@ -134,21 +131,24 @@ vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
if (!ubufs)
return ERR_PTR(-ENOMEM);
-   kref_init(&ubufs->kref);
+   atomic_set(&ubufs->refcount, 1);
	init_waitqueue_head(&ubufs->wait);
	ubufs->vq = vq;
return ubufs;
 }
 
-static void vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
+static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
 {
-   kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal);
+   int r = atomic_sub_return(1, &ubufs->refcount);
+   if (unlikely(!r))
+           wake_up(&ubufs->wait);
+   return r;
 }
 
 static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
 {
-   kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal);
-   wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
+   vhost_net_ubuf_put(ubufs);
+   wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
 }
 
 static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
@@ -306,22 +306,21 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
 {
	struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
	struct vhost_virtqueue *vq = ubufs->vq;
-   int cnt = atomic_read(&ubufs->kref.refcount);
+   int cnt;
 
/* set len to mark this desc buffers done DMA */
	vq->heads[ubuf->desc].len = success ?
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
-   vhost_net_ubuf_put(ubufs);
+   cnt = vhost_net_ubuf_put(ubufs);
 
/*
 * Trigger polling thread if guest stopped submitting new buffers:
-* in this case, the refcount after decrement will eventually reach 1
-* so here it is 2.
+* in this case, the refcount after decrement will eventually reach 1.
 * We also trigger polling periodically after each 16 packets
 * (the value 16 here is more or less arbitrary, it's tuned to trigger
 * less than 10% of times).
 */
-   if (cnt <= 2 || !(cnt % 16))
+   if (cnt <= 1 || !(cnt % 16))
		vhost_poll_queue(&vq->poll);
 }
 
@@ -420,7 +419,7 @@ static void handle_tx(struct vhost_net *net)
msg.msg_control = ubuf;
msg.msg_controllen = sizeof(ubuf);
ubufs = nvq-ubufs;
-   kref_get(&ubufs->kref);
+   atomic_inc(&ubufs->refcount);
	nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
} else {
msg.msg_control = NULL;
@@ -785,7 +784,7 @@ static void vhost_net_flush(struct vhost_net *n)
	vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
	mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
	n->tx_flush = false;
-   kref_init(&n->vqs[VHOST_NET_VQ_TX].ubufs->kref);
+   atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1);

[PATCH net v2] vhost: fix a theoretical race in device cleanup

2014-02-13 Thread Michael S. Tsirkin
vhost_zerocopy_callback accesses VQ right after it drops a ubuf
reference.  In theory, this could race with device removal which waits
on the ubuf kref, and crash on use after free.

Do all accesses within rcu read side critical section, and synchronize
on release.

Since callbacks are always invoked from bh, synchronize_rcu_bh seems
enough and will help release complete a bit faster.
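
A minimal sketch of the pattern this describes (assumed kernel context,
<linux/rcupdate.h>; the _sketch names are illustrative, not from the patch):

/*
 * The callback only ever runs in bottom-half context, so a BH RCU read
 * section around its virtqueue accesses is enough for the release path
 * to wait out any in-flight callback with synchronize_rcu_bh().
 */
static void zerocopy_callback_sketch(struct vhost_net_ubuf_ref *ubufs)
{
	rcu_read_lock_bh();
	/* ... safe to dereference ubufs->vq here ... */
	rcu_read_unlock_bh();
}

static void device_release_sketch(void)
{
	synchronize_rcu_bh();	/* all earlier BH read sections have ended */
	/* ... now the virtqueue memory can be flushed and freed ... */
}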

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---

This was previously posted as part of a patch
series, but it's really an independent fix.
Theoretical race so not needed for stable I think.

changes from v1:
fixed typo in commit log

 drivers/vhost/net.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index b12176f..f1be80d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -308,6 +308,8 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
	struct vhost_virtqueue *vq = ubufs->vq;
int cnt;
 
+   rcu_read_lock_bh();
+
/* set len to mark this desc buffers done DMA */
	vq->heads[ubuf->desc].len = success ?
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
@@ -322,6 +324,8 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
 */
	if (cnt <= 1 || !(cnt % 16))
		vhost_poll_queue(&vq->poll);
+
+   rcu_read_unlock_bh();
 }
 
 /* Expects to be always run from workqueue - which acts as
@@ -804,6 +808,8 @@ static int vhost_net_release(struct inode *inode, struct file *f)
	fput(tx_sock->file);
	if (rx_sock)
		fput(rx_sock->file);
+   /* Make sure no callbacks are outstanding */
+   synchronize_rcu_bh();
/* We do an extra flush before freeing memory,
 * since jobs can re-queue themselves. */
vhost_net_flush(n);
-- 
MST


Re: [virtio-dev] [PATCH net v2] vhost: fix a theoretical race in device cleanup

2014-02-13 Thread Jason Wang
On 02/13/2014 05:45 PM, Michael S. Tsirkin wrote:
 vhost_zerocopy_callback accesses VQ right after it drops a ubuf
 reference.  In theory, this could race with device removal which waits
 on the ubuf kref, and crash on use after free.

 Do all accesses within rcu read side critical section, and synchronize
 on release.

 Since callbacks are always invoked from bh, synchronize_rcu_bh seems
 enough and will help release complete a bit faster.

 Signed-off-by: Michael S. Tsirkin m...@redhat.com
 ---

 This is was previously posted as part of patch
 series, but it's an independent fix really.
 Theoretical race so not needed for stable I think.

 changes from v1:
   fixed typo in commit log

  drivers/vhost/net.c | 6 ++
  1 file changed, 6 insertions(+)

 diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 index b12176f..f1be80d 100644
 --- a/drivers/vhost/net.c
 +++ b/drivers/vhost/net.c
 @@ -308,6 +308,8 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
 	struct vhost_virtqueue *vq = ubufs->vq;
 	int cnt;
  
 + rcu_read_lock_bh();
 +
   /* set len to mark this desc buffers done DMA */
 	vq->heads[ubuf->desc].len = success ?
   VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
 @@ -322,6 +324,8 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
*/
 	if (cnt <= 1 || !(cnt % 16))
 		vhost_poll_queue(&vq->poll);
 +
 + rcu_read_unlock_bh();
  }
  
  /* Expects to be always run from workqueue - which acts as
 @@ -804,6 +808,8 @@ static int vhost_net_release(struct inode *inode, struct file *f)
 	fput(tx_sock->file);
 	if (rx_sock)
 		fput(rx_sock->file);
 + /* Make sure no callbacks are outstanding */
 + synchronize_rcu_bh();
   /* We do an extra flush before freeing memory,
* since jobs can re-queue themselves. */
   vhost_net_flush(n);

Acked-by: Jason Wang jasow...@redhat.com


Re: [PATCH net v2] vhost: fix ref cnt checking deadlock

2014-02-13 Thread Jason Wang
On 02/13/2014 05:42 PM, Michael S. Tsirkin wrote:
 vhost checked the counter within the refcnt before decrementing.  It
 really wanted to know that it is the one that has the last reference, as
 a way to batch freeing resources a bit more efficiently.

 Note: we only let refcount go to 0 on device release.

 This works well but we now access the ref counter twice so there's a
 race: all users might see a high count and decide to defer freeing
 resources.
 In the end no one initiates freeing resources until the last reference
 is gone (which is on VM shutdown, so it might happen after a long time).

 Let's do what we probably should have done straight away:
 switch from kref to plain atomic, documenting the
 semantics, return the refcount value atomically after decrement,
 then use that to avoid the deadlock.

 Reported-by: Qin Chuanyu qinchua...@huawei.com
 Signed-off-by: Michael S. Tsirkin m...@redhat.com
 ---

 This patch is needed for 3.14 and -stable.

  drivers/vhost/net.c | 41 -
  1 file changed, 20 insertions(+), 21 deletions(-)

 diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 index 831eb4f..b12176f 100644
 --- a/drivers/vhost/net.c
 +++ b/drivers/vhost/net.c
 @@ -70,7 +70,12 @@ enum {
  };
  
  struct vhost_net_ubuf_ref {
 - struct kref kref;
 + /* refcount follows semantics similar to kref:
 +  *  0: object is released
 +  *  1: no outstanding ubufs
 +  * >1: outstanding ubufs
 +  */
 + atomic_t refcount;
   wait_queue_head_t wait;
   struct vhost_virtqueue *vq;
  };
 @@ -116,14 +121,6 @@ static void vhost_net_enable_zcopy(int vq)
 	vhost_net_zcopy_mask |= 0x1 << vq;
  }
  
 -static void vhost_net_zerocopy_done_signal(struct kref *kref)
 -{
 - struct vhost_net_ubuf_ref *ubufs;
 -
 - ubufs = container_of(kref, struct vhost_net_ubuf_ref, kref);
 - wake_up(&ubufs->wait);
 -}
 -
  static struct vhost_net_ubuf_ref *
  vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
  {
 @@ -134,21 +131,24 @@ vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
   ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
   if (!ubufs)
   return ERR_PTR(-ENOMEM);
 - kref_init(&ubufs->kref);
 + atomic_set(&ubufs->refcount, 1);
 	init_waitqueue_head(&ubufs->wait);
 	ubufs->vq = vq;
   return ubufs;
  }
  
 -static void vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
 +static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
  {
 - kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal);
 + int r = atomic_sub_return(1, &ubufs->refcount);
 + if (unlikely(!r))
 + 	wake_up(&ubufs->wait);
 + return r;
  }
  
  static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
  {
 - kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal);
 - wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
 + vhost_net_ubuf_put(ubufs);
 + wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
  }
  
  static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
 @@ -306,22 +306,21 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
  {
 	struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
 	struct vhost_virtqueue *vq = ubufs->vq;
 - int cnt = atomic_read(&ubufs->kref.refcount);
 + int cnt;
  
   /* set len to mark this desc buffers done DMA */
 	vq->heads[ubuf->desc].len = success ?
   VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
 - vhost_net_ubuf_put(ubufs);
 + cnt = vhost_net_ubuf_put(ubufs);
  
   /*
* Trigger polling thread if guest stopped submitting new buffers:
 -  * in this case, the refcount after decrement will eventually reach 1
 -  * so here it is 2.
 +  * in this case, the refcount after decrement will eventually reach 1.
* We also trigger polling periodically after each 16 packets
* (the value 16 here is more or less arbitrary, it's tuned to trigger
* less than 10% of times).
*/
 - if (cnt <= 2 || !(cnt % 16))
 + if (cnt <= 1 || !(cnt % 16))
 		vhost_poll_queue(&vq->poll);
  }
  
 @@ -420,7 +419,7 @@ static void handle_tx(struct vhost_net *net)
   msg.msg_control = ubuf;
   msg.msg_controllen = sizeof(ubuf);
 	ubufs = nvq->ubufs;
 - kref_get(&ubufs->kref);
 + atomic_inc(&ubufs->refcount);
 	nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
   } else {
   msg.msg_control = NULL;
 @@ -785,7 +784,7 @@ static void vhost_net_flush(struct vhost_net *n)
 	vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
 	mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
 	n->tx_flush = false;
 - kref_init(&n->vqs[VHOST_NET_VQ_TX].ubufs->kref);
 + 

Re: [RFC 2/2] xen-netback: disable multicast and use a random hw MAC address

2014-02-13 Thread Ian Campbell
On Wed, 2014-02-12 at 14:05 -0800, Luis R. Rodriguez wrote:
  I meant the PV protocol extension which allows guests (netfront) to
  register to receive multicast frames across the PV ring -- i.e. for
  multicast to work from the guests PoV.
 
 Not quite sure I understand: ipv6 works on guests so multicast works,
 so it's unclear what you mean by multicast frames across the PV ring.
 Is there any code or documents I can look at?

xen/include/public/io/netif.h talks about 'feature-multicast-control'
and XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}.

Looking at it now, in the absence of those flooding is the
default...

  (maybe that was just an optimisation though and the default is to flood
  everything, it was a long time ago)
 
 From a networking perspective everything is being flooded as I've seen
 it so far.

... which is why it works ;-)

Ian.



Re: [qemu64,+smep,+smap] Kernel panic - not syncing: No working init found.

2014-02-13 Thread H. Peter Anvin
On 02/13/2014 04:45 AM, Fengguang Wu wrote:
 Greetings,
 
 I find that when running
 
 qemu-system-x86_64 -cpu qemu64,+smep,+smap
 
 Some kernels will 100% produce this error, where the error code
 -13,-14 are -EACCES and -EFAULT:
 
 Any ideas?
 

I notice this is a non-SMAP kernel:

# CONFIG_X86_SMAP is not set

If the kernel turns on SMAP in CR4 even though SMAP isn't enabled in the
kernel, that is a kernel bug.  If Qemu enforces SMAP even if it is
turned off in CR4, that would be a Qemu bug.  I have reproduced the
failure locally and am considering both possibilities now.

-hpa



Re: [qemu64,+smep,+smap] Kernel panic - not syncing: No working init found.

2014-02-13 Thread H. Peter Anvin
On 02/13/2014 06:55 AM, H. Peter Anvin wrote:
 On 02/13/2014 04:45 AM, Fengguang Wu wrote:
 Greetings,

 I find that when running

 qemu-system-x86_64 -cpu qemu64,+smep,+smap

 Some kernels will 100% produce this error, where the error code
 -13,-14 are -EACCES and -EFAULT:

 Any ideas?

 
 I notice this is a non-SMAP kernel:
 
 # CONFIG_X86_SMAP is not set
 
 If the kernel turns on SMAP in CR4 even though SMAP isn't enabled in the
 kernel, that is a kernel bug.  If Qemu enforces SMAP even if it is
 turned off in CR4, that would be a Qemu bug.  I have reproduced the
 failure locally and am considering both possibilities now.
 

So we do turn on the bit in CR4 even with SMAP compiled out.  This is a
bug.  However, I still get the same failure even with that bug fixed
(and qemu "info registers" verifies that it is, indeed, not set), so I'm
wondering if there is a bug in Qemu as well.  Still, staring at the
code in Qemu I don't see where that bug would be...

-hpa
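
A hedged sketch of the kind of guard being described (an assumption about the
shape of the fix, not the patch hpa eventually pushed; assumes 3.14-era x86
setup code where cpu_has(), set_in_cr4() and clear_in_cr4() from
<asm/processor.h> are available):

/* Only honour SMAP when both the CPU and the kernel config support it. */
static __init void setup_smap_sketch(struct cpuinfo_x86 *c)
{
	if (cpu_has(c, X86_FEATURE_SMAP)) {
		if (IS_ENABLED(CONFIG_X86_SMAP))
			set_in_cr4(X86_CR4_SMAP);	/* kernel built with SMAP: turn it on */
		else
			clear_in_cr4(X86_CR4_SMAP);	/* SMAP compiled out: keep CR4 bit clear */
	}
}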




Re: [qemu64,+smep,+smap] Kernel panic - not syncing: No working init found.

2014-02-13 Thread H. Peter Anvin
On 02/13/2014 06:55 AM, H. Peter Anvin wrote:
 On 02/13/2014 04:45 AM, Fengguang Wu wrote:
 Greetings,

 I find that when running

 qemu-system-x86_64 -cpu qemu64,+smep,+smap

 Some kernels will 100% produce this error, where the error code
 -13,-14 are -EACCES and -EFAULT:

 Any ideas?

 
 I notice this is a non-SMAP kernel:
 
 # CONFIG_X86_SMAP is not set
 
 If the kernel turns on SMAP in CR4 even though SMAP isn't enabled in the
 kernel, that is a kernel bug.  If Qemu enforces SMAP even if it is
 turned off in CR4, that would be a Qemu bug.  I have reproduced the
 failure locally and am considering both possibilities now.
 

No, it is simply a second kernel bug.  I have patches for both and will
push them momentarily.

-hpa




Re: Another preempt folding issue?

2014-02-13 Thread Stefan Bader
On 12.02.2014 12:54, Peter Zijlstra wrote:
 On Wed, Feb 12, 2014 at 12:09:29PM +0100, Stefan Bader wrote:
 Something else here I run a kernel with CONFIG_PREEMPT not set and NR_CPUS
 limited to 8 (for the 32bit kernel). So the default apic driver is used. 
 Since
 default_send_IPI_mask_logical is only used from there, I assume the trace you
 got does the same. Maybe something there is wrong which would explain why we
 only see it on 32bit hosts.
 
 Can you try with a different APIC driver to test this?
 
I don't think I can. And I think the statement about this only being used for 32bit
could be wrong. I got misled into thinking so because those are only defined in
probe_32, but the 64bit counterpart doesn't contain much aside from that.

Anyway, I played around with tracing a bit more. So with this change:

if (need_resched()) {
        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
        if (need_resched() != should_resched()) {
+               trace_printk("need(%i) != should(%i)\n",
+                            need_resched(), should_resched());
+               trace_printk("exit_reason=%u\n",
+                            vcpu->run->exit_reason);
+               trace_printk("preempt_count=%lx\n",
+                            __this_cpu_read_4(__preempt_count));
+               tracing_stop();
+               printk(KERN_ERR "Stopped tracing, due to inconsistent state.\n");
        }
+       schedule();
-       cond_resched();
        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}

I get the following (weird) output:

           Xorg-1078  [001] d... 71.270251: native_smp_send_reschedule <-resched_task
           Xorg-1078  [001] d... 71.270251: default_send_IPI_mask_logical <-native_smp_send_reschedule
     bamfdaemon-2318  [001] d... 71.270465: resched_task <-check_preempt_wakeup
     bamfdaemon-2318  [001] d... 71.270539: resched_task <-check_preempt_wakeup
         compiz-2365  [001] d... 71.270689: resched_task <-check_preempt_wakeup
         compiz-2365  [001] d... 71.270827: resched_task <-check_preempt_wakeup
         compiz-2365  [001] d... 71.270940: resched_task <-check_preempt_wakeup
 qemu-system-x86-2679  [000] dn.. 71.270999: smp_reschedule_interrupt <-reschedule_interrupt
 qemu-system-x86-2679  [000] dn.. 71.270999: scheduler_ipi <-smp_reschedule_interrupt
 qemu-system-x86-2679  [000] .N.. 71.271001: kvm_arch_vcpu_ioctl_run: need(1) != should(0)
 qemu-system-x86-2679  [000] .N.. 71.271002: kvm_arch_vcpu_ioctl_run: exit_reason=2
 qemu-system-x86-2679  [000] .N.. 71.271003: kvm_arch_vcpu_ioctl_run: preempt_count=0

So am I reading this right: the interrupt did get delivered to cpu#0 while
the thread info already had the resched flag set, so this really should have
cleared the bit in preempt_count. But while the trace info shows 'N', for some
reason should_resched() returns false, yet at the same time reading the preempt
count manually shows it as 0?






Re: Another preempt folding issue?

2014-02-13 Thread Peter Zijlstra
On Thu, Feb 13, 2014 at 06:00:19PM +0100, Stefan Bader wrote:
 On 12.02.2014 12:54, Peter Zijlstra wrote:
  On Wed, Feb 12, 2014 at 12:09:29PM +0100, Stefan Bader wrote:
  Something else here I run a kernel with CONFIG_PREEMPT not set and NR_CPUS
  limited to 8 (for the 32bit kernel). So the default apic driver is used. 
  Since
  default_send_IPI_mask_logical is only used from there, I assume the trace 
  you
  got does the same. Maybe something there is wrong which would explain why 
  we
  only see it on 32bit hosts.
  
  Can you try with a different APIC driver to test this?
  
 I don't think I can. And I think the statement about this only being used for 32bit
 could be wrong. I got misled into thinking so because those are only defined in
 probe_32, but the 64bit counterpart doesn't contain much aside from that.
 
 Anyway, I played around with tracing a bit more. So with this change:
 
 if (need_resched()) {
         srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
         if (need_resched() != should_resched()) {
 +               trace_printk("need(%i) != should(%i)\n",
 +                            need_resched(), should_resched());
 +               trace_printk("exit_reason=%u\n",
 +                            vcpu->run->exit_reason);
 +               trace_printk("preempt_count=%lx\n",
 +                            __this_cpu_read_4(__preempt_count));
 +               tracing_stop();
 +               printk(KERN_ERR "Stopped tracing, due to inconsistent state.\n");
         }
 +       schedule();
 -       cond_resched();
         vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 }
 
 I get the following (weird) output:
 
            Xorg-1078  [001] d... 71.270251: native_smp_send_reschedule <-resched_task
            Xorg-1078  [001] d... 71.270251: default_send_IPI_mask_logical <-native_smp_send_reschedule
      bamfdaemon-2318  [001] d... 71.270465: resched_task <-check_preempt_wakeup
      bamfdaemon-2318  [001] d... 71.270539: resched_task <-check_preempt_wakeup
          compiz-2365  [001] d... 71.270689: resched_task <-check_preempt_wakeup
          compiz-2365  [001] d... 71.270827: resched_task <-check_preempt_wakeup
          compiz-2365  [001] d... 71.270940: resched_task <-check_preempt_wakeup
  qemu-system-x86-2679  [000] dn.. 71.270999: smp_reschedule_interrupt <-reschedule_interrupt
  qemu-system-x86-2679  [000] dn.. 71.270999: scheduler_ipi <-smp_reschedule_interrupt
  qemu-system-x86-2679  [000] .N.. 71.271001: kvm_arch_vcpu_ioctl_run: need(1) != should(0)
  qemu-system-x86-2679  [000] .N.. 71.271002: kvm_arch_vcpu_ioctl_run: exit_reason=2
  qemu-system-x86-2679  [000] .N.. 71.271003: kvm_arch_vcpu_ioctl_run: preempt_count=0
 
 So am I reading this right: the interrupt did get delivered to cpu#0 while
 the thread info already had the resched flag set, so this really should have
 cleared the bit in preempt_count. But while the trace info shows 'N', for some
 reason should_resched() returns false, yet at the same time reading the preempt
 count manually shows it as 0?

*blink*... That's weird indeed... do you have the asm that goes along
with that?


Re: Another preempt folding issue?

2014-02-13 Thread Peter Zijlstra
On Thu, Feb 13, 2014 at 07:03:56PM +0100, Stefan Bader wrote:
 Yeah... not sure the interleaved source helps or not ...

It did, thanks!


Re: Another preempt folding issue?

2014-02-13 Thread Peter Zijlstra
On Thu, Feb 13, 2014 at 06:00:19PM +0100, Stefan Bader wrote:
 On 12.02.2014 12:54, Peter Zijlstra wrote:
  On Wed, Feb 12, 2014 at 12:09:29PM +0100, Stefan Bader wrote:
  Something else here I run a kernel with CONFIG_PREEMPT not set and NR_CPUS
  limited to 8 (for the 32bit kernel). So the default apic driver is used. 
  Since
  default_send_IPI_mask_logical is only used from there, I assume the trace 
  you
  got does the same. Maybe something there is wrong which would explain why 
  we
  only see it on 32bit hosts.
  
  Can you try with a different APIC driver to test this?
  
 I don't think I can. And I think the statement about this only being used for 32bit
 could be wrong. I got misled into thinking so because those are only defined in
 probe_32, but the 64bit counterpart doesn't contain much aside from that.
 
 Anyway, I played around with tracing a bit more. So with this change:
 
 if (need_resched()) {
         srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
         if (need_resched() != should_resched()) {
 +               trace_printk("need(%i) != should(%i)\n",
 +                            need_resched(), should_resched());
 +               trace_printk("exit_reason=%u\n",
 +                            vcpu->run->exit_reason);
 +               trace_printk("preempt_count=%lx\n",
 +                            __this_cpu_read_4(__preempt_count));
 +               tracing_stop();
 +               printk(KERN_ERR "Stopped tracing, due to inconsistent state.\n");
         }
 +       schedule();
 -       cond_resched();
         vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 }
 
 I get the following (weird) output:
 
            Xorg-1078  [001] d... 71.270251: native_smp_send_reschedule <-resched_task
            Xorg-1078  [001] d... 71.270251: default_send_IPI_mask_logical <-native_smp_send_reschedule
      bamfdaemon-2318  [001] d... 71.270465: resched_task <-check_preempt_wakeup
      bamfdaemon-2318  [001] d... 71.270539: resched_task <-check_preempt_wakeup
          compiz-2365  [001] d... 71.270689: resched_task <-check_preempt_wakeup
          compiz-2365  [001] d... 71.270827: resched_task <-check_preempt_wakeup
          compiz-2365  [001] d... 71.270940: resched_task <-check_preempt_wakeup
  qemu-system-x86-2679  [000] dn.. 71.270999: smp_reschedule_interrupt <-reschedule_interrupt
  qemu-system-x86-2679  [000] dn.. 71.270999: scheduler_ipi <-smp_reschedule_interrupt
  qemu-system-x86-2679  [000] .N.. 71.271001: kvm_arch_vcpu_ioctl_run: need(1) != should(0)
  qemu-system-x86-2679  [000] .N.. 71.271002: kvm_arch_vcpu_ioctl_run: exit_reason=2
  qemu-system-x86-2679  [000] .N.. 71.271003: kvm_arch_vcpu_ioctl_run: preempt_count=0
 
 So am I reading this right: the interrupt did get delivered to cpu#0 while
 the thread info already had the resched flag set, so this really should have
 cleared the bit in preempt_count. But while the trace info shows 'N', for some
 reason should_resched() returns false, yet at the same time reading the preempt
 count manually shows it as 0?

So the assembly merges the first and second should_resched(), so it's
possible that load happened before the interrupt.

The 3rd preempt_count load gets re-issued and so that would show the
'true' value again.

If you want to force a reload after the condition, put in a barrier().

In any case; this looks like a false-positive. Please try again until
you get one where the interrupt doesn't happen and we stay in 'n' state.
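
A sketch of the suggested barrier() placement in the debug hunk quoted above
(illustrative fragment, not a tested patch; assumes the same x86 3.x context):

	if (need_resched() != should_resched()) {
		barrier();	/* force fresh per-cpu loads for the diagnostics below */
		trace_printk("need(%i) != should(%i)\n",
			     need_resched(), should_resched());
		trace_printk("preempt_count=%x\n",
			     __this_cpu_read_4(__preempt_count));
		tracing_stop();
	}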



Re: [PATCH/RFC 2/3] s390/kvm: Platform specific kvm_arch_vcpu_dont_yield

2014-02-13 Thread Paolo Bonzini

Il 11/02/2014 12:45, Christian Borntraeger ha scritto:

From: Michael Mueller m...@linux.vnet.ibm.com

Commit "s390/kvm: Use common waitqueue" caused a performance regression
on s390. It turned out that a yield candidate was missed by just a simple
test on its non-empty waitqueue. If an interrupt is outstanding, the candidate
might be suitable. kvm_arch_vcpu_dont_yield is extended by a test that
additionally tests for not yet delivered interrupts.

Significant performance measurement work and code analysis to solve
this issue was provided by Mao Chuan Li and his team in Beijing.

Signed-off-by: Michael Mueller m...@linux.vnet.ibm.com
Reviewed-by: Christian Borntraeger borntrae...@de.ibm.com
Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
---
 arch/s390/kvm/Kconfig| 1 +
 arch/s390/kvm/kvm-s390.c | 7 +++
 2 files changed, 8 insertions(+)

diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index c8bacbc..e44adef 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -25,6 +25,7 @@ config KVM
select HAVE_KVM_EVENTFD
select KVM_ASYNC_PF
select KVM_ASYNC_PF_SYNC
+   select HAVE_KVM_ARCH_VCPU_DONT_YIELD
---help---
  Support hosting paravirtualized guest machines using the SIE
  virtualization capability on the mainframe. This should work
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index a5da2cc..1a33e1e 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1231,6 +1231,13 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
 }

+#ifdef CONFIG_HAVE_KVM_ARCH_VCPU_DONT_YIELD
+bool kvm_arch_vcpu_dont_yield(struct kvm_vcpu *vcpu)
+{
+   return waitqueue_active(&vcpu->wq) && !kvm_cpu_has_interrupt(vcpu);
+}
+#endif


I wonder if just using  !kvm_arch_vcpu_runnable(vcpu) in 
kvm_vcpu_on_spin would be better.


Right now, you do not need it in s390 because kvm_vcpu_block is not used 
either.  But you could simply define it to kvm_cpu_has_interrupt(vcpu) 
instead.


Paolo


+
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
   struct kvm_memory_slot *dont)
 {





Re: [PATCH/RFC 2/3] s390/kvm: Platform specific kvm_arch_vcpu_dont_yield

2014-02-13 Thread Christian Borntraeger
On 13/02/14 23:37, Paolo Bonzini wrote:
 Il 11/02/2014 12:45, Christian Borntraeger ha scritto:
 From: Michael Mueller m...@linux.vnet.ibm.com

 Commit "s390/kvm: Use common waitqueue" caused a performance regression
 on s390. It turned out that a yield candidate was missed by just a simple
 test on its non-empty waitqueue. If an interrupt is outstanding, the 
 candidate
 might be suitable. kvm_arch_vcpu_dont_yield is extended by a test that
 additionally tests for not yet delivered interrupts.

 Significant performance measurement work and code analysis to solve
 this issue was provided by Mao Chuan Li and his team in Beijing.

 Signed-off-by: Michael Mueller m...@linux.vnet.ibm.com
 Reviewed-by: Christian Borntraeger borntrae...@de.ibm.com
 Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
 ---
  arch/s390/kvm/Kconfig| 1 +
  arch/s390/kvm/kvm-s390.c | 7 +++
  2 files changed, 8 insertions(+)

 diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
 index c8bacbc..e44adef 100644
 --- a/arch/s390/kvm/Kconfig
 +++ b/arch/s390/kvm/Kconfig
 @@ -25,6 +25,7 @@ config KVM
  select HAVE_KVM_EVENTFD
  select KVM_ASYNC_PF
  select KVM_ASYNC_PF_SYNC
 +select HAVE_KVM_ARCH_VCPU_DONT_YIELD
  ---help---
Support hosting paravirtualized guest machines using the SIE
virtualization capability on the mainframe. This should work
 diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
 index a5da2cc..1a33e1e 100644
 --- a/arch/s390/kvm/kvm-s390.c
 +++ b/arch/s390/kvm/kvm-s390.c
 @@ -1231,6 +1231,13 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
  return VM_FAULT_SIGBUS;
  }

 +#ifdef CONFIG_HAVE_KVM_ARCH_VCPU_DONT_YIELD
 +bool kvm_arch_vcpu_dont_yield(struct kvm_vcpu *vcpu)
 +{
 +	return waitqueue_active(&vcpu->wq) && !kvm_cpu_has_interrupt(vcpu);
 +}
 +#endif
 
 I wonder if just using  !kvm_arch_vcpu_runnable(vcpu) in kvm_vcpu_on_spin 
 would be better.
 
 Right now, you do not need it in s390 because kvm_vcpu_block is not used 
 either.  But you could simply define it to kvm_cpu_has_interrupt(vcpu) 
 instead.
 
 Paolo

We had several variants but in the end we tried to come up with a patch that 
does not
influence other architectures. Your proposal would certainly be fine for s390,
but what impact does it have on x86, arm, arm64? Will it cause performance 
regressions?
So I think that the patch as is is probably the safest choice until we have some
data from x86, arm, arm64, no?

Christian



Re: [PATCH/RFC 2/3] s390/kvm: Platform specific kvm_arch_vcpu_dont_yield

2014-02-13 Thread Paolo Bonzini

Il 13/02/2014 23:54, Christian Borntraeger ha scritto:

We had several variants but in the end we tried to come up with a patch that 
does not
influence other architectures. Your proposal would certainly be fine for s390,
but what impact does it have on x86, arm, arm64? Will it cause performance 
regressions?


It may also have the same advantages you got on s390.


So I think that the patch as is is probably the safest choice until we have some
data from x86, arm, arm64, no?


No, using an existing API is always better than inventing a new one.

If you post the new patch series, and describe the benchmark you were 
using, we can reproduce it on x86.


Paolo
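
For reference, a hedged sketch of what the suggestion discussed in this thread
could look like as a helper used by kvm_vcpu_on_spin()'s candidate loop (an
editor's illustration under the assumption that vcpu->wq and
kvm_arch_vcpu_runnable() from <linux/kvm_host.h> are available; not a
committed change):

/* Returns true if @vcpu is a poor directed-yield target: it is parked on
 * its waitqueue and has nothing pending that would make it runnable. */
static bool skip_as_yield_target_sketch(struct kvm_vcpu *vcpu)
{
	return waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu);
}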


Re: [PATCH net v2] vhost: fix ref cnt checking deadlock

2014-02-13 Thread David Miller
From: Michael S. Tsirkin m...@redhat.com
Date: Thu, 13 Feb 2014 11:42:05 +0200

 vhost checked the counter within the refcnt before decrementing.  It
 really wanted to know that it is the one that has the last reference, as
 a way to batch freeing resources a bit more efficiently.
 
 Note: we only let refcount go to 0 on device release.
 
 This works well but we now access the ref counter twice so there's a
 race: all users might see a high count and decide to defer freeing
 resources.
 In the end no one initiates freeing resources until the last reference
 is gone (which is on VM shutdown, so it might happen after a long time).
 
 Let's do what we probably should have done straight away:
 switch from kref to plain atomic, documenting the
 semantics, return the refcount value atomically after decrement,
 then use that to avoid the deadlock.
 
 Reported-by: Qin Chuanyu qinchua...@huawei.com
 Signed-off-by: Michael S. Tsirkin m...@redhat.com
 ---
 
 This patch is needed for 3.14 and -stable.

Applied and queued up for -stable.


Re: [PATCH net v2] vhost: fix a theoretical race in device cleanup

2014-02-13 Thread David Miller
From: Michael S. Tsirkin m...@redhat.com
Date: Thu, 13 Feb 2014 11:45:11 +0200

 vhost_zerocopy_callback accesses VQ right after it drops a ubuf
 reference.  In theory, this could race with device removal which waits
 on the ubuf kref, and crash on use after free.
 
 Do all accesses within rcu read side critical section, and synchronize
 on release.
 
 Since callbacks are always invoked from bh, synchronize_rcu_bh seems
 enough and will help release complete a bit faster.
 
 Signed-off-by: Michael S. Tsirkin m...@redhat.com
 ---
 
 This was previously posted as part of a patch
 series, but it's really an independent fix.
 Theoretical race so not needed for stable I think.

Ok, no -stable, applied.


RE: hyper-v support in KVM

2014-02-13 Thread Liu, RongrongX
 -Original Message-
 From: Vadim Rozenfeld [mailto:vroze...@redhat.com]
 Sent: Wednesday, February 12, 2014 3:42 PM
 To: Zhang, Yang Z
 Cc: kvm@vger.kernel.org; Liu, RongrongX
 Subject: Re: hyper-v support in KVM
 
 On Wed, 2014-02-12 at 01:33 +, Zhang, Yang Z wrote:
  Vadim Rozenfeld wrote on 2014-02-10:
   On Mon, 2014-02-10 at 08:21 +, Zhang, Yang Z wrote:
   Hi Vadim,
  
   Do you know the latest status of Hyper-v Enlightenments supporting in
 KVM?
   Like how many Hyper-v interfaces are supported in KVM?
  
   Hi Yang,
  
   There is no many at the moment. KVM currently supports the following
   Hyper-V features:
  
   Guest Spinlocks
   http://msdn.microsoft.com/en-us/library/windows/hardware/ff539081%
   28v=vs.85%29.aspx
  
   Local APIC MSR Accesses
   http://msdn.microsoft.com/en-us/library/windows/hardware/ff542396%
   28v=vs.85%29.aspx
  
   Reference Time Counter
   http://msdn.microsoft.com/en-us/library/windows/hardware/ff542637%
   28v=vs.85%29.aspx
  
   We are going to add: Reference TSC Page
   http://msdn.microsoft.com/en-us/library/windows/hardware/ff542643%
   28v=vs.85%29.aspx
  
   Lazy EOI support, maybe more.
  
 
  Thanks for your update. More questions:
  I want to measure the performance improvement with hyper-v features
 enabled. So I want to know:
  Are those features enabled by default in KVM?
 In KVM - yes, but you also need to specify them in QEMU command line.
 They can be enabled by -cpu features hv_vapic, hv_spinlocks, hv_time, and
 hv_relaxed
Hi Vadim,
in the QEMU command line, how do I enable these features?
I tried it with:
1. [root@vt-snb9 ]#qemu-system-x86_64 -enable-kvm -m 2048 -smp 2 -net none 
win8.1.img -cpu feature hv_vapic,+hv_spinlocks,+hv_time,+hv_relaxed
 qemu-system-x86_64: -cpu feature: drive with bus=0, unit=0 (index=0) exists
2. [root@vt-snb9]# qemu-system-x86_64 -enable-kvm -m 2048 -smp 2 -net none 
win8.1.img -cpu qemu64,+hv_vapic,+hv_spinlocks,+hv_time,+hv_relaxed
CPU feature hv_vapic not found
CPU feature hv_spinlocks not found
CPU feature hv_time not found
CPU feature hv_relaxed not found
CPU feature hv_vapic not found
CPU feature hv_spinlocks not found
CPU feature hv_time not found
CPU feature hv_relaxed not found
VNC server running on `::1:5900'


  How to turn off/on it manually?
 Yes. From the QEMU command line.
 
  And how can I know whether guest is using it really?
 There are two options - printk from the KVM side or WinDbg from the guest
 side. But in case of hv_time you can check the value returned by
 QueryPerformanceFrequency
 http://msdn.microsoft.com/en-us/library/windows/desktop/ms644905%
 28v=vs.85%29.aspx
 it should be 10MHz
 
   Also, Do you have any performance data?
 http://www.linux-kvm.org/wiki/images/0/0a/2012-forum-kvm_hyperv.pdf
 pp 16, 18
 I compared DPC and ISR times with xperf for two cases - with and without
 enlightenment.
 I also have seen reports mentioned around 5-10 percent CPU usage drop on the
 host side, when loading guest with some disk-stress tests.
 
 
   Kind regards.
   Vadim.
  
  
   best regards
   yang
  
  
  
 
 
  Best regards,
  Yang
 
 
 


Re: hyper-v support in KVM

2014-02-13 Thread Vadim Rozenfeld
On Fri, 2014-02-14 at 02:35 +, Liu, RongrongX wrote:
  -Original Message-
  From: Vadim Rozenfeld [mailto:vroze...@redhat.com]
  Sent: Wednesday, February 12, 2014 3:42 PM
  To: Zhang, Yang Z
  Cc: kvm@vger.kernel.org; Liu, RongrongX
  Subject: Re: hyper-v support in KVM
  
  On Wed, 2014-02-12 at 01:33 +, Zhang, Yang Z wrote:
   Vadim Rozenfeld wrote on 2014-02-10:
On Mon, 2014-02-10 at 08:21 +, Zhang, Yang Z wrote:
Hi Vadim,
   
Do you know the latest status of Hyper-v Enlightenments supporting in
  KVM?
Like how many Hyper-v interfaces are supported in KVM?
   
Hi Yang,
   
There is no many at the moment. KVM currently supports the following
Hyper-V features:
   
Guest Spinlocks
http://msdn.microsoft.com/en-us/library/windows/hardware/ff539081%
28v=vs.85%29.aspx
   
Local APIC MSR Accesses
http://msdn.microsoft.com/en-us/library/windows/hardware/ff542396%
28v=vs.85%29.aspx
   
Reference Time Counter
http://msdn.microsoft.com/en-us/library/windows/hardware/ff542637%
28v=vs.85%29.aspx
   
We are going to add: Reference TSC Page
http://msdn.microsoft.com/en-us/library/windows/hardware/ff542643%
28v=vs.85%29.aspx
   
Lazy EOI support, maybe more.
   
  
   Thanks for your update. More questions:
   I want to measure the performance improvement with hyper-v features
  enabled. So I want to know:
   Are those features enabled by default in KVM?
  In KVM - yes, but you also need to specify them in QEMU command line.
  They can be enabled by -cpu features hv_vapic, hv_spinlocks, hv_time, and
  hv_relaxed
 Hi Vadim,
 in the QEMU command line, how do I enable these features?
 I tried it with:
 1. [root@vt-snb9 ]#qemu-system-x86_64 -enable-kvm -m 2048 -smp 2 -net none 
 win8.1.img -cpu feature hv_vapic,+hv_spinlocks,+hv_time,+hv_relaxed
  qemu-system-x86_64: -cpu feature: drive with bus=0, unit=0 (index=0) exists
 2. [root@vt-snb9]# qemu-system-x86_64 -enable-kvm -m 2048 -smp 2 -net none 
 win8.1.img -cpu qemu64,+hv_vapic,+hv_spinlocks,+hv_time,+hv_relaxed

something like this:
-cpu qemu64,+x2apic,family=0xf,hv_vapic,hv_spinlocks=0xfff,hv_relaxed,hv_time

(for hv_vapic we also need x2apic to be enabled)

Best regards,
Vadim.
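
Vadim's 10 MHz check mentioned earlier in the thread can be done from inside
the guest with a few lines of code (an illustration, not part of KVM or QEMU):

/* With hv_time exposed, Windows backs QueryPerformanceCounter with the
 * Hyper-V reference counter, so the reported frequency should be 10 MHz. */
#include <windows.h>
#include <stdio.h>

int main(void)
{
	LARGE_INTEGER freq;

	if (!QueryPerformanceFrequency(&freq))
		return 1;
	printf("QPC frequency: %lld Hz (expect 10000000 with hv_time)\n",
	       (long long)freq.QuadPart);
	return 0;
}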

 CPU feature hv_vapic not found
 CPU feature hv_spinlocks not found
 CPU feature hv_time not found
 CPU feature hv_relaxed not found
 CPU feature hv_vapic not found
 CPU feature hv_spinlocks not found
 CPU feature hv_time not found
 CPU feature hv_relaxed not found
 VNC server running on `::1:5900'
 
 
   How to turn off/on it manually?
  Yes. From the QEMU command line.
  
   And how can I know whether guest is using it really?
  There are two options - printk from the KVM side or WinDbg from the guest
  side. But in case of hv_time you can check the value returned by
  QueryPerformanceFrequency
  http://msdn.microsoft.com/en-us/library/windows/desktop/ms644905%
  28v=vs.85%29.aspx
  it should be 10MHz
  
Also, Do you have any performance data?
  http://www.linux-kvm.org/wiki/images/0/0a/2012-forum-kvm_hyperv.pdf
  pp 16, 18
  I compared DPC and ISR times with xperf for two cases - with and without
  enlightenment.
  I also have seen reports mentioned around 5-10 percent CPU usage drop on the
  host side, when loading guest with some disk-stress tests.
  
  
Kind regards.
Vadim.
   
   
best regards
yang
   
   
   
  
  
   Best regards,
   Yang
  
  
  
 




[PATCH] kvm: remove redundant registration of BSP's hv_clock area

2014-02-13 Thread Fernando Luis Vázquez Cao
These days hv_clock allocation is memblock based (i.e. the percpu
allocator is not involved), which means that the physical address
of each of the per-cpu hv_clock areas is guaranteed to remain
unchanged through all its lifetime and we do not need to update
its location after CPU bring-up.

Signed-off-by: Fernando Luis Vazquez Cao ferna...@oss.ntt.co.jp
---

diff -urNp linux-3.14-rc2-orig/arch/x86/kernel/kvm.c linux-3.14-rc2/arch/x86/kernel/kvm.c
--- linux-3.14-rc2-orig/arch/x86/kernel/kvm.c	2014-02-12 15:49:32.359727407 +0900
+++ linux-3.14-rc2/arch/x86/kernel/kvm.c	2014-02-14 14:39:15.657337787 +0900
@@ -417,7 +417,6 @@ void kvm_disable_steal_time(void)
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
-   WARN_ON(kvm_register_clock("primary cpu clock"));
kvm_guest_cpu_init();
native_smp_prepare_boot_cpu();
kvm_spinlock_init();
diff -urNp linux-3.14-rc2-orig/arch/x86/kernel/kvmclock.c linux-3.14-rc2/arch/x86/kernel/kvmclock.c
--- linux-3.14-rc2-orig/arch/x86/kernel/kvmclock.c	2014-01-20 11:40:07.000000000 +0900
+++ linux-3.14-rc2/arch/x86/kernel/kvmclock.c	2014-02-14 14:20:10.546961060 +0900
@@ -242,7 +248,7 @@ void __init kvmclock_init(void)
hv_clock = __va(mem);
memset(hv_clock, 0, size);
 
-   if (kvm_register_clock("boot clock")) {
+   if (kvm_register_clock("boot/primary cpu clock")) {
hv_clock = NULL;
memblock_free(mem, size);
return;


