Re: [RFC V2 PATCH 4/4] virtio-net: add multiqueue support

2012-07-02 Thread Jason Wang

On 07/01/2012 05:43 PM, Michael S. Tsirkin wrote:

On Mon, Jun 25, 2012 at 06:04:49PM +0800, Jason Wang wrote:

This patch lets virtio-net transmit and receive packets through multiple
VLANClientStates and abstracts them as multiple virtqueues to the guest. A new
parameter 'queues' is introduced to specify the number of queue pairs.

The main goal for vhost support is to let multiqueue be used without changes
in the vhost code. In the past, each vhost_net structure was used to track a
single VLANClientState and two virtqueues. As multiple VLANClientStates are now
stored in the NICState, we can infer the corresponding VLANClientState from it
and the queue_index easily.
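(For illustration only, not part of the patch: a minimal sketch of the lookup
described above, assuming a hypothetical ncs[] array of per-queue-pair
VLANClientState pointers in NICState; the field names are made up, not the
exact ones used here.)

    /* Resolve the VLANClientState backing a given queue pair. */
    static VLANClientState *nic_get_client(NICState *nic, int queue_index)
    {
        /* One VLANClientState per queue pair; queue_index selects the pair. */
        return nic->ncs[queue_index];
    }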

Signed-off-by: Jason Wang <jasow...@redhat.com>

Can this patch be split up?
1. extend vhost API to allow multiqueue and minimally tweak virtio
2. add real multiqueue for virtio

Hmm?


Sure, do you think it's necessary to separate the vhost parts of 
multiqueue from virtio?

---
  hw/vhost.c  |   58 ---
  hw/vhost.h  |1
  hw/vhost_net.c  |7 +
  hw/vhost_net.h  |2
  hw/virtio-net.c |  461 +--
  hw/virtio-net.h |3
  6 files changed, 355 insertions(+), 177 deletions(-)

diff --git a/hw/vhost.c b/hw/vhost.c
index 43664e7..6318bb2 100644
--- a/hw/vhost.c
+++ b/hw/vhost.c
@@ -620,11 +620,12 @@ static int vhost_virtqueue_init(struct vhost_dev *dev,
  {
  target_phys_addr_t s, l, a;
  int r;
+int vhost_vq_index = (idx > 2 ? idx - 1 : idx) % dev->nvqs;
  struct vhost_vring_file file = {
-.index = idx,
+   .index = vhost_vq_index
  };
  struct vhost_vring_state state = {
-.index = idx,
+.index = vhost_vq_index
  };
  struct VirtQueue *vvq = virtio_get_queue(vdev, idx);

@@ -670,11 +671,12 @@ static int vhost_virtqueue_init(struct vhost_dev *dev,
  goto fail_alloc_ring;
  }

-r = vhost_virtqueue_set_addr(dev, vq, idx, dev->log_enabled);
+r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
  if (r < 0) {
  r = -errno;
  goto fail_alloc;
  }
+
  file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
  r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file);
  if (r) {
@@ -715,7 +717,7 @@ static void vhost_virtqueue_cleanup(struct vhost_dev *dev,
  unsigned idx)
  {
  struct vhost_vring_state state = {
-.index = idx,
+.index = (idx > 2 ? idx - 1 : idx) % dev->nvqs,
  };
  int r;
  r = ioctl(dev->control, VHOST_GET_VRING_BASE, &state);
@@ -829,7 +831,9 @@ int vhost_dev_enable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
  }

  for (i = 0; i < hdev->nvqs; ++i) {
-r = vdev->binding->set_host_notifier(vdev->binding_opaque, i, true);
+r = vdev->binding->set_host_notifier(vdev->binding_opaque,
+hdev->start_idx + i,
+true);
  if (r < 0) {
  fprintf(stderr, "vhost VQ %d notifier binding failed: %d\n", i, -r);
  goto fail_vq;
@@ -839,7 +843,9 @@ int vhost_dev_enable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
  return 0;
  fail_vq:
  while (--i >= 0) {
-r = vdev->binding->set_host_notifier(vdev->binding_opaque, i, false);
+r = vdev->binding->set_host_notifier(vdev->binding_opaque,
+hdev->start_idx + i,
+false);
  if (r < 0) {
  fprintf(stderr, "vhost VQ %d notifier cleanup error: %d\n", i, -r);
  fflush(stderr);
@@ -860,7 +866,9 @@ void vhost_dev_disable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
  int i, r;

  for (i = 0; i < hdev->nvqs; ++i) {
-r = vdev->binding->set_host_notifier(vdev->binding_opaque, i, false);
+r = vdev->binding->set_host_notifier(vdev->binding_opaque,
+hdev->start_idx + i,
+false);
  if (r < 0) {
  fprintf(stderr, "vhost VQ %d notifier cleanup failed: %d\n", i, -r);
  fflush(stderr);
@@ -874,15 +882,17 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice 
*vdev)
  {
  int i, r;
  if (!vdev->binding->set_guest_notifiers) {
-fprintf(stderr, "binding does not support guest notifiers\n");
+fprintf(stderr, "binding does not support guest notifier\n");
  r = -ENOSYS;
  goto fail;
  }

-r = vdev->binding->set_guest_notifiers(vdev->binding_opaque, true);
-if (r < 0) {
-fprintf(stderr, "Error binding guest notifier: %d\n", -r);
-goto fail_notifiers;
+if (hdev->start_idx == 0) {
+r = vdev->binding->set_guest_notifiers(vdev->binding_opaque, true);
+if (r < 0) {
+fprintf(stderr, "Error binding guest notifier: %d\n", -r);
+goto fail_notifiers;
+ 

Re: qemu-kvm-1.0.1 - unable to exit if vcpu is in infinite loop

2012-07-02 Thread Jan Kiszka
On 2012-07-01 21:18, Peter Lieven wrote:
 
 On 01.07.2012 at 10:19, Avi Kivity wrote:
 
 On 06/28/2012 10:27 PM, Peter Lieven wrote:

 On 28.06.2012 at 18:32, Avi Kivity wrote:

 On 06/28/2012 07:29 PM, Peter Lieven wrote:
 Yes. A signal is sent, and KVM returns from the guest to userspace on
 pending signals.

 is there a description available how this process exactly works?

 The kernel part is in vcpu_enter_guest(), see the check for
 signal_pending().  But this hasn't seen changes for quite a long while.

 Thank you, I will have a look. I noticed a few patches that were submitted
 during the last year, maybe one of them is related:

 Switch SIG_IPI to SIGUSR1
 Fix signal handling of SIG_IPI when io-thread is enabled

 In the first commit a 32-on-64-bit Linux kernel bug is mentioned;
 is there any reference to that?


 http://web.archiveorange.com/archive/v/1XS1vwGSFLyYygwTXg1K.  Are you
 running 32-on-64?
 
 I think the issue occurs when running a 32-bit guest on a 64-bit system.
 Afaik, the isolinux loader where I see the race is 32-bit although it is a
 64-bit Ubuntu LTS CD image. The second case where I have seen the race is on
 shutdown of a Windows 2000 Server which is also 32-bit.

32-on-64 particularly means using a 32-bit QEMU[-kvm] binary on a
64-bit host kernel. What does file qemu-system-x86_64 report about yours?
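(For reference, and not from this thread: "file" typically reports something
roughly like "ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically
linked, ..." for a 64-bit binary, or "ELF 32-bit LSB executable, Intel 80386,
..." for a 32-bit one; the exact wording varies by distribution and file
version.)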

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [bug 1.1] -M pc-1.0 + vhost = SIGSEGV

2012-07-02 Thread Jan Kiszka
On 2012-07-01 17:06, Michael Tokarev wrote:
 When running current git version of qemu-kvm with -M pc-1.0

Just to clarify: you are talking about stable-1.1 git, not master.

 and with vhost-net enabled, it crashes with SIGSEGV right when
 linux guest loads a virtio-net module.
 
 I haven't tried to debug this deeply.  The first result is:
 
 (gdb) ru -M pc-1.0 -nodefconfig -nodefaults -rtc base=utc -device 
 piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -netdev 
 tap,ifname=tap-kvm,script=no,id=hostnet0,vhost=on -device 
 virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:b4:05:b5,bus=pci.0,addr=0x3
  -vga cirrus
 Starting program: /build/kvm/debian/build/x86_64-softmmu/qemu-system-x86_64 
 -M pc-1.0 -nodefconfig -nodefaults -rtc base=utc -device 
 piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -netdev 
 tap,ifname=tap-kvm,script=no,id=hostnet0,vhost=on -device 
 virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:b4:05:b5,bus=pci.0,addr=0x3
  -vga cirrus
 ...
 Program received signal SIGSEGV, Segmentation fault.
 [Switching to Thread 0xf65f4b70 (LWP 11245)]
 0x5668b01a in virtio_pci_mask_vq (vector=vector@entry=1, vq=0x57064448,
 masked=masked@entry=0,
 dev=<error reading variable: Unhandled dwarf expression opcode 0xfa>)
 at /build/kvm/debian/hw/virtio-pci.c:546
 546   int r = kvm_set_irqfd(dev->msix_irq_entries[vector].gsi,
 
 Now, my gdb can't read `dev' variable.  One level up the stack
 this variable is shown correctly:
 
 #1  0x5668b15d in virtio_pci_mask_notifier (dev=0x57062748, vector=1, 
 masked=0)
 at /build/kvm/debian/hw/virtio-pci.c:576
 576   r = virtio_pci_mask_vq(dev, vector, virtio_get_queue(vdev, n), 
 masked);
 (gdb) p dev->msix_irq_entries[vector].gsi
 Cannot access memory at address 0x10
 (gdb) p dev
 $1 = (PCIDevice *) 0x57062748
 (gdb) p dev->msix_irq_entries
 $4 = (KVMMsiMessage *) 0x0
 
 So it looks like msix isn't initialized for -M pc-1.0 ?

Yes, because the machine option defaults are missing here. Will send a
patch.

Also vhost is buggy as it depends on in-kernel irqchip but doesn't check
for it. Needs to be fixed as well.

Jan
-- 
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: RFD: virtio balloon API use (was Re: [PATCH 5 of 5] virtio: expose added descriptors immediately)

2012-07-02 Thread Michael S. Tsirkin
On Mon, Jul 02, 2012 at 10:35:47AM +0930, Rusty Russell wrote:
 On Sun, 1 Jul 2012 12:20:51 +0300, Michael S. Tsirkin m...@redhat.com 
 wrote:
  On Thu, Nov 03, 2011 at 06:12:53PM +1030, Rusty Russell wrote:
   A virtio driver does virtqueue_add_buf() multiple times before finally
   calling virtqueue_kick(); previously we only exposed the added buffers
   in the virtqueue_kick() call.  This means we don't need a memory
   barrier in virtqueue_add_buf(), but it reduces concurrency as the
   device (ie. host) can't see the buffers until the kick.
   
   Signed-off-by: Rusty Russell ru...@rustcorp.com.au
  
  Looking at recent mm compaction patches made me look at locking
  in balloon closely. And I noticed the referenced patch (commit
  ee7cd8981e15bcb365fc762afe3fc47b8242f630 upstream) interacts strangely
  with virtio balloon; balloon currently does:
  
  static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
  {
  struct scatterlist sg;
  
 sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
 
 init_completion(&vb->acked);
 
 /* We should always be able to add one buffer to an empty queue. */
 if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0)
 BUG();
 virtqueue_kick(vq);
 
 /* When host has read buffer, this completes via balloon_ack */
 wait_for_completion(&vb->acked);
  }
  
  
  While vq callback does:
  
  static void balloon_ack(struct virtqueue *vq)
  {
  struct virtio_balloon *vb;
  unsigned int len;
  
vb = virtqueue_get_buf(vq, &len);
if (vb)
complete(&vb->acked);
  }
  
  
  So virtqueue_get_buf might now run concurrently with virtqueue_kick.
  I audited both and this seems safe in practice but I think
 
 Good spotting!
 
 Agreed.  Because there's only add_buf, we get away with it: the add_buf
 must be almost finished by the time get_buf runs because the device has
 seen the buffer.
 
  we need to either declare this legal at the API level
  or add locking in driver.
 
 I wonder if we should just lock in the balloon driver, rather than
 document this corner case and set a bad example.

We'll need to replace vb->acked with a waitqueue
and do get_buf from the same thread.
But I note that stats_request has the same issue.
Let's see if we can fix it.

 Are there other
 drivers which take the same shortcut?

Not that I know.

  Further, is there a guarantee that we never get
  spurious callbacks?  We currently check ring not empty
  but esp for non shared MSI this might not be needed.
 
 Yes, I think this saves us.  A spurious interrupt won't trigger
 a spurious callback.
 
  If a spurious callback triggers, virtqueue_get_buf can run
  concurrently with virtqueue_add_buf which is known to be racy.
  Again I think this is currently safe as no spurious callbacks in
  practice but should we guarantee no spurious callbacks at the API level
  or add locking in driver?
 
 I think we should guarantee it, but is there a hole in the current
 implementation?
 
 Thanks,
 Rusty.

Could be. The check for ring empty looks somewhat suspicious.
It might be expensive to make it 100% robust - that check was
intended as an optimization for shared interrupts.
With per-vq interrupts we IMO do not need the check.
If we add locking in balloon I think there's no need
to guarantee no spurious interrupts.


-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC] virtio-balloon: fix add/get API use

2012-07-02 Thread Michael S. Tsirkin
In virtio balloon virtqueue_get_buf might now run concurrently with
virtqueue_kick.  I audited both and this seems safe in practice but
this is not guaranteed by the API.
Additionally, a spurious interrupt might in theory make
virtqueue_get_buf run in parallel with virtqueue_add_buf, which is racy.

While we might try to protect against spurious callbacks it's
easier to fix the driver: balloon seems to be the only one
(mis)using the API like this, so let's just fix balloon.

Signed-off-by: Michael S. Tsirkin m...@redhat.com

---

Warning: completely untested.

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index bfbc15c..a26eb4f 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -47,7 +47,7 @@ struct virtio_balloon
struct task_struct *thread;
 
/* Waiting for host to ack the pages we released. */
-   struct completion acked;
+   wait_queue_head_t acked;
 
/* Number of balloon pages we've told the Host we're not using. */
unsigned int num_pages;
@@ -89,29 +89,26 @@ static struct page *balloon_pfn_to_page(u32 pfn)
 
 static void balloon_ack(struct virtqueue *vq)
 {
-   struct virtio_balloon *vb;
-   unsigned int len;
+   struct virtio_balloon *vb = vq->vdev->priv;
 
-   vb = virtqueue_get_buf(vq, &len);
-   if (vb)
-   complete(&vb->acked);
+   wake_up(&vb->acked);
 }
 
 static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 {
struct scatterlist sg;
+   unsigned int len;
+   void *buf;
 
 sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
 
-   init_completion(&vb->acked);
-
 /* We should always be able to add one buffer to an empty queue. */
 if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0)
 BUG();
 virtqueue_kick(vq);
 
 /* When host has read buffer, this completes via balloon_ack */
-   wait_for_completion(&vb->acked);
+   wait_event(vb->acked, virtqueue_get_buf(vq, &len));
 }
 
 static void set_page_pfns(u32 pfns[], struct page *page)
@@ -231,12 +228,8 @@ static void update_balloon_stats(struct virtio_balloon *vb)
  */
 static void stats_request(struct virtqueue *vq)
 {
-   struct virtio_balloon *vb;
-   unsigned int len;
+   struct virtio_balloon *vb = vq->vdev->priv;
 
-   vb = virtqueue_get_buf(vq, &len);
-   if (!vb)
-   return;
vb->need_stats_update = 1;
wake_up(&vb->config_change);
 }
@@ -245,11 +238,14 @@ static void stats_handle_request(struct virtio_balloon 
*vb)
 {
struct virtqueue *vq;
struct scatterlist sg;
+   unsigned int len;
 
vb->need_stats_update = 0;
update_balloon_stats(vb);
 
vq = vb->stats_vq;
+   if (!virtqueue_get_buf(vq, &len))
+   return;
sg_init_one(&sg, vb->stats, sizeof(vb->stats));
if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0)
BUG();
@@ -358,6 +354,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
INIT_LIST_HEAD(&vb->pages);
vb->num_pages = 0;
init_waitqueue_head(&vb->config_change);
+   init_waitqueue_head(&vb->acked);
vb->vdev = vdev;
vb->need_stats_update = 0;
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH stable-1.1] qemu-kvm: Add missing default machine options

2012-07-02 Thread Jan Kiszka
qemu-kvm-specific machine defaults were missing for pc-0.15 and pc-1.0.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---
 hw/pc_piix.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/hw/pc_piix.c b/hw/pc_piix.c
index c3fb74e..4e8a280 100644
--- a/hw/pc_piix.c
+++ b/hw/pc_piix.c
@@ -393,6 +393,7 @@ static QEMUMachine pc_machine_v1_0 = {
 .desc = "Standard PC",
 .init = pc_init_pci,
 .max_cpus = 255,
+.default_machine_opts = "accel=kvm,kernel_irqchip=on",
 .compat_props = (GlobalProperty[]) {
 PC_COMPAT_1_0,
 { /* end of list */ }
@@ -407,6 +408,7 @@ static QEMUMachine pc_machine_v0_15 = {
 .desc = "Standard PC",
 .init = pc_init_pci,
 .max_cpus = 255,
+.default_machine_opts = "accel=kvm,kernel_irqchip=on",
 .compat_props = (GlobalProperty[]) {
 PC_COMPAT_0_15,
 { /* end of list */ }
-- 
1.7.3.4
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC V2 PATCH 4/4] virtio-net: add multiqueue support

2012-07-02 Thread Michael S. Tsirkin
On Mon, Jul 02, 2012 at 03:04:00PM +0800, Jason Wang wrote:
 On 07/01/2012 05:43 PM, Michael S. Tsirkin wrote:
 On Mon, Jun 25, 2012 at 06:04:49PM +0800, Jason Wang wrote:
 This patch lets virtio-net transmit and receive packets through multiple
 VLANClientStates and abstracts them as multiple virtqueues to the guest. A new
 parameter 'queues' is introduced to specify the number of queue pairs.
 
 The main goal for vhost support is to let multiqueue be used without changes
 in the vhost code. In the past, each vhost_net structure was used to track a
 single VLANClientState and two virtqueues. As multiple VLANClientStates are
 now stored in the NICState, we can infer the corresponding VLANClientState
 from it and the queue_index easily.
 
 Signed-off-by: Jason Wang <jasow...@redhat.com>
 Can this patch be split up?
 1. extend vhost API to allow multiqueue and minimally tweak virtio
 2. add real multiqueue for virtio
 
 Hmm?
 
 Sure, do you think it's necessary to separate the vhost parts of
 multiqueue from virtio?

Separating is always nice if it is not too hard.

 ---
   hw/vhost.c  |   58 ---
   hw/vhost.h  |1
   hw/vhost_net.c  |7 +
   hw/vhost_net.h  |2
   hw/virtio-net.c |  461 
  +--
   hw/virtio-net.h |3
   6 files changed, 355 insertions(+), 177 deletions(-)
 
 diff --git a/hw/vhost.c b/hw/vhost.c
 index 43664e7..6318bb2 100644
 --- a/hw/vhost.c
 +++ b/hw/vhost.c
 @@ -620,11 +620,12 @@ static int vhost_virtqueue_init(struct vhost_dev *dev,
   {
   target_phys_addr_t s, l, a;
   int r;
 +int vhost_vq_index = (idx > 2 ? idx - 1 : idx) % dev->nvqs;
   struct vhost_vring_file file = {
 -.index = idx,
 +   .index = vhost_vq_index
   };
   struct vhost_vring_state state = {
 -.index = idx,
 +.index = vhost_vq_index
   };
   struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
 
 @@ -670,11 +671,12 @@ static int vhost_virtqueue_init(struct vhost_dev *dev,
   goto fail_alloc_ring;
   }
 
 -r = vhost_virtqueue_set_addr(dev, vq, idx, dev->log_enabled);
 +r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
   if (r < 0) {
   r = -errno;
   goto fail_alloc;
   }
 +
   file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
   r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file);
   if (r) {
 @@ -715,7 +717,7 @@ static void vhost_virtqueue_cleanup(struct vhost_dev 
 *dev,
   unsigned idx)
   {
   struct vhost_vring_state state = {
 -.index = idx,
 +.index = (idx > 2 ? idx - 1 : idx) % dev->nvqs,
   };
   int r;
   r = ioctl(dev->control, VHOST_GET_VRING_BASE, &state);
 @@ -829,7 +831,9 @@ int vhost_dev_enable_notifiers(struct vhost_dev *hdev, 
 VirtIODevice *vdev)
   }
 
   for (i = 0; i < hdev->nvqs; ++i) {
 -r = vdev->binding->set_host_notifier(vdev->binding_opaque, i, true);
 +r = vdev->binding->set_host_notifier(vdev->binding_opaque,
 +hdev->start_idx + i,
 +true);
   if (r < 0) {
   fprintf(stderr, "vhost VQ %d notifier binding failed: %d\n", i, -r);
   goto fail_vq;
 @@ -839,7 +843,9 @@ int vhost_dev_enable_notifiers(struct vhost_dev *hdev, 
 VirtIODevice *vdev)
   return 0;
   fail_vq:
   while (--i >= 0) {
 -r = vdev->binding->set_host_notifier(vdev->binding_opaque, i, false);
 +r = vdev->binding->set_host_notifier(vdev->binding_opaque,
 +hdev->start_idx + i,
 +false);
   if (r < 0) {
   fprintf(stderr, "vhost VQ %d notifier cleanup error: %d\n", i, -r);
   fflush(stderr);
 @@ -860,7 +866,9 @@ void vhost_dev_disable_notifiers(struct vhost_dev 
 *hdev, VirtIODevice *vdev)
   int i, r;
 
   for (i = 0; i < hdev->nvqs; ++i) {
 -r = vdev->binding->set_host_notifier(vdev->binding_opaque, i, false);
 +r = vdev->binding->set_host_notifier(vdev->binding_opaque,
 +hdev->start_idx + i,
 +false);
   if (r < 0) {
   fprintf(stderr, "vhost VQ %d notifier cleanup failed: %d\n", i, -r);
   fflush(stderr);
 @@ -874,15 +882,17 @@ int vhost_dev_start(struct vhost_dev *hdev, 
 VirtIODevice *vdev)
   {
   int i, r;
   if (!vdev->binding->set_guest_notifiers) {
 -fprintf(stderr, "binding does not support guest notifiers\n");
 +fprintf(stderr, "binding does not support guest notifier\n");
   r = -ENOSYS;
   goto fail;
   }
 
 -r = vdev->binding->set_guest_notifiers(vdev->binding_opaque, true);
 -if (r < 0) {
 -fprintf(stderr, "Error binding guest notifier: %d\n", -r);
 -goto fail_notifiers;
 +if 

[PATCH stable-1.1] qemu-kvm: virtio: Do not register mask notifiers without in-kernel irqchip support

2012-07-02 Thread Jan Kiszka
We crash if we register mask notifiers without a backing in-kernel
irqchip. This corresponds to the check now present in QEMU upstream after 1.1.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---

Not needed for master as we have upstream logic there already.

 hw/virtio-pci.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index a0c2ca7..5b64356 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -629,7 +629,7 @@ static int virtio_pci_set_guest_notifiers(void *opaque, 
bool assign)
 
 /* Must unset mask notifier while guest notifier
  * is still assigned */
-if (!assign) {
+if (kvm_irqchip_in_kernel() && !assign) {
 r = msix_unset_mask_notifier(proxy->pci_dev);
 assert(r >= 0);
 }
@@ -647,7 +647,7 @@ static int virtio_pci_set_guest_notifiers(void *opaque, 
bool assign)
 
 /* Must set mask notifier after guest notifier
  * has been assigned */
-if (assign) {
+if (kvm_irqchip_in_kernel() && assign) {
 r = msix_set_mask_notifier(proxy->pci_dev,
virtio_pci_mask_notifier);
 if (r < 0) {
-- 
1.7.3.4
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: qemu-kvm-1.0.1 - unable to exit if vcpu is in infinite loop

2012-07-02 Thread Peter Lieven

On 02.07.2012 09:05, Jan Kiszka wrote:

On 2012-07-01 21:18, Peter Lieven wrote:

On 01.07.2012 at 10:19, Avi Kivity wrote:


On 06/28/2012 10:27 PM, Peter Lieven wrote:

On 28.06.2012 at 18:32, Avi Kivity wrote:


On 06/28/2012 07:29 PM, Peter Lieven wrote:

Yes. A signal is sent, and KVM returns from the guest to userspace on
pending signals.

is there a description available how this process exactly works?

The kernel part is in vcpu_enter_guest(), see the check for
signal_pending().  But this hasn't seen changes for quite a long while.

Thank you, I will have a look. I noticed a few patches that were submitted
during the last year, maybe one of them is related:

Switch SIG_IPI to SIGUSR1
Fix signal handling of SIG_IPI when io-thread is enabled

In the first commit there is mentioned a 32-on-64-bit Linux kernel bug
is there any reference to that?


http://web.archiveorange.com/archive/v/1XS1vwGSFLyYygwTXg1K.  Are you
running 32-on-64?

I think the issue occurs when running a 32-bit guest on a 64-bit system.
Afaik, the isolinux loader where I see the race is 32-bit although it is a
64-bit Ubuntu LTS CD image. The second case where I have seen the race is on
shutdown of a Windows 2000 Server which is also 32-bit.

32-on-64 particularly means using a 32-bit QEMU[-kvm] binary on a
64-bit host kernel. What does file qemu-system-x86_64 report about yours?
It's a custom build on a 64-bit Linux as a 64-bit application. I will try to
continue to find out today what's going wrong. Any help or hints appreciated ;-)

Thanks,
Peter


Jan



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/8] KVM: Optimize MMU notifier's THP page invalidation -v4

2012-07-02 Thread Takuya Yoshikawa
v3->v4: Resolved trace_kvm_age_page() issue -- patch 6,7
v2->v3: Fixed intersection calculations. -- patch 3, 8

Takuya

Takuya Yoshikawa (8):
  KVM: MMU: Use __gfn_to_rmap() to clean up kvm_handle_hva()
  KVM: Introduce hva_to_gfn_memslot() for kvm_handle_hva()
  KVM: MMU: Make kvm_handle_hva() handle range of addresses
  KVM: Introduce kvm_unmap_hva_range() for 
kvm_mmu_notifier_invalidate_range_start()
  KVM: Separate rmap_pde from kvm_lpage_info-write_count
  KVM: MMU: Add memslot parameter to hva handlers
  KVM: MMU: Push trace_kvm_age_page() into kvm_age_rmapp()
  KVM: MMU: Avoid handling same rmap_pde in kvm_handle_hva_range()

 arch/powerpc/include/asm/kvm_host.h |2 +
 arch/powerpc/kvm/book3s_64_mmu_hv.c |   47 ---
 arch/x86/include/asm/kvm_host.h |3 +-
 arch/x86/kvm/mmu.c  |  107 +++
 arch/x86/kvm/x86.c  |   11 
 include/linux/kvm_host.h|8 +++
 virt/kvm/kvm_main.c |3 +-
 7 files changed, 131 insertions(+), 50 deletions(-)


From v2:

The new test result was impressively good, see below, and THP page
invalidation was more than 5 times faster on my x86 machine.

Before:
  ...
  19.852 us  |  __mmu_notifier_invalidate_range_start();
  28.033 us  |  __mmu_notifier_invalidate_range_start();
  19.066 us  |  __mmu_notifier_invalidate_range_start();
  44.715 us  |  __mmu_notifier_invalidate_range_start();
  31.613 us  |  __mmu_notifier_invalidate_range_start();
  20.659 us  |  __mmu_notifier_invalidate_range_start();
  19.979 us  |  __mmu_notifier_invalidate_range_start();
  20.416 us  |  __mmu_notifier_invalidate_range_start();
  20.632 us  |  __mmu_notifier_invalidate_range_start();
  22.316 us  |  __mmu_notifier_invalidate_range_start();
  ...

After:
  ...
  4.089 us   |  __mmu_notifier_invalidate_range_start();
  4.096 us   |  __mmu_notifier_invalidate_range_start();
  3.560 us   |  __mmu_notifier_invalidate_range_start();
  3.376 us   |  __mmu_notifier_invalidate_range_start();
  3.772 us   |  __mmu_notifier_invalidate_range_start();
  3.353 us   |  __mmu_notifier_invalidate_range_start();
  3.332 us   |  __mmu_notifier_invalidate_range_start();
  3.332 us   |  __mmu_notifier_invalidate_range_start();
  3.332 us   |  __mmu_notifier_invalidate_range_start();
  3.337 us   |  __mmu_notifier_invalidate_range_start();
  ...
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/8] KVM: MMU: Use __gfn_to_rmap() to clean up kvm_handle_hva()

2012-07-02 Thread Takuya Yoshikawa
We can treat every level uniformly.

Signed-off-by: Takuya Yoshikawa yoshikawa.tak...@oss.ntt.co.jp
---
 arch/x86/kvm/mmu.c |   12 ++--
 1 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3b53d9e..d3e7e6a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1205,14 +1205,14 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned 
long hva,
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
gfn_t gfn = memslot->base_gfn + gfn_offset;
 
-   ret = handler(kvm, &memslot->rmap[gfn_offset], data);
+   ret = 0;
 
-   for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
-   struct kvm_lpage_info *linfo;
+   for (j = PT_PAGE_TABLE_LEVEL;
+j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
+   unsigned long *rmapp;
 
-   linfo = lpage_info_slot(gfn, memslot,
-   PT_DIRECTORY_LEVEL + j);
-   ret |= handler(kvm, &linfo->rmap_pde, data);
+   rmapp = __gfn_to_rmap(gfn, j, memslot);
+   ret |= handler(kvm, rmapp, data);
}
trace_kvm_age_page(hva, memslot, ret);
retval |= ret;
-- 
1.7.5.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/8] KVM: Introduce hva_to_gfn_memslot() for kvm_handle_hva()

2012-07-02 Thread Takuya Yoshikawa
This restricts hva handling in mmu code and makes it easier to extend
kvm_handle_hva() so that it can treat a range of addresses later in this
patch series.

Signed-off-by: Takuya Yoshikawa yoshikawa.tak...@oss.ntt.co.jp
Cc: Alexander Graf ag...@suse.de
Cc: Paul Mackerras pau...@samba.org
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c |6 +++---
 arch/x86/kvm/mmu.c  |3 +--
 include/linux/kvm_host.h|8 
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index d03eb6f..3703755 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -772,10 +772,10 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long 
hva,
 
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
-   gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+   gfn_t gfn = hva_to_gfn_memslot(hva, memslot);
+   gfn_t gfn_offset = gfn - memslot->base_gfn;
 
-   ret = handler(kvm, &memslot->rmap[gfn_offset],
- memslot->base_gfn + gfn_offset);
+   ret = handler(kvm, &memslot->rmap[gfn_offset], gfn);
retval |= ret;
}
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d3e7e6a..b898bec 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1202,8 +1202,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long 
hva,
 
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
-   gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
-   gfn_t gfn = memslot->base_gfn + gfn_offset;
+   gfn_t gfn = hva_to_gfn_memslot(hva, memslot);
 
ret = 0;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c7f7787..c0dd2e9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -740,6 +740,14 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t 
base_gfn, int level)
(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 }
 
+static inline gfn_t
+hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
+{
+   gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT;
+
+   return slot->base_gfn + gfn_offset;
+}
+
 static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
   gfn_t gfn)
 {
-- 
1.7.5.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/8] KVM: MMU: Make kvm_handle_hva() handle range of addresses

2012-07-02 Thread Takuya Yoshikawa
When guest's memory is backed by THP pages, MMU notifier needs to call
kvm_unmap_hva(), which in turn leads to kvm_handle_hva(), in a loop to
invalidate a range of pages which constitute one huge page:

  for each page
for each memslot
  if page is in memslot
unmap using rmap

This means although every page in that range is expected to be found in
the same memslot, we are forced to check unrelated memslots many times.
If the guest has more memslots, the situation will become worse.

Furthermore, if the range does not include any pages in the guest's
memory, the loop over the pages will just consume extra time.

This patch, together with the following patches, solves this problem by
introducing kvm_handle_hva_range() which makes the loop look like this:

  for each memslot
for each page in memslot
  unmap using rmap

In this new processing, the actual work is converted to a loop over rmap
which is much more cache friendly than before.

Signed-off-by: Takuya Yoshikawa yoshikawa.tak...@oss.ntt.co.jp
Cc: Alexander Graf ag...@suse.de
Cc: Paul Mackerras pau...@samba.org
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c |   36 ++--
 arch/x86/kvm/mmu.c  |   44 ++
 2 files changed, 61 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 3703755..1a470bc 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -756,9 +756,12 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, 
struct kvm_vcpu *vcpu,
goto out_put;
 }
 
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
-unsigned long gfn))
+static int kvm_handle_hva_range(struct kvm *kvm,
+   unsigned long start,
+   unsigned long end,
+   int (*handler)(struct kvm *kvm,
+  unsigned long *rmapp,
+  unsigned long gfn))
 {
int ret;
int retval = 0;
@@ -767,12 +770,22 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long 
hva,
 
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
-   unsigned long start = memslot->userspace_addr;
-   unsigned long end;
+   unsigned long hva_start, hva_end;
+   gfn_t gfn, gfn_end;
+
+   hva_start = max(start, memslot->userspace_addr);
+   hva_end = min(end, memslot->userspace_addr +
+   (memslot->npages << PAGE_SHIFT));
+   if (hva_start >= hva_end)
+   continue;
+   /*
+* {gfn(page) | page intersects with [hva_start, hva_end)} =
+* {gfn, gfn+1, ..., gfn_end-1}.
+*/
+   gfn = hva_to_gfn_memslot(hva_start, memslot);
+   gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 
-   end = start + (memslot->npages << PAGE_SHIFT);
-   if (hva >= start && hva < end) {
-   gfn_t gfn = hva_to_gfn_memslot(hva, memslot);
+   for (; gfn < gfn_end; ++gfn) {
 gfn_t gfn_offset = gfn - memslot->base_gfn;
 
 ret = handler(kvm, &memslot->rmap[gfn_offset], gfn);
@@ -783,6 +796,13 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long 
hva,
return retval;
 }
 
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+unsigned long gfn))
+{
+   return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
+}
+
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
   unsigned long gfn)
 {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b898bec..6903568 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1183,10 +1183,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned 
long *rmapp,
return 0;
 }
 
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- unsigned long data,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
-unsigned long data))
+static int kvm_handle_hva_range(struct kvm *kvm,
+   unsigned long start,
+   unsigned long end,
+   unsigned long data,
+   int (*handler)(struct kvm *kvm,
+  unsigned long *rmapp,
+  unsigned long data))
 {
int j;
int ret;
@@ 

[PATCH 4/8] KVM: Introduce kvm_unmap_hva_range() for kvm_mmu_notifier_invalidate_range_start()

2012-07-02 Thread Takuya Yoshikawa
When we tested KVM under memory pressure, with THP enabled on the host,
we noticed that MMU notifier took a long time to invalidate huge pages.

Since the invalidation was done with mmu_lock held, it not only wasted
the CPU but also made the host harder to respond.

This patch mitigates this by using kvm_handle_hva_range().

Signed-off-by: Takuya Yoshikawa yoshikawa.tak...@oss.ntt.co.jp
Cc: Alexander Graf ag...@suse.de
Cc: Paul Mackerras pau...@samba.org
---
 arch/powerpc/include/asm/kvm_host.h |2 ++
 arch/powerpc/kvm/book3s_64_mmu_hv.c |7 +++
 arch/x86/include/asm/kvm_host.h |1 +
 arch/x86/kvm/mmu.c  |5 +
 virt/kvm/kvm_main.c |3 +--
 5 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 50ea12f..572ad01 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -52,6 +52,8 @@
 
 struct kvm;
 extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+extern int kvm_unmap_hva_range(struct kvm *kvm,
+  unsigned long start, unsigned long end);
 extern int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 1a470bc..3c635c0 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -870,6 +870,13 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
return 0;
 }
 
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long 
end)
+{
+   if (kvm->arch.using_mmu_notifiers)
+   kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
+   return 0;
+}
+
 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 unsigned long gfn)
 {
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 24b7647..5aab8d4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -942,6 +942,7 @@ extern bool kvm_rebooting;
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long 
end);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6903568..a5f0200 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1248,6 +1248,11 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
 }
 
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long 
end)
+{
+   return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+}
+
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 {
kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 636bd08..c597d00 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -332,8 +332,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct 
mmu_notifier *mn,
 * count is also read inside the mmu_lock critical section.
 */
kvm->mmu_notifier_count++;
-   for (; start < end; start += PAGE_SIZE)
-   need_tlb_flush |= kvm_unmap_hva(kvm, start);
+   need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
need_tlb_flush |= kvm->tlbs_dirty;
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
-- 
1.7.5.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/8] KVM: Separate rmap_pde from kvm_lpage_info-write_count

2012-07-02 Thread Takuya Yoshikawa
This makes it possible to loop over rmap_pde arrays in the same way as
we do over rmap so that we can optimize kvm_handle_hva_range() easily in
the following patch.

Signed-off-by: Takuya Yoshikawa yoshikawa.tak...@oss.ntt.co.jp
---
 arch/x86/include/asm/kvm_host.h |2 +-
 arch/x86/kvm/mmu.c  |6 +++---
 arch/x86/kvm/x86.c  |   11 +++
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5aab8d4..aea1673 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -499,11 +499,11 @@ struct kvm_vcpu_arch {
 };
 
 struct kvm_lpage_info {
-   unsigned long rmap_pde;
int write_count;
 };
 
 struct kvm_arch_memory_slot {
+   unsigned long *rmap_pde[KVM_NR_PAGE_SIZES - 1];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
 };
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a5f0200..f2c408a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -931,13 +931,13 @@ static void pte_list_walk(unsigned long *pte_list, 
pte_list_walk_fn fn)
 static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
 {
-   struct kvm_lpage_info *linfo;
+   unsigned long idx;
 
if (likely(level == PT_PAGE_TABLE_LEVEL))
return &slot->rmap[gfn - slot->base_gfn];
 
-   linfo = lpage_info_slot(gfn, slot, level);
-   return &linfo->rmap_pde;
+   idx = gfn_to_index(gfn, slot->base_gfn, level);
+   return &slot->arch.rmap_pde[level - PT_DIRECTORY_LEVEL][idx];
 }
 
 /*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8eacb2e..9136f2c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6313,6 +6313,10 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
int i;
 
for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+   if (!dont || free->arch.rmap_pde[i] != dont->arch.rmap_pde[i]) {
+   kvm_kvfree(free->arch.rmap_pde[i]);
+   free->arch.rmap_pde[i] = NULL;
+   }
if (!dont || free->arch.lpage_info[i] !=
dont->arch.lpage_info[i]) {
kvm_kvfree(free->arch.lpage_info[i]);
free->arch.lpage_info[i] = NULL;
@@ -6332,6 +6336,11 @@ int kvm_arch_create_memslot(struct kvm_memory_slot 
*slot, unsigned long npages)
lpages = gfn_to_index(slot->base_gfn + npages - 1,
  slot->base_gfn, level) + 1;
 
+   slot->arch.rmap_pde[i] =
+   kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap_pde[i]));
+   if (!slot->arch.rmap_pde[i])
+   goto out_free;
+
slot->arch.lpage_info[i] =
kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
if (!slot->arch.lpage_info[i])
@@ -6360,7 +6369,9 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, 
unsigned long npages)
 
 out_free:
for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+   kvm_kvfree(slot->arch.rmap_pde[i]);
kvm_kvfree(slot->arch.lpage_info[i]);
+   slot->arch.rmap_pde[i] = NULL;
slot->arch.lpage_info[i] = NULL;
}
return -ENOMEM;
-- 
1.7.5.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/8] KVM: MMU: Add memslot parameter to hva handlers

2012-07-02 Thread Takuya Yoshikawa
This is needed to push trace_kvm_age_page() into kvm_age_rmapp() in the
following patch.

Signed-off-by: Takuya Yoshikawa yoshikawa.tak...@oss.ntt.co.jp
---
 arch/x86/kvm/mmu.c |   16 +---
 1 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f2c408a..bf8d50e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1124,7 +1124,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
-  unsigned long data)
+  struct kvm_memory_slot *slot, unsigned long data)
 {
u64 *sptep;
struct rmap_iterator iter;
@@ -1142,7 +1142,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long 
*rmapp,
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
-unsigned long data)
+struct kvm_memory_slot *slot, unsigned long data)
 {
u64 *sptep;
struct rmap_iterator iter;
@@ -1189,6 +1189,7 @@ static int kvm_handle_hva_range(struct kvm *kvm,
unsigned long data,
int (*handler)(struct kvm *kvm,
   unsigned long *rmapp,
+  struct kvm_memory_slot *slot,
   unsigned long data))
 {
int j;
@@ -1223,7 +1224,7 @@ static int kvm_handle_hva_range(struct kvm *kvm,
unsigned long *rmapp;
 
rmapp = __gfn_to_rmap(gfn, j, memslot);
-   ret |= handler(kvm, rmapp, data);
+   ret |= handler(kvm, rmapp, memslot, data);
}
trace_kvm_age_page(memslot->userspace_addr +
(gfn - memslot->base_gfn) * PAGE_SIZE,
@@ -1238,6 +1239,7 @@ static int kvm_handle_hva_range(struct kvm *kvm,
 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
  unsigned long data,
  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+struct kvm_memory_slot *slot,
 unsigned long data))
 {
return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
@@ -1259,7 +1261,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, 
pte_t pte)
 }
 
 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
-unsigned long data)
+struct kvm_memory_slot *slot, unsigned long data)
 {
u64 *sptep;
struct rmap_iterator uninitialized_var(iter);
@@ -1274,7 +1276,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long 
*rmapp,
 * out actively used pages or breaking up actively used hugepages.
 */
if (!shadow_accessed_mask)
-   return kvm_unmap_rmapp(kvm, rmapp, data);
+   return kvm_unmap_rmapp(kvm, rmapp, slot, data);
 
for (sptep = rmap_get_first(*rmapp, &iter); sptep;
 sptep = rmap_get_next(&iter)) {
@@ -1291,7 +1293,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long 
*rmapp,
 }
 
 static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
 {
u64 *sptep;
struct rmap_iterator iter;
@@ -1329,7 +1331,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 
*spte, gfn_t gfn)
 
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 
-   kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
+   kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
 }
 
-- 
1.7.5.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/8] KVM: MMU: Push trace_kvm_age_page() into kvm_age_rmapp()

2012-07-02 Thread Takuya Yoshikawa
This restricts the tracing to page aging and makes it possible to
optimize kvm_handle_hva_range() further in the following patch.

Signed-off-by: Takuya Yoshikawa yoshikawa.tak...@oss.ntt.co.jp
---
 arch/x86/kvm/mmu.c |   23 ++-
 1 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bf8d50e..3082199 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1193,8 +1193,7 @@ static int kvm_handle_hva_range(struct kvm *kvm,
   unsigned long data))
 {
int j;
-   int ret;
-   int retval = 0;
+   int ret = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
 
@@ -1217,8 +1216,6 @@ static int kvm_handle_hva_range(struct kvm *kvm,
gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 
for (; gfn  gfn_end; ++gfn) {
-   ret = 0;
-
for (j = PT_PAGE_TABLE_LEVEL;
 j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
unsigned long *rmapp;
@@ -1226,14 +1223,10 @@ static int kvm_handle_hva_range(struct kvm *kvm,
rmapp = __gfn_to_rmap(gfn, j, memslot);
ret |= handler(kvm, rmapp, memslot, data);
}
-   trace_kvm_age_page(memslot->userspace_addr +
-   (gfn - memslot->base_gfn) * PAGE_SIZE,
-   memslot, ret);
-   retval |= ret;
}
}
 
-   return retval;
+   return ret;
 }
 
 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
@@ -1275,8 +1268,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long 
*rmapp,
 * This has some overhead, but not as much as the cost of swapping
 * out actively used pages or breaking up actively used hugepages.
 */
-   if (!shadow_accessed_mask)
-   return kvm_unmap_rmapp(kvm, rmapp, slot, data);
+   if (!shadow_accessed_mask) {
+   young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
+   goto out;
+   }
 
for (sptep = rmap_get_first(*rmapp, &iter); sptep;
 sptep = rmap_get_next(&iter)) {
@@ -1288,7 +1283,9 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long 
*rmapp,
 (unsigned long *)sptep);
}
}
-
+out:
+   /* @data has hva passed to kvm_age_hva(). */
+   trace_kvm_age_page(data, slot, young);
return young;
 }
 
@@ -1337,7 +1334,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 
*spte, gfn_t gfn)
 
 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 {
-   return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
+   return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp);
 }
 
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-- 
1.7.5.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4] KVM: x86: Implement PCID/INVPCID for guests with EPT

2012-07-02 Thread Avi Kivity
On 07/02/2012 03:32 AM, Mao, Junjie wrote:
  I think this means I can replace the code here with a check in
 nested_vmx_run. Do I understand correctly?
 
 Correct, but the check already exists:
 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
   nested_vmx_procbased_ctls_low,
 nested_vmx_procbased_ctls_high) ||
 !vmx_control_verify(vmcs12->secondary_vm_exec_control,
   nested_vmx_secondary_ctls_low,
 nested_vmx_secondary_ctls_high) ||
 !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
   nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
 !vmx_control_verify(vmcs12->vm_exit_controls,
   nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) ||
 !vmx_control_verify(vmcs12->vm_entry_controls,
   nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high))
 {
 nested_vmx_failValid(vcpu,
 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 return 1;
 }
 
 So all that is needed is to initialize nested_vmx_entry_ctls_high properly.
 
 nested_vmx_entry_ctls_high only contains 
 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES at present, which means it should be 
 safe to simply remove the code.

Yes, I misread the code as initializing it to what the cpu supports, but
it is correct as is.  So just drop this check.



-- 
error compiling committee.c: too many arguments to function


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 8/8] KVM: MMU: Avoid handling same rmap_pde in kvm_handle_hva_range()

2012-07-02 Thread Takuya Yoshikawa
When we invalidate a THP page, we call the handler with the same
rmap_pde argument 512 times in the following loop:

  for each guest page in the range
for each level
  unmap using rmap

This patch avoids these extra handler calls by changing the loop order
like this:

  for each level
for each rmap in the range
  unmap using rmap

With the preceding patches in the patch series, this made THP page
invalidation more than 5 times faster on our x86 host: the host became
more responsive during swapping the guest's memory as a result.

Signed-off-by: Takuya Yoshikawa yoshikawa.tak...@oss.ntt.co.jp
---
 arch/x86/kvm/mmu.c |   28 ++--
 1 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3082199..d07b4ad 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1201,7 +1201,7 @@ static int kvm_handle_hva_range(struct kvm *kvm,
 
kvm_for_each_memslot(memslot, slots) {
unsigned long hva_start, hva_end;
-   gfn_t gfn, gfn_end;
+   gfn_t gfn_start, gfn_end;
 
hva_start = max(start, memslot->userspace_addr);
hva_end = min(end, memslot->userspace_addr +
(memslot->npages << PAGE_SHIFT));
continue;
/*
 * {gfn(page) | page intersects with [hva_start, hva_end)} =
-* {gfn, gfn+1, ..., gfn_end-1}.
+* {gfn_start, gfn_start+1, ..., gfn_end-1}.
 */
-   gfn = hva_to_gfn_memslot(hva_start, memslot);
+   gfn_start = hva_to_gfn_memslot(hva_start, memslot);
gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 
-   for (; gfn < gfn_end; ++gfn) {
-   for (j = PT_PAGE_TABLE_LEVEL;
-j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
-   unsigned long *rmapp;
+   for (j = PT_PAGE_TABLE_LEVEL;
+j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
+   unsigned long idx, idx_end;
+   unsigned long *rmapp;
 
-   rmapp = __gfn_to_rmap(gfn, j, memslot);
-   ret |= handler(kvm, rmapp, memslot, data);
-   }
+   /*
+* {idx(page_j) | page_j intersects with
+*  [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
+*/
+   idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
+   idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
+
+   rmapp = __gfn_to_rmap(gfn_start, j, memslot);
+
+   for (; idx <= idx_end; ++idx)
+   ret |= handler(kvm, rmapp++, memslot, data);
}
}
 
-- 
1.7.5.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 2/7] memory: Flush coalesced MMIO on selected region access

2012-07-02 Thread Avi Kivity
On 06/29/2012 07:37 PM, Jan Kiszka wrote:
 Instead of flushing pending coalesced MMIO requests on every vmexit,
 this provides a mechanism to selectively flush when memory regions
 related to the coalesced one are accessed. This first of all includes
 the coalesced region itself but can also be applied to other regions, e.g.
 of the same device, by calling memory_region_set_flush_coalesced.
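(As an illustration of the mechanism described above, not part of the original
message: a device would typically pair coalescing on one MMIO region with a
flush marker on a related region of the same device. The MyDevState type and
its data/status fields are made up for the sketch.)

    static void mydev_init_regions(MyDevState *s)
    {
        /* Batch (coalesce) guest writes to the data region... */
        memory_region_add_coalescing(&s->data, 0, memory_region_size(&s->data));
        /* ...and flush the pending batch whenever the related status
         * region of the same device is accessed. */
        memory_region_set_flush_coalesced(&s->status);
    }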

Looks fine.

I have a hard time deciding whether this should go through the kvm tree
or memory tree.  Anthony, perhaps you can commit it directly to avoid
the livelock?

Reviewed-by: Avi Kivity a...@redhat.com


-- 
error compiling committee.c: too many arguments to function


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 2/7] memory: Flush coalesced MMIO on selected region access

2012-07-02 Thread Avi Kivity
On 07/02/2012 12:07 PM, Avi Kivity wrote:
 
 Reviewed-by: Avi Kivity a...@redhat.com

(for the entire patchset)


-- 
error compiling committee.c: too many arguments to function


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/8] KVM: Optimize MMU notifier's THP page invalidation -v4

2012-07-02 Thread Avi Kivity
On 07/02/2012 11:52 AM, Takuya Yoshikawa wrote:
 v3->v4: Resolved trace_kvm_age_page() issue -- patch 6,7
 v2->v3: Fixed intersection calculations. -- patch 3, 8
 
   Takuya
 
 Takuya Yoshikawa (8):
   KVM: MMU: Use __gfn_to_rmap() to clean up kvm_handle_hva()
   KVM: Introduce hva_to_gfn_memslot() for kvm_handle_hva()
   KVM: MMU: Make kvm_handle_hva() handle range of addresses
   KVM: Introduce kvm_unmap_hva_range() for 
 kvm_mmu_notifier_invalidate_range_start()
   KVM: Separate rmap_pde from kvm_lpage_info-write_count
   KVM: MMU: Add memslot parameter to hva handlers
   KVM: MMU: Push trace_kvm_age_page() into kvm_age_rmapp()
   KVM: MMU: Avoid handling same rmap_pde in kvm_handle_hva_range()
 

Reviewed-by: Avi Kivity a...@redhat.com


-- 
error compiling committee.c: too many arguments to function


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH stable-1.1] qemu-kvm: virtio: Do not register mask notifiers without in-kernel irqchip support

2012-07-02 Thread Michael S. Tsirkin
On Mon, Jul 02, 2012 at 10:05:39AM +0200, Jan Kiszka wrote:
 We crash if we register mask notifiers without a backing in-kernel
 irqchip. This corresponds to the check now present in QEMU upstream after 1.1.
 
 Signed-off-by: Jan Kiszka jan.kis...@siemens.com

Acked-by: Michael S. Tsirkin m...@redhat.com

 ---
 
 Not needed for master as we have upstream logic there already.
 
  hw/virtio-pci.c |4 ++--
  1 files changed, 2 insertions(+), 2 deletions(-)
 
 diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
 index a0c2ca7..5b64356 100644
 --- a/hw/virtio-pci.c
 +++ b/hw/virtio-pci.c
 @@ -629,7 +629,7 @@ static int virtio_pci_set_guest_notifiers(void *opaque, 
 bool assign)
  
  /* Must unset mask notifier while guest notifier
   * is still assigned */
 -if (!assign) {
 +if (kvm_irqchip_in_kernel() && !assign) {
   r = msix_unset_mask_notifier(proxy->pci_dev);
  assert(r >= 0);
  }
 @@ -647,7 +647,7 @@ static int virtio_pci_set_guest_notifiers(void *opaque, 
 bool assign)
  
  /* Must set mask notifier after guest notifier
   * has been assigned */
 -if (assign) {
 +if (kvm_irqchip_in_kernel() && assign) {
  r = msix_set_mask_notifier(proxy->pci_dev,
 virtio_pci_mask_notifier);
  if (r < 0) {
 -- 
 1.7.3.4
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


plan for device assignment upstream

2012-07-02 Thread Michael S. Tsirkin
I've been thinking hard about Jan's patches for device
assignment. Basically while I thought it makes sense
to make all devices: assignment and not - behave the
same and use same APIs for injecting irqs, Anthony thinks there is huge
value in making irq propagation hierarchical and device assignment
should be special cased.

We seem to be at impasse for now and I think merging
assignment ASAP has higher value than this specific
issue. So I fold - let's do it as Anthony and Jan's
original patches proposed.

Jan, can you please rebase and repost your original patchset (against
master, not against pci) that added query for host irqs callbacks for
device assignment? I'll re-review ignoring the idea of using the cache,
with intent to apply after I drop the cache code from the pci branch in a
couple of days (busy otherwise now).

I still intend to rework this later on, but that can wait.

Thanks,

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: plan for device assignment upstream

2012-07-02 Thread Avi Kivity
On 07/02/2012 12:18 PM, Michael S. Tsirkin wrote:
 I've been thinking hard about Jan's patches for device
 assignment. Basically while I thought it makes sense
 to make all devices: assignment and not - behave the
 same and use same APIs for injecting irqs, Anthony thinks there is huge
 value in making irq propagation hierarchical and device assignment
 should be special cased.
 
 We seem to be at impasse for now and I think merging
 assignment ASAP has higher value than this specific
 issue. So I fold - let's do it as Anthony and Jan's
 original patches proposed.
 
 Jan, can you please rebase and repost your original patchset (against
 master, not against pci) that added query for host irqs callbacks for
 device assignment? I'll re-review ignoring the idea of using the cache,
 with intent apply after I'll drop cache code from the pci branch in a
 couple of days (busy otherwise now).
 
 I still intend to rework this later on, but that can wait.

Agree with both your ideas about the API and the decision to rework it
in tree.

-- 
error compiling committee.c: too many arguments to function


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [bug 1.1] -M pc-1.0 + vhost = SIGSEGV

2012-07-02 Thread Michael Tokarev
02.07.2012 11:20, Jan Kiszka wrote:
 On 2012-07-01 17:06, Michael Tokarev wrote:
 When running current git version of qemu-kvm with -M pc-1.0
 
 Just to clarify: you are talking about stable-1.1 git, not master.

Yes, as the $Subject (partially) says.

[]
 So it looks like msix isn't initialized for -M pc-1.0 ?

And for earlier pc numbers too (eg -M pc-0.15).

 Yes, because the machine option defaults are missing here. Will send a
 patch.
 
 Also vhost is buggy as it depends on in-kernel irqchip but doesn't check
 for it. Needs to be fixed as well.

And while we're at it, can we please take a look at the kernel
side of this bug, mentioned in my other email?  Namely, when
qemu-kvm sigsegvs in this place, the (persistent) tap device
becomes unusable and needs to be re-created (no packets are
flowing).  We've a nice reproducer now for this kernel issue.

(Cc'ing mst for this)

Thanks,

/mjt
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: plan for device assignment upstream

2012-07-02 Thread Jan Kiszka
On 2012-07-02 11:18, Michael S. Tsirkin wrote:
 I've been thinking hard about Jan's patches for device
 assignment. Basically while I thought it makes sense
 to make all devices: assignment and not - behave the
 same and use same APIs for injecting irqs, Anthony thinks there is huge
 value in making irq propagation hierarchical and device assignment
 should be special cased.

On the long term, we will need direct injection, ie. caching, to allow
making it lock-less. Stepping through all intermediate layers will cause
troubles, at least performance-wise, when having to take and drop a lock
at each stop.

 
 We seem to be at impasse for now and I think merging
 assignment ASAP has higher value than this specific
 issue. So I fold - let's do it as Anthony and Jan's
 original patches proposed.
 
 Jan, can you please rebase and repost your original patchset (against
 master, not against pci) that added query for host irqs callbacks for
 device assignment? I'll re-review ignoring the idea of using the cache,
 with intent apply after I'll drop cache code from the pci branch in a
 couple of days (busy otherwise now).

OK, will do ASAP.

 
 I still intend to rework this later on, but that can wait.
 
 Thanks,
 

Thanks as well,
Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: plan for device assignment upstream

2012-07-02 Thread Avi Kivity
On 07/02/2012 12:30 PM, Jan Kiszka wrote:
 On 2012-07-02 11:18, Michael S. Tsirkin wrote:
 I've been thinking hard about Jan's patches for device
 assignment. Basically while I thought it makes sense
 to make all devices: assignment and not - behave the
 same and use same APIs for injecting irqs, Anthony thinks there is huge
 value in making irq propagation hierarchical and device assignment
 should be special cased.
 
 On the long term, we will need direct injection, ie. caching, to allow
 making it lock-less. Stepping through all intermediate layers will cause
 troubles, at least performance-wise, when having to take and drop a lock
 at each stop.

So we precalculate everything beforehand.  Instead of each qemu_irq
triggering a callback, calculating the next hop and firing the next
qemu_irq, configure each qemu_irq array with a function that describes
how to take the next hop.  Whenever the configuration changes,
recalculate all routes.

For device assignment or vhost, we can have a qemu_irq_irqfd() which
converts a qemu_irq to an eventfd.  If the route calculations determine
that it can be serviced via a real irqfd, they also configure it as an
irqfd.  Otherwise qemu configures a poll on this eventfd and calls the
callback when needed.
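
For illustration, a minimal self-contained sketch of the precalculated-route idea
above. Every name here (irq_route, recalculate_routes, the irqfd flag) is
hypothetical and not an existing QEMU API; the point is only that a configuration
change rebuilds a flat table, and raising an irq becomes a single lookup, with an
eventfd/irqfd fast path when the route fully resolves in the kernel.

#include <stdbool.h>
#include <stdio.h>

typedef void (*deliver_fn)(int vector);        /* precomputed final hop */

struct irq_route {
    bool       can_use_irqfd;   /* fully resolvable in the kernel (irqfd)? */
    int        vector;          /* resolved destination */
    deliver_fn deliver;         /* userspace path when no irqfd is possible */
};

static struct irq_route routes[16];

static void deliver_to_apic(int vector)
{
    printf("inject vector %d via userspace path\n", vector);
}

/* Called whenever interrupt-controller configuration changes. */
static void recalculate_routes(void)
{
    for (int i = 0; i < 16; i++) {
        routes[i].vector        = 0x20 + i;      /* pretend resolution result */
        routes[i].can_use_irqfd = (i % 2 == 0);  /* pretend some lines qualify */
        routes[i].deliver       = deliver_to_apic;
    }
}

/* Raising line n is now a lookup, not a walk through intermediate devices. */
static void raise_irq(int n)
{
    struct irq_route *r = &routes[n];
    if (r->can_use_irqfd) {
        printf("line %d: signal eventfd, kernel injects vector %d\n", n, r->vector);
    } else {
        r->deliver(r->vector);
    }
}

int main(void)
{
    recalculate_routes();
    raise_irq(2);
    raise_irq(3);
    return 0;
}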


-- 
error compiling committee.c: too many arguments to function


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/6] kvm/s390: sigp related changes for 3.6

2012-07-02 Thread Martin Schwidefsky
On Fri, 29 Jun 2012 20:19:46 -0300
Marcelo Tosatti mtosa...@redhat.com wrote:

 On Tue, Jun 26, 2012 at 04:06:35PM +0200, Cornelia Huck wrote:
  Avi, Marcelo,
  
  here are some more s390 patches for the next release.
  
  Patches 1 and 2 are included for dependency reasons; they will also
  be sent through Martin's s390 tree.
 
 I don't see why patch 1 is a dependency for merging in the kvm 
 tree, and why patch 2 should go through both trees?
 
 That is, patch 1 can go through S390 tree, patches 2-6 through 
 KVM tree. No?

Patch 3 has a dependency on patch 2 and patch 2 has a dependency
on patch 1. The hunk in pcpu_running would cause a reject:

@@ -155,8 +131,8 @@ static inline int pcpu_stopped(struct pcpu *pcpu)
 
 static inline int pcpu_running(struct pcpu *pcpu)
 {
-   if (__pcpu_sigp(pcpu->address, sigp_sense_running,
-   0, &pcpu->status) != sigp_status_stored)
+   if (__pcpu_sigp(pcpu->address, SIGP_SENSE_RUNNING,
+   0, &pcpu->status) != SIGP_CC_STATUS_STORED)
return 1;
/* Status stored condition code is equivalent to cpu not running. */
return 0;


-- 
blue skies,
   Martin.

Reality continues to ruin my life. - Calvin.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


KVM call agenda for Tuesday, July 3rd

2012-07-02 Thread Juan Quintela

Hi

Please send in any agenda items you are interested in covering.

Later, Juan.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH] Expose tsc deadline timer feature to guest

2012-07-02 Thread Liu, Jinsong
Eduardo, Jan, Andreas

As we sync 3 months ago, I wait until qemu1.1 done, then re-write patch based 
on qemu1.1.
Now it's time to re-write my patch based on qemu1.1.

Attached is a RFC patch for exposing tsc deadline timer to guest.
I have checked current qemu1.1 code, and read some emails regarding to cpuid 
exposing these days.
However, I think I may ignore something (so many discussion :-), so if you 
think anything wrong, please point out to me.

Thanks,
Jinsong


From 8b5b003f6f8834d2d5d71e18bb47b7f089bc4928 Mon Sep 17 00:00:00 2001
From: Liu, Jinsong jinsong@intel.com
Date: Tue, 3 Jul 2012 02:35:10 +0800
Subject: [PATCH] Expose tsc deadline timer feature to guest

This patch exposes tsc deadline timer feature to guest if
1). in-kernel irqchip is used, and
2). kvm has emulated tsc deadline timer, and
3). user authorize the feature exposing via -cpu or +/- tsc-deadline

Signed-off-by: Liu, Jinsong jinsong@intel.com
---
 target-i386/cpu.h |1 +
 target-i386/kvm.c |5 +
 2 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 79cc640..d1a4a04 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -400,6 +400,7 @@
 #define CPUID_EXT_X2APIC   (1 << 21)
 #define CPUID_EXT_MOVBE    (1 << 22)
 #define CPUID_EXT_POPCNT   (1 << 23)
+#define CPUID_EXT_TSC_DEADLINE_TIMER (1 << 24)
 #define CPUID_EXT_XSAVE    (1 << 26)
 #define CPUID_EXT_OSXSAVE  (1 << 27)
 #define CPUID_EXT_HYPERVISOR  (1 << 31)
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 0d0d8f6..52b577f 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -361,8 +361,13 @@ int kvm_arch_init_vcpu(CPUX86State *env)
 env->cpuid_features = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
 
 i = env->cpuid_ext_features & CPUID_EXT_HYPERVISOR;
+j = env->cpuid_ext_features & CPUID_EXT_TSC_DEADLINE_TIMER;
 env->cpuid_ext_features = kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX);
 env->cpuid_ext_features |= i;
+if (j && kvm_irqchip_in_kernel() &&
+    kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
+    env->cpuid_ext_features |= CPUID_EXT_TSC_DEADLINE_TIMER;
+}
 
 env->cpuid_ext2_features = kvm_arch_get_supported_cpuid(s, 0x80000001,
  0, R_EDX);
-- 
1.7.1




Re: [RFC PATCH] Expose tsc deadline timer feature to guest

2012-07-02 Thread Andreas Färber
On 02.07.2012 13:08, Liu, Jinsong wrote:
 Eduardo, Jan, Andreas
 
 As we sync 3 months ago, I wait until qemu1.1 done, then re-write patch based 
 on qemu1.1.
 Now it's time to re-write my patch based on qemu1.1.
 
 Attached is a RFC patch for exposing tsc deadline timer to guest.
 I have checked current qemu1.1 code, and read some emails regarding to cpuid 
 exposing these days.
 However, I think I may ignore something (so many discussion :-), so if you 
 think anything wrong, please point out to me.

No objections or nitpicks from my (CPU) side FWIW.

Regards,
Andreas

 
 Thanks,
 Jinsong
 
 
 From 8b5b003f6f8834d2d5d71e18bb47b7f089bc4928 Mon Sep 17 00:00:00 2001
 From: Liu, Jinsong jinsong@intel.com
 Date: Tue, 3 Jul 2012 02:35:10 +0800
 Subject: [PATCH] Expose tsc deadline timer feature to guest
 
 This patch exposes tsc deadline timer feature to guest if
 1). in-kernel irqchip is used, and
 2). kvm has emulated tsc deadline timer, and
 3). user authorize the feature exposing via -cpu or +/- tsc-deadline
 
 Signed-off-by: Liu, Jinsong jinsong@intel.com
 ---
  target-i386/cpu.h |1 +
  target-i386/kvm.c |5 +
  2 files changed, 6 insertions(+), 0 deletions(-)
 
 diff --git a/target-i386/cpu.h b/target-i386/cpu.h
 index 79cc640..d1a4a04 100644
 --- a/target-i386/cpu.h
 +++ b/target-i386/cpu.h
 @@ -400,6 +400,7 @@
  #define CPUID_EXT_X2APIC   (1 << 21)
  #define CPUID_EXT_MOVBE    (1 << 22)
  #define CPUID_EXT_POPCNT   (1 << 23)
 +#define CPUID_EXT_TSC_DEADLINE_TIMER (1 << 24)
  #define CPUID_EXT_XSAVE    (1 << 26)
  #define CPUID_EXT_OSXSAVE  (1 << 27)
  #define CPUID_EXT_HYPERVISOR  (1 << 31)
 diff --git a/target-i386/kvm.c b/target-i386/kvm.c
 index 0d0d8f6..52b577f 100644
 --- a/target-i386/kvm.c
 +++ b/target-i386/kvm.c
 @@ -361,8 +361,13 @@ int kvm_arch_init_vcpu(CPUX86State *env)
  env->cpuid_features = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
  
  i = env->cpuid_ext_features & CPUID_EXT_HYPERVISOR;
 +j = env->cpuid_ext_features & CPUID_EXT_TSC_DEADLINE_TIMER;
  env->cpuid_ext_features = kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX);
  env->cpuid_ext_features |= i;
 +if (j && kvm_irqchip_in_kernel() &&
 +    kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
 +    env->cpuid_ext_features |= CPUID_EXT_TSC_DEADLINE_TIMER;
 +}
  
  env->cpuid_ext2_features = kvm_arch_get_supported_cpuid(s, 0x80000001,
   0, R_EDX);


-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 3/5] KVM: Flush TLB in mmu notifier without holding mmu_lock

2012-07-02 Thread Avi Kivity
Revisiting after hiatus.

On 05/21/2012 11:58 PM, Marcelo Tosatti wrote:
 On Thu, May 17, 2012 at 01:24:42PM +0300, Avi Kivity wrote:
 Signed-off-by: Avi Kivity a...@redhat.com
 ---
  virt/kvm/kvm_main.c |   16 
  1 file changed, 8 insertions(+), 8 deletions(-)
 
 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 index 585ab45..9f6d15d 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -302,11 +302,11 @@ static void kvm_mmu_notifier_invalidate_page(struct 
 mmu_notifier *mn,
  kvm->mmu_notifier_seq++;
  if (kvm_unmap_hva(kvm, address))
  kvm_mark_tlb_dirty(kvm);
 -/* we've to flush the tlb before the pages can be freed */
 -kvm_cond_flush_remote_tlbs(kvm);
 -
  spin_unlock(&kvm->mmu_lock);
  srcu_read_unlock(&kvm->srcu, idx);
 +
 +/* we've to flush the tlb before the pages can be freed */
 +kvm_cond_flush_remote_tlbs(kvm);
  }
 
 There are still sites that assumed implicitly that acquiring mmu_lock
 guarantees that sptes and remote TLBs are in sync. Example:
 
 void kvm_mmu_zap_all(struct kvm *kvm)
 {
 struct kvm_mmu_page *sp, *node;
 LIST_HEAD(invalid_list);
 
  spin_lock(&kvm->mmu_lock);
 restart:
  list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages,
  link)
  if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
  goto restart;
  
  kvm_mmu_commit_zap_page(kvm, &invalid_list);
  spin_unlock(&kvm->mmu_lock);
 }
 
 kvm_mmu_commit_zap_page only flushes if the TLB was dirtied by this
 context, not before. kvm_mmu_unprotect_page is similar.
 
 In general:
 
 context 1   context 2
 
 lock(mmu_lock)
 modify spte
 mark_tlb_dirty
 unlock(mmu_lock)
 lock(mmu_lock)
 read spte
 make a decision based on spte value
 unlock(mmu_lock)
 flush remote TLBs
 
 
 Is scary.

It scares me too.  Could be something trivial like following an
intermediate paging structure entry or not, depending on whether it is
present.  I don't think we'll be able to audit the entire code base to
ensure nothing like that happens, or to enforce it later on.

 Perhaps have a rule that says:
 
 1) Conditionally flush remote TLB after acquiring mmu_lock, 
 before anything (even perhaps inside the lock macro).

I would like to avoid any flush within the lock.  We may back down on
this goal but let's try to find something that works and doesn't depend
on huge audits.

 2) Except special cases where it is clear that this is not 
 necessary.

One option: your idea, but without taking the lock

   def mmu_begin():
       clean = False
       while not clean:
           cond_flush the tlb
           rtlb = kvm.remote_tlb_counter  # read atomically wrt flush
           spin_lock(mmu_lock)
           clean = (rtlb == kvm.remote_tlb_counter)
           if not clean:
               spin_unlock(mmu_lock)

Since we're spinning over the tlb counter, there's no real advantage
here except that preemption is enabled.

A simpler option is to make mmu_end() do a cond_flush():

   def mmu_begin():
   spin_lock(mmu_lock)

   def mmu_end():
   spin_unlock(mmu_lock)
   cond_flush

We need something for lockbreaking too:

   def mmu_lockbreak():
   if not (contended or need_resched):
   return False
   remember flush counter
   cond_resched_lock
   return flush counter changed

The caller would check the return value to see if it needs to redo
anything.  But this has the danger of long operations (like write
protecting a slot) never completing.
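
For illustration, a toy self-contained rendering of that caller-side pattern:
remember the flush counter, drop and retake the lock, and redo the scan if a
flush happened in between. fake_kvm, the mutex, and the loop body are stand-ins
for the real mmu_lock machinery, and the goto-restart also shows where the
"never completing" risk comes from.

#include <pthread.h>
#include <stdio.h>

struct fake_kvm {
    pthread_mutex_t mmu_lock;
    unsigned long   remote_tlb_counter;   /* bumped by (simulated) remote flushes */
};

/* Drop the lock briefly; report whether a flush happened while it was dropped. */
static int mmu_lockbreak(struct fake_kvm *kvm)
{
    unsigned long before = kvm->remote_tlb_counter;
    pthread_mutex_unlock(&kvm->mmu_lock);
    /* ...point where another context may flush and bump the counter... */
    pthread_mutex_lock(&kvm->mmu_lock);
    return kvm->remote_tlb_counter != before;
}

static void long_mmu_operation(struct fake_kvm *kvm)
{
    pthread_mutex_lock(&kvm->mmu_lock);
restart:
    for (int i = 0; i < 1000; i++) {
        /* ...process one spte/rmap entry... */
        if (i % 100 == 0 && mmu_lockbreak(kvm)) {
            goto restart;          /* state may have changed; redo the scan */
        }
    }
    pthread_mutex_unlock(&kvm->mmu_lock);
}

int main(void)
{
    struct fake_kvm kvm = { PTHREAD_MUTEX_INITIALIZER, 0 };
    long_mmu_operation(&kvm);
    puts("done");
    return 0;
}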

-- 
error compiling committee.c: too many arguments to function


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v11 5/8] KVM: PPC: ev_idle hcall support for e500 guests

2012-07-02 Thread Alexander Graf

On 22.06.2012, at 22:04, Stuart Yoder wrote:

 From: Liu Yu-B13201 yu@freescale.com
 
 Signed-off-by: Liu Yu yu@freescale.com
 [varun: 64-bit changes]
 Signed-off-by: Varun Sethi varun.se...@freescale.com
 Signed-off-by: Stuart Yoder stuart.yo...@freescale.com
 ---
 -v11:
   -added comment about origin of idle instruction sequence
   -update for 64-bit support found in testing
 
 arch/powerpc/include/asm/epapr_hcalls.h |   11 +
 arch/powerpc/kernel/epapr_hcalls.S  |   32 +++
 arch/powerpc/kernel/epapr_paravirt.c|   11 +-
 3 files changed, 48 insertions(+), 6 deletions(-)
 
 diff --git a/arch/powerpc/include/asm/epapr_hcalls.h 
 b/arch/powerpc/include/asm/epapr_hcalls.h
 index c0c7adc..833ce2c 100644
 --- a/arch/powerpc/include/asm/epapr_hcalls.h
 +++ b/arch/powerpc/include/asm/epapr_hcalls.h
 @@ -50,10 +50,6 @@
 #ifndef _EPAPR_HCALLS_H
 #define _EPAPR_HCALLS_H
 
 -#include <linux/types.h>
 -#include <linux/errno.h>
 -#include <asm/byteorder.h>
 -
 #define EV_BYTE_CHANNEL_SEND  1
 #define EV_BYTE_CHANNEL_RECEIVE   2
 #define EV_BYTE_CHANNEL_POLL  3
 @@ -109,6 +105,11 @@
 #define EV_UNIMPLEMENTED  12  /* Unimplemented hypercall */
 #define EV_BUFFER_OVERFLOW13  /* Caller-supplied buffer too small */
 
 +#ifndef __ASSEMBLY__
 +#include <linux/types.h>
 +#include <linux/errno.h>
 +#include <asm/byteorder.h>
 +
 /*
  * Hypercall register clobber list
  *
 @@ -506,5 +507,5 @@ static inline unsigned int ev_idle(void)
 
   return r3;
 }
 -
 +#endif /* !__ASSEMBLY__ */
 #endif
 diff --git a/arch/powerpc/kernel/epapr_hcalls.S 
 b/arch/powerpc/kernel/epapr_hcalls.S
 index 697b390..eb19377 100644
 --- a/arch/powerpc/kernel/epapr_hcalls.S
 +++ b/arch/powerpc/kernel/epapr_hcalls.S
 @@ -8,13 +8,45 @@
  */
 
 #include <linux/threads.h>
 +#include <asm/epapr_hcalls.h>
 #include <asm/reg.h>
 #include <asm/page.h>
 #include <asm/cputable.h>
 #include <asm/thread_info.h>
 #include <asm/ppc_asm.h>
 +#include <asm/asm-compat.h>
 #include <asm/asm-offsets.h>
 
 +/* epapr_ev_idle() was derived from e500_idle() */
 +_GLOBAL(epapr_ev_idle)
 +#ifndef CONFIG_PPC64
 + rlwinm  r3, r1, 0, 0, 31-THREAD_SHIFT   /* current thread_info */
 +#else
 + clrrdi  r3, r1, THREAD_SHIFT
 +#endif

Ben, would you be opposed to a macro that does this? Open-coding subtile 
details like this always makes me wary we could screw them up :)


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v11 8/8] PPC: Don't use hardcoded opcode for ePAPR hcall invocation

2012-07-02 Thread Alexander Graf

On 22.06.2012, at 22:06, Stuart Yoder wrote:

 From: Liu Yu-B13201 yu@freescale.com
 
 Signed-off-by: Liu Yu yu@freescale.com
 Signed-off-by: Stuart Yoder stuart.yo...@freescale.com
 ---
 -v11: no changes
 
 arch/powerpc/include/asm/epapr_hcalls.h |   22 +-
 arch/powerpc/include/asm/fsl_hcalls.h   |   36 +++---
 2 files changed, 29 insertions(+), 29 deletions(-)
 
 diff --git a/arch/powerpc/include/asm/epapr_hcalls.h 
 b/arch/powerpc/include/asm/epapr_hcalls.h
 index 833ce2c..b8d9445 100644
 --- a/arch/powerpc/include/asm/epapr_hcalls.h
 +++ b/arch/powerpc/include/asm/epapr_hcalls.h
 @@ -195,7 +195,7 @@ static inline unsigned int ev_int_set_config(unsigned int 
 interrupt,
   r5  = priority;
   r6  = destination;
 
 - __asm__ __volatile__ ("sc 1"
 + asm volatile("bl epapr_hypercall_start"
   : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6)
   : : EV_HCALL_CLOBBERS4

Hrm. ePAPR hypercalls are allowed to clobber lr, right? But our hypercall entry 
code depends on lr staying alive:

.global epapr_hypercall_start
epapr_hypercall_start:
li  r3, -1
nop
nop
nop
blr

So I suppose for this to work, we'd have to store lr off to the stack before 
doing the hypercall in epapr_hypercall_start.


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 3/5] KVM: Flush TLB in mmu notifier without holding mmu_lock

2012-07-02 Thread Avi Kivity
On 07/02/2012 03:05 PM, Avi Kivity wrote:
 We need something for lockbreaking too:
 
def mmu_lockbreak():
if not (contended or need_resched):
return False
remember flush counter
cond_resched_lock
return flush counter changed
 
 The caller would check the return value to see if it needs to redo
 anything.  But this has the danger of long operations (like write
 protecting a slot) never completing.

In fact I don't think we need the return value.  All long running
operations can tolerate a lock break.

kvm_mmu_change_mmu_pages: so long as the counter is maintained
correctly, it will function.  May livelock though, so we should abort if
we don't manage to keep the counter down.

get_dirty_log: so long as a write fault updates the next bitmap, we're fine

kvm_mmu_slot_remove_write_access: same.  It's hard to continue the loop
after a lockbreak though.  We can switch it to be rmap based instead.

flush_shadow (zap_all): just restart from the beginning after dropping
the lock.  May livelock, can be fixed by using a generation counter for
kvm_mmu_page.

kvm_mmu_sync_roots: already does lockbreaking

kvm_mmu_unprotect_page: not long-running in normal operation, but a
guest can make it long running.  However, we're allowed not to be
accurate about it and just return to the guest.

kvm_mmu_pte_write: can be a long operation with crazy guests. Normal
guests will work fine.

-- 
error compiling committee.c: too many arguments to function


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 11/15] ARM: KVM: World-switch implementation

2012-07-02 Thread Avi Kivity
On 06/21/2012 08:54 PM, Christoffer Dall wrote:
 @@ -504,6 +514,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu
 *vcpu, struct kvm_run *run)
*/
   preempt_disable();
   local_irq_disable();
 +
 + if (check_new_vmid_gen(kvm)) {
 + local_irq_enable();
 + preempt_enable();
 + continue;
 + }
 +

 I see the same race with signals.  Your signal_pending() check needs to
 be after the local_irq_disable(), otherwise we can enter a guest with a
 pending signal.

 
 that's not functionally incorrect though is it? It may simply increase
 the latency for the signal delivery as far as I can see, but I
 definitely don't mind changing this path in any case.

Nothing guarantees that there will be a next exit.  I think we still run
the timer tick on guest entry, so we'll exit after a few milliseconds,
but there are patches to disable the timer tick if only one task is queued.

-- 
error compiling committee.c: too many arguments to function


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v11 5/8] KVM: PPC: ev_idle hcall support for e500 guests

2012-07-02 Thread Benjamin Herrenschmidt
On Mon, 2012-07-02 at 14:20 +0200, Alexander Graf wrote:
 
 Ben, would you be opposed to a macro that does this? Open-coding
 subtile details like this always makes me wary we could screw them
 up :)

That's definitely material for a macro (and fixup the gazillion places
where we do it by hand already :-)

Cheers,
Ben.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 15298] kernel BUG at /usr/src/packages/BUILD/kernel-desktop-2.6.31.8/linux-2.6.31/mm/slab.c:532!

2012-07-02 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=15298


Alan a...@lxorguk.ukuu.org.uk changed:

   What|Removed |Added

 Status|NEW |RESOLVED
 CC||a...@lxorguk.ukuu.org.uk
 Resolution||OBSOLETE




-- 
Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email
--- You are receiving this mail because: ---
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v11 5/8] KVM: PPC: ev_idle hcall support for e500 guests

2012-07-02 Thread Alexander Graf

On 02.07.2012, at 15:24, Benjamin Herrenschmidt wrote:

 On Mon, 2012-07-02 at 14:20 +0200, Alexander Graf wrote:
 
 Ben, would you be opposed to a macro that does this? Open-coding
 subtile details like this always makes me wary we could screw them
 up :)
 
 That's definitely material for a macro (and fixup the gazillion places
 where we do it by hand already :-)

Just like I thought.

Stuart, want to take this on you? It'd be mostly a simple search+replace of the 
pattern in question with a macro GET_THREAD_INFO(r3, r1) that does the same 
thing as the code we have today, but makes it easier to read.
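
A rough sketch of what such a helper could look like (the macro name and its home
are assumptions for illustration; the real cleanup may differ):

/* Hypothetical helper, e.g. in asm/ppc_asm.h: load the current thread_info
 * pointer from the stack pointer, hiding the 32/64-bit difference. */
#ifdef CONFIG_PPC64
#define GET_THREAD_INFO(dst, sp)	clrrdi	dst, sp, THREAD_SHIFT
#else
#define GET_THREAD_INFO(dst, sp)	rlwinm	dst, sp, 0, 0, 31-THREAD_SHIFT
#endif

/* epapr_ev_idle would then simply do:
 *	GET_THREAD_INFO(r3, r1)
 */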


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: PPC: bookehv: Add ESR flag to Data Storage Interrupt

2012-07-02 Thread Alexander Graf

On 23.06.2012, at 01:33, Mihai Caraman wrote:

 ESR register is required by Data Storage Interrupt handling code.
 Add the specific flag to the interrupt handler.
 
 Signed-off-by: Mihai Caraman mihai.cara...@freescale.com

Thanks, applied to kvm-ppc-next.


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH v2 1/3] KVM: Add new -cpu best

2012-07-02 Thread Alexander Graf

On 26.06.2012, at 18:39, Alexander Graf wrote:

 During discussions on whether to make -cpu host the default in SLE, I found
 myself disagreeing to the thought, because it potentially opens a big can
 of worms for potential bugs. But if I already am so opposed to it for SLE, how
 can it possibly be reasonable to default to -cpu host in upstream QEMU? And
 what would a sane default look like?
 
 So I had this idea of looping through all available CPU definitions. We can
 pretty well tell if our host is able to execute any of them by checking the
 respective flags and seeing if our host has all features the CPU definition
 requires. With that, we can create a -cpu type that would fall back to the
 best known CPU definition that our host can fulfill. On my Phenom II
 system for example, that would be -cpu phenom.
 
 With this approach we can test and verify that CPU types actually work at
 any random user setup, because we can always verify that all the -cpu types
 we ship actually work. And we only default to some clever mechanism that
 chooses from one of these.
 
 Signed-off-by: Alexander Graf ag...@suse.de

Ping :)


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 3/5] KVM: Flush TLB in mmu notifier without holding mmu_lock

2012-07-02 Thread Takuya Yoshikawa
On Mon, 02 Jul 2012 15:41:30 +0300
Avi Kivity a...@redhat.com wrote:

 kvm_mmu_slot_remove_write_access: same.  It's hard to continue the loop
 after a lockbreak though.  We can switch it to be rmap based instead.

Switching to rmap based protection was on my queue before, but I wanted
to do that after your work.

Thanks,
Takuya
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 3/5] KVM: Flush TLB in mmu notifier without holding mmu_lock

2012-07-02 Thread Avi Kivity
On 07/02/2012 05:09 PM, Takuya Yoshikawa wrote:
 On Mon, 02 Jul 2012 15:41:30 +0300
 Avi Kivity a...@redhat.com wrote:
 
 kvm_mmu_slot_remove_write_access: same.  It's hard to continue the loop
 after a lockbreak though.  We can switch it to be rmap based instead.
 
 Switching to rmap based protection was on my queue before, but I wanted
 to do that after your work.

It makes sense.  I'll dive back in after we agree on the way to go.

-- 
error compiling committee.c: too many arguments to function


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH v2 1/3] KVM: Add new -cpu best

2012-07-02 Thread Andreas Färber
On 26.06.2012 18:39, Alexander Graf wrote:
 During discussions on whether to make -cpu host the default in SLE, I found

s/make -cpu host the default/support/?

 myself disagreeing to the thought, because it potentially opens a big can
 of worms for potential bugs. But if I already am so opposed to it for SLE, how
 can it possibly be reasonable to default to -cpu host in upstream QEMU? And
 what would a sane default look like?
 
 So I had this idea of looping through all available CPU definitions. We can
 pretty well tell if our host is able to execute any of them by checking the
 respective flags and seeing if our host has all features the CPU definition
 requires. With that, we can create a -cpu type that would fall back to the
 best known CPU definition that our host can fulfill. On my Phenom II
 system for example, that would be -cpu phenom.
 
 With this approach we can test and verify that CPU types actually work at
 any random user setup, because we can always verify that all the -cpu types
 we ship actually work. And we only default to some clever mechanism that
 chooses from one of these.
 
 Signed-off-by: Alexander Graf ag...@suse.de

Despite the long commit message a cover letter would've been nice. ;)

Anything that operates on x86_def_t will obviously need to be refactored
when we agree on the course for x86 CPU subclasses.
But no objection to getting it done some way that works today.

Andreas

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 1/3] KVM: Add new -cpu best

2012-07-02 Thread Avi Kivity
On 06/26/2012 07:39 PM, Alexander Graf wrote:
 During discussions on whether to make -cpu host the default in SLE, I found
 myself disagreeing to the thought, because it potentially opens a big can
 of worms for potential bugs. But if I already am so opposed to it for SLE, how
 can it possibly be reasonable to default to -cpu host in upstream QEMU? And
 what would a sane default look like?
 
 So I had this idea of looping through all available CPU definitions. We can
 pretty well tell if our host is able to execute any of them by checking the
 respective flags and seeing if our host has all features the CPU definition
 requires. With that, we can create a -cpu type that would fall back to the
 best known CPU definition that our host can fulfill. On my Phenom II
 system for example, that would be -cpu phenom.
 
 With this approach we can test and verify that CPU types actually work at
 any random user setup, because we can always verify that all the -cpu types
 we ship actually work. And we only default to some clever mechanism that
 chooses from one of these.
 
  
 +/* Are all guest feature bits present on the host? */
 +static bool cpu_x86_feature_subset(uint32_t host, uint32_t guest)
 +{
 +int i;
 +
 +for (i = 0; i < 32; i++) {
 +uint32_t mask = 1 << i;
 +if ((guest & mask) && !(host & mask)) {
 +return false;
 +}
 +}
 +
 +return true;

return !(guest & ~host);
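
The two forms are equivalent: guest & ~host isolates any bit set in the guest
mask but clear in the host mask, so a non-zero result means the guest needs a
feature the host lacks. A tiny self-contained check, for illustration only:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Loop form, as in the patch. */
static bool subset_loop(uint32_t host, uint32_t guest)
{
    for (int i = 0; i < 32; i++) {
        uint32_t mask = 1u << i;
        if ((guest & mask) && !(host & mask)) {
            return false;
        }
    }
    return true;
}

/* One-liner form suggested in the review. */
static bool subset_mask(uint32_t host, uint32_t guest)
{
    return !(guest & ~host);
}

int main(void)
{
    assert(subset_loop(0xff, 0x0f) == subset_mask(0xff, 0x0f));   /* subset     */
    assert(subset_loop(0x0f, 0xff) == subset_mask(0x0f, 0xff));   /* not subset */
    assert(subset_loop(0x00, 0x00) == subset_mask(0x00, 0x00));   /* empty      */
    return 0;
}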


 +}



 +
 +
 +
 +static void cpu_x86_fill_best(x86_def_t *x86_cpu_def)
 +{
 +x86_def_t *def;
 +
 +x86_cpu_def->family = 0;
 +x86_cpu_def->model = 0;
 +for (def = x86_defs; def; def = def->next) {
 +if (cpu_x86_fits_host(def) && cpu_x86_fits_higher(def, x86_cpu_def)) 
 {
 +memcpy(x86_cpu_def, def, sizeof(*def));
 +}
  *x86_cpu_def = *def;
 +}
 +
 +if (!x86_cpu_def->family && !x86_cpu_def->model) {
 +fprintf(stderr, "No fitting CPU model found!\n");
 +exit(1);
 +}
 +}
 +
  static int unavailable_host_feature(struct model_features_t *f, uint32_t 
 mask)
  {
  int i;
 @@ -878,6 +957,8 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, 
 const char *cpu_model)
  break;
   if (kvm_enabled() && name && strcmp(name, "host") == 0) {
   cpu_x86_fill_host(x86_cpu_def);
  +} else if (kvm_enabled() && name && strcmp(name, "best") == 0) {
  +cpu_x86_fill_best(x86_cpu_def);
  } else if (!def) {
  goto error;
  } else {
 

Should we copy the cache size etc. from the host?


-- 
error compiling committee.c: too many arguments to function


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 2/3] KVM: Use -cpu best as default on x86

2012-07-02 Thread Avi Kivity
On 06/26/2012 07:39 PM, Alexander Graf wrote:
 When running QEMU without -cpu parameter, the user usually wants a sane
 default. So far, we're using the qemu64/qemu32 CPU type, which basically
 means the maximum TCG can emulate.
 
 That's a really good default when using TCG, but when running with KVM
 we much rather want a default saying the maximum performance I can get.
 
 Fortunately we just added an option that gives us the best performance
 while still staying safe on the testability side of things: -cpu best.
 So all we need to do is make -cpu best the default when the user doesn't
 explicitly specify a CPU type.
 
 This fixes a lot of subtile breakage in the GNU toolchain (libgmp) which

subtle

 hicks up on QEMU's non-existent CPU models.
 
 This patch also adds a new pc-1.2 machine type to stay backwards compatible
 with older versions of QEMU.

-- 
error compiling committee.c: too many arguments to function


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm: handle last_boosted_vcpu = 0 case

2012-07-02 Thread Rik van Riel

On 06/28/2012 06:55 PM, Vinod, Chegu wrote:

Hello,

I am just catching up on this email thread...

Perhaps one of you may be able to help answer this query.. preferably along 
with some data.  [BTW, I do understand the basic intent behind PLE in a typical 
[sweet spot] use case where there is over subscription etc. and the need to 
optimize the PLE handler in the host etc. ]

In a use case where the host has fewer but much larger guests (say 40 VCPUs and 
higher) and there is no over subscription (i.e. # of vcpus across guests <= 
physical cpus in the host, and perhaps each guest has its vcpus pinned to 
specific physical cpus for other reasons), I would like to understand if/how the 
PLE really helps?  For these use cases would it be ok to turn PLE off (ple_gap=0) 
since there is no real need to take an exit and find some other VCPU to yield to?


Yes, that should be ok.

On a related note, I wonder if we should increase the ple_gap
significantly.

After all, 4096 cycles of spinning is not that much, when you
consider how much time is spent doing the subsequent vmexit,
scanning the other VCPU's status (200 cycles per cache miss),
deciding what to do, maybe poking another CPU, and eventually
a vmenter.

A factor 4 increase in ple_gap might be what it takes to
get the amount of time spent spinning equal to the amount of
time spent on the host side doing KVM stuff...

--
All rights reversed
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: race between kvm-kmod-3.0 and kvm-kmod-3.3 // was: race condition in qemu-kvm-1.0.1

2012-07-02 Thread Avi Kivity
On 06/28/2012 12:38 PM, Peter Lieven wrote:
 does anyone know whats that here in handle_mmio?
 
 /* hack: Red Hat 7.1 generates these weird accesses. */
 if ((addr > 0xa0000 - 4 && addr <= 0xa0000) && kvm_run->mmio.len == 3)
 return 0;
 

Just what it says.  There is a 4-byte access to address 0x9ffff.  The
first byte lies in RAM, the next three bytes are in mmio.  qemu is
geared to power-of-two accesses even though x86 can generate accesses to
any number of bytes between 1 and 8.

It appears that this has happened with your guest.  It's not impossible
that it's genuine.
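
For illustration of the arithmetic, a small self-contained toy (not QEMU code):
a 4-byte access at 0x9ffff has exactly one byte below the assumed 0xa0000
RAM/mmio boundary and three bytes above it, which is where the len == 3 mmio
request comes from.

#include <stdio.h>

#define VGA_BASE 0xa0000UL   /* toy model: RAM below, mmio from here on */

int main(void)
{
    unsigned long addr = 0x9ffff;   /* the access described above */
    unsigned long len  = 4;

    unsigned long ram_bytes = addr < VGA_BASE ? VGA_BASE - addr : 0;
    if (ram_bytes > len)
        ram_bytes = len;
    unsigned long mmio_bytes = len - ram_bytes;

    printf("access at 0x%lx, len %lu: %lu byte(s) in RAM, %lu in mmio\n",
           addr, len, ram_bytes, mmio_bytes);
    return 0;
}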

-- 
error compiling committee.c: too many arguments to function


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 3/3] kvm: Sanitize KVM_IRQFD flags

2012-07-02 Thread Alex Williamson
On Sun, 2012-07-01 at 11:26 +0300, Michael S. Tsirkin wrote:
 On Fri, Jun 29, 2012 at 09:56:24AM -0600, Alex Williamson wrote:
  We only know of one so far.
  
  Signed-off-by: Alex Williamson alex.william...@redhat.com
 
 BTW should we sanitize padding as well?

I imagine that to make use of a field in the padding we'll set a bit in
the flags to indicate that the field is present/valid.  That way we
don't care about garbage in the padding as long as we can sanitize the
flags.  Thanks,
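
For illustration, a small self-contained sketch of that pattern: reject unknown
flag bits today so that a future flag can safely gate a field that currently
lives in the padding. The struct layout and flag names below are illustrative
stand-ins, not the real uapi definitions.

#include <errno.h>
#include <stdint.h>
#include <string.h>

/* Illustrative stand-in for the ioctl argument (not the real struct kvm_irqfd). */
struct my_irqfd {
    uint32_t fd;
    uint32_t gsi;
    uint32_t flags;
    uint8_t  pad[20];
};

#define MY_IRQFD_FLAG_DEASSIGN (1u << 0)
/* A future extension would add e.g. MY_IRQFD_FLAG_LEVEL (1u << 1) and only
 * then start interpreting bytes of pad[]. */

static int my_irqfd_ioctl(const struct my_irqfd *args)
{
    if (args->flags & ~MY_IRQFD_FLAG_DEASSIGN)
        return -EINVAL;          /* unknown flags rejected, so they stay reserved */

    if (args->flags & MY_IRQFD_FLAG_DEASSIGN)
        return 0;                /* deassign path */

    return 0;                    /* assign path; pad[] is ignored today */
}

int main(void)
{
    struct my_irqfd args;
    memset(&args, 0, sizeof(args));
    args.flags = 1u << 5;                      /* garbage flag bit */
    return my_irqfd_ioctl(&args) == -EINVAL ? 0 : 1;
}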

Alex

  ---
  
   virt/kvm/eventfd.c |3 +++
   1 file changed, 3 insertions(+)
  
  diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
  index c307c24..7d7e2aa 100644
  --- a/virt/kvm/eventfd.c
  +++ b/virt/kvm/eventfd.c
  @@ -340,6 +340,9 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd 
  *args)
   int
   kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
   {
   +   if (args->flags & ~KVM_IRQFD_FLAG_DEASSIGN)
   +   return -EINVAL;
   +
   if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
  return kvm_irqfd_deassign(kvm, args);
   
  
  --
  To unsubscribe from this list: send the line unsubscribe linux-kernel in
  the body of a message to majord...@vger.kernel.org
  More majordomo info at  http://vger.kernel.org/majordomo-info.html
  Please read the FAQ at  http://www.tux.org/lkml/



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 0/3] kvm: KVM_IRQFD cleanup, docs, sanitize flags

2012-07-02 Thread Alex Williamson
On Fri, 2012-06-29 at 09:56 -0600, Alex Williamson wrote:
 Before we start fiddling with what we can and can't add to KVM_IRQFD
 we need to figure out if anyone has been sloppy in their use of the
 ioctl flags.  This series has a minor cleanup to pass the struct
 kvm_irqfd to seup functions rather than individual parameters, making
 it more consistent with ioeventfd, adds API documentation for this
 ioctl, and sanitizes the flags.  If anyone screams, we may have to
 revert this last patch.  Thanks,

Avi, what do you think about trying to get this in for 3.5 to test
whether we break anyone?  Then we can aim for 3.6 for level irqfd 
eoifd support.  Thanks,

Alex

 ---
 
 Alex Williamson (3):
   kvm: Sanitize KVM_IRQFD flags
   kvm: Add missing KVM_IRQFD API documentation
   kvm: Pass kvm_irqfd to functions
 
 
  Documentation/virtual/kvm/api.txt |   16 
  include/linux/kvm_host.h  |4 ++--
  virt/kvm/eventfd.c|   23 +--
  virt/kvm/kvm_main.c   |2 +-
  4 files changed, 32 insertions(+), 13 deletions(-)



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 0/3] kvm: KVM_IRQFD cleanup, docs, sanitize flags

2012-07-02 Thread Avi Kivity
On 07/02/2012 06:51 PM, Alex Williamson wrote:
 On Fri, 2012-06-29 at 09:56 -0600, Alex Williamson wrote:
 Before we start fiddling with what we can and can't add to KVM_IRQFD
 we need to figure out if anyone has been sloppy in their use of the
 ioctl flags.  This series has a minor cleanup to pass the struct
 kvm_irqfd to seup functions rather than individual parameters, making
 it more consistent with ioeventfd, adds API documentation for this
 ioctl, and sanitizes the flags.  If anyone screams, we may have to
 revert this last patch.  Thanks,
 
 Avi, what do you think about trying to get this in for 3.5 to test
 whether we break anyone?  Then we can aim for 3.6 for level irqfd &
 eoifd support.

Makes sense.  Marcelo?


-- 
error compiling committee.c: too many arguments to function


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: race between kvm-kmod-3.0 and kvm-kmod-3.3 // was: race condition in qemu-kvm-1.0.1

2012-07-02 Thread Peter Lieven

On 02.07.2012 17:05, Avi Kivity wrote:

On 06/28/2012 12:38 PM, Peter Lieven wrote:

does anyone know whats that here in handle_mmio?

 /* hack: Red Hat 7.1 generates these weird accesses. */
 if ((addr > 0xa0000 - 4 && addr <= 0xa0000) && kvm_run->mmio.len == 3)
 return 0;


Just what it says.  There is a 4-byte access to address 0x9ffff.  The
first byte lies in RAM, the next three bytes are in mmio.  qemu is
geared to power-of-two accesses even though x86 can generate accesses to
any number of bytes between 1 and 8.

I just stumbled across the word "hack" in the comment. When the race
occurs the CPU is basically reading from 0xa0000 in an endless loop.

It appears that this has happened with your guest.  It's not impossible
that it's genuine.


I had a lot to do over the last few days, but I updated our build environment to
Ubuntu LTS 12.04 64-bit Server, which is based on Linux 3.2.0. I still
see the issue. If I use the kvm Module provided with the kernel it is
working correctly. If I use kvm-kmod-3.4 with qemu-kvm-1.0.1 (both
from sourceforge) I can reproduce the race condition.

I will keep you posted when I have more evidence.

Thanks,
Peter
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: RFD: virtio balloon API use (was Re: [PATCH 5 of 5] virtio: expose added descriptors immediately)

2012-07-02 Thread Rafael Aquini
On Mon, Jul 02, 2012 at 10:25:58AM +0300, Michael S. Tsirkin wrote:
 On Mon, Jul 02, 2012 at 10:35:47AM +0930, Rusty Russell wrote:
  On Sun, 1 Jul 2012 12:20:51 +0300, Michael S. Tsirkin m...@redhat.com 
  wrote:
   On Thu, Nov 03, 2011 at 06:12:53PM +1030, Rusty Russell wrote:
A virtio driver does virtqueue_add_buf() multiple times before finally
calling virtqueue_kick(); previously we only exposed the added buffers
in the virtqueue_kick() call.  This means we don't need a memory
barrier in virtqueue_add_buf(), but it reduces concurrency as the
device (ie. host) can't see the buffers until the kick.

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
   
   Looking at recent mm compaction patches made me look at locking
   in balloon closely. And I noticed the referenced patch (commit
   ee7cd8981e15bcb365fc762afe3fc47b8242f630 upstream) interacts strangely
   with virtio balloon; balloon currently does:
   
   static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
   {
   struct scatterlist sg;
   
    sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
   
    init_completion(&vb->acked);
   
   /* We should always be able to add one buffer to an empty queue. 
   */
    if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0)
   BUG();
   virtqueue_kick(vq);
   
   /* When host has read buffer, this completes via balloon_ack */
    wait_for_completion(&vb->acked);
   }
   
   
   While vq callback does:
   
   static void balloon_ack(struct virtqueue *vq)
   {
   struct virtio_balloon *vb;
   unsigned int len;
   
    vb = virtqueue_get_buf(vq, &len);
    if (vb)
    complete(&vb->acked);
   }
   
   
   So virtqueue_get_buf might now run concurrently with virtqueue_kick.
   I audited both and this seems safe in practice but I think
  
  Good spotting!
  
  Agreed.  Because there's only add_buf, we get away with it: the add_buf
  must be almost finished by the time get_buf runs because the device has
  seen the buffer.
  
   we need to either declare this legal at the API level
   or add locking in driver.
  
  I wonder if we should just lock in the balloon driver, rather than
  document this corner case and set a bad example.
 
 We'll need to replace vb-acked with a waitqueue
 and do get_buf from the same thread.
  But I note that stats_request has the same issue.
 Let's see if we can fix it.
 
  Are there other
  drivers which take the same shortcut?
 
 Not that I know.
 
   Further, is there a guarantee that we never get
   spurious callbacks?  We currently check ring not empty
   but esp for non shared MSI this might not be needed.
  
  Yes, I think this saves us.  A spurious interrupt won't trigger
  a spurious callback.
  
   If a spurious callback triggers, virtqueue_get_buf can run
   concurrently with virtqueue_add_buf which is known to be racy.
   Again I think this is currently safe as no spurious callbacks in
   practice but should we guarantee no spurious callbacks at the API level
   or add locking in driver?
  
  I think we should guarantee it, but is there a hole in the current
  implementation?
  
  Thanks,
  Rusty.
 
 Could be. The check for ring empty looks somewhat suspicious.
 It might be expensive to make it 100% robust - that check was
 intended as an optimization for shared interrupts.
 Whith per vq interrupts we IMO do not need the check.
 If we add locking in balloon I think there's no need
 to guarantee no spurious interrupts.
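 
 For illustration, one possible shape of the "add locking in the driver" option
 discussed above: a lock shared by tell_host() and the callback, so that
 virtqueue_kick() and virtqueue_get_buf() can no longer run concurrently. The
 vq_lock field, the GFP_ATOMIC switch, and the exact call sites are assumptions
 made for this sketch, not the eventual upstream fix.
 
 /* Hypothetical change, for illustration only -- not the actual driver patch. */
 struct virtio_balloon {
 	/* ...existing fields... */
 	spinlock_t vq_lock;		/* assumed new field: serializes vq access */
 	struct completion acked;
 };
 
 static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 {
 	struct scatterlist sg;
 	unsigned long flags;
 
 	sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
 	init_completion(&vb->acked);
 
 	spin_lock_irqsave(&vb->vq_lock, flags);
 	/* GFP_ATOMIC because we now hold a spinlock across the add */
 	if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_ATOMIC) < 0)
 		BUG();
 	virtqueue_kick(vq);
 	spin_unlock_irqrestore(&vb->vq_lock, flags);
 
 	/* When host has read buffer, this completes via balloon_ack */
 	wait_for_completion(&vb->acked);
 }
 
 static void balloon_ack(struct virtqueue *vq)
 {
 	struct virtio_balloon *vb = vq->vdev->priv;
 	unsigned long flags;
 	unsigned int len;
 	void *buf;
 
 	spin_lock_irqsave(&vb->vq_lock, flags);
 	buf = virtqueue_get_buf(vq, &len);
 	spin_unlock_irqrestore(&vb->vq_lock, flags);
 	if (buf)
 		complete(&vb->acked);
 }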
 

As for 'locking in balloon', may I assume the approach I took for the compaction
case is OK and addresses these concerns of yours? If not, please do not hesitate
to share your thoughts. I'm respinning a V3 series to address a couple of extra
nitpicks from the compaction standpoint, and I'd love to also be able to
address any extra concern you might have on the balloon side of that work.


Thanks!
Rafael.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv3 RFC 0/2] kvm: direct msix injection

2012-07-02 Thread Alex Williamson
On Mon, 2012-06-25 at 11:32 +0200, Jan Kiszka wrote:
 On 2012-06-11 13:19, Michael S. Tsirkin wrote:
  We can deliver certain interrupts, notably MSIX,
  from atomic context.
  Here's an untested patch to do this (compiled only).
  
  Changes from v2:
  Don't inject broadcast interrupts directly
  Changes from v1:
  Tried to address comments from v1, except unifying
  with kvm_set_irq: passing flags to it looks too ugly.
  Added a comment.
  
  Jan, you said you can test this?
  
  
  Michael S. Tsirkin (2):
kvm: implement kvm_set_msi_inatomic
kvm: deliver msix interrupts from irq handler
  
   include/linux/kvm_host.h |  3 ++
   virt/kvm/assigned-dev.c  | 31 ++--
   virt/kvm/irq_comm.c  | 75 
  
   3 files changed, 102 insertions(+), 7 deletions(-)
  
 
 Finally-tested-by: Jan Kiszka jan.kis...@siemens.com

Michael, we need either this or the simple oneshot patch to get device
assignment working again for 3.5.  Are you planning to push this for
3.5?  Thanks,

Alex



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v11 8/8] PPC: Don't use hardcoded opcode for ePAPR hcall invocation

2012-07-02 Thread Alexander Graf

On 02.07.2012, at 19:10, Scott Wood wrote:

 On 07/02/2012 07:30 AM, Alexander Graf wrote:
 
 On 22.06.2012, at 22:06, Stuart Yoder wrote:
 
 From: Liu Yu-B13201 yu@freescale.com
 
 Signed-off-by: Liu Yu yu@freescale.com
 Signed-off-by: Stuart Yoder stuart.yo...@freescale.com
 ---
 -v11: no changes
 
 arch/powerpc/include/asm/epapr_hcalls.h |   22 +-
 arch/powerpc/include/asm/fsl_hcalls.h   |   36 
 +++---
 2 files changed, 29 insertions(+), 29 deletions(-)
 
 diff --git a/arch/powerpc/include/asm/epapr_hcalls.h 
 b/arch/powerpc/include/asm/epapr_hcalls.h
 index 833ce2c..b8d9445 100644
 --- a/arch/powerpc/include/asm/epapr_hcalls.h
 +++ b/arch/powerpc/include/asm/epapr_hcalls.h
 @@ -195,7 +195,7 @@ static inline unsigned int ev_int_set_config(unsigned 
 int interrupt,
 r5  = priority;
 r6  = destination;
 
 -   __asm__ __volatile__ ("sc 1"
 +   asm volatile("bl epapr_hypercall_start"
 : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6)
 : : EV_HCALL_CLOBBERS4
 
 Hrm. ePAPR hypercalls are allowed to clobber lr, right? But our hypercall 
 entry code depends on lr staying alive:
 
 ePAPR 1.1 says LR is nonvolatile.

Why is it in the clobber list then? Not complaining - we need to have it in 
there for the bl to work out - but still surprised.

arch/powerpc/include/asm/epapr_hcalls.h:#define EV_HCALL_CLOBBERS "r0", "r12", 
"xer", "ctr", "lr", "cc", "memory"


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v11 8/8] PPC: Don't use hardcoded opcode for ePAPR hcall invocation

2012-07-02 Thread Scott Wood
On 07/02/2012 12:13 PM, Alexander Graf wrote:
 
 On 02.07.2012, at 19:10, Scott Wood wrote:
 
 On 07/02/2012 07:30 AM, Alexander Graf wrote:

 On 22.06.2012, at 22:06, Stuart Yoder wrote:

 From: Liu Yu-B13201 yu@freescale.com

 Signed-off-by: Liu Yu yu@freescale.com
 Signed-off-by: Stuart Yoder stuart.yo...@freescale.com
 ---
 -v11: no changes

 arch/powerpc/include/asm/epapr_hcalls.h |   22 +-
 arch/powerpc/include/asm/fsl_hcalls.h   |   36 
 +++---
 2 files changed, 29 insertions(+), 29 deletions(-)

 diff --git a/arch/powerpc/include/asm/epapr_hcalls.h 
 b/arch/powerpc/include/asm/epapr_hcalls.h
 index 833ce2c..b8d9445 100644
 --- a/arch/powerpc/include/asm/epapr_hcalls.h
 +++ b/arch/powerpc/include/asm/epapr_hcalls.h
 @@ -195,7 +195,7 @@ static inline unsigned int ev_int_set_config(unsigned 
 int interrupt,
r5  = priority;
r6  = destination;

 -  __asm__ __volatile__ ("sc 1"
 +  asm volatile("bl epapr_hypercall_start"
    : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6)
: : EV_HCALL_CLOBBERS4

 Hrm. ePAPR hypercalls are allowed to clobber lr, right? But our hypercall 
 entry code depends on lr staying alive:

 ePAPR 1.1 says LR is nonvolatile.
 
 Why is it in the clobber list then? 

Because the inline assembly code is clobbering it -- not the hv-provided
hcall instructions.

-Scott

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v11 8/8] PPC: Don't use hardcoded opcode for ePAPR hcall invocation

2012-07-02 Thread Alexander Graf

On 02.07.2012, at 19:16, Scott Wood wrote:

 On 07/02/2012 12:13 PM, Alexander Graf wrote:
 
 On 02.07.2012, at 19:10, Scott Wood wrote:
 
 On 07/02/2012 07:30 AM, Alexander Graf wrote:
 
 On 22.06.2012, at 22:06, Stuart Yoder wrote:
 
 From: Liu Yu-B13201 yu@freescale.com
 
 Signed-off-by: Liu Yu yu@freescale.com
 Signed-off-by: Stuart Yoder stuart.yo...@freescale.com
 ---
 -v11: no changes
 
 arch/powerpc/include/asm/epapr_hcalls.h |   22 +-
 arch/powerpc/include/asm/fsl_hcalls.h   |   36 
 +++---
 2 files changed, 29 insertions(+), 29 deletions(-)
 
 diff --git a/arch/powerpc/include/asm/epapr_hcalls.h 
 b/arch/powerpc/include/asm/epapr_hcalls.h
 index 833ce2c..b8d9445 100644
 --- a/arch/powerpc/include/asm/epapr_hcalls.h
 +++ b/arch/powerpc/include/asm/epapr_hcalls.h
 @@ -195,7 +195,7 @@ static inline unsigned int ev_int_set_config(unsigned 
 int interrupt,
   r5  = priority;
   r6  = destination;
 
 - __asm__ __volatile__ ("sc 1"
 + asm volatile("bl epapr_hypercall_start"
   : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6)
   : : EV_HCALL_CLOBBERS4
 
 Hrm. ePAPR hypercalls are allowed to clobber lr, right? But our hypercall 
 entry code depends on lr staying alive:
 
 ePAPR 1.1 says LR is nonvolatile.
 
 Why is it in the clobber list then? 
 
 Because the inline assembly code is clobbering it -- not the hv-provided
 hcall instructions.

Only after the change, not before it.


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v11 8/8] PPC: Don't use hardcoded opcode for ePAPR hcall invocation

2012-07-02 Thread Scott Wood
On 07/02/2012 12:17 PM, Alexander Graf wrote:
 
 On 02.07.2012, at 19:16, Scott Wood wrote:
 
 On 07/02/2012 12:13 PM, Alexander Graf wrote:

 On 02.07.2012, at 19:10, Scott Wood wrote:

 On 07/02/2012 07:30 AM, Alexander Graf wrote:

 On 22.06.2012, at 22:06, Stuart Yoder wrote:

 From: Liu Yu-B13201 yu@freescale.com

 Signed-off-by: Liu Yu yu@freescale.com
 Signed-off-by: Stuart Yoder stuart.yo...@freescale.com
 ---
 -v11: no changes

 arch/powerpc/include/asm/epapr_hcalls.h |   22 +-
 arch/powerpc/include/asm/fsl_hcalls.h   |   36 
 +++---
 2 files changed, 29 insertions(+), 29 deletions(-)

 diff --git a/arch/powerpc/include/asm/epapr_hcalls.h 
 b/arch/powerpc/include/asm/epapr_hcalls.h
 index 833ce2c..b8d9445 100644
 --- a/arch/powerpc/include/asm/epapr_hcalls.h
 +++ b/arch/powerpc/include/asm/epapr_hcalls.h
 @@ -195,7 +195,7 @@ static inline unsigned int 
 ev_int_set_config(unsigned int interrupt,
  r5  = priority;
  r6  = destination;

 -__asm__ __volatile__ ("sc 1"
 +asm volatile("bl epapr_hypercall_start"
  : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6)
  : : EV_HCALL_CLOBBERS4

 Hrm. ePAPR hypercalls are allowed to clobber lr, right? But our hypercall 
 entry code depends on lr staying alive:

 ePAPR 1.1 says LR is nonvolatile.

 Why is it in the clobber list then? 

 Because the inline assembly code is clobbering it -- not the hv-provided
 hcall instructions.
 
 Only after the change, not before it.

Hmm.  The comment says, XER, CTR, and LR are currently listed as
clobbers because it's uncertain whether they will be clobbered.  Maybe
it dates back to when the ABI was still being discussed?  Timur, do you
recall?

In any case, LR is nonvolatile in the spec and in the implementations I
know about (KVM and Topaz).

-Scott

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v11 8/8] PPC: Don't use hardcoded opcode for ePAPR hcall invocation

2012-07-02 Thread Scott Wood
On 07/02/2012 07:30 AM, Alexander Graf wrote:
 
 On 22.06.2012, at 22:06, Stuart Yoder wrote:
 
 From: Liu Yu-B13201 yu@freescale.com

 Signed-off-by: Liu Yu yu@freescale.com
 Signed-off-by: Stuart Yoder stuart.yo...@freescale.com
 ---
 -v11: no changes

 arch/powerpc/include/asm/epapr_hcalls.h |   22 +-
 arch/powerpc/include/asm/fsl_hcalls.h   |   36 +++---
 2 files changed, 29 insertions(+), 29 deletions(-)

 diff --git a/arch/powerpc/include/asm/epapr_hcalls.h 
 b/arch/powerpc/include/asm/epapr_hcalls.h
 index 833ce2c..b8d9445 100644
 --- a/arch/powerpc/include/asm/epapr_hcalls.h
 +++ b/arch/powerpc/include/asm/epapr_hcalls.h
 @@ -195,7 +195,7 @@ static inline unsigned int ev_int_set_config(unsigned 
 int interrupt,
  r5  = priority;
  r6  = destination;

 -__asm__ __volatile__ ("sc 1"
 +asm volatile("bl epapr_hypercall_start"
  : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6)
  : : EV_HCALL_CLOBBERS4
 
 Hrm. ePAPR hypercalls are allowed to clobber lr, right? But our hypercall 
 entry code depends on lr staying alive:

ePAPR 1.1 says LR is nonvolatile.

-Scott

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] KVM call agenda for Tuesday, July 3rd

2012-07-02 Thread Eric Blake
On 07/02/2012 04:16 AM, Juan Quintela wrote:
 
 Hi
 
 Please send in any agenda items you are interested in covering.

Can we discuss the future of 'getfd', the possibility of 'pass-fd', or
even the enhancement of all existing monitor commands to take an
optional 'nfds' JSON argument for atomic management of fd passing?
Which commands need to reopen a file with different access, and do we
bite the bullet to special case all of those commands to allow fd
passing or can we make qemu_open() coupled with high-level fd passing
generic enough to satisfy all of our reopen needs?

-- 
Eric Blake   ebl...@redhat.com+1-919-301-3266
Libvirt virtualization library http://libvirt.org







Re: [PATCH v11 8/8] PPC: Don't use hardcoded opcode for ePAPR hcall invocation

2012-07-02 Thread Timur Tabi
Scott Wood wrote:

 Hmm.  The comment says, XER, CTR, and LR are currently listed as
 clobbers because it's uncertain whether they will be clobbered.  Maybe
 it dates back to when the ABI was still being discussed?  Timur, do you
 recall?

Nope, sorry.  I'm sure we discussed this and looked at the code.  My guess
is that we weren't certain what the compiler was going to do at the time.

I'm still a little confused.  Which inline assembly code is clobbering LR?
 Are you talking about the BL instruction, which wasn't there before?

-- 
Timur Tabi
Linux kernel developer at Freescale



Re: [PATCH v11 8/8] PPC: Don't use hardcoded opcode for ePAPR hcall invocation

2012-07-02 Thread Scott Wood
On 07/02/2012 12:34 PM, Timur Tabi wrote:
 Scott Wood wrote:
 
 Hmm.  The comment says, XER, CTR, and LR are currently listed as
 clobbers because it's uncertain whether they will be clobbered.  Maybe
 it dates back to when the ABI was still being discussed?  Timur, do you
 recall?
 
 Nope, sorry.  I'm sure we discussed this and looked at the code.  My guess
 is that we weren't certain what the compiler was going to do at the time.

I'm not sure how the compiler is involved here -- we're just telling it
what our asm code (and the asm code in the hypervisor) will do.

  I'm still a little confused.  Which inline assembly code is clobbering LR?
  Are you talking about the BL instruction, which wasn't there before?

Yes, I didn't realize that LR had been in the clobber list before that.

-Scott



Re: [PATCH v11 8/8] PPC: Don't use hardcoded opcode for ePAPR hcall invocation

2012-07-02 Thread Timur Tabi
Scott Wood wrote:
   I'm still a little confused.  Which inline assembly code is clobbering LR?
   Are you talking about the BL instruction, which wasn't there before?

 Yes, I didn't realize that LR had been in the clobber list before that.

So are you saying that it was wrong before, but it's correct now?

-- 
Timur Tabi
Linux kernel developer at Freescale



Re: [RFC PATCH] Expose tsc deadline timer feature to guest

2012-07-02 Thread Eduardo Habkost
On Mon, Jul 02, 2012 at 11:08:14AM +, Liu, Jinsong wrote:
 Eduardo, Jan, Andreas
 
 As we sync 3 months ago, I wait until qemu1.1 done, then re-write patch based 
 on qemu1.1.
 Now it's time to re-write my patch based on qemu1.1.
 
 Attached is a RFC patch for exposing tsc deadline timer to guest.
 I have checked current qemu1.1 code, and read some emails regarding to cpuid 
 exposing these days.
 However, I think I may ignore something (so many discussion :-), so if you 
 think anything wrong, please point out to me.

Looks good, considering that even if we change the GET_SUPPORTED_CPUID
behavior eventually, there are existing kernels that use
KVM_CAP_TSC_DEADLINE_TIMER to report TSC-deadline support. So:

Reviewed-by: Eduardo Habkost ehabk...@redhat.com
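
For reference, the capability check the patch performs through
kvm_check_extension() boils down to a single KVM_CHECK_EXTENSION ioctl
on /dev/kvm; a minimal userspace sketch (illustrative only, not QEMU
code):

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Illustrative check; QEMU's kvm_check_extension() issues the same
     * ioctl on its already-open /dev/kvm fd. */
    static int host_has_tsc_deadline(void)
    {
        int kvm_fd = open("/dev/kvm", O_RDWR);
        int ret;

        if (kvm_fd < 0) {
            return 0;
        }
        ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_TSC_DEADLINE_TIMER);
        close(kvm_fd);
        return ret > 0;
    }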


 
 Thanks,
 Jinsong
 
 
 From 8b5b003f6f8834d2d5d71e18bb47b7f089bc4928 Mon Sep 17 00:00:00 2001
 From: Liu, Jinsong jinsong@intel.com
 Date: Tue, 3 Jul 2012 02:35:10 +0800
 Subject: [PATCH] Expose tsc deadline timer feature to guest
 
 This patch exposes tsc deadline timer feature to guest if
 1). in-kernel irqchip is used, and
 2). kvm has emulated tsc deadline timer, and
 3). user authorize the feature exposing via -cpu or +/- tsc-deadline
 
 Signed-off-by: Liu, Jinsong jinsong@intel.com
 ---
  target-i386/cpu.h |1 +
  target-i386/kvm.c |5 +
  2 files changed, 6 insertions(+), 0 deletions(-)
 
 diff --git a/target-i386/cpu.h b/target-i386/cpu.h
 index 79cc640..d1a4a04 100644
 --- a/target-i386/cpu.h
 +++ b/target-i386/cpu.h
 @@ -400,6 +400,7 @@
  #define CPUID_EXT_X2APIC   (1 << 21)
  #define CPUID_EXT_MOVBE    (1 << 22)
  #define CPUID_EXT_POPCNT   (1 << 23)
 +#define CPUID_EXT_TSC_DEADLINE_TIMER (1 << 24)
  #define CPUID_EXT_XSAVE    (1 << 26)
  #define CPUID_EXT_OSXSAVE  (1 << 27)
  #define CPUID_EXT_HYPERVISOR  (1 << 31)
 diff --git a/target-i386/kvm.c b/target-i386/kvm.c
 index 0d0d8f6..52b577f 100644
 --- a/target-i386/kvm.c
 +++ b/target-i386/kvm.c
 @@ -361,8 +361,13 @@ int kvm_arch_init_vcpu(CPUX86State *env)
  env->cpuid_features &= kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
 
  i = env->cpuid_ext_features & CPUID_EXT_HYPERVISOR;
 +j = env->cpuid_ext_features & CPUID_EXT_TSC_DEADLINE_TIMER;
  env->cpuid_ext_features &= kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX);
  env->cpuid_ext_features |= i;
 +if (j && kvm_irqchip_in_kernel() &&
 +    kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
 +    env->cpuid_ext_features |= CPUID_EXT_TSC_DEADLINE_TIMER;
 +}
 
  env->cpuid_ext2_features &= kvm_arch_get_supported_cpuid(s, 0x80000001,
                                                            0, R_EDX);
 -- 
 1.7.1



-- 
Eduardo


Re: [PATCH v11 8/8] PPC: Don't use hardcoded opcode for ePAPR hcall invocation

2012-07-02 Thread Scott Wood
On 07/02/2012 12:53 PM, Timur Tabi wrote:
 Scott Wood wrote:
  I'm still a little confused.  Which inline assembly code is clobbering LR?
  Are you talking about the BL instruction, which wasn't there before?
 
 Yes, I didn't realize that LR had been in the clobber list before that.
 
 So are you saying that it was wrong before, but it's correct now?

Not really *wrong* before, but unnecessary.

-Scott



Re: [PATCH v11 8/8] PPC: Don't use hardcoded opcode for ePAPR hcall invocation

2012-07-02 Thread Timur Tabi
Scott Wood wrote:
  So are you saying that it was wrong before, but it's correct now?

 Not really *wrong* before, but unnecessary.

In that case, my code was really just ahead of its time. :-)

-- 
Timur Tabi
Linux kernel developer at Freescale



[PATCH 1/6] file_ram_alloc(): coding style fixes

2012-07-02 Thread Eduardo Habkost
Cc: Blue Swirl blauwir...@gmail.com
Signed-off-by: Eduardo Habkost ehabk...@redhat.com
---
 exec.c |5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/exec.c b/exec.c
index 8244d54..c8bfd27 100644
--- a/exec.c
+++ b/exec.c
@@ -2392,7 +2392,7 @@ static void *file_ram_alloc(RAMBlock *block,
 unlink(filename);
 free(filename);
 
-memory = (memory+hpagesize-1) & ~(hpagesize-1);
+memory = (memory + hpagesize - 1) & ~(hpagesize - 1);
 
 /*
  * ftruncate is not supported by hugetlbfs in older
@@ -2400,8 +2400,9 @@ static void *file_ram_alloc(RAMBlock *block,
  * If anything goes wrong with it under other filesystems,
  * mmap will fail.
  */
-if (ftruncate(fd, memory))
+if (ftruncate(fd, memory)) {
 perror(ftruncate);
+}
 
 #ifdef MAP_POPULATE
 /* NB: MAP_POPULATE won't exhaustively alloc all phys pages in the case
-- 
1.7.10.4



[RFC PATCH 6/6] add -keep-mem-path-files option (v2)

2012-07-02 Thread Eduardo Habkost
This makes QEMU create files inside the -mem-path directory using
more predictable names, and not remove them afterwards.

This allows (for example) users or management layers to run numactl
later, to set the NUMA policy for the guest RAM.

Changes v1 -> v2:
 - Fix trailing space issue
 - Coding style changes
 - Do not initialize static variable to false
 - Use g_strdup_printf() instead of asprintf()

Signed-off-by: Eduardo Habkost ehabk...@redhat.com
---
 cpu-all.h   |1 +
 exec.c  |   26 +-
 qemu-options.hx |   12 
 vl.c|5 +
 4 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/cpu-all.h b/cpu-all.h
index 2beed5a..acd59ff 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -490,6 +490,7 @@ typedef struct RAMList {
 extern RAMList ram_list;
 
 extern const char *mem_path;
+extern bool keep_mem_path_files;
 extern bool mem_prealloc;
 
 /* Flags stored in the low bits of the TLB virtual address.  These are
diff --git a/exec.c b/exec.c
index 456ac73..2525a65 100644
--- a/exec.c
+++ b/exec.c
@@ -2377,6 +2377,25 @@ static int get_temp_fd(const char *path)
 return fd;
 }
 
+/* Return FD to RAM block file, using the memory region name as filename
+ */
+static int open_ramblock_file(RAMBlock *block, const char *path)
+{
+int fd;
+gchar *filename;
+
+filename = g_strdup_printf("%s/%s", path, block->mr->name);
+fd = open(filename, O_RDWR|O_CREAT);
+if (fd < 0) {
+perror("unable to open backing store for hugepages");
+g_free(filename);
+return -1;
+}
+g_free(filename);
+
+return fd;
+}
+
 static void *file_ram_alloc(RAMBlock *block,
 size_t length,
 const char *path)
@@ -2402,7 +2421,12 @@ static void *file_ram_alloc(RAMBlock *block,
 return NULL;
 }
 
-fd = get_temp_fd(path);
+if (keep_mem_path_files) {
+fd = open_ramblock_file(block, path);
+} else {
+fd = get_temp_fd(path);
+}
+
 if (fd < 0) {
 return NULL;
 }
diff --git a/qemu-options.hx b/qemu-options.hx
index 8b66264..f2eb237 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -399,6 +399,18 @@ STEXI
 Allocate guest RAM from a temporarily created file in @var{path}.
 ETEXI
 
+DEF("keep-mem-path-files", HAS_ARG, QEMU_OPTION_keep_mempath_files,
+    "-keep-mem-path-files  Keep files created in -mem-path\n", QEMU_ARCH_ALL)
+STEXI
+@item -keep-mem-path-files
+Create the files for -mem-path using the memory region names, and don't remove
+them afterwards.
+
+This allows further fine-tuning of NUMA policy for memory regions using
+numactl.
+ETEXI
+
+
 #ifdef MAP_POPULATE
 DEF("mem-prealloc", 0, QEMU_OPTION_mem_prealloc,
     "-mem-prealloc   preallocate guest memory (use with -mem-path)\n",
diff --git a/vl.c b/vl.c
index 5c80189..9f18d53 100644
--- a/vl.c
+++ b/vl.c
@@ -177,6 +177,8 @@ int display_remote = 0;
 const char* keyboard_layout = NULL;
 ram_addr_t ram_size;
 const char *mem_path = NULL;
+bool keep_mem_path_files; /* Keep files created at mem_path.
+   * use memory region names as filename */
 #ifdef MAP_POPULATE
 bool mem_prealloc; /* force preallocation of physical target memory */
 #endif
@@ -2671,6 +2673,9 @@ int main(int argc, char **argv, char **envp)
 case QEMU_OPTION_mempath:
 mem_path = optarg;
 break;
+case QEMU_OPTION_keep_mempath_files:
+keep_mem_path_files = true;
+break;
 #ifdef MAP_POPULATE
 case QEMU_OPTION_mem_prealloc:
 mem_prealloc = true;
-- 
1.7.10.4



[PATCH 2/6] file_ram_alloc(): use g_strdup_printf() instead of asprintf()

2012-07-02 Thread Eduardo Habkost
Cc: Blue Swirl blauwir...@gmail.com
Signed-off-by: Eduardo Habkost ehabk...@redhat.com
---
 exec.c |   14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/exec.c b/exec.c
index c8bfd27..d856325 100644
--- a/exec.c
+++ b/exec.c
@@ -24,6 +24,9 @@
 #include <sys/mman.h>
 #endif
 
+#include <glib.h>
+#include <glib/gprintf.h>
+
 #include "qemu-common.h"
 #include "cpu.h"
 #include "tcg.h"
@@ -2357,7 +2360,7 @@ static void *file_ram_alloc(RAMBlock *block,
 ram_addr_t memory,
 const char *path)
 {
-char *filename;
+gchar *filename;
 void *area;
 int fd;
 #ifdef MAP_POPULATE
@@ -2379,18 +2382,15 @@ static void *file_ram_alloc(RAMBlock *block,
 return NULL;
 }
 
-if (asprintf(&filename, "%s/qemu_back_mem.XXXXXX", path) == -1) {
-return NULL;
-}
-
+filename = g_strdup_printf("%s/qemu_back_mem.XXXXXX", path);
 fd = mkstemp(filename);
 if (fd < 0) {
 perror("unable to create backing store for hugepages");
-free(filename);
+g_free(filename);
 return NULL;
 }
 unlink(filename);
-free(filename);
+g_free(filename);
 
 memory = (memory + hpagesize - 1)  ~(hpagesize - 1);
 
-- 
1.7.10.4



[RFC PATCH 0/6] option to not remove files inside -mem-path dir (v2)

2012-07-02 Thread Eduardo Habkost
Resending the series after fixing some coding style issues. Does anybody have
any feedback about this proposal?

Changes v1 -> v2:
 - Coding style fixes

Original cover letter:

I was investigating if there are any mechanisms that allow manual pinning of
guest RAM to specific host NUMA nodes, in the case of multi-node KVM guests, and
noticed that -mem-path could be used for that, except that it currently removes
any files it creates (using mkstemp()) immediately, not allowing numactl to be
used on the backing files, as a result. This patch series adds a
-keep-mem-path-files option to make QEMU create the files inside -mem-path with
more predictable names, and not remove them after creation.

Some previous discussions about the subject, for reference:
 - Message-ID: 1281534738-8310-1-git-send-email-andre.przyw...@amd.com
   http://article.gmane.org/gmane.comp.emulators.kvm.devel/57684
 - Message-ID: 4c7d7c2a.7000...@codemonkey.ws
   http://article.gmane.org/gmane.comp.emulators.kvm.devel/58835

A more recent thread can be found at:
 - Message-ID: 20111029184502.gh11...@in.ibm.com
   http://article.gmane.org/gmane.comp.emulators.qemu/123001

Note that this is just a mechanism to facilitate manual static binding using
numactl on hugetlbfs later, for optimization. This may be especially useful for
single large multi-node guests use-cases (and, of course, has to be used with
care).

I don't know if it is a good idea to use the memory range names as a publicly-
visible interface. Another option may be to use a single file instead, and mmap
different regions inside the same file for each memory region. I am open to
comments and suggestions.

Example (untested) usage to bind manually each half of the RAM of a guest to a
different NUMA node:

 $ qemu-system-x86_64 [...] -m 2048 -smp 4 \
   -numa node,cpus=0-1,mem=1024 -numa node,cpus=2-3,mem=1024 \
   -mem-prealloc -keep-mem-path-files -mem-path /mnt/hugetlbfs/FOO
 $ numactl --offset=1G --length=1G --membind=1 --file /mnt/hugetlbfs/FOO/pc.ram
 $ numactl --offset=0  --length=1G --membind=2 --file /mnt/hugetlbfs/FOO/pc.ram


Eduardo Habkost (6):
  file_ram_alloc(): coding style fixes
  file_ram_alloc(): use g_strdup_printf() instead of asprintf()
  vl.c: change mem_prealloc to bool (v2)
  file_ram_alloc: change length argument to size_t (v2)
  file_ram_alloc(): extract temporary-file creation code to separate
function (v2)
  add -keep-mem-path-files option (v2)

 cpu-all.h   |3 ++-
 exec.c  |   68 +++
 qemu-options.hx |   12 ++
 vl.c|9 ++--
 4 files changed, 75 insertions(+), 17 deletions(-)

-- 
1.7.10.4



[PATCH 4/6] file_ram_alloc: change length argument to size_t (v2)

2012-07-02 Thread Eduardo Habkost
While we are at it, rename it to "length", as "memory" doesn't mean
anything.

Changes v1 -> v2:
 - Rebase after coding style changes

Signed-off-by: Eduardo Habkost ehabk...@redhat.com
---
 exec.c |   12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/exec.c b/exec.c
index d856325..1e98244 100644
--- a/exec.c
+++ b/exec.c
@@ -2357,7 +2357,7 @@ static long gethugepagesize(const char *path)
 }
 
 static void *file_ram_alloc(RAMBlock *block,
-ram_addr_t memory,
+size_t length,
 const char *path)
 {
 gchar *filename;
@@ -2373,7 +2373,7 @@ static void *file_ram_alloc(RAMBlock *block,
 return NULL;
 }
 
-if (memory < hpagesize) {
+if (length < hpagesize) {
 return NULL;
 }
 
@@ -2392,7 +2392,7 @@ static void *file_ram_alloc(RAMBlock *block,
 unlink(filename);
 g_free(filename);
 
-memory = (memory + hpagesize - 1) & ~(hpagesize - 1);
+length = (length + hpagesize - 1) & ~(hpagesize - 1);
 
 /*
  * ftruncate is not supported by hugetlbfs in older
@@ -2400,7 +2400,7 @@ static void *file_ram_alloc(RAMBlock *block,
  * If anything goes wrong with it under other filesystems,
  * mmap will fail.
  */
-if (ftruncate(fd, memory)) {
+if (ftruncate(fd, length)) {
 perror(ftruncate);
 }
 
@@ -2410,9 +2410,9 @@ static void *file_ram_alloc(RAMBlock *block,
  * to sidestep this quirk.
  */
 flags = mem_prealloc ? MAP_POPULATE | MAP_SHARED : MAP_PRIVATE;
-area = mmap(0, memory, PROT_READ | PROT_WRITE, flags, fd, 0);
+area = mmap(0, length, PROT_READ | PROT_WRITE, flags, fd, 0);
 #else
-area = mmap(0, memory, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+area = mmap(0, length, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
 #endif
 if (area == MAP_FAILED) {
 perror(file_ram_alloc: can't mmap RAM pages);
-- 
1.7.10.4



[PATCH 3/6] vl.c: change mem_prealloc to bool (v2)

2012-07-02 Thread Eduardo Habkost
Changes v1 -> v2:
 - Do not initialize variable to false, to make checkpatch.pl happy

Signed-off-by: Eduardo Habkost ehabk...@redhat.com
---
 cpu-all.h |2 +-
 vl.c  |4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpu-all.h b/cpu-all.h
index 9dc249a..2beed5a 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -490,7 +490,7 @@ typedef struct RAMList {
 extern RAMList ram_list;
 
 extern const char *mem_path;
-extern int mem_prealloc;
+extern bool mem_prealloc;
 
 /* Flags stored in the low bits of the TLB virtual address.  These are
defined so that fast path ram access is all zeros.  */
diff --git a/vl.c b/vl.c
index 1329c30..5c80189 100644
--- a/vl.c
+++ b/vl.c
@@ -178,7 +178,7 @@ const char* keyboard_layout = NULL;
 ram_addr_t ram_size;
 const char *mem_path = NULL;
 #ifdef MAP_POPULATE
-int mem_prealloc = 0; /* force preallocation of physical target memory */
+bool mem_prealloc; /* force preallocation of physical target memory */
 #endif
 int nb_nics;
 NICInfo nd_table[MAX_NICS];
@@ -2673,7 +2673,7 @@ int main(int argc, char **argv, char **envp)
 break;
 #ifdef MAP_POPULATE
 case QEMU_OPTION_mem_prealloc:
-mem_prealloc = 1;
+mem_prealloc = true;
 break;
 #endif
 case QEMU_OPTION_d:
-- 
1.7.10.4



[PATCH 5/6] file_ram_alloc(): extract temporary-file creation code to separate function (v2)

2012-07-02 Thread Eduardo Habkost
Changes v1 -> v2:
 - Fix trailing space issue
 - Rebase against new code using g_strdup_printf()

Signed-off-by: Eduardo Habkost ehabk...@redhat.com
---
 exec.c |   29 ++---
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/exec.c b/exec.c
index 1e98244..456ac73 100644
--- a/exec.c
+++ b/exec.c
@@ -2356,11 +2356,31 @@ static long gethugepagesize(const char *path)
 return fs.f_bsize;
 }
 
+/* Return FD to temporary file inside directory at 'path',
+ * truncated to size 'length'
+ */
+static int get_temp_fd(const char *path)
+{
+int fd;
+gchar *filename;
+
+filename = g_strdup_printf("%s/qemu_back_mem.XXXXXX", path);
+fd = mkstemp(filename);
+if (fd < 0) {
+perror("unable to create backing store for hugepages");
+g_free(filename);
+return -1;
+}
+unlink(filename);
+g_free(filename);
+
+return fd;
+}
+
 static void *file_ram_alloc(RAMBlock *block,
 size_t length,
 const char *path)
 {
-gchar *filename;
 void *area;
 int fd;
 #ifdef MAP_POPULATE
@@ -2382,15 +2402,10 @@ static void *file_ram_alloc(RAMBlock *block,
 return NULL;
 }
 
-filename = g_strdup_printf("%s/qemu_back_mem.XXXXXX", path);
-fd = mkstemp(filename);
+fd = get_temp_fd(path);
 if (fd < 0) {
-perror("unable to create backing store for hugepages");
-g_free(filename);
 return NULL;
 }
-unlink(filename);
-g_free(filename);
 
 length = (length + hpagesize - 1)  ~(hpagesize - 1);
 
-- 
1.7.10.4



Re: [RFC PATCH 0/6] option to not remove files inside -mem-path dir (v2)

2012-07-02 Thread Daniel P. Berrange
On Mon, Jul 02, 2012 at 03:06:32PM -0300, Eduardo Habkost wrote:
 Resending series, after fixing some coding style issues. Does anybody has any
 feedback about this proposal?
 
 Changes v1 - v2:
  - Coding style fixes
 
 Original cover letter:
 
 I was investigating if there are any mechanisms that allow manually pinning of
 guest RAM to specific host NUMA nodes, in the case of multi-node KVM guests, 
 and
 noticed that -mem-path could be used for that, except that it currently 
 removes
 any files it creates (using mkstemp()) immediately, not allowing numactl to be
 used on the backing files, as a result. This patches add a 
 -keep-mem-path-files
 option to make QEMU create the files inside -mem-path with more predictable
 names, and not remove them after creation.
 
 Some previous discussions about the subject, for reference:
  - Message-ID: 1281534738-8310-1-git-send-email-andre.przyw...@amd.com
http://article.gmane.org/gmane.comp.emulators.kvm.devel/57684
  - Message-ID: 4c7d7c2a.7000...@codemonkey.ws
http://article.gmane.org/gmane.comp.emulators.kvm.devel/58835
 
 A more recent thread can be found at:
  - Message-ID: 20111029184502.gh11...@in.ibm.com
http://article.gmane.org/gmane.comp.emulators.qemu/123001
 
 Note that this is just a mechanism to facilitate manual static binding using
 numactl on hugetlbfs later, for optimization. This may be especially useful 
 for
 single large multi-node guests use-cases (and, of course, has to be used with
 care).
 
 I don't know if it is a good idea to use the memory range names as a publicly-
 visible interface. Another option may be to use a single file instead, and 
 mmap
 different regions inside the same file for each memory region. I an open to
 comments and suggestions.
 
 Example (untested) usage to bind manually each half of the RAM of a guest to a
 different NUMA node:
 
  $ qemu-system-x86_64 [...] -m 2048 -smp 4 \
-numa node,cpus=0-1,mem=1024 -numa node,cpus=2-3,mem=1024 \
-mem-prealloc -keep-mem-path-files -mem-path /mnt/hugetlbfs/FOO
  $ numactl --offset=1G --length=1G --membind=1 --file 
 /mnt/hugetlbfs/FOO/pc.ram
  $ numactl --offset=0  --length=1G --membind=2 --file 
 /mnt/hugetlbfs/FOO/pc.ram

I'd suggest that instead of making the memory file name into a
public ABI that QEMU needs to maintain, QEMU could expose the info
via a monitor command, e.g.

   $ qemu-system-x86_64 [...] -m 2048 -smp 4 \
 -numa node,cpus=0-1,mem=1024 -numa node,cpus=2-3,mem=1024 \
 -mem-prealloc -mem-path /mnt/hugetlbfs/FOO \
 -monitor stdio
   (qemu) info mem-nodes
node0: file=/proc/self/fd/3, offset=0G, length=1G
node1: file=/proc/self/fd/3, offset=1G, length=1G

This example takes advantage of the fact that with Linux, you can
still access a deleted file via /proc/self/fd/NNN, which AFAICT,
would avoid the need for a --keep-mem-path-files.
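
The trick is easy to demonstrate in isolation (standalone snippet,
nothing QEMU-specific): an unlinked file stays reachable through its
/proc entry for as long as some process keeps it open.

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        char path[64];
        int fd = open("/tmp/ram-backing-demo", O_RDWR | O_CREAT, 0600);
        int fd2;

        if (fd < 0) {
            return 1;
        }
        unlink("/tmp/ram-backing-demo");           /* the name is gone... */
        snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
        fd2 = open(path, O_RDWR);                   /* ...the inode is not */
        printf("reopened deleted file as fd %d via %s\n", fd2, path);
        close(fd2);
        close(fd);
        return 0;
    }

Another process can open the same /proc/<pid>/fd/<n> path, subject to
the usual permission checks, which is what numactl would rely on here.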

By returning the info via a monitor command you also avoid hardcoding
the use of one single file for all of memory. You also avoid hardcoding
the fact that QEMU stores the nodes in contiguous order inside the
file. e.g. QEMU could easily return data like this


   $ qemu-system-x86_64 [...] -m 2048 -smp 4 \
 -numa node,cpus=0-1,mem=1024 -numa node,cpus=2-3,mem=1024 \
 -mem-prealloc -mem-path /mnt/hugetlbfs/FOO \
 -monitor stdio
   (qemu) info mem-nodes
node0: file=/proc/self/fd/3, offset=0G, length=1G
node1: file=/proc/self/fd/4, offset=0G, length=1G

or more ingenious options.

Regards,
Daniel
-- 
|: http://berrange.com  -o-http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org  -o- http://virt-manager.org :|
|: http://autobuild.org   -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org   -o-   http://live.gnome.org/gtk-vnc :|


Re: [RFC PATCH 0/6] option to not remove files inside -mem-path dir (v2)

2012-07-02 Thread Eduardo Habkost
On Mon, Jul 02, 2012 at 07:56:58PM +0100, Daniel P. Berrange wrote:
 On Mon, Jul 02, 2012 at 03:06:32PM -0300, Eduardo Habkost wrote:
  Resending series, after fixing some coding style issues. Does anybody has 
  any
  feedback about this proposal?
  
  Changes v1 - v2:
   - Coding style fixes
  
  Original cover letter:
  
  I was investigating if there are any mechanisms that allow manually pinning 
  of
  guest RAM to specific host NUMA nodes, in the case of multi-node KVM 
  guests, and
  noticed that -mem-path could be used for that, except that it currently 
  removes
  any files it creates (using mkstemp()) immediately, not allowing numactl to 
  be
  used on the backing files, as a result. This patches add a 
  -keep-mem-path-files
  option to make QEMU create the files inside -mem-path with more predictable
  names, and not remove them after creation.
  
  Some previous discussions about the subject, for reference:
   - Message-ID: 1281534738-8310-1-git-send-email-andre.przyw...@amd.com
 http://article.gmane.org/gmane.comp.emulators.kvm.devel/57684
   - Message-ID: 4c7d7c2a.7000...@codemonkey.ws
 http://article.gmane.org/gmane.comp.emulators.kvm.devel/58835
  
  A more recent thread can be found at:
   - Message-ID: 20111029184502.gh11...@in.ibm.com
 http://article.gmane.org/gmane.comp.emulators.qemu/123001
  
  Note that this is just a mechanism to facilitate manual static binding using
  numactl on hugetlbfs later, for optimization. This may be especially useful 
  for
  single large multi-node guests use-cases (and, of course, has to be used 
  with
  care).
  
  I don't know if it is a good idea to use the memory range names as a 
  publicly-
  visible interface. Another option may be to use a single file instead, and 
  mmap
  different regions inside the same file for each memory region. I an open to
  comments and suggestions.
  
  Example (untested) usage to bind manually each half of the RAM of a guest 
  to a
  different NUMA node:
  
   $ qemu-system-x86_64 [...] -m 2048 -smp 4 \
 -numa node,cpus=0-1,mem=1024 -numa node,cpus=2-3,mem=1024 \
 -mem-prealloc -keep-mem-path-files -mem-path /mnt/hugetlbfs/FOO
   $ numactl --offset=1G --length=1G --membind=1 --file 
  /mnt/hugetlbfs/FOO/pc.ram
   $ numactl --offset=0  --length=1G --membind=2 --file 
  /mnt/hugetlbfs/FOO/pc.ram
 
 I'd suggest that instead of making the memory file name into a
 public ABI QEMU needs to maintain, QEMU could expose the info
 via a monitor command. eg
 
$ qemu-system-x86_64 [...] -m 2048 -smp 4 \
  -numa node,cpus=0-1,mem=1024 -numa node,cpus=2-3,mem=1024 \
  -mem-prealloc -mem-path /mnt/hugetlbfs/FOO \
  -monitor stdio
(qemu) info mem-nodes
 node0: file=/proc/self/fd/3, offset=0G, length=1G
 node1: file=/proc/self/fd/3, offset=1G, length=1G
 
 This example takes advantage of the fact that with Linux, you can
 still access a deleted file via /proc/self/fd/NNN, which AFAICT,
 would avoid the need for a --keep-mem-path-files.

I like the suggestion.

But other processes still need to be able to open those files if we want
to do anything useful with them. In this case, I guess it's better to
let QEMU itself build a "/proc/<getpid()>/fd/<fd>" string instead of
using /proc/self and forcing the client to find out what the right
PID is?

Anyway, even if we want to avoid file-descriptor and /proc tricks, we
can still use the interface you suggest. Then we wouldn't need to have
any filename assumptions: the filenames could be completely random, as
they would be reported using the new monitor command.

 
 By returning info via a monitor command you also avoid hardcoding
 the use of 1 single file for all of memory. You also avoid hardcoding
 the fact that QEMU stores the nodes in contiguous order inside the
 node. eg QEMU could easily return data like this
 
 
$ qemu-system-x86_64 [...] -m 2048 -smp 4 \
  -numa node,cpus=0-1,mem=1024 -numa node,cpus=2-3,mem=1024 \
  -mem-prealloc -mem-path /mnt/hugetlbfs/FOO \
  -monitor stdio
(qemu) info mem-nodes
 node0: file=/proc/self/fd/3, offset=0G, length=1G
 node1: file=/proc/self/fd/4, offset=0G, length=1G
 
 or more ingeneous options

Sounds good.

-- 
Eduardo


First shot at adding IPMI to qemu

2012-07-02 Thread minyard
I had asked about getting an IPMI device into qemu and received some
interest, and it's useful to me, so I've done some work to add it.
The following patch set has a set of patches to add an IPMI KCS
device, and IPMI BT device, a built-in BMC (IPMI management controller),
and a way to attach an external BMC through a chardev.

There was some discussion on whether to make the BMC internal or
external, but I went ahead and added both.  The internal one is
fairly basic and not extensible, at least without adding code.
I've modified the OpenIPMI library simulator to work with the
external interface to allow it to receive connections from the
qemu external simulator with a fairly basic protocol.

I've also added the ability for the OpenIPMI library to manage
a VM to power it on, power it off, reset it, and handle an IPMI
watchdog timer.  So it looks quite like a real system.  Instructions
for using it are in the OpenIPMI release candidate I uploaded to
https://sourceforge.net/projects/openipmi

Since IPMI can advertise its presence via SMBIOS, I added a
way for a driver to add an SMBIOS entry.  I also added a way
to query a free interrupt from the ISA bus, since the interrupt
is in the SMBIOS entry and nobody really cares which one is used.



[PATCH 3/9] isa: Add a way to query for a free interrupt

2012-07-02 Thread minyard
From: Corey Minyard cminy...@mvista.com

This lets devices that don't care about their interrupt number, like
IPMI, just grab any unused interrupt.

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 hw/isa-bus.c |   13 +
 hw/isa.h |2 ++
 2 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/hw/isa-bus.c b/hw/isa-bus.c
index 5a43f03..f561f21 100644
--- a/hw/isa-bus.c
+++ b/hw/isa-bus.c
@@ -71,6 +71,7 @@ qemu_irq isa_get_irq(ISADevice *dev, int isairq)
 if (isairq < 0 || isairq > 15) {
 hw_error("isa irq %d invalid", isairq);
 }
+isabus->irq_inuse[isairq] = 1;
 return isabus->irqs[isairq];
 }
 
@@ -82,6 +83,18 @@ void isa_init_irq(ISADevice *dev, qemu_irq *p, int isairq)
 dev->nirqs++;
 }
 
+int isa_find_free_irq(ISABus *bus)
+{
+unsigned int i;
+
+/* 0 and 1 are called for, 2 is the chain interrupt */
+for (i = 3; i < ISA_NUM_IRQS; i++) {
+   if (!bus->irq_inuse[i])
+   return i;
+}
+return 0;
+}
+
 static inline void isa_init_ioport(ISADevice *dev, uint16_t ioport)
 {
 if (dev && (dev->ioport_id == 0 || ioport < dev->ioport_id)) {
diff --git a/hw/isa.h b/hw/isa.h
index f7bc4b5..9447296 100644
--- a/hw/isa.h
+++ b/hw/isa.h
@@ -28,6 +28,7 @@ struct ISABus {
 BusState qbus;
 MemoryRegion *address_space_io;
 qemu_irq *irqs;
+int irq_inuse[ISA_NUM_IRQS];
 };
 
 struct ISADevice {
@@ -41,6 +42,7 @@ ISABus *isa_bus_new(DeviceState *dev, MemoryRegion 
*address_space_io);
 void isa_bus_irqs(ISABus *bus, qemu_irq *irqs);
 qemu_irq isa_get_irq(ISADevice *dev, int isairq);
 void isa_init_irq(ISADevice *dev, qemu_irq *p, int isairq);
+int isa_find_free_irq(ISABus *bus);
 MemoryRegion *isa_address_space(ISADevice *dev);
 ISADevice *isa_create(ISABus *bus, const char *name);
 ISADevice *isa_try_create(ISABus *bus, const char *name);
-- 
1.7.4.1
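
For context, a device model that does not care which IRQ it ends up
with would combine the new helper with the existing isa_init_irq()
roughly as follows (hypothetical caller sketch; MyDevState and the bus
argument are placeholders, not part of this patch):

    /* Hypothetical caller; not part of this patch. */
    static void mydev_setup_irq(ISABus *bus, ISADevice *dev, MyDevState *s)
    {
        int isairq = isa_find_free_irq(bus);   /* 0 means nothing was free */

        if (isairq > 0) {
            /* isa_init_irq() fetches the qemu_irq via isa_get_irq(),
             * which now also marks the line as in use. */
            isa_init_irq(dev, &s->irq, isairq);
        }
    }

Since isa_get_irq() is what sets irq_inuse[], devices that request a
fixed IRQ through the normal path end up reserved in the same
bookkeeping array as callers of the new helper.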



[PATCH 5/9] IPMI: Add a PC ISA type structure

2012-07-02 Thread minyard
From: Corey Minyard cminy...@mvista.com

This provides the base infrastructure to tie IPMI low-level
interfaces into a PC ISA bus.

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 default-configs/i386-softmmu.mak   |1 +
 default-configs/x86_64-softmmu.mak |1 +
 hw/Makefile.objs   |1 +
 hw/isa_ipmi.c  |  138 
 hw/pc.c|   12 +++
 hw/pc.h|   18 +
 hw/smbios.h|   12 +++
 7 files changed, 183 insertions(+), 0 deletions(-)
 create mode 100644 hw/isa_ipmi.c

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index eb17afc..c0aff0d 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -8,6 +8,7 @@ CONFIG_VGA_CIRRUS=y
 CONFIG_VMWARE_VGA=y
 CONFIG_VMMOUSE=y
 CONFIG_IPMI=y
+CONFIG_ISA_IPMI=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index e4e3e4f..615e4f2 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -8,6 +8,7 @@ CONFIG_VGA_CIRRUS=y
 CONFIG_VMWARE_VGA=y
 CONFIG_VMMOUSE=y
 CONFIG_IPMI=y
+CONFIG_ISA_IPMI=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 0d55997..8f27ffe 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -21,6 +21,7 @@ hw-obj-$(CONFIG_ESCC) += escc.o
 hw-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
 
 hw-obj-$(CONFIG_IPMI) += ipmi.o
+hw-obj-$(CONFIG_ISA_IPMI) += isa_ipmi.o
 
 hw-obj-$(CONFIG_SERIAL) += serial.o
 hw-obj-$(CONFIG_PARALLEL) += parallel.o
diff --git a/hw/isa_ipmi.c b/hw/isa_ipmi.c
new file mode 100644
index 000..cad78b0
--- /dev/null
+++ b/hw/isa_ipmi.c
@@ -0,0 +1,138 @@
+/*
+ * QEMU ISA IPMI KCS emulation
+ *
+ * Copyright (c) 2012 Corey Minyard, MontaVista Software, LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the Software), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "hw.h"
+#include "isa.h"
+#include "pc.h"
+#include "qemu-timer.h"
+#include "sysemu.h"
+#include "smbios.h"
+#include "ipmi.h"
+
+
+typedef struct ISAIPMIState {
+ISADevice dev;
+uint32_t type;
+uint32_t iobase;
+uint32_t isairq;
+uint8_t slave_addr;
+IPMIState state;
+} ISAIPMIState;
+
+static int ipmi_isa_initfn(ISADevice *dev)
+{
+ISAIPMIState *isa = DO_UPCAST(ISAIPMIState, dev, dev);
+struct smbios_type_38 smb38;
+
+if (isa->iobase == -1) {
+   /* If no I/O base is specified, set the defaults */
+   switch (isa->type) {
+   case IPMI_KCS:
+   isa->iobase = 0xca2;
+   break;
+   case IPMI_BT:
+   isa->iobase = 0xe4;
+   break;
+   case IPMI_SMIC:
+   isa->iobase = 0xca9;
+   break;
+   default:
+   fprintf(stderr, "Unknown IPMI type: %d\n", isa->type);
+   abort();
+   }
+}
+
+isa->state.slave_addr = isa->slave_addr;
+
+qdev_set_legacy_instance_id(&dev->qdev, isa->iobase, 3);
+
+ipmi_init(isa->type, &isa->state);
+
+if (isa->isairq > 0) {
+   isa_init_irq(dev, &isa->state.irq, isa->isairq);
+   isa->state.use_irq = 1;
+}
+
+isa_register_ioport(dev, &isa->state.io, isa->iobase);
+
+smb38.header.type = 38;
+smb38.header.length = sizeof(smb38);
+smb38.header.handle = 0x3000;
+smb38.interface_type = isa->state.smbios_type;
+smb38.ipmi_version = 0x20;
+smb38.i2c_slave_addr = isa->state.slave_addr;
+smb38.nv_storage_dev_addr = 0;
+
+/* or 1 to set it to I/O space */
+smb38.base_addr = isa->iobase | 1;
+
+ /* 1-byte boundaries, addr bit0=0, level triggered irq */
+smb38.base_addr_mod_and_irq_info = 1;
+smb38.interrupt_number = isa->isairq;
+smbios_table_entry_add((struct smbios_structure_header *) &smb38);
+
+return 0;
+}
+
+static Property ipmi_isa_properties[] = {
+DEFINE_PROP_HEX32("type", ISAIPMIState, type,  IPMI_KCS),
+

[PATCH 9/9] IPMI: Add an external connection simulation interface

2012-07-02 Thread minyard
From: Corey Minyard cminy...@mvista.com

This adds an interface for IPMI that connects to a remote
BMC over a chardev (generally a TCP socket).  The OpenIPMI
lanserv simulator describes this interface, see that for
interface details.

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 default-configs/i386-softmmu.mak   |1 +
 default-configs/x86_64-softmmu.mak |1 +
 hw/Makefile.objs   |1 +
 hw/ipmi_extern.c   |  421 
 hw/ipmi_extern.h   |   75 +++
 5 files changed, 499 insertions(+), 0 deletions(-)
 create mode 100644 hw/ipmi_extern.c
 create mode 100644 hw/ipmi_extern.h

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index 8c99d5d..325f92e 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -12,6 +12,7 @@ CONFIG_ISA_IPMI=y
 CONFIG_IPMI_KCS=y
 CONFIG_IPMI_BT=y
 CONFIG_IPMI_LOCAL=y
+CONFIG_IPMI_EXTERN=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index 4d01883..2ac9177 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -12,6 +12,7 @@ CONFIG_ISA_IPMI=y
 CONFIG_IPMI_KCS=y
 CONFIG_IPMI_BT=y
 CONFIG_IPMI_LOCAL=y
+CONFIG_IPMI_EXTERN=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 193227d..06757b0 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -25,6 +25,7 @@ hw-obj-$(CONFIG_ISA_IPMI) += isa_ipmi.o
 hw-obj-$(CONFIG_IPMI_KCS) += ipmi_kcs.o
 hw-obj-$(CONFIG_IPMI_BT) += ipmi_bt.o
 hw-obj-$(CONFIG_IPMI_LOCAL) += ipmi_sim.o
+hw-obj-$(CONFIG_IPMI_EXTERN) += ipmi_extern.o
 
 hw-obj-$(CONFIG_SERIAL) += serial.o
 hw-obj-$(CONFIG_PARALLEL) += parallel.o
diff --git a/hw/ipmi_extern.c b/hw/ipmi_extern.c
new file mode 100644
index 000..bbb8469
--- /dev/null
+++ b/hw/ipmi_extern.c
@@ -0,0 +1,421 @@
+/*
+ * IPMI BMC external connection
+ *
+ * Copyright (c) 2012 Corey Minyard, MontaVista Software, LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the Software), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/*
+ * This is designed to connect with OpenIPMI's lanserv serial interface
+ * using the VM connection type.  See that for details.
+ */
+
+#include "ipmi_extern.h"
+
+static int can_receive(void *opaque);
+static void receive(void *opaque, const uint8_t *buf, int size);
+static void chr_event(void *opaque, int event);
+
+static unsigned char
+ipmb_checksum(const unsigned char *data, int size, unsigned char start)
+{
+   unsigned char csum = start;
+
+   for (; size > 0; size--, data++)
+   csum += *data;
+
+   return csum;
+}
+
+static void continue_send(IPMIState *s, IPMIExternState *es)
+{
+if (es->outlen == 0)
+   goto check_reset;
+
+ send:
+es->outpos += qemu_chr_fe_write(es->chr, es->outbuf + es->outpos,
+   es->outlen - es->outpos);
+if (es->outpos < es->outlen) {
+   /* Not fully transmitted, try again in 10ms */
+   qemu_mod_timer(es->extern_timer,
+  qemu_get_clock_ns(vm_clock) + 10000000);
+} else {
+   /* Sent */
+   es->outlen = 0;
+   es->outpos = 0;
+   if (!es->sending_cmd)
+   es->waiting_rsp = 1;
+   else
+   es->sending_cmd = 0;
+
+check_reset:
+   if (es->send_reset) {
+   /* Send the reset */
+   es->outbuf[0] = VM_CMD_RESET;
+   es->outbuf[1] = VM_CMD_CHAR;
+   es->outlen = 2;
+   es->outpos = 0;
+   es->send_reset = 0;
+   es->sending_cmd = 1;
+   goto send;
+   }
+
+   if (es->waiting_rsp)
+   /* Make sure we get a response within 4 seconds. */
+   qemu_mod_timer(es->extern_timer,
+  qemu_get_clock_ns(vm_clock) + 4000000000ULL);
+}
+return;
+}
+
+static void extern_timeout(void *opaque)
+{
+IPMIState *s = opaque;

[PATCH 1/9] smbios: Add a function to directly add an entry

2012-07-02 Thread minyard
From: Corey Minyard cminy...@mvista.com

There was no way to directly add a table entry to the SMBIOS table,
even though the BIOS supports this.  So add a function to do this.
This is in preparation for the IPMI handler adding its SMBIOS table
entry.

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 hw/smbios.c |   27 +++
 hw/smbios.h |   15 ---
 2 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/hw/smbios.c b/hw/smbios.c
index c57237d..98c7f99 100644
--- a/hw/smbios.c
+++ b/hw/smbios.c
@@ -178,6 +178,33 @@ static void smbios_build_type_1_fields(const char *t)
  strlen(buf) + 1, buf);
 }
 
+int smbios_table_entry_add(struct smbios_structure_header *entry)
+{
+struct smbios_table *table;
+struct smbios_structure_header *header;
+unsigned int size = entry->length;
+
+if (!smbios_entries) {
+smbios_entries_len = sizeof(uint16_t);
+smbios_entries = g_malloc0(smbios_entries_len);
+}
+smbios_entries = g_realloc(smbios_entries, smbios_entries_len +
+  sizeof(*table) + size);
+table = (struct smbios_table *)(smbios_entries + smbios_entries_len);
+table->header.type = SMBIOS_TABLE_ENTRY;
+table->header.length = cpu_to_le16(sizeof(*table) + size);
+
+header = (struct smbios_structure_header *)(table->data);
+memcpy(header, entry, size);
+
+smbios_check_collision(header->type, SMBIOS_TABLE_ENTRY);
+
+smbios_entries_len += sizeof(*table) + size;
+(*(uint16_t *)smbios_entries) =
+   cpu_to_le16(le16_to_cpu(*(uint16_t *)smbios_entries) + 1);
+return 0;
+}
+
 int smbios_entry_add(const char *t)
 {
 char buf[1024];
diff --git a/hw/smbios.h b/hw/smbios.h
index 94e3641..6431a15 100644
--- a/hw/smbios.h
+++ b/hw/smbios.h
@@ -13,21 +13,22 @@
  *
  */
 
+/* This goes at the beginning of every SMBIOS structure. */
+struct smbios_structure_header {
+uint8_t type;
+uint8_t length;
+uint16_t handle;
+} QEMU_PACKED;
+
 int smbios_entry_add(const char *t);
 void smbios_add_field(int type, int offset, int len, void *data);
 uint8_t *smbios_get_table(size_t *length);
+int smbios_table_entry_add(struct smbios_structure_header *entry);
 
 /*
  * SMBIOS spec defined tables
  */
 
-/* This goes at the beginning of every SMBIOS structure. */
-struct smbios_structure_header {
-uint8_t type;
-uint8_t length;
-uint16_t handle;
-} QEMU_PACKED;
-
 /* SMBIOS type 0 - BIOS Information */
 struct smbios_type_0 {
 struct smbios_structure_header header;
-- 
1.7.4.1
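
For a sense of the expected usage (the ISA IPMI patch elsewhere in this
series does essentially this for its type-38 record), a condensed caller
sketch with illustrative field values:

    /* Condensed caller sketch; field values are illustrative only. */
    struct smbios_type_38 smb38;

    memset(&smb38, 0, sizeof(smb38));
    smb38.header.type = 38;                 /* IPMI Device Information */
    smb38.header.length = sizeof(smb38);
    smb38.header.handle = 0x3000;
    /* ... fill in the remaining type-38 specific fields ... */
    smbios_table_entry_add((struct smbios_structure_header *)&smb38);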



[PATCH 8/9] IPMI: Add a local BMC simulation

2012-07-02 Thread minyard
From: Corey Minyard cminy...@mvista.com

This provides a minimal local BMC, basically enough to comply with the
spec and provide a complete watchdog timer (including a sensor, SDR,
and event).

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 default-configs/i386-softmmu.mak   |1 +
 default-configs/x86_64-softmmu.mak |1 +
 hw/Makefile.objs   |1 +
 hw/ipmi_sim.c  | 1273 
 hw/ipmi_sim.h  |  270 
 5 files changed, 1546 insertions(+), 0 deletions(-)
 create mode 100644 hw/ipmi_sim.c
 create mode 100644 hw/ipmi_sim.h

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index f8f8e6d..8c99d5d 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -11,6 +11,7 @@ CONFIG_IPMI=y
 CONFIG_ISA_IPMI=y
 CONFIG_IPMI_KCS=y
 CONFIG_IPMI_BT=y
+CONFIG_IPMI_LOCAL=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index 8c1177d..4d01883 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -11,6 +11,7 @@ CONFIG_IPMI=y
 CONFIG_ISA_IPMI=y
 CONFIG_IPMI_KCS=y
 CONFIG_IPMI_BT=y
+CONFIG_IPMI_LOCAL=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index e1d30cc..193227d 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -24,6 +24,7 @@ hw-obj-$(CONFIG_IPMI) += ipmi.o
 hw-obj-$(CONFIG_ISA_IPMI) += isa_ipmi.o
 hw-obj-$(CONFIG_IPMI_KCS) += ipmi_kcs.o
 hw-obj-$(CONFIG_IPMI_BT) += ipmi_bt.o
+hw-obj-$(CONFIG_IPMI_LOCAL) += ipmi_sim.o
 
 hw-obj-$(CONFIG_SERIAL) += serial.o
 hw-obj-$(CONFIG_PARALLEL) += parallel.o
diff --git a/hw/ipmi_sim.c b/hw/ipmi_sim.c
new file mode 100644
index 000..6813a86
--- /dev/null
+++ b/hw/ipmi_sim.c
@@ -0,0 +1,1273 @@
+/*
+ * IPMI BMC emulation
+ *
+ * Copyright (c) 2012 Corey Minyard, MontaVista Software, LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the Software), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include "ipmi_sim.h"
+
+static void ipmi_sim_handle_timeout(IPMIState *s);
+
+static void ipmi_gettime(struct ipmi_time *time)
+{
+int64_t stime;
+
+stime = get_clock_realtime();
+time->tv_sec = stime / 1000000000LL;
+time->tv_nsec = stime % 1000000000LL;
+}
+
+static int64_t ipmi_getmonotime(void)
+{
+return qemu_get_clock_ns(vm_clock);
+}
+
+static void ipmi_timeout(void *opaque)
+{
+IPMIState *s = opaque;
+
+ipmi_sim_handle_timeout(s);
+}
+
+static void set_timestamp(IPMISimState *ss, uint8_t *ts)
+{
+unsigned int val;
+struct ipmi_time now;
+
+ipmi_gettime(&now);
+val = now.tv_sec + ss->sel.time_offset;
+ts[0] = val & 0xff;
+ts[1] = (val >> 8) & 0xff;
+ts[2] = (val >> 16) & 0xff;
+ts[3] = (val >> 24) & 0xff;
+}
+
+static void sdr_inc_reservation(IPMISdr *sdr)
+{
+sdr->reservation++;
+if (sdr->reservation == 0)
+   sdr->reservation = 1;
+}
+
+static int sdr_add_entry(IPMISimState *ss, const uint8_t *entry,
+unsigned int len, uint16_t *recid)
+{
+if ((len < 5) || (len > 255))
+   return 1;
+
+if (entry[ss->sdr.next_free + 4] != len)
+   return 1;
+
+if (ss->sdr.next_free + len > MAX_SDR_SIZE) {
+   ss->sdr.overflow = 1;
+   return 1;
+}
+
+memcpy(ss->sdr.sdr + ss->sdr.next_free, entry, len);
+ss->sdr.sdr[ss->sdr.next_free] = ss->sdr.next_rec_id & 0xff;
+ss->sdr.sdr[ss->sdr.next_free+1] = (ss->sdr.next_rec_id >> 8) & 0xff;
+ss->sdr.sdr[ss->sdr.next_free+2] = 0x51; /* Conform to IPMI 1.5 spec */
+
+if (recid)
+   *recid = ss->sdr.next_rec_id;
+ss->sdr.next_rec_id++;
+set_timestamp(ss, ss->sdr.last_addition);
+ss->sdr.next_free += len;
+sdr_inc_reservation(&ss->sdr);
+return 0;
+}
+
+static int sdr_find_entry(IPMISdr *sdr, uint16_t recid,
+ unsigned int *retpos, uint16_t *nextrec)
+{

[PATCH 4/9] Add a base IPMI interface

2012-07-02 Thread minyard
From: Corey Minyard cminy...@mvista.com

Add the basic IPMI types and infrastructure to QEMU.  Low-level
interfaces and simulation interfaces will register with this; it's
kind of the go-between to tie them together.

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 default-configs/i386-softmmu.mak   |1 +
 default-configs/x86_64-softmmu.mak |1 +
 hw/Makefile.objs   |2 +
 hw/ipmi.c  |  147 +++
 hw/ipmi.h  |  192 
 qemu-doc.texi  |2 +
 qemu-options.hx|   35 +++
 sysemu.h   |8 ++
 vl.c   |   46 +
 9 files changed, 434 insertions(+), 0 deletions(-)
 create mode 100644 hw/ipmi.c
 create mode 100644 hw/ipmi.h

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index 2c78175..eb17afc 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -7,6 +7,7 @@ CONFIG_VGA_ISA=y
 CONFIG_VGA_CIRRUS=y
 CONFIG_VMWARE_VGA=y
 CONFIG_VMMOUSE=y
+CONFIG_IPMI=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index 233a856..e4e3e4f 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -7,6 +7,7 @@ CONFIG_VGA_ISA=y
 CONFIG_VGA_CIRRUS=y
 CONFIG_VMWARE_VGA=y
 CONFIG_VMMOUSE=y
+CONFIG_IPMI=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 30c0b78..0d55997 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -20,6 +20,8 @@ hw-obj-$(CONFIG_M48T59) += m48t59.o
 hw-obj-$(CONFIG_ESCC) += escc.o
 hw-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
 
+hw-obj-$(CONFIG_IPMI) += ipmi.o
+
 hw-obj-$(CONFIG_SERIAL) += serial.o
 hw-obj-$(CONFIG_PARALLEL) += parallel.o
 hw-obj-$(CONFIG_I8254) += i8254_common.o i8254.o
diff --git a/hw/ipmi.c b/hw/ipmi.c
new file mode 100644
index 000..86a097b
--- /dev/null
+++ b/hw/ipmi.c
@@ -0,0 +1,147 @@
+/*
+ * QEMU IPMI emulation
+ *
+ * Copyright (c) 2012 Corey Minyard, MontaVista Software, LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the Software), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "hw.h"
+#include "ipmi.h"
+#include "sysemu.h"
+#include "qmp-commands.h"
+
+static void (*ipmi_init_handlers[4])(IPMIState *s);
+
+void register_ipmi_type(unsigned int type, void (*init)(IPMIState *s))
+{
+ipmi_init_handlers[type] = init;
+}
+
+static void (*ipmi_sim_init_handlers[2])(IPMIState *s);
+
+void register_ipmi_sim(unsigned int iftype, void (*init)(IPMIState *s))
+{
+ipmi_sim_init_handlers[iftype] = init;
+}
+
+
+#ifdef DO_IPMI_THREAD
+static void *ipmi_thread(void *opaque)
+{
+IPMIState *s = opaque;
+int64_t wait_until;
+
+ipmi_lock();
+for (;;) {
+   qemu_cond_wait(s->waker, s->lock);
+   wait_until = 0;
+   while (s->do_wake) {
+   s->do_wake = 0;
+   s->handle_if_event(s);
+   }
+}
+ipmi_unlock();
+return NULL;
+}
+#endif
+
+static int ipmi_do_hw_op(IPMIState *s, enum ipmi_op op, int checkonly)
+{
+switch(op) {
+case IPMI_RESET_CHASSIS:
+   if (checkonly)
+   return 0;
+   qemu_system_reset_request();
+   return 0;
+
+case IPMI_POWEROFF_CHASSIS:
+   if (checkonly)
+   return 0;
+   qemu_system_powerdown_request();
+   return 0;
+
+case IPMI_SEND_NMI:
+   if (checkonly)
+   return 0;
+   qemu_mutex_lock_iothread();
+   qmp_inject_nmi(NULL);
+   qemu_mutex_unlock_iothread();
+   return 0;
+
+case IPMI_POWERCYCLE_CHASSIS:
+case IPMI_PULSE_DIAG_IRQ:
+case IPMI_SHUTDOWN_VIA_ACPI_OVERTEMP:
+case IPMI_POWERON_CHASSIS:
+default:
+   return IPMI_CC_COMMAND_NOT_SUPPORTED;
+}
+}
+
+static void ipmi_set_irq_enable(IPMIState *s, int val)
+{
+s->irqs_enabled = val;
+}
+
+static 

[PATCH 7/9] IPMI: Add a BT low-level interface

2012-07-02 Thread minyard
From: Corey Minyard cminy...@mvista.com

This provides the simulation of the BT hardware interface for
IPMI.

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 default-configs/i386-softmmu.mak   |1 +
 default-configs/x86_64-softmmu.mak |1 +
 hw/Makefile.objs   |1 +
 hw/ipmi_bt.c   |  265 
 hw/ipmi_bt.h   |   99 +
 5 files changed, 367 insertions(+), 0 deletions(-)
 create mode 100644 hw/ipmi_bt.c
 create mode 100644 hw/ipmi_bt.h

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index b549389..f8f8e6d 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -10,6 +10,7 @@ CONFIG_VMMOUSE=y
 CONFIG_IPMI=y
 CONFIG_ISA_IPMI=y
 CONFIG_IPMI_KCS=y
+CONFIG_IPMI_BT=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index af7d2a9..8c1177d 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -10,6 +10,7 @@ CONFIG_VMMOUSE=y
 CONFIG_IPMI=y
 CONFIG_ISA_IPMI=y
 CONFIG_IPMI_KCS=y
+CONFIG_IPMI_BT=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 99e5d1e..e1d30cc 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -23,6 +23,7 @@ hw-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
 hw-obj-$(CONFIG_IPMI) += ipmi.o
 hw-obj-$(CONFIG_ISA_IPMI) += isa_ipmi.o
 hw-obj-$(CONFIG_IPMI_KCS) += ipmi_kcs.o
+hw-obj-$(CONFIG_IPMI_BT) += ipmi_bt.o
 
 hw-obj-$(CONFIG_SERIAL) += serial.o
 hw-obj-$(CONFIG_PARALLEL) += parallel.o
diff --git a/hw/ipmi_bt.c b/hw/ipmi_bt.c
new file mode 100644
index 000..39f099e
--- /dev/null
+++ b/hw/ipmi_bt.c
@@ -0,0 +1,265 @@
+/*
+ * QEMU IPMI BT emulation
+ *
+ * Copyright (c) 2012 Corey Minyard, MontaVista Software, LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the Software), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "hw.h"
+
+#include "ipmi.h"
+#include "ipmi_bt.h"
+
+#define IPMI_CMD_GET_BT_INTF_CAP   0x36
+
+static void ipmi_bt_handle_event(IPMIState *s)
+{
+    IPMIBtState *bt = s->typeinfo;
+
+    ipmi_lock();
+    if (s->inlen < 4)
+        goto out;
+    /* Note that overruns are handled by handle_command */
+    if (s->inmsg[0] != (s->inlen - 1)) {
+        /* Length mismatch, just ignore. */
+        IPMI_BT_SET_BBUSY(bt->control_reg, 1);
+        s->inlen = 0;
+        goto out;
+    }
+    if ((s->inmsg[1] == (IPMI_NETFN_APP << 2)) &&
+        (s->inmsg[3] == IPMI_CMD_GET_BT_INTF_CAP)) {
+        /* We handle this one ourselves. */
+        s->outmsg[0] = 9;
+        s->outmsg[1] = s->inmsg[1] | 0x04;
+        s->outmsg[2] = s->inmsg[2];
+        s->outmsg[3] = s->inmsg[3];
+        s->outmsg[4] = 0;
+        s->outmsg[5] = 1; /* Only support 1 outstanding request. */
+        if (sizeof(s->inmsg) > 0xff) /* Input buffer size */
+            s->outmsg[6] = 0xff;
+        else
+            s->outmsg[6] = (unsigned char) sizeof(s->inmsg);
+        if (sizeof(s->outmsg) > 0xff) /* Output buffer size */
+            s->outmsg[7] = 0xff;
+        else
+            s->outmsg[7] = (unsigned char) sizeof(s->outmsg);
+        s->outmsg[8] = 10; /* Max request to response time */
+        s->outmsg[9] = 0; /* Don't recommend retries */
+        s->outlen = 10;
+        IPMI_BT_SET_BBUSY(bt->control_reg, 0);
+        IPMI_BT_SET_B2H_ATN(bt->control_reg, 1);
+        if (s->use_irq && s->irqs_enabled &&
+            !IPMI_BT_GET_B2H_IRQ(bt->mask_reg) &&
+            IPMI_BT_GET_B2H_IRQ_EN(bt->mask_reg)) {
+            IPMI_BT_SET_B2H_IRQ(bt->mask_reg, 1);
+            qemu_irq_raise(s->irq);
+        }
+        goto out;
+    }
+    bt->waiting_seq = s->inmsg[2];
+    s->inmsg[2] = s->inmsg[1];
+    s->handle_command(s, s->inmsg + 2, s->inlen - 2, sizeof(s->inmsg),
+                      bt->waiting_rsp);
+ out:
+    ipmi_unlock();
+}
+
+static void ipmi_bt_handle_rsp(IPMIState *s, uint8_t msg_id,
+  

[PATCH 2/9] pc: move SMBIOS setup to after device init

2012-07-02 Thread minyard
From: Corey Minyard cminy...@mvista.com

Setting up the firmware interface for the SMBIOS table needs to
be done later in the process, after device initialization, so that
devices can add entries to the table.
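
For illustration, the board init flow this change enables looks roughly like
the sketch below (parameter names follow pc_init1(); the wrapper function
itself is made up and is not added by this patch):

    /* RAM first, devices next, firmware/SMBIOS last. */
    static void example_init_order(MemoryRegion *system_memory,
                                   MemoryRegion *rom_memory,
                                   MemoryRegion **ram_memory,
                                   ram_addr_t below_4g_mem_size,
                                   ram_addr_t above_4g_mem_size,
                                   const char *kernel_filename,
                                   const char *kernel_cmdline,
                                   const char *initrd_filename,
                                   int pci_enabled)
    {
        /* 1. Allocate and map guest RAM only; no firmware yet. */
        pc_memory_init(system_memory, below_4g_mem_size, above_4g_mem_size,
                       ram_memory);

        /* 2. Create buses and devices here; devices may register
         *    SMBIOS entries at this point. */

        /* 3. Load the BIOS/option ROMs and build fw_cfg (and thus the
         *    SMBIOS table) only after the devices exist. */
        pc_bios_init(kernel_filename, kernel_cmdline, initrd_filename,
                     below_4g_mem_size,
                     pci_enabled ? rom_memory : system_memory);
    }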

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 hw/pc.c  |   22 +-
 hw/pc.h  |9 +
 hw/pc_piix.c |   12 
 3 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/hw/pc.c b/hw/pc.c
index fb04c8b..c0acb6a 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -971,20 +971,12 @@ void pc_cpus_init(const char *cpu_model)
 }
 
 void pc_memory_init(MemoryRegion *system_memory,
-const char *kernel_filename,
-const char *kernel_cmdline,
-const char *initrd_filename,
 ram_addr_t below_4g_mem_size,
 ram_addr_t above_4g_mem_size,
-MemoryRegion *rom_memory,
 MemoryRegion **ram_memory)
 {
-int linux_boot, i;
-MemoryRegion *ram, *option_rom_mr;
+MemoryRegion *ram;
 MemoryRegion *ram_below_4g, *ram_above_4g;
-void *fw_cfg;
-
-linux_boot = (kernel_filename != NULL);
 
 /* Allocate RAM.  We allocate it as a single memory region and use
  * aliases to address portions of it, mostly for backwards compatibility
@@ -1006,7 +998,17 @@ void pc_memory_init(MemoryRegion *system_memory,
 memory_region_add_subregion(system_memory, 0x100000000ULL,
                             ram_above_4g);
 }
+}
 
+void pc_bios_init(const char *kernel_filename,
+ const char *kernel_cmdline,
+ const char *initrd_filename,
+ ram_addr_t below_4g_mem_size,
+ MemoryRegion *rom_memory)
+{
+MemoryRegion *option_rom_mr;
+void *fw_cfg;
+int linux_boot, i;
 
 /* Initialize PC system firmware */
 pc_system_firmware_init(rom_memory);
@@ -1019,6 +1021,8 @@ void pc_memory_init(MemoryRegion *system_memory,
 option_rom_mr,
 1);
 
+linux_boot = (kernel_filename != NULL);
+
 fw_cfg = bochs_bios_init();
 rom_set_fw(fw_cfg);
 
diff --git a/hw/pc.h b/hw/pc.h
index 8ccf202..33ab689 100644
--- a/hw/pc.h
+++ b/hw/pc.h
@@ -107,13 +107,14 @@ void pc_acpi_smi_interrupt(void *opaque, int irq, int 
level);
 
 void pc_cpus_init(const char *cpu_model);
 void pc_memory_init(MemoryRegion *system_memory,
-const char *kernel_filename,
-const char *kernel_cmdline,
-const char *initrd_filename,
 ram_addr_t below_4g_mem_size,
 ram_addr_t above_4g_mem_size,
-MemoryRegion *rom_memory,
 MemoryRegion **ram_memory);
+void pc_bios_init(const char *kernel_filename,
+ const char *kernel_cmdline,
+ const char *initrd_filename,
+ ram_addr_t below_4g_mem_size,
+ MemoryRegion *rom_memory);
 qemu_irq *pc_allocate_cpu_irq(void);
 DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus);
 void pc_basic_device_init(ISABus *isa_bus, qemu_irq *gsi,
diff --git a/hw/pc_piix.c b/hw/pc_piix.c
index fb86f27..21b4f59 100644
--- a/hw/pc_piix.c
+++ b/hw/pc_piix.c
@@ -176,10 +176,8 @@ static void pc_init1(MemoryRegion *system_memory,
 
 /* allocate ram and load rom/bios */
 if (!xen_enabled()) {
-pc_memory_init(system_memory,
-   kernel_filename, kernel_cmdline, initrd_filename,
-   below_4g_mem_size, above_4g_mem_size,
-   pci_enabled ? rom_memory : system_memory, ram_memory);
+pc_memory_init(system_memory, below_4g_mem_size, above_4g_mem_size,
+  ram_memory);
 }
 
 gsi_state = g_malloc0(sizeof(*gsi_state));
@@ -287,6 +285,12 @@ static void pc_init1(MemoryRegion *system_memory,
 if (pci_enabled) {
 pc_pci_device_init(pci_bus);
 }
+
+if (!xen_enabled()) {
+   pc_bios_init(kernel_filename, kernel_cmdline, initrd_filename,
+below_4g_mem_size,
+pci_enabled ? rom_memory : system_memory);
+}
 }
 
 static void pc_init_pci(ram_addr_t ram_size,
-- 
1.7.4.1



[PATCH 6/9] IPMI: Add a KCS low-level interface

2012-07-02 Thread minyard
From: Corey Minyard cminy...@mvista.com

This provides the simulation of the KCS hardware interface.
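
For background (taken from the IPMI specification rather than from this
patch), KCS is a pair of byte-wide registers, and the status byte carries the
OBF/IBF handshake flags plus the state field that the state machine below
drives; the register names in this sketch are illustrative only:

    /* Guest-visible KCS register layout, relative to the base I/O port
     * (typically 0xca2 on PC systems). */
    #define KCS_REG_DATA      0   /* data in (write) / data out (read)      */
    #define KCS_REG_CMD       1   /* write: control codes (WRITE_START ...) */
    #define KCS_REG_STATUS    1   /* read: status byte                      */

    #define KCS_STATUS_OBF       (1 << 0)  /* output buffer full, BMC -> host */
    #define KCS_STATUS_IBF       (1 << 1)  /* input buffer full, host -> BMC  */
    #define KCS_STATUS_STATE(s)  (((s) >> 6) & 3)  /* IDLE/READ/WRITE/ERROR   */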

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 default-configs/i386-softmmu.mak   |1 +
 default-configs/x86_64-softmmu.mak |1 +
 hw/Makefile.objs   |1 +
 hw/ipmi_kcs.c  |  259 
 hw/ipmi_kcs.h  |   82 +++
 5 files changed, 344 insertions(+), 0 deletions(-)
 create mode 100644 hw/ipmi_kcs.c
 create mode 100644 hw/ipmi_kcs.h

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index c0aff0d..b549389 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -9,6 +9,7 @@ CONFIG_VMWARE_VGA=y
 CONFIG_VMMOUSE=y
 CONFIG_IPMI=y
 CONFIG_ISA_IPMI=y
+CONFIG_IPMI_KCS=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index 615e4f2..af7d2a9 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -9,6 +9,7 @@ CONFIG_VMWARE_VGA=y
 CONFIG_VMMOUSE=y
 CONFIG_IPMI=y
 CONFIG_ISA_IPMI=y
+CONFIG_IPMI_KCS=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 8f27ffe..99e5d1e 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -22,6 +22,7 @@ hw-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
 
 hw-obj-$(CONFIG_IPMI) += ipmi.o
 hw-obj-$(CONFIG_ISA_IPMI) += isa_ipmi.o
+hw-obj-$(CONFIG_IPMI_KCS) += ipmi_kcs.o
 
 hw-obj-$(CONFIG_SERIAL) += serial.o
 hw-obj-$(CONFIG_PARALLEL) += parallel.o
diff --git a/hw/ipmi_kcs.c b/hw/ipmi_kcs.c
new file mode 100644
index 000..61188c9
--- /dev/null
+++ b/hw/ipmi_kcs.c
@@ -0,0 +1,259 @@
+/*
+ * QEMU IPMI KCS emulation
+ *
+ * Copyright (c) 2012 Corey Minyard, MontaVista Software, LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the Software), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "hw.h"
+
+#include "ipmi.h"
+#include "ipmi_kcs.h"
+
+#define SET_OBF() \
+    do {                                                         \
+        IPMI_KCS_SET_OBF(kcs->status_reg, 1);                    \
+        if (s->use_irq && s->irqs_enabled && !s->obf_irq_set) {  \
+            s->obf_irq_set = 1;                                  \
+            if (!s->atn_irq_set)                                 \
+                qemu_irq_raise(s->irq);                          \
+        }                                                        \
+    } while (0)
+
+static void ipmi_kcs_handle_event(IPMIState *s)
+{
+    IPMIKcsState *kcs = s->typeinfo;
+
+    if (kcs->cmd_reg == IPMI_KCS_ABORT_STATUS_CMD) {
+        if (IPMI_KCS_GET_STATE(kcs->status_reg) != IPMI_KCS_ERROR_STATE) {
+            kcs->waiting_rsp++; /* Invalidate the message */
+            s->outmsg[0] = IPMI_KCS_STATUS_ABORTED_ERR;
+            s->outlen = 1;
+            s->outpos = 0;
+            IPMI_KCS_SET_STATE(kcs->status_reg, IPMI_KCS_ERROR_STATE);
+            SET_OBF();
+        }
+        goto out;
+    }
+
+    switch (IPMI_KCS_GET_STATE(kcs->status_reg)) {
+    case IPMI_KCS_IDLE_STATE:
+        if (kcs->cmd_reg == IPMI_KCS_WRITE_START_CMD) {
+            IPMI_KCS_SET_STATE(kcs->status_reg, IPMI_KCS_WRITE_STATE);
+            kcs->cmd_reg = -1;
+            s->write_end = 0;
+            s->inlen = 0;
+            SET_OBF();
+        }
+        break;
+
+    case IPMI_KCS_READ_STATE:
+    handle_read:
+        if (s->outpos >= s->outlen) {
+            IPMI_KCS_SET_STATE(kcs->status_reg, IPMI_KCS_IDLE_STATE);
+            SET_OBF();
+        } else if (kcs->data_in_reg == IPMI_KCS_READ_CMD) {
+            kcs->data_out_reg = s->outmsg[s->outpos];
+            s->outpos++;
+            SET_OBF();
+        } else {
+            s->outmsg[0] = IPMI_KCS_STATUS_BAD_CC_ERR;
+            s->outlen = 1;
+            s->outpos = 0;
+            IPMI_KCS_SET_STATE(kcs->status_reg, IPMI_KCS_ERROR_STATE);
+            SET_OBF();
+            goto out;

Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk

2012-07-02 Thread Rusty Russell
On Mon, 02 Jul 2012 10:45:05 +0800, Asias He as...@redhat.com wrote:
 On 07/02/2012 07:54 AM, Rusty Russell wrote:
  Confused.  So, without merging we get 6k exits (per second?)  How many
  do we get when we use the request-based IO path?
 
 Sorry for the confusion. The numbers were collected from request-based 
 IO path where we can have the guest block layer merge the requests.
 
 With the same workload in guest, the guest fires 200K requests to host 
 with merges enabled in guest (echo 0 > /sys/block/vdb/queue/nomerges), 
 while the guest fires 4000K requests to host with merges disabled in 
 guest (echo 2 > /sys/block/vdb/queue/nomerges). This shows that the merge 
 in the block layer reduces the total number of requests fired to the host a lot 
 (4000K / 200K = 20).
 
 The guest fires 200K requests to host with merges enabled in guest (echo 
 0 > /sys/block/vdb/queue/nomerges), and the host fires 6K interrupts in 
 total for the 200K requests. This shows the ratio of interrupts 
 coalesced (200K / 6K = 33).

OK, got it!  Guest merging cuts requests by a factor of 20.  EVENT_IDX
cuts interrupts by a factor of 33.

  If your device is slow, then you won't be able to make many requests per
  second: why worry about exit costs?
 
 If a device is slow, the merge would merge more requests and reduce the 
 total number of requests to host. This saves exit costs, no?

Sure, our guest merging might save us 100x as many exits as no merging.
But since we're not doing many requests, does it matter?

Ideally we'd merge requests only if the device queue is full.  But that
sounds hard.

  If your device is fast (eg. ram),
  you've already shown that your patch is a win, right?
 
 Yes. Both on ramdisk and fast SSD device (e.g. FusionIO).

Thanks,
Rusty.


Re: [PATCH 0/6] kvm/s390: sigp related changes for 3.6

2012-07-02 Thread Marcelo Tosatti
On Tue, Jun 26, 2012 at 04:06:35PM +0200, Cornelia Huck wrote:
 Avi, Marcelo,
 
 here are some more s390 patches for the next release.
 
 Patches 1 and 2 are included for dependency reasons; they will also
 be sent through Martin's s390 tree.
 
 The other patches fix several problems in our sigp handling code
 and make it nicer to read.
 
 Cornelia Huck (1):
   KVM: s390: Fix sigp sense handling.
 
 Heiko Carstens (5):
   s390/smp: remove redundant check
   s390/smp/kvm: unifiy sigp definitions
   KVM: s390: fix sigp sense running condition code handling
   KVM: s390: fix sigp set prefix status stored cases
   KVM: s390: use sigp condition code defines
 
  arch/s390/include/asm/sigp.h |   32 
  arch/s390/kernel/smp.c   |   76 ++-
  arch/s390/kvm/sigp.c |  117 
 +-
  3 files changed, 106 insertions(+), 119 deletions(-)
  create mode 100644 arch/s390/include/asm/sigp.h

Applied, thanks.



[ANNOUNCE] qemu-kvm-1.1.0

2012-07-02 Thread Marcelo Tosatti

qemu-kvm-1.1.0 is now available. This release is based on the upstream
qemu 1.1.0, plus kvm-specific enhancements. Please see the
original QEMU 1.1.0 release announcement [1] for details.

This release can be used with the kvm kernel modules provided by your
distribution kernel, or by the modules in the kvm-kmod package, such
as kvm-kmod-3.1.

http://www.linux-kvm.org

[1] http://lists.gnu.org/archive/html/qemu-devel/2012-06/msg00072.html




Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk

2012-07-02 Thread Asias He

On 07/02/2012 02:41 PM, Rusty Russell wrote:

On Mon, 02 Jul 2012 10:45:05 +0800, Asias He as...@redhat.com wrote:

On 07/02/2012 07:54 AM, Rusty Russell wrote:

Confused.  So, without merging we get 6k exits (per second?)  How many
do we get when we use the request-based IO path?


Sorry for the confusion. The numbers were collected from request-based
IO path where we can have the guest block layer merge the requests.

With the same workload in guest, the guest fires 200K requests to host
with merges enabled in guest (echo 0 > /sys/block/vdb/queue/nomerges),
while the guest fires 4000K requests to host with merges disabled in
guest (echo 2 > /sys/block/vdb/queue/nomerges). This shows that the merge
in the block layer reduces the total number of requests fired to the host a lot
(4000K / 200K = 20).

The guest fires 200K requests to host with merges enabled in guest (echo
0 > /sys/block/vdb/queue/nomerges), and the host fires 6K interrupts in
total for the 200K requests. This shows the ratio of interrupts
coalesced (200K / 6K = 33).


OK, got it!  Guest merging cuts requests by a factor of 20.  EVENT_IDX
cuts interrupts by a factor of 33.


Yes.




If your device is slow, then you won't be able to make many requests per
second: why worry about exit costs?


If a device is slow, the merge would merge more requests and reduce the
total number of requests to host. This saves exit costs, no?


Sure, our guest merging might save us 100x as many exits as no merging.
But since we're not doing many requests, does it matter?


We can still have many requests with slow devices. The number of
requests depends on the workload in the guest, e.g. 512 IO threads in the
guest all doing IO.



Ideally we'd merge requests only if the device queue is full.  But that
sounds hard.


If your device is fast (eg. ram),
you've already shown that your patch is a win, right?


Yes. Both on ramdisk and fast SSD device (e.g. FusionIO).


Thanks,
Rusty.




--
Asias




Re: [PATCH v3 0/3] kvm: KVM_IRQFD cleanup, docs, sanitize flags

2012-07-02 Thread Marcelo Tosatti
On Mon, Jul 02, 2012 at 06:52:46PM +0300, Avi Kivity wrote:
 On 07/02/2012 06:51 PM, Alex Williamson wrote:
  On Fri, 2012-06-29 at 09:56 -0600, Alex Williamson wrote:
  Before we start fiddling with what we can and can't add to KVM_IRQFD
  we need to figure out if anyone has been sloppy in their use of the
  ioctl flags.  This series has a minor cleanup to pass the struct
  kvm_irqfd to setup functions rather than individual parameters, making
  it more consistent with ioeventfd, adds API documentation for this
  ioctl, and sanitizes the flags.  If anyone screams, we may have to
  revert this last patch.  Thanks,
  
  Avi, what do you think about trying to get this in for 3.5 to test
  whether we break anyone?  Then we can aim for 3.6 for level irqfd &
  eoifd support.
 
 Makes sense.  Marcelo?

Applied to master, thanks.



Re: [PATCH] kvm: handle last_boosted_vcpu = 0 case

2012-07-02 Thread Raghavendra K T

On 07/02/2012 08:19 PM, Rik van Riel wrote:

On 06/28/2012 06:55 PM, Vinod, Chegu wrote:

Hello,

I am just catching up on this email thread...

Perhaps one of you may be able to help answer this query.. preferably
along with some data. [BTW, I do understand the basic intent behind
PLE in a typical [sweet spot] use case where there is over
subscription etc. and the need to optimize the PLE handler in the host
etc. ]

In a use case where the host has fewer but much larger guests (say
40VCPUs and higher) and there is no over subscription (i.e. # of vcpus
across guests <= physical cpus in the host and perhaps each guest has
their vcpu's pinned to specific physical cpus for other reasons), I
would like to understand if/how the PLE really helps ? For these use
cases would it be ok to turn PLE off (ple_gap=0) since is no real need
to take an exit and find some other VCPU to yield to ?


Yes, that should be ok.


I think this should be true when we have ple_window tuned to correct
value for guest. (same what you raised)

But otherwise, IMO, it is a very tricky question to answer. PLE is
currently benefiting even flush_tlb_ipi etc apart from spinlock. Having
a properly tuned value for all types of workload, (+load) is really
complicated.
Coming back to ple_handler, IMHO, if we have slight increase in
run_queue length, having directed yield may worsen the scenario.

(In the case Vinod explained, even though we will succeed in setting the
other vcpu task as next_buddy, the caller itself gets scheduled out, so the
ganging effect reduces. On top of this we always have the question: have
we chosen the right guy or a really bad guy for yielding?)




On a related note, I wonder if we should increase the ple_gap
significantly.


Did you mean ple_window?



After all, 4096 cycles of spinning is not that much, when you
consider how much time is spent doing the subsequent vmexit,
scanning the other VCPU's status (200 cycles per cache miss),
deciding what to do, maybe poking another CPU, and eventually
a vmenter.

A factor 4 increase in ple_gap might be what it takes to
get the amount of time spent spinning equal to the amount of
time spent on the host side doing KVM stuff...



I agree, I am experimenting with all these things left and right, along
with several optimization ideas I have. Hope to come back on the
experiments soon.



Re: KVM call agenda for Tuesday, July 3rd

2012-07-02 Thread 王永博
Does KVM have functionality similar to VMsafe for developing security
software for virtualization?

2012/7/2 Juan Quintela quint...@redhat.com:

 Hi

 Please send in any agenda items you are interested in covering.

 Later, Juan.


Re: RFD: virtio balloon API use (was Re: [PATCH 5 of 5] virtio: expose added descriptors immediately)

2012-07-02 Thread Rusty Russell
On Mon, 2 Jul 2012 13:08:19 -0300, Rafael Aquini aqu...@redhat.com wrote:
 As 'locking in balloon', may I assume the approach I took for the compaction 
 case
 is OK and aligned to address these concerns of yours? If not, do not hesitate 
 in
 giving me your thoughts, please. I'm respinning a V3 series to address a 
 couple
 of extra nitpicks from the compaction standpoint, and I'd love to be able to
 address any extra concern you might have on the balloon side of that work.

It's orthogonal, though looks like they clash textually :(

I'll re-spin MST's patch on top of yours, and include both in my tree,
otherwise linux-next will have to do the merge.  But I'll await your
push before pushing to Linus next merge window.

Thanks,
Rusty.


Re: [PATCH] msi/msix: added API to set MSI message address and data

2012-07-02 Thread Jan Kiszka
On 2012-07-02 06:28, Alexey Kardashevskiy wrote:
 Ping?
 
 
 On 22/06/12 11:15, Alexey Kardashevskiy wrote:
 Added (msi|msix)_set_message() function for whoever might
 want to use them.

 Currently msi_notify()/msix_notify() write to these vectors to
 signal the guest about an interrupt, so the correct values have to
 be written there by the guest or by QEMU.

 For example, POWER guest never initializes MSI/MSIX vectors, instead
 it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
 POWER we have to initialize MSI/MSIX message from QEMU.
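
 For illustration, a machine-specific hypercall path (for instance an RTAS
 MSI setup handler on POWER) could then program a vector from QEMU roughly
 like this; the function name and the address/data values below are
 placeholders, not something defined by this patch:

     static void example_program_vector(PCIDevice *pdev, int vector)
     {
         MSIMessage msg = {
             .address = 0xfee00000ULL,   /* placeholder address */
             .data    = 0x4021,          /* placeholder data    */
         };

         if (msix_enabled(pdev)) {
             msix_set_message(pdev, vector, msg);
         } else if (msi_enabled(pdev)) {
             msi_set_message(pdev, msg);
         }
     }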

 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 ---
  hw/msi.c  |   13 +
  hw/msi.h  |1 +
  hw/msix.c |9 +
  hw/msix.h |2 ++
  4 files changed, 25 insertions(+)

 diff --git a/hw/msi.c b/hw/msi.c
 index 5233204..cc6102f 100644
 --- a/hw/msi.c
 +++ b/hw/msi.c
 @@ -105,6 +105,19 @@ static inline uint8_t msi_pending_off(const PCIDevice* 
 dev, bool msi64bit)
  return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : 
 PCI_MSI_PENDING_32);
  }
  
 +void msi_set_message(PCIDevice *dev, MSIMessage msg)
 +{
 +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
 +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
 +
 +    if (msi64bit) {
 +        pci_set_quad(dev->config + msi_address_lo_off(dev), msg.address);
 +    } else {
 +        pci_set_long(dev->config + msi_address_lo_off(dev), msg.address);
 +    }
 +    pci_set_word(dev->config + msi_data_off(dev, msi64bit), msg.data);
 +}
 +
  bool msi_enabled(const PCIDevice *dev)
  {
      return msi_present(dev) &&
 diff --git a/hw/msi.h b/hw/msi.h
 index 75747ab..6ec1f99 100644
 --- a/hw/msi.h
 +++ b/hw/msi.h
 @@ -31,6 +31,7 @@ struct MSIMessage {
  
  extern bool msi_supported;
  
 +void msi_set_message(PCIDevice *dev, MSIMessage msg);
  bool msi_enabled(const PCIDevice *dev);
  int msi_init(struct PCIDevice *dev, uint8_t offset,
   unsigned int nr_vectors, bool msi64bit, bool 
 msi_per_vector_mask);
 diff --git a/hw/msix.c b/hw/msix.c
 index ded3c55..5f7d6d3 100644
 --- a/hw/msix.c
 +++ b/hw/msix.c
 @@ -45,6 +45,15 @@ static MSIMessage msix_get_message(PCIDevice *dev, 
 unsigned vector)
  return msg;
  }
  
 +void msix_set_message(PCIDevice *dev, int vector, struct MSIMessage msg)
 +{
 +    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
 +
 +    pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR, msg.address);
 +    pci_set_long(table_entry + PCI_MSIX_ENTRY_DATA, msg.data);
 +    table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
 +}
 +
  /* Add MSI-X capability to the config space for the device. */
  /* Given a bar and its size, add MSI-X table on top of it
   * and fill MSI-X capability in the config space.
 diff --git a/hw/msix.h b/hw/msix.h
 index 50aee82..26a437e 100644
 --- a/hw/msix.h
 +++ b/hw/msix.h
 @@ -4,6 +4,8 @@
  #include qemu-common.h
  #include pci.h
  
 +void msix_set_message(PCIDevice *dev, int vector, MSIMessage msg);
 +
  int msix_init(PCIDevice *pdev, unsigned short nentries,
MemoryRegion *bar,
unsigned bar_nr, unsigned bar_size);
 
 

Acked-by: Jan Kiszka jan.kis...@siemens.com

-- 
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux




[PATCH 0/8] KVM: Optimize MMU notifier's THP page invalidation -v4

2012-07-02 Thread Takuya Yoshikawa
v3->v4: Resolved trace_kvm_age_page() issue -- patch 6, 7
v2->v3: Fixed intersection calculations. -- patch 3, 8

Takuya

Takuya Yoshikawa (8):
  KVM: MMU: Use __gfn_to_rmap() to clean up kvm_handle_hva()
  KVM: Introduce hva_to_gfn_memslot() for kvm_handle_hva()
  KVM: MMU: Make kvm_handle_hva() handle range of addresses
  KVM: Introduce kvm_unmap_hva_range() for 
kvm_mmu_notifier_invalidate_range_start()
  KVM: Separate rmap_pde from kvm_lpage_info-write_count
  KVM: MMU: Add memslot parameter to hva handlers
  KVM: MMU: Push trace_kvm_age_page() into kvm_age_rmapp()
  KVM: MMU: Avoid handling same rmap_pde in kvm_handle_hva_range()

 arch/powerpc/include/asm/kvm_host.h |2 +
 arch/powerpc/kvm/book3s_64_mmu_hv.c |   47 ---
 arch/x86/include/asm/kvm_host.h |3 +-
 arch/x86/kvm/mmu.c  |  107 +++
 arch/x86/kvm/x86.c  |   11 
 include/linux/kvm_host.h|8 +++
 virt/kvm/kvm_main.c |3 +-
 7 files changed, 131 insertions(+), 50 deletions(-)


From v2:

The new test result was impressively good, see below, and THP page
invalidation was more than 5 times faster on my x86 machine.

Before:
  ...
  19.852 us  |  __mmu_notifier_invalidate_range_start();
  28.033 us  |  __mmu_notifier_invalidate_range_start();
  19.066 us  |  __mmu_notifier_invalidate_range_start();
  44.715 us  |  __mmu_notifier_invalidate_range_start();
  31.613 us  |  __mmu_notifier_invalidate_range_start();
  20.659 us  |  __mmu_notifier_invalidate_range_start();
  19.979 us  |  __mmu_notifier_invalidate_range_start();
  20.416 us  |  __mmu_notifier_invalidate_range_start();
  20.632 us  |  __mmu_notifier_invalidate_range_start();
  22.316 us  |  __mmu_notifier_invalidate_range_start();
  ...

After:
  ...
  4.089 us   |  __mmu_notifier_invalidate_range_start();
  4.096 us   |  __mmu_notifier_invalidate_range_start();
  3.560 us   |  __mmu_notifier_invalidate_range_start();
  3.376 us   |  __mmu_notifier_invalidate_range_start();
  3.772 us   |  __mmu_notifier_invalidate_range_start();
  3.353 us   |  __mmu_notifier_invalidate_range_start();
  3.332 us   |  __mmu_notifier_invalidate_range_start();
  3.332 us   |  __mmu_notifier_invalidate_range_start();
  3.332 us   |  __mmu_notifier_invalidate_range_start();
  3.337 us   |  __mmu_notifier_invalidate_range_start();
  ...


[PATCH 1/8] KVM: MMU: Use __gfn_to_rmap() to clean up kvm_handle_hva()

2012-07-02 Thread Takuya Yoshikawa
We can treat every level uniformly.

Signed-off-by: Takuya Yoshikawa yoshikawa.tak...@oss.ntt.co.jp
---
 arch/x86/kvm/mmu.c |   12 ++--
 1 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3b53d9e..d3e7e6a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1205,14 +1205,14 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned 
long hva,
 		gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
 		gfn_t gfn = memslot->base_gfn + gfn_offset;
 
-		ret = handler(kvm, &memslot->rmap[gfn_offset], data);
+		ret = 0;
 
-		for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
-			struct kvm_lpage_info *linfo;
+		for (j = PT_PAGE_TABLE_LEVEL;
+		     j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
+			unsigned long *rmapp;
 
-			linfo = lpage_info_slot(gfn, memslot,
-						PT_DIRECTORY_LEVEL + j);
-			ret |= handler(kvm, &linfo->rmap_pde, data);
+			rmapp = __gfn_to_rmap(gfn, j, memslot);
+			ret |= handler(kvm, rmapp, data);
 		}
 		trace_kvm_age_page(hva, memslot, ret);
 		retval |= ret;
-- 
1.7.5.4



[PATCH 2/8] KVM: Introduce hva_to_gfn_memslot() for kvm_handle_hva()

2012-07-02 Thread Takuya Yoshikawa
This restricts hva handling in mmu code and makes it easier to extend
kvm_handle_hva() so that it can treat a range of addresses later in this
patch series.

Signed-off-by: Takuya Yoshikawa yoshikawa.tak...@oss.ntt.co.jp
Cc: Alexander Graf ag...@suse.de
Cc: Paul Mackerras pau...@samba.org
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c |6 +++---
 arch/x86/kvm/mmu.c  |3 +--
 include/linux/kvm_host.h|8 
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index d03eb6f..3703755 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -772,10 +772,10 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long 
hva,
 
 		end = start + (memslot->npages << PAGE_SHIFT);
 		if (hva >= start && hva < end) {
-			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+			gfn_t gfn = hva_to_gfn_memslot(hva, memslot);
+			gfn_t gfn_offset = gfn - memslot->base_gfn;
 
-			ret = handler(kvm, &memslot->rmap[gfn_offset],
-				      memslot->base_gfn + gfn_offset);
+			ret = handler(kvm, &memslot->rmap[gfn_offset], gfn);
retval |= ret;
}
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d3e7e6a..b898bec 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1202,8 +1202,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long 
hva,
 
 		end = start + (memslot->npages << PAGE_SHIFT);
 		if (hva >= start && hva < end) {
-			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
-			gfn_t gfn = memslot->base_gfn + gfn_offset;
+			gfn_t gfn = hva_to_gfn_memslot(hva, memslot);
 
ret = 0;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c7f7787..c0dd2e9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -740,6 +740,14 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t 
base_gfn, int level)
 	       (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 }
 
+static inline gfn_t
+hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
+{
+	gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT;
+
+	return slot->base_gfn + gfn_offset;
+}
+
 static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
   gfn_t gfn)
 {
-- 
1.7.5.4



[PATCH 3/8] KVM: MMU: Make kvm_handle_hva() handle range of addresses

2012-07-02 Thread Takuya Yoshikawa
When guest's memory is backed by THP pages, MMU notifier needs to call
kvm_unmap_hva(), which in turn leads to kvm_handle_hva(), in a loop to
invalidate a range of pages which constitute one huge page:

  for each page
for each memslot
  if page is in memslot
unmap using rmap

This means although every page in that range is expected to be found in
the same memslot, we are forced to check unrelated memslots many times.
If the guest has more memslots, the situation will become worse.

Furthermore, if the range does not include any pages in the guest's
memory, the loop over the pages will just consume extra time.

This patch, together with the following patches, solves this problem by
introducing kvm_handle_hva_range() which makes the loop look like this:

  for each memslot
for each page in memslot
  unmap using rmap

In this new processing, the actual work is converted to a loop over rmap
which is much more cache friendly than before.

Signed-off-by: Takuya Yoshikawa yoshikawa.tak...@oss.ntt.co.jp
Cc: Alexander Graf ag...@suse.de
Cc: Paul Mackerras pau...@samba.org
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c |   36 ++--
 arch/x86/kvm/mmu.c  |   44 ++
 2 files changed, 61 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 3703755..1a470bc 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -756,9 +756,12 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, 
struct kvm_vcpu *vcpu,
goto out_put;
 }
 
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
-unsigned long gfn))
+static int kvm_handle_hva_range(struct kvm *kvm,
+   unsigned long start,
+   unsigned long end,
+   int (*handler)(struct kvm *kvm,
+  unsigned long *rmapp,
+  unsigned long gfn))
 {
int ret;
int retval = 0;
@@ -767,12 +770,22 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long 
hva,
 
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
-		unsigned long start = memslot->userspace_addr;
-		unsigned long end;
+		unsigned long hva_start, hva_end;
+		gfn_t gfn, gfn_end;
+
+		hva_start = max(start, memslot->userspace_addr);
+		hva_end = min(end, memslot->userspace_addr +
+					(memslot->npages << PAGE_SHIFT));
+		if (hva_start >= hva_end)
+			continue;
+		/*
+		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
+		 * {gfn, gfn+1, ..., gfn_end-1}.
+		 */
+		gfn = hva_to_gfn_memslot(hva_start, memslot);
+		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 
-		end = start + (memslot->npages << PAGE_SHIFT);
-		if (hva >= start && hva < end) {
-			gfn_t gfn = hva_to_gfn_memslot(hva, memslot);
+		for (; gfn < gfn_end; ++gfn) {
 			gfn_t gfn_offset = gfn - memslot->base_gfn;
 
 			ret = handler(kvm, &memslot->rmap[gfn_offset], gfn);
@@ -783,6 +796,13 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long 
hva,
return retval;
 }
 
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+unsigned long gfn))
+{
+   return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
+}
+
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
   unsigned long gfn)
 {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b898bec..6903568 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1183,10 +1183,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned 
long *rmapp,
return 0;
 }
 
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- unsigned long data,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
-unsigned long data))
+static int kvm_handle_hva_range(struct kvm *kvm,
+   unsigned long start,
+   unsigned long end,
+   unsigned long data,
+   int (*handler)(struct kvm *kvm,
+  unsigned long *rmapp,
+  unsigned long data))
 {
int j;
int ret;
@@ 

[PATCH 4/8] KVM: Introduce kvm_unmap_hva_range() for kvm_mmu_notifier_invalidate_range_start()

2012-07-02 Thread Takuya Yoshikawa
When we tested KVM under memory pressure, with THP enabled on the host,
we noticed that MMU notifier took a long time to invalidate huge pages.

Since the invalidation was done with mmu_lock held, it not only wasted
the CPU but also made the host harder to respond.

This patch mitigates this by using kvm_handle_hva_range().

Signed-off-by: Takuya Yoshikawa yoshikawa.tak...@oss.ntt.co.jp
Cc: Alexander Graf ag...@suse.de
Cc: Paul Mackerras pau...@samba.org
---
 arch/powerpc/include/asm/kvm_host.h |2 ++
 arch/powerpc/kvm/book3s_64_mmu_hv.c |7 +++
 arch/x86/include/asm/kvm_host.h |1 +
 arch/x86/kvm/mmu.c  |5 +
 virt/kvm/kvm_main.c |3 +--
 5 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 50ea12f..572ad01 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -52,6 +52,8 @@
 
 struct kvm;
 extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+extern int kvm_unmap_hva_range(struct kvm *kvm,
+  unsigned long start, unsigned long end);
 extern int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 1a470bc..3c635c0 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -870,6 +870,13 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
return 0;
 }
 
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long 
end)
+{
+	if (kvm->arch.using_mmu_notifiers)
+   kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
+   return 0;
+}
+
 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 unsigned long gfn)
 {
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 24b7647..5aab8d4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -942,6 +942,7 @@ extern bool kvm_rebooting;
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long 
end);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6903568..a5f0200 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1248,6 +1248,11 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
 }
 
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long 
end)
+{
+   return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+}
+
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 {
kvm_handle_hva(kvm, hva, (unsigned long)pte, kvm_set_pte_rmapp);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 636bd08..c597d00 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -332,8 +332,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct 
mmu_notifier *mn,
 * count is also read inside the mmu_lock critical section.
 */
 	kvm->mmu_notifier_count++;
-	for (; start < end; start += PAGE_SIZE)
-		need_tlb_flush |= kvm_unmap_hva(kvm, start);
+	need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
 	need_tlb_flush |= kvm->tlbs_dirty;
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
-- 
1.7.5.4



  1   2   >