Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-14 Thread Avi Kivity

On 09/13/2011 10:21 PM, Don Zickus wrote:

Or are you saying an NMI in an idle system will have the same %rip thus
falsely detecting a back-to-back NMI?




That's easy to avoid - insert an instruction zeroing the last nmi_rip 
somewhere before or after hlt.  It's always okay to execute such an 
instruction (outside the nmi handler itself), since nmi_rip is meant to 
detect a "no instructions executed" condition.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] kvm tools: Use kernel dhcp for network autoconfiguration

2011-09-14 Thread Sasha Levin
This patch removes the manual/usermode dhcp client configuration and instead
uses the DHCP client built within the kernel.

Since this client is tightly integrated with NFS (if NFS config is set), we
will add a specific NFS root addr in our DHCP offer to point it to a
non-existent address so that we won't hang trying to poke it for our root.

Signed-off-by: Sasha Levin levinsasha...@gmail.com
---
 tools/kvm/builtin-run.c |2 +-
 tools/kvm/guest/init.c  |4 
 tools/kvm/guest/setnet.sh   |   22 --
 tools/kvm/include/kvm/uip.h |2 ++
 tools/kvm/net/uip/dhcp.c|8 
 5 files changed, 11 insertions(+), 27 deletions(-)
 delete mode 100755 tools/kvm/guest/setnet.sh

diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c
index 591fd77..465bbe7 100644
--- a/tools/kvm/builtin-run.c
+++ b/tools/kvm/builtin-run.c
@@ -764,7 +764,7 @@ int kvm_cmd_run(int argc, const char **argv, const char 
*prefix)
if (using_rootfs) {
strcat(real_cmdline,  root=/dev/root rw 
rootflags=rw,trans=virtio,version=9p2000.L rootfstype=9p);
if (custom_rootfs)
-   strcat(real_cmdline,  init=/virt/init);
+   strcat(real_cmdline,  init=/virt/init ip=dhcp);
} else if (!strstr(real_cmdline, root=)) {
strlcat(real_cmdline,  root=/dev/vda rw , 
sizeof(real_cmdline));
}
diff --git a/tools/kvm/guest/init.c b/tools/kvm/guest/init.c
index 7733026..837acfb 100644
--- a/tools/kvm/guest/init.c
+++ b/tools/kvm/guest/init.c
@@ -30,10 +30,6 @@ int main(int argc, char *argv[])
 
do_mounts();
 
-   puts(Setting up network...);
-
-   system(/bin/sh virt/setnet.sh);
-
puts(Starting '/bin/sh'...);
 
run_process(/bin/sh);
diff --git a/tools/kvm/guest/setnet.sh b/tools/kvm/guest/setnet.sh
deleted file mode 100755
index 3da9c22..000
--- a/tools/kvm/guest/setnet.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-for f in /sys/class/net/*; do
-   type=`cat $f/type`
-   if [ $type -eq 1 ]; then
-   f=${f#/sys/class/net/}
-
-   eval dhcpcd -A $f 2 /dev/null
-   if [ $? -eq 0 ]; then
-   exit
-   fi
-
-   eval dhclient $f 2 /dev/null
-   if [ $? -eq 0 ]; then
-   exit
-   fi
-
-   ifconfig $f 192.168.33.15
-   route add default 192.168.33.1
-   echo nameserver 8.8.8.8  /etc/resolv.conf
-
-   exit
-   fi
-done
diff --git a/tools/kvm/include/kvm/uip.h b/tools/kvm/include/kvm/uip.h
index 344ec09..3501d36 100644
--- a/tools/kvm/include/kvm/uip.h
+++ b/tools/kvm/include/kvm/uip.h
@@ -58,6 +58,8 @@
 #define UIP_DHCP_TAG_SUBMASK_LEN   4
 #define UIP_DHCP_TAG_ROUTER3
 #define UIP_DHCP_TAG_ROUTER_LEN4
+#define UIP_DHCP_TAG_ROOT  17
+#define UIP_DHCP_TAG_ROOT_LEN  4
 #define UIP_DHCP_TAG_DNS_SERVER6
 #define UIP_DHCP_TAG_DNS_SERVER_LEN4
 #define UIP_DHCP_TAG_DOMAIN_NAME   15
diff --git a/tools/kvm/net/uip/dhcp.c b/tools/kvm/net/uip/dhcp.c
index bd3c53b..e91a7c7 100644
--- a/tools/kvm/net/uip/dhcp.c
+++ b/tools/kvm/net/uip/dhcp.c
@@ -2,6 +2,8 @@
 
 #include arpa/inet.h
 
+#define EMPTY_ADDR 0.0.0.0
+
 static inline bool uip_dhcp_is_discovery(struct uip_dhcp *dhcp)
 {
return (dhcp-option[2] == UIP_DHCP_DISCOVER 
@@ -127,6 +129,12 @@ static int uip_dhcp_fill_option(struct uip_info *info, 
struct uip_dhcp *dhcp, in
*addr   = htonl(info-host_ip);
i   += UIP_DHCP_TAG_ROUTER_LEN;
 
+   opt[i++]= UIP_DHCP_TAG_ROOT;
+   opt[i++]= strlen(EMPTY_ADDR);
+   addr= (u32 *)opt[i];
+   strncpy((void *) addr, EMPTY_ADDR, strlen(EMPTY_ADDR));
+   i   += strlen(EMPTY_ADDR);
+
i   = uip_dhcp_fill_option_name_and_server(info, opt, i);
 
opt[i++]= UIP_DHCP_TAG_END;
-- 
1.7.6.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] kvm tools: Don't use i8042 AUX port

2011-09-14 Thread Sasha Levin
We currently don't have sufficient mouse support, so this patch disables
it by default to prevent the delay when booting.

It should be removed once sufficient mouse support is added.

Signed-off-by: Sasha Levin levinsasha...@gmail.com
---
 tools/kvm/builtin-run.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c
index 465bbe7..2795115 100644
--- a/tools/kvm/builtin-run.c
+++ b/tools/kvm/builtin-run.c
@@ -736,7 +736,8 @@ int kvm_cmd_run(int argc, const char **argv, const char 
*prefix)
vidmode = 0;
 
memset(real_cmdline, 0, sizeof(real_cmdline));
-   strcpy(real_cmdline, notsc noapic noacpi pci=conf1 reboot=k panic=1 
i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1);
+   strcpy(real_cmdline, notsc noapic noacpi pci=conf1 reboot=k panic=1 
i8042.direct=1 
+   i8042.dumbkbd=1 i8042.nopnp=1 i8042.noaux=1);
if (vnc || sdl) {
strcat(real_cmdline,  video=vesafb console=tty0);
} else
-- 
1.7.6.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] kvm tools: Don't use i8042 AUX port

2011-09-14 Thread Pekka Enberg
On Wed, Sep 14, 2011 at 10:11 AM, Sasha Levin levinsasha...@gmail.com wrote:
 We currently don't have sufficient support for mouse, this patch disables
 it by default to prevent the delay when booting.

 It should be removed once sufficient mouse support is added.

 Signed-off-by: Sasha Levin levinsasha...@gmail.com
 ---
  tools/kvm/builtin-run.c |    3 ++-
  1 files changed, 2 insertions(+), 1 deletions(-)

 diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c
 index 465bbe7..2795115 100644
 --- a/tools/kvm/builtin-run.c
 +++ b/tools/kvm/builtin-run.c
 @@ -736,7 +736,8 @@ int kvm_cmd_run(int argc, const char **argv, const char 
 *prefix)
                vidmode = 0;

        memset(real_cmdline, 0, sizeof(real_cmdline));
 -       strcpy(real_cmdline, notsc noapic noacpi pci=conf1 reboot=k panic=1 
 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1);
 +       strcpy(real_cmdline, notsc noapic noacpi pci=conf1 reboot=k panic=1 
 i8042.direct=1 
 +                               i8042.dumbkbd=1 i8042.nopnp=1 
 i8042.noaux=1);
        if (vnc || sdl) {
                strcat(real_cmdline,  video=vesafb console=tty0);
        } else

What's the problem? IIRC mouse works just fine in VNC mode?
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] kvm tools: Don't use i8042 AUX port

2011-09-14 Thread Sasha Levin
On Wed, 2011-09-14 at 10:14 +0300, Pekka Enberg wrote:
 On Wed, Sep 14, 2011 at 10:11 AM, Sasha Levin levinsasha...@gmail.com wrote:
  We currently don't have sufficient support for mouse, this patch disables
  it by default to prevent the delay when booting.
 
  It should be removed once sufficient mouse support is added.
 
  Signed-off-by: Sasha Levin levinsasha...@gmail.com
  ---
   tools/kvm/builtin-run.c |3 ++-
   1 files changed, 2 insertions(+), 1 deletions(-)
 
  diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c
  index 465bbe7..2795115 100644
  --- a/tools/kvm/builtin-run.c
  +++ b/tools/kvm/builtin-run.c
  @@ -736,7 +736,8 @@ int kvm_cmd_run(int argc, const char **argv, const char 
  *prefix)
 vidmode = 0;
 
 memset(real_cmdline, 0, sizeof(real_cmdline));
  -   strcpy(real_cmdline, notsc noapic noacpi pci=conf1 reboot=k 
  panic=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1);
  +   strcpy(real_cmdline, notsc noapic noacpi pci=conf1 reboot=k 
  panic=1 i8042.direct=1 
  +   i8042.dumbkbd=1 i8042.nopnp=1 
  i8042.noaux=1);
 if (vnc || sdl) {
 strcat(real_cmdline,  video=vesafb console=tty0);
 } else
 
 What's the problem? IIRC mouse works just fine in VNC mode?

The problem is that it causes a pretty long delay (~5-6 sec) during
boot.

VNC mouse could use some more love before I'd say it's good; one example
is that the host mouse and guest mouse aren't synced.

-- 

Sasha.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] kvm tools: Don't use i8042 AUX port

2011-09-14 Thread Sasha Levin
On Wed, 2011-09-14 at 10:21 +0300, Pekka Enberg wrote:
 On 9/14/11 10:16 AM, Sasha Levin wrote:
  The problem is that it causes a pretty long delay (~5-6 sec) during
  boot.
 
  VNC mouse could use some more love before I'd say it's good, one example
  that the host mouse and guest mouse aren't synced.
 
 Maybe but it's likely good enough for the people who are actually using 
 it. We can't just disable it silently!
 
 So I think we should enable AUX if user asks for VNC, no?

Yup, let's make it disabled only when !vnc and !sdl.

-- 

Sasha.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] kvm tools: Don't use i8042 AUX port

2011-09-14 Thread Pekka Enberg

On 9/14/11 10:16 AM, Sasha Levin wrote:

The problem is that it causes a pretty long delay (~5-6 sec) during
boot.

VNC mouse could use some more love before I'd say it's good, one example
that the host mouse and guest mouse aren't synced.


Maybe but it's likely good enough for the people who are actually using 
it. We can't just disable it silently!


So I think we should enable AUX if user asks for VNC, no?

Pekka
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/2] kvm tools: Don't use i8042 AUX port

2011-09-14 Thread Sasha Levin
We are enabling i8042 even without VNC or SDL so that we could use its
reset method to reboot the guest.

AUX port might cause delays during boot. Disable it if the user didn't ask
for VNC or SDL.

Signed-off-by: Sasha Levin levinsasha...@gmail.com
---
 tools/kvm/builtin-run.c |5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c
index 465bbe7..b2f17ea 100644
--- a/tools/kvm/builtin-run.c
+++ b/tools/kvm/builtin-run.c
@@ -736,11 +736,12 @@ int kvm_cmd_run(int argc, const char **argv, const char 
*prefix)
vidmode = 0;
 
memset(real_cmdline, 0, sizeof(real_cmdline));
-   strcpy(real_cmdline, notsc noapic noacpi pci=conf1 reboot=k panic=1 
i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1);
+   strcpy(real_cmdline, notsc noapic noacpi pci=conf1 reboot=k panic=1 
i8042.direct=1 
+   i8042.dumbkbd=1 i8042.nopnp=1);
if (vnc || sdl) {
strcat(real_cmdline,  video=vesafb console=tty0);
} else
-   strcat(real_cmdline,  console=ttyS0 earlyprintk=serial);
+   strcat(real_cmdline,  console=ttyS0 earlyprintk=serial 
i8042.noaux=1);
strcat(real_cmdline,  );
if (kernel_cmdline)
strlcat(real_cmdline, kernel_cmdline, sizeof(real_cmdline));
-- 
1.7.6.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] KVM: x86: Simplify kvm timer handler

2011-09-14 Thread Jan Kiszka
The vcpu reference of a kvm_timer can't become NULL while the timer is
valid, so drop this redundant test. This also makes it pointless to
carry a separate __kvm_timer_fn, fold it into kvm_timer_fn.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---
 arch/x86/kvm/timer.c |   26 --
 1 files changed, 4 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index ae432ea..6b85cc6 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -18,9 +18,10 @@
 #include linux/atomic.h
 #include kvm_timer.h
 
-static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
 {
-   int restart_timer = 0;
+   struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+   struct kvm_vcpu *vcpu = ktimer-vcpu;
wait_queue_head_t *q = vcpu-wq;
 
/*
@@ -40,26 +41,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct 
kvm_timer *ktimer)
 
if (ktimer-t_ops-is_periodic(ktimer)) {
hrtimer_add_expires_ns(ktimer-timer, ktimer-period);
-   restart_timer = 1;
-   }
-
-   return restart_timer;
-}
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
-{
-   int restart_timer;
-   struct kvm_vcpu *vcpu;
-   struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
-
-   vcpu = ktimer-vcpu;
-   if (!vcpu)
-   return HRTIMER_NORESTART;
-
-   restart_timer = __kvm_timer_fn(vcpu, ktimer);
-   if (restart_timer)
return HRTIMER_RESTART;
-   else
+   } else
return HRTIMER_NORESTART;
 }
-
-- 
1.7.3.4
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] pci: clean all funcs when hot-removing multifunc device

2011-09-14 Thread Amos Kong
- Original Message -
 (2011/09/14 13:55), Amos Kong wrote:
  'slot-funcs' is initialized in acpiphp_glue.c:register_slot()
  before
  hotpluging device, and only one entry(func 0) is added to it,
  no new entry will be added to the list when hotpluging devices to
  the slot.
 
 I guess your hotplug slot has only one device object (for func#0)
 in ACPI Namespace (DSDT), and guess this is why there is only one
 entry in the 'slot-funcs'. If so, what about adding device objects
 for function 1-7 to ACPI Namespace? I think most of bare-metal
 environments have such definition in ACPI Namespace. For example:

Hi Kaneshige,

I did some testing: fixing the ACPI tables resolves this problem;
register_slot() is then executed for all funcs,
and each func has an entry in slot->funcs.
I will send a patch to seabios.

Thanks a lot!
Amos

 Device (P2P) { // PCI to PCI bridge
 Name (_ADR, ...) // PCI address
 Name (_HPP, ...) // Hot Plug parameter
 ...
 Device (S0F0) { // For function 0
 Name (_ADR, ...)
 Name (_SUN, ...)
 Method (_EJ0, ...)
 }
 Device (S0F1) { // For function 1
 ...
 }
 ...
 Device (S0F7) { // For function 7
 ...
 }
 }
 
 Regards,
 Kenji Kaneshige
 
 
  When we release the whole device, there is only one entry in the
  list,
  this causes func1~7 could not be released.
  I try to add entries for all hotpluged device in enable_device(),
  but
  it doesn't work, because 'slot-funcs' is used in many place which
  we only
  need to process func 0. This patch just try to clean all funcs in
  disable_device().
 
  drivers/pci/hotplug/acpiphp_glue.c:
  static int disable_device(struct acpiphp_slot *slot) {
  list_for_each_entry(func,slot-funcs, sibling) {
  pdev = pci_get_slot(slot-bridge-pci_bus,
 PCI_DEVFN(slot-device, func-function));
  ..clean code.. // those code can only be executed one time(func 
  0)
   pci_remove_bus_device(pdev);
  ---
  pci_bus_add_device() is called for each func device in
  acpiphp_glue.c:enable_device().
  pci_remove_bus_device(pdev) is only called for func 0 in
  acpiphp_glue.c:disable_device().
 
  Boot up a KVM guest, hotplug a multifunc device(8 funcs), we can
  find it in the guest.
  @ ls /dev/vd*
  vda vdb vdc vde vdf vdg vdh
  @ lspci
  00:06.0 SCSI storage controller: Red Hat, Inc Virtio block
  device
  ...
  00:06.7 SCSI storage controller: Red Hat, Inc Virtio block
  device
 
  But func 1~7 still exist in guest after hot-removing the multifunc
  device through qemu monitor.
  @ lspci (00:06.0 disappeared)
  00:06.1 SCSI storage controller: Red Hat, Inc Virtio block
  device (rev ff)
  ...
  00:06.7 SCSI storage controller: Red Hat, Inc Virtio block
  device (rev ff)

  
  @ ls /dev/vd*
  vdb vdc vde vdf vdg vdh
  @ mkfs /dev/vdb
  INFO: task mkfs.ext2:1784 blocked for more than 120 seconds.
  (task hung)
 
  Hotpluging multifunc of WinXp is fine.
 
  Signed-off-by: Amos Kongak...@redhat.com
  ---
drivers/pci/hotplug/acpiphp_glue.c | 27
++-
1 files changed, 18 insertions(+), 9 deletions(-)
 
  diff --git a/drivers/pci/hotplug/acpiphp_glue.c
  b/drivers/pci/hotplug/acpiphp_glue.c
  index a70fa89..3b86d1a 100644
  --- a/drivers/pci/hotplug/acpiphp_glue.c
  +++ b/drivers/pci/hotplug/acpiphp_glue.c
  @@ -880,6 +880,8 @@ static int disable_device(struct acpiphp_slot
  *slot)
{
  struct acpiphp_func *func;
  struct pci_dev *pdev;
  + struct pci_bus *bus = slot-bridge-pci_bus;
  + int i, num = 1;
 
  /* is this slot already disabled? */
  if (!(slot-flags SLOT_ENABLED))
  @@ -893,16 +895,23 @@ static int disable_device(struct acpiphp_slot
  *slot)
  func-bridge = NULL;
  }
 
  - pdev = pci_get_slot(slot-bridge-pci_bus,
  - PCI_DEVFN(slot-device, func-function));
  - if (pdev) {
  - pci_stop_bus_device(pdev);
  - if (pdev-subordinate) {
  - disable_bridges(pdev-subordinate);
  - pci_disable_device(pdev);
  + pdev = pci_scan_single_device(bus,
  + PCI_DEVFN(slot-device, 0));
  + if (!pdev)
  + goto err_exit;
  + if (pdev-multifunction == 1)
  + num = 8;
  + for (i=0; inum; i++) {
  + pdev = pci_get_slot(bus, PCI_DEVFN(slot-device, i));
  + if (pdev) {
  + pci_stop_bus_device(pdev);
  + if (pdev-subordinate) {
  + disable_bridges(pdev-subordinate);
  + pci_disable_device(pdev);
  + }
  + pci_remove_bus_device(pdev);
  + pci_dev_put(pdev);
  }
  - pci_remove_bus_device(pdev);
  - pci_dev_put(pdev);
  }
  }
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm tools: Fix 32bit build errors

2011-09-14 Thread Sasha Levin
Signed-off-by: Sasha Levin levinsasha...@gmail.com
---
 tools/kvm/builtin-run.c |   12 ++--
 1 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c
index 591fd77..6234b65 100644
--- a/tools/kvm/builtin-run.c
+++ b/tools/kvm/builtin-run.c
@@ -174,12 +174,12 @@ static int virtio_9p_rootdir_parser(const struct option 
*opt, const char *arg, i
 
 static int shmem_parser(const struct option *opt, const char *arg, int unset)
 {
-   const uint64_t default_size = SHMEM_DEFAULT_SIZE;
-   const uint64_t default_phys_addr = SHMEM_DEFAULT_ADDR;
+   const u64 default_size = SHMEM_DEFAULT_SIZE;
+   const u64 default_phys_addr = SHMEM_DEFAULT_ADDR;
const char *default_handle = SHMEM_DEFAULT_HANDLE;
struct shmem_info *si = malloc(sizeof(struct shmem_info));
-   uint64_t phys_addr;
-   uint64_t size;
+   u64 phys_addr;
+   u64 size;
char *handle = NULL;
int create = 0;
const char *p = arg;
@@ -282,8 +282,8 @@ static int shmem_parser(const struct option *opt, const 
char *arg, int unset)
strcpy(handle, default_handle);
}
if (verbose) {
-   pr_info(shmem: phys_addr = %lx, phys_addr);
-   pr_info(shmem: size  = %lx, size);
+   pr_info(shmem: phys_addr = %llx, phys_addr);
+   pr_info(shmem: size  = %llx, size);
pr_info(shmem: handle= %s, handle);
pr_info(shmem: create= %d, create);
}
-- 
1.7.6.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 03/11] KVM: x86: retry non-page-table writing instruction

2011-09-14 Thread Avi Kivity

On 09/13/2011 09:24 PM, Xiao Guangrong wrote:


  +static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
  +  unsigned long cr2,  int emulation_type)
  +{
  +if (!vcpu-arch.mmu.direct_map   !mmu_is_nested(vcpu))
  +gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);

  If mmu_is_nested() cr2 is an ngpa, we have to translate it to a gpa, no?


Yeah, will fix it.

And this bug also exists in the current code: it always uses L2 gpa to emulate
write operation.


Can you please send this fix separately, so it can be backported if needed?


I guess the reason that it is not triggered is: the gpa of L2's shadow page can
not be touched by L2, it means no page table is write-protected by L2.


Yes.  All real guest hypervisors will do that.  But it is technically 
possible for a hypervisor to allow its guest access to the real page tables.



  btw, I don't see mmu.direct_map initialized for nested npt?


nested_svm_vmrun() -  nested_svm_init_mmu_context():
static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
int r;

r = kvm_init_shadow_mmu(vcpu,vcpu-arch.mmu);

vcpu-arch.mmu.set_cr3   = nested_svm_set_tdp_cr3;
vcpu-arch.mmu.get_cr3   = nested_svm_get_tdp_cr3;
vcpu-arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
vcpu-arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
vcpu-arch.mmu.shadow_root_level = get_npt_level();
vcpu-arch.walk_mmu  =vcpu-arch.nested_mmu;

return r;
}

It is initialized in kvm_init_shadow_mmu :-)


Yes, need new eyeglasses.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 05/11] KVM: MMU: do not mark accessed bit on pte write path

2011-09-14 Thread Avi Kivity

On 09/13/2011 09:29 PM, Xiao Guangrong wrote:

On 09/13/2011 06:53 PM, Avi Kivity wrote:
  On 08/30/2011 05:35 AM, Xiao Guangrong wrote:
  In current code, the accessed bit is always set when page fault occurred,
  do not need to set it on pte write path

  What about speculative sptes that are then only accessed via emulation?


The gfn is read and written only via emulation? I think this case is very
very rare?


Probably...

Marcelo? Can you think of another case where spte.accessed is needed?

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 06/11] KVM: MMU: cleanup FNAME(invlpg)

2011-09-14 Thread Avi Kivity

On 09/13/2011 09:31 PM, Xiao Guangrong wrote:

  @@ -675,36 +684,20 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, 
gva_t gva)

sp = page_header(__pa(sptep));
if (is_last_spte(*sptep, level)) {
  -int offset, shift;
  -
if (!sp-unsync)
break;

  -shift = PAGE_SHIFT -
  -  (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
  -offset = sp-role.quadrant   shift;
  -
  -pte_gpa = (sp-gfn   PAGE_SHIFT) + offset;
  +pte_gpa = FNAME(get_first_pte_gpa)(sp);


  Here is can be used for L2 - I think we can use 2MB host pages to back 4MB 
guest mappings.


Only unsync shadow page is fetched here, and its level is always 1.



Right.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 11/11] KVM: MMU: improve write flooding detected

2011-09-14 Thread Avi Kivity

On 09/13/2011 10:19 PM, Xiao Guangrong wrote:


  The spte may not be accessed, but other sptes in the same page can be 
accessed.  An example is the fixmap area for kmap_atomic(), there will be a lot of 
pte writes but other sptes will be accessed without going through soft-mmu at all.

I think this kind of shadow page is mostly the last page table (level=1); maybe
we can skip the write-flooding for the last shadow page, because the last shadow
page can become unsync and it does not leave the page table write-protected.


Yes.



  I think you have to read the parent_ptes-spte.accessed bits to be sure.


I guess the overhead of this approach is a little high:
- it needs to walk parent ptes for every shadow pages
- we need to clear the parent_ptes-spte.accessed bit when the page is written, 
and
   the tlb flush is needed.
no?



Right.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 03/11] KVM: x86: retry non-page-table writing instruction

2011-09-14 Thread Xiao Guangrong
On 09/14/2011 05:53 PM, Avi Kivity wrote:
 On 09/13/2011 09:24 PM, Xiao Guangrong wrote:
 
   +static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
   +  unsigned long cr2,  int emulation_type)
   +{
   +if (!vcpu-arch.mmu.direct_map   !mmu_is_nested(vcpu))
   +gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
 
   If mmu_is_nested() cr2 is an ngpa, we have to translate it to a gpa, no?
 

 Yeah, will fix it.

 And this bug also exists in the current code: it always uses L2 gpa to 
 emulate
 write operation.
 
 Can you please send this fix separately, so it can be backported if needed?
 

Sure, i will do it as soon as possible. :-)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 03/14] KVM: x86 emulator: move memop, memopp into emulation context

2011-09-14 Thread Takuya Yoshikawa
On Tue, 13 Sep 2011 10:45:40 +0300
Avi Kivity a...@redhat.com wrote:

 Simplifies further generalization of decode.
 
 Signed-off-by: Avi Kivity a...@redhat.com
 ---
  arch/x86/include/asm/kvm_emulate.h |2 ++
  arch/x86/kvm/emulate.c |   34 +-
  2 files changed, 19 insertions(+), 17 deletions(-)
 
 diff --git a/arch/x86/include/asm/kvm_emulate.h 
 b/arch/x86/include/asm/kvm_emulate.h
 index 6040d11..56bac3e 100644
 --- a/arch/x86/include/asm/kvm_emulate.h
 +++ b/arch/x86/include/asm/kvm_emulate.h
 @@ -275,6 +275,8 @@ struct x86_emulate_ctxt {
   unsigned long _eip;
   /* Fields above regs are cleared together. */
   unsigned long regs[NR_VCPU_REGS];
 + struct operand memop;
 + struct operand *memopp;
   struct fetch_cache fetch;
   struct read_cache io_read;
   struct read_cache mem_read;

Once the emulator context gets stabilized, some comments will be nice
to know which ones are supposed to be accessed from outside of the
emulator, and which ones are only for the emulator internal usage.

Practically, knowing each member's lifetime, decode stage only or
emulation stage only or throughout the emulation, will make it easy
to avoid extra ctxt/regs initialization and ... maybe more.

Takuya
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 00/14] Emulator decode generalization

2011-09-14 Thread Marcelo Tosatti
On Tue, Sep 13, 2011 at 10:45:37AM +0300, Avi Kivity wrote:
 - merge dst/src/src2 decode
 - generalize %seg embedded in opcode decode
 
 plus a fix.
 
 Avi Kivity (14):
   KVM: x86 emulator: fix Src2CL decode
   KVM: x86 emulator: convert group 3 instructions to direct decode
   KVM: x86 emulator: move memop, memopp into emulation context
   KVM: x86 emulator: split dst decode to a generic decode_operand()
   KVM: x86 emulator: expand decode flags to 64 bits
   KVM: x86 emulator: switch src2 to generic decode_operand()
   KVM: x86 emulator: free up some flag bits near src, dst
   KVM: x86 emulator: switch OpImmUByte decode to decode_imm()
   KVM: x86 emulator: qualify OpReg inhibit_byte_regs hack
   KVM: x86 emulator: switch src decode to decode_operand()
   KVM: x86 emulator: simplify OpMem64 decode
   KVM: x86 emulator: streamline decode of segment registers
   KVM: x86 emulator: switch lds/les/lss/lfs/lgs to direct decode
   KVM: x86 emulator: convert push %sreg/pop %sreg to direct decode
 
  arch/x86/include/asm/kvm_emulate.h |4 +-
  arch/x86/kvm/emulate.c |  563 
 ++--
  2 files changed, 286 insertions(+), 281 deletions(-)

Applied, thanks.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: Split up MSI-X assigned device IRQ handler

2011-09-14 Thread Marcelo Tosatti
On Mon, Sep 12, 2011 at 06:57:56PM +0200, Jan Kiszka wrote:
 The threaded IRQ handler for MSI-X has almost nothing in common with the
 INTx/MSI handler. Move its code into a dedicated handler.
 
 Signed-off-by: Jan Kiszka jan.kis...@siemens.com
 ---
  virt/kvm/assigned-dev.c |   32 +++-
  1 files changed, 19 insertions(+), 13 deletions(-)

Applied, thanks.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: x86: Add module parameter for lapic periodic timer limit

2011-09-14 Thread Marcelo Tosatti
On Mon, Sep 12, 2011 at 02:10:22PM +0200, Jan Kiszka wrote:
 Certain guests, specifically RTOSes, request faster periodic timers than
 what we allow by default. Add a module parameter to adjust the limit for
 non-standard setups. Also add a rate-limited warning in case the guest
 requested more.
 
 Signed-off-by: Jan Kiszka jan.kis...@siemens.com
 ---
  arch/x86/kvm/lapic.c |   15 +--
  1 files changed, 13 insertions(+), 2 deletions(-)

Applied, thanks.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 3/4] block: add block timer and throttling algorithm

2011-09-14 Thread Marcelo Tosatti
On Tue, Sep 13, 2011 at 11:09:46AM +0800, Zhi Yong Wu wrote:
 On Fri, Sep 9, 2011 at 10:44 PM, Marcelo Tosatti mtosa...@redhat.com wrote:
  On Thu, Sep 08, 2011 at 06:11:07PM +0800, Zhi Yong Wu wrote:
  Note:
       1.) When bps/iops limits are specified to a small value such as 511 
  bytes/s, this VM will hang up. We are considering how to handle this 
  senario.
 
  You can increase the length of the slice, if the request is larger than
  slice_time * bps_limit.
 Yeah, but it is a challenge for how to increase it. Do you have some nice 
 idea?

If the queue is empty, and the request being processed does not fit the
queue, increase the slice so that the request fits.

That is, make BLOCK_IO_SLICE_TIME dynamic and adjust it as described
above (if the bps or io limits change, reset it to the default
BLOCK_IO_SLICE_TIME).

       2.) When dd command is issued in guest, if its option bs is set to 
  a large value such as bs=1024K, the result speed will slightly bigger 
  than the limits.
 
  Why?
 This issue has not existed. I will remove it.
 When drive bps=100, i did some testings on guest VM.
 1.) bs=1024K
 18+0 records in
 18+0 records out
 18874368 bytes (19 MB) copied, 26.6268 s, 709 kB/s
 2.) bs=2048K
 18+0 records in
 18+0 records out
 37748736 bytes (38 MB) copied, 46.5336 s, 811 kB/s
 
 
  There is lots of debugging leftovers in the patch.
 sorry, i forgot to remove them.
 
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] qemu-kvm: Fix build without VNC support

2011-09-14 Thread Marcelo Tosatti
On Tue, Sep 13, 2011 at 05:13:41PM +0400, Boris Dolgov wrote:
 Hello!
 
 Qemu-kvm 0.15.0 doesn't build with vnc support disabled. The following
 patch fixes the problem:
 
 Signed-off-by: Boris Dolgov bo...@dolgov.name
 
 - monitor.c~  2011-08-09 12:40:29.0 +
 +++ monitor.c   2011-09-13 13:02:40.0 +
 @@ -1221,10 +1221,12 @@ static int add_graphics_client(Monitor *
  }
 qerror_report(QERR_ADD_CLIENT_FAILED);
 return -1;
 +#ifdef CONFIG_VNC
  } else if (strcmp(protocol, vnc) == 0) {
 int fd = monitor_get_fd(mon, fdname);
 vnc_display_add_client(NULL, fd, skipauth);
 return 0;
 +#endif
  } else if ((s = qemu_chr_find(protocol)) != NULL) {
 int fd = monitor_get_fd(mon, fdname);
 if (qemu_chr_add_client(s, fd)  0) {
 
 -- 
 Boris Dolgov.

Boris,

Does QEMU upstream suffer from the same problem? If so, it should be
fixed there (patch sent to qemu-de...@nongnu.org).

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 03/14] KVM: x86 emulator: move memop, memopp into emulation context

2011-09-14 Thread Avi Kivity

On 09/14/2011 01:41 PM, Takuya Yoshikawa wrote:

On Tue, 13 Sep 2011 10:45:40 +0300
Avi Kivitya...@redhat.com  wrote:

  Simplifies further generalization of decode.

  Signed-off-by: Avi Kivitya...@redhat.com
  ---
   arch/x86/include/asm/kvm_emulate.h |2 ++
   arch/x86/kvm/emulate.c |   34 +-
   2 files changed, 19 insertions(+), 17 deletions(-)

  diff --git a/arch/x86/include/asm/kvm_emulate.h 
b/arch/x86/include/asm/kvm_emulate.h
  index 6040d11..56bac3e 100644
  --- a/arch/x86/include/asm/kvm_emulate.h
  +++ b/arch/x86/include/asm/kvm_emulate.h
  @@ -275,6 +275,8 @@ struct x86_emulate_ctxt {
unsigned long _eip;
/* Fields above regs are cleared together. */
unsigned long regs[NR_VCPU_REGS];
  + struct operand memop;
  + struct operand *memopp;
struct fetch_cache fetch;
struct read_cache io_read;
struct read_cache mem_read;

Once the emulator context gets stabilized, some comments will be nice
to know which ones are supposed to be accessed from outside of the
emulator, and which ones are only for the emulator internal usage.

Practically, knowing each member's lifetime, decode stage only or
emulation stage only or throughout the emulation, will make it easy
to avoid extra ctxt/regs initialization and ... maybe more.



Nothing should be accessed from outside the emulator, except via 
accessors.  We should move initialization to the emulator as well (or 
just initialize from x86_decode_insn() - any reason not to?)


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] pci: clean all funcs when hot-removing multifunc device

2011-09-14 Thread Amos Kong
- Original Message -
 - Original Message -
  (2011/09/14 13:55), Amos Kong wrote:
   'slot-funcs' is initialized in acpiphp_glue.c:register_slot()
   before
   hotpluging device, and only one entry(func 0) is added to it,
   no new entry will be added to the list when hotpluging devices to
   the slot.
 
  I guess your hotplug slot has only one device object (for func#0)
  in ACPI Namespace (DSDT), and guess this is why there is only one
  entry in the 'slot-funcs'. If so, what about adding device objects
  for function 1-7 to ACPI Namespace? I think most of bare-metal
  environments have such definition in ACPI Namespace. For example:
 
 Hi Kaneshige,
 
 I did some tests; fixing the ACPI tables resolves this problem:
 register_slot() is then executed for all funcs,
 and each func has an entry in slot-funcs.
 I will send a patch to seabios.


The size of bios.bin compiled from seabios
original: 128K
only apply patch1:  256K
only apply patch2:  128K

patch1: add 6 slot(only slot6 has 8 funcs) to the table
can hotplug/hot-remove a multifunc device to slot 6 successfully

patch2: add 31 slot(with 8 funcs) to the table
could not boot up guest.
I found there is special handling for a large bios.bin in qemu;
the problem may be here, I'm digging into it...

qemu/hw/pc.c:
void pc_memory_init(...

/* map the last 128KB of the BIOS in ISA space */
isa_bios_size = bios_size;
if (isa_bios_size  (128 * 1024))
isa_bios_size = 128 * 1024;


 
  Device (P2P) { // PCI to PCI bridge
  Name (_ADR, ...) // PCI address
  Name (_HPP, ...) // Hot Plug parameter
  ...
  Device (S0F0) { // For function 0
  Name (_ADR, ...)
  Name (_SUN, ...)
  Method (_EJ0, ...)
  }
  Device (S0F1) { // For function 1
  ...
  }
  ...
  Device (S0F7) { // For function 7
  ...
  }
  }
 
  Regards,
  Kenji Kaneshige
 
 
   When we release the whole device, there is only one entry in the
   list,
   this causes func1~7 could not be released.
   I try to add entries for all hotpluged device in enable_device(),
   but
   it doesn't work, because 'slot-funcs' is used in many place which
   we only
   need to process func 0. This patch just try to clean all funcs in
   disable_device().
  
   drivers/pci/hotplug/acpiphp_glue.c:
   static int disable_device(struct acpiphp_slot *slot) {
 list_for_each_entry(func,slot-funcs, sibling) {
 pdev = pci_get_slot(slot-bridge-pci_bus,
PCI_DEVFN(slot-device, func-function));
 ..clean code.. // those code can only be executed one time(func
 0)
pci_remove_bus_device(pdev);
   ---
   pci_bus_add_device() is called for each func device in
   acpiphp_glue.c:enable_device().
   pci_remove_bus_device(pdev) is only called for func 0 in
   acpiphp_glue.c:disable_device().
  
   Boot up a KVM guest, hotplug a multifunc device(8 funcs), we can
   find it in the guest.
   @ ls /dev/vd*
   vda vdb vdc vde vdf vdg vdh
   @ lspci
   00:06.0 SCSI storage controller: Red Hat, Inc Virtio block
   device
   ...
   00:06.7 SCSI storage controller: Red Hat, Inc Virtio block
   device
  
   But func 1~7 still exist in guest after hot-removing the multifunc
   device through qemu monitor.
   @ lspci (00:06.0 disappeared)
   00:06.1 SCSI storage controller: Red Hat, Inc Virtio block
   device (rev ff)
   ...
   00:06.7 SCSI storage controller: Red Hat, Inc Virtio block
   device (rev ff)
 
   
   @ ls /dev/vd*
   vdb vdc vde vdf vdg vdh
   @ mkfs /dev/vdb
   INFO: task mkfs.ext2:1784 blocked for more than 120 seconds.
   (task hung)
  
   Hotpluging multifunc of WinXp is fine.
  
   Signed-off-by: Amos Kongak...@redhat.com
   ---
 drivers/pci/hotplug/acpiphp_glue.c | 27
 ++-
 1 files changed, 18 insertions(+), 9 deletions(-)
  
   diff --git a/drivers/pci/hotplug/acpiphp_glue.c
   b/drivers/pci/hotplug/acpiphp_glue.c
   index a70fa89..3b86d1a 100644
   --- a/drivers/pci/hotplug/acpiphp_glue.c
   +++ b/drivers/pci/hotplug/acpiphp_glue.c
   @@ -880,6 +880,8 @@ static int disable_device(struct acpiphp_slot
   *slot)
 {
 struct acpiphp_func *func;
 struct pci_dev *pdev;
   + struct pci_bus *bus = slot-bridge-pci_bus;
   + int i, num = 1;
  
 /* is this slot already disabled? */
 if (!(slot-flags SLOT_ENABLED))
   @@ -893,16 +895,23 @@ static int disable_device(struct
   acpiphp_slot
   *slot)
 func-bridge = NULL;
 }
  
   - pdev = pci_get_slot(slot-bridge-pci_bus,
   - PCI_DEVFN(slot-device, func-function));
   - if (pdev) {
   - pci_stop_bus_device(pdev);
   - if (pdev-subordinate) {
   - disable_bridges(pdev-subordinate);
   - pci_disable_device(pdev);
   + pdev = pci_scan_single_device(bus,
   + PCI_DEVFN(slot-device, 0));
   + if (!pdev)
   + goto 

Re: [PATCH 1/2] KVM: emulate lapic tsc deadline timer for guest

2011-09-14 Thread Marcelo Tosatti
On Tue, Sep 13, 2011 at 10:36:51PM +0800, Liu, Jinsong wrote:
 From 7b12021e1d1b79797b49e41cc0a7be05a6180d9a Mon Sep 17 00:00:00 2001
 From: Liu, Jinsong jinsong@intel.com
 Date: Tue, 13 Sep 2011 21:52:54 +0800
 Subject: [PATCH] KVM: emulate lapic tsc deadline timer for guest
 
 This patch emulate lapic tsc deadline timer for guest:
 Enumerate tsc deadline timer capability by CPUID;
 Enable tsc deadline timer mode by lapic MMIO;
 Start tsc deadline timer by WRMSR;
 
 Signed-off-by: Liu, Jinsong jinsong@intel.com
 ---
  arch/x86/include/asm/apicdef.h|2 +
  arch/x86/include/asm/cpufeature.h |3 +
  arch/x86/include/asm/kvm_host.h   |2 +
  arch/x86/include/asm/msr-index.h  |2 +
  arch/x86/kvm/kvm_timer.h  |2 +
  arch/x86/kvm/lapic.c  |  122 
 ++---
  arch/x86/kvm/lapic.h  |3 +
  arch/x86/kvm/x86.c|   20 ++-
  8 files changed, 132 insertions(+), 24 deletions(-)
 
 diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
 index 34595d5..3925d80 100644
 --- a/arch/x86/include/asm/apicdef.h
 +++ b/arch/x86/include/asm/apicdef.h
 @@ -100,7 +100,9 @@
  #define  APIC_TIMER_BASE_CLKIN   0x0
  #define  APIC_TIMER_BASE_TMBASE  0x1
  #define  APIC_TIMER_BASE_DIV 0x2
 +#define  APIC_LVT_TIMER_ONESHOT  (0  17)
  #define  APIC_LVT_TIMER_PERIODIC (1  17)
 +#define  APIC_LVT_TIMER_TSCDEADLINE  (2  17)
  #define  APIC_LVT_MASKED (1  16)
  #define  APIC_LVT_LEVEL_TRIGGER  (1  15)
  #define  APIC_LVT_REMOTE_IRR (1  14)

Please have a separate, introductory patch for definitions that are not 
KVM specific.

 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -671,6 +671,8 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t 
 gfn);
  
  extern bool tdp_enabled;
  
 +extern u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
 +

No need for extern.

 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
 index 2b2255b..925d4b9 100644
 --- a/arch/x86/kvm/lapic.c
 +++ b/arch/x86/kvm/lapic.c
 @@ -135,9 +135,23 @@ static inline int apic_lvt_vector(struct kvm_lapic 
 *apic, int lvt_type)
   return apic_get_reg(apic, lvt_type)  APIC_VECTOR_MASK;
  }
  
 +static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
 +{
 + return ((apic_get_reg(apic, APIC_LVTT)  
 + apic-lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
 +}
 +
  static inline int apic_lvtt_period(struct kvm_lapic *apic)
  {
 - return apic_get_reg(apic, APIC_LVTT)  APIC_LVT_TIMER_PERIODIC;
 + return ((apic_get_reg(apic, APIC_LVTT)  
 + apic-lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
 +}
 +
 +static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
 +{
 + return ((apic_get_reg(apic, APIC_LVTT)  
 + apic-lapic_timer.timer_mode_mask) == 
 + APIC_LVT_TIMER_TSCDEADLINE);
  }
  
  static inline int apic_lvt_nmi_mode(u32 lvt_val)
 @@ -166,7 +180,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
  }
  
  static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
 - LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
 + LVT_MASK ,  /* part LVTT mask, timer mode mask added at runtime */
   LVT_MASK | APIC_MODE_MASK,  /* LVTTHMR */
   LVT_MASK | APIC_MODE_MASK,  /* LVTPC */
   LINT_MASK, LINT_MASK,   /* LVT0-1 */
 @@ -570,6 +584,9 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned 
 int offset)
   break;
  
   case APIC_TMCCT:/* Timer CCR */
 + if (apic_lvtt_tscdeadline(apic))
 + return 0;
 +
   val = apic_get_tmcct(apic);
   break;
  
 @@ -664,29 +681,32 @@ static void update_divide_count(struct kvm_lapic *apic)
  
  static void start_apic_timer(struct kvm_lapic *apic)
  {
 - ktime_t now = apic-lapic_timer.timer.base-get_time();
 -
 - apic-lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
 - APIC_BUS_CYCLE_NS * apic-divide_count;
 + ktime_t now;
   atomic_set(apic-lapic_timer.pending, 0);
  
 - if (!apic-lapic_timer.period)
 - return;
 - /*
 -  * Do not allow the guest to program periodic timers with small
 -  * interval, since the hrtimers are not throttled by the host
 -  * scheduler.
 -  */
 - if (apic_lvtt_period(apic)) {
 - if (apic-lapic_timer.period  NSEC_PER_MSEC/2)
 - apic-lapic_timer.period = NSEC_PER_MSEC/2;
 - }
 + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
 + /* lapic timer in oneshot or peroidic mode */
 + now = apic-lapic_timer.timer.base-get_time();
 + apic-lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
 + * 

Re: [PATCH 2/2] Qemu co-operation with kvm tsc deadline timer

2011-09-14 Thread Marcelo Tosatti
On Wed, Sep 14, 2011 at 10:51:41AM +0800, Liu, Jinsong wrote:
 Jan Kiszka wrote:
  On 2011-09-13 16:38, Liu, Jinsong wrote:
  From c1b502d6548fcc41592cd90acc82109ee949df75 Mon Sep 17 00:00:00
  2001 
  From: Liu, Jinsong jinsong@intel.com
  Date: Tue, 13 Sep 2011 22:05:30 +0800
  Subject: [PATCH] Qemu co-operation with kvm tsc deadline timer
  
  KVM add emulation of lapic tsc deadline timer for guest.
  This patch is co-operation work at qemu side.
  
  Signed-off-by: Liu, Jinsong jinsong@intel.com ---
   target-i386/cpu.h |2 ++
   target-i386/kvm.c |   14 ++
   2 files changed, 16 insertions(+), 0 deletions(-)
  
  diff --git a/target-i386/cpu.h b/target-i386/cpu.h
  index 935d08a..62ff73c 100644
  --- a/target-i386/cpu.h
  +++ b/target-i386/cpu.h
  @@ -283,6 +283,7 @@
   #define MSR_IA32_APICBASE_BSP   (18)
   #define MSR_IA32_APICBASE_ENABLE(111)
   #define MSR_IA32_APICBASE_BASE  (0xf12)
  +#define MSR_IA32_TSCDEADLINE0x6e0
  
   #define MSR_MTRRcap   0xfe
   #define MSR_MTRRcap_VCNT  8
  @@ -687,6 +688,7 @@ typedef struct CPUX86State {
   uint64_t async_pf_en_msr;
  
   uint64_t tsc;
  +uint64_t tsc_deadline;
  
  This field has to be saved/restored for snapshots/migrations.
  
  Frankly, I've no clue right now if substates are in vogue again (they
  had problems in their binary format) or if you can simply add a
  versioned top-level field and bump the CPUState version number.
  
 
 Yes, it would be saved/restored. After migration, tsc_deadline would be set 
 to MSR_IA32_TSCDEADLINE to trigger tsc timer interrupt.

Jan means you should explicitly add this to machine.c.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] kvm tools: Use kernel dhcp for network autoconfiguration

2011-09-14 Thread Asias He
On 09/14/2011 03:11 PM, Sasha Levin wrote:
 This patch removes the manual/usermode dhcp client configuration and instead
 uses the DHCP client built within the kernel.
 
 Since this client is tightly integrated with NFS (if NFS config is set), we
 will add a specific NFS root addr in our DHCP offer to point it to a non
 existent address so that we won't hang trying to poke it for our root.
 
 Signed-off-by: Sasha Levin levinsasha...@gmail.com

Acked-by: Asias He asias.he...@gmail.com

-- 
Asias He
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 02/10] Driver core: Add iommu_ops to bus_type

2011-09-14 Thread Roedel, Joerg
On Tue, Sep 13, 2011 at 12:21:26PM -0400, Greg KH wrote:
 On Tue, Sep 13, 2011 at 05:38:11PM +0200, Roedel, Joerg wrote:
  On Tue, Sep 13, 2011 at 10:58:55AM -0400, Greg KH wrote:
   On Tue, Sep 13, 2011 at 04:54:02PM +0200, Roedel, Joerg wrote:
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -22,6 +22,7 @@
 #include linux/types.h
 #include linux/module.h
 #include linux/pm.h
+#include linux/iommu.h
   
   Ick, please don't add new #includes to device.h, it makes the whole
   build slower.  Just pre-declare the structure and all should be fine.
  
  Hmm, since linux/iommu.h provides 'struct iommu_ops', and this patch
  adds a 'struct iommu_ops' to 'struct bus_type', wouldn't a simple
  forward declaration make the bus_type incomplete in most other places?
 
 No, just like it doesn't make iommu.h incomplete as you used a struct
 bus_type there.

Ah right, because bus->iommu_ops is just a pointer, the full type
definition for iommu_ops is only needed when this pointer is actually
dereferenced. I updated the patch. Please find it below.


From 6e0e1c3b997e06539f7bda80f46ffe9fb04aab4e Mon Sep 17 00:00:00 2001
From: Joerg Roedel joerg.roe...@amd.com
Date: Fri, 26 Aug 2011 16:48:26 +0200
Subject: [PATCH 02/10] Driver core: Add iommu_ops to bus_type

This is the starting point to make the iommu_ops used for
the iommu-api a per-bus-type structure. It is required to
easily implement bus-specific setup in the iommu-layer.
The first user will be the iommu-group attribute in sysfs.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
---
 drivers/base/bus.c |   29 +
 drivers/iommu/iommu.c  |4 
 include/linux/device.h |   10 ++
 include/linux/iommu.h  |2 ++
 4 files changed, 45 insertions(+), 0 deletions(-)

diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 000e7b2..b3014fe 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -1028,6 +1028,35 @@ void bus_sort_breadthfirst(struct bus_type *bus,
 }
 EXPORT_SYMBOL_GPL(bus_sort_breadthfirst);
 
+#ifdef CONFIG_IOMMU_API
+/**
+ * bus_set_iommu - set iommu-callbacks for the bus
+ * @bus: bus.
+ * @ops: the callbacks provided by the iommu-driver
+ *
+ * This function is called by an iommu driver to set the iommu methods
+ * used for a particular bus. Drivers for devices on that bus can use
+ * the iommu-api after these ops are registered.
+ * This special function is needed because IOMMUs are usually devices on
+ * the bus itself, so the iommu drivers are not initialized when the bus
+ * is set up. With this function the iommu-driver can set the iommu-ops
+ * afterwards.
+ */
+int bus_set_iommu(struct bus_type *bus, struct iommu_ops *ops)
+{
+   if (bus-iommu_ops != NULL)
+   return -EBUSY;
+
+   bus-iommu_ops = ops;
+
+   /* Do IOMMU specific setup for this bus-type */
+   iommu_bus_init(bus, ops);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(bus_set_iommu);
+#endif
+
 int __init buses_init(void)
 {
bus_kset = kset_create_and_add(bus, bus_uevent_ops, NULL);
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 30b0644..3b24a5b 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -34,6 +34,10 @@ void register_iommu(struct iommu_ops *ops)
iommu_ops = ops;
 }
 
+void iommu_bus_init(struct bus_type *bus, struct iommu_ops *ops)
+{
+}
+
 bool iommu_found(void)
 {
return iommu_ops != NULL;
diff --git a/include/linux/device.h b/include/linux/device.h
index c20dfbf..490382b 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -33,6 +33,7 @@ struct class;
 struct subsys_private;
 struct bus_type;
 struct device_node;
+struct iommu_ops;
 
 struct bus_attribute {
struct attributeattr;
@@ -46,6 +47,7 @@ struct bus_attribute bus_attr_##_name = __ATTR(_name, _mode, 
_show, _store)
 extern int __must_check bus_create_file(struct bus_type *,
struct bus_attribute *);
 extern void bus_remove_file(struct bus_type *, struct bus_attribute *);
+extern void iommu_bus_init(struct bus_type *bus, struct iommu_ops *ops);
 
 /**
  * struct bus_type - The bus type of the device
@@ -67,6 +69,9 @@ extern void bus_remove_file(struct bus_type *, struct 
bus_attribute *);
  * @resume:Called to bring a device on this bus out of sleep mode.
  * @pm:Power management operations of this bus, callback the 
specific
  * device driver's pm-ops.
+ * @iommu_ops   IOMMU specific operations for this bus, used to attach IOMMU
+ *  driver implementations to a bus and allow the driver to do
+ *  bus-specific setup
  * @p: The private data of the driver core, only the driver core can
  * touch this.
  *
@@ -96,6 +101,8 @@ struct bus_type {
 
const struct dev_pm_ops *pm;
 
+   struct iommu_ops *iommu_ops;
+
struct subsys_private *p;
 };
 
@@ -148,6 +155,9 @@ extern int 

Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-14 Thread Don Zickus
On Wed, Sep 14, 2011 at 10:00:07AM +0300, Avi Kivity wrote:
 On 09/13/2011 10:21 PM, Don Zickus wrote:
 Or are you saying an NMI in an idle system will have the same %rip thus
 falsely detecting a back-to-back NMI?
 
 
 
 That's easy to avoid - insert an instruction zeroing the last
 nmi_rip somewhere before or after hlt.  It's always okay to execute
 such an instruction (outside the nmi handler itself), since nmi_rip
 is meant to detect a no instructions executed condition.

Ah. Like a touch_nmi_watchdog() type of thing.  Interesting.  I'll poke
around the idle code.  Need to instrument a reproducer first.

Thanks,
Don
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] qemu-kvm: Fix build without VNC support

2011-09-14 Thread Boris Dolgov
On Wed, Sep 14, 2011 at 15:16, Marcelo Tosatti mtosa...@redhat.com wrote:
 Does QEMU upstream suffer from the same problem? If so, it should be
 fixed there (patch sent to qemu-de...@nongnu.org).
Yes, it does. I have sent the patch to the correct mailing list; thanks for your help.


-- 
Boris Dolgov.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-14 Thread Andi Kleen
On Wed, Sep 14, 2011 at 10:00:07AM +0300, Avi Kivity wrote:
 On 09/13/2011 10:21 PM, Don Zickus wrote:
 Or are you saying an NMI in an idle system will have the same %rip thus
 falsely detecting a back-to-back NMI?
 
 
 
 That's easy to avoid - insert an instruction zeroing the last nmi_rip 
 somewhere before or after hlt.  It's always okay to execute such an 
 instruction (outside the nmi handler itself), since nmi_rip is meant to 
 detect a no instructions executed condition.

At least for classic hlt there is no simple after hlt because it's all
interrupt handlers and exceptions and everything else that can interrupt
combined.

It may work with newer MWAIT.

-Andi

-- 
a...@linux.intel.com -- Speaking for myself only.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-14 Thread Avi Kivity

On 09/14/2011 05:49 PM, Andi Kleen wrote:

On Wed, Sep 14, 2011 at 10:00:07AM +0300, Avi Kivity wrote:
  On 09/13/2011 10:21 PM, Don Zickus wrote:
  Or are you saying an NMI in an idle system will have the same %rip thus
  falsely detecting a back-to-back NMI?
  
  

  That's easy to avoid - insert an instruction zeroing the last nmi_rip
  somewhere before or after hlt.  It's always okay to execute such an
  instruction (outside the nmi handler itself), since nmi_rip is meant to
  detect a no instructions executed condition.

At least for classic hlt there is no simple after hlt because it's all
interrupt handlers and exceptions and everything else that can interrupt
combined.


If an NMI hits in an interrupt handler, or in the after hlt section 
before the write-to-last-nmi-rip, then we'll see that %rip has changed.  
If it hits after the write-to-last-nmi-rip instruction (or in the hlt 
itself), then we'll also see that %rip has changed, due to the effect of 
that instruction.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Memory API code review

2011-09-14 Thread Avi Kivity
I would like to carry out an online code review of the memory API so 
that more people are familiar with the internals, and perhaps even to 
catch some bugs or deficiencies.  I'd like to use the next kvm conference 
call slot for this (Tuesday 1400 UTC) since many people already have it 
reserved in the schedule.


It would be great if people from the wider qemu community were present, 
rather than the usual "x86 is everything" crowd (+Jan) that usually 
participates in the kvm weekly call.


Juan, Chris, can we dedicate next week's call to this?

We'll also need a way to disseminate a few slides and an editor session 
for showing the code.  We have an elluminate account that can be used 
for this, but usually this has a 50% failure rate on Linux.  Anthony, 
perhaps we can set up a view-only vnc reflector on qemu.org?


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: qemu-kvm 0.15 usb problem

2011-09-14 Thread Lutz Vieweg

On 08/10/2011 09:25 AM, Gerd Hoffmann wrote:

qemu-system-x86_64: /tmp/qemu-kvm-0.15.0/hw/usb.c:336:
usb_packet_complete: Assertion `p-owner != ((void *)0)' failed.



What kind of device is this?


I can say that in my case, a dive computer (a primitive serial device,
I think) caused the assertion.

Regards,

Lutz Vieweg



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: qemu-kvm 0.15 usb problem

2011-09-14 Thread Lutz Vieweg

On 08/09/2011 09:30 PM, Michael wrote:

After installed 0.15 ( and 0.15rc1) guest VM failed at some point with
USB attached with error:
qemu-system-x86_64: /tmp/qemu-kvm-0.15.0/hw/usb.c:336:
usb_packet_complete: Assertion `p-owner != ((void *)0)' failed.


I experienced the exact same problem, a formerly working USB usage
by the virtual machine suddenly caused this assertion to abort
qemu-kvm 0.15.

Looking only briefly at this assertion and the surrounding code,
I found that I did not understand what it's good for, so I removed
the assertion, re-compiled, and - voila! - qemu-kvm 0.15 now works
with the USB device as good as before.

Regards,

Lutz Vieweg


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 03/14] KVM: x86 emulator: move memop, memopp into emulation context

2011-09-14 Thread Avi Kivity

On 09/14/2011 06:21 PM, Takuya Yoshikawa wrote:


  Nothing should be accessed from outside the emulator, except via
  accessors.  We should move initialization to the emulator as well (or
  just initialize from x86_decode_insn() - any reason not to?)

Not big reason but kvm_inject_realmode_interrupt() and kvm_task_switch()
call emulate_int_real() and emulator_task_switch() respectively without
doing generic decoding.

So at least, we need some special initialization for them if we move
init_emulate_ctxt() into x86_decode_insn().



Best if x86_decode_insn(), emulate_int_real(), and 
emulator_task_switch() all call an internal initialization function.  
This way the external caller doesn't have to worry about the details.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 03/14] KVM: x86 emulator: move memop, memopp into emulation context

2011-09-14 Thread Takuya Yoshikawa
On Wed, 14 Sep 2011 14:37:21 +0300
Avi Kivity a...@redhat.com wrote:

  Once the emulator context gets stablized, some comments will be nice
  to know which ones are supposed to be accessed from outside of the
  emulator, and which ones are only for the emulator internal usage.
 
  Practically, knowing each member's lifetime, decode stage only or
  emulation stage only or throughout the emulation, will make it easy
  to avoid extra ctxt/regs initialization and ... maybe more.
 
 
 Nothing should be accessed from outside the emulator, except via 
 accessors.  We should move initialization to the emulator as well (or 
 just initialize from x86_decode_insn() - any reason not to?)

Not big reason but kvm_inject_realmode_interrupt() and kvm_task_switch()
call emulate_int_real() and emulator_task_switch() respectively without
doing generic decoding.

So at least, we need some special initialization for them if we move
init_emulate_ctxt() into x86_decode_insn().

Takuya
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Memory API code review

2011-09-14 Thread Anthony Liguori

On 09/14/2011 10:07 AM, Avi Kivity wrote:

I would like to carry out an online code review of the memory API so
that more people are familiar with the internals, and perhaps even to
catch some bugs or deficiency. I'd like to use the next kvm conference
call slot for this (Tuesday 1400 UTC) since many people already have it
reserved in the schedule.

It would be great if people from the wider qemu community be present,
rather than the usual x86 is everything crowd (+Jan) that usually
participates in the kvm weekly call.

Juan, Chris, can we dedicate next week's call to this?

We'll also need a way to disseminate a few slides and an editor session
for showing the code. We have an elluminate account that can be used for
this, but usually this has a 50% failure rate on Linux. Anthony, perhaps
we can set up a view-only vnc reflector on qemu.org?


Absolutely.  I'll set something up and then get with you for the details 
of access.


Regards,

Anthony Liguori





--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] kvm tools: Don't copy network autoconfiguration script

2011-09-14 Thread Sasha Levin
Network autoconfiguration was moved to the kernel, but the setup code still
tried to copy the script over to the rootfs. This prevented /virt/ from
being properly created.

Signed-off-by: Sasha Levin levinsasha...@gmail.com
---
 tools/kvm/builtin-setup.c |   13 -
 1 files changed, 0 insertions(+), 13 deletions(-)

diff --git a/tools/kvm/builtin-setup.c b/tools/kvm/builtin-setup.c
index c93eec3..6b8eb5b 100644
--- a/tools/kvm/builtin-setup.c
+++ b/tools/kvm/builtin-setup.c
@@ -129,15 +129,6 @@ static int copy_init(const char *guestfs_name)
return copy_file(guest/init, path);
 }
 
-static int copy_net(const char *guestfs_name)
-{
-   char path[PATH_MAX];
-
-   snprintf(path, PATH_MAX, %s%s%s/virt/setnet.sh, HOME_DIR, 
KVM_PID_FILE_PATH, guestfs_name);
-
-   return copy_file(guest/setnet.sh, path);
-}
-
 static int make_guestfs_symlink(const char *guestfs_name, const char *path)
 {
char target[PATH_MAX];
@@ -195,10 +186,6 @@ static int do_setup(const char *guestfs_name)
make_guestfs_symlink(guestfs_name, guestfs_symlinks[i]);
}
 
-   ret = copy_net(guestfs_name);
-   if (ret  0)
-   return ret;
-
return copy_init(guestfs_name);
 }
 
-- 
1.7.6.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] kvm tools: Use host's resolv.conf within the guest

2011-09-14 Thread Sasha Levin
Since kernel IP autoconfiguration doesn't set up /etc/resolv.conf, we'll
use the one located within the host, since this was anyway what we simulated
within the DHCP offer packets.

Signed-off-by: Sasha Levin levinsasha...@gmail.com
---
 tools/kvm/builtin-run.c   |2 ++
 tools/kvm/builtin-setup.c |9 +
 tools/kvm/include/kvm/builtin-setup.h |1 +
 3 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c
index 5dafb15..9d61088 100644
--- a/tools/kvm/builtin-run.c
+++ b/tools/kvm/builtin-run.c
@@ -129,6 +129,7 @@ static int img_name_parser(const struct option *opt, const 
char *arg, int unset)
die(Unable to initialize virtio 9p);
if (virtio_9p__register(kvm, /, hostfs)  0)
die(Unable to initialize virtio 9p);
+   kvm_setup_resolv(arg);
using_rootfs = custom_rootfs = 1;
return 0;
}
@@ -750,6 +751,7 @@ int kvm_cmd_run(int argc, const char **argv, const char 
*prefix)
char tmp[PATH_MAX];
 
kvm_setup_create_new(default);
+   kvm_setup_resolv(default);
 
snprintf(tmp, PATH_MAX, %s%s%s, HOME_DIR, KVM_PID_FILE_PATH, 
default);
if (virtio_9p__register(kvm, tmp, /dev/root)  0)
diff --git a/tools/kvm/builtin-setup.c b/tools/kvm/builtin-setup.c
index 6b8eb5b..3e569e7 100644
--- a/tools/kvm/builtin-setup.c
+++ b/tools/kvm/builtin-setup.c
@@ -168,6 +168,15 @@ static void make_guestfs_dir(const char *guestfs_name, 
const char *dir)
make_dir(name);
 }
 
+void kvm_setup_resolv(const char *guestfs_name)
+{
+   char path[PATH_MAX];
+
+   snprintf(path, PATH_MAX, %s%s%s/etc/resolv.conf, HOME_DIR, 
KVM_PID_FILE_PATH, guestfs_name);
+
+   copy_file(/etc/resolv.conf, path);
+}
+
 static int do_setup(const char *guestfs_name)
 {
unsigned int i;
diff --git a/tools/kvm/include/kvm/builtin-setup.h 
b/tools/kvm/include/kvm/builtin-setup.h
index 6e183a1..f70ae78 100644
--- a/tools/kvm/include/kvm/builtin-setup.h
+++ b/tools/kvm/include/kvm/builtin-setup.h
@@ -4,5 +4,6 @@
 int kvm_cmd_setup(int argc, const char **argv, const char *prefix);
 void kvm_setup_help(void);
 int kvm_setup_create_new(const char *guestfs_name);
+void kvm_setup_resolv(const char *guestfs_name);
 
 #endif
-- 
1.7.6.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-14 Thread Andi Kleen
 If an NMI hits in an interrupt handler, or in the after hlt section 
 before the write-to-last-nmi-rip, then we'll see that %rip has changed.  
 If it hits after the write-to-last-nmi-rip instruction (or in the hlt 
 itself), then we'll also see that %rip has changed, due to the effect of 
 that instruction.

It won't handle multiple NMIs in halt. I assume that's reasonably common.

-Andi
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Memory API code review

2011-09-14 Thread Chris Wright
* Avi Kivity (a...@redhat.com) wrote:
 I would like to carry out an online code review of the memory API so that
 more people are familiar with the internals, and perhaps even to catch some
 bugs or deficiency.  I'd like to use the next kvm conference call slot for
 this (Tuesday 1400 UTC) since many people already have it reserved in the
 schedule.
 
 It would be great if people from the wider qemu community be present, rather
 than the usual x86 is everything crowd (+Jan) that usually participates in
 the kvm weekly call.
 
 Juan, Chris, can we dedicate next week's call to this?

Yup, sounds like a good idea.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-14 Thread Avi Kivity

On 09/14/2011 08:28 PM, Andi Kleen wrote:

  If an NMI hits in an interrupt handler, or in the after hlt section
  before the write-to-last-nmi-rip, then we'll see that %rip has changed.
  If it hits after the write-to-last-nmi-rip instruction (or in the hlt
  itself), then we'll also see that %rip has changed, due to the effect of
  that instruction.

It won't handle multiple NMIs in halt. I assume that's reasonable common.



Why not?

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-14 Thread Andi Kleen
On Wed, Sep 14, 2011 at 10:26:21PM +0300, Avi Kivity wrote:
 On 09/14/2011 08:28 PM, Andi Kleen wrote:
   If an NMI hits in an interrupt handler, or in the after hlt section
   before the write-to-last-nmi-rip, then we'll see that %rip has changed.
   If it hits after the write-to-last-nmi-rip instruction (or in the hlt
   itself), then we'll also see that %rip has changed, due to the effect of
   that instruction.
 
 It won't handle multiple NMIs in halt. I assume that's reasonable common.
 
 
 Why not?

They all have the same original RIPs and there is no way to distinguish
them.

-Andi

-- 
a...@linux.intel.com -- Speaking for myself only.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-14 Thread Avi Kivity

On 09/14/2011 10:34 PM, Andi Kleen wrote:

On Wed, Sep 14, 2011 at 10:26:21PM +0300, Avi Kivity wrote:
  On 09/14/2011 08:28 PM, Andi Kleen wrote:
 If an NMI hits in an interrupt handler, or in the after hlt section
 before the write-to-last-nmi-rip, then we'll see that %rip has changed.
 If it hits after the write-to-last-nmi-rip instruction (or in the hlt
 itself), then we'll also see that %rip has changed, due to the effect of
 that instruction.
  
  It won't handle multiple NMIs in halt. I assume that's reasonable common.
  

  Why not?

They all have the same original RIPs and there is no way to distingush
them.



That's how we detect multiple NMIs.

1. First NMI is posted
2. NMI handler starts
3. 2nd NMI posted, queued
4. First NMI source handled
5. IRET
6. Queued NMI hits the core
7. back-to-back NMI detected (same rip)
8. Second (and third...) NMI source handled
9. Execution continues.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/3] KVM: PPC: HIOR and sregs fixup

2011-09-14 Thread Alexander Graf
While working on the HIOR setting which already made it into Avi's tree, I
was too incautious and ended up extending the sregs structure, breaking ABI
compatibility with all QEMU versions.

So the approach I was taking there was obviously wrong. Instead, what I thought
might be a better alternative is to get rid of the static "we have a struct
full of registers and shove it left and right" and instead just poke registers
directly between kernel and user space. That sounds slow for starters, but once
we have the infrastructure in place, we can build a batched version of the same
interface and be fast again but maintain flexibility.

This interface can also for example be used to easily fetch the next great
extension of SSE registers or some MSRs that we haven't thought of or lots
of PPC registers I haven't even heard of so far :). There always seem to be
new ones to learn of out there.

Please take a look at the interface and comment on whether you like it this
way or not. It's currently only implemented for the PPC target, but is held
generically, so everyone can use it.

Oh and - it obviously implements HIOR again which we have to drop from sregs
due to the ABI breakage.

Alex

Alexander Graf (3):
  Revert KVM: PPC: Add support for explicit HIOR setting
  KVM: PPC: Add generic single register ioctls
  KVM: PPC: Add support for explicit HIOR setting

 Documentation/virtual/kvm/api.txt |   48 
 arch/powerpc/include/asm/kvm.h|   10 +
 arch/powerpc/include/asm/kvm_book3s.h |2 +-
 arch/powerpc/kvm/book3s_pr.c  |   12 +-
 arch/powerpc/kvm/powerpc.c|   64 +
 include/linux/kvm.h   |   32 
 6 files changed, 149 insertions(+), 19 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] KVM: PPC: Add generic single register ioctls

2011-09-14 Thread Alexander Graf
Right now we transfer a static struct every time we want to get or set
registers. Unfortunately, over time we realize that there are more of
these than we thought of before and the extensibility and flexibility of
transferring a full struct every time is limited.

So this is a new approach to the problem. With these new ioctls, we can
get and set a single register that is identified by an ID. This allows for
very precise and limited transmittal of data. When we later realize that
it's a better idea to shove over multiple registers at once, we can reuse
most of the infrastructure and simply implement a GET_MANY_REGS / SET_MANY_REGS
interface.

The only downside I see to this one is that it needs to pad to 1024 bits
(hardware is already on 512 bit registers, so I wanted to leave some room)
which is slightly too much for transmitting only 64 bits. But if that's the
only tradeoff we have to make for getting an extensible interface, I'd say go
for it nevertheless.

Signed-off-by: Alexander Graf ag...@suse.de
---
 Documentation/virtual/kvm/api.txt |   47 ++
 arch/powerpc/kvm/powerpc.c|   51 +
 include/linux/kvm.h   |   32 +++
 3 files changed, 130 insertions(+), 0 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index b547d7e..5a8f305 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1482,6 +1482,53 @@ is supported; 2 if the processor requires all virtual 
machines to have
 an RMA, or 1 if the processor can use an RMA but doesn't require it,
 because it supports the Virtual RMA (VRMA) facility.
 
+4.64 KVM_SET_ONE_REG
+
+Capability: KVM_CAP_ONE_REG
+Architectures: all
+Type: vcpu ioctl
+Parameters: struct kvm_one_reg (in)
+Returns: 0 on success, negative value on failure
+
+struct kvm_one_reg {
+   __u64 id;
+   union {
+   __u8 reg8;
+   __u16 reg16;
+   __u32 reg32;
+   __u64 reg64;
+   __u8 reg128[16];
+   __u8 reg256[32];
+   __u8 reg512[64];
+   __u8 reg1024[128];
+   } u;
+};
+
+Using this ioctl, a single vcpu register can be set to a specific value
+defined by user space with the passed in struct kvm_one_reg. There can
+be architecture agnostic and architecture specific registers. Each have
+their own range of operation and their own constants and width. To keep
+track of the implemented registers, find a list below:
+
+  Arch  |   Register| Width (bits)
+|   |
+
+4.65 KVM_GET_ONE_REG
+
+Capability: KVM_CAP_ONE_REG
+Architectures: all
+Type: vcpu ioctl
+Parameters: struct kvm_one_reg (in and out)
+Returns: 0 on success, negative value on failure
+
+This ioctl allows to receive the value of a single register implemented
+in a vcpu. The register to read is indicated by the id field of the
+kvm_one_reg struct passed in. On success, the register value can be found
+in the respective width field of the struct after this call.
+
+The list of registers accessible using this interface is identical to the
+list in 4.64.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index e75c5ac..39cdb3f 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -214,6 +214,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PPC_UNSET_IRQ:
case KVM_CAP_PPC_IRQ_LEVEL:
case KVM_CAP_ENABLE_CAP:
+   case KVM_CAP_ONE_REG:
r = 1;
break;
 #ifndef CONFIG_KVM_BOOK3S_64_HV
@@ -627,6 +628,32 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
return r;
 }
 
+static int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu,
+ struct kvm_one_reg *reg)
+{
+   int r = -EINVAL;
+
+   switch (reg-id) {
+   default:
+   break;
+   }
+
+   return r;
+}
+
+static int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu,
+ struct kvm_one_reg *reg)
+{
+   int r = -EINVAL;
+
+   switch (reg-id) {
+   default:
+   break;
+   }
+
+   return r;
+}
+
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 struct kvm_mp_state *mp_state)
 {
@@ -666,6 +693,30 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
 
+   case KVM_GET_ONE_REG:
+   {
+   struct kvm_one_reg reg;
+   r = -EFAULT;
+   if (copy_from_user(reg, argp, sizeof(reg)))
+   goto out;
+   r = kvm_vcpu_ioctl_get_one_reg(vcpu, reg);
+   if (copy_to_user(argp, reg, sizeof(reg))) {
+   r = -EFAULT;
+   goto out;
+ 

[PATCH 1/3] Revert KVM: PPC: Add support for explicit HIOR setting

2011-09-14 Thread Alexander Graf
This reverts commit 11d7596e18a712dc3bc29d45662ec111fd65946b. It exceeded
the padding on the SREGS struct, rendering the ABI backwards-incompatible.

Signed-off-by: Alexander Graf ag...@suse.de
---
 arch/powerpc/include/asm/kvm.h|8 
 arch/powerpc/include/asm/kvm_book3s.h |2 --
 arch/powerpc/kvm/book3s_pr.c  |   14 ++
 arch/powerpc/kvm/powerpc.c|1 -
 include/linux/kvm.h   |1 -
 5 files changed, 2 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index 71684b9..a635e22 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -149,12 +149,6 @@ struct kvm_regs {
 #define KVM_SREGS_E_UPDATE_DBSR(1  3)
 
 /*
- * Book3S special bits to indicate contents in the struct by maintaining
- * backwards compatibility with older structs. If adding a new field,
- * please make sure to add a flag for that new field */
-#define KVM_SREGS_S_HIOR   (1  0)
-
-/*
  * In KVM_SET_SREGS, reserved/pad fields must be left untouched from a
  * previous KVM_GET_REGS.
  *
@@ -179,8 +173,6 @@ struct kvm_sregs {
__u64 ibat[8]; 
__u64 dbat[8]; 
} ppc32;
-   __u64 flags; /* KVM_SREGS_S_ */
-   __u64 hior;
} s;
struct {
union {
diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index a384ffd..d4df013 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -90,8 +90,6 @@ struct kvmppc_vcpu_book3s {
 #endif
int context_id[SID_CONTEXTS];
 
-   bool hior_sregs;/* HIOR is set by SREGS, not PVR */
-
struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index d417511..84505a2 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -150,16 +150,14 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
 #ifdef CONFIG_PPC_BOOK3S_64
if ((pvr = 0x33)  (pvr  0x7033)) {
kvmppc_mmu_book3s_64_init(vcpu);
-   if (!to_book3s(vcpu)-hior_sregs)
-   to_book3s(vcpu)-hior = 0xfff0;
+   to_book3s(vcpu)-hior = 0xfff0;
to_book3s(vcpu)-msr_mask = 0xULL;
vcpu-arch.cpu_type = KVM_CPU_3S_64;
} else
 #endif
{
kvmppc_mmu_book3s_32_init(vcpu);
-   if (!to_book3s(vcpu)-hior_sregs)
-   to_book3s(vcpu)-hior = 0;
+   to_book3s(vcpu)-hior = 0;
to_book3s(vcpu)-msr_mask = 0xULL;
vcpu-arch.cpu_type = KVM_CPU_3S_32;
}
@@ -796,9 +794,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
}
}
 
-   if (sregs-u.s.flags  KVM_SREGS_S_HIOR)
-   sregs-u.s.hior = to_book3s(vcpu)-hior;
-
return 0;
 }
 
@@ -835,11 +830,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
/* Flush the MMU after messing with the segments */
kvmppc_mmu_pte_flush(vcpu, 0, 0);
 
-   if (sregs-u.s.flags  KVM_SREGS_S_HIOR) {
-   to_book3s(vcpu)-hior_sregs = true;
-   to_book3s(vcpu)-hior = sregs-u.s.hior;
-   }
-
return 0;
 }
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 55b4233..e75c5ac 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -209,7 +209,6 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PPC_BOOKE_SREGS:
 #else
case KVM_CAP_PPC_SEGSTATE:
-   case KVM_CAP_PPC_HIOR:
case KVM_CAP_PPC_PAPR:
 #endif
case KVM_CAP_PPC_UNSET_IRQ:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 06ef37d..fe57d2b 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -554,7 +554,6 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_SMT 64
 #define KVM_CAP_PPC_RMA65
 #define KVM_CAP_MAX_VCPUS 66   /* returns max vcpus per vm */
-#define KVM_CAP_PPC_HIOR 67
 #define KVM_CAP_PPC_PAPR 68
 #define KVM_CAP_SW_TLB 69
 
-- 
1.6.0.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] KVM: PPC: Add support for explicit HIOR setting

2011-09-14 Thread Alexander Graf
Until now, we always set HIOR based on the PVR, but this is just wrong.
Instead, we should be setting HIOR explicitly, so user space can decide
what the initial HIOR value is - just like on real hardware.

We keep the old PVR based way around for backwards compatibility, but
once user space uses the SET_ONE_REG based method, we drop the PVR logic.

Signed-off-by: Alexander Graf ag...@suse.de
---
 Documentation/virtual/kvm/api.txt |1 +
 arch/powerpc/include/asm/kvm.h|2 ++
 arch/powerpc/include/asm/kvm_book3s.h |2 ++
 arch/powerpc/kvm/book3s_pr.c  |6 --
 arch/powerpc/kvm/powerpc.c|   14 ++
 include/linux/kvm.h   |1 +
 6 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 5a8f305..eb03179 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1512,6 +1512,7 @@ track of the implemented registers, find a list below:
 
   Arch  |   Register| Width (bits)
 |   |
+  PPC   | KVM_ONE_REG_PPC_HIOR  | 64
 
 4.65 KVM_GET_ONE_REG
 
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index a635e22..53b8759 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -327,4 +327,6 @@ struct kvm_book3e_206_tlb_params {
__u32 reserved[8];
 };
 
+#define KVM_ONE_REG_PPC_HIOR   KVM_ONE_REG_PPC | 0x100
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index d4df013..0ba8ba9 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -90,6 +90,8 @@ struct kvmppc_vcpu_book3s {
 #endif
int context_id[SID_CONTEXTS];
 
+   bool hior_explicit; /* HIOR is set by ioctl, not PVR */
+
struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 84505a2..565af5a 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -150,14 +150,16 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
 #ifdef CONFIG_PPC_BOOK3S_64
if ((pvr = 0x33)  (pvr  0x7033)) {
kvmppc_mmu_book3s_64_init(vcpu);
-   to_book3s(vcpu)-hior = 0xfff0;
+   if (!to_book3s(vcpu)-hior_explicit)
+   to_book3s(vcpu)-hior = 0xfff0;
to_book3s(vcpu)-msr_mask = 0xULL;
vcpu-arch.cpu_type = KVM_CPU_3S_64;
} else
 #endif
{
kvmppc_mmu_book3s_32_init(vcpu);
-   to_book3s(vcpu)-hior = 0;
+   if (!to_book3s(vcpu)-hior_explicit)
+   to_book3s(vcpu)-hior = 0;
to_book3s(vcpu)-msr_mask = 0xULL;
vcpu-arch.cpu_type = KVM_CPU_3S_32;
}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 39cdb3f..c33f6a7 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -209,6 +209,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PPC_BOOKE_SREGS:
 #else
case KVM_CAP_PPC_SEGSTATE:
+   case KVM_CAP_PPC_HIOR:
case KVM_CAP_PPC_PAPR:
 #endif
case KVM_CAP_PPC_UNSET_IRQ:
@@ -634,6 +635,12 @@ static int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu 
*vcpu,
int r = -EINVAL;
 
switch (reg-id) {
+#ifdef CONFIG_PPC_BOOK3S
+   case KVM_ONE_REG_PPC_HIOR:
+   reg-u.reg64 = to_book3s(vcpu)-hior;
+   r = 0;
+   break;
+#endif
default:
break;
}
@@ -647,6 +654,13 @@ static int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu 
*vcpu,
int r = -EINVAL;
 
switch (reg-id) {
+#ifdef CONFIG_PPC_BOOK3S
+   case KVM_ONE_REG_PPC_HIOR:
+   to_book3s(vcpu)-hior = reg-u.reg64;
+   to_book3s(vcpu)-hior_explicit = true;
+   r = 0;
+   break;
+#endif
default:
break;
}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 762959a..cc6c2fb 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_SMT 64
 #define KVM_CAP_PPC_RMA65
 #define KVM_CAP_MAX_VCPUS 66   /* returns max vcpus per vm */
+#define KVM_CAP_PPC_HIOR 67
 #define KVM_CAP_PPC_PAPR 68
 #define KVM_CAP_SW_TLB 69
 #define KVM_CAP_ONE_REG 70
-- 
1.6.0.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


kgdb hooks and kvm-tool

2011-09-14 Thread David Evensky

Hi. Is it possible to use kvm-tool with a kernel compiled with kgdb?
I've tried adding 'kgdbwait kgdboc=ttyS0' to -p, but that doesn't seem
to work.

Thanks,
\dae
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 04/10] x86/ticketlock: collapse a layer of functions

2011-09-14 Thread Jeremy Fitzhardinge
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com

Now that the paravirtualization layer doesn't exist at the spinlock
level any more, we can collapse the __ticket_ functions into the arch_
functions.

Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
---
 arch/x86/include/asm/spinlock.h |   35 +--
 1 files changed, 5 insertions(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 860fc4b..98fe202 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -76,7 +76,7 @@ static __always_inline void __ticket_unlock_kick(struct 
arch_spinlock *lock, __t
  * in the high part, because a wide xadd increment of the low part would carry
  * up and contaminate the high part.
  */
-static __always_inline void __ticket_spin_lock(struct arch_spinlock *lock)
+static __always_inline void arch_spin_lock(struct arch_spinlock *lock)
 {
register struct __raw_tickets inc = { .tail = 1 };
 
@@ -96,7 +96,7 @@ static __always_inline void __ticket_spin_lock(struct 
arch_spinlock *lock)
 out:   barrier();  /* make sure nothing creeps before the lock is 
taken */
 }
 
-static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
+static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
 {
arch_spinlock_t old, new;
 
@@ -128,7 +128,7 @@ static __always_inline void 
__ticket_unlock_release(arch_spinlock_t *lock)
 }
 #endif
 
-static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
+static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
__ticket_t next = lock-tickets.head + 1;
 
@@ -136,46 +136,21 @@ static __always_inline void 
__ticket_spin_unlock(arch_spinlock_t *lock)
__ticket_unlock_kick(lock, next);
 }
 
-static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
+static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
struct __raw_tickets tmp = ACCESS_ONCE(lock-tickets);
 
return !!(tmp.tail ^ tmp.head);
 }
 
-static inline int __ticket_spin_is_contended(arch_spinlock_t *lock)
+static inline int arch_spin_is_contended(arch_spinlock_t *lock)
 {
struct __raw_tickets tmp = ACCESS_ONCE(lock-tickets);
 
return ((tmp.tail - tmp.head)  TICKET_MASK)  1;
 }
-
-static inline int arch_spin_is_locked(arch_spinlock_t *lock)
-{
-   return __ticket_spin_is_locked(lock);
-}
-
-static inline int arch_spin_is_contended(arch_spinlock_t *lock)
-{
-   return __ticket_spin_is_contended(lock);
-}
 #define arch_spin_is_contended arch_spin_is_contended
 
-static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-   __ticket_spin_lock(lock);
-}
-
-static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
-   return __ticket_spin_trylock(lock);
-}
-
-static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
-   __ticket_spin_unlock(lock);
-}
-
 static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
  unsigned long flags)
 {
-- 
1.7.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 01/10] x86/ticketlocks: remove obsolete comment

2011-09-14 Thread Jeremy Fitzhardinge
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com

The note about partial registers is not really relevant now that we
rely on gcc to generate all the assembler.

Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
---
 arch/x86/include/asm/spinlock.h |4 
 1 files changed, 0 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index f5695ee..972c260 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -49,10 +49,6 @@
  * issues and should be optimal for the uncontended case. Note the tail must be
  * in the high part, because a wide xadd increment of the low part would carry
  * up and contaminate the high part.
- *
- * With fewer than 2^8 possible CPUs, we can use x86's partial registers to
- * save some instructions and make the code more elegant. There really isn't
- * much between them in performance though, especially as locks are out of 
line.
  */
 static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
 {
-- 
1.7.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/10] x86/spinlocks: replace pv spinlocks with pv ticketlocks

2011-09-14 Thread Jeremy Fitzhardinge
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com

Rather than outright replacing the entire spinlock implementation in
order to paravirtualize it, keep the ticket lock implementation but add
a couple of pvops hooks on the slow path (long spin on lock, unlocking
a contended lock).

Ticket locks have a number of nice properties, but they also have some
surprising behaviours in virtual environments.  They enforce a strict
FIFO ordering on cpus trying to take a lock; however, if the hypervisor
scheduler does not schedule the cpus in the correct order, the system can
waste a huge amount of time spinning until the next cpu can take the lock.

(See Thomas Friebel's talk "Prevent Guests from Spinning Around"
http://www.xen.org/files/xensummitboston08/LHP.pdf for more details.)

To address this, we add two hooks:
 - __ticket_spin_lock which is called after the cpu has been
   spinning on the lock for a significant number of iterations but has
   failed to take the lock (presumably because the cpu holding the lock
   has been descheduled).  The lock_spinning pvop is expected to block
   the cpu until it has been kicked by the current lock holder.
 - __ticket_spin_unlock, which on releasing a contended lock
   (there are more cpus with tail tickets), it looks to see if the next
   cpu is blocked and wakes it if so.

When compiled with CONFIG_PARAVIRT_SPINLOCKS disabled, a set of stub
functions causes all the extra code to go away.

Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
---
 arch/x86/include/asm/paravirt.h   |   30 ++--
 arch/x86/include/asm/paravirt_types.h |   10 ++---
 arch/x86/include/asm/spinlock.h   |   59 ++---
 arch/x86/include/asm/spinlock_types.h |4 --
 arch/x86/kernel/paravirt-spinlocks.c  |   15 +---
 arch/x86/xen/spinlock.c   |7 +++-
 6 files changed, 63 insertions(+), 62 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index a7d2db9..76cae7a 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -750,36 +750,14 @@ static inline void __set_fixmap(unsigned /* enum 
fixed_addresses */ idx,
 
 #if defined(CONFIG_SMP)  defined(CONFIG_PARAVIRT_SPINLOCKS)
 
-static inline int arch_spin_is_locked(struct arch_spinlock *lock)
+static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, 
__ticket_t ticket)
 {
-   return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock);
+   PVOP_VCALL2(pv_lock_ops.lock_spinning, lock, ticket);
 }
 
-static inline int arch_spin_is_contended(struct arch_spinlock *lock)
+static __always_inline void ticket_unlock_kick(struct arch_spinlock *lock, 
__ticket_t ticket)
 {
-   return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
-}
-#define arch_spin_is_contended arch_spin_is_contended
-
-static __always_inline void arch_spin_lock(struct arch_spinlock *lock)
-{
-   PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
-}
-
-static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock,
- unsigned long flags)
-{
-   PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags);
-}
-
-static __always_inline int arch_spin_trylock(struct arch_spinlock *lock)
-{
-   return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
-}
-
-static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
-{
-   PVOP_VCALL1(pv_lock_ops.spin_unlock, lock);
+   PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
 }
 
 #endif
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 8e8b9a4..005e24d 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -327,13 +327,11 @@ struct pv_mmu_ops {
 };
 
 struct arch_spinlock;
+#include asm/spinlock_types.h
+
 struct pv_lock_ops {
-   int (*spin_is_locked)(struct arch_spinlock *lock);
-   int (*spin_is_contended)(struct arch_spinlock *lock);
-   void (*spin_lock)(struct arch_spinlock *lock);
-   void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long 
flags);
-   int (*spin_trylock)(struct arch_spinlock *lock);
-   void (*spin_unlock)(struct arch_spinlock *lock);
+   void (*lock_spinning)(struct arch_spinlock *lock, __ticket_t ticket);
+   void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
 };
 
 /* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 972c260..860fc4b 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -37,6 +37,32 @@
 # define UNLOCK_LOCK_PREFIX
 #endif
 
+/* How long a lock should spin before we consider blocking */
+#define SPIN_THRESHOLD (1  11)
+
+#ifndef CONFIG_PARAVIRT_SPINLOCKS
+
+static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, 
__ticket_t ticket)
+{
+}
+

[PATCH 05/10] xen/pvticketlock: Xen implementation for PV ticket locks

2011-09-14 Thread Jeremy Fitzhardinge
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com

Replace the old Xen implementation of PV spinlocks with an implementation
of xen_lock_spinning and xen_unlock_kick.

xen_lock_spinning simply registers the cpu in its entry in lock_waiting,
adds itself to the waiting_cpus set, and blocks on an event channel
until the channel becomes pending.

xen_unlock_kick searches the cpus in waiting_cpus looking for the one
which next wants this lock with the next ticket, if any.  If found,
it kicks it by making its event channel pending, which wakes it up.

We need to make sure interrupts are disabled while we're relying on the
contents of the per-cpu lock_waiting values, otherwise an interrupt
handler could come in, try to take some other lock, block, and overwrite
our values.

Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
---
 arch/x86/xen/spinlock.c |  287 +++
 1 files changed, 43 insertions(+), 244 deletions(-)

diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 23af06a..f6133c5 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -19,32 +19,21 @@
 #ifdef CONFIG_XEN_DEBUG_FS
 static struct xen_spinlock_stats
 {
-   u64 taken;
u32 taken_slow;
-   u32 taken_slow_nested;
u32 taken_slow_pickup;
u32 taken_slow_spurious;
-   u32 taken_slow_irqenable;
 
-   u64 released;
u32 released_slow;
u32 released_slow_kicked;
 
 #define HISTO_BUCKETS  30
-   u32 histo_spin_total[HISTO_BUCKETS+1];
-   u32 histo_spin_spinning[HISTO_BUCKETS+1];
u32 histo_spin_blocked[HISTO_BUCKETS+1];
 
-   u64 time_total;
-   u64 time_spinning;
u64 time_blocked;
 } spinlock_stats;
 
 static u8 zero_stats;
 
-static unsigned lock_timeout = 1  10;
-#define TIMEOUT lock_timeout
-
 static inline void check_zero(void)
 {
if (unlikely(zero_stats)) {
@@ -73,22 +62,6 @@ static void __spin_time_accum(u64 delta, u32 *array)
array[HISTO_BUCKETS]++;
 }
 
-static inline void spin_time_accum_spinning(u64 start)
-{
-   u32 delta = xen_clocksource_read() - start;
-
-   __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
-   spinlock_stats.time_spinning += delta;
-}
-
-static inline void spin_time_accum_total(u64 start)
-{
-   u32 delta = xen_clocksource_read() - start;
-
-   __spin_time_accum(delta, spinlock_stats.histo_spin_total);
-   spinlock_stats.time_total += delta;
-}
-
 static inline void spin_time_accum_blocked(u64 start)
 {
u32 delta = xen_clocksource_read() - start;
@@ -105,214 +78,84 @@ static inline u64 spin_time_start(void)
return 0;
 }
 
-static inline void spin_time_accum_total(u64 start)
-{
-}
-static inline void spin_time_accum_spinning(u64 start)
-{
-}
 static inline void spin_time_accum_blocked(u64 start)
 {
 }
 #endif  /* CONFIG_XEN_DEBUG_FS */
 
-struct xen_spinlock {
-   unsigned char lock; /* 0 - free; 1 - locked */
-   unsigned short spinners;/* count of waiting cpus */
+struct xen_lock_waiting {
+   struct arch_spinlock *lock;
+   __ticket_t want;
 };
 
 static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
+static cpumask_t waiting_cpus;
 
-#if 0
-static int xen_spin_is_locked(struct arch_spinlock *lock)
-{
-   struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
-   return xl-lock != 0;
-}
-
-static int xen_spin_is_contended(struct arch_spinlock *lock)
+static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 {
-   struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
-   /* Not strictly true; this is only the count of contended
-  lock-takers entering the slow path. */
-   return xl-spinners != 0;
-}
-
-static int xen_spin_trylock(struct arch_spinlock *lock)
-{
-   struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-   u8 old = 1;
-
-   asm(xchgb %b0,%1
-   : +q (old), +m (xl-lock) : : memory);
-
-   return old == 0;
-}
-
-static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
-
-/*
- * Mark a cpu as interested in a lock.  Returns the CPU's previous
- * lock of interest, in case we got preempted by an interrupt.
- */
-static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
-{
-   struct xen_spinlock *prev;
-
-   prev = __this_cpu_read(lock_spinners);
-   __this_cpu_write(lock_spinners, xl);
-
-   wmb();  /* set lock of interest before count */
-
-   asm(LOCK_PREFIX  incw %0
-   : +m (xl-spinners) : : memory);
-
-   return prev;
-}
-
-/*
- * Mark a cpu as no longer interested in a lock.  Restores previous
- * lock of interest (NULL for none).
- */
-static inline void unspinning_lock(struct xen_spinlock *xl, struct 
xen_spinlock *prev)
-{
-   asm(LOCK_PREFIX  decw %0
-   : +m (xl-spinners) : : memory);
-   wmb();   

[PATCH 08/10] x86/ticketlock: add slowpath logic

2011-09-14 Thread Jeremy Fitzhardinge
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com

Maintain a flag in the LSB of the ticket lock tail which indicates
whether anyone is in the lock slowpath and may need kicking when
the current holder unlocks.  The flags are set when the first locker
enters the slowpath, and cleared when unlocking to an empty queue (ie,
no contention).

In the specific implementation of lock_spinning(), make sure to set
the slowpath flags on the lock just before blocking.  We must do
this before the last-chance pickup test to prevent a deadlock
with the unlocker:

UnlockerLocker
test for lock pickup
- fail
unlock
test slowpath
- false
set slowpath flags
block

Whereas this works in any ordering:

UnlockerLocker
set slowpath flags
test for lock pickup
- fail
block
unlock
test slowpath
- true, kick

If the unlocker finds that the lock has the slowpath flag set but it is
actually uncontended (ie, head == tail, so nobody is waiting), then it
clears the slowpath flag.

Note on memory access ordering:
When unlocking a ticketlock with PV callbacks enabled, unlock
first adds to the lock head, then checks to see if the slowpath
flag is set in the lock tail.

However, because reads are not ordered with respect to writes in
different memory locations, the CPU could perform the read before
updating head to release the lock.

This would deadlock with another CPU in the lock slowpath, as it will
set the slowpath flag before checking to see if the lock has been
released in the interim.

A heavyweight fix would be to stick a full mfence between the two.
However, a lighter-weight fix is to simply make sure the flag test
loads both head and tail of the lock in a single operation, thereby
making sure that it overlaps with the memory written by the unlock,
forcing the CPU to maintain ordering.

Note: this code relies on gcc making sure that unlikely() code is out of
line of the fastpath, which only happens when OPTIMIZE_SIZE=n.  If it
doesn't, the generated code isn't too bad, but it's definitely suboptimal.

(Thanks to Srivatsa Vaddagiri for providing a bugfix to the original
version of this change, which has been folded in.)

Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
Signed-off-by: Srivatsa Vaddagiri va...@linux.vnet.ibm.com
---
 arch/x86/include/asm/paravirt.h   |2 +-
 arch/x86/include/asm/spinlock.h   |   92 ++--
 arch/x86/include/asm/spinlock_types.h |2 +
 arch/x86/kernel/paravirt-spinlocks.c  |1 +
 arch/x86/xen/spinlock.c   |4 ++
 5 files changed, 82 insertions(+), 19 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 50281c7..13b3d8b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -755,7 +755,7 @@ static __always_inline void __ticket_lock_spinning(struct 
arch_spinlock *lock, _
PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket);
 }
 
-static __always_inline void ticket_unlock_kick(struct arch_spinlock *lock, 
__ticket_t ticket)
+static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, 
__ticket_t ticket)
 {
PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
 }
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 40c90aa..c1f6981 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -40,29 +40,56 @@
 /* How long a lock should spin before we consider blocking */
 #define SPIN_THRESHOLD (1  11)
 
-#ifndef CONFIG_PARAVIRT_SPINLOCKS
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
 
-static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, 
__ticket_t ticket)
+/*
+ * Return true if someone is in the slowpath on this lock.  This
+ * should only be used by the current lock-holder.
+ */
+static inline bool __ticket_in_slowpath(arch_spinlock_t *lock)
 {
+   /*
+* This deliberately reads both head and tail as a single
+* memory operation, and then tests the flag in tail.  This is
+* to guarantee that this read is ordered after the add to
+* head which does the unlock.  If we were to only read tail
+* to test the flag, then the CPU would be free to reorder the
+* read to before the write to head (since it is a different
+* memory location), which could cause a deadlock with someone
+* setting the flag before re-checking the lock availability.
+*/
+   return ACCESS_ONCE(lock-head_tail)  (TICKET_SLOWPATH_FLAG  
TICKET_SHIFT);
 }
 
-static __always_inline void ticket_unlock_kick(struct arch_spinlock *lock, 
__ticket_t ticket)
+static inline void 

[PATCH 00/10] [PATCH RFC V2] Paravirtualized ticketlocks

2011-09-14 Thread Jeremy Fitzhardinge
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com

[ Changes since last posting:
  - fix bugs exposed by the cold light of testing
- make the slow flag read in unlock cover the whole lock
  to force ordering WRT the unlock write
- when kicking on unlock, only look for the CPU *we* released
  (ie, head value the unlock resulted in), rather than re-reading
  the new head and kicking on that basis
  - enable PV ticketlocks in Xen HVM guests
]

NOTE: this series is available in:
  git://github.com/jsgf/linux-xen.git upstream/pvticketlock-slowflag
and is based on the previously posted ticketlock cleanup series in
  git://github.com/jsgf/linux-xen.git upstream/ticketlock-cleanup

This series replaces the existing paravirtualized spinlock mechanism
with a paravirtualized ticketlock mechanism.

Ticket locks have an inherent problem in a virtualized case, because
the vCPUs are scheduled rather than running concurrently (ignoring
gang scheduled vCPUs).  This can result in catastrophic performance
collapses when the vCPU scheduler doesn't schedule the correct next
vCPU, and ends up scheduling a vCPU which burns its entire timeslice
spinning.  (Note that this is not the same problem as lock-holder
preemption, which this series also addresses; that's also a problem,
but not catastrophic).

(See Thomas Friebel's talk "Prevent Guests from Spinning Around",
http://www.xen.org/files/xensummitboston08/LHP.pdf, for more details.)

Currently we deal with this by having PV spinlocks, which adds a layer
of indirection in front of all the spinlock functions, and defining a
completely new implementation for Xen (and for other pvops users, but
there are none at present).

PV ticketlocks keep the existing ticketlock implementation
(fastpath) as-is, but add a couple of pvops for the slow paths:

- If a CPU has been waiting for a spinlock for SPIN_THRESHOLD
  iterations, then call out to the __ticket_lock_spinning() pvop,
  which allows a backend to block the vCPU rather than spinning.  This
  pvop can set the lock into slowpath state.

- When releasing a lock, if it is in slowpath state, then call
  __ticket_unlock_kick() to kick the next vCPU in line awake.  If the
  lock is no longer in contention, it also clears the slowpath flag.

The slowpath state is stored in the LSB of the lock's tail
ticket.  This has the effect of reducing the max number of CPUs by
half (so, a small ticket can deal with 128 CPUs, and a large ticket
32768).

This series provides a Xen implementation, but it should be
straightforward to add a KVM implementation as well.

Overall, it results in a large reduction in code, it makes the native
and virtualized cases closer, and it removes a layer of indirection
around all the spinlock functions.

The fast path (taking an uncontended lock which isn't in slowpath
state) is optimal, identical to the non-paravirtualized case.

The inner part of ticket lock code becomes:
inc = xadd(&lock->tickets, inc);
inc.tail &= ~TICKET_SLOWPATH_FLAG;

if (likely(inc.head == inc.tail))
goto out;

for (;;) {
unsigned count = SPIN_THRESHOLD;

do {
if (ACCESS_ONCE(lock->tickets.head) == inc.tail)
goto out;
cpu_relax();
} while (--count);
__ticket_lock_spinning(lock, inc.tail);
}
out:barrier();

which results in:
push   %rbp
mov%rsp,%rbp

mov$0x200,%eax
lock xadd %ax,(%rdi)
movzbl %ah,%edx
cmp%al,%dl
jne1f

pop%rbp
retq   

### SLOWPATH START
1:  and$-2,%edx
movzbl %dl,%esi

2:  mov$0x800,%eax
jmp4f

3:  pause  
sub$0x1,%eax
je 5f

4:  movzbl (%rdi),%ecx
cmp%cl,%dl
jne3b

pop%rbp
retq   

5:  callq  *__ticket_lock_spinning
jmp2b
### SLOWPATH END

with CONFIG_PARAVIRT_SPINLOCKS=n, the code has changed slightly, where
the fastpath case is straight through (taking the lock without
contention), and the spin loop is out of line:

push   %rbp
mov%rsp,%rbp

mov$0x100,%eax
lock xadd %ax,(%rdi)
movzbl %ah,%edx
cmp%al,%dl
jne1f

pop%rbp
retq   

### SLOWPATH START
1:  pause  
movzbl (%rdi),%eax
cmp%dl,%al
jne1b

pop%rbp
retq   
### SLOWPATH END

The unlock code is very straightforward:
prev = *lock;
__ticket_unlock_release(lock);
if (unlikely(__ticket_in_slowpath(lock)))
__ticket_unlock_slowpath(lock, prev);

which generates:
push   %rbp
mov%rsp,%rbp

movzwl (%rdi),%esi
addb   $0x2,(%rdi)
movzwl (%rdi),%eax
testb  $0x1,%ah

[PATCH 10/10] xen: enable PV ticketlocks on HVM Xen

2011-09-14 Thread Jeremy Fitzhardinge
From: Stefano Stabellini stefano.stabell...@eu.citrix.com

Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
---
 arch/x86/xen/smp.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index e79dbb9..bf958ce 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -552,4 +552,5 @@ void __init xen_hvm_smp_init(void)
smp_ops.cpu_die = xen_hvm_cpu_die;
smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
smp_ops.send_call_func_single_ipi = 
xen_smp_send_call_function_single_ipi;
+   xen_init_spinlocks();
 }
-- 
1.7.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 09/10] xen/pvticketlock: allow interrupts to be enabled while blocking

2011-09-14 Thread Jeremy Fitzhardinge
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com

If interrupts were enabled when taking the spinlock, we can leave them
enabled while blocking to get the lock.

If we can enable interrupts while waiting for the lock to become
available, and we take an interrupt before entering the poll,
and the handler takes a spinlock which ends up going into
the slow state (invalidating the per-cpu lock and want values),
then when the interrupt handler returns the event channel will
remain pending so the poll will return immediately, causing it to
return out to the main spinlock loop.
Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
---
 arch/x86/xen/spinlock.c |   48 --
 1 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index c939723..7366b39 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -106,11 +106,28 @@ static void xen_lock_spinning(struct arch_spinlock *lock, 
__ticket_t want)
 
start = spin_time_start();
 
-   /* Make sure interrupts are disabled to ensure that these
-  per-cpu values are not overwritten. */
+   /*
+* Make sure an interrupt handler can't upset things in a
+* partially setup state.
+*/
local_irq_save(flags);
 
+   /*
+* We don't really care if we're overwriting some other
+* (lock,want) pair, as that would mean that we're currently
+* in an interrupt context, and the outer context had
+* interrupts enabled.  That has already kicked the VCPU out
+* of xen_poll_irq(), so it will just return spuriously and
+* retry with newly setup (lock,want).
+*
+* The ordering protocol on this is that the lock pointer
+* may only be set non-NULL if the want ticket is correct.
+* If we're updating want, we must first clear lock.
+*/
+   w-lock = NULL;
+   smp_wmb();
w-want = want;
+   smp_wmb();
w-lock = lock;
 
/* This uses set_bit, which atomic and therefore a barrier */
@@ -124,21 +141,36 @@ static void xen_lock_spinning(struct arch_spinlock *lock, 
__ticket_t want)
/* Only check lock once pending cleared */
barrier();
 
-   /* Mark entry to slowpath before doing the pickup test to make
-  sure we don't deadlock with an unlocker. */
+   /*
+* Mark entry to slowpath before doing the pickup test to make
+* sure we don't deadlock with an unlocker.
+*/
__ticket_enter_slowpath(lock);
 
-   /* check again make sure it didn't become free while
-  we weren't looking  */
+   /*
+* check again make sure it didn't become free while
+* we weren't looking 
+*/
if (ACCESS_ONCE(lock-tickets.head) == want) {
ADD_STATS(taken_slow_pickup, 1);
goto out;
}
 
+   /* Allow interrupts while blocked */
+   local_irq_restore(flags);
+
+   /*
+* If an interrupt happens here, it will leave the wakeup irq
+* pending, which will cause xen_poll_irq() to return
+* immediately.
+*/
+
/* Block until irq becomes pending (or perhaps a spurious wakeup) */
xen_poll_irq(irq);
ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
 
+   local_irq_save(flags);
+
kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
 
 out:
@@ -160,7 +192,9 @@ static void xen_unlock_kick(struct arch_spinlock *lock, 
__ticket_t next)
for_each_cpu(cpu, waiting_cpus) {
const struct xen_lock_waiting *w = per_cpu(lock_waiting, cpu);
 
-   if (w-lock == lock  w-want == next) {
+   /* Make sure we read lock before want */
+   if (ACCESS_ONCE(w-lock) == lock 
+   ACCESS_ONCE(w-want) == next) {
ADD_STATS(released_slow_kicked, 1);
xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
break;
-- 
1.7.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03/10] x86/ticketlock: don't inline _spin_unlock when using paravirt spinlocks

2011-09-14 Thread Jeremy Fitzhardinge
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com

The code size expands somewhat, and it's probably better to just call
a function rather than inline it.

Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
---
 arch/x86/Kconfig |3 +++
 kernel/Kconfig.locks |2 +-
 2 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6a47bb2..1f03f82 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -585,6 +585,9 @@ config PARAVIRT_SPINLOCKS
 
  If you are unsure how to answer this question, answer N.
 
+config ARCH_NOINLINE_SPIN_UNLOCK
+   def_bool PARAVIRT_SPINLOCKS
+
 config PARAVIRT_CLOCK
bool
 
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 5068e2a..584637b 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -125,7 +125,7 @@ config INLINE_SPIN_LOCK_IRQSAVE
 ARCH_INLINE_SPIN_LOCK_IRQSAVE
 
 config INLINE_SPIN_UNLOCK
-   def_bool !DEBUG_SPINLOCK  (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
+   def_bool !DEBUG_SPINLOCK  (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)  
!ARCH_NOINLINE_SPIN_UNLOCK
 
 config INLINE_SPIN_UNLOCK_BH
def_bool !DEBUG_SPINLOCK  ARCH_INLINE_SPIN_UNLOCK_BH
-- 
1.7.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 06/10] x86/pvticketlock: use callee-save for lock_spinning

2011-09-14 Thread Jeremy Fitzhardinge
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com

Although the lock_spinning calls in the spinlock code are on the
uncommon path, their presence can cause the compiler to generate many
more register save/restores in the function pre/postamble, which is in
the fast path.  To avoid this, convert it to using the pvops callee-save
calling convention, which defers all the save/restores until the actual
function is called, keeping the fastpath clean.

Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
---
 arch/x86/include/asm/paravirt.h   |2 +-
 arch/x86/include/asm/paravirt_types.h |2 +-
 arch/x86/kernel/paravirt-spinlocks.c  |2 +-
 arch/x86/xen/spinlock.c   |3 ++-
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 76cae7a..50281c7 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -752,7 +752,7 @@ static inline void __set_fixmap(unsigned /* enum 
fixed_addresses */ idx,
 
 static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, 
__ticket_t ticket)
 {
-   PVOP_VCALL2(pv_lock_ops.lock_spinning, lock, ticket);
+   PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket);
 }
 
 static __always_inline void ticket_unlock_kick(struct arch_spinlock *lock, 
__ticket_t ticket)
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 005e24d..5e0c138 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -330,7 +330,7 @@ struct arch_spinlock;
 #include asm/spinlock_types.h
 
 struct pv_lock_ops {
-   void (*lock_spinning)(struct arch_spinlock *lock, __ticket_t ticket);
+   struct paravirt_callee_save lock_spinning;
void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
 };
 
diff --git a/arch/x86/kernel/paravirt-spinlocks.c 
b/arch/x86/kernel/paravirt-spinlocks.c
index c2e010e..4251c1d 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -9,7 +9,7 @@
 
 struct pv_lock_ops pv_lock_ops = {
 #ifdef CONFIG_SMP
-   .lock_spinning = paravirt_nop,
+   .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop),
.unlock_kick = paravirt_nop,
 #endif
 };
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index f6133c5..7a04950 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -145,6 +145,7 @@ out:
 
spin_time_accum_blocked(start);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning);
 
 static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
 {
@@ -197,7 +198,7 @@ void xen_uninit_lock_cpu(int cpu)
 
 void __init xen_init_spinlocks(void)
 {
-   pv_lock_ops.lock_spinning = xen_lock_spinning;
+   pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
pv_lock_ops.unlock_kick = xen_unlock_kick;
 }
 
-- 
1.7.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 07/10] x86/ticketlocks: when paravirtualizing ticket locks, increment by 2

2011-09-14 Thread Jeremy Fitzhardinge
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com

Increment ticket head/tails by 2 rather than 1 to leave the LSB free
to store an "is in slowpath state" bit.  This halves the number
of possible CPUs for a given ticket size, but this shouldn't matter
in practice - kernels built for 32k+ CPU systems are probably
specially built for the hardware rather than a generic distro
kernel.

Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
---
 arch/x86/include/asm/spinlock.h   |   16 
 arch/x86/include/asm/spinlock_types.h |   10 +-
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 98fe202..40c90aa 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -78,7 +78,7 @@ static __always_inline void __ticket_unlock_kick(struct 
arch_spinlock *lock, __t
  */
 static __always_inline void arch_spin_lock(struct arch_spinlock *lock)
 {
-   register struct __raw_tickets inc = { .tail = 1 };
+   register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC };
 
inc = xadd(lock-tickets, inc);
 
@@ -104,7 +104,7 @@ static __always_inline int 
arch_spin_trylock(arch_spinlock_t *lock)
if (old.tickets.head != old.tickets.tail)
return 0;
 
-   new.head_tail = old.head_tail + (1  TICKET_SHIFT);
+   new.head_tail = old.head_tail + (TICKET_LOCK_INC  TICKET_SHIFT);
 
/* cmpxchg is a full barrier, so nothing can move before it */
return cmpxchg(lock-head_tail, old.head_tail, new.head_tail) == 
old.head_tail;
@@ -113,24 +113,24 @@ static __always_inline int 
arch_spin_trylock(arch_spinlock_t *lock)
 #if (NR_CPUS  256)
 static __always_inline void __ticket_unlock_release(arch_spinlock_t *lock)
 {
-   asm volatile(UNLOCK_LOCK_PREFIX incb %0
+   asm volatile(UNLOCK_LOCK_PREFIX addb %1, %0
 : +m (lock-head_tail)
-:
+: i (TICKET_LOCK_INC)
 : memory, cc);
 }
 #else
 static __always_inline void __ticket_unlock_release(arch_spinlock_t *lock)
 {
-   asm volatile(UNLOCK_LOCK_PREFIX incw %0
+   asm volatile(UNLOCK_LOCK_PREFIX addw %1, %0
 : +m (lock-head_tail)
-:
+: i (TICKET_LOCK_INC)
 : memory, cc);
 }
 #endif
 
 static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
-   __ticket_t next = lock-tickets.head + 1;
+   __ticket_t next = lock-tickets.head + TICKET_LOCK_INC;
 
__ticket_unlock_release(lock);
__ticket_unlock_kick(lock, next);
@@ -147,7 +147,7 @@ static inline int arch_spin_is_contended(arch_spinlock_t 
*lock)
 {
struct __raw_tickets tmp = ACCESS_ONCE(lock-tickets);
 
-   return ((tmp.tail - tmp.head)  TICKET_MASK)  1;
+   return ((tmp.tail - tmp.head)  TICKET_MASK)  TICKET_LOCK_INC;
 }
 #define arch_spin_is_contended arch_spin_is_contended
 
diff --git a/arch/x86/include/asm/spinlock_types.h 
b/arch/x86/include/asm/spinlock_types.h
index dbe223d..aa9a205 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -3,7 +3,13 @@
 
 #include linux/types.h
 
-#if (CONFIG_NR_CPUS  256)
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define __TICKET_LOCK_INC  2
+#else
+#define __TICKET_LOCK_INC  1
+#endif
+
+#if (CONFIG_NR_CPUS  (256 / __TICKET_LOCK_INC))
 typedef u8  __ticket_t;
 typedef u16 __ticketpair_t;
 #else
@@ -11,6 +17,8 @@ typedef u16 __ticket_t;
 typedef u32 __ticketpair_t;
 #endif
 
+#define TICKET_LOCK_INC((__ticket_t)__TICKET_LOCK_INC)
+
 #define TICKET_SHIFT   (sizeof(__ticket_t) * 8)
 #define TICKET_MASK((__ticket_t)((1  TICKET_SHIFT) - 1))
 
-- 
1.7.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] pci: clean all funcs when hot-removing multifunc device

2011-09-14 Thread Kevin O'Connor
On Wed, Sep 14, 2011 at 07:45:59AM -0400, Amos Kong wrote:
 The size of bios.bin compiled from seabios
 original: 128K
 only apply patch1:  256K
 only apply patch2:  128K
 
 patch1: add 6 slot(only slot6 has 8 funcs) to the table
 can hotplug/hot-remove a multifunc device to slot 6 successfully
 
 patch2: add 31 slot(with 8 funcs) to the table
 could not boot up guest.
 I found there is a special process for large bios.bin in qemu,
 problem maybe exist here, I'm driving into it...
 
 qemu/hw/pc.c:
 void pc_memory_init(...
 
 /* map the last 128KB of the BIOS in ISA space */
 isa_bios_size = bios_size;
 if (isa_bios_size  (128 * 1024))
 isa_bios_size = 128 * 1024;

This is probably a regression since seabios commit 87b533bf.  Prior to
that commit, seabios did not mark the early 32bit initialization code
as init code.  However, a side effect of marking that code
(handle_post) as init code is that it is more likely the linker could
place the code at an address less than 0xe0000.

I'm guessing the patch below (just a hack) would cover up the issue.

-Kevin


--- a/src/post.c
+++ b/src/post.c
@@ -336,7 +336,7 @@ reloc_init(void)
 // Start of Power On Self Test (POST) - the BIOS initilization phase.
 // This function does the setup needed for code relocation, and then
 // invokes the relocation and main setup code.
-void VISIBLE32INIT
+void VISIBLE32FLAT
 handle_post(void)
 {
 debug_serial_setup();
@@ -356,6 +356,14 @@ handle_post(void)
 
 // Allow writes to modify bios area (0xf)
 make_bios_writable();
+
+void handle_post2(void);
+handle_post2();
+}
+
+void VISIBLE32INIT
+handle_post2(void)
+{
 HaveRunPost = 1;
 
 // Detect ram and setup internal malloc.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 03/11] KVM: x86: retry non-page-table writing instruction

2011-09-14 Thread Xiao Guangrong
On 09/14/2011 06:19 PM, Xiao Guangrong wrote:
 On 09/14/2011 05:53 PM, Avi Kivity wrote:
 On 09/13/2011 09:24 PM, Xiao Guangrong wrote:

  +static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
  +  unsigned long cr2,  int emulation_type)
  +{
  +if (!vcpu-arch.mmu.direct_map   !mmu_is_nested(vcpu))
  +gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);

  If mmu_is_nested() cr2 is an ngpa, we have to translate it to a gpa, no?


 Yeah, will fix it.

 And this bug also exists in the current code: it always uses L2 gpa to 
 emulate
 write operation.

 Can you please send this fix separately, so it can be backported if needed?

 
 Sure, i will do it as soon as possible. :-)

I am so sorry, the current code is good, it has already translated L2 gpa to
L1 gpa:

vcpu-arch.nested_mmu.translate_gpa = translate_nested_gpa;

Please ignore it.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] kvm tools: Use host's resolv.conf within the guest

2011-09-14 Thread Pekka Enberg
On Wed, Sep 14, 2011 at 7:28 PM, Sasha Levin levinsasha...@gmail.com wrote:
 Since kernel IP autoconfiguration doesn't set up /etc/resolv.conf, we'll
 use the one located within the host, since this was anyway what we simulated
 within the DHCP offer packets.

 Signed-off-by: Sasha Levin levinsasha...@gmail.com

Wouldn't a symlink to /host/etc/resolv.conf be more appropriate?
Remember, we're supposed to only need to setup the shared rootfs once.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kgdb hooks and kvm-tool

2011-09-14 Thread Pekka Enberg
On Thu, Sep 15, 2011 at 2:17 AM, David Evensky
even...@dancer.ca.sandia.gov wrote:
 Hi. Is it possible to use kvm-tool with a kernel compiled with kgdb?
 I've tried adding 'kgdbwait kgdboc=ttyS0' to -p, but that doesn't seem
 to work.

I've never tried kgdb myself but I'm rather surprised it doesn't just
work. Sasha, Cyrill, Asias, have you guys ever tried kvmtool with
kgdb?

Pekka
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] kvm tools: Use host's resolv.conf within the guest

2011-09-14 Thread Sasha Levin
On Thu, 2011-09-15 at 08:29 +0300, Pekka Enberg wrote:
 On Wed, Sep 14, 2011 at 7:28 PM, Sasha Levin levinsasha...@gmail.com wrote:
  Since kernel IP autoconfiguration doesn't set up /etc/resolv.conf, we'll
  use the one located within the host, since this was anyway what we simulated
  within the DHCP offer packets.
 
  Signed-off-by: Sasha Levin levinsasha...@gmail.com
 
 Wouldn't a symlink to /host/etc/resolv.conf be more appropriate?
 Remember, we're supposed to only need to setup the shared rootfs once.

It would mean the guest can screw up with the host's networking.

-- 

Sasha.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kgdb hooks and kvm-tool

2011-09-14 Thread Sasha Levin
On Thu, 2011-09-15 at 08:32 +0300, Pekka Enberg wrote:
 On Thu, Sep 15, 2011 at 2:17 AM, David Evensky
 even...@dancer.ca.sandia.gov wrote:
  Hi. Is it possible to use kvm-tool with a kernel compiled with kgdb?
  I've tried adding 'kgdbwait kgdboc=ttyS0' to -p, but that doesn't seem
  to work.
 
 I've never tried kgdb myself but I'm rather surprised it doesn't just
 work. Sasha, Cyrill, Asias, have you guys ever tried kvmtool with
 kgdb?

You can use 'kgdboc=kbd' to use it over the keyboard. I also have
a patch which uses forktty() to spawn serial consoles and redirect guest
tty's into them, but it's somewhat ugly.

Give me a day or two to make it nicer and I'll send it over.

-- 

Sasha.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] kvm tools: Use host's resolv.conf within the guest

2011-09-14 Thread Pekka Enberg

On 9/15/11 8:36 AM, Sasha Levin wrote:

On Thu, 2011-09-15 at 08:29 +0300, Pekka Enberg wrote:

On Wed, Sep 14, 2011 at 7:28 PM, Sasha Levinlevinsasha...@gmail.com  wrote:

Since kernel IP autoconfiguration doesn't set up /etc/resolv.conf, we'll
use the one located within the host, since this was anyway what we simulated
within the DHCP offer packets.

Signed-off-by: Sasha Levinlevinsasha...@gmail.com


Wouldn't a symlink to /host/etc/resolv.conf be more appropriate?
Remember, we're supposed to only need to setup the shared rootfs once.


It would mean the guest can screw up with the host's networking.


How? You're not supposed to run the tool.

Pekka
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/3] KVM: PPC: HIOR and sregs fixup

2011-09-14 Thread Alexander Graf
While working on the HIOR setting which already made it into Avi's tree, I
was too incautious and ended up extending the sregs structure, breaking ABI
compatibility with all QEMU versions.

So the approach I was taking there was obviously wrong. Instead, what I thought
might be a better alternative is to get rid of the static "we have a struct
full of registers and shove it left and right" and instead just poke registers
directly between kernel and user space. That sounds slow for starters, but once
we have the infrastructure in place, we can build a batched version of the same
interface and be fast again but maintain flexibility.

This interface can also for example be used to easily fetch the next great
extension of SSE registers or some MSRs that we haven't thought of or lots
of PPC registers I haven't even heard of so far :). There always seem to be
new ones to learn of out there.

Please take a look at the interface and comment on whether you like it this
way or not. It's currently only implemented for the PPC target, but is held
generically, so everyone can use it.

Oh and - it obviously implements HIOR again which we have to drop from sregs
due to the ABI breakage.

Alex

Alexander Graf (3):
  Revert "KVM: PPC: Add support for explicit HIOR setting"
  KVM: PPC: Add generic single register ioctls
  KVM: PPC: Add support for explicit HIOR setting

 Documentation/virtual/kvm/api.txt |   48 
 arch/powerpc/include/asm/kvm.h|   10 +
 arch/powerpc/include/asm/kvm_book3s.h |2 +-
 arch/powerpc/kvm/book3s_pr.c  |   12 +-
 arch/powerpc/kvm/powerpc.c|   64 +
 include/linux/kvm.h   |   32 
 6 files changed, 149 insertions(+), 19 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] Revert "KVM: PPC: Add support for explicit HIOR setting"

2011-09-14 Thread Alexander Graf
This reverts commit 11d7596e18a712dc3bc29d45662ec111fd65946b. It exceeded
the padding on the SREGS struct, rendering the ABI backwards-incompatible.

Signed-off-by: Alexander Graf ag...@suse.de
---
 arch/powerpc/include/asm/kvm.h|8 
 arch/powerpc/include/asm/kvm_book3s.h |2 --
 arch/powerpc/kvm/book3s_pr.c  |   14 ++
 arch/powerpc/kvm/powerpc.c|1 -
 include/linux/kvm.h   |1 -
 5 files changed, 2 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index 71684b9..a635e22 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -149,12 +149,6 @@ struct kvm_regs {
 #define KVM_SREGS_E_UPDATE_DBSR(1  3)
 
 /*
- * Book3S special bits to indicate contents in the struct by maintaining
- * backwards compatibility with older structs. If adding a new field,
- * please make sure to add a flag for that new field */
-#define KVM_SREGS_S_HIOR   (1  0)
-
-/*
  * In KVM_SET_SREGS, reserved/pad fields must be left untouched from a
  * previous KVM_GET_REGS.
  *
@@ -179,8 +173,6 @@ struct kvm_sregs {
__u64 ibat[8]; 
__u64 dbat[8]; 
} ppc32;
-   __u64 flags; /* KVM_SREGS_S_ */
-   __u64 hior;
} s;
struct {
union {
diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index a384ffd..d4df013 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -90,8 +90,6 @@ struct kvmppc_vcpu_book3s {
 #endif
int context_id[SID_CONTEXTS];
 
-   bool hior_sregs;/* HIOR is set by SREGS, not PVR */
-
struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index d417511..84505a2 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -150,16 +150,14 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
 #ifdef CONFIG_PPC_BOOK3S_64
if ((pvr = 0x33)  (pvr  0x7033)) {
kvmppc_mmu_book3s_64_init(vcpu);
-   if (!to_book3s(vcpu)-hior_sregs)
-   to_book3s(vcpu)-hior = 0xfff0;
+   to_book3s(vcpu)-hior = 0xfff0;
to_book3s(vcpu)-msr_mask = 0xULL;
vcpu-arch.cpu_type = KVM_CPU_3S_64;
} else
 #endif
{
kvmppc_mmu_book3s_32_init(vcpu);
-   if (!to_book3s(vcpu)-hior_sregs)
-   to_book3s(vcpu)-hior = 0;
+   to_book3s(vcpu)-hior = 0;
to_book3s(vcpu)-msr_mask = 0xULL;
vcpu-arch.cpu_type = KVM_CPU_3S_32;
}
@@ -796,9 +794,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
}
}
 
-   if (sregs-u.s.flags  KVM_SREGS_S_HIOR)
-   sregs-u.s.hior = to_book3s(vcpu)-hior;
-
return 0;
 }
 
@@ -835,11 +830,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
/* Flush the MMU after messing with the segments */
kvmppc_mmu_pte_flush(vcpu, 0, 0);
 
-   if (sregs-u.s.flags  KVM_SREGS_S_HIOR) {
-   to_book3s(vcpu)-hior_sregs = true;
-   to_book3s(vcpu)-hior = sregs-u.s.hior;
-   }
-
return 0;
 }
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 55b4233..e75c5ac 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -209,7 +209,6 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PPC_BOOKE_SREGS:
 #else
case KVM_CAP_PPC_SEGSTATE:
-   case KVM_CAP_PPC_HIOR:
case KVM_CAP_PPC_PAPR:
 #endif
case KVM_CAP_PPC_UNSET_IRQ:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 06ef37d..fe57d2b 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -554,7 +554,6 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_SMT 64
 #define KVM_CAP_PPC_RMA65
 #define KVM_CAP_MAX_VCPUS 66   /* returns max vcpus per vm */
-#define KVM_CAP_PPC_HIOR 67
 #define KVM_CAP_PPC_PAPR 68
 #define KVM_CAP_SW_TLB 69
 
-- 
1.6.0.2

--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] KVM: PPC: Add support for explicit HIOR setting

2011-09-14 Thread Alexander Graf
Until now, we always set HIOR based on the PVR, but this is just wrong.
Instead, we should be setting HIOR explicitly, so user space can decide
what the initial HIOR value is - just like on real hardware.

We keep the old PVR based way around for backwards compatibility, but
once user space uses the SET_ONE_REG based method, we drop the PVR logic.

Signed-off-by: Alexander Graf ag...@suse.de
---
 Documentation/virtual/kvm/api.txt |1 +
 arch/powerpc/include/asm/kvm.h|2 ++
 arch/powerpc/include/asm/kvm_book3s.h |2 ++
 arch/powerpc/kvm/book3s_pr.c  |6 --
 arch/powerpc/kvm/powerpc.c|   14 ++
 include/linux/kvm.h   |1 +
 6 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 5a8f305..eb03179 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1512,6 +1512,7 @@ track of the implemented registers, find a list below:
 
   Arch  |   Register| Width (bits)
 |   |
+  PPC   | KVM_ONE_REG_PPC_HIOR  | 64
 
 4.65 KVM_GET_ONE_REG
 
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index a635e22..53b8759 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -327,4 +327,6 @@ struct kvm_book3e_206_tlb_params {
__u32 reserved[8];
 };
 
+#define KVM_ONE_REG_PPC_HIOR   KVM_ONE_REG_PPC | 0x100
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index d4df013..0ba8ba9 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -90,6 +90,8 @@ struct kvmppc_vcpu_book3s {
 #endif
int context_id[SID_CONTEXTS];
 
+   bool hior_explicit; /* HIOR is set by ioctl, not PVR */
+
struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 84505a2..565af5a 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -150,14 +150,16 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
 #ifdef CONFIG_PPC_BOOK3S_64
if ((pvr = 0x33)  (pvr  0x7033)) {
kvmppc_mmu_book3s_64_init(vcpu);
-   to_book3s(vcpu)-hior = 0xfff0;
+   if (!to_book3s(vcpu)-hior_explicit)
+   to_book3s(vcpu)-hior = 0xfff0;
to_book3s(vcpu)-msr_mask = 0xULL;
vcpu-arch.cpu_type = KVM_CPU_3S_64;
} else
 #endif
{
kvmppc_mmu_book3s_32_init(vcpu);
-   to_book3s(vcpu)-hior = 0;
+   if (!to_book3s(vcpu)-hior_explicit)
+   to_book3s(vcpu)-hior = 0;
to_book3s(vcpu)-msr_mask = 0xULL;
vcpu-arch.cpu_type = KVM_CPU_3S_32;
}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 39cdb3f..c33f6a7 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -209,6 +209,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PPC_BOOKE_SREGS:
 #else
case KVM_CAP_PPC_SEGSTATE:
+   case KVM_CAP_PPC_HIOR:
case KVM_CAP_PPC_PAPR:
 #endif
case KVM_CAP_PPC_UNSET_IRQ:
@@ -634,6 +635,12 @@ static int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu 
*vcpu,
int r = -EINVAL;
 
switch (reg-id) {
+#ifdef CONFIG_PPC_BOOK3S
+   case KVM_ONE_REG_PPC_HIOR:
+   reg-u.reg64 = to_book3s(vcpu)-hior;
+   r = 0;
+   break;
+#endif
default:
break;
}
@@ -647,6 +654,13 @@ static int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu 
*vcpu,
int r = -EINVAL;
 
switch (reg-id) {
+#ifdef CONFIG_PPC_BOOK3S
+   case KVM_ONE_REG_PPC_HIOR:
+   to_book3s(vcpu)-hior = reg-u.reg64;
+   to_book3s(vcpu)-hior_explicit = true;
+   r = 0;
+   break;
+#endif
default:
break;
}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 762959a..cc6c2fb 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_SMT 64
 #define KVM_CAP_PPC_RMA65
 #define KVM_CAP_MAX_VCPUS 66   /* returns max vcpus per vm */
+#define KVM_CAP_PPC_HIOR 67
 #define KVM_CAP_PPC_PAPR 68
 #define KVM_CAP_SW_TLB 69
 #define KVM_CAP_ONE_REG 70
-- 
1.6.0.2

--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html