Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking
On 09/13/2011 10:21 PM, Don Zickus wrote: Or are you saying an NMI in an idle system will have the same %rip thus falsely detecting a back-to-back NMI? That's easy to avoid - insert an instruction zeroing the last nmi_rip somewhere before or after hlt. It's always okay to execute such an instruction (outside the nmi handler itself), since nmi_rip is meant to detect a no instructions executed condition. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] kvm tools: Use kernel dhcp for network autoconfiguration
This patch removes the manual/usermode dhcp client configuration and instead uses the DHCP client built within the kernel. Since this client is tightly integrated with NFS (if NFS config is set), we will add a specific NFS root addr in our DHCP offer to point it to a non existent address so that we won't hang trying to poke it for our root. Signed-off-by: Sasha Levin levinsasha...@gmail.com --- tools/kvm/builtin-run.c |2 +- tools/kvm/guest/init.c |4 tools/kvm/guest/setnet.sh | 22 -- tools/kvm/include/kvm/uip.h |2 ++ tools/kvm/net/uip/dhcp.c|8 5 files changed, 11 insertions(+), 27 deletions(-) delete mode 100755 tools/kvm/guest/setnet.sh diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c index 591fd77..465bbe7 100644 --- a/tools/kvm/builtin-run.c +++ b/tools/kvm/builtin-run.c @@ -764,7 +764,7 @@ int kvm_cmd_run(int argc, const char **argv, const char *prefix) if (using_rootfs) { strcat(real_cmdline, root=/dev/root rw rootflags=rw,trans=virtio,version=9p2000.L rootfstype=9p); if (custom_rootfs) - strcat(real_cmdline, init=/virt/init); + strcat(real_cmdline, init=/virt/init ip=dhcp); } else if (!strstr(real_cmdline, root=)) { strlcat(real_cmdline, root=/dev/vda rw , sizeof(real_cmdline)); } diff --git a/tools/kvm/guest/init.c b/tools/kvm/guest/init.c index 7733026..837acfb 100644 --- a/tools/kvm/guest/init.c +++ b/tools/kvm/guest/init.c @@ -30,10 +30,6 @@ int main(int argc, char *argv[]) do_mounts(); - puts(Setting up network...); - - system(/bin/sh virt/setnet.sh); - puts(Starting '/bin/sh'...); run_process(/bin/sh); diff --git a/tools/kvm/guest/setnet.sh b/tools/kvm/guest/setnet.sh deleted file mode 100755 index 3da9c22..000 --- a/tools/kvm/guest/setnet.sh +++ /dev/null @@ -1,22 +0,0 @@ -for f in /sys/class/net/*; do - type=`cat $f/type` - if [ $type -eq 1 ]; then - f=${f#/sys/class/net/} - - eval dhcpcd -A $f 2 /dev/null - if [ $? -eq 0 ]; then - exit - fi - - eval dhclient $f 2 /dev/null - if [ $? 
-eq 0 ]; then - exit - fi - - ifconfig $f 192.168.33.15 - route add default 192.168.33.1 - echo nameserver 8.8.8.8 /etc/resolv.conf - - exit - fi -done diff --git a/tools/kvm/include/kvm/uip.h b/tools/kvm/include/kvm/uip.h index 344ec09..3501d36 100644 --- a/tools/kvm/include/kvm/uip.h +++ b/tools/kvm/include/kvm/uip.h @@ -58,6 +58,8 @@ #define UIP_DHCP_TAG_SUBMASK_LEN 4 #define UIP_DHCP_TAG_ROUTER3 #define UIP_DHCP_TAG_ROUTER_LEN4 +#define UIP_DHCP_TAG_ROOT 17 +#define UIP_DHCP_TAG_ROOT_LEN 4 #define UIP_DHCP_TAG_DNS_SERVER6 #define UIP_DHCP_TAG_DNS_SERVER_LEN4 #define UIP_DHCP_TAG_DOMAIN_NAME 15 diff --git a/tools/kvm/net/uip/dhcp.c b/tools/kvm/net/uip/dhcp.c index bd3c53b..e91a7c7 100644 --- a/tools/kvm/net/uip/dhcp.c +++ b/tools/kvm/net/uip/dhcp.c @@ -2,6 +2,8 @@ #include arpa/inet.h +#define EMPTY_ADDR 0.0.0.0 + static inline bool uip_dhcp_is_discovery(struct uip_dhcp *dhcp) { return (dhcp-option[2] == UIP_DHCP_DISCOVER @@ -127,6 +129,12 @@ static int uip_dhcp_fill_option(struct uip_info *info, struct uip_dhcp *dhcp, in *addr = htonl(info-host_ip); i += UIP_DHCP_TAG_ROUTER_LEN; + opt[i++]= UIP_DHCP_TAG_ROOT; + opt[i++]= strlen(EMPTY_ADDR); + addr= (u32 *)opt[i]; + strncpy((void *) addr, EMPTY_ADDR, strlen(EMPTY_ADDR)); + i += strlen(EMPTY_ADDR); + i = uip_dhcp_fill_option_name_and_server(info, opt, i); opt[i++]= UIP_DHCP_TAG_END; -- 1.7.6.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] kvm tools: Don't use i8042 AUX port
We currently don't have sufficient support for the mouse; this patch disables it by default to prevent the delay when booting. It should be removed once sufficient mouse support is added. Signed-off-by: Sasha Levin levinsasha...@gmail.com --- tools/kvm/builtin-run.c |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c index 465bbe7..2795115 100644 --- a/tools/kvm/builtin-run.c +++ b/tools/kvm/builtin-run.c @@ -736,7 +736,8 @@ int kvm_cmd_run(int argc, const char **argv, const char *prefix) vidmode = 0; memset(real_cmdline, 0, sizeof(real_cmdline)); - strcpy(real_cmdline, notsc noapic noacpi pci=conf1 reboot=k panic=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1); + strcpy(real_cmdline, notsc noapic noacpi pci=conf1 reboot=k panic=1 i8042.direct=1 + i8042.dumbkbd=1 i8042.nopnp=1 i8042.noaux=1); if (vnc || sdl) { strcat(real_cmdline, video=vesafb console=tty0); } else -- 1.7.6.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] kvm tools: Don't use i8042 AUX port
On Wed, Sep 14, 2011 at 10:11 AM, Sasha Levin levinsasha...@gmail.com wrote: We currently don't have sufficient support for mouse, this patch disables it by default to prevent the delay when booting. It should be removed once sufficient mouse support is added. Signed-off-by: Sasha Levin levinsasha...@gmail.com --- tools/kvm/builtin-run.c | 3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c index 465bbe7..2795115 100644 --- a/tools/kvm/builtin-run.c +++ b/tools/kvm/builtin-run.c @@ -736,7 +736,8 @@ int kvm_cmd_run(int argc, const char **argv, const char *prefix) vidmode = 0; memset(real_cmdline, 0, sizeof(real_cmdline)); - strcpy(real_cmdline, notsc noapic noacpi pci=conf1 reboot=k panic=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1); + strcpy(real_cmdline, notsc noapic noacpi pci=conf1 reboot=k panic=1 i8042.direct=1 + i8042.dumbkbd=1 i8042.nopnp=1 i8042.noaux=1); if (vnc || sdl) { strcat(real_cmdline, video=vesafb console=tty0); } else What's the problem? IIRC mouse works just fine in VNC mode? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] kvm tools: Don't use i8042 AUX port
On Wed, 2011-09-14 at 10:14 +0300, Pekka Enberg wrote: On Wed, Sep 14, 2011 at 10:11 AM, Sasha Levin levinsasha...@gmail.com wrote: We currently don't have sufficient support for mouse, this patch disables it by default to prevent the delay when booting. It should be removed once sufficient mouse support is added. Signed-off-by: Sasha Levin levinsasha...@gmail.com --- tools/kvm/builtin-run.c |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c index 465bbe7..2795115 100644 --- a/tools/kvm/builtin-run.c +++ b/tools/kvm/builtin-run.c @@ -736,7 +736,8 @@ int kvm_cmd_run(int argc, const char **argv, const char *prefix) vidmode = 0; memset(real_cmdline, 0, sizeof(real_cmdline)); - strcpy(real_cmdline, notsc noapic noacpi pci=conf1 reboot=k panic=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1); + strcpy(real_cmdline, notsc noapic noacpi pci=conf1 reboot=k panic=1 i8042.direct=1 + i8042.dumbkbd=1 i8042.nopnp=1 i8042.noaux=1); if (vnc || sdl) { strcat(real_cmdline, video=vesafb console=tty0); } else What's the problem? IIRC mouse works just fine in VNC mode? The problem is that it causes a pretty long delay (~5-6 sec) during boot. VNC mouse could use some more love before I'd say it's good, one example that the host mouse and guest mouse aren't synced. -- Sasha. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] kvm tools: Don't use i8042 AUX port
On Wed, 2011-09-14 at 10:21 +0300, Pekka Enberg wrote: On 9/14/11 10:16 AM, Sasha Levin wrote: The problem is that it causes a pretty long delay (~5-6 sec) during boot. VNC mouse could use some more love before I'd say it's good, one example is that the host mouse and guest mouse aren't synced. Maybe but it's likely good enough for the people who are actually using it. We can't just disable it silently! So I think we should enable AUX if user asks for VNC, no? Yup, let's make it disabled only when !vnc and !sdl. -- Sasha. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] kvm tools: Don't use i8042 AUX port
On 9/14/11 10:16 AM, Sasha Levin wrote: The problem is that it causes a pretty long delay (~5-6 sec) during boot. VNC mouse could use some more love before I'd say it's good, one example that the host mouse and guest mouse aren't synced. Maybe but it's likely good enough for the people who are actually using it. We can't just disable it silently! So I think we should enable AUX if user asks for VNC, no? Pekka -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 2/2] kvm tools: Don't use i8042 AUX port
We are enabling i8042 even without VNC or SDL so that we could use its reset method to reboot the guest. AUX port might cause delays during boot. Disable it if the user didn't ask for VNC or SDL. Signed-off-by: Sasha Levin levinsasha...@gmail.com --- tools/kvm/builtin-run.c |5 +++-- 1 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c index 465bbe7..b2f17ea 100644 --- a/tools/kvm/builtin-run.c +++ b/tools/kvm/builtin-run.c @@ -736,11 +736,12 @@ int kvm_cmd_run(int argc, const char **argv, const char *prefix) vidmode = 0; memset(real_cmdline, 0, sizeof(real_cmdline)); - strcpy(real_cmdline, notsc noapic noacpi pci=conf1 reboot=k panic=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1); + strcpy(real_cmdline, notsc noapic noacpi pci=conf1 reboot=k panic=1 i8042.direct=1 + i8042.dumbkbd=1 i8042.nopnp=1); if (vnc || sdl) { strcat(real_cmdline, video=vesafb console=tty0); } else - strcat(real_cmdline, console=ttyS0 earlyprintk=serial); + strcat(real_cmdline, console=ttyS0 earlyprintk=serial i8042.noaux=1); strcat(real_cmdline, ); if (kernel_cmdline) strlcat(real_cmdline, kernel_cmdline, sizeof(real_cmdline)); -- 1.7.6.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM: x86: Simplify kvm timer handler
The vcpu reference of a kvm_timer can't become NULL while the timer is valid, so drop this redundant test. This also makes it pointless to carry a separate __kvm_timer_fn, fold it into kvm_timer_fn. Signed-off-by: Jan Kiszka jan.kis...@siemens.com --- arch/x86/kvm/timer.c | 26 -- 1 files changed, 4 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index ae432ea..6b85cc6 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -18,9 +18,10 @@ #include linux/atomic.h #include kvm_timer.h -static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) +enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) { - int restart_timer = 0; + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + struct kvm_vcpu *vcpu = ktimer-vcpu; wait_queue_head_t *q = vcpu-wq; /* @@ -40,26 +41,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) if (ktimer-t_ops-is_periodic(ktimer)) { hrtimer_add_expires_ns(ktimer-timer, ktimer-period); - restart_timer = 1; - } - - return restart_timer; -} - -enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) -{ - int restart_timer; - struct kvm_vcpu *vcpu; - struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); - - vcpu = ktimer-vcpu; - if (!vcpu) - return HRTIMER_NORESTART; - - restart_timer = __kvm_timer_fn(vcpu, ktimer); - if (restart_timer) return HRTIMER_RESTART; - else + } else return HRTIMER_NORESTART; } - -- 1.7.3.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] pci: clean all funcs when hot-removing multifunc device
- Original Message - (2011/09/14 13:55), Amos Kong wrote: 'slot-funcs' is initialized in acpiphp_glue.c:register_slot() before hotpluging device, and only one entry(func 0) is added to it, no new entry will be added to the list when hotpluging devices to the slot. I guess your hotplug slot has only one device object (for func#0) in ACPI Namespace (DSDT), and guess this is why there is only one entry in the 'slot-funcs'. If so, what about adding device objects for function 1-7 to ACPI Namespace? I think most of bare-metal environments have such definition in ACPI Namespace. For example: Hi Kaneshige, I did some test, fix acpi tables can resolve this problem, then register_slot() will be executed for all funcs, and each func has a entry in slot-funcs. I will send a patch to seabios. Thanks a lot! Amos Device (P2P) { // PCI to PCI bridge Name (_ADR, ...) // PCI address Name (_HPP, ...) // Hot Plug parameter ... Device (S0F0) { // For function 0 Name (_ADR, ...) Name (_SUN, ...) Method (_EJ0, ...) } Device (S0F1) { // For function 1 ... } ... Device (S0F7) { // For function 7 ... } } Regards, Kenji Kaneshige When we release the whole device, there is only one entry in the list, this causes func1~7 could not be released. I try to add entries for all hotpluged device in enable_device(), but it doesn't work, because 'slot-funcs' is used in many place which we only need to process func 0. This patch just try to clean all funcs in disable_device(). drivers/pci/hotplug/acpiphp_glue.c: static int disable_device(struct acpiphp_slot *slot) { list_for_each_entry(func,slot-funcs, sibling) { pdev = pci_get_slot(slot-bridge-pci_bus, PCI_DEVFN(slot-device, func-function)); ..clean code.. // those code can only be executed one time(func 0) pci_remove_bus_device(pdev); --- pci_bus_add_device() is called for each func device in acpiphp_glue.c:enable_device(). pci_remove_bus_device(pdev) is only called for func 0 in acpiphp_glue.c:disable_device(). 
Boot up a KVM guest, hotplug a multifunc device(8 funcs), we can find it in the guest. @ ls /dev/vd* vda vdb vdc vde vdf vdg vdh @ lspci 00:06.0 SCSI storage controller: Red Hat, Inc Virtio block device ... 00:06.7 SCSI storage controller: Red Hat, Inc Virtio block device But func 1~7 still exist in guest after hot-removing the multifunc device through qemu monitor. @ lspci (00:06.0 disappeared) 00:06.1 SCSI storage controller: Red Hat, Inc Virtio block device (rev ff) ... 00:06.7 SCSI storage controller: Red Hat, Inc Virtio block device (rev ff) @ ls /dev/vd* vdb vdc vde vdf vdg vdh @ mkfs /dev/vdb INFO: task mkfs.ext2:1784 blocked for more than 120 seconds. (task hung) Hotpluging multifunc of WinXp is fine. Signed-off-by: Amos Kongak...@redhat.com --- drivers/pci/hotplug/acpiphp_glue.c | 27 ++- 1 files changed, 18 insertions(+), 9 deletions(-) diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index a70fa89..3b86d1a 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -880,6 +880,8 @@ static int disable_device(struct acpiphp_slot *slot) { struct acpiphp_func *func; struct pci_dev *pdev; + struct pci_bus *bus = slot-bridge-pci_bus; + int i, num = 1; /* is this slot already disabled? 
*/ if (!(slot-flags SLOT_ENABLED)) @@ -893,16 +895,23 @@ static int disable_device(struct acpiphp_slot *slot) func-bridge = NULL; } - pdev = pci_get_slot(slot-bridge-pci_bus, - PCI_DEVFN(slot-device, func-function)); - if (pdev) { - pci_stop_bus_device(pdev); - if (pdev-subordinate) { - disable_bridges(pdev-subordinate); - pci_disable_device(pdev); + pdev = pci_scan_single_device(bus, + PCI_DEVFN(slot-device, 0)); + if (!pdev) + goto err_exit; + if (pdev-multifunction == 1) + num = 8; + for (i=0; inum; i++) { + pdev = pci_get_slot(bus, PCI_DEVFN(slot-device, i)); + if (pdev) { + pci_stop_bus_device(pdev); + if (pdev-subordinate) { + disable_bridges(pdev-subordinate); + pci_disable_device(pdev); + } + pci_remove_bus_device(pdev); + pci_dev_put(pdev); } - pci_remove_bus_device(pdev); - pci_dev_put(pdev); } } -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] kvm tools: Fix 32bit build errors
Signed-off-by: Sasha Levin levinsasha...@gmail.com --- tools/kvm/builtin-run.c | 12 ++-- 1 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c index 591fd77..6234b65 100644 --- a/tools/kvm/builtin-run.c +++ b/tools/kvm/builtin-run.c @@ -174,12 +174,12 @@ static int virtio_9p_rootdir_parser(const struct option *opt, const char *arg, i static int shmem_parser(const struct option *opt, const char *arg, int unset) { - const uint64_t default_size = SHMEM_DEFAULT_SIZE; - const uint64_t default_phys_addr = SHMEM_DEFAULT_ADDR; + const u64 default_size = SHMEM_DEFAULT_SIZE; + const u64 default_phys_addr = SHMEM_DEFAULT_ADDR; const char *default_handle = SHMEM_DEFAULT_HANDLE; struct shmem_info *si = malloc(sizeof(struct shmem_info)); - uint64_t phys_addr; - uint64_t size; + u64 phys_addr; + u64 size; char *handle = NULL; int create = 0; const char *p = arg; @@ -282,8 +282,8 @@ static int shmem_parser(const struct option *opt, const char *arg, int unset) strcpy(handle, default_handle); } if (verbose) { - pr_info(shmem: phys_addr = %lx, phys_addr); - pr_info(shmem: size = %lx, size); + pr_info(shmem: phys_addr = %llx, phys_addr); + pr_info(shmem: size = %llx, size); pr_info(shmem: handle= %s, handle); pr_info(shmem: create= %d, create); } -- 1.7.6.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 03/11] KVM: x86: retry non-page-table writing instruction
On 09/13/2011 09:24 PM, Xiao Guangrong wrote: +static bool retry_instruction(struct x86_emulate_ctxt *ctxt, + unsigned long cr2, int emulation_type) +{ +if (!vcpu-arch.mmu.direct_map !mmu_is_nested(vcpu)) +gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); If mmu_is_nested() cr2 is an ngpa, we have to translate it to a gpa, no? Yeah, will fix it. And this bug also exists in the current code: it always uses L2 gpa to emulate write operation. Can you please send this fix separately, so it can be backported if needed? I guess the reason that it is not triggered is: the gpa of L2's shadow page can not be touched by L2, it means no page table is write-protected by L2. Yes. All real guest hypervisors will do that. But it is technically possible for a hypervisor to allow its guest access to the real page tables. btw, I don't see mmu.direct_map initialized for nested npt? nested_svm_vmrun() - nested_svm_init_mmu_context(): static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { int r; r = kvm_init_shadow_mmu(vcpu,vcpu-arch.mmu); vcpu-arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; vcpu-arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; vcpu-arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; vcpu-arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; vcpu-arch.mmu.shadow_root_level = get_npt_level(); vcpu-arch.walk_mmu =vcpu-arch.nested_mmu; return r; } It is initialized in kvm_init_shadow_mmu :-) Yes, need new eyeglasses. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 05/11] KVM: MMU: do not mark accessed bit on pte write path
On 09/13/2011 09:29 PM, Xiao Guangrong wrote: On 09/13/2011 06:53 PM, Avi Kivity wrote: On 08/30/2011 05:35 AM, Xiao Guangrong wrote: In current code, the accessed bit is always set when page fault occurred, do not need to set it on pte write path What about speculative sptes that are then only accessed via emulation? The gfn is read and written only via emulation? I think this case is very very rare? Probably... Marcelo? Can you think of another case where spte.accessed is needed? -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 06/11] KVM: MMU: cleanup FNAME(invlpg)
On 09/13/2011 09:31 PM, Xiao Guangrong wrote: @@ -675,36 +684,20 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) sp = page_header(__pa(sptep)); if (is_last_spte(*sptep, level)) { -int offset, shift; - if (!sp-unsync) break; -shift = PAGE_SHIFT - - (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; -offset = sp-role.quadrant shift; - -pte_gpa = (sp-gfn PAGE_SHIFT) + offset; +pte_gpa = FNAME(get_first_pte_gpa)(sp); Here is can be used for L2 - I think we can use 2MB host pages to back 4MB guest mappings. Only unsync shadow page is fetched here, and its level is always 1. Right. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 11/11] KVM: MMU: improve write flooding detected
On 09/13/2011 10:19 PM, Xiao Guangrong wrote: The spte may not be accessed, but other sptes in the same page can be accessed. An example is the fixmap area for kmap_atomic(), there will be a lot of pte writes but other sptes will be accessed without going through soft-mmu at all. I think this kind of shadow pae is mostly the last page table(level=1), maybe we can skip the write-flooding for the last shadow page, because the last shadow page can become unsync and it can not let page table write-protected. Yes. I think you have to read the parent_ptes-spte.accessed bits to be sure. I guess the overload of this way is little high: - it needs to walk parent ptes for every shadow pages - we need to clear the parent_ptes-spte.accessed bit when the page is written, and the tlb flush is needed. no? Right. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 03/11] KVM: x86: retry non-page-table writing instruction
On 09/14/2011 05:53 PM, Avi Kivity wrote: On 09/13/2011 09:24 PM, Xiao Guangrong wrote: +static bool retry_instruction(struct x86_emulate_ctxt *ctxt, + unsigned long cr2, int emulation_type) +{ +if (!vcpu-arch.mmu.direct_map !mmu_is_nested(vcpu)) +gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); If mmu_is_nested() cr2 is an ngpa, we have to translate it to a gpa, no? Yeah, will fix it. And this bug also exists in the current code: it always uses L2 gpa to emulate write operation. Can you please send this fix separately, so it can be backported if needed? Sure, i will do it as soon as possible. :-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 03/14] KVM: x86 emulator: move memop, memopp into emulation context
On Tue, 13 Sep 2011 10:45:40 +0300 Avi Kivity a...@redhat.com wrote: Simplifies further generalization of decode. Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/include/asm/kvm_emulate.h |2 ++ arch/x86/kvm/emulate.c | 34 +- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 6040d11..56bac3e 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -275,6 +275,8 @@ struct x86_emulate_ctxt { unsigned long _eip; /* Fields above regs are cleared together. */ unsigned long regs[NR_VCPU_REGS]; + struct operand memop; + struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; Once the emulator context gets stablized, some comments will be nice to know which ones are supposed to be accessed from outside of the emulator, and which ones are only for the emulator internal usage. Practically, knowing each member's lifetime, decode stage only or emulation stage only or throughout the emulation, will make it easy to avoid extra ctxt/regs initialization and ... maybe more. Takuya -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 00/14] Emulator decode generalization
On Tue, Sep 13, 2011 at 10:45:37AM +0300, Avi Kivity wrote: - merge dst/src/src2 decode - generalize %seg embedded in opcode decode plus a fix. Avi Kivity (14): KVM: x86 emulator: fix Src2CL decode KVM: x86 emulator: convert group 3 instructions to direct decode KVM: x86 emulator: move memop, memopp into emulation context KVM: x86 emulator: split dst decode to a generic decode_operand() KVM: x86 emulator: expand decode flags to 64 bits KVM: x86 emulator: switch src2 to generic decode_operand() KVM: x86 emulator: free up some flag bits near src, dst KVM: x86 emulator: switch OpImmUByte decode to decode_imm() KVM: x86 emulator: qualify OpReg inhibit_byte_regs hack KVM: x86 emulator: switch src decode to decode_operand() KVM: x86 emulator: simplify OpMem64 decode KVM: x86 emulator: streamline decode of segment registers KVM: x86 emulator: switch lds/les/lss/lfs/lgs to direct decode KVM: x86 emulator: convert push %sreg/pop %sreg to direct decode arch/x86/include/asm/kvm_emulate.h |4 +- arch/x86/kvm/emulate.c | 563 ++-- 2 files changed, 286 insertions(+), 281 deletions(-) Applied, thanks. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: Split up MSI-X assigned device IRQ handler
On Mon, Sep 12, 2011 at 06:57:56PM +0200, Jan Kiszka wrote: The threaded IRQ handler for MSI-X has almost nothing in common with the INTx/MSI handler. Move its code into a dedicated handler. Signed-off-by: Jan Kiszka jan.kis...@siemens.com --- virt/kvm/assigned-dev.c | 32 +++- 1 files changed, 19 insertions(+), 13 deletions(-) Applied, thanks. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: x86: Add module parameter for lapic periodic timer limit
On Mon, Sep 12, 2011 at 02:10:22PM +0200, Jan Kiszka wrote: Certain guests, specifically RTOSes, request faster periodic timers than what we allow by default. Add a module parameter to adjust the limit for non-standard setups. Also add a rate-limited warning in case the guest requested more. Signed-off-by: Jan Kiszka jan.kis...@siemens.com --- arch/x86/kvm/lapic.c | 15 +-- 1 files changed, 13 insertions(+), 2 deletions(-) Applied, thanks. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v8 3/4] block: add block timer and throttling algorithm
On Tue, Sep 13, 2011 at 11:09:46AM +0800, Zhi Yong Wu wrote: On Fri, Sep 9, 2011 at 10:44 PM, Marcelo Tosatti mtosa...@redhat.com wrote: On Thu, Sep 08, 2011 at 06:11:07PM +0800, Zhi Yong Wu wrote: Note: 1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this scenario. You can increase the length of the slice, if the request is larger than slice_time * bps_limit. Yeah, but it is a challenge for how to increase it. Do you have some nice idea? If the queue is empty, and the request being processed does not fit the queue, increase the slice so that the request fits. That is, make BLOCK_IO_SLICE_TIME dynamic and adjust it as described above (if the bps or io limits change, reset it to the default BLOCK_IO_SLICE_TIME). 2.) When dd command is issued in guest, if its option bs is set to a large value such as bs=1024K, the result speed will be slightly bigger than the limits. Why? This issue has not existed. I will remove it. When drive bps=100, i did some testings on guest VM. 1.) bs=1024K 18+0 records in 18+0 records out 18874368 bytes (19 MB) copied, 26.6268 s, 709 kB/s 2.) bs=2048K 18+0 records in 18+0 records out 37748736 bytes (38 MB) copied, 46.5336 s, 811 kB/s There is lots of debugging leftovers in the patch. sorry, i forgot to remove them. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] qemu-kvm: Fix build without VNC support
On Tue, Sep 13, 2011 at 05:13:41PM +0400, Boris Dolgov wrote: Hello! Qemu-kvm 0.15.0 doesn't build with vnc support disabled. The following patch fixes the problem: Signed-off-by: Boris Dolgov bo...@dolgov.name - monitor.c~ 2011-08-09 12:40:29.0 + +++ monitor.c 2011-09-13 13:02:40.0 + @@ -1221,10 +1221,12 @@ static int add_graphics_client(Monitor * } qerror_report(QERR_ADD_CLIENT_FAILED); return -1; +#ifdef CONFIG_VNC } else if (strcmp(protocol, vnc) == 0) { int fd = monitor_get_fd(mon, fdname); vnc_display_add_client(NULL, fd, skipauth); return 0; +#endif } else if ((s = qemu_chr_find(protocol)) != NULL) { int fd = monitor_get_fd(mon, fdname); if (qemu_chr_add_client(s, fd) 0) { -- Boris Dolgov. Boris, Does QEMU upstream suffer from the same problem? If so, it should be fixed there (patch sent to qemu-de...@nongnu.org). -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 03/14] KVM: x86 emulator: move memop, memopp into emulation context
On 09/14/2011 01:41 PM, Takuya Yoshikawa wrote: On Tue, 13 Sep 2011 10:45:40 +0300 Avi Kivitya...@redhat.com wrote: Simplifies further generalization of decode. Signed-off-by: Avi Kivitya...@redhat.com --- arch/x86/include/asm/kvm_emulate.h |2 ++ arch/x86/kvm/emulate.c | 34 +- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 6040d11..56bac3e 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -275,6 +275,8 @@ struct x86_emulate_ctxt { unsigned long _eip; /* Fields above regs are cleared together. */ unsigned long regs[NR_VCPU_REGS]; + struct operand memop; + struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; Once the emulator context gets stablized, some comments will be nice to know which ones are supposed to be accessed from outside of the emulator, and which ones are only for the emulator internal usage. Practically, knowing each member's lifetime, decode stage only or emulation stage only or throughout the emulation, will make it easy to avoid extra ctxt/regs initialization and ... maybe more. Nothing should be accessed from outside the emulator, except via accessors. We should move initialization to the emulator as well (or just initialize from x86_decode_insn() - any reason not to?) -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] pci: clean all funcs when hot-removing multifunc device
- Original Message - - Original Message - (2011/09/14 13:55), Amos Kong wrote: 'slot-funcs' is initialized in acpiphp_glue.c:register_slot() before hotpluging device, and only one entry(func 0) is added to it, no new entry will be added to the list when hotpluging devices to the slot. I guess your hotplug slot has only one device object (for func#0) in ACPI Namespace (DSDT), and guess this is why there is only one entry in the 'slot-funcs'. If so, what about adding device objects for function 1-7 to ACPI Namespace? I think most of bare-metal environments have such definition in ACPI Namespace. For example: Hi Kaneshige, I did some test, fix acpi tables can resolve this problem, then register_slot() will be executed for all funcs, and each func has a entry in slot-funcs. I will send a patch to seabios. The size of bios.bin compiled from seabios original: 128K only apply patch1: 256K only apply patch2: 128K patch1: add 6 slot(only slot6 has 8 funcs) to the table can hotplug/hot-remove a multifunc device to slot 6 successfully patch2: add 31 slot(with 8 funcs) to the table could not boot up guest. I found there is a special process for large bios.bin in qemu, problem maybe exist here, I'm driving into it... qemu/hw/pc.c: void pc_memory_init(... /* map the last 128KB of the BIOS in ISA space */ isa_bios_size = bios_size; if (isa_bios_size (128 * 1024)) isa_bios_size = 128 * 1024; Device (P2P) { // PCI to PCI bridge Name (_ADR, ...) // PCI address Name (_HPP, ...) // Hot Plug parameter ... Device (S0F0) { // For function 0 Name (_ADR, ...) Name (_SUN, ...) Method (_EJ0, ...) } Device (S0F1) { // For function 1 ... } ... Device (S0F7) { // For function 7 ... } } Regards, Kenji Kaneshige When we release the whole device, there is only one entry in the list, this causes func1~7 could not be released. I try to add entries for all hotpluged device in enable_device(), but it doesn't work, because 'slot-funcs' is used in many place which we only need to process func 0. 
This patch just try to clean all funcs in disable_device(). drivers/pci/hotplug/acpiphp_glue.c: static int disable_device(struct acpiphp_slot *slot) { list_for_each_entry(func,slot-funcs, sibling) { pdev = pci_get_slot(slot-bridge-pci_bus, PCI_DEVFN(slot-device, func-function)); ..clean code.. // those code can only be executed one time(func 0) pci_remove_bus_device(pdev); --- pci_bus_add_device() is called for each func device in acpiphp_glue.c:enable_device(). pci_remove_bus_device(pdev) is only called for func 0 in acpiphp_glue.c:disable_device(). Boot up a KVM guest, hotplug a multifunc device(8 funcs), we can find it in the guest. @ ls /dev/vd* vda vdb vdc vde vdf vdg vdh @ lspci 00:06.0 SCSI storage controller: Red Hat, Inc Virtio block device ... 00:06.7 SCSI storage controller: Red Hat, Inc Virtio block device But func 1~7 still exist in guest after hot-removing the multifunc device through qemu monitor. @ lspci (00:06.0 disappeared) 00:06.1 SCSI storage controller: Red Hat, Inc Virtio block device (rev ff) ... 00:06.7 SCSI storage controller: Red Hat, Inc Virtio block device (rev ff) @ ls /dev/vd* vdb vdc vde vdf vdg vdh @ mkfs /dev/vdb INFO: task mkfs.ext2:1784 blocked for more than 120 seconds. (task hung) Hotpluging multifunc of WinXp is fine. Signed-off-by: Amos Kongak...@redhat.com --- drivers/pci/hotplug/acpiphp_glue.c | 27 ++- 1 files changed, 18 insertions(+), 9 deletions(-) diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index a70fa89..3b86d1a 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -880,6 +880,8 @@ static int disable_device(struct acpiphp_slot *slot) { struct acpiphp_func *func; struct pci_dev *pdev; + struct pci_bus *bus = slot-bridge-pci_bus; + int i, num = 1; /* is this slot already disabled? 
*/ if (!(slot-flags SLOT_ENABLED)) @@ -893,16 +895,23 @@ static int disable_device(struct acpiphp_slot *slot) func-bridge = NULL; } - pdev = pci_get_slot(slot-bridge-pci_bus, - PCI_DEVFN(slot-device, func-function)); - if (pdev) { - pci_stop_bus_device(pdev); - if (pdev-subordinate) { - disable_bridges(pdev-subordinate); - pci_disable_device(pdev); + pdev = pci_scan_single_device(bus, + PCI_DEVFN(slot-device, 0)); + if (!pdev) + goto
Re: [PATCH 1/2] KVM: emulate lapic tsc deadline timer for guest
On Tue, Sep 13, 2011 at 10:36:51PM +0800, Liu, Jinsong wrote: From 7b12021e1d1b79797b49e41cc0a7be05a6180d9a Mon Sep 17 00:00:00 2001 From: Liu, Jinsong jinsong@intel.com Date: Tue, 13 Sep 2011 21:52:54 +0800 Subject: [PATCH] KVM: emulate lapic tsc deadline timer for guest This patch emulate lapic tsc deadline timer for guest: Enumerate tsc deadline timer capability by CPUID; Enable tsc deadline timer mode by lapic MMIO; Start tsc deadline timer by WRMSR; Signed-off-by: Liu, Jinsong jinsong@intel.com --- arch/x86/include/asm/apicdef.h|2 + arch/x86/include/asm/cpufeature.h |3 + arch/x86/include/asm/kvm_host.h |2 + arch/x86/include/asm/msr-index.h |2 + arch/x86/kvm/kvm_timer.h |2 + arch/x86/kvm/lapic.c | 122 ++--- arch/x86/kvm/lapic.h |3 + arch/x86/kvm/x86.c| 20 ++- 8 files changed, 132 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 34595d5..3925d80 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -100,7 +100,9 @@ #define APIC_TIMER_BASE_CLKIN 0x0 #define APIC_TIMER_BASE_TMBASE 0x1 #define APIC_TIMER_BASE_DIV 0x2 +#define APIC_LVT_TIMER_ONESHOT (0 17) #define APIC_LVT_TIMER_PERIODIC (1 17) +#define APIC_LVT_TIMER_TSCDEADLINE (2 17) #define APIC_LVT_MASKED (1 16) #define APIC_LVT_LEVEL_TRIGGER (1 15) #define APIC_LVT_REMOTE_IRR (1 14) Please have a separate, introductory patch for definitions that are not KVM specific. +++ b/arch/x86/include/asm/kvm_host.h @@ -671,6 +671,8 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); extern bool tdp_enabled; +extern u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); + No need for extern. 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2b2255b..925d4b9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -135,9 +135,23 @@ static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) return apic_get_reg(apic, lvt_type) APIC_VECTOR_MASK; } +static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) +{ + return ((apic_get_reg(apic, APIC_LVTT) + apic-lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT); +} + static inline int apic_lvtt_period(struct kvm_lapic *apic) { - return apic_get_reg(apic, APIC_LVTT) APIC_LVT_TIMER_PERIODIC; + return ((apic_get_reg(apic, APIC_LVTT) + apic-lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC); +} + +static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) +{ + return ((apic_get_reg(apic, APIC_LVTT) + apic-lapic_timer.timer_mode_mask) == + APIC_LVT_TIMER_TSCDEADLINE); } static inline int apic_lvt_nmi_mode(u32 lvt_val) @@ -166,7 +180,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) } static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { - LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ + LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ LVT_MASK | APIC_MODE_MASK, /* LVTPC */ LINT_MASK, LINT_MASK, /* LVT0-1 */ @@ -570,6 +584,9 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) break; case APIC_TMCCT:/* Timer CCR */ + if (apic_lvtt_tscdeadline(apic)) + return 0; + val = apic_get_tmcct(apic); break; @@ -664,29 +681,32 @@ static void update_divide_count(struct kvm_lapic *apic) static void start_apic_timer(struct kvm_lapic *apic) { - ktime_t now = apic-lapic_timer.timer.base-get_time(); - - apic-lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) * - APIC_BUS_CYCLE_NS * apic-divide_count; + ktime_t now; atomic_set(apic-lapic_timer.pending, 0); - if (!apic-lapic_timer.period) - return; - /* - * Do not allow the guest to program periodic timers with small - * interval, since the 
hrtimers are not throttled by the host - * scheduler. - */ - if (apic_lvtt_period(apic)) { - if (apic-lapic_timer.period NSEC_PER_MSEC/2) - apic-lapic_timer.period = NSEC_PER_MSEC/2; - } + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { + /* lapic timer in oneshot or peroidic mode */ + now = apic-lapic_timer.timer.base-get_time(); + apic-lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) + *
Re: [PATCH 2/2] Qemu co-operation with kvm tsc deadline timer
On Wed, Sep 14, 2011 at 10:51:41AM +0800, Liu, Jinsong wrote: Jan Kiszka wrote: On 2011-09-13 16:38, Liu, Jinsong wrote: From c1b502d6548fcc41592cd90acc82109ee949df75 Mon Sep 17 00:00:00 2001 From: Liu, Jinsong jinsong@intel.com Date: Tue, 13 Sep 2011 22:05:30 +0800 Subject: [PATCH] Qemu co-operation with kvm tsc deadline timer KVM add emulation of lapic tsc deadline timer for guest. This patch is co-operation work at qemu side. Signed-off-by: Liu, Jinsong jinsong@intel.com --- target-i386/cpu.h |2 ++ target-i386/kvm.c | 14 ++ 2 files changed, 16 insertions(+), 0 deletions(-) diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 935d08a..62ff73c 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -283,6 +283,7 @@ #define MSR_IA32_APICBASE_BSP (18) #define MSR_IA32_APICBASE_ENABLE(111) #define MSR_IA32_APICBASE_BASE (0xf12) +#define MSR_IA32_TSCDEADLINE0x6e0 #define MSR_MTRRcap 0xfe #define MSR_MTRRcap_VCNT 8 @@ -687,6 +688,7 @@ typedef struct CPUX86State { uint64_t async_pf_en_msr; uint64_t tsc; +uint64_t tsc_deadline; This field has to be saved/restored for snapshots/migrations. Frankly, I've no clue right now if substates are in vogue again (they had problems in their binary format) or if you can simply add a versioned top-level field and bump the CPUState version number. Yes, it would be saved/restored. After migration, tsc_deadline would be set to MSR_IA32_TSCDEADLINE to trigger tsc timer interrupt. Jan means you should explicitly add this to machine.c. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] kvm tools: Use kernel dhcp for network autoconfiguration
On 09/14/2011 03:11 PM, Sasha Levin wrote: This patch removes the manual/usermode dhcp client configuration and instead uses the DHCP client built within the kernel. Since this client is tightly integrated with NFS (if NFS config is set), we will add a specific NFS root addr in our DHCP offer to point it to a non existent address so that we won't hang trying to poke it for our root. Signed-off-by: Sasha Levin levinsasha...@gmail.com Acked-by: Asias He asias.he...@gmail.com -- Asias He -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 02/10] Driver core: Add iommu_ops to bus_type
On Tue, Sep 13, 2011 at 12:21:26PM -0400, Greg KH wrote: On Tue, Sep 13, 2011 at 05:38:11PM +0200, Roedel, Joerg wrote: On Tue, Sep 13, 2011 at 10:58:55AM -0400, Greg KH wrote: On Tue, Sep 13, 2011 at 04:54:02PM +0200, Roedel, Joerg wrote: --- a/include/linux/device.h +++ b/include/linux/device.h @@ -22,6 +22,7 @@ #include linux/types.h #include linux/module.h #include linux/pm.h +#include linux/iommu.h Ick, please don't add new #includes to device.h, it makes the whole build slower. Just pre-declare the structure and all should be fine. Hmm, since linux/iommu.h provides 'struct iommu_ops', and this patch adds a 'struct iommu_ops' to 'struct bus_type', wouldn't a simple forward declaration make the bus_type incomplete in most other places? No, just like it doesn't make iommu.h incomplete as you used a struct bus_type there. Ah right, because bus-iommu_ops is just a pointer the full type definition for iommu_ops is only needed when this pointer is actually dereferenced. I updated the patch. Please find it below. From 6e0e1c3b997e06539f7bda80f46ffe9fb04aab4e Mon Sep 17 00:00:00 2001 From: Joerg Roedel joerg.roe...@amd.com Date: Fri, 26 Aug 2011 16:48:26 +0200 Subject: [PATCH 02/10] Driver core: Add iommu_ops to bus_type This is the starting point to make the iommu_ops used for the iommu-api a per-bus-type structure. It is required to easily implement bus-specific setup in the iommu-layer. The first user will be the iommu-group attribute in sysfs. 
Signed-off-by: Joerg Roedel joerg.roe...@amd.com --- drivers/base/bus.c | 29 + drivers/iommu/iommu.c |4 include/linux/device.h | 10 ++ include/linux/iommu.h |2 ++ 4 files changed, 45 insertions(+), 0 deletions(-) diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 000e7b2..b3014fe 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -1028,6 +1028,35 @@ void bus_sort_breadthfirst(struct bus_type *bus, } EXPORT_SYMBOL_GPL(bus_sort_breadthfirst); +#ifdef CONFIG_IOMMU_API +/** + * bus_set_iommu - set iommu-callbacks for the bus + * @bus: bus. + * @ops: the callbacks provided by the iommu-driver + * + * This function is called by an iommu driver to set the iommu methods + * used for a particular bus. Drivers for devices on that bus can use + * the iommu-api after these ops are registered. + * This special function is needed because IOMMUs are usually devices on + * the bus itself, so the iommu drivers are not initialized when the bus + * is set up. With this function the iommu-driver can set the iommu-ops + * afterwards. 
+ */ +int bus_set_iommu(struct bus_type *bus, struct iommu_ops *ops) +{ + if (bus-iommu_ops != NULL) + return -EBUSY; + + bus-iommu_ops = ops; + + /* Do IOMMU specific setup for this bus-type */ + iommu_bus_init(bus, ops); + + return 0; +} +EXPORT_SYMBOL_GPL(bus_set_iommu); +#endif + int __init buses_init(void) { bus_kset = kset_create_and_add(bus, bus_uevent_ops, NULL); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 30b0644..3b24a5b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -34,6 +34,10 @@ void register_iommu(struct iommu_ops *ops) iommu_ops = ops; } +void iommu_bus_init(struct bus_type *bus, struct iommu_ops *ops) +{ +} + bool iommu_found(void) { return iommu_ops != NULL; diff --git a/include/linux/device.h b/include/linux/device.h index c20dfbf..490382b 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -33,6 +33,7 @@ struct class; struct subsys_private; struct bus_type; struct device_node; +struct iommu_ops; struct bus_attribute { struct attributeattr; @@ -46,6 +47,7 @@ struct bus_attribute bus_attr_##_name = __ATTR(_name, _mode, _show, _store) extern int __must_check bus_create_file(struct bus_type *, struct bus_attribute *); extern void bus_remove_file(struct bus_type *, struct bus_attribute *); +extern void iommu_bus_init(struct bus_type *bus, struct iommu_ops *ops); /** * struct bus_type - The bus type of the device @@ -67,6 +69,9 @@ extern void bus_remove_file(struct bus_type *, struct bus_attribute *); * @resume:Called to bring a device on this bus out of sleep mode. * @pm:Power management operations of this bus, callback the specific * device driver's pm-ops. + * @iommu_ops IOMMU specific operations for this bus, used to attach IOMMU + * driver implementations to a bus and allow the driver to do + * bus-specific setup * @p: The private data of the driver core, only the driver core can * touch this. 
* @@ -96,6 +101,8 @@ struct bus_type { const struct dev_pm_ops *pm; + struct iommu_ops *iommu_ops; + struct subsys_private *p; }; @@ -148,6 +155,9 @@ extern int
Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking
On Wed, Sep 14, 2011 at 10:00:07AM +0300, Avi Kivity wrote: On 09/13/2011 10:21 PM, Don Zickus wrote: Or are you saying an NMI in an idle system will have the same %rip thus falsely detecting a back-to-back NMI? That's easy to avoid - insert an instruction zeroing the last nmi_rip somewhere before or after hlt. It's always okay to execute such an instruction (outside the nmi handler itself), since nmi_rip is meant to detect a no instructions executed condition. Ah. Like a touch_nmi_watchdog() type of thing. Interesting. I'll poke around the idle code. Need to instrument a reproducer first. Thanks, Don -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] qemu-kvm: Fix build without VNC support
On Wed, Sep 14, 2011 at 15:16, Marcelo Tosatti mtosa...@redhat.com wrote: Does QEMU upstream suffer from the same problem? If so, it should be fixed there (patch sent to qemu-de...@nongnu.org). Yes, it is. I have sent the patch to the correct maillist, thanks for your help. -- Boris Dolgov. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking
On Wed, Sep 14, 2011 at 10:00:07AM +0300, Avi Kivity wrote: On 09/13/2011 10:21 PM, Don Zickus wrote: Or are you saying an NMI in an idle system will have the same %rip thus falsely detecting a back-to-back NMI? That's easy to avoid - insert an instruction zeroing the last nmi_rip somewhere before or after hlt. It's always okay to execute such an instruction (outside the nmi handler itself), since nmi_rip is meant to detect a no instructions executed condition. At least for classic hlt there is no simple after hlt because it's all interrupt handlers and exceptions and everything else that can interrupt combined. It may work with newer MWAIT. -Andi -- a...@linux.intel.com -- Speaking for myself only. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking
On 09/14/2011 05:49 PM, Andi Kleen wrote: On Wed, Sep 14, 2011 at 10:00:07AM +0300, Avi Kivity wrote: On 09/13/2011 10:21 PM, Don Zickus wrote: Or are you saying an NMI in an idle system will have the same %rip thus falsely detecting a back-to-back NMI? That's easy to avoid - insert an instruction zeroing the last nmi_rip somewhere before or after hlt. It's always okay to execute such an instruction (outside the nmi handler itself), since nmi_rip is meant to detect a no instructions executed condition. At least for classic hlt there is no simple after hlt because it's all interrupt handlers and exceptions and everything else that can interrupt combined. If an NMI hits in an interrupt handler, or in the after hlt section before the write-to-last-nmi-rip, then we'll see that %rip has changed. If it hits after the write-to-last-nmi-rip instruction (or in the hlt itself), then we'll also see that %rip has changed, due to the effect of that instruction. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Memory API code review
I would like to carry out an online code review of the memory API so that more people are familiar with the internals, and perhaps even to catch some bugs or deficiencies. I'd like to use the next kvm conference call slot for this (Tuesday 1400 UTC) since many people already have it reserved in the schedule. It would be great if people from the wider qemu community could be present, rather than the usual x86 is everything crowd (+Jan) that usually participates in the kvm weekly call. Juan, Chris, can we dedicate next week's call to this? We'll also need a way to disseminate a few slides and an editor session for showing the code. We have an elluminate account that can be used for this, but usually this has a 50% failure rate on Linux. Anthony, perhaps we can set up a view-only vnc reflector on qemu.org? -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: qemu-kvm 0.15 usb problem
On 08/10/2011 09:25 AM, Gerd Hoffmann wrote: qemu-system-x86_64: /tmp/qemu-kvm-0.15.0/hw/usb.c:336: usb_packet_complete: Assertion `p-owner != ((void *)0)' failed. What kind of device is this? I can say that in my case, a dive computer (a primitive serial device, I think) caused the assertion. Regards, Lutz Vieweg -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: qemu-kvm 0.15 usb problem
On 08/09/2011 09:30 PM, Michael wrote: After installed 0.15 ( and 0.15rc1) guest VM failed at some point with USB attached with error: qemu-system-x86_64: /tmp/qemu-kvm-0.15.0/hw/usb.c:336: usb_packet_complete: Assertion `p-owner != ((void *)0)' failed. I experienced the exact same problem, a formerly working USB usage by the virtual machine suddenly caused this assertion to abort qemu-kvm 0.15. Looking only briefly at this assertion and the surrounding code, I found that I did not understand what it's good for, so I removed the assertion, re-compiled, and - voila! - qemu-kvm 0.15 now works with the USB device as good as before. Regards, Lutz Vieweg -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 03/14] KVM: x86 emulator: move memop, memopp into emulation context
On 09/14/2011 06:21 PM, Takuya Yoshikawa wrote: Nothing should be accessed from outside the emulator, except via accessors. We should move initialization to the emulator as well (or just initialize from x86_decode_insn() - any reason not to?) Not big reason but kvm_inject_realmode_interrupt() and kvm_task_switch() call emulate_int_real() and emulator_task_switch() respectively without doing generic decoding. So at least, we need some special initialization for them if we move init_emulate_ctxt() into x86_decode_insn(). Best if x86_decode_insn(), emulate_int_real(), and emulator_task_switch() all call an internal initialization function. This way the external caller doesn't have to worry about the details. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 03/14] KVM: x86 emulator: move memop, memopp into emulation context
On Wed, 14 Sep 2011 14:37:21 +0300 Avi Kivity a...@redhat.com wrote: Once the emulator context gets stabilized, some comments will be nice to know which ones are supposed to be accessed from outside of the emulator, and which ones are only for the emulator internal usage. Practically, knowing each member's lifetime, decode stage only or emulation stage only or throughout the emulation, will make it easy to avoid extra ctxt/regs initialization and ... maybe more. Nothing should be accessed from outside the emulator, except via accessors. We should move initialization to the emulator as well (or just initialize from x86_decode_insn() - any reason not to?) Not big reason but kvm_inject_realmode_interrupt() and kvm_task_switch() call emulate_int_real() and emulator_task_switch() respectively without doing generic decoding. So at least, we need some special initialization for them if we move init_emulate_ctxt() into x86_decode_insn(). Takuya -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Memory API code review
On 09/14/2011 10:07 AM, Avi Kivity wrote: I would like to carry out an online code review of the memory API so that more people are familiar with the internals, and perhaps even to catch some bugs or deficiency. I'd like to use the next kvm conference call slot for this (Tuesday 1400 UTC) since many people already have it reserved in the schedule. It would be great if people from the wider qemu community be present, rather than the usual x86 is everything crowd (+Jan) that usually participates in the kvm weekly call. Juan, Chris, can we dedicate next week's call to this? We'll also need a way to disseminate a few slides and an editor session for showing the code. We have an elluminate account that can be used for this, but usually this has a 50% failure rate on Linux. Anthony, perhaps we can set up a view-only vnc reflector on qemu.org? Absolutely. I'll set something up and then get with you for the details of access. Regards, Anthony Liguori -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] kvm tools: Don't copy network autoconfiguration script
Network autoconfiguration was moved to the kernel, but the setup code still tried to copy the script over to the rootfs. This prevented /virt/ from being properly created. Signed-off-by: Sasha Levin levinsasha...@gmail.com --- tools/kvm/builtin-setup.c | 13 - 1 files changed, 0 insertions(+), 13 deletions(-) diff --git a/tools/kvm/builtin-setup.c b/tools/kvm/builtin-setup.c index c93eec3..6b8eb5b 100644 --- a/tools/kvm/builtin-setup.c +++ b/tools/kvm/builtin-setup.c @@ -129,15 +129,6 @@ static int copy_init(const char *guestfs_name) return copy_file(guest/init, path); } -static int copy_net(const char *guestfs_name) -{ - char path[PATH_MAX]; - - snprintf(path, PATH_MAX, %s%s%s/virt/setnet.sh, HOME_DIR, KVM_PID_FILE_PATH, guestfs_name); - - return copy_file(guest/setnet.sh, path); -} - static int make_guestfs_symlink(const char *guestfs_name, const char *path) { char target[PATH_MAX]; @@ -195,10 +186,6 @@ static int do_setup(const char *guestfs_name) make_guestfs_symlink(guestfs_name, guestfs_symlinks[i]); } - ret = copy_net(guestfs_name); - if (ret 0) - return ret; - return copy_init(guestfs_name); } -- 1.7.6.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] kvm tools: Use host's resolv.conf within the guest
Since kernel IP autoconfiguration doesn't set up /etc/resolv.conf, we'll use the one located within the host, since this was anyway what we simulated within the DHCP offer packets. Signed-off-by: Sasha Levin levinsasha...@gmail.com --- tools/kvm/builtin-run.c |2 ++ tools/kvm/builtin-setup.c |9 + tools/kvm/include/kvm/builtin-setup.h |1 + 3 files changed, 12 insertions(+), 0 deletions(-) diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c index 5dafb15..9d61088 100644 --- a/tools/kvm/builtin-run.c +++ b/tools/kvm/builtin-run.c @@ -129,6 +129,7 @@ static int img_name_parser(const struct option *opt, const char *arg, int unset) die(Unable to initialize virtio 9p); if (virtio_9p__register(kvm, /, hostfs) 0) die(Unable to initialize virtio 9p); + kvm_setup_resolv(arg); using_rootfs = custom_rootfs = 1; return 0; } @@ -750,6 +751,7 @@ int kvm_cmd_run(int argc, const char **argv, const char *prefix) char tmp[PATH_MAX]; kvm_setup_create_new(default); + kvm_setup_resolv(default); snprintf(tmp, PATH_MAX, %s%s%s, HOME_DIR, KVM_PID_FILE_PATH, default); if (virtio_9p__register(kvm, tmp, /dev/root) 0) diff --git a/tools/kvm/builtin-setup.c b/tools/kvm/builtin-setup.c index 6b8eb5b..3e569e7 100644 --- a/tools/kvm/builtin-setup.c +++ b/tools/kvm/builtin-setup.c @@ -168,6 +168,15 @@ static void make_guestfs_dir(const char *guestfs_name, const char *dir) make_dir(name); } +void kvm_setup_resolv(const char *guestfs_name) +{ + char path[PATH_MAX]; + + snprintf(path, PATH_MAX, %s%s%s/etc/resolv.conf, HOME_DIR, KVM_PID_FILE_PATH, guestfs_name); + + copy_file(/etc/resolv.conf, path); +} + static int do_setup(const char *guestfs_name) { unsigned int i; diff --git a/tools/kvm/include/kvm/builtin-setup.h b/tools/kvm/include/kvm/builtin-setup.h index 6e183a1..f70ae78 100644 --- a/tools/kvm/include/kvm/builtin-setup.h +++ b/tools/kvm/include/kvm/builtin-setup.h @@ -4,5 +4,6 @@ int kvm_cmd_setup(int argc, const char **argv, const char *prefix); void kvm_setup_help(void); int 
kvm_setup_create_new(const char *guestfs_name); +void kvm_setup_resolv(const char *guestfs_name); #endif -- 1.7.6.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking
If an NMI hits in an interrupt handler, or in the after hlt section before the write-to-last-nmi-rip, then we'll see that %rip has changed. If it hits after the write-to-last-nmi-rip instruction (or in the hlt itself), then we'll also see that %rip has changed, due to the effect of that instruction. It won't handle multiple NMIs in halt. I assume that's reasonably common. -Andi -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Memory API code review
* Avi Kivity (a...@redhat.com) wrote: I would like to carry out an online code review of the memory API so that more people are familiar with the internals, and perhaps even to catch some bugs or deficiency. I'd like to use the next kvm conference call slot for this (Tuesday 1400 UTC) since many people already have it reserved in the schedule. It would be great if people from the wider qemu community be present, rather than the usual x86 is everything crowd (+Jan) that usually participates in the kvm weekly call. Juan, Chris, can we dedicate next week's call to this? Yup, sounds like a good idea. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking
On 09/14/2011 08:28 PM, Andi Kleen wrote: If an NMI hits in an interrupt handler, or in the after hlt section before the write-to-last-nmi-rip, then we'll see that %rip has changed. If it hits after the write-to-last-nmi-rip instruction (or in the hlt itself), then we'll also see that %rip has changed, due to the effect of that instruction. It won't handle multiple NMIs in halt. I assume that's reasonable common. Why not? -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking
On Wed, Sep 14, 2011 at 10:26:21PM +0300, Avi Kivity wrote: On 09/14/2011 08:28 PM, Andi Kleen wrote: If an NMI hits in an interrupt handler, or in the after hlt section before the write-to-last-nmi-rip, then we'll see that %rip has changed. If it hits after the write-to-last-nmi-rip instruction (or in the hlt itself), then we'll also see that %rip has changed, due to the effect of that instruction. It won't handle multiple NMIs in halt. I assume that's reasonable common. Why not? They all have the same original RIPs and there is no way to distingush them. -Andi -- a...@linux.intel.com -- Speaking for myself only. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking
On 09/14/2011 10:34 PM, Andi Kleen wrote: On Wed, Sep 14, 2011 at 10:26:21PM +0300, Avi Kivity wrote: On 09/14/2011 08:28 PM, Andi Kleen wrote: If an NMI hits in an interrupt handler, or in the after hlt section before the write-to-last-nmi-rip, then we'll see that %rip has changed. If it hits after the write-to-last-nmi-rip instruction (or in the hlt itself), then we'll also see that %rip has changed, due to the effect of that instruction. It won't handle multiple NMIs in halt. I assume that's reasonable common. Why not? They all have the same original RIPs and there is no way to distingush them. That's how we detect multiple NMIs. 1. First NMI is posted 2. NMI handler starts 3. 2nd NMI posted, queued 4. First NMI source handled 5. IRET 6. Queued NMI hits the core 7. back-to-back NMI detected (same rip) 8. Second (and third...) NMI source handled 9. Execution continues. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/3] KVM: PPC: HIOR and sregs fixup
While working on the HIOR setting which already made it into Avi's tree, I was too uncautious and ended up extending the sregs structure, breaking ABI compatibility with all QEMU versions. So the approach I was taking there was obvious wrong. Instead, what I thought might be a better alternative is to get rid of the static we have a struct full of registers and shove it left and right and instead just poke registers directly between kernel and user space. That sounds slow for starters, but once we have the infrastructure in place, we can build a batched version of the same interface and be fast again but maintain flexibility. This interface can also for example be used to easily fetch the next great extension of SSE registers or some MSRs that we haven't thought of or lots of PPC registers I haven't even heard of so far :). There always seem to be new ones to learn of out there. Please take a look at the interface and comment on whether you like it this way or not. It's currently only implemented for the PPC target, but is held generically, so everyone can use it. Oh and - it obviously implements HIOR again which we have to drop from sregs due to the ABI breakage. Alex Alexander Graf (3): Revert KVM: PPC: Add support for explicit HIOR setting KVM: PPC: Add generic single register ioctls KVM: PPC: Add support for explicit HIOR setting Documentation/virtual/kvm/api.txt | 48 arch/powerpc/include/asm/kvm.h| 10 + arch/powerpc/include/asm/kvm_book3s.h |2 +- arch/powerpc/kvm/book3s_pr.c | 12 +- arch/powerpc/kvm/powerpc.c| 64 + include/linux/kvm.h | 32 6 files changed, 149 insertions(+), 19 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] KVM: PPC: Add generic single register ioctls
Right now we transfer a static struct every time we want to get or set registers. Unfortunately, over time we realize that there are more of these than we thought of before and the extensibility and flexibility of transferring a full struct every time is limited. So this is a new approach to the problem. With these new ioctls, we can get and set a single register that is identified by an ID. This allows for very precise and limited transmittal of data. When we later realize that it's a better idea to shove over multiple registers at once, we can reuse most of the infrastructure and simply implement a GET_MANY_REGS / SET_MANY_REGS interface. The only downpoint I see to this one is that it needs to pad to 1024 bits (hardware is already on 512 bit registers, so I wanted to leave some room) which is slightly too much for transmitting only 64 bits. But if that's all the tradeoff we have to do for getting an extensible interface, I'd say go for it nevertheless. Signed-off-by: Alexander Graf ag...@suse.de --- Documentation/virtual/kvm/api.txt | 47 ++ arch/powerpc/kvm/powerpc.c| 51 + include/linux/kvm.h | 32 +++ 3 files changed, 130 insertions(+), 0 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index b547d7e..5a8f305 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1482,6 +1482,53 @@ is supported; 2 if the processor requires all virtual machines to have an RMA, or 1 if the processor can use an RMA but doesn't require it, because it supports the Virtual RMA (VRMA) facility. 
+4.64 KVM_SET_ONE_REG + +Capability: KVM_CAP_ONE_REG +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_one_reg (in) +Returns: 0 on success, negative value on failure + +struct kvm_one_reg { + __u64 id; + union { + __u8 reg8; + __u16 reg16; + __u32 reg32; + __u64 reg64; + __u8 reg128[16]; + __u8 reg256[32]; + __u8 reg512[64]; + __u8 reg1024[128]; + } u; +}; + +Using this ioctl, a single vcpu register can be set to a specific value +defined by user space with the passed in struct kvm_one_reg. There can +be architecture agnostic and architecture specific registers. Each have +their own range of operation and their own constants and width. To keep +track of the implemented registers, find a list below: + + Arch | Register| Width (bits) +| | + +4.65 KVM_GET_ONE_REG + +Capability: KVM_CAP_ONE_REG +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_one_reg (in and out) +Returns: 0 on success, negative value on failure + +This ioctl allows to receive the value of a single register implemented +in a vcpu. The register to read is indicated by the id field of the +kvm_one_reg struct passed in. On success, the register value can be found +in the respective width field of the struct after this call. + +The list of registers accessible using this interface is identical to the +list in 4.64. + 5. 
The kvm_run structure Application code obtains a pointer to the kvm_run structure by diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index e75c5ac..39cdb3f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -214,6 +214,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_UNSET_IRQ: case KVM_CAP_PPC_IRQ_LEVEL: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_ONE_REG: r = 1; break; #ifndef CONFIG_KVM_BOOK3S_64_HV @@ -627,6 +628,32 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } +static int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, + struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg-id) { + default: + break; + } + + return r; +} + +static int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, + struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg-id) { + default: + break; + } + + return r; +} + int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { @@ -666,6 +693,30 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } + case KVM_GET_ONE_REG: + { + struct kvm_one_reg reg; + r = -EFAULT; + if (copy_from_user(reg, argp, sizeof(reg))) + goto out; + r = kvm_vcpu_ioctl_get_one_reg(vcpu, reg); + if (copy_to_user(argp, reg, sizeof(reg))) { + r = -EFAULT; + goto out; +
[PATCH 1/3] Revert KVM: PPC: Add support for explicit HIOR setting
This reverts commit 11d7596e18a712dc3bc29d45662ec111fd65946b. It exceeded the padding on the SREGS struct, rendering the ABI backwards-incompatible. Signed-off-by: Alexander Graf ag...@suse.de --- arch/powerpc/include/asm/kvm.h|8 arch/powerpc/include/asm/kvm_book3s.h |2 -- arch/powerpc/kvm/book3s_pr.c | 14 ++ arch/powerpc/kvm/powerpc.c|1 - include/linux/kvm.h |1 - 5 files changed, 2 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index 71684b9..a635e22 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -149,12 +149,6 @@ struct kvm_regs { #define KVM_SREGS_E_UPDATE_DBSR(1 3) /* - * Book3S special bits to indicate contents in the struct by maintaining - * backwards compatibility with older structs. If adding a new field, - * please make sure to add a flag for that new field */ -#define KVM_SREGS_S_HIOR (1 0) - -/* * In KVM_SET_SREGS, reserved/pad fields must be left untouched from a * previous KVM_GET_REGS. 
* @@ -179,8 +173,6 @@ struct kvm_sregs { __u64 ibat[8]; __u64 dbat[8]; } ppc32; - __u64 flags; /* KVM_SREGS_S_ */ - __u64 hior; } s; struct { union { diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index a384ffd..d4df013 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -90,8 +90,6 @@ struct kvmppc_vcpu_book3s { #endif int context_id[SID_CONTEXTS]; - bool hior_sregs;/* HIOR is set by SREGS, not PVR */ - struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index d417511..84505a2 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -150,16 +150,14 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) #ifdef CONFIG_PPC_BOOK3S_64 if ((pvr = 0x33) (pvr 0x7033)) { kvmppc_mmu_book3s_64_init(vcpu); - if (!to_book3s(vcpu)-hior_sregs) - to_book3s(vcpu)-hior = 0xfff0; + to_book3s(vcpu)-hior = 0xfff0; to_book3s(vcpu)-msr_mask = 0xULL; vcpu-arch.cpu_type = KVM_CPU_3S_64; } else #endif { kvmppc_mmu_book3s_32_init(vcpu); - if (!to_book3s(vcpu)-hior_sregs) - to_book3s(vcpu)-hior = 0; + to_book3s(vcpu)-hior = 0; to_book3s(vcpu)-msr_mask = 0xULL; vcpu-arch.cpu_type = KVM_CPU_3S_32; } @@ -796,9 +794,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, } } - if (sregs-u.s.flags KVM_SREGS_S_HIOR) - sregs-u.s.hior = to_book3s(vcpu)-hior; - return 0; } @@ -835,11 +830,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Flush the MMU after messing with the segments */ kvmppc_mmu_pte_flush(vcpu, 0, 0); - if (sregs-u.s.flags KVM_SREGS_S_HIOR) { - to_book3s(vcpu)-hior_sregs = true; - to_book3s(vcpu)-hior = sregs-u.s.hior; - } - return 0; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 55b4233..e75c5ac 100644 --- 
a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -209,7 +209,6 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_BOOKE_SREGS: #else case KVM_CAP_PPC_SEGSTATE: - case KVM_CAP_PPC_HIOR: case KVM_CAP_PPC_PAPR: #endif case KVM_CAP_PPC_UNSET_IRQ: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 06ef37d..fe57d2b 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -554,7 +554,6 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_SMT 64 #define KVM_CAP_PPC_RMA65 #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ -#define KVM_CAP_PPC_HIOR 67 #define KVM_CAP_PPC_PAPR 68 #define KVM_CAP_SW_TLB 69 -- 1.6.0.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/3] KVM: PPC: Add support for explicit HIOR setting
Until now, we always set HIOR based on the PVR, but this is just wrong. Instead, we should be setting HIOR explicitly, so user space can decide what the initial HIOR value is - just like on real hardware. We keep the old PVR based way around for backwards compatibility, but once user space uses the SET_ONE_REG based method, we drop the PVR logic. Signed-off-by: Alexander Graf ag...@suse.de --- Documentation/virtual/kvm/api.txt |1 + arch/powerpc/include/asm/kvm.h|2 ++ arch/powerpc/include/asm/kvm_book3s.h |2 ++ arch/powerpc/kvm/book3s_pr.c |6 -- arch/powerpc/kvm/powerpc.c| 14 ++ include/linux/kvm.h |1 + 6 files changed, 24 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 5a8f305..eb03179 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1512,6 +1512,7 @@ track of the implemented registers, find a list below: Arch | Register| Width (bits) | | + PPC | KVM_ONE_REG_PPC_HIOR | 64 4.65 KVM_GET_ONE_REG diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index a635e22..53b8759 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -327,4 +327,6 @@ struct kvm_book3e_206_tlb_params { __u32 reserved[8]; }; +#define KVM_ONE_REG_PPC_HIOR KVM_ONE_REG_PPC | 0x100 + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index d4df013..0ba8ba9 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -90,6 +90,8 @@ struct kvmppc_vcpu_book3s { #endif int context_id[SID_CONTEXTS]; + bool hior_explicit; /* HIOR is set by ioctl, not PVR */ + struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 84505a2..565af5a 100644 --- 
a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -150,14 +150,16 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) #ifdef CONFIG_PPC_BOOK3S_64 if ((pvr = 0x33) (pvr 0x7033)) { kvmppc_mmu_book3s_64_init(vcpu); - to_book3s(vcpu)-hior = 0xfff0; + if (!to_book3s(vcpu)-hior_explicit) + to_book3s(vcpu)-hior = 0xfff0; to_book3s(vcpu)-msr_mask = 0xULL; vcpu-arch.cpu_type = KVM_CPU_3S_64; } else #endif { kvmppc_mmu_book3s_32_init(vcpu); - to_book3s(vcpu)-hior = 0; + if (!to_book3s(vcpu)-hior_explicit) + to_book3s(vcpu)-hior = 0; to_book3s(vcpu)-msr_mask = 0xULL; vcpu-arch.cpu_type = KVM_CPU_3S_32; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 39cdb3f..c33f6a7 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -209,6 +209,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_BOOKE_SREGS: #else case KVM_CAP_PPC_SEGSTATE: + case KVM_CAP_PPC_HIOR: case KVM_CAP_PPC_PAPR: #endif case KVM_CAP_PPC_UNSET_IRQ: @@ -634,6 +635,12 @@ static int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, int r = -EINVAL; switch (reg-id) { +#ifdef CONFIG_PPC_BOOK3S + case KVM_ONE_REG_PPC_HIOR: + reg-u.reg64 = to_book3s(vcpu)-hior; + r = 0; + break; +#endif default: break; } @@ -647,6 +654,13 @@ static int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, int r = -EINVAL; switch (reg-id) { +#ifdef CONFIG_PPC_BOOK3S + case KVM_ONE_REG_PPC_HIOR: + to_book3s(vcpu)-hior = reg-u.reg64; + to_book3s(vcpu)-hior_explicit = true; + r = 0; + break; +#endif default: break; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 762959a..cc6c2fb 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_SMT 64 #define KVM_CAP_PPC_RMA65 #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ +#define KVM_CAP_PPC_HIOR 67 #define KVM_CAP_PPC_PAPR 68 #define KVM_CAP_SW_TLB 69 #define KVM_CAP_ONE_REG 70 -- 1.6.0.2 -- To unsubscribe from this list: send 
the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
kgdb hooks and kvm-tool
Hi. Is it possible to use kvm-tool with a kernel compiled with kgdb? I've tried adding 'kgdbwait kgdboc=ttyS0' to -p, but that doesn't seem to work. Thanks, \dae -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 04/10] x86/ticketlock: collapse a layer of functions
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com Now that the paravirtualization layer doesn't exist at the spinlock level any more, we can collapse the __ticket_ functions into the arch_ functions. Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com --- arch/x86/include/asm/spinlock.h | 35 +-- 1 files changed, 5 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 860fc4b..98fe202 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -76,7 +76,7 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, __t * in the high part, because a wide xadd increment of the low part would carry * up and contaminate the high part. */ -static __always_inline void __ticket_spin_lock(struct arch_spinlock *lock) +static __always_inline void arch_spin_lock(struct arch_spinlock *lock) { register struct __raw_tickets inc = { .tail = 1 }; @@ -96,7 +96,7 @@ static __always_inline void __ticket_spin_lock(struct arch_spinlock *lock) out: barrier(); /* make sure nothing creeps before the lock is taken */ } -static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) +static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) { arch_spinlock_t old, new; @@ -128,7 +128,7 @@ static __always_inline void __ticket_unlock_release(arch_spinlock_t *lock) } #endif -static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) +static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { __ticket_t next = lock-tickets.head + 1; @@ -136,46 +136,21 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) __ticket_unlock_kick(lock, next); } -static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) +static inline int arch_spin_is_locked(arch_spinlock_t *lock) { struct __raw_tickets tmp = ACCESS_ONCE(lock-tickets); return !!(tmp.tail ^ tmp.head); } -static inline int 
__ticket_spin_is_contended(arch_spinlock_t *lock) +static inline int arch_spin_is_contended(arch_spinlock_t *lock) { struct __raw_tickets tmp = ACCESS_ONCE(lock-tickets); return ((tmp.tail - tmp.head) TICKET_MASK) 1; } - -static inline int arch_spin_is_locked(arch_spinlock_t *lock) -{ - return __ticket_spin_is_locked(lock); -} - -static inline int arch_spin_is_contended(arch_spinlock_t *lock) -{ - return __ticket_spin_is_contended(lock); -} #define arch_spin_is_contended arch_spin_is_contended -static __always_inline void arch_spin_lock(arch_spinlock_t *lock) -{ - __ticket_spin_lock(lock); -} - -static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) -{ - return __ticket_spin_trylock(lock); -} - -static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) -{ - __ticket_spin_unlock(lock); -} - static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) { -- 1.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/10] x86/ticketlocks: remove obsolete comment
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com The note about partial registers is not really relevant now that we rely on gcc to generate all the assembler. Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com --- arch/x86/include/asm/spinlock.h |4  1 files changed, 0 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index f5695ee..972c260 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -49,10 +49,6 @@ * issues and should be optimal for the uncontended case. Note the tail must be * in the high part, because a wide xadd increment of the low part would carry * up and contaminate the high part. - * - * With fewer than 2^8 possible CPUs, we can use x86's partial registers to - * save some instructions and make the code more elegant. There really isn't - * much between them in performance though, especially as locks are out of line. */ static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) { -- 1.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 02/10] x86/spinlocks: replace pv spinlocks with pv ticketlocks
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com Rather than outright replacing the entire spinlock implementation in order to paravirtualize it, keep the ticket lock implementation but add a couple of pvops hooks on the slow patch (long spin on lock, unlocking a contended lock). Ticket locks have a number of nice properties, but they also have some surprising behaviours in virtual environments. They enforce a strict FIFO ordering on cpus trying to take a lock; however, if the hypervisor scheduler does not schedule the cpus in the correct order, the system can waste a huge amount of time spinning until the next cpu can take the lock. (See Thomas Friebel's talk Prevent Guests from Spinning Around http://www.xen.org/files/xensummitboston08/LHP.pdf for more details.) To address this, we add two hooks: - __ticket_spin_lock which is called after the cpu has been spinning on the lock for a significant number of iterations but has failed to take the lock (presumably because the cpu holding the lock has been descheduled). The lock_spinning pvop is expected to block the cpu until it has been kicked by the current lock holder. - __ticket_spin_unlock, which on releasing a contended lock (there are more cpus with tail tickets), it looks to see if the next cpu is blocked and wakes it if so. When compiled with CONFIG_PARAVIRT_SPINLOCKS disabled, a set of stub functions causes all the extra code to go away. 
Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com --- arch/x86/include/asm/paravirt.h | 30 ++-- arch/x86/include/asm/paravirt_types.h | 10 ++--- arch/x86/include/asm/spinlock.h | 59 ++--- arch/x86/include/asm/spinlock_types.h |4 -- arch/x86/kernel/paravirt-spinlocks.c | 15 +--- arch/x86/xen/spinlock.c |7 +++- 6 files changed, 63 insertions(+), 62 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index a7d2db9..76cae7a 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -750,36 +750,14 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) defined(CONFIG_PARAVIRT_SPINLOCKS) -static inline int arch_spin_is_locked(struct arch_spinlock *lock) +static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, __ticket_t ticket) { - return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); + PVOP_VCALL2(pv_lock_ops.lock_spinning, lock, ticket); } -static inline int arch_spin_is_contended(struct arch_spinlock *lock) +static __always_inline void ticket_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) { - return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock); -} -#define arch_spin_is_contended arch_spin_is_contended - -static __always_inline void arch_spin_lock(struct arch_spinlock *lock) -{ - PVOP_VCALL1(pv_lock_ops.spin_lock, lock); -} - -static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock, - unsigned long flags) -{ - PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags); -} - -static __always_inline int arch_spin_trylock(struct arch_spinlock *lock) -{ - return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock); -} - -static __always_inline void arch_spin_unlock(struct arch_spinlock *lock) -{ - PVOP_VCALL1(pv_lock_ops.spin_unlock, lock); + PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); } #endif diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h 
index 8e8b9a4..005e24d 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -327,13 +327,11 @@ struct pv_mmu_ops { }; struct arch_spinlock; +#include asm/spinlock_types.h + struct pv_lock_ops { - int (*spin_is_locked)(struct arch_spinlock *lock); - int (*spin_is_contended)(struct arch_spinlock *lock); - void (*spin_lock)(struct arch_spinlock *lock); - void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long flags); - int (*spin_trylock)(struct arch_spinlock *lock); - void (*spin_unlock)(struct arch_spinlock *lock); + void (*lock_spinning)(struct arch_spinlock *lock, __ticket_t ticket); + void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); }; /* This contains all the paravirt structures: we get a convenient diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 972c260..860fc4b 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -37,6 +37,32 @@ # define UNLOCK_LOCK_PREFIX #endif +/* How long a lock should spin before we consider blocking */ +#define SPIN_THRESHOLD (1 11) + +#ifndef CONFIG_PARAVIRT_SPINLOCKS + +static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, __ticket_t ticket) +{ +} +
[PATCH 05/10] xen/pvticketlock: Xen implementation for PV ticket locks
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com Replace the old Xen implementation of PV spinlocks with and implementation of xen_lock_spinning and xen_unlock_kick. xen_lock_spinning simply registers the cpu in its entry in lock_waiting, adds itself to the waiting_cpus set, and blocks on an event channel until the channel becomes pending. xen_unlock_kick searches the cpus in waiting_cpus looking for the one which next wants this lock with the next ticket, if any. If found, it kicks it by making its event channel pending, which wakes it up. We need to make sure interrupts are disabled while we're relying on the contents of the per-cpu lock_waiting values, otherwise an interrupt handler could come in, try to take some other lock, block, and overwrite our values. Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com --- arch/x86/xen/spinlock.c | 287 +++ 1 files changed, 43 insertions(+), 244 deletions(-) diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 23af06a..f6133c5 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -19,32 +19,21 @@ #ifdef CONFIG_XEN_DEBUG_FS static struct xen_spinlock_stats { - u64 taken; u32 taken_slow; - u32 taken_slow_nested; u32 taken_slow_pickup; u32 taken_slow_spurious; - u32 taken_slow_irqenable; - u64 released; u32 released_slow; u32 released_slow_kicked; #define HISTO_BUCKETS 30 - u32 histo_spin_total[HISTO_BUCKETS+1]; - u32 histo_spin_spinning[HISTO_BUCKETS+1]; u32 histo_spin_blocked[HISTO_BUCKETS+1]; - u64 time_total; - u64 time_spinning; u64 time_blocked; } spinlock_stats; static u8 zero_stats; -static unsigned lock_timeout = 1 10; -#define TIMEOUT lock_timeout - static inline void check_zero(void) { if (unlikely(zero_stats)) { @@ -73,22 +62,6 @@ static void __spin_time_accum(u64 delta, u32 *array) array[HISTO_BUCKETS]++; } -static inline void spin_time_accum_spinning(u64 start) -{ - u32 delta = xen_clocksource_read() - start; - - __spin_time_accum(delta, 
spinlock_stats.histo_spin_spinning); - spinlock_stats.time_spinning += delta; -} - -static inline void spin_time_accum_total(u64 start) -{ - u32 delta = xen_clocksource_read() - start; - - __spin_time_accum(delta, spinlock_stats.histo_spin_total); - spinlock_stats.time_total += delta; -} - static inline void spin_time_accum_blocked(u64 start) { u32 delta = xen_clocksource_read() - start; @@ -105,214 +78,84 @@ static inline u64 spin_time_start(void) return 0; } -static inline void spin_time_accum_total(u64 start) -{ -} -static inline void spin_time_accum_spinning(u64 start) -{ -} static inline void spin_time_accum_blocked(u64 start) { } #endif /* CONFIG_XEN_DEBUG_FS */ -struct xen_spinlock { - unsigned char lock; /* 0 - free; 1 - locked */ - unsigned short spinners;/* count of waiting cpus */ +struct xen_lock_waiting { + struct arch_spinlock *lock; + __ticket_t want; }; static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; +static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); +static cpumask_t waiting_cpus; -#if 0 -static int xen_spin_is_locked(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - return xl-lock != 0; -} - -static int xen_spin_is_contended(struct arch_spinlock *lock) +static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - /* Not strictly true; this is only the count of contended - lock-takers entering the slow path. */ - return xl-spinners != 0; -} - -static int xen_spin_trylock(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - u8 old = 1; - - asm(xchgb %b0,%1 - : +q (old), +m (xl-lock) : : memory); - - return old == 0; -} - -static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); - -/* - * Mark a cpu as interested in a lock. Returns the CPU's previous - * lock of interest, in case we got preempted by an interrupt. 
- */ -static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) -{ - struct xen_spinlock *prev; - - prev = __this_cpu_read(lock_spinners); - __this_cpu_write(lock_spinners, xl); - - wmb(); /* set lock of interest before count */ - - asm(LOCK_PREFIX incw %0 - : +m (xl-spinners) : : memory); - - return prev; -} - -/* - * Mark a cpu as no longer interested in a lock. Restores previous - * lock of interest (NULL for none). - */ -static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev) -{ - asm(LOCK_PREFIX decw %0 - : +m (xl-spinners) : : memory); - wmb();
[PATCH 08/10] x86/ticketlock: add slowpath logic
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com Maintain a flag in the LSB of the ticket lock tail which indicates whether anyone is in the lock slowpath and may need kicking when the current holder unlocks. The flags are set when the first locker enters the slowpath, and cleared when unlocking to an empty queue (ie, no contention). In the specific implementation of lock_spinning(), make sure to set the slowpath flags on the lock just before blocking. We must do this before the last-chance pickup test to prevent a deadlock with the unlocker: UnlockerLocker test for lock pickup - fail unlock test slowpath - false set slowpath flags block Whereas this works in any ordering: UnlockerLocker set slowpath flags test for lock pickup - fail block unlock test slowpath - true, kick If the unlocker finds that the lock has the slowpath flag set but it is actually uncontended (ie, head == tail, so nobody is waiting), then it clear the slowpath flag. Note on memory access ordering: When unlocking a ticketlock with PV callbacks enabled, unlock first adds to the lock head, then checks to see if the slowpath flag is set in the lock tail. However, because reads are not ordered with respect to writes in different memory locations, the CPU could perform the read before updating head to release the lock. This would deadlock with another CPU in the lock slowpath, as it will set the slowpath flag before checking to see if the lock has been released in the interim. A heavyweight fix would be to stick a full mfence between the two. However, a lighterweight fix is to simply make sure the flag tests loads both head and tail of the lock in a single operation, thereby making sure that it overlaps with the memory written by the unlock, forcing the CPU to maintain ordering. Note: this code relies on gcc making sure that unlikely() code is out of line of the fastpath, which only happens when OPTIMIZE_SIZE=n. If it doesn't the generated code isn't too bad, but its definitely suboptimal. 
(Thanks to Srivatsa Vaddagiri for providing a bugfix to the original version of this change, which has been folded in.) Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com Signed-off-by: Srivatsa Vaddagiri va...@linux.vnet.ibm.com --- arch/x86/include/asm/paravirt.h |2 +- arch/x86/include/asm/spinlock.h | 92 ++-- arch/x86/include/asm/spinlock_types.h |2 + arch/x86/kernel/paravirt-spinlocks.c |1 + arch/x86/xen/spinlock.c |4 ++ 5 files changed, 82 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 50281c7..13b3d8b 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -755,7 +755,7 @@ static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, _ PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket); } -static __always_inline void ticket_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) +static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) { PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); } diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 40c90aa..c1f6981 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -40,29 +40,56 @@ /* How long a lock should spin before we consider blocking */ #define SPIN_THRESHOLD (1 11) -#ifndef CONFIG_PARAVIRT_SPINLOCKS +#ifdef CONFIG_PARAVIRT_SPINLOCKS -static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, __ticket_t ticket) +/* + * Return true if someone is in the slowpath on this lock. This + * should only be used by the current lock-holder. + */ +static inline bool __ticket_in_slowpath(arch_spinlock_t *lock) { + /* +* This deliberately reads both head and tail as a single +* memory operation, and then tests the flag in tail. This is +* to guarantee that this read is ordered after the add to +* head which does the unlock. 
If we were to only read tail +* to test the flag, then the CPU would be free to reorder the +* read to before the write to head (since it is a different +* memory location), which could cause a deadlock with someone +* setting the flag before re-checking the lock availability. +*/ + return ACCESS_ONCE(lock-head_tail) (TICKET_SLOWPATH_FLAG TICKET_SHIFT); } -static __always_inline void ticket_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) +static inline void
[PATCH 00/10] [PATCH RFC V2] Paravirtualized ticketlocks
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com [ Changes since last posting: - fix bugs exposed by the cold light of testing - make the slow flag read in unlock cover the whole lock to force ordering WRT the unlock write - when kicking on unlock, only look for the CPU *we* released (ie, head value the unlock resulted in), rather than re-reading the new head and kicking on that basis - enable PV ticketlocks in Xen HVM guests ] NOTE: this series is available in: git://github.com/jsgf/linux-xen.git upstream/pvticketlock-slowflag and is based on the previously posted ticketlock cleanup series in git://github.com/jsgf/linux-xen.git upstream/ticketlock-cleanup This series replaces the existing paravirtualized spinlock mechanism with a paravirtualized ticketlock mechanism. Ticket locks have an inherent problem in a virtualized case, because the vCPUs are scheduled rather than running concurrently (ignoring gang scheduled vCPUs). This can result in catastrophic performance collapses when the vCPU scheduler doesn't schedule the correct next vCPU, and ends up scheduling a vCPU which burns its entire timeslice spinning. (Note that this is not the same problem as lock-holder preemption, which this series also addresses; that's also a problem, but not catastrophic). (See Thomas Friebel's talk Prevent Guests from Spinning Around http://www.xen.org/files/xensummitboston08/LHP.pdf for more details.) Currently we deal with this by having PV spinlocks, which adds a layer of indirection in front of all the spinlock functions, and defining a completely new implementation for Xen (and for other pvops users, but there are none at present). PV ticketlocks keeps the existing ticketlock implemenentation (fastpath) as-is, but adds a couple of pvops for the slow paths: - If a CPU has been waiting for a spinlock for SPIN_THRESHOLD iterations, then call out to the __ticket_lock_spinning() pvop, which allows a backend to block the vCPU rather than spinning. 
This pvop can set the lock into slowpath state. - When releasing a lock, if it is in slowpath state, the call __ticket_unlock_kick() to kick the next vCPU in line awake. If the lock is no longer in contention, it also clears the slowpath flag. The slowpath state is stored in the LSB of the within the lock ticket. This has the effect of reducing the max number of CPUs by half (so, a small ticket can deal with 128 CPUs, and large ticket 32768). This series provides a Xen implementation, but it should be straightforward to add a KVM implementation as well. Overall, it results in a large reduction in code, it makes the native and virtualized cases closer, and it removes a layer of indirection around all the spinlock functions. The fast path (taking an uncontended lock which isn't in slowpath state) is optimal, identical to the non-paravirtualized case. The inner part of ticket lock code becomes: inc = xadd(lock-tickets, inc); inc.tail = ~TICKET_SLOWPATH_FLAG; if (likely(inc.head == inc.tail)) goto out; for (;;) { unsigned count = SPIN_THRESHOLD; do { if (ACCESS_ONCE(lock-tickets.head) == inc.tail) goto out; cpu_relax(); } while (--count); __ticket_lock_spinning(lock, inc.tail); } out:barrier(); which results in: push %rbp mov%rsp,%rbp mov$0x200,%eax lock xadd %ax,(%rdi) movzbl %ah,%edx cmp%al,%dl jne1f pop%rbp retq ### SLOWPATH START 1: and$-2,%edx movzbl %dl,%esi 2: mov$0x800,%eax jmp4f 3: pause sub$0x1,%eax je 5f 4: movzbl (%rdi),%ecx cmp%cl,%dl jne3b pop%rbp retq 5: callq *__ticket_lock_spinning jmp2b ### SLOWPATH END with CONFIG_PARAVIRT_SPINLOCKS=n, the code has changed slightly, where the fastpath case is straight through (taking the lock without contention), and the spin loop is out of line: push %rbp mov%rsp,%rbp mov$0x100,%eax lock xadd %ax,(%rdi) movzbl %ah,%edx cmp%al,%dl jne1f pop%rbp retq ### SLOWPATH START 1: pause movzbl (%rdi),%eax cmp%dl,%al jne1b pop%rbp retq ### SLOWPATH END The unlock code is very straightforward: prev = *lock; 
__ticket_unlock_release(lock); if (unlikely(__ticket_in_slowpath(lock))) __ticket_unlock_slowpath(lock, prev); which generates: push %rbp mov%rsp,%rbp movzwl (%rdi),%esi addb $0x2,(%rdi) movzwl (%rdi),%eax testb $0x1,%ah
[PATCH 10/10] xen: enable PV ticketlocks on HVM Xen
From: Stefano Stabellini stefano.stabell...@eu.citrix.com Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com --- arch/x86/xen/smp.c |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index e79dbb9..bf958ce 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -552,4 +552,5 @@ void __init xen_hvm_smp_init(void) smp_ops.cpu_die = xen_hvm_cpu_die; smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; + xen_init_spinlocks(); } -- 1.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 09/10] xen/pvticketlock: allow interrupts to be enabled while blocking
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com If interrupts were enabled when taking the spinlock, we can leave them enabled while blocking to get the lock. If we can enable interrupts while waiting for the lock to become available, and we take an interrupt before entering the poll, and the handler takes a spinlock which ends up going into the slow state (invalidating the per-cpu lock and want values), then when the interrupt handler returns the event channel will remain pending so the poll will return immediately, causing it to return out to the main spinlock loop. Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com --- arch/x86/xen/spinlock.c | 48 -- 1 files changed, 41 insertions(+), 7 deletions(-) diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index c939723..7366b39 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -106,11 +106,28 @@ static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) start = spin_time_start(); - /* Make sure interrupts are disabled to ensure that these - per-cpu values are not overwritten. */ + /* +* Make sure an interrupt handler can't upset things in a +* partially setup state. +*/ local_irq_save(flags); + /* +* We don't really care if we're overwriting some other +* (lock,want) pair, as that would mean that we're currently +* in an interrupt context, and the outer context had +* interrupts enabled. That has already kicked the VCPU out +* of xen_poll_irq(), so it will just return spuriously and +* retry with newly setup (lock,want). +* +* The ordering protocol on this is that the lock pointer +* may only be set non-NULL if the want ticket is correct. +* If we're updating want, we must first clear lock. 
+*/ + w-lock = NULL; + smp_wmb(); w-want = want; + smp_wmb(); w-lock = lock; /* This uses set_bit, which atomic and therefore a barrier */ @@ -124,21 +141,36 @@ static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) /* Only check lock once pending cleared */ barrier(); - /* Mark entry to slowpath before doing the pickup test to make - sure we don't deadlock with an unlocker. */ + /* +* Mark entry to slowpath before doing the pickup test to make +* sure we don't deadlock with an unlocker. +*/ __ticket_enter_slowpath(lock); - /* check again make sure it didn't become free while - we weren't looking */ + /* +* check again make sure it didn't become free while +* we weren't looking +*/ if (ACCESS_ONCE(lock-tickets.head) == want) { ADD_STATS(taken_slow_pickup, 1); goto out; } + /* Allow interrupts while blocked */ + local_irq_restore(flags); + + /* +* If an interrupt happens here, it will leave the wakeup irq +* pending, which will cause xen_poll_irq() to return +* immediately. +*/ + /* Block until irq becomes pending (or perhaps a spurious wakeup) */ xen_poll_irq(irq); ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); + local_irq_save(flags); + kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); out: @@ -160,7 +192,9 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) for_each_cpu(cpu, waiting_cpus) { const struct xen_lock_waiting *w = per_cpu(lock_waiting, cpu); - if (w-lock == lock w-want == next) { + /* Make sure we read lock before want */ + if (ACCESS_ONCE(w-lock) == lock + ACCESS_ONCE(w-want) == next) { ADD_STATS(released_slow_kicked, 1); xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); break; -- 1.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/10] x86/ticketlock: don't inline _spin_unlock when using paravirt spinlocks
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com The code size expands somewhat, and its probably better to just call a function rather than inline it. Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com --- arch/x86/Kconfig |3 +++ kernel/Kconfig.locks |2 +- 2 files changed, 4 insertions(+), 1 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6a47bb2..1f03f82 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -585,6 +585,9 @@ config PARAVIRT_SPINLOCKS If you are unsure how to answer this question, answer N. +config ARCH_NOINLINE_SPIN_UNLOCK + def_bool PARAVIRT_SPINLOCKS + config PARAVIRT_CLOCK bool diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 5068e2a..584637b 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -125,7 +125,7 @@ config INLINE_SPIN_LOCK_IRQSAVE ARCH_INLINE_SPIN_LOCK_IRQSAVE config INLINE_SPIN_UNLOCK - def_bool !DEBUG_SPINLOCK (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK) + def_bool !DEBUG_SPINLOCK (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK) !ARCH_NOINLINE_SPIN_UNLOCK config INLINE_SPIN_UNLOCK_BH def_bool !DEBUG_SPINLOCK ARCH_INLINE_SPIN_UNLOCK_BH -- 1.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 06/10] x86/pvticketlock: use callee-save for lock_spinning
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com Although the lock_spinning calls in the spinlock code are on the uncommon path, their presence can cause the compiler to generate many more register save/restores in the function pre/postamble, which is in the fast path. To avoid this, convert it to using the pvops callee-save calling convention, which defers all the save/restores until the actual function is called, keeping the fastpath clean. Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com --- arch/x86/include/asm/paravirt.h |2 +- arch/x86/include/asm/paravirt_types.h |2 +- arch/x86/kernel/paravirt-spinlocks.c |2 +- arch/x86/xen/spinlock.c |3 ++- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 76cae7a..50281c7 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -752,7 +752,7 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, __ticket_t ticket) { - PVOP_VCALL2(pv_lock_ops.lock_spinning, lock, ticket); + PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket); } static __always_inline void ticket_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 005e24d..5e0c138 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -330,7 +330,7 @@ struct arch_spinlock; #include asm/spinlock_types.h struct pv_lock_ops { - void (*lock_spinning)(struct arch_spinlock *lock, __ticket_t ticket); + struct paravirt_callee_save lock_spinning; void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); }; diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index c2e010e..4251c1d 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ 
b/arch/x86/kernel/paravirt-spinlocks.c @@ -9,7 +9,7 @@ struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP - .lock_spinning = paravirt_nop, + .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), .unlock_kick = paravirt_nop, #endif }; diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index f6133c5..7a04950 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -145,6 +145,7 @@ out: spin_time_accum_blocked(start); } +PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning); static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) { @@ -197,7 +198,7 @@ void xen_uninit_lock_cpu(int cpu) void __init xen_init_spinlocks(void) { - pv_lock_ops.lock_spinning = xen_lock_spinning; + pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); pv_lock_ops.unlock_kick = xen_unlock_kick; } -- 1.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 07/10] x86/ticketlocks: when paravirtualizing ticket locks, increment by 2
From: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com Increment ticket head/tails by 2 rather than 1 to leave the LSB free to store a is in slowpath state bit. This halves the number of possible CPUs for a given ticket size, but this shouldn't matter in practice - kernels built for 32k+ CPU systems are probably specially built for the hardware rather than a generic distro kernel. Signed-off-by: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com --- arch/x86/include/asm/spinlock.h | 16 arch/x86/include/asm/spinlock_types.h | 10 +- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 98fe202..40c90aa 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -78,7 +78,7 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, __t */ static __always_inline void arch_spin_lock(struct arch_spinlock *lock) { - register struct __raw_tickets inc = { .tail = 1 }; + register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC }; inc = xadd(lock-tickets, inc); @@ -104,7 +104,7 @@ static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) if (old.tickets.head != old.tickets.tail) return 0; - new.head_tail = old.head_tail + (1 TICKET_SHIFT); + new.head_tail = old.head_tail + (TICKET_LOCK_INC TICKET_SHIFT); /* cmpxchg is a full barrier, so nothing can move before it */ return cmpxchg(lock-head_tail, old.head_tail, new.head_tail) == old.head_tail; @@ -113,24 +113,24 @@ static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) #if (NR_CPUS 256) static __always_inline void __ticket_unlock_release(arch_spinlock_t *lock) { - asm volatile(UNLOCK_LOCK_PREFIX incb %0 + asm volatile(UNLOCK_LOCK_PREFIX addb %1, %0 : +m (lock-head_tail) -: +: i (TICKET_LOCK_INC) : memory, cc); } #else static __always_inline void __ticket_unlock_release(arch_spinlock_t *lock) { - asm volatile(UNLOCK_LOCK_PREFIX incw %0 + asm volatile(UNLOCK_LOCK_PREFIX 
addw %1, %0 : +m (lock-head_tail) -: +: i (TICKET_LOCK_INC) : memory, cc); } #endif static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { - __ticket_t next = lock-tickets.head + 1; + __ticket_t next = lock-tickets.head + TICKET_LOCK_INC; __ticket_unlock_release(lock); __ticket_unlock_kick(lock, next); @@ -147,7 +147,7 @@ static inline int arch_spin_is_contended(arch_spinlock_t *lock) { struct __raw_tickets tmp = ACCESS_ONCE(lock-tickets); - return ((tmp.tail - tmp.head) TICKET_MASK) 1; + return ((tmp.tail - tmp.head) TICKET_MASK) TICKET_LOCK_INC; } #define arch_spin_is_contended arch_spin_is_contended diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index dbe223d..aa9a205 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -3,7 +3,13 @@ #include linux/types.h -#if (CONFIG_NR_CPUS 256) +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#define __TICKET_LOCK_INC 2 +#else +#define __TICKET_LOCK_INC 1 +#endif + +#if (CONFIG_NR_CPUS (256 / __TICKET_LOCK_INC)) typedef u8 __ticket_t; typedef u16 __ticketpair_t; #else @@ -11,6 +17,8 @@ typedef u16 __ticket_t; typedef u32 __ticketpair_t; #endif +#define TICKET_LOCK_INC((__ticket_t)__TICKET_LOCK_INC) + #define TICKET_SHIFT (sizeof(__ticket_t) * 8) #define TICKET_MASK((__ticket_t)((1 TICKET_SHIFT) - 1)) -- 1.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] pci: clean all funcs when hot-removing multifunc device
On Wed, Sep 14, 2011 at 07:45:59AM -0400, Amos Kong wrote: The size of bios.bin compiled from seabios original: 128K only apply patch1: 256K only apply patch2: 128K patch1: add 6 slot(only slot6 has 8 funcs) to the table can hotplug/hot-remove a multifunc device to slot 6 successfully patch2: add 31 slot(with 8 funcs) to the table could not boot up guest. I found there is a special process for large bios.bin in qemu, problem maybe exist here, I'm driving into it... qemu/hw/pc.c: void pc_memory_init(... /* map the last 128KB of the BIOS in ISA space */ isa_bios_size = bios_size; if (isa_bios_size (128 * 1024)) isa_bios_size = 128 * 1024; This is probably a regression since seabios commit 87b533bf. Prior to that commit, seabios did not mark the early 32bit initialization code as init code. However, a side effect of marking that code (handle_post) as init code is that it is more likely the linker could place the code at an address less than 0xe. I'm guesing the patch below (just a hack) would cover up the issue. -Kevin --- a/src/post.c +++ b/src/post.c @@ -336,7 +336,7 @@ reloc_init(void) // Start of Power On Self Test (POST) - the BIOS initilization phase. // This function does the setup needed for code relocation, and then // invokes the relocation and main setup code. -void VISIBLE32INIT +void VISIBLE32FLAT handle_post(void) { debug_serial_setup(); @@ -356,6 +356,14 @@ handle_post(void) // Allow writes to modify bios area (0xf) make_bios_writable(); + +void handle_post2(void); +handle_post2(); +} + +void VISIBLE32INIT +handle_post2(void) +{ HaveRunPost = 1; // Detect ram and setup internal malloc. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 03/11] KVM: x86: retry non-page-table writing instruction
On 09/14/2011 06:19 PM, Xiao Guangrong wrote: On 09/14/2011 05:53 PM, Avi Kivity wrote: On 09/13/2011 09:24 PM, Xiao Guangrong wrote: +static bool retry_instruction(struct x86_emulate_ctxt *ctxt, + unsigned long cr2, int emulation_type) +{ +if (!vcpu-arch.mmu.direct_map !mmu_is_nested(vcpu)) +gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); If mmu_is_nested() cr2 is an ngpa, we have to translate it to a gpa, no? Yeah, will fix it. And this bug also exists in the current code: it always uses L2 gpa to emulate write operation. Can you please send this fix separately, so it can be backported if needed? Sure, i will do it as soon as possible. :-) I am so sorry, the current code is good, it has already translated L2 gpa to L1 gpa: vcpu-arch.nested_mmu.translate_gpa = translate_nested_gpa; Please ignore it. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] kvm tools: Use host's resolv.conf within the guest
On Wed, Sep 14, 2011 at 7:28 PM, Sasha Levin levinsasha...@gmail.com wrote: Since kernel IP autoconfiguration doesn't set up /etc/resolv.conf, we'll use the one located within the host, since this was anyway what we simulated within the DHCP offer packets. Signed-off-by: Sasha Levin levinsasha...@gmail.com Wouldn't a symlink to /host/etc/resolv.conf be more appropriate? Remember, we're supposed to only need to setup the shared rootfs once. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kgdb hooks and kvm-tool
On Thu, Sep 15, 2011 at 2:17 AM, David Evensky even...@dancer.ca.sandia.gov wrote: Hi. Is it possible to use kvm-tool with a kernel compiled with kgdb? I've tried adding 'kgdbwait kgdboc=ttyS0' to -p, but that doesn't seem to work. I've never tried kgdb myself but I'm rather surprised it doesn't just work. Sasha, Cyrill, Asias, have you guys ever tried kvmtool with kgdb? Pekka -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] kvm tools: Use host's resolv.conf within the guest
On Thu, 2011-09-15 at 08:29 +0300, Pekka Enberg wrote: On Wed, Sep 14, 2011 at 7:28 PM, Sasha Levin levinsasha...@gmail.com wrote: Since kernel IP autoconfiguration doesn't set up /etc/resolv.conf, we'll use the one located within the host, since this was anyway what we simulated within the DHCP offer packets. Signed-off-by: Sasha Levin levinsasha...@gmail.com Wouldn't a symlink to /host/etc/resolv.conf be more appropriate? Remember, we're supposed to only need to setup the shared rootfs once. It would mean the guest can screw up with the host's networking. -- Sasha. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kgdb hooks and kvm-tool
On Thu, 2011-09-15 at 08:32 +0300, Pekka Enberg wrote: On Thu, Sep 15, 2011 at 2:17 AM, David Evensky even...@dancer.ca.sandia.gov wrote: Hi. Is it possible to use kvm-tool with a kernel compiled with kgdb? I've tried adding 'kgdbwait kgdboc=ttyS0' to -p, but that doesn't seem to work. I've never tried kgdb myself but I'm rather surprised it doesn't just work. Sasha, Cyrill, Asias, have you guys ever tried kvmtool with kgdb? You can either use 'kgdboc=kbd' to use it over the keyboard. I also have a patch which uses forkpty() to spawn serial consoles and redirect guest tty's into them, but it's somewhat ugly. Give me a day or two to make it nicer and I'll send it over. -- Sasha. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] kvm tools: Use host's resolv.conf within the guest
On 9/15/11 8:36 AM, Sasha Levin wrote: On Thu, 2011-09-15 at 08:29 +0300, Pekka Enberg wrote: On Wed, Sep 14, 2011 at 7:28 PM, Sasha Levin levinsasha...@gmail.com wrote: Since kernel IP autoconfiguration doesn't set up /etc/resolv.conf, we'll use the one located within the host, since this was anyway what we simulated within the DHCP offer packets. Signed-off-by: Sasha Levin levinsasha...@gmail.com Wouldn't a symlink to /host/etc/resolv.conf be more appropriate? Remember, we're supposed to only need to setup the shared rootfs once. It would mean the guest can screw up with the host's networking. How? You're not supposed to run the tool. Pekka -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/3] KVM: PPC: HIOR and sregs fixup
While working on the HIOR setting which already made it into Avi's tree, I was too uncautious and ended up extending the sregs structure, breaking ABI compatibility with all QEMU versions. So the approach I was taking there was obvious wrong. Instead, what I thought might be a better alternative is to get rid of the static we have a struct full of registers and shove it left and right and instead just poke registers directly between kernel and user space. That sounds slow for starters, but once we have the infrastructure in place, we can build a batched version of the same interface and be fast again but maintain flexibility. This interface can also for example be used to easily fetch the next great extension of SSE registers or some MSRs that we haven't thought of or lots of PPC registers I haven't even heard of so far :). There always seem to be new ones to learn of out there. Please take a look at the interface and comment on whether you like it this way or not. It's currently only implemented for the PPC target, but is held generically, so everyone can use it. Oh and - it obviously implements HIOR again which we have to drop from sregs due to the ABI breakage. Alex Alexander Graf (3): Revert KVM: PPC: Add support for explicit HIOR setting KVM: PPC: Add generic single register ioctls KVM: PPC: Add support for explicit HIOR setting Documentation/virtual/kvm/api.txt | 48 arch/powerpc/include/asm/kvm.h| 10 + arch/powerpc/include/asm/kvm_book3s.h |2 +- arch/powerpc/kvm/book3s_pr.c | 12 +- arch/powerpc/kvm/powerpc.c| 64 + include/linux/kvm.h | 32 6 files changed, 149 insertions(+), 19 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] Revert KVM: PPC: Add support for explicit HIOR setting
This reverts commit 11d7596e18a712dc3bc29d45662ec111fd65946b. It exceeded the padding on the SREGS struct, rendering the ABI backwards-incompatible. Signed-off-by: Alexander Graf ag...@suse.de --- arch/powerpc/include/asm/kvm.h|8 arch/powerpc/include/asm/kvm_book3s.h |2 -- arch/powerpc/kvm/book3s_pr.c | 14 ++ arch/powerpc/kvm/powerpc.c|1 - include/linux/kvm.h |1 - 5 files changed, 2 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index 71684b9..a635e22 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -149,12 +149,6 @@ struct kvm_regs { #define KVM_SREGS_E_UPDATE_DBSR(1 3) /* - * Book3S special bits to indicate contents in the struct by maintaining - * backwards compatibility with older structs. If adding a new field, - * please make sure to add a flag for that new field */ -#define KVM_SREGS_S_HIOR (1 0) - -/* * In KVM_SET_SREGS, reserved/pad fields must be left untouched from a * previous KVM_GET_REGS. 
* @@ -179,8 +173,6 @@ struct kvm_sregs { __u64 ibat[8]; __u64 dbat[8]; } ppc32; - __u64 flags; /* KVM_SREGS_S_ */ - __u64 hior; } s; struct { union { diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index a384ffd..d4df013 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -90,8 +90,6 @@ struct kvmppc_vcpu_book3s { #endif int context_id[SID_CONTEXTS]; - bool hior_sregs;/* HIOR is set by SREGS, not PVR */ - struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index d417511..84505a2 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -150,16 +150,14 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) #ifdef CONFIG_PPC_BOOK3S_64 if ((pvr = 0x33) (pvr 0x7033)) { kvmppc_mmu_book3s_64_init(vcpu); - if (!to_book3s(vcpu)-hior_sregs) - to_book3s(vcpu)-hior = 0xfff0; + to_book3s(vcpu)-hior = 0xfff0; to_book3s(vcpu)-msr_mask = 0xULL; vcpu-arch.cpu_type = KVM_CPU_3S_64; } else #endif { kvmppc_mmu_book3s_32_init(vcpu); - if (!to_book3s(vcpu)-hior_sregs) - to_book3s(vcpu)-hior = 0; + to_book3s(vcpu)-hior = 0; to_book3s(vcpu)-msr_mask = 0xULL; vcpu-arch.cpu_type = KVM_CPU_3S_32; } @@ -796,9 +794,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, } } - if (sregs-u.s.flags KVM_SREGS_S_HIOR) - sregs-u.s.hior = to_book3s(vcpu)-hior; - return 0; } @@ -835,11 +830,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Flush the MMU after messing with the segments */ kvmppc_mmu_pte_flush(vcpu, 0, 0); - if (sregs-u.s.flags KVM_SREGS_S_HIOR) { - to_book3s(vcpu)-hior_sregs = true; - to_book3s(vcpu)-hior = sregs-u.s.hior; - } - return 0; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 55b4233..e75c5ac 100644 --- 
a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -209,7 +209,6 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_BOOKE_SREGS: #else case KVM_CAP_PPC_SEGSTATE: - case KVM_CAP_PPC_HIOR: case KVM_CAP_PPC_PAPR: #endif case KVM_CAP_PPC_UNSET_IRQ: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 06ef37d..fe57d2b 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -554,7 +554,6 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_SMT 64 #define KVM_CAP_PPC_RMA65 #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ -#define KVM_CAP_PPC_HIOR 67 #define KVM_CAP_PPC_PAPR 68 #define KVM_CAP_SW_TLB 69 -- 1.6.0.2 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/3] KVM: PPC: Add support for explicit HIOR setting
Until now, we always set HIOR based on the PVR, but this is just wrong. Instead, we should be setting HIOR explicitly, so user space can decide what the initial HIOR value is - just like on real hardware. We keep the old PVR based way around for backwards compatibility, but once user space uses the SET_ONE_REG based method, we drop the PVR logic. Signed-off-by: Alexander Graf ag...@suse.de --- Documentation/virtual/kvm/api.txt |1 + arch/powerpc/include/asm/kvm.h|2 ++ arch/powerpc/include/asm/kvm_book3s.h |2 ++ arch/powerpc/kvm/book3s_pr.c |6 -- arch/powerpc/kvm/powerpc.c| 14 ++ include/linux/kvm.h |1 + 6 files changed, 24 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 5a8f305..eb03179 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1512,6 +1512,7 @@ track of the implemented registers, find a list below: Arch | Register| Width (bits) | | + PPC | KVM_ONE_REG_PPC_HIOR | 64 4.65 KVM_GET_ONE_REG diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index a635e22..53b8759 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -327,4 +327,6 @@ struct kvm_book3e_206_tlb_params { __u32 reserved[8]; }; +#define KVM_ONE_REG_PPC_HIOR KVM_ONE_REG_PPC | 0x100 + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index d4df013..0ba8ba9 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -90,6 +90,8 @@ struct kvmppc_vcpu_book3s { #endif int context_id[SID_CONTEXTS]; + bool hior_explicit; /* HIOR is set by ioctl, not PVR */ + struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 84505a2..565af5a 100644 --- 
a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -150,14 +150,16 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) #ifdef CONFIG_PPC_BOOK3S_64 if ((pvr = 0x33) (pvr 0x7033)) { kvmppc_mmu_book3s_64_init(vcpu); - to_book3s(vcpu)-hior = 0xfff0; + if (!to_book3s(vcpu)-hior_explicit) + to_book3s(vcpu)-hior = 0xfff0; to_book3s(vcpu)-msr_mask = 0xULL; vcpu-arch.cpu_type = KVM_CPU_3S_64; } else #endif { kvmppc_mmu_book3s_32_init(vcpu); - to_book3s(vcpu)-hior = 0; + if (!to_book3s(vcpu)-hior_explicit) + to_book3s(vcpu)-hior = 0; to_book3s(vcpu)-msr_mask = 0xULL; vcpu-arch.cpu_type = KVM_CPU_3S_32; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 39cdb3f..c33f6a7 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -209,6 +209,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_BOOKE_SREGS: #else case KVM_CAP_PPC_SEGSTATE: + case KVM_CAP_PPC_HIOR: case KVM_CAP_PPC_PAPR: #endif case KVM_CAP_PPC_UNSET_IRQ: @@ -634,6 +635,12 @@ static int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, int r = -EINVAL; switch (reg-id) { +#ifdef CONFIG_PPC_BOOK3S + case KVM_ONE_REG_PPC_HIOR: + reg-u.reg64 = to_book3s(vcpu)-hior; + r = 0; + break; +#endif default: break; } @@ -647,6 +654,13 @@ static int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, int r = -EINVAL; switch (reg-id) { +#ifdef CONFIG_PPC_BOOK3S + case KVM_ONE_REG_PPC_HIOR: + to_book3s(vcpu)-hior = reg-u.reg64; + to_book3s(vcpu)-hior_explicit = true; + r = 0; + break; +#endif default: break; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 762959a..cc6c2fb 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_SMT 64 #define KVM_CAP_PPC_RMA65 #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ +#define KVM_CAP_PPC_HIOR 67 #define KVM_CAP_PPC_PAPR 68 #define KVM_CAP_SW_TLB 69 #define KVM_CAP_ONE_REG 70 -- 1.6.0.2 -- To unsubscribe from this list: send 
the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html