[PATCH][kvm-unit-test] Keep GUI off when running test cases
From: Jan Kiszka jan.kis...@siemens.com Signed-off-by: Jan Kiszka jan.kis...@siemens.com --- x86-run |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/x86-run b/x86-run index 14ff331..646c577 100755 --- a/x86-run +++ b/x86-run @@ -33,7 +33,7 @@ else pc_testdev=-device testdev,chardev=testlog -chardev file,id=testlog,path=msr.out fi -command=${qemu} -enable-kvm $pc_testdev -serial stdio $pci_testdev -kernel +command=${qemu} -enable-kvm $pc_testdev -display none -serial stdio $pci_testdev -kernel echo ${command} $@ ${command} $@ ret=$? -- 1.7.3.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: Fix RTC interrupt coalescing tracking
On Wed, Jun 26, 2013 at 07:49:37AM +0200, Jan Kiszka wrote: On 2013-06-24 14:19, Gleb Natapov wrote: This reverts most of the f1ed0450a5fac7067590317cbf027f566b6ccbca. After the commit kvm_apic_set_irq() no longer returns accurate information about interrupt injection status if injection is done into disabled APIC. RTC interrupt coalescing tracking relies on the information to be accurate and cannot recover if it is not. Signed-off-by: Gleb Natapov g...@redhat.com diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9d75193..9f4bea8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -405,17 +405,17 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) return highest_irr; } -static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode, - unsigned long *dest_map); +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, +int vector, int level, int trig_mode, +unsigned long *dest_map); I still think __apic_accept_irq should unconditionally inject, and the test for acpi_enabled belongs into kvm_apic_set_irq. Why should __apic_accept_irq accept non-APIC_DM_FIXED messages if the APIC is off? See below for another reason to refactor this part of the interface. 10.4.7.2 Local APIC State After It Has Been Software Disabled The local APIC will respond normally to INIT, NMI, SMI, and SIPI messages. 
-void kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, - unsigned long *dest_map) +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, + unsigned long *dest_map) { struct kvm_lapic *apic = vcpu-arch.apic; - __apic_accept_irq(apic, irq-delivery_mode, irq-vector, - irq-level, irq-trig_mode, dest_map); + return __apic_accept_irq(apic, irq-delivery_mode, irq-vector, + irq-level, irq-trig_mode, dest_map); } static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) @@ -608,8 +608,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, *r = -1; if (irq-shorthand == APIC_DEST_SELF) { - kvm_apic_set_irq(src-vcpu, irq, dest_map); - *r = 1; + *r = kvm_apic_set_irq(src-vcpu, irq, dest_map); return true; } @@ -654,8 +653,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, continue; if (*r 0) *r = 0; - kvm_apic_set_irq(dst[i]-vcpu, irq, dest_map); - *r += 1; + *r += kvm_apic_set_irq(dst[i]-vcpu, irq, dest_map); } ret = true; @@ -664,11 +662,15 @@ out: return ret; } -/* Set an IRQ pending in the lapic. */ -static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode, - unsigned long *dest_map) +/* + * Add a pending IRQ into lapic. + * Return 1 if successfully added and 0 if discarded. 
+ */ +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, +int vector, int level, int trig_mode, +unsigned long *dest_map) { + int result = 0; struct kvm_vcpu *vcpu = apic-vcpu; switch (delivery_mode) { @@ -682,10 +684,13 @@ static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (dest_map) __set_bit(vcpu-vcpu_id, dest_map); - if (kvm_x86_ops-deliver_posted_interrupt) + if (kvm_x86_ops-deliver_posted_interrupt) { + result = 1; kvm_x86_ops-deliver_posted_interrupt(vcpu, vector); - else { - if (apic_test_and_set_irr(vector, apic)) { + } else { + result = !apic_test_and_set_irr(vector, apic); This part of the revert makes no sense. If deliver_posted_interrupt is on, we don't have this feedback anymore, thus we decided to remove it, no? Agree, but I wanted to do clear revert and fix on top. Jan + + if (!result) { if (trig_mode) apic_debug(level trig mode repeatedly for vector %d, vector); @@ -697,7 +702,7 @@ static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } out: trace_kvm_apic_accept_irq(vcpu-vcpu_id, delivery_mode, - trig_mode, vector, false); + trig_mode, vector, !result); break; case APIC_DM_REMRD: @@ -709,12
Re: [PATCH] KVM: Fix RTC interrupt coalescing tracking
On 2013-06-26 08:15, Gleb Natapov wrote: On Wed, Jun 26, 2013 at 07:49:37AM +0200, Jan Kiszka wrote: On 2013-06-24 14:19, Gleb Natapov wrote: This reverts most of the f1ed0450a5fac7067590317cbf027f566b6ccbca. After the commit kvm_apic_set_irq() no longer returns accurate information about interrupt injection status if injection is done into disabled APIC. RTC interrupt coalescing tracking relies on the information to be accurate and cannot recover if it is not. Signed-off-by: Gleb Natapov g...@redhat.com diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9d75193..9f4bea8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -405,17 +405,17 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) return highest_irr; } -static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode, - unsigned long *dest_map); +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, +int vector, int level, int trig_mode, +unsigned long *dest_map); I still think __apic_accept_irq should unconditionally inject, and the test for acpi_enabled belongs into kvm_apic_set_irq. Why should __apic_accept_irq accept non-APIC_DM_FIXED messages if the APIC is off? See below for another reason to refactor this part of the interface. 10.4.7.2 Local APIC State After It Has Been Software Disabled The local APIC will respond normally to INIT, NMI, SMI, and SIPI messages. OK, I see. 
-void kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, - unsigned long *dest_map) +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, + unsigned long *dest_map) { struct kvm_lapic *apic = vcpu-arch.apic; - __apic_accept_irq(apic, irq-delivery_mode, irq-vector, - irq-level, irq-trig_mode, dest_map); + return __apic_accept_irq(apic, irq-delivery_mode, irq-vector, + irq-level, irq-trig_mode, dest_map); } static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) @@ -608,8 +608,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, *r = -1; if (irq-shorthand == APIC_DEST_SELF) { - kvm_apic_set_irq(src-vcpu, irq, dest_map); - *r = 1; + *r = kvm_apic_set_irq(src-vcpu, irq, dest_map); return true; } @@ -654,8 +653,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, continue; if (*r 0) *r = 0; - kvm_apic_set_irq(dst[i]-vcpu, irq, dest_map); - *r += 1; + *r += kvm_apic_set_irq(dst[i]-vcpu, irq, dest_map); } ret = true; @@ -664,11 +662,15 @@ out: return ret; } -/* Set an IRQ pending in the lapic. */ -static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode, - unsigned long *dest_map) +/* + * Add a pending IRQ into lapic. + * Return 1 if successfully added and 0 if discarded. 
+ */ +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, +int vector, int level, int trig_mode, +unsigned long *dest_map) { + int result = 0; struct kvm_vcpu *vcpu = apic-vcpu; switch (delivery_mode) { @@ -682,10 +684,13 @@ static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (dest_map) __set_bit(vcpu-vcpu_id, dest_map); - if (kvm_x86_ops-deliver_posted_interrupt) + if (kvm_x86_ops-deliver_posted_interrupt) { + result = 1; kvm_x86_ops-deliver_posted_interrupt(vcpu, vector); - else { - if (apic_test_and_set_irr(vector, apic)) { + } else { + result = !apic_test_and_set_irr(vector, apic); This part of the revert makes no sense. If deliver_posted_interrupt is on, we don't have this feedback anymore, thus we decided to remove it, no? Agree, but I wanted to do clear revert and fix on top. Fine with me, let's write a separate fix. Jan signature.asc Description: OpenPGP digital signature
Re: [nVMX w/ Haswell] KVM unit-tests in L1 - eventinj test fails trying to send NMI
On 2013-06-05 11:06, Kashyap Chamarthy wrote: Adding Jan, Jun, to see if they have any inputs here. Thanks for the note, it's very helpful! This test actually fails on older CPUs as well, and I can finally reproduce the issue that Jay also reported. I'm not able to cure it by going back to 3b656cf764^, just alter the error report. Anyway, a start. Now I just need time to debug it... Jan /kashyap On Tue, Jun 4, 2013 at 6:14 PM, Kashyap Chamarthy kashyap...@gmail.com wrote: Heya, So, I invoked this in L1 with: === [test@foo kvm-unit-tests]$ time qemu-system-x86_64 -enable-kvm -device pc-testdev -serial stdio -nographic -no-user-config -nodefaults -device isa-debug-exit,iobase=0xf4,iosize=0x4 -kernel ./x86/eventinj.flat | tee /var/tmp/eventinj-test.txt enabling apic paging enabled cr0 = 80010011 cr3 = 7fff000 cr4 = 20 Try to divide by 0 DE isr running divider is 0 Result is 150 DE exception: PASS Try int 3 BP isr running After int 3 BP exception: PASS Try send vec 33 to itself irq1 running After vec 33 to itself vec 33: PASS Try int $33 irq1 running After int $33 int $33: PASS Try send vec 32 and 33 to itself irq1 running irq0 running After vec 32 and 33 to itself vec 32/33: PASS Try send vec 32 and int $33 irq1 running irq0 running After vec 32 and int $33 vec 32/int $33: PASS Try send vec 33 and 62 and mask one with TPR irq1 running After 33/62 TPR test TPR: PASS irq0 running Try send NMI to itself After NMI to itself NMI: FAIL Try int 33 with shadowed stack irq1 running After int 33 with shadowed stack int 33 with shadowed stack: PASS summary: 9 tests, 1 failures real0m0.647s user0m0.164s sys 0m0.146s [test@foo kvm-unit-tests]$ === Any hints on further debugging this ? 
Other info: -- - L1's qemu-kvm CLI === # ps -ef | grep -i qemu qemu 5455 1 94 Jun02 ?1-07:14:29 /usr/bin/qemu-system-x86_64 -machine accel=kvm -name regular-guest -S -machine pc-i440fx-1.4,accel=kvm,usb=off -cpu Haswell,+vmx -m 10240 -smp 4,sockets=4,cores=1,threads=1 -uuid 4ed9ac0b-7f72-dfcf-68b3-e6fe2ac588b2 -nographic -no-user-config -nodefaults -chardev socket,id=charmonitor,path=/var/lib/libvirt/qemu/regular-guest.monitor,server,nowait -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc -no-shutdown -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -drive file=/home/test/vmimages/regular-guest.qcow2,if=none,id=drive-virtio-disk0,format=qcow2,cache=none -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1 -netdev tap,fd=23,id=hostnet0,vhost=on,vhostfd=24 -device virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:80:c1:34,bus=pci.0,addr=0x3 -chardev pty,id=charserial0 -device isa-serial,chardev=charserial0,id=serial0 -device usb-tablet,id=input0 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 root 12255 5419 0 08:41 pts/200:00:00 grep --color=auto -i qemu === - Setup details -- https://github.com/kashyapc/nvmx-haswell/blob/master/SETUP-nVMX.rst /kashyap signature.asc Description: OpenPGP digital signature
Re: [PATCH 4/6 v5] KVM: PPC: exit to user space on ehpriv instruction
On 06/26/2013 01:42 PM, Bharat Bhushan wrote: ehpriv instruction is used for setting software breakpoints by user space. This patch adds support to exit to user space with run-debug have relevant information. As this is the first point we are using run-debug, also defined the run-debug structure. Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- arch/powerpc/include/asm/disassemble.h |4 arch/powerpc/include/uapi/asm/kvm.h| 21 + arch/powerpc/kvm/e500_emulate.c| 27 +++ 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h index 9b198d1..856f8de 100644 --- a/arch/powerpc/include/asm/disassemble.h +++ b/arch/powerpc/include/asm/disassemble.h @@ -77,4 +77,8 @@ static inline unsigned int get_d(u32 inst) return inst 0x; } +static inline unsigned int get_oc(u32 inst) +{ + return (inst 11) 0x7fff; +} #endif /* __ASM_PPC_DISASSEMBLE_H__ */ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 0fb1a6e..ded0607 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -269,7 +269,24 @@ struct kvm_fpu { __u64 fpr[32]; }; +/* + * Defines for h/w breakpoint, watchpoint (read, write or both) and + * software breakpoint. + * These are used as type in KVM_SET_GUEST_DEBUG ioctl and status + * for KVM_DEBUG_EXIT. + */ +#define KVMPPC_DEBUG_NONE 0x0 +#define KVMPPC_DEBUG_BREAKPOINT(1UL 1) +#define KVMPPC_DEBUG_WATCH_WRITE (1UL 2) +#define KVMPPC_DEBUG_WATCH_READ(1UL 3) struct kvm_debug_exit_arch { + __u64 address; + /* +* exiting to userspace because of h/w breakpoint, watchpoint +* (read, write or both) and software breakpoint. +*/ + __u32 status; + __u32 reserved; }; /* for KVM_SET_GUEST_DEBUG */ @@ -281,10 +298,6 @@ struct kvm_guest_debug_arch { * Type denotes h/w breakpoint, read watchpoint, write * watchpoint or watchpoint (both read and write). 
*/ -#define KVMPPC_DEBUG_NONE 0x0 -#define KVMPPC_DEBUG_BREAKPOINT(1UL 1) -#define KVMPPC_DEBUG_WATCH_WRITE (1UL 2) -#define KVMPPC_DEBUG_WATCH_READ(1UL 3) __u32 type; __u32 reserved; } bp[16]; diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index b10a012..dab9d07 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -26,6 +26,8 @@ #define XOP_TLBRE 946 #define XOP_TLBWE 978 #define XOP_TLBILX 18 +#define XOP_EHPRIV 270 +#define EHPRIV_OC_DEBUG 0 As I think the case, OC = 0, is a bit specific since IIRC, if the OC operand is omitted, its equal 0 by default. So I think we should start this OC value from 1 or other magic number. And if possible, we'd better add some comments to describe this to make the OC definition readable. Tiejun #ifdef CONFIG_KVM_E500MC static int dbell2prio(ulong param) @@ -82,6 +84,26 @@ static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb) } #endif +static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + int emulated = EMULATE_DONE; + + switch (get_oc(inst)) { + case EHPRIV_OC_DEBUG: + run-exit_reason = KVM_EXIT_DEBUG; + run-debug.arch.address = vcpu-arch.pc; + run-debug.arch.status = 0; + kvmppc_account_exit(vcpu, DEBUG_EXITS); + emulated = EMULATE_EXIT_USER; + *advance = 0; + break; + default: + emulated = EMULATE_FAIL; + } + return emulated; +} + int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance) { @@ -130,6 +152,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); break; + case XOP_EHPRIV: + emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst, + advance); + break; + default: emulated = EMULATE_FAIL; } -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][kvm-unit-test] Keep gui off when running test cases
Il 26/06/2013 08:06, Jan Kiszka ha scritto: From: Jan Kiszka jan.kis...@siemens.com Signed-off-by: Jan Kiszka jan.kis...@siemens.com --- x86-run |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/x86-run b/x86-run index 14ff331..646c577 100755 --- a/x86-run +++ b/x86-run @@ -33,7 +33,7 @@ else pc_testdev=-device testdev,chardev=testlog -chardev file,id=testlog,path=msr.out fi -command=${qemu} -enable-kvm $pc_testdev -serial stdio $pci_testdev -kernel +command=${qemu} -enable-kvm $pc_testdev -display none -serial stdio $pci_testdev -kernel echo ${command} $@ ${command} $@ ret=$? Reviewed-by: Paolo Bonzini pbonz...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 29/45] kvm/vmx: Use get/put_online_cpus_atomic() to prevent CPU offline
Il 25/06/2013 22:30, Srivatsa S. Bhat ha scritto: - cpu = get_cpu(); + cpu = get_online_cpus_atomic(); vmx_vcpu_load(vmx-vcpu, cpu); vmx-vcpu.cpu = cpu; err = vmx_vcpu_setup(vmx); vmx_vcpu_put(vmx-vcpu); - put_cpu(); + put_online_cpus_atomic(); The new API has a weird name. Why are you adding new functions instead of just modifying get/put_cpu? Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [nVMX w/ Haswell] KVM unit-tests in L1 - eventinj test fails trying to send NMI
Thanks for the note, it's very helpful! This test actually fails on older CPUs as well, and I can finally reproduce the issue that Jay also reported. I'm not able to cure it by going back to 3b656cf764^, Ok, you tried w/o this commit.. commit 3b656cf764cbc43d3efb9bf5f45c618d4cf0989f Author: Jan Kiszka jan.kis...@siemens.com Date: Sun Apr 14 12:12:45 2013 +0200 KVM: nVMX: Fix injection of PENDING_INTERRUPT and NMI_WINDOW exits to L1 Check if the interrupt or NMI window exit is for L1 by testing if it has the corresponding controls enabled. This is required when we allow direct injection from L0 to L2 just alter the error report. Anyway, a start. Now I just need time to debug it... Great, would you prefer a bug to track this? Or will that be ignored? :) Don't hesitate to let me know if you need any further testing help or want me to try something specific. Thanks. Jan /kashyap On Tue, Jun 4, 2013 at 6:14 PM, Kashyap Chamarthy kashyap...@gmail.com wrote: Heya, So, I invoked this in L1 with: === [test@foo kvm-unit-tests]$ time qemu-system-x86_64 -enable-kvm -device pc-testdev -serial stdio -nographic -no-user-config -nodefaults -device isa-debug-exit,iobase=0xf4,iosize=0x4 -kernel ./x86/eventinj.flat | tee /var/tmp/eventinj-test.txt enabling apic paging enabled cr0 = 80010011 cr3 = 7fff000 cr4 = 20 Try to divide by 0 DE isr running divider is 0 Result is 150 DE exception: PASS Try int 3 BP isr running After int 3 BP exception: PASS Try send vec 33 to itself irq1 running After vec 33 to itself vec 33: PASS Try int $33 irq1 running After int $33 int $33: PASS Try send vec 32 and 33 to itself irq1 running irq0 running After vec 32 and 33 to itself vec 32/33: PASS Try send vec 32 and int $33 irq1 running irq0 running After vec 32 and int $33 vec 32/int $33: PASS Try send vec 33 and 62 and mask one with TPR irq1 running After 33/62 TPR test TPR: PASS irq0 running Try send NMI to itself After NMI to itself NMI: FAIL Try int 33 with shadowed stack irq1 running After int 33 
with shadowed stack int 33 with shadowed stack: PASS summary: 9 tests, 1 failures real0m0.647s user0m0.164s sys 0m0.146s [test@foo kvm-unit-tests]$ === Any hints on further debugging this ? Other info: -- - L1's qemu-kvm CLI === # ps -ef | grep -i qemu qemu 5455 1 94 Jun02 ?1-07:14:29 /usr/bin/qemu-system-x86_64 -machine accel=kvm -name regular-guest -S -machine pc-i440fx-1.4,accel=kvm,usb=off -cpu Haswell,+vmx -m 10240 -smp 4,sockets=4,cores=1,threads=1 -uuid 4ed9ac0b-7f72-dfcf-68b3-e6fe2ac588b2 -nographic -no-user-config -nodefaults -chardev socket,id=charmonitor,path=/var/lib/libvirt/qemu/regular-guest.monitor,server,nowait -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc -no-shutdown -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -drive file=/home/test/vmimages/regular-guest.qcow2,if=none,id=drive-virtio-disk0,format=qcow2,cache=none -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1 -netdev tap,fd=23,id=hostnet0,vhost=on,vhostfd=24 -device virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:80:c1:34,bus=pci.0,addr=0x3 -chardev pty,id=charserial0 -device isa-serial,chardev=charserial0,id=serial0 -device usb-tablet,id=input0 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 root 12255 5419 0 08:41 pts/200:00:00 grep --color=auto -i qemu === - Setup details -- https://github.com/kashyapc/nvmx-haswell/blob/master/SETUP-nVMX.rst /kashyap -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Google Summer of Code 2013 has started
It is a pleasure to welcome the following GSoC 2013 students to the QEMU, KVM, and libvirt communities: Libvirt Wireshark Dissector - Yuto KAWAMURA (kawamuray) http://qemu-project.org/Features/LibvirtWiresharkDissector Libvirt Introduce API to query IP addresses for given domain - Nehal J. Wani (nehaljwani) http://www.google-melange.com/gsoc/project/google/gsoc2013/nehaljwani/51001 Libvirt More Intelligent virsh auto-completion - Tomas Meszaros http://www.google-melange.com/gsoc/project/google/gsoc2013/examon/13001 QEMU Integrated Copy-Paste - Ozan Çağlayan and Pallav Agrawal (pallav) http://qemu-project.org/Features/IntegratedCopyPaste QEMU Continuation Passing C - Charlie Shepherd (cs648) http://qemu-project.org/Features/Continuation-Passing_C QEMU Kconfig - Ákos Kovács http://qemu-project.org/Features/Kconfig QEMU USB Media Transfer Protocol emulation - a|mond http://www.google-melange.com/gsoc/project/google/gsoc2013/almond/1001 KVM Nested Virtualization Testsuite - Arthur Chunqi Li (xelatex) http://www.google-melange.com/gsoc/project/google/gsoc2013/xelatex/19001 Coding started on Monday, 17th of June and ends Monday, 23rd of September. Feel free to follow these projects - feature pages are being created with git repo and blog links. Stefan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 29/45] kvm/vmx: Use get/put_online_cpus_atomic() to prevent CPU offline
On 06/26/2013 01:16 PM, Paolo Bonzini wrote: Il 25/06/2013 22:30, Srivatsa S. Bhat ha scritto: -cpu = get_cpu(); +cpu = get_online_cpus_atomic(); vmx_vcpu_load(vmx-vcpu, cpu); vmx-vcpu.cpu = cpu; err = vmx_vcpu_setup(vmx); vmx_vcpu_put(vmx-vcpu); -put_cpu(); +put_online_cpus_atomic(); The new API has a weird name. Why are you adding new functions instead of just modifying get/put_cpu? Because the purpose of those two functions are distinctly different from each other. get/put_cpu() is used to disable preemption on the local CPU. (Which also disables offlining the local CPU during that critical section). What this patchset deals with is synchronizing with offline of *any* CPU. Typically, we use get_online_cpus()/put_online_cpus() for that purpose. But they can't be used in atomic context, because they take mutex locks and hence can sleep. So the code that executes in atomic context and which wants to prevent *any* CPU from going offline, used to disable preemption around its critical section. Disabling preemption prevents stop_machine(), and CPU offline (of *any* CPU) was done via stop_machine(). So disabling preemption disabled any CPU from going offline, as a *side-effect*. And this patchset prepares the ground for getting rid of stop_machine() in the CPU offline path. Which means, disabling preemption only prevents the *local* CPU from going offline. So if code in atomic context wants to prevent any CPU from going offline, we need a new set of APIs, like get/put_online_cpus(), but which can be invoked from atomic context. That's why I named it as get/put_online_cpus_atomic(). One of the key points here is that we want to preserve get/put_cpu() as it is, since its purpose is different - disable preemption and offline of the local CPU. There is no reason to change that API, its useful as it is. Regards, Srivatsa S. 
Bhat -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH-next] kvm: don't try to take mmu_lock while holding the main raw kvm_lock
Il 26/06/2013 00:34, Paul Gortmaker ha scritto: In commit e935b8372cf8 (KVM: Convert kvm_lock to raw_spinlock), the kvm_lock was made a raw lock. However, the kvm mmu_shrink() function tries to grab the (non-raw) mmu_lock within the scope of the raw locked kvm_lock being held. This leads to the following: BUG: sleeping function called from invalid context at kernel/rtmutex.c:659 in_atomic(): 1, irqs_disabled(): 0, pid: 55, name: kswapd0 Preemption disabled at:[a0376eac] mmu_shrink+0x5c/0x1b0 [kvm] Pid: 55, comm: kswapd0 Not tainted 3.4.34_preempt-rt Call Trace: [8106f2ad] __might_sleep+0xfd/0x160 [817d8d64] rt_spin_lock+0x24/0x50 [a0376f3c] mmu_shrink+0xec/0x1b0 [kvm] [8111455d] shrink_slab+0x17d/0x3a0 [81151f00] ? mem_cgroup_iter+0x130/0x260 [8111824a] balance_pgdat+0x54a/0x730 [8111fe47] ? set_pgdat_percpu_threshold+0xa7/0xd0 [811185bf] kswapd+0x18f/0x490 [81070961] ? get_parent_ip+0x11/0x50 [81061970] ? __init_waitqueue_head+0x50/0x50 [81118430] ? balance_pgdat+0x730/0x730 [81060d2b] kthread+0xdb/0xe0 [8106e122] ? finish_task_switch+0x52/0x100 [817e1e94] kernel_thread_helper+0x4/0x10 [81060c50] ? __init_kthread_worker+0x Since we only use the lock for protecting the vm_list, once we've found the instance we want, we can shuffle it to the end of the list and then drop the kvm_lock before taking the mmu_lock. We can do this because after the mmu operations are completed, we break -- i.e. we don't continue list processing, so it doesn't matter if the list changed around us. Signed-off-by: Paul Gortmaker paul.gortma...@windriver.com Since the shrinker code is asynchronous with respect to KVM, I think that the kvm_lock here is also protecting against kvm_destroy_vm running at the same time. So the patch is almost okay; all that is missing is a kvm_get_kvm/kvm_put_kvm pair, where the reference is added just before releasing the kvm_lock. 
Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 28/45] KVM: Use get/put_online_cpus_atomic() to prevent CPU offline
Il 25/06/2013 22:30, Srivatsa S. Bhat ha scritto: Once stop_machine() is gone from the CPU offline path, we won't be able to depend on disabling preemption to prevent CPUs from going offline from under us. Use the get/put_online_cpus_atomic() APIs to prevent CPUs from going offline, while invoking from atomic context. Cc: Gleb Natapov g...@redhat.com Cc: Paolo Bonzini pbonz...@redhat.com Cc: kvm@vger.kernel.org Signed-off-by: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com --- virt/kvm/kvm_main.c |8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 302681c..5bbfa30 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -174,7 +174,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) zalloc_cpumask_var(cpus, GFP_ATOMIC); - me = get_cpu(); + me = get_online_cpus_atomic(); kvm_for_each_vcpu(i, vcpu, kvm) { kvm_make_request(req, vcpu); cpu = vcpu-cpu; @@ -192,7 +192,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) smp_call_function_many(cpus, ack_flush, NULL, 1); else called = false; - put_cpu(); + put_online_cpus_atomic(); free_cpumask_var(cpus); return called; } @@ -1707,11 +1707,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) ++vcpu-stat.halt_wakeup; } - me = get_cpu(); + me = get_online_cpus_atomic(); if (cpu != me (unsigned)cpu nr_cpu_ids cpu_online(cpu)) if (kvm_arch_vcpu_should_kick(vcpu)) smp_send_reschedule(cpu); - put_cpu(); + put_online_cpus_atomic(); } EXPORT_SYMBOL_GPL(kvm_vcpu_kick); #endif /* !CONFIG_S390 */ -- To unsubscribe from this list: send the line unsubscribe linux-pm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Acked-by: Paolo Bonzini pbonz...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 29/45] kvm/vmx: Use get/put_online_cpus_atomic() to prevent CPU offline
Il 26/06/2013 10:06, Srivatsa S. Bhat ha scritto: On 06/26/2013 01:16 PM, Paolo Bonzini wrote: Il 25/06/2013 22:30, Srivatsa S. Bhat ha scritto: - cpu = get_cpu(); + cpu = get_online_cpus_atomic(); vmx_vcpu_load(vmx-vcpu, cpu); vmx-vcpu.cpu = cpu; err = vmx_vcpu_setup(vmx); vmx_vcpu_put(vmx-vcpu); - put_cpu(); + put_online_cpus_atomic(); The new API has a weird name. Why are you adding new functions instead of just modifying get/put_cpu? Because the purpose of those two functions are distinctly different from each other. get/put_cpu() is used to disable preemption on the local CPU. (Which also disables offlining the local CPU during that critical section). Ok, then I understood correctly... and I acked the other KVM patch. However, keeping the code on the local CPU is exactly the point of this particular use of get_cpu()/put_cpu(). Why does it need to synchronize with offlining of other CPUs? Paolo What this patchset deals with is synchronizing with offline of *any* CPU. Typically, we use get_online_cpus()/put_online_cpus() for that purpose. But they can't be used in atomic context, because they take mutex locks and hence can sleep. So the code that executes in atomic context and which wants to prevent *any* CPU from going offline, used to disable preemption around its critical section. Disabling preemption prevents stop_machine(), and CPU offline (of *any* CPU) was done via stop_machine(). So disabling preemption disabled any CPU from going offline, as a *side-effect*. And this patchset prepares the ground for getting rid of stop_machine() in the CPU offline path. Which means, disabling preemption only prevents the *local* CPU from going offline. So if code in atomic context wants to prevent any CPU from going offline, we need a new set of APIs, like get/put_online_cpus(), but which can be invoked from atomic context. That's why I named it as get/put_online_cpus_atomic(). 
One of the key points here is that we want to preserve get/put_cpu() as it is, since its purpose is different - disable preemption and offline of the local CPU. There is no reason to change that API, its useful as it is. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC V10 0/18] Paravirtualized ticket spinlocks
On 06/24/2013 06:47 PM, Andrew Jones wrote: On Mon, Jun 24, 2013 at 06:10:14PM +0530, Raghavendra K T wrote: Results: === base = 3.10-rc2 kernel patched = base + this series The test was on 32 core (model: Intel(R) Xeon(R) CPU X7560) HT disabled with 32 KVM guest vcpu 8GB RAM. Have you ever tried to get results with HT enabled? +---+---+---++---+ ebizzy (records/sec) higher is better +---+---+---++---+ basestdevpatchedstdev%improvement +---+---+---++---+ 1x 5574.9000 237.49975618.94.0366 0.77311 2x 2741.5000 561.30903332. 102.473821.53930 3x 2146.2500 216.77182302.76.3870 7.27237 4x 1663. 141.92351753.750083.5220 5.45701 +---+---+---++---+ This looks good. Are your ebizzy results consistent run to run though? +---+---+---++---+ dbench (Throughput) higher is better +---+---+---++---+ basestdevpatchedstdev%improvement +---+---+---++---+ 1x 14111.5600 754.4525 14645.9900 114.3087 3.78718 2x 2481.627071.26652667.128073.8193 7.47498 3x 1510.248331.86341503.879236.0777-0.42173 4x 1029.487516.91661039.706943.8840 0.99267 +---+---+---++---+ Hmm, I wonder what 2.5x looks like. Also, the 3% improvement with no overcommit is interesting. What's happening there? It makes me wonder what 1x looks like. Hi Andrew, I tried 2.5x case sort where I used 3 guests with 27 vcpu each on 32 core (HT disabled machine) and here is the output. almost no gain there. throughput avgstdev base: 1768.7458 MB/sec 54.044221 patched: 1772.5617 MB/sec 41.227689 gain %0.226 I am yet to try HT enabled cases that would give 0.5x to 2x performance results. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks
On 06/25/2013 08:20 PM, Andrew Theurer wrote: On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote: This series replaces the existing paravirtualized spinlock mechanism with a paravirtualized ticketlock mechanism. The series provides implementation for both Xen and KVM. Changes in V9: - Changed spin_threshold to 32k to avoid excess halt exits that are causing undercommit degradation (after PLE handler improvement). - Added kvm_irq_delivery_to_apic (suggested by Gleb) - Optimized halt exit path to use PLE handler V8 of PVspinlock was posted last year. After Avi's suggestions to look at PLE handler's improvements, various optimizations in PLE handling have been tried. Sorry for not posting this sooner. I have tested the v9 pv-ticketlock patches in 1x and 2x over-commit with 10-vcpu and 20-vcpu VMs. I have tested these patches with and without PLE, as PLE is still not scalable with large VMs. Hi Andrew, Thanks for testing. System: x3850X5, 40 cores, 80 threads 1x over-commit with 10-vCPU VMs (8 VMs) all running dbench: -- Total Configuration Throughput(MB/s)Notes 3.10-default-ple_on 22945 5% CPU in host kernel, 2% spin_lock in guests 3.10-default-ple_off23184 5% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_on22895 5% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_off 23051 5% CPU in host kernel, 2% spin_lock in guests [all 1x results look good here] Yes. The 1x results look too close 2x over-commit with 10-vCPU VMs (16 VMs) all running dbench: --- Total Configuration Throughput Notes 3.10-default-ple_on 6287 55% CPU host kernel, 17% spin_lock in guests 3.10-default-ple_off 1849 2% CPU in host kernel, 95% spin_lock in guests 3.10-pvticket-ple_on 6691 50% CPU in host kernel, 15% spin_lock in guests 3.10-pvticket-ple_off 16464 8% CPU in host kernel, 33% spin_lock in guests I see 6.426% improvement with ple_on and 161.87% improvement with ple_off. 
I think this is a very good sign for the patches [PLE hinders pv-ticket improvements, but even with PLE off, we still off from ideal throughput (somewhere 2)] Okay, The ideal throughput you are referring is getting around atleast 80% of 1x throughput for over-commit. Yes we are still far away from there. 1x over-commit with 20-vCPU VMs (4 VMs) all running dbench: -- Total Configuration Throughput Notes 3.10-default-ple_on 22736 6% CPU in host kernel, 3% spin_lock in guests 3.10-default-ple_off23377 5% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_on22471 6% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_off 23445 5% CPU in host kernel, 3% spin_lock in guests [1x looking fine here] I see ple_off is little better here. 2x over-commit with 20-vCPU VMs (8 VMs) all running dbench: -- Total Configuration Throughput Notes 3.10-default-ple_on 1965 70% CPU in host kernel, 34% spin_lock in guests 3.10-default-ple_off 226 2% CPU in host kernel, 94% spin_lock in guests 3.10-pvticket-ple_on 1942 70% CPU in host kernel, 35% spin_lock in guests 3.10-pvticket-ple_off8003 11% CPU in host kernel, 70% spin_lock in guests [quite bad all around, but pv-tickets with PLE off the best so far. Still quite a bit off from ideal throughput] This is again a remarkable improvement (307%). This motivates me to add a patch to disable ple when pvspinlock is on. probably we can add a hypercall that disables ple in kvm init patch. but only problem I see is what if the guests are mixed. (i.e one guest has pvspinlock support but other does not. Host supports pv) /me thinks In summary, I would state that the pv-ticket is an overall win, but the current PLE handler tends to get in the way on these larger guests. -Andrew -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a
Re: [PATCH v2 29/45] kvm/vmx: Use get/put_online_cpus_atomic() to prevent CPU offline
On 06/26/2013 01:53 PM, Paolo Bonzini wrote: Il 26/06/2013 10:06, Srivatsa S. Bhat ha scritto: On 06/26/2013 01:16 PM, Paolo Bonzini wrote: Il 25/06/2013 22:30, Srivatsa S. Bhat ha scritto: - cpu = get_cpu(); + cpu = get_online_cpus_atomic(); vmx_vcpu_load(vmx-vcpu, cpu); vmx-vcpu.cpu = cpu; err = vmx_vcpu_setup(vmx); vmx_vcpu_put(vmx-vcpu); - put_cpu(); + put_online_cpus_atomic(); The new API has a weird name. Why are you adding new functions instead of just modifying get/put_cpu? Because the purpose of those two functions are distinctly different from each other. get/put_cpu() is used to disable preemption on the local CPU. (Which also disables offlining the local CPU during that critical section). Ok, then I understood correctly... and I acked the other KVM patch. Thank you! However, keeping the code on the local CPU is exactly the point of this particular use of get_cpu()/put_cpu(). Why does it need to synchronize with offlining of other CPUs? Now that I looked at it again, I think you are right, get/put_cpu() is good enough here. But let me explain why I initially thought we needed full synchronization with CPU offline. In short, I wanted to synchronize the calls to __loaded_vmcs_clear(). We have the scenario shown below: CPU offline: CPU_DYING: hardware_disable(); -vmclear_local_loaded_vmcss(); -__loaded_vmcs_clear(v); And vmx_vcpu_load() (among others) can do: vmx_vcpu_load(); - loaded_vmcs_clear(); - __loaded_vmcs_clear(); So I wanted to avoid this race-condition and hence wrapped the code with get/put_online_cpus_atomic(). But the point I missed earlier is that loaded_vmcs_clear() calls __loaded_vmcs_clear() using smp_call_function_single(), which itself synchronizes properly with CPU hotplug. So there is no need to add full hotplug synchronization in the vmx code, as you noted above. So, please ignore this patch, and sorry for the noise! Regards, Srivatsa S. 
Bhat -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][kvm-unit-test] Keep gui off when running test cases
On Wed, Jun 26, 2013 at 09:08:12AM +0200, Paolo Bonzini wrote: Il 26/06/2013 08:06, Jan Kiszka ha scritto: From: Jan Kiszka jan.kis...@siemens.com Signed-off-by: Jan Kiszka jan.kis...@siemens.com Applied, thanks. --- x86-run |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/x86-run b/x86-run index 14ff331..646c577 100755 --- a/x86-run +++ b/x86-run @@ -33,7 +33,7 @@ else pc_testdev=-device testdev,chardev=testlog -chardev file,id=testlog,path=msr.out fi -command=${qemu} -enable-kvm $pc_testdev -serial stdio $pci_testdev -kernel +command=${qemu} -enable-kvm $pc_testdev -display none -serial stdio $pci_testdev -kernel echo ${command} $@ ${command} $@ ret=$? Reviewed-by: Paolo Bonzini pbonz...@redhat.com -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 29/45] kvm/vmx: Use get/put_online_cpus_atomic() to prevent CPU offline
Il 26/06/2013 10:41, Srivatsa S. Bhat ha scritto: On 06/26/2013 01:53 PM, Paolo Bonzini wrote: Il 26/06/2013 10:06, Srivatsa S. Bhat ha scritto: On 06/26/2013 01:16 PM, Paolo Bonzini wrote: Il 25/06/2013 22:30, Srivatsa S. Bhat ha scritto: - cpu = get_cpu(); + cpu = get_online_cpus_atomic(); vmx_vcpu_load(vmx-vcpu, cpu); vmx-vcpu.cpu = cpu; err = vmx_vcpu_setup(vmx); vmx_vcpu_put(vmx-vcpu); - put_cpu(); + put_online_cpus_atomic(); The new API has a weird name. Why are you adding new functions instead of just modifying get/put_cpu? Because the purpose of those two functions are distinctly different from each other. get/put_cpu() is used to disable preemption on the local CPU. (Which also disables offlining the local CPU during that critical section). Ok, then I understood correctly... and I acked the other KVM patch. Thank you! However, keeping the code on the local CPU is exactly the point of this particular use of get_cpu()/put_cpu(). Why does it need to synchronize with offlining of other CPUs? Now that I looked at it again, I think you are right, get/put_cpu() is good enough here. But let me explain why I initially thought we needed full synchronization with CPU offline. In short, I wanted to synchronize the calls to __loaded_vmcs_clear(). We have the scenario shown below: CPU offline: CPU_DYING: hardware_disable(); -vmclear_local_loaded_vmcss(); -__loaded_vmcs_clear(v); And vmx_vcpu_load() (among others) can do: vmx_vcpu_load(); - loaded_vmcs_clear(); - __loaded_vmcs_clear(); So I wanted to avoid this race-condition and hence wrapped the code with get/put_online_cpus_atomic(). But the point I missed earlier is that loaded_vmcs_clear() calls __loaded_vmcs_clear() using smp_call_function_single(), which itself synchronizes properly with CPU hotplug. So there is no need to add full hotplug synchronization in the vmx code, as you noted above. Makes sense, and I see now that it's patch 9 in this series. 
In general, I would rather add an extra get_online_cpus_atomic pair where it it actually needed (i.e. closer to where cpu_online is actually used), and leave get_cpu/put_cpu as is in the caller... which is exactly what happens in this case, since where it is actually needed is in smp_call_function_single(). So, please ignore this patch, and sorry for the noise! No problem, thanks for the heads-up. Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/6 v5] KVM: PPC: exit to user space on ehpriv instruction
On 06/26/2013 04:44 PM, Bhushan Bharat-R65777 wrote: -Original Message- From: tiejun.chen [mailto:tiejun.c...@windriver.com] Sent: Wednesday, June 26, 2013 12:25 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Wood Scott- B07421; b...@kernel.crashing.org; linuxppc-...@lists.ozlabs.org; linux- ker...@vger.kernel.org; mi...@neuling.org; Bhushan Bharat-R65777 Subject: Re: [PATCH 4/6 v5] KVM: PPC: exit to user space on ehpriv instruction On 06/26/2013 01:42 PM, Bharat Bhushan wrote: ehpriv instruction is used for setting software breakpoints by user space. This patch adds support to exit to user space with run-debug have relevant information. As this is the first point we are using run-debug, also defined the run-debug structure. Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- arch/powerpc/include/asm/disassemble.h |4 arch/powerpc/include/uapi/asm/kvm.h| 21 + arch/powerpc/kvm/e500_emulate.c| 27 +++ 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h index 9b198d1..856f8de 100644 --- a/arch/powerpc/include/asm/disassemble.h +++ b/arch/powerpc/include/asm/disassemble.h @@ -77,4 +77,8 @@ static inline unsigned int get_d(u32 inst) return inst 0x; } +static inline unsigned int get_oc(u32 inst) +{ + return (inst 11) 0x7fff; +} #endif /* __ASM_PPC_DISASSEMBLE_H__ */ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 0fb1a6e..ded0607 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -269,7 +269,24 @@ struct kvm_fpu { __u64 fpr[32]; }; +/* + * Defines for h/w breakpoint, watchpoint (read, write or both) and + * software breakpoint. + * These are used as type in KVM_SET_GUEST_DEBUG ioctl and status + * for KVM_DEBUG_EXIT. 
+ */ +#define KVMPPC_DEBUG_NONE 0x0 +#define KVMPPC_DEBUG_BREAKPOINT(1UL 1) +#define KVMPPC_DEBUG_WATCH_WRITE (1UL 2) +#define KVMPPC_DEBUG_WATCH_READ(1UL 3) struct kvm_debug_exit_arch { + __u64 address; + /* +* exiting to userspace because of h/w breakpoint, watchpoint +* (read, write or both) and software breakpoint. +*/ + __u32 status; + __u32 reserved; }; /* for KVM_SET_GUEST_DEBUG */ @@ -281,10 +298,6 @@ struct kvm_guest_debug_arch { * Type denotes h/w breakpoint, read watchpoint, write * watchpoint or watchpoint (both read and write). */ -#define KVMPPC_DEBUG_NONE 0x0 -#define KVMPPC_DEBUG_BREAKPOINT(1UL 1) -#define KVMPPC_DEBUG_WATCH_WRITE (1UL 2) -#define KVMPPC_DEBUG_WATCH_READ(1UL 3) __u32 type; __u32 reserved; } bp[16]; diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index b10a012..dab9d07 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -26,6 +26,8 @@ #define XOP_TLBRE 946 #define XOP_TLBWE 978 #define XOP_TLBILX 18 +#define XOP_EHPRIV 270 +#define EHPRIV_OC_DEBUG 0 As I think the case, OC = 0, is a bit specific since IIRC, if the OC operand is omitted, its equal 0 by default. So I think we should start this OC value from 1 or other magic number. ehpriv instruction is defined to be used as: ehpriv OC // where OC can be 0,1, ... n and in extended for it can be used as ehpriv // With no OC, and here it assumes OC = 0 So OC = 0 is not specific but ehpriv is same as ehpriv 0. Yes, this is just what I mean. I do not think of any special reason to reserve ehpriv and ehpriv 0. So I still prefer we can reserve the 'ehpriv' without OC operand as one simple approach to test or develop something for KVM quickly because its really convenient to trap into the hypervisor only with one 'ehpriv' instruction easily. But I have no further objection if you guys are fine to this ;-) Tiejun Thanks -Bharat And if possible, we'd better add some comments to describe this to make the OC definition readable. 
Tiejun #ifdef CONFIG_KVM_E500MC static int dbell2prio(ulong param) @@ -82,6 +84,26 @@ static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb) } #endif +static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + int emulated = EMULATE_DONE; + + switch (get_oc(inst)) { + case EHPRIV_OC_DEBUG: + run-exit_reason = KVM_EXIT_DEBUG; + run-debug.arch.address = vcpu-arch.pc; + run-debug.arch.status = 0; + kvmppc_account_exit(vcpu, DEBUG_EXITS); + emulated =
Re: Bug#707257: linux-image-3.8-1-686-pae: KVM crashes with entry failed, hardware error 0x80000021
On Mon, Jun 24, 2013 at 10:42:57PM +0200, Stefan Pietsch wrote: On 24.06.2013 14:30, Gleb Natapov wrote: On Mon, Jun 24, 2013 at 01:59:34PM +0200, Stefan Pietsch wrote: As soon as I remove kvmvapic.bin the virtual machine boots with qemu-kvm 1.5.0. I just verified this with Linux kernel 3.10.0-rc5. emulate_invalid_guest_state=0 or emulate_invalid_guest_state=1 make no difference. Please send your patches. Here it is, run with it and kvmvapic.bin present. See what is printed in dmesg after the failure. diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f4a5b3f..65488a4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3385,6 +3385,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 ar; + unsigned long rip; if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { *var = vmx->rmode.segs[seg]; @@ -3408,6 +3409,9 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->db = (ar >> 14) & 1; var->g = (ar >> 15) & 1; var->unusable = (ar >> 16) & 1; + rip = kvm_rip_read(vcpu); + if ((rip == 0xc101611c || rip == 0xc101611a) && seg == VCPU_SREG_FS) + printk("base=%p limit=%p selector=%x ar=%x\n", var->base, var->limit, var->selector, ar); } static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) Booting kernel Linux 3.10-rc5 with your patch applied produces these messages in dmesg when starting a virtual machine: emulate_invalid_guest_state=0 [ 118.732151] base= limit= (null) selector=ffff ar=0 [ 118.732341] base= limit= (null) selector=ffff ar=0 I've butchered printk format, but it gives me the idea of what is going on anyway. Can you try the patch below with emulate_invalid_guest_state=0|1?
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f4a5b3f..eb062ce 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3395,19 +3395,20 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->selector = vmx_read_guest_seg_selector(vmx, seg); return; } + var->base = vmx_read_guest_seg_base(vmx, seg); var->limit = vmx_read_guest_seg_limit(vmx, seg); var->selector = vmx_read_guest_seg_selector(vmx, seg); ar = vmx_read_guest_seg_ar(vmx, seg); + var->unusable = (ar >> 16) & 1; var->type = ar & 15; var->s = (ar >> 4) & 1; var->dpl = (ar >> 5) & 3; - var->present = (ar >> 7) & 1; + var->present = !var->unusable; var->avl = (ar >> 12) & 1; var->l = (ar >> 13) & 1; var->db = (ar >> 14) & 1; var->g = (ar >> 15) & 1; - var->unusable = (ar >> 16) & 1; } static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks
On Wed, Jun 26, 2013 at 02:15:26PM +0530, Raghavendra K T wrote: On 06/25/2013 08:20 PM, Andrew Theurer wrote: On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote: This series replaces the existing paravirtualized spinlock mechanism with a paravirtualized ticketlock mechanism. The series provides implementation for both Xen and KVM. Changes in V9: - Changed spin_threshold to 32k to avoid excess halt exits that are causing undercommit degradation (after PLE handler improvement). - Added kvm_irq_delivery_to_apic (suggested by Gleb) - Optimized halt exit path to use PLE handler V8 of PVspinlock was posted last year. After Avi's suggestions to look at PLE handler's improvements, various optimizations in PLE handling have been tried. Sorry for not posting this sooner. I have tested the v9 pv-ticketlock patches in 1x and 2x over-commit with 10-vcpu and 20-vcpu VMs. I have tested these patches with and without PLE, as PLE is still not scalable with large VMs. Hi Andrew, Thanks for testing. System: x3850X5, 40 cores, 80 threads 1x over-commit with 10-vCPU VMs (8 VMs) all running dbench: -- Total ConfigurationThroughput(MB/s)Notes 3.10-default-ple_on 22945 5% CPU in host kernel, 2% spin_lock in guests 3.10-default-ple_off 23184 5% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_on 22895 5% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_off23051 5% CPU in host kernel, 2% spin_lock in guests [all 1x results look good here] Yes. The 1x results look too close 2x over-commit with 10-vCPU VMs (16 VMs) all running dbench: --- Total ConfigurationThroughput Notes 3.10-default-ple_on 6287 55% CPU host kernel, 17% spin_lock in guests 3.10-default-ple_off 1849 2% CPU in host kernel, 95% spin_lock in guests 3.10-pvticket-ple_on 6691 50% CPU in host kernel, 15% spin_lock in guests 3.10-pvticket-ple_off16464 8% CPU in host kernel, 33% spin_lock in guests I see 6.426% improvement with ple_on and 161.87% improvement with ple_off. 
I think this is a very good sign for the patches [PLE hinders pv-ticket improvements, but even with PLE off, we still off from ideal throughput (somewhere 2)] Okay, The ideal throughput you are referring is getting around atleast 80% of 1x throughput for over-commit. Yes we are still far away from there. 1x over-commit with 20-vCPU VMs (4 VMs) all running dbench: -- Total ConfigurationThroughput Notes 3.10-default-ple_on 22736 6% CPU in host kernel, 3% spin_lock in guests 3.10-default-ple_off 23377 5% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_on 22471 6% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_off23445 5% CPU in host kernel, 3% spin_lock in guests [1x looking fine here] I see ple_off is little better here. 2x over-commit with 20-vCPU VMs (8 VMs) all running dbench: -- Total ConfigurationThroughput Notes 3.10-default-ple_on 1965 70% CPU in host kernel, 34% spin_lock in guests 3.10-default-ple_off 226 2% CPU in host kernel, 94% spin_lock in guests 3.10-pvticket-ple_on 1942 70% CPU in host kernel, 35% spin_lock in guests 3.10-pvticket-ple_off 8003 11% CPU in host kernel, 70% spin_lock in guests [quite bad all around, but pv-tickets with PLE off the best so far. Still quite a bit off from ideal throughput] This is again a remarkable improvement (307%). This motivates me to add a patch to disable ple when pvspinlock is on. probably we can add a hypercall that disables ple in kvm init patch. but only problem I see is what if the guests are mixed. (i.e one guest has pvspinlock support but other does not. Host supports pv) How about reintroducing the idea to create per-kvm ple_gap,ple_window state. We were headed
Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks
On Wed, Jun 26, 2013 at 01:37:45PM +0200, Andrew Jones wrote: On Wed, Jun 26, 2013 at 02:15:26PM +0530, Raghavendra K T wrote: On 06/25/2013 08:20 PM, Andrew Theurer wrote: On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote: This series replaces the existing paravirtualized spinlock mechanism with a paravirtualized ticketlock mechanism. The series provides implementation for both Xen and KVM. Changes in V9: - Changed spin_threshold to 32k to avoid excess halt exits that are causing undercommit degradation (after PLE handler improvement). - Added kvm_irq_delivery_to_apic (suggested by Gleb) - Optimized halt exit path to use PLE handler V8 of PVspinlock was posted last year. After Avi's suggestions to look at PLE handler's improvements, various optimizations in PLE handling have been tried. Sorry for not posting this sooner. I have tested the v9 pv-ticketlock patches in 1x and 2x over-commit with 10-vcpu and 20-vcpu VMs. I have tested these patches with and without PLE, as PLE is still not scalable with large VMs. Hi Andrew, Thanks for testing. System: x3850X5, 40 cores, 80 threads 1x over-commit with 10-vCPU VMs (8 VMs) all running dbench: -- Total Configuration Throughput(MB/s)Notes 3.10-default-ple_on22945 5% CPU in host kernel, 2% spin_lock in guests 3.10-default-ple_off 23184 5% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_on 22895 5% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_off 23051 5% CPU in host kernel, 2% spin_lock in guests [all 1x results look good here] Yes. 
The 1x results look too close 2x over-commit with 10-vCPU VMs (16 VMs) all running dbench: --- Total Configuration Throughput Notes 3.10-default-ple_on 6287 55% CPU host kernel, 17% spin_lock in guests 3.10-default-ple_off1849 2% CPU in host kernel, 95% spin_lock in guests 3.10-pvticket-ple_on6691 50% CPU in host kernel, 15% spin_lock in guests 3.10-pvticket-ple_off 16464 8% CPU in host kernel, 33% spin_lock in guests I see 6.426% improvement with ple_on and 161.87% improvement with ple_off. I think this is a very good sign for the patches [PLE hinders pv-ticket improvements, but even with PLE off, we still off from ideal throughput (somewhere 2)] Okay, The ideal throughput you are referring is getting around atleast 80% of 1x throughput for over-commit. Yes we are still far away from there. 1x over-commit with 20-vCPU VMs (4 VMs) all running dbench: -- Total Configuration Throughput Notes 3.10-default-ple_on22736 6% CPU in host kernel, 3% spin_lock in guests 3.10-default-ple_off 23377 5% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_on 22471 6% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_off 23445 5% CPU in host kernel, 3% spin_lock in guests [1x looking fine here] I see ple_off is little better here. 2x over-commit with 20-vCPU VMs (8 VMs) all running dbench: -- Total Configuration Throughput Notes 3.10-default-ple_on 1965 70% CPU in host kernel, 34% spin_lock in guests 3.10-default-ple_off 226 2% CPU in host kernel, 94% spin_lock in guests 3.10-pvticket-ple_on1942 70% CPU in host kernel, 35% spin_lock in guests 3.10-pvticket-ple_off 8003 11% CPU in host kernel, 70% spin_lock in guests [quite bad all around, but pv-tickets with PLE off the best so far. Still quite a bit off from ideal throughput] This is again a remarkable improvement (307%). This motivates me to add a patch to disable ple when pvspinlock is on. probably we can add a hypercall that disables ple in kvm init patch. but
Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks
On 06/26/2013 06:22 PM, Gleb Natapov wrote: On Wed, Jun 26, 2013 at 01:37:45PM +0200, Andrew Jones wrote: On Wed, Jun 26, 2013 at 02:15:26PM +0530, Raghavendra K T wrote: On 06/25/2013 08:20 PM, Andrew Theurer wrote: On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote: This series replaces the existing paravirtualized spinlock mechanism with a paravirtualized ticketlock mechanism. The series provides implementation for both Xen and KVM. Changes in V9: - Changed spin_threshold to 32k to avoid excess halt exits that are causing undercommit degradation (after PLE handler improvement). - Added kvm_irq_delivery_to_apic (suggested by Gleb) - Optimized halt exit path to use PLE handler V8 of PVspinlock was posted last year. After Avi's suggestions to look at PLE handler's improvements, various optimizations in PLE handling have been tried. Sorry for not posting this sooner. I have tested the v9 pv-ticketlock patches in 1x and 2x over-commit with 10-vcpu and 20-vcpu VMs. I have tested these patches with and without PLE, as PLE is still not scalable with large VMs. Hi Andrew, Thanks for testing. System: x3850X5, 40 cores, 80 threads 1x over-commit with 10-vCPU VMs (8 VMs) all running dbench: -- Total Configuration Throughput(MB/s)Notes 3.10-default-ple_on 22945 5% CPU in host kernel, 2% spin_lock in guests 3.10-default-ple_off23184 5% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_on22895 5% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_off 23051 5% CPU in host kernel, 2% spin_lock in guests [all 1x results look good here] Yes. 
The 1x results look too close 2x over-commit with 10-vCPU VMs (16 VMs) all running dbench: --- Total Configuration Throughput Notes 3.10-default-ple_on 6287 55% CPU host kernel, 17% spin_lock in guests 3.10-default-ple_off 1849 2% CPU in host kernel, 95% spin_lock in guests 3.10-pvticket-ple_on 6691 50% CPU in host kernel, 15% spin_lock in guests 3.10-pvticket-ple_off 16464 8% CPU in host kernel, 33% spin_lock in guests I see 6.426% improvement with ple_on and 161.87% improvement with ple_off. I think this is a very good sign for the patches [PLE hinders pv-ticket improvements, but even with PLE off, we still off from ideal throughput (somewhere 2)] Okay, The ideal throughput you are referring is getting around atleast 80% of 1x throughput for over-commit. Yes we are still far away from there. 1x over-commit with 20-vCPU VMs (4 VMs) all running dbench: -- Total Configuration Throughput Notes 3.10-default-ple_on 22736 6% CPU in host kernel, 3% spin_lock in guests 3.10-default-ple_off23377 5% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_on22471 6% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_off 23445 5% CPU in host kernel, 3% spin_lock in guests [1x looking fine here] I see ple_off is little better here. 2x over-commit with 20-vCPU VMs (8 VMs) all running dbench: -- Total Configuration Throughput Notes 3.10-default-ple_on 1965 70% CPU in host kernel, 34% spin_lock in guests 3.10-default-ple_off 226 2% CPU in host kernel, 94% spin_lock in guests 3.10-pvticket-ple_on 1942 70% CPU in host kernel, 35% spin_lock in guests 3.10-pvticket-ple_off8003 11% CPU in host kernel, 70% spin_lock in guests [quite bad all around, but pv-tickets with PLE off the best so far. Still quite a bit off from ideal throughput] This is again a remarkable improvement (307%). This motivates me to add a patch to disable ple when pvspinlock is on. probably we can add a hypercall that disables ple in kvm init patch. 
but only problem I see is what if the guests are mixed. (i.e one guest has pvspinlock support but other does not. Host supports pv) How about reintroducing the idea to create per-kvm ple_gap,ple_window
i/o threads
Hi, I noticed on my server running 3 VMs that there are 10-20 threads doing I/O. As the VMs are running on HDDs and not SSDs, I think that is counterproductive: won't these threads make the HDDs seek back and forth constantly? Folkert van Heusden -- Always wondered what the latency of your webserver is? Or how much more latency you get when you go through a proxy server/tor? The numbers tell the tale and with HTTPing you know them! http://www.vanheusden.com/httping/ --- Phone: +31-6-41278122, PGP-key: 1F28D8AE, www.vanheusden.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/4] kvm, emulator: Rename VendorSpecific flag
On Tue, Jun 25, 2013 at 02:10:20PM +0300, Gleb Natapov wrote: - if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) + if (!(ctxt->d & EmulateOnUD) && ctxt->only_vendor_specific_insn) Let's rename only_vendor_specific_insn to something like ->ud too. So this thing is set only when either svm or vmx encounter an #UD and go and emulate the instruction. I guess this is for the case where we actually do want to inject the #UD into the guest and not emulate the instruction. Btw, it is only checked in x86_decode_insn so we could just as well hand down the emulation_type from the caller x86_emulate_instruction and kill ->only_vendor_specific_insn completely like so: if (!(ctxt->d & EmulateOnUD) && (emul_type & EMULTYPE_TRAP_UD)) -- Regards/Gruss, Boris. Sent from a fat crate under my desk. Formatting is fine. -- -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks
On Wed, Jun 26, 2013 at 03:52:40PM +0300, Gleb Natapov wrote: On Wed, Jun 26, 2013 at 01:37:45PM +0200, Andrew Jones wrote: On Wed, Jun 26, 2013 at 02:15:26PM +0530, Raghavendra K T wrote: On 06/25/2013 08:20 PM, Andrew Theurer wrote: On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote: This series replaces the existing paravirtualized spinlock mechanism with a paravirtualized ticketlock mechanism. The series provides implementation for both Xen and KVM. Changes in V9: - Changed spin_threshold to 32k to avoid excess halt exits that are causing undercommit degradation (after PLE handler improvement). - Added kvm_irq_delivery_to_apic (suggested by Gleb) - Optimized halt exit path to use PLE handler V8 of PVspinlock was posted last year. After Avi's suggestions to look at PLE handler's improvements, various optimizations in PLE handling have been tried. Sorry for not posting this sooner. I have tested the v9 pv-ticketlock patches in 1x and 2x over-commit with 10-vcpu and 20-vcpu VMs. I have tested these patches with and without PLE, as PLE is still not scalable with large VMs. Hi Andrew, Thanks for testing. System: x3850X5, 40 cores, 80 threads 1x over-commit with 10-vCPU VMs (8 VMs) all running dbench: -- Total ConfigurationThroughput(MB/s)Notes 3.10-default-ple_on 22945 5% CPU in host kernel, 2% spin_lock in guests 3.10-default-ple_off 23184 5% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_on 22895 5% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_off23051 5% CPU in host kernel, 2% spin_lock in guests [all 1x results look good here] Yes. 
The 1x results look too close 2x over-commit with 10-vCPU VMs (16 VMs) all running dbench: --- Total ConfigurationThroughput Notes 3.10-default-ple_on 6287 55% CPU host kernel, 17% spin_lock in guests 3.10-default-ple_off 1849 2% CPU in host kernel, 95% spin_lock in guests 3.10-pvticket-ple_on 6691 50% CPU in host kernel, 15% spin_lock in guests 3.10-pvticket-ple_off16464 8% CPU in host kernel, 33% spin_lock in guests I see 6.426% improvement with ple_on and 161.87% improvement with ple_off. I think this is a very good sign for the patches [PLE hinders pv-ticket improvements, but even with PLE off, we still off from ideal throughput (somewhere 2)] Okay, The ideal throughput you are referring is getting around atleast 80% of 1x throughput for over-commit. Yes we are still far away from there. 1x over-commit with 20-vCPU VMs (4 VMs) all running dbench: -- Total ConfigurationThroughput Notes 3.10-default-ple_on 22736 6% CPU in host kernel, 3% spin_lock in guests 3.10-default-ple_off 23377 5% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_on 22471 6% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_off23445 5% CPU in host kernel, 3% spin_lock in guests [1x looking fine here] I see ple_off is little better here. 2x over-commit with 20-vCPU VMs (8 VMs) all running dbench: -- Total ConfigurationThroughput Notes 3.10-default-ple_on 1965 70% CPU in host kernel, 34% spin_lock in guests 3.10-default-ple_off 226 2% CPU in host kernel, 94% spin_lock in guests 3.10-pvticket-ple_on 1942 70% CPU in host kernel, 35% spin_lock in guests 3.10-pvticket-ple_off 8003 11% CPU in host kernel, 70% spin_lock in guests [quite bad all around, but pv-tickets with PLE off the best so far. Still quite a bit off from ideal throughput] This is again a remarkable improvement (307%). This motivates me to
Re: [PATCH 3/4] kvm, emulator: Rename VendorSpecific flag
On Wed, Jun 26, 2013 at 04:11:59PM +0200, Borislav Petkov wrote: On Tue, Jun 25, 2013 at 02:10:20PM +0300, Gleb Natapov wrote: - if (!(ctxt-d VendorSpecific) ctxt-only_vendor_specific_insn) + if (!(ctxt-d EmulateOnUD) ctxt-only_vendor_specific_insn) Lets rename only_vendor_specific_insn to something like -ud too. So this thing is set only when either svm or vmx encounter an #UD and go and emulate the instruction. I guess this is for the case where we actually do want to inject the #UD into the guest and not emulate the instruction. Btw, it is only checked in x86_decode_insn so we could just as well hand down the emulation_type from the caller x86_emulate_instruction and kill -only_vendor_specific_insn completely like so: if (!(ctxt-d EmulateOnUD) (emul_type EMULTYPE_TRAP_UD)) EMULTYPE_ values are external to emulator.c and control how x86.c invokes the emulator. I prefer not to change kvm-emulator interface just to get rid of one ctxt field. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks
On 06/26/2013 08:09 PM, Chegu Vinod wrote: On 6/26/2013 6:40 AM, Raghavendra K T wrote: On 06/26/2013 06:22 PM, Gleb Natapov wrote: On Wed, Jun 26, 2013 at 01:37:45PM +0200, Andrew Jones wrote: On Wed, Jun 26, 2013 at 02:15:26PM +0530, Raghavendra K T wrote: On 06/25/2013 08:20 PM, Andrew Theurer wrote: On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote: This series replaces the existing paravirtualized spinlock mechanism with a paravirtualized ticketlock mechanism. The series provides implementation for both Xen and KVM. Changes in V9: - Changed spin_threshold to 32k to avoid excess halt exits that are causing undercommit degradation (after PLE handler improvement). - Added kvm_irq_delivery_to_apic (suggested by Gleb) - Optimized halt exit path to use PLE handler V8 of PVspinlock was posted last year. After Avi's suggestions to look at PLE handler's improvements, various optimizations in PLE handling have been tried. Sorry for not posting this sooner. I have tested the v9 pv-ticketlock patches in 1x and 2x over-commit with 10-vcpu and 20-vcpu VMs. I have tested these patches with and without PLE, as PLE is still not scalable with large VMs. Hi Andrew, Thanks for testing. System: x3850X5, 40 cores, 80 threads 1x over-commit with 10-vCPU VMs (8 VMs) all running dbench: -- Total ConfigurationThroughput(MB/s)Notes 3.10-default-ple_on229455% CPU in host kernel, 2% spin_lock in guests 3.10-default-ple_off231845% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_on228955% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_off230515% CPU in host kernel, 2% spin_lock in guests [all 1x results look good here] Yes. 
The 1x results look too close 2x over-commit with 10-vCPU VMs (16 VMs) all running dbench: --- Total ConfigurationThroughputNotes 3.10-default-ple_on 628755% CPU host kernel, 17% spin_lock in guests 3.10-default-ple_off 18492% CPU in host kernel, 95% spin_lock in guests 3.10-pvticket-ple_on 669150% CPU in host kernel, 15% spin_lock in guests 3.10-pvticket-ple_off164648% CPU in host kernel, 33% spin_lock in guests I see 6.426% improvement with ple_on and 161.87% improvement with ple_off. I think this is a very good sign for the patches [PLE hinders pv-ticket improvements, but even with PLE off, we still off from ideal throughput (somewhere 2)] Okay, The ideal throughput you are referring is getting around atleast 80% of 1x throughput for over-commit. Yes we are still far away from there. 1x over-commit with 20-vCPU VMs (4 VMs) all running dbench: -- Total ConfigurationThroughputNotes 3.10-default-ple_on227366% CPU in host kernel, 3% spin_lock in guests 3.10-default-ple_off233775% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_on224716% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_off234455% CPU in host kernel, 3% spin_lock in guests [1x looking fine here] I see ple_off is little better here. 2x over-commit with 20-vCPU VMs (8 VMs) all running dbench: -- Total ConfigurationThroughputNotes 3.10-default-ple_on 196570% CPU in host kernel, 34% spin_lock in guests 3.10-default-ple_off 2262% CPU in host kernel, 94% spin_lock in guests 3.10-pvticket-ple_on 194270% CPU in host kernel, 35% spin_lock in guests 3.10-pvticket-ple_off 800311% CPU in host kernel, 70% spin_lock in guests [quite bad all around, but pv-tickets with PLE off the best so far. Still quite a bit off from ideal throughput] This is again a remarkable improvement (307%). This motivates me to add a patch to disable ple when pvspinlock is on. probably we can add a hypercall that disables ple in kvm init patch. but only problem I see is what if the guests are mixed. 
(i.e. one guest has pvspinlock support but the other does not, while the host supports pv.) How about reintroducing the idea to create per-kvm ple_gap, ple_window state. We were headed down that road when considering a dynamic window at one point. Then you can just set a single guest's ple_gap to zero, which would lead to PLE being disabled for that guest. We could also revisit the dynamic window then. Can be done, but let's understand why ple on is such a big problem. Is it possible that ple gap
Re: [PATCH RFC] pci: ACS quirk for AMD southbridge
Bjorn Helgaas wrote: [fix Joerg's email address] On Tue, Jun 25, 2013 at 10:15 PM, Bjorn Helgaas bhelg...@google.com wrote: On Wed, Jul 11, 2012 at 11:18 PM, Alex Williamson alex.william...@redhat.com wrote: We've confirmed that peer-to-peer between these devices is not possible. We can therefore claim that they support a subset of ACS. Signed-off-by: Alex Williamson alex.william...@redhat.com Cc: Joerg Roedel joerg.roe...@amd.com --- Two things about this patch make me a little nervous. The first is that I'd really like to have a pci_is_pcie() test in pci_mf_no_p2p_acs_enabled(), but these devices don't have a PCIe capability. That means that if there was a topology where these devices sit on a legacy PCI bus, we incorrectly return that we're ACS safe here. That leads to my second problem, pciids seems to suggest that some of these functions have been around for a while. Is it just this package that's peer-to-peer safe, or is it safe to assume that any previous assembly of these functions is also p2p safe. Maybe we need to factor in device revs if that uniquely identifies this package? 
Looks like another useful device to potentially quirk would be: 00:15.0 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB700/SB800/SB900 PCI to PCI bridge (PCIE port 0) 00:15.1 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB700/SB800/SB900 PCI to PCI bridge (PCIE port 1) 00:15.2 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB900 PCI to PCI bridge (PCIE port 2) 00:15.3 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB900 PCI to PCI bridge (PCIE port 3) 00:15.0 0604: 1002:43a0 00:15.1 0604: 1002:43a1 00:15.2 0604: 1002:43a2 00:15.3 0604: 1002:43a3 drivers/pci/quirks.c | 29 + 1 file changed, 29 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 4ebc865..2c84961 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3271,11 +3271,40 @@ struct pci_dev *pci_get_dma_source(struct pci_dev *dev) return pci_dev_get(dev); } +/* + * Multifunction devices that do not support peer-to-peer between + * functions can claim to support a subset of ACS. Such devices + * effectively enable request redirect (RR) and completion redirect (CR) + * since all transactions are redirected to the upstream root complex. + */ +static int pci_mf_no_p2p_acs_enabled(struct pci_dev *dev, u16 acs_flags) +{ + if (!dev-multifunction) + return -ENODEV; + + /* Filter out flags not applicable to multifunction */ + acs_flags = (PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC | PCI_ACS_DT); + + return acs_flags ~(PCI_ACS_RR | PCI_ACS_CR) ? 0 : 1; +} + static const struct pci_dev_acs_enabled { u16 vendor; u16 device; int (*acs_enabled)(struct pci_dev *dev, u16 acs_flags); } pci_dev_acs_enabled[] = { + /* +* AMD/ATI multifunction southbridge devices. AMD has confirmed +* that peer-to-peer between these devices is not possible, so +* they do support a subset of ACS even though the capability is +* not exposed in config space. 
+*/ + { PCI_VENDOR_ID_ATI, 0x4385, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x439c, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x4383, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x439d, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x4384, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x4399, pci_mf_no_p2p_acs_enabled }, { 0 } }; I was looking for something else and found this old email. This patch hasn't been applied and I haven't seen any discussion about it. Is it still of interest? It seems relevant to the current ACS discussion [1]. It is absolutely relevant. I always have to patch my kernel to get it working to put my pci device to VM. Meanwhile I'm doing it for kernel 3.9. I would be very glad to get these patches to the kernel as they don't do anything bad! My multifunction devices are the devices defined in the patch. My current pci device passed through is a intel ethernet device: -[:00]-+-00.0 Advanced Micro Devices [AMD] nee ATI RD890 PCI to PCI bridge (external gfx0 port B) +-00.2 Advanced Micro Devices [AMD] nee ATI RD990 I/O Memory Management Unit (IOMMU) +-02.0-[01]--+-00.0 Advanced Micro Devices [AMD] nee ATI Turks [Radeon HD 6570] |\-00.1 Advanced Micro Devices [AMD] nee ATI Turks HDMI Audio [Radeon HD 6000 Series] +-04.0-[02]00.0 Etron Technology, Inc. EJ168 USB 3.0 Host Controller +-05.0-[03]00.0 Atheros Communications Inc. AR9300 Wireless LAN adaptor +-09.0-[04]00.0 Realtek Semiconductor Co., Ltd. RTL8111/8168B PCI Express Gigabit Ethernet controller +-0a.0-[05]00.0 Etron Technology, Inc. EJ168 USB 3.0 Host Controller
Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks
On Wed, 2013-06-26 at 15:52 +0300, Gleb Natapov wrote: On Wed, Jun 26, 2013 at 01:37:45PM +0200, Andrew Jones wrote: On Wed, Jun 26, 2013 at 02:15:26PM +0530, Raghavendra K T wrote: On 06/25/2013 08:20 PM, Andrew Theurer wrote: On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote: This series replaces the existing paravirtualized spinlock mechanism with a paravirtualized ticketlock mechanism. The series provides implementation for both Xen and KVM. Changes in V9: - Changed spin_threshold to 32k to avoid excess halt exits that are causing undercommit degradation (after PLE handler improvement). - Added kvm_irq_delivery_to_apic (suggested by Gleb) - Optimized halt exit path to use PLE handler V8 of PVspinlock was posted last year. After Avi's suggestions to look at PLE handler's improvements, various optimizations in PLE handling have been tried. Sorry for not posting this sooner. I have tested the v9 pv-ticketlock patches in 1x and 2x over-commit with 10-vcpu and 20-vcpu VMs. I have tested these patches with and without PLE, as PLE is still not scalable with large VMs. Hi Andrew, Thanks for testing. System: x3850X5, 40 cores, 80 threads 1x over-commit with 10-vCPU VMs (8 VMs) all running dbench: -- Total ConfigurationThroughput(MB/s)Notes 3.10-default-ple_on 22945 5% CPU in host kernel, 2% spin_lock in guests 3.10-default-ple_off 23184 5% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_on 22895 5% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_off23051 5% CPU in host kernel, 2% spin_lock in guests [all 1x results look good here] Yes. 
The 1x results look too close 2x over-commit with 10-vCPU VMs (16 VMs) all running dbench: --- Total ConfigurationThroughput Notes 3.10-default-ple_on 6287 55% CPU host kernel, 17% spin_lock in guests 3.10-default-ple_off 1849 2% CPU in host kernel, 95% spin_lock in guests 3.10-pvticket-ple_on 6691 50% CPU in host kernel, 15% spin_lock in guests 3.10-pvticket-ple_off16464 8% CPU in host kernel, 33% spin_lock in guests I see 6.426% improvement with ple_on and 161.87% improvement with ple_off. I think this is a very good sign for the patches [PLE hinders pv-ticket improvements, but even with PLE off, we still off from ideal throughput (somewhere 2)] Okay, The ideal throughput you are referring is getting around atleast 80% of 1x throughput for over-commit. Yes we are still far away from there. 1x over-commit with 20-vCPU VMs (4 VMs) all running dbench: -- Total ConfigurationThroughput Notes 3.10-default-ple_on 22736 6% CPU in host kernel, 3% spin_lock in guests 3.10-default-ple_off 23377 5% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_on 22471 6% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_off23445 5% CPU in host kernel, 3% spin_lock in guests [1x looking fine here] I see ple_off is little better here. 2x over-commit with 20-vCPU VMs (8 VMs) all running dbench: -- Total ConfigurationThroughput Notes 3.10-default-ple_on 1965 70% CPU in host kernel, 34% spin_lock in guests 3.10-default-ple_off 226 2% CPU in host kernel, 94% spin_lock in guests 3.10-pvticket-ple_on 1942 70% CPU in host kernel, 35% spin_lock in guests 3.10-pvticket-ple_off 8003 11% CPU in host kernel, 70% spin_lock in guests [quite bad all around, but pv-tickets with PLE off the best so far. Still quite a bit off from ideal throughput] This is again a remarkable improvement (307%). This motivates me to add a
Re: [PATCH RFC] pci: ACS quirk for AMD southbridge
On Wed, 2013-06-26 at 17:14 +0200, Andreas Hartmann wrote: Bjorn Helgaas wrote: [fix Joerg's email address] On Tue, Jun 25, 2013 at 10:15 PM, Bjorn Helgaas bhelg...@google.com wrote: On Wed, Jul 11, 2012 at 11:18 PM, Alex Williamson alex.william...@redhat.com wrote: We've confirmed that peer-to-peer between these devices is not possible. We can therefore claim that they support a subset of ACS. Signed-off-by: Alex Williamson alex.william...@redhat.com Cc: Joerg Roedel joerg.roe...@amd.com --- Two things about this patch make me a little nervous. The first is that I'd really like to have a pci_is_pcie() test in pci_mf_no_p2p_acs_enabled(), but these devices don't have a PCIe capability. That means that if there was a topology where these devices sit on a legacy PCI bus, we incorrectly return that we're ACS safe here. That leads to my second problem, pciids seems to suggest that some of these functions have been around for a while. Is it just this package that's peer-to-peer safe, or is it safe to assume that any previous assembly of these functions is also p2p safe. Maybe we need to factor in device revs if that uniquely identifies this package? 
Looks like another useful device to potentially quirk would be: 00:15.0 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB700/SB800/SB900 PCI to PCI bridge (PCIE port 0) 00:15.1 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB700/SB800/SB900 PCI to PCI bridge (PCIE port 1) 00:15.2 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB900 PCI to PCI bridge (PCIE port 2) 00:15.3 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB900 PCI to PCI bridge (PCIE port 3) 00:15.0 0604: 1002:43a0 00:15.1 0604: 1002:43a1 00:15.2 0604: 1002:43a2 00:15.3 0604: 1002:43a3 drivers/pci/quirks.c | 29 + 1 file changed, 29 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 4ebc865..2c84961 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3271,11 +3271,40 @@ struct pci_dev *pci_get_dma_source(struct pci_dev *dev) return pci_dev_get(dev); } +/* + * Multifunction devices that do not support peer-to-peer between + * functions can claim to support a subset of ACS. Such devices + * effectively enable request redirect (RR) and completion redirect (CR) + * since all transactions are redirected to the upstream root complex. + */ +static int pci_mf_no_p2p_acs_enabled(struct pci_dev *dev, u16 acs_flags) +{ + if (!dev-multifunction) + return -ENODEV; + + /* Filter out flags not applicable to multifunction */ + acs_flags = (PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC | PCI_ACS_DT); + + return acs_flags ~(PCI_ACS_RR | PCI_ACS_CR) ? 0 : 1; +} + static const struct pci_dev_acs_enabled { u16 vendor; u16 device; int (*acs_enabled)(struct pci_dev *dev, u16 acs_flags); } pci_dev_acs_enabled[] = { + /* +* AMD/ATI multifunction southbridge devices. AMD has confirmed +* that peer-to-peer between these devices is not possible, so +* they do support a subset of ACS even though the capability is +* not exposed in config space. 
+*/ + { PCI_VENDOR_ID_ATI, 0x4385, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x439c, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x4383, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x439d, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x4384, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x4399, pci_mf_no_p2p_acs_enabled }, { 0 } }; I was looking for something else and found this old email. This patch hasn't been applied and I haven't seen any discussion about it. Is it still of interest? It seems relevant to the current ACS discussion [1]. It is absolutely relevant. I always have to patch my kernel to get it working to put my pci device to VM. Meanwhile I'm doing it for kernel 3.9. I would be very glad to get these patches to the kernel as they don't do anything bad! I'd still like to see this get in too. IIRC, where we left off was that Joerg had confirmed with the hardware folks that there is no peer-to-peer between these devices, but we still had questions about whether that was true for any instance of these vendor/device IDs. These devices are re-used in several packages and I'm not sure if we need to somehow figure out what package (ie. which chipset generation) we're looking at to know if p2p is used. Thanks, Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks
On Wed, Jun 26, 2013 at 07:10:21PM +0530, Raghavendra K T wrote: On 06/26/2013 06:22 PM, Gleb Natapov wrote: On Wed, Jun 26, 2013 at 01:37:45PM +0200, Andrew Jones wrote: On Wed, Jun 26, 2013 at 02:15:26PM +0530, Raghavendra K T wrote: On 06/25/2013 08:20 PM, Andrew Theurer wrote: On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote: This series replaces the existing paravirtualized spinlock mechanism with a paravirtualized ticketlock mechanism. The series provides implementation for both Xen and KVM. Changes in V9: - Changed spin_threshold to 32k to avoid excess halt exits that are causing undercommit degradation (after PLE handler improvement). - Added kvm_irq_delivery_to_apic (suggested by Gleb) - Optimized halt exit path to use PLE handler V8 of PVspinlock was posted last year. After Avi's suggestions to look at PLE handler's improvements, various optimizations in PLE handling have been tried. Sorry for not posting this sooner. I have tested the v9 pv-ticketlock patches in 1x and 2x over-commit with 10-vcpu and 20-vcpu VMs. I have tested these patches with and without PLE, as PLE is still not scalable with large VMs. Hi Andrew, Thanks for testing. System: x3850X5, 40 cores, 80 threads 1x over-commit with 10-vCPU VMs (8 VMs) all running dbench: -- Total Configuration Throughput(MB/s)Notes 3.10-default-ple_on 22945 5% CPU in host kernel, 2% spin_lock in guests 3.10-default-ple_off 23184 5% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_on 22895 5% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_off 23051 5% CPU in host kernel, 2% spin_lock in guests [all 1x results look good here] Yes. 
The 1x results look too close 2x over-commit with 10-vCPU VMs (16 VMs) all running dbench: --- Total Configuration Throughput Notes 3.10-default-ple_on6287 55% CPU host kernel, 17% spin_lock in guests 3.10-default-ple_off 1849 2% CPU in host kernel, 95% spin_lock in guests 3.10-pvticket-ple_on 6691 50% CPU in host kernel, 15% spin_lock in guests 3.10-pvticket-ple_off 16464 8% CPU in host kernel, 33% spin_lock in guests I see 6.426% improvement with ple_on and 161.87% improvement with ple_off. I think this is a very good sign for the patches [PLE hinders pv-ticket improvements, but even with PLE off, we still off from ideal throughput (somewhere 2)] Okay, The ideal throughput you are referring is getting around atleast 80% of 1x throughput for over-commit. Yes we are still far away from there. 1x over-commit with 20-vCPU VMs (4 VMs) all running dbench: -- Total Configuration Throughput Notes 3.10-default-ple_on 22736 6% CPU in host kernel, 3% spin_lock in guests 3.10-default-ple_off 23377 5% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_on 22471 6% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_off 23445 5% CPU in host kernel, 3% spin_lock in guests [1x looking fine here] I see ple_off is little better here. 2x over-commit with 20-vCPU VMs (8 VMs) all running dbench: -- Total Configuration Throughput Notes 3.10-default-ple_on1965 70% CPU in host kernel, 34% spin_lock in guests 3.10-default-ple_off226 2% CPU in host kernel, 94% spin_lock in guests 3.10-pvticket-ple_on 1942 70% CPU in host kernel, 35% spin_lock in guests 3.10-pvticket-ple_off 8003 11% CPU in host kernel, 70% spin_lock in guests [quite bad all around, but pv-tickets with PLE off the best so far. Still quite a bit off from ideal throughput] This is again a remarkable improvement (307%). This motivates me to add a patch to disable ple when pvspinlock is on. probably we can add a hypercall that disables ple in kvm init patch. but only problem I see is what
Re: [PATCH RFC] pci: ACS quirk for AMD southbridge
Alex Williamson wrote: On Wed, 2013-06-26 at 17:14 +0200, Andreas Hartmann wrote: Bjorn Helgaas wrote: [fix Joerg's email address] On Tue, Jun 25, 2013 at 10:15 PM, Bjorn Helgaas bhelg...@google.com wrote: On Wed, Jul 11, 2012 at 11:18 PM, Alex Williamson alex.william...@redhat.com wrote: We've confirmed that peer-to-peer between these devices is not possible. We can therefore claim that they support a subset of ACS. Signed-off-by: Alex Williamson alex.william...@redhat.com Cc: Joerg Roedel joerg.roe...@amd.com --- Two things about this patch make me a little nervous. The first is that I'd really like to have a pci_is_pcie() test in pci_mf_no_p2p_acs_enabled(), but these devices don't have a PCIe capability. That means that if there was a topology where these devices sit on a legacy PCI bus, we incorrectly return that we're ACS safe here. That leads to my second problem, pciids seems to suggest that some of these functions have been around for a while. Is it just this package that's peer-to-peer safe, or is it safe to assume that any previous assembly of these functions is also p2p safe. Maybe we need to factor in device revs if that uniquely identifies this package? 
Looks like another useful device to potentially quirk would be: 00:15.0 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB700/SB800/SB900 PCI to PCI bridge (PCIE port 0) 00:15.1 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB700/SB800/SB900 PCI to PCI bridge (PCIE port 1) 00:15.2 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB900 PCI to PCI bridge (PCIE port 2) 00:15.3 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB900 PCI to PCI bridge (PCIE port 3) 00:15.0 0604: 1002:43a0 00:15.1 0604: 1002:43a1 00:15.2 0604: 1002:43a2 00:15.3 0604: 1002:43a3 drivers/pci/quirks.c | 29 + 1 file changed, 29 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 4ebc865..2c84961 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3271,11 +3271,40 @@ struct pci_dev *pci_get_dma_source(struct pci_dev *dev) return pci_dev_get(dev); } +/* + * Multifunction devices that do not support peer-to-peer between + * functions can claim to support a subset of ACS. Such devices + * effectively enable request redirect (RR) and completion redirect (CR) + * since all transactions are redirected to the upstream root complex. + */ +static int pci_mf_no_p2p_acs_enabled(struct pci_dev *dev, u16 acs_flags) +{ + if (!dev-multifunction) + return -ENODEV; + + /* Filter out flags not applicable to multifunction */ + acs_flags = (PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC | PCI_ACS_DT); + + return acs_flags ~(PCI_ACS_RR | PCI_ACS_CR) ? 0 : 1; +} + static const struct pci_dev_acs_enabled { u16 vendor; u16 device; int (*acs_enabled)(struct pci_dev *dev, u16 acs_flags); } pci_dev_acs_enabled[] = { + /* +* AMD/ATI multifunction southbridge devices. AMD has confirmed +* that peer-to-peer between these devices is not possible, so +* they do support a subset of ACS even though the capability is +* not exposed in config space. 
+*/ + { PCI_VENDOR_ID_ATI, 0x4385, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x439c, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x4383, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x439d, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x4384, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x4399, pci_mf_no_p2p_acs_enabled }, { 0 } }; I was looking for something else and found this old email. This patch hasn't been applied and I haven't seen any discussion about it. Is it still of interest? It seems relevant to the current ACS discussion [1]. It is absolutely relevant. I always have to patch my kernel to get it working to put my pci device to VM. Meanwhile I'm doing it for kernel 3.9. I would be very glad to get these patches to the kernel as they don't do anything bad! I'd still like to see this get in too. IIRC, where we left off was that Joerg had confirmed with the hardware folks that there is no peer-to-peer between these devices, but we still had questions about whether that was true for any instance of these vendor/device IDs. These devices are re-used in several packages and I'm not sure if we need to somehow figure out what package (ie. which chipset generation) we're looking at to know if p2p is used. Does this statement cover your question? http://article.gmane.org/gmane.comp.emulators.kvm.devel/99402 Andreas -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] pci: ACS quirk for AMD southbridge
On Wed, 2013-06-26 at 18:24 +0200, Andreas Hartmann wrote: Alex Williamson wrote: On Wed, 2013-06-26 at 17:14 +0200, Andreas Hartmann wrote: Bjorn Helgaas wrote: [fix Joerg's email address] On Tue, Jun 25, 2013 at 10:15 PM, Bjorn Helgaas bhelg...@google.com wrote: On Wed, Jul 11, 2012 at 11:18 PM, Alex Williamson alex.william...@redhat.com wrote: We've confirmed that peer-to-peer between these devices is not possible. We can therefore claim that they support a subset of ACS. Signed-off-by: Alex Williamson alex.william...@redhat.com Cc: Joerg Roedel joerg.roe...@amd.com --- Two things about this patch make me a little nervous. The first is that I'd really like to have a pci_is_pcie() test in pci_mf_no_p2p_acs_enabled(), but these devices don't have a PCIe capability. That means that if there was a topology where these devices sit on a legacy PCI bus, we incorrectly return that we're ACS safe here. That leads to my second problem, pciids seems to suggest that some of these functions have been around for a while. Is it just this package that's peer-to-peer safe, or is it safe to assume that any previous assembly of these functions is also p2p safe. Maybe we need to factor in device revs if that uniquely identifies this package? 
Looks like another useful device to potentially quirk would be: 00:15.0 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB700/SB800/SB900 PCI to PCI bridge (PCIE port 0) 00:15.1 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB700/SB800/SB900 PCI to PCI bridge (PCIE port 1) 00:15.2 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB900 PCI to PCI bridge (PCIE port 2) 00:15.3 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB900 PCI to PCI bridge (PCIE port 3) 00:15.0 0604: 1002:43a0 00:15.1 0604: 1002:43a1 00:15.2 0604: 1002:43a2 00:15.3 0604: 1002:43a3 drivers/pci/quirks.c | 29 + 1 file changed, 29 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 4ebc865..2c84961 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3271,11 +3271,40 @@ struct pci_dev *pci_get_dma_source(struct pci_dev *dev) return pci_dev_get(dev); } +/* + * Multifunction devices that do not support peer-to-peer between + * functions can claim to support a subset of ACS. Such devices + * effectively enable request redirect (RR) and completion redirect (CR) + * since all transactions are redirected to the upstream root complex. + */ +static int pci_mf_no_p2p_acs_enabled(struct pci_dev *dev, u16 acs_flags) +{ + if (!dev-multifunction) + return -ENODEV; + + /* Filter out flags not applicable to multifunction */ + acs_flags = (PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC | PCI_ACS_DT); + + return acs_flags ~(PCI_ACS_RR | PCI_ACS_CR) ? 0 : 1; +} + static const struct pci_dev_acs_enabled { u16 vendor; u16 device; int (*acs_enabled)(struct pci_dev *dev, u16 acs_flags); } pci_dev_acs_enabled[] = { + /* +* AMD/ATI multifunction southbridge devices. AMD has confirmed +* that peer-to-peer between these devices is not possible, so +* they do support a subset of ACS even though the capability is +* not exposed in config space. 
+*/ + { PCI_VENDOR_ID_ATI, 0x4385, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x439c, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x4383, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x439d, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x4384, pci_mf_no_p2p_acs_enabled }, + { PCI_VENDOR_ID_ATI, 0x4399, pci_mf_no_p2p_acs_enabled }, { 0 } }; I was looking for something else and found this old email. This patch hasn't been applied and I haven't seen any discussion about it. Is it still of interest? It seems relevant to the current ACS discussion [1]. It is absolutely relevant. I always have to patch my kernel to get it working to put my pci device to VM. Meanwhile I'm doing it for kernel 3.9. I would be very glad to get these patches to the kernel as they don't do anything bad! I'd still like to see this get in too. IIRC, where we left off was that Joerg had confirmed with the hardware folks that there is no peer-to-peer between these devices, but we still had questions about whether that was true for any instance of these vendor/device IDs. These devices are re-used in several packages and I'm not sure if we need to somehow figure out what package (ie. which chipset generation) we're looking at to know if p2p is used. Does this statement cover your question? http://article.gmane.org/gmane.comp.emulators.kvm.devel/99402 Yeah, perhaps it does. I initially disregarded it because it's easy to
Re: [nVMX w/ Haswell] KVM unit-tests in L1 - eventinj test fails trying to send NMI
On 2013-06-26 10:03, Kashyap Chamarthy wrote: Thanks for the note, it's very helpful! This test actually fails on older CPUs as well, and I can finally reproduce the issue that Jay also reported. I'm not able to cure it by going back to 3b656cf764^, Ok, you tried w/o this commit.. commit 3b656cf764cbc43d3efb9bf5f45c618d4cf0989f Author: Jan Kiszka jan.kis...@siemens.com Date: Sun Apr 14 12:12:45 2013 +0200 KVM: nVMX: Fix injection of PENDING_INTERRUPT and NMI_WINDOW exits to L1 Check if the interrupt or NMI window exit is for L1 by testing if it has the corresponding controls enabled. This is required when we allow direct injection from L0 to L2 I first tried by reverting to the commit before this one, just like Jay reported for https://bugzilla.kernel.org/show_bug.cgi?id=58941. But this just varied the error (kvm reports an internal error), didn't solve the issue. Now I simply reverted the commit on top of next, but without an effect. Looks like those problems are not directly related. Kashyap, you can do us a favor and try to find out if there was a commit in the recent history (roughly before I started to hack on nVMX this year) where these test cases succeeded. TIA, Jan signature.asc Description: OpenPGP digital signature
[PATCH qom-cpu v3 01/14] kvm: Free current_cpu identifier
Since CPU loops are done as last step in kvm_{insert,remove}_breakpoint() and kvm_remove_all_breakpoints(), we do not need to distinguish between invoking CPU and iterated CPUs and can thereby free the identifier for use as a global variable. Acked-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: Andreas Färber afaer...@suse.de --- include/sysemu/kvm.h | 10 +- kvm-all.c| 39 +-- kvm-stub.c | 6 +++--- 3 files changed, 25 insertions(+), 30 deletions(-) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index fe8bc40..c88aee9 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -159,11 +159,11 @@ void *kvm_arch_ram_alloc(ram_addr_t size); void kvm_setup_guest_memory(void *start, size_t size); void kvm_flush_coalesced_mmio_buffer(void); -int kvm_insert_breakpoint(CPUArchState *current_env, target_ulong addr, +int kvm_insert_breakpoint(CPUArchState *env, target_ulong addr, target_ulong len, int type); -int kvm_remove_breakpoint(CPUArchState *current_env, target_ulong addr, +int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr, target_ulong len, int type); -void kvm_remove_all_breakpoints(CPUArchState *current_env); +void kvm_remove_all_breakpoints(CPUArchState *env); int kvm_update_guest_debug(CPUArchState *env, unsigned long reinject_trap); #ifndef _WIN32 int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset); @@ -241,9 +241,9 @@ struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, int kvm_sw_breakpoints_active(CPUState *cpu); -int kvm_arch_insert_sw_breakpoint(CPUState *current_cpu, +int kvm_arch_insert_sw_breakpoint(CPUState *cpu, struct kvm_sw_breakpoint *bp); -int kvm_arch_remove_sw_breakpoint(CPUState *current_cpu, +int kvm_arch_remove_sw_breakpoint(CPUState *cpu, struct kvm_sw_breakpoint *bp); int kvm_arch_insert_hw_breakpoint(target_ulong addr, target_ulong len, int type); diff --git a/kvm-all.c b/kvm-all.c index 7a1684e..d074597 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1896,16 +1896,15 @@ int 
kvm_update_guest_debug(CPUArchState *env, unsigned long reinject_trap) return data.err; } -int kvm_insert_breakpoint(CPUArchState *current_env, target_ulong addr, +int kvm_insert_breakpoint(CPUArchState *env, target_ulong addr, target_ulong len, int type) { -CPUState *current_cpu = ENV_GET_CPU(current_env); +CPUState *cpu = ENV_GET_CPU(env); struct kvm_sw_breakpoint *bp; -CPUArchState *env; int err; if (type == GDB_BREAKPOINT_SW) { -bp = kvm_find_sw_breakpoint(current_cpu, addr); +bp = kvm_find_sw_breakpoint(cpu, addr); if (bp) { bp-use_count++; return 0; @@ -1918,14 +1917,13 @@ int kvm_insert_breakpoint(CPUArchState *current_env, target_ulong addr, bp-pc = addr; bp-use_count = 1; -err = kvm_arch_insert_sw_breakpoint(current_cpu, bp); +err = kvm_arch_insert_sw_breakpoint(cpu, bp); if (err) { g_free(bp); return err; } -QTAILQ_INSERT_HEAD(current_cpu-kvm_state-kvm_sw_breakpoints, - bp, entry); +QTAILQ_INSERT_HEAD(cpu-kvm_state-kvm_sw_breakpoints, bp, entry); } else { err = kvm_arch_insert_hw_breakpoint(addr, len, type); if (err) { @@ -1942,16 +1940,15 @@ int kvm_insert_breakpoint(CPUArchState *current_env, target_ulong addr, return 0; } -int kvm_remove_breakpoint(CPUArchState *current_env, target_ulong addr, +int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr, target_ulong len, int type) { -CPUState *current_cpu = ENV_GET_CPU(current_env); +CPUState *cpu = ENV_GET_CPU(env); struct kvm_sw_breakpoint *bp; -CPUArchState *env; int err; if (type == GDB_BREAKPOINT_SW) { -bp = kvm_find_sw_breakpoint(current_cpu, addr); +bp = kvm_find_sw_breakpoint(cpu, addr); if (!bp) { return -ENOENT; } @@ -1961,12 +1958,12 @@ int kvm_remove_breakpoint(CPUArchState *current_env, target_ulong addr, return 0; } -err = kvm_arch_remove_sw_breakpoint(current_cpu, bp); +err = kvm_arch_remove_sw_breakpoint(cpu, bp); if (err) { return err; } -QTAILQ_REMOVE(current_cpu-kvm_state-kvm_sw_breakpoints, bp, entry); +QTAILQ_REMOVE(cpu-kvm_state-kvm_sw_breakpoints, bp, entry); g_free(bp); } 
else { err = kvm_arch_remove_hw_breakpoint(addr, len, type); @@ -1984,16 +1981,14 @@ int kvm_remove_breakpoint(CPUArchState *current_env, target_ulong addr, return 0; } -void kvm_remove_all_breakpoints(CPUArchState *current_env) +void kvm_remove_all_breakpoints(CPUArchState
[PATCH qom-cpu v3 12/14] target-s390x: Don't overuse ENV_GET_CPU()
Commit 3474b679486caa8f6448bae974e131370f360c13 (Utilize selective runtime reg sync for hot code paths) introduced two uses of ENV_GET_CPU() inside target-s390x/ KVM code. In one case we can use a direct CPU() cast instead. Cc: Jason J. Herne jjhe...@us.ibm.com Signed-off-by: Andreas Färber afaer...@suse.de --- target-s390x/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c index b524c35..4660074 100644 --- a/target-s390x/kvm.c +++ b/target-s390x/kvm.c @@ -469,7 +469,7 @@ static int kvm_handle_css_inst(S390CPU *cpu, struct kvm_run *run, int r = 0; int no_cc = 0; CPUS390XState *env = cpu-env; -CPUState *cs = ENV_GET_CPU(env); +CPUState *cs = CPU(cpu); if (ipa0 != 0xb2) { /* Not handled for now. */ -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH qom-cpu v3 13/14] target-s390x: Change handle_{hypercall,diag}() argument to S390CPU
This allows to get rid of the last remaining ENV_GET_CPU() in target-s390x/ by using CPU() cast directly on the argument. Cc: Jason J. Herne jjhe...@us.ibm.com Signed-off-by: Andreas Färber afaer...@suse.de --- target-s390x/kvm.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c index 4660074..33ca7a7 100644 --- a/target-s390x/kvm.c +++ b/target-s390x/kvm.c @@ -607,9 +607,10 @@ static int handle_priv(S390CPU *cpu, struct kvm_run *run, return r; } -static int handle_hypercall(CPUS390XState *env, struct kvm_run *run) +static int handle_hypercall(S390CPU *cpu, struct kvm_run *run) { -CPUState *cs = ENV_GET_CPU(env); +CPUState *cs = CPU(cpu); +CPUS390XState *env = cpu-env; kvm_s390_get_registers_partial(cs); cs-kvm_vcpu_dirty = true; @@ -618,13 +619,13 @@ static int handle_hypercall(CPUS390XState *env, struct kvm_run *run) return 0; } -static int handle_diag(CPUS390XState *env, struct kvm_run *run, int ipb_code) +static int handle_diag(S390CPU *cpu, struct kvm_run *run, int ipb_code) { int r = 0; switch (ipb_code) { case DIAG_KVM_HYPERCALL: -r = handle_hypercall(env, run); +r = handle_hypercall(cpu, run); break; case DIAG_KVM_BREAKPOINT: sleep(10); @@ -735,7 +736,6 @@ out: static int handle_instruction(S390CPU *cpu, struct kvm_run *run) { -CPUS390XState *env = cpu-env; unsigned int ipa0 = (run-s390_sieic.ipa 0xff00); uint8_t ipa1 = run-s390_sieic.ipa 0x00ff; int ipb_code = (run-s390_sieic.ipb 0x0fff) 16; @@ -749,7 +749,7 @@ static int handle_instruction(S390CPU *cpu, struct kvm_run *run) r = handle_priv(cpu, run, ipa0 8, ipa1); break; case IPA0_DIAG: -r = handle_diag(env, run, ipb_code); +r = handle_diag(cpu, run, ipb_code); break; case IPA0_SIGP: r = handle_sigp(cpu, run, ipa1); -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH qom-cpu v3 03/14] kvm: Change kvm_remove_all_breakpoints() argument to CPUState
Acked-by: Paolo Bonzini pbonz...@redhat.com Reviewed-by: Richard Henderson r...@twiddle.net Signed-off-by: Andreas Färber afaer...@suse.de --- gdbstub.c| 2 +- include/sysemu/kvm.h | 2 +- kvm-all.c| 6 +++--- kvm-stub.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/gdbstub.c b/gdbstub.c index 3101a43..9e7f7a1 100644 --- a/gdbstub.c +++ b/gdbstub.c @@ -2019,7 +2019,7 @@ static void gdb_breakpoint_remove_all(void) CPUArchState *env; if (kvm_enabled()) { -kvm_remove_all_breakpoints(gdbserver_state-c_cpu); +kvm_remove_all_breakpoints(ENV_GET_CPU(gdbserver_state-c_cpu)); return; } diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index c88aee9..9460d5a 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -163,7 +163,7 @@ int kvm_insert_breakpoint(CPUArchState *env, target_ulong addr, target_ulong len, int type); int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr, target_ulong len, int type); -void kvm_remove_all_breakpoints(CPUArchState *env); +void kvm_remove_all_breakpoints(CPUState *cpu); int kvm_update_guest_debug(CPUArchState *env, unsigned long reinject_trap); #ifndef _WIN32 int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset); diff --git a/kvm-all.c b/kvm-all.c index d074597..ee0ee02 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1981,11 +1981,11 @@ int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr, return 0; } -void kvm_remove_all_breakpoints(CPUArchState *env) +void kvm_remove_all_breakpoints(CPUState *cpu) { -CPUState *cpu = ENV_GET_CPU(env); struct kvm_sw_breakpoint *bp, *next; KVMState *s = cpu-kvm_state; +CPUArchState *env; QTAILQ_FOREACH_SAFE(bp, s-kvm_sw_breakpoints, entry, next) { if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) { @@ -2026,7 +2026,7 @@ int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr, return -EINVAL; } -void kvm_remove_all_breakpoints(CPUArchState *env) +void kvm_remove_all_breakpoints(CPUState *cpu) { } #endif /* !KVM_CAP_SET_GUEST_DEBUG */ diff --git 
a/kvm-stub.c b/kvm-stub.c index 76da61e..a6c2b01 100644 --- a/kvm-stub.c +++ b/kvm-stub.c @@ -95,7 +95,7 @@ int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr, return -EINVAL; } -void kvm_remove_all_breakpoints(CPUArchState *env) +void kvm_remove_all_breakpoints(CPUState *cpu) { } -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks
On 06/26/2013 09:41 PM, Gleb Natapov wrote: On Wed, Jun 26, 2013 at 07:10:21PM +0530, Raghavendra K T wrote: On 06/26/2013 06:22 PM, Gleb Natapov wrote: On Wed, Jun 26, 2013 at 01:37:45PM +0200, Andrew Jones wrote: On Wed, Jun 26, 2013 at 02:15:26PM +0530, Raghavendra K T wrote: On 06/25/2013 08:20 PM, Andrew Theurer wrote: On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote: This series replaces the existing paravirtualized spinlock mechanism with a paravirtualized ticketlock mechanism. The series provides implementation for both Xen and KVM. Changes in V9: - Changed spin_threshold to 32k to avoid excess halt exits that are causing undercommit degradation (after PLE handler improvement). - Added kvm_irq_delivery_to_apic (suggested by Gleb) - Optimized halt exit path to use PLE handler V8 of PVspinlock was posted last year. After Avi's suggestions to look at PLE handler's improvements, various optimizations in PLE handling have been tried. Sorry for not posting this sooner. I have tested the v9 pv-ticketlock patches in 1x and 2x over-commit with 10-vcpu and 20-vcpu VMs. I have tested these patches with and without PLE, as PLE is still not scalable with large VMs. Hi Andrew, Thanks for testing. System: x3850X5, 40 cores, 80 threads 1x over-commit with 10-vCPU VMs (8 VMs) all running dbench: -- Total Configuration Throughput(MB/s)Notes 3.10-default-ple_on 22945 5% CPU in host kernel, 2% spin_lock in guests 3.10-default-ple_off23184 5% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_on22895 5% CPU in host kernel, 2% spin_lock in guests 3.10-pvticket-ple_off 23051 5% CPU in host kernel, 2% spin_lock in guests [all 1x results look good here] Yes. 
The 1x results look too close 2x over-commit with 10-vCPU VMs (16 VMs) all running dbench: --- Total Configuration Throughput Notes 3.10-default-ple_on 6287 55% CPU host kernel, 17% spin_lock in guests 3.10-default-ple_off 1849 2% CPU in host kernel, 95% spin_lock in guests 3.10-pvticket-ple_on 6691 50% CPU in host kernel, 15% spin_lock in guests 3.10-pvticket-ple_off 16464 8% CPU in host kernel, 33% spin_lock in guests I see 6.426% improvement with ple_on and 161.87% improvement with ple_off. I think this is a very good sign for the patches [PLE hinders pv-ticket improvements, but even with PLE off, we still off from ideal throughput (somewhere 2)] Okay, The ideal throughput you are referring is getting around atleast 80% of 1x throughput for over-commit. Yes we are still far away from there. 1x over-commit with 20-vCPU VMs (4 VMs) all running dbench: -- Total Configuration Throughput Notes 3.10-default-ple_on 22736 6% CPU in host kernel, 3% spin_lock in guests 3.10-default-ple_off23377 5% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_on22471 6% CPU in host kernel, 3% spin_lock in guests 3.10-pvticket-ple_off 23445 5% CPU in host kernel, 3% spin_lock in guests [1x looking fine here] I see ple_off is little better here. 2x over-commit with 20-vCPU VMs (8 VMs) all running dbench: -- Total Configuration Throughput Notes 3.10-default-ple_on 1965 70% CPU in host kernel, 34% spin_lock in guests 3.10-default-ple_off 226 2% CPU in host kernel, 94% spin_lock in guests 3.10-pvticket-ple_on 1942 70% CPU in host kernel, 35% spin_lock in guests 3.10-pvticket-ple_off8003 11% CPU in host kernel, 70% spin_lock in guests [quite bad all around, but pv-tickets with PLE off the best so far. Still quite a bit off from ideal throughput] This is again a remarkable improvement (307%). This motivates me to add a patch to disable ple when pvspinlock is on. probably we can add a hypercall that disables ple in kvm init patch. 
but only problem I see is what if the guests are mixed. (i.e one guest has pvspinlock support
[PATCH-next v2] kvm: don't try to take mmu_lock while holding the main raw kvm_lock
In commit e935b8372cf8 (KVM: Convert kvm_lock to raw_spinlock), the kvm_lock was made a raw lock. However, the kvm mmu_shrink() function tries to grab the (non-raw) mmu_lock within the scope of the raw locked kvm_lock being held. This leads to the following: BUG: sleeping function called from invalid context at kernel/rtmutex.c:659 in_atomic(): 1, irqs_disabled(): 0, pid: 55, name: kswapd0 Preemption disabled at:[a0376eac] mmu_shrink+0x5c/0x1b0 [kvm] Pid: 55, comm: kswapd0 Not tainted 3.4.34_preempt-rt Call Trace: [8106f2ad] __might_sleep+0xfd/0x160 [817d8d64] rt_spin_lock+0x24/0x50 [a0376f3c] mmu_shrink+0xec/0x1b0 [kvm] [8111455d] shrink_slab+0x17d/0x3a0 [81151f00] ? mem_cgroup_iter+0x130/0x260 [8111824a] balance_pgdat+0x54a/0x730 [8111fe47] ? set_pgdat_percpu_threshold+0xa7/0xd0 [811185bf] kswapd+0x18f/0x490 [81070961] ? get_parent_ip+0x11/0x50 [81061970] ? __init_waitqueue_head+0x50/0x50 [81118430] ? balance_pgdat+0x730/0x730 [81060d2b] kthread+0xdb/0xe0 [8106e122] ? finish_task_switch+0x52/0x100 [817e1e94] kernel_thread_helper+0x4/0x10 [81060c50] ? __init_kthread_worker+0x Note that the above was seen on an earlier 3.4 preempt-rt, for where the lock distinction (raw vs. non-raw) actually matters. Since we only use the lock for protecting the vm_list, once we've found the instance we want, we can shuffle it to the end of the list and then drop the kvm_lock before taking the mmu_lock. We can do this because after the mmu operations are completed, we break -- i.e. we don't continue list processing, so it doesn't matter if the list changed around us. Since the shrinker code runs asynchronously with respect to KVM, we do need to still protect against the users_count going to zero and then kvm_destroy_vm() being called, so we use kvm_get_kvm/kvm_put_kvm, as suggested by Paolo. 
Cc: Paolo Bonzini pbonz...@redhat.com Cc: Gleb Natapov g...@redhat.com Signed-off-by: Paul Gortmaker paul.gortma...@windriver.com --- [v2: add the kvm_get_kvm, update comments and log appropriately] diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 748e0d8..662b679 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4322,6 +4322,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; int nr_to_scan = sc-nr_to_scan; + int found = 0; unsigned long freed = 0; raw_spin_lock(kvm_lock); @@ -4349,6 +4350,18 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) continue; idx = srcu_read_lock(kvm-srcu); + + list_move_tail(kvm-vm_list, vm_list); + found = 1; + /* +* We are done with the list, so drop kvm_lock, as we can't be +* holding a raw lock and take the non-raw mmu_lock. But we +* don't want to be unprotected from kvm_destroy_vm either, +* so we bump users_count. +*/ + kvm_get_kvm(kvm); + raw_spin_unlock(kvm_lock); + spin_lock(kvm-mmu_lock); if (kvm_has_zapped_obsolete_pages(kvm)) { @@ -4363,6 +4376,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) unlock: spin_unlock(kvm-mmu_lock); + kvm_put_kvm(kvm); srcu_read_unlock(kvm-srcu, idx); /* @@ -4370,11 +4384,12 @@ unlock: * per-vm shrinkers cry out * sadness comes quickly */ - list_move_tail(kvm-vm_list, vm_list); break; } - raw_spin_unlock(kvm_lock); + if (!found) + raw_spin_unlock(kvm_lock); + return freed; } -- 1.8.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/3] KVM: small type cleanups
Hi, this small series contains a few type and style cleanups. It has no impact on the generated code but removes a few small nits from the code. Please apply! Thanks, Mathias Krause (3): KVM: VMX: Use proper types to access const arrays KVM: VMX: Use size_t to store sizeof() values KVM: x86: Drop useless cast arch/x86/kvm/vmx.c | 19 +-- arch/x86/kvm/x86.c |2 +- 2 files changed, 10 insertions(+), 11 deletions(-) -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] KVM: VMX: Use size_t to store sizeof() values
The type for storing values of the sizeof operator should be size_t. No semantical changes, only type correctness. Signed-off-by: Mathias Krause mini...@googlemail.com --- arch/x86/kvm/vmx.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7393164..cd9090f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3909,7 +3909,7 @@ static void free_vpid(struct vcpu_vmx *vmx) static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type) { - int f = sizeof(unsigned long); + const size_t f = sizeof(unsigned long); if (!cpu_has_vmx_msr_bitmap()) return; @@ -3944,7 +3944,7 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type) { - int f = sizeof(unsigned long); + const size_t f = sizeof(unsigned long); if (!cpu_has_vmx_msr_bitmap()) return; -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/3] KVM: x86: Drop useless cast
Void pointers don't need no casting, drop it. Signed-off-by: Mathias Krause mini...@googlemail.com --- arch/x86/kvm/x86.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e8ba99c..472350c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5300,7 +5300,7 @@ static struct notifier_block pvclock_gtod_notifier = { int kvm_arch_init(void *opaque) { int r; - struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; + struct kvm_x86_ops *ops = opaque; if (kvm_x86_ops) { printk(KERN_ERR kvm: already loaded the other module\n); -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] KVM: VMX: Use proper types to access const arrays
Use a const pointer type instead of casting away the const qualifier from const arrays. Keep the pointer array on the stack, nonetheless. Making it static just increases the object size. Signed-off-by: Mathias Krause mini...@googlemail.com --- arch/x86/kvm/vmx.c | 15 +++ 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 260a919..7393164 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5956,8 +5956,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) unsigned long field; u64 field_value; struct vmcs *shadow_vmcs = vmx-nested.current_shadow_vmcs; - unsigned long *fields = (unsigned long *)shadow_read_write_fields; - int num_fields = max_shadow_read_write_fields; + const unsigned long *fields = shadow_read_write_fields; + const int num_fields = max_shadow_read_write_fields; vmcs_load(shadow_vmcs); @@ -5986,12 +5986,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) { - unsigned long *fields[] = { - (unsigned long *)shadow_read_write_fields, - (unsigned long *)shadow_read_only_fields + const unsigned long *fields[] = { + shadow_read_write_fields, + shadow_read_only_fields }; - int num_lists = ARRAY_SIZE(fields); - int max_fields[] = { + const int max_fields[] = { max_shadow_read_write_fields, max_shadow_read_only_fields }; @@ -6002,7 +6001,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) vmcs_load(shadow_vmcs); - for (q = 0; q num_lists; q++) { + for (q = 0; q ARRAY_SIZE(fields); q++) { for (i = 0; i max_fields[q]; i++) { field = fields[q][i]; vmcs12_read_any(vmx-vcpu, field, field_value); -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Rebasing kvm-arm-next
Hi all, I messed up my workflow earlier on, so I had to rebase kvm-arm-next onto kvm/next. I will do everything in my power to avoid this in the future. Sorry for any trouble. -Christoffer -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[GIT PULL] KVM/ARM queue for 3.11
Hi Gleb and Paolo, The following changes since commit 87d41fb4da6467622b7a87fd6afe8071abab6dae: KVM: s390: Fixed priority of execution in STSI (2013-06-20 23:33:01 +0200) are available in the git repository at: git://git.linaro.org/people/cdall/linux-kvm-arm.git tags/kvm-arm-3.11 for you to fetch changes up to 8bd4ffd6b3a98f00267051dc095076ea2ff06ea8: ARM: kvm: don't include drivers/virtio/Kconfig (2013-06-26 10:50:06 -0700) Thanks, -Christoffer Anup Patel (1): ARM: KVM: Allow host virt timer irq to be different from guest timer virt irq Arnd Bergmann (1): ARM: kvm: don't include drivers/virtio/Kconfig Christoffer Dall (1): Update MAINTAINERS: KVM/ARM work now funded by Linaro Dave P Martin (1): ARM: KVM: Don't handle PSCI calls via SMC Geoff Levand (1): arm/kvm: Cleanup KVM_ARM_MAX_VCPUS logic Marc Zyngier (7): ARM: KVM: remove dead prototype for __kvm_tlb_flush_vmid ARM: KVM: use phys_addr_t instead of unsigned long long for HYP PGDs ARM: KVM: don't special case PC when doing an MMIO ARM: KVM: get rid of S2_PGD_SIZE ARM: KVM: perform save/restore of PAR ARM: KVM: add missing dsb before invalidating Stage-2 TLBs ARM: KVM: clear exclusive monitor on all exception returns MAINTAINERS| 4 ++-- arch/arm/include/asm/kvm_arm.h | 1 - arch/arm/include/asm/kvm_asm.h | 24 arch/arm/include/asm/kvm_emulate.h | 5 - arch/arm/include/asm/kvm_host.h| 9 +++-- arch/arm/kvm/Kconfig | 8 +++- arch/arm/kvm/arm.c | 8 arch/arm/kvm/coproc.c | 4 arch/arm/kvm/handle_exit.c | 3 --- arch/arm/kvm/interrupts.S | 16 +++- arch/arm/kvm/interrupts_head.S | 10 -- arch/arm/kvm/mmio.c| 6 -- arch/arm/kvm/mmu.c | 3 --- arch/arm/kvm/psci.c| 2 +- arch/arm/kvm/reset.c | 12 include/kvm/arm_arch_timer.h | 4 virt/kvm/arm/arch_timer.c | 29 - 17 files changed, 92 insertions(+), 56 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] pci: Enable overrides for missing ACS capabilities
On Mon, 2013-06-24 at 11:43 -0600, Bjorn Helgaas wrote: On Wed, Jun 19, 2013 at 6:43 AM, Don Dutile ddut...@redhat.com wrote: On 06/18/2013 10:52 PM, Bjorn Helgaas wrote: On Tue, Jun 18, 2013 at 5:03 PM, Don Dutileddut...@redhat.com wrote: On 06/18/2013 06:22 PM, Alex Williamson wrote: On Tue, 2013-06-18 at 15:31 -0600, Bjorn Helgaas wrote: On Tue, Jun 18, 2013 at 12:20 PM, Alex Williamson alex.william...@redhat.com wrote: On Tue, 2013-06-18 at 11:28 -0600, Bjorn Helgaas wrote: On Thu, May 30, 2013 at 12:40:19PM -0600, Alex Williamson wrote: ... Who do you expect to decide whether to use this option? I think it requires intimate knowledge of how the device works. I think the benefit of using the option is that it makes assignment of devices to guests more flexible, which will make it attractive to users. But most users have no way of knowing whether it's actually *safe* to use this. So I worry that you're adding an easy way to pretend isolation exists when there's no good way of being confident that it actually does. ... I wonder if we should taint the kernel if this option is used (but not for specific devices added to pci_dev_acs_enabled[]). It would also be nice if pci_dev_specific_acs_enabled() gave some indication in dmesg for the specific devices you're hoping to add to pci_dev_acs_enabled[]. It's not an enumeration-time quirk right now, so I'm not sure how we'd limit it to one message per device. Right, setup vs use and getting single prints is a lot of extra code. Tainting is troublesome for support, Don had some objections when I suggested the same to him. For RH GSS (Global Support Services), a 'taint' in the kernel printk means RH doesn't support that system. The 'non-support' due to 'taint' being printed out in this case may be incorrect -- RH may support that use, at least until a more sufficient patched kernel is provided. Thus my dissension that 'taint' be output. WARN is ok. 'driver beware', 'unleashed dog afoot' sure... So ... 
that's really a RH-specific support issue, and easily worked around by RH adding a patch that turns off tainting. sure. what's another patch to the thousands... :-/ It still sounds like a good idea to me for upstream, where use of this option can very possibly lead to corruption or information leakage between devices the user claimed were isolated, but in fact were not. Did I miss something? This patch provides a user-level/chosen override; like all other overrides, (pci=realloc, etc.), it can lead to a failing system. IMO, this patch is no different. If you want to tag this patch with taint, then let's audit all the (PCI) overrides and taint them appropriately. Taint should be reserved to changes to the kernel that were done outside the development of the kernel, or with the explicit intent to circumvent the normal operation of the kernel. This patch provides a way to enable ACS checking to succeed when the devices have not provided sufficiently complete ACS information. i.e., it's a growth path for PCIe-ACS and its need for proper support. We're telling the kernel to assume something (the hardware provides protection) that may not be true. If that assumption turns out to be false, the result is that a VM can be crashed or comprised by another VM. One difference I see is that this override can lead to a crash that looks like random memory corruption and has no apparent connection to the actual cause. Most other overrides won't cause run-time crashes (I think they're more likely to cause boot or device configuration failures), and the dmesg log will probably have good clues as to the reason. But the possibility of compromise is probably even more serious, because there would be no crash at all, and we'd have no indication that VM A read or corrupted data in VM B. I'm very concerned about that, enough so that it's not clear to me that an override belongs in the upstream kernel at all. Yes, that would mean some hardware is not suitable for device assignment. 
That just sounds like if hardware manufacturers do their homework and support ACS properly, their hardware is more useful for virtualization than other hardware. I don't see the problem with that. That's easy to say for someone that doesn't get caught trying to explain this to users over and over. In many cases devices don't do peer-to-peer and missing ACS is an oversight. I imagine that quite a few vendors also see the ACS capability as a means to allow control of ACS and therefore see it as a much larger investment that just providing an empty ACS structure in config space to indicate the lack of peer-to-peer. Even if we taint the kernel when this is enabled and add extremely verbose warnings in kernel-parameters.txt, I think there's value to providing an on-the-spot workaround to users. In many
Re: [PATCH-next v2] kvm: don't try to take mmu_lock while holding the main raw kvm_lock
Il 26/06/2013 20:11, Paul Gortmaker ha scritto: spin_unlock(kvm-mmu_lock); + kvm_put_kvm(kvm); srcu_read_unlock(kvm-srcu, idx); kvm_put_kvm needs to go last. I can fix when applying, but I'll wait for Gleb to take a look too. Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH] uio: uio_pci_generic: Add support for MSI interrupts
Enable support for MSI interrupts if the device supports it. Since MSI interrupts are edge triggered, it is no longer necessary to disable interrupts in the kernel and re-enable them from user-space. Instead, clearing the interrupt condition in the user space application automatically re-enables the interrupt. Signed-off-by: Guenter Roeck li...@roeck-us.net --- An open question is if we can just do this unconditionally or if there should be some flag to enable it. A module parameter, maybe ? Documentation/DocBook/uio-howto.tmpl | 23 --- drivers/uio/uio_pci_generic.c| 15 --- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/Documentation/DocBook/uio-howto.tmpl b/Documentation/DocBook/uio-howto.tmpl index 9561815..69b54e0 100644 --- a/Documentation/DocBook/uio-howto.tmpl +++ b/Documentation/DocBook/uio-howto.tmpl @@ -46,6 +46,12 @@ GPL version 2. revhistory revision + revnumber0.10/revnumber + date2013-06-26/date + authorinitialsgr/authorinitials + revremarkAdded MSI support to uio_pci_generic./revremark + /revision + revision revnumber0.9/revnumber date2009-07-16/date authorinitialsmst/authorinitials @@ -935,15 +941,26 @@ and look in the output for failure reasons sect1 id=uio_pci_generic_internals titleThings to know about uio_pci_generic/title para -Interrupts are handled using the Interrupt Disable bit in the PCI command +Interrupts are handled either as MSI interrupts (if the device supports it) or +as legacy INTx interrupts. + /para + para +uio_pci_generic automatically configures a device to use MSI interrupts +if the device supports it. If an MSI interrupt is received, the user space +driver is notified. Since MSI interrupts are edge sensitive, the user space +driver needs to clear the interrupt condition in the device before blocking +and waiting for more interrupts. + /para + para +Legacy interrupts are handled using the Interrupt Disable bit in the PCI command register and Interrupt Status bit in the PCI status register. 
All devices compliant to PCI 2.3 (circa 2002) and all compliant PCI Express devices should support these bits. uio_pci_generic detects this support, and won't bind to devices which do not support the Interrupt Disable Bit in the command register. /para para -On each interrupt, uio_pci_generic sets the Interrupt Disable bit. -This prevents the device from generating further interrupts +If legacy interrupts are used, uio_pci_generic sets the Interrupt Disable bit on +each interrupt. This prevents the device from generating further interrupts until the bit is cleared. The userspace driver should clear this bit before blocking and waiting for more interrupts. /para diff --git a/drivers/uio/uio_pci_generic.c b/drivers/uio/uio_pci_generic.c index 14aa10c..3366fdb 100644 --- a/drivers/uio/uio_pci_generic.c +++ b/drivers/uio/uio_pci_generic.c @@ -32,6 +32,7 @@ struct uio_pci_generic_dev { struct uio_info info; struct pci_dev *pdev; + bool have_msi; }; static inline struct uio_pci_generic_dev * @@ -46,7 +47,7 @@ static irqreturn_t irqhandler(int irq, struct uio_info *info) { struct uio_pci_generic_dev *gdev = to_uio_pci_generic_dev(info); - if (!pci_check_and_mask_intx(gdev-pdev)) + if (!gdev-have_msi !pci_check_and_mask_intx(gdev-pdev)) return IRQ_NONE; /* UIO core will signal the user process. */ @@ -58,6 +59,7 @@ static int probe(struct pci_dev *pdev, { struct uio_pci_generic_dev *gdev; int err; + bool have_msi = false; err = pci_enable_device(pdev); if (err) { @@ -73,7 +75,9 @@ static int probe(struct pci_dev *pdev, return -ENODEV; } - if (!pci_intx_mask_supported(pdev)) { + if (!pci_enable_msi(pdev)) { + have_msi = true; + } else if (!pci_intx_mask_supported(pdev)) { err = -ENODEV; goto err_verify; } @@ -84,10 +88,11 @@ static int probe(struct pci_dev *pdev, goto err_alloc; } + gdev-have_msi = have_msi; gdev-info.name = uio_pci_generic; gdev-info.version = DRIVER_VERSION; gdev-info.irq = pdev-irq; - gdev-info.irq_flags = IRQF_SHARED; + gdev-info.irq_flags = have_msi ? 
0 : IRQF_SHARED; gdev-info.handler = irqhandler; gdev-pdev = pdev; @@ -99,6 +104,8 @@ static int probe(struct pci_dev *pdev, err_register: kfree(gdev); err_alloc: + if (have_msi) + pci_disable_msi(pdev); err_verify: pci_disable_device(pdev); return err; @@ -109,6 +116,8 @@ static void remove(struct pci_dev *pdev) struct uio_pci_generic_dev *gdev = pci_get_drvdata(pdev); uio_unregister_device(gdev-info); + if (gdev-have_msi) +
Migration route from Parallels on Mac for Windows images?
Sorry for the user query but I'm not finding expertise on the Linux mailing lists I belong to. The web site says one-off user questions are OK. I have a few VM images on Parallels 8 for Mac. I want them to be on KVM/Linux. Some of the images are Linux, but the critical ones are a few types of Windows. I don't want to trash my licenses. I noticed that kvm-img has a parallels format option, and it seems to work while the conversion is going on. I've tried kvm-img to convert to qcow2 and to raw, both cases the image converts but the disk is not bootable. The only file the kvm-img doesn't immediately fail on is the one that contains the data. The best answer to my problem is to find out how to make the disk bootable. The next best answer is to find out if there is a reliable migration path, even if it means going to VMware first. Also, if VMware is a necessary intermediate point, it would help to know which VMware format to use for best results. I'm not a KVM expert, I've made some VMs on LVM and installed Linux on them with bridged networking, that's about the extent of it. For the record that was insanely simple. Thanks. -- Ken Roberts k...@9ci.com ken.roberts163 @ skype 605-222-5758 @ cell -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH-next v2] kvm: don't try to take mmu_lock while holding the main raw kvm_lock
[Re: [PATCH-next v2] kvm: don't try to take mmu_lock while holding the main raw kvm_lock] On 26/06/2013 (Wed 23:59) Paolo Bonzini wrote: Il 26/06/2013 20:11, Paul Gortmaker ha scritto: spin_unlock(kvm-mmu_lock); + kvm_put_kvm(kvm); srcu_read_unlock(kvm-srcu, idx); kvm_put_kvm needs to go last. I can fix when applying, but I'll wait for Gleb to take a look too. I'm curious why you would say that -- since the way I sent it has the lock tear down be symmetrical and opposite to the build up - e.g. idx = srcu_read_lock(kvm-srcu); [...] + kvm_get_kvm(kvm); [...] spin_lock(kvm-mmu_lock); [...] unlock: spin_unlock(kvm-mmu_lock); + kvm_put_kvm(kvm); srcu_read_unlock(kvm-srcu, idx); You'd originally said to put the kvm_get_kvm where it currently is; perhaps instead we want the get/put to encompass the whole srcu_read locked section? P. -- Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/6 v5] powerpc: export debug registers save function for KVM
Hi, On Wed, 26 Jun 2013 11:12:23 +0530 Bharat Bhushan r65...@freescale.com wrote: diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index 200d763..50b357f 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -30,6 +30,10 @@ extern void enable_kernel_spe(void); extern void giveup_spe(struct task_struct *); extern void load_up_spe(struct task_struct *); +#ifdef CONFIG_PPC_ADV_DEBUG_REGS +extern void switch_booke_debug_regs(struct thread_struct *new_thread); +#endif We usually don't bother guarding function declarations. -- Cheers, Stephen Rothwells...@canb.auug.org.au pgp_yJYfcoXUd.pgp Description: PGP signature
[PATCH 0/8 v4] KVM: PPC: IOMMU in-kernel handling
The changes are: 1. rebased on v3.10-rc7 2. removed spinlocks from real mode 3. added security checks between KVM and VFIO More details in the individual patch comments. Alexey Kardashevskiy (8): KVM: PPC: reserve a capability number for multitce support KVM: PPC: reserve a capability and ioctl numbers for realmode VFIO vfio: add external user support hashtable: add hash_for_each_possible_rcu_notrace() powerpc: Prepare to support kernel handling of IOMMU map/unmap KVM: PPC: Add support for multiple-TCE hcalls KVM: PPC: Add support for IOMMU in-kernel handling KVM: PPC: Add hugepage support for IOMMU in-kernel handling Documentation/virtual/kvm/api.txt| 51 +++ arch/powerpc/include/asm/kvm_host.h | 31 ++ arch/powerpc/include/asm/kvm_ppc.h | 18 +- arch/powerpc/include/asm/pgtable-ppc64.h |4 + arch/powerpc/include/uapi/asm/kvm.h |8 + arch/powerpc/kvm/book3s_64_vio.c | 506 +- arch/powerpc/kvm/book3s_64_vio_hv.c | 439 -- arch/powerpc/kvm/book3s_hv.c | 41 ++- arch/powerpc/kvm/book3s_hv_rmhandlers.S |6 + arch/powerpc/kvm/book3s_pr_papr.c| 37 ++- arch/powerpc/kvm/powerpc.c | 15 + arch/powerpc/mm/init_64.c| 78 - drivers/vfio/vfio.c | 53 include/linux/hashtable.h| 15 + include/linux/page-flags.h |4 +- include/uapi/linux/kvm.h |3 + 16 files changed, 1279 insertions(+), 30 deletions(-) -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/8] powerpc: Prepare to support kernel handling of IOMMU map/unmap
The current VFIO-on-POWER implementation supports only user mode driven mapping, i.e. QEMU is sending requests to map/unmap pages. However this approach is really slow, so we want to move that to KVM. Since H_PUT_TCE can be extremely performance sensitive (especially with network adapters where each packet needs to be mapped/unmapped) we chose to implement that as a fast hypercall directly in real mode (processor still in the guest context but MMU off). To be able to do that, we need to provide some facilities to access the struct page count within that real mode environment as things like the sparsemem vmemmap mappings aren't accessible. This adds an API to increment/decrement page counter as get_user_pages API used for user mode mapping does not work in the real mode. CONFIG_SPARSEMEM_VMEMMAP and CONFIG_FLATMEM are supported. Reviewed-by: Paul Mackerras pau...@samba.org Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Paul Mackerras pau...@samba.org Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: 2013/06/27: * realmode_get_page() fixed to use get_page_unless_zero(). If failed, the call will be passed from real to virtual mode and safely handled. * added comment to PageCompound() in include/linux/page-flags.h. 
2013/05/20: * PageTail() is replaced by PageCompound() in order to have the same checks for whether the page is huge in realmode_get_page() and realmode_put_page() --- arch/powerpc/include/asm/pgtable-ppc64.h |4 ++ arch/powerpc/mm/init_64.c| 78 +- include/linux/page-flags.h |4 +- 3 files changed, 84 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index e3d55f6f..7b46e5f 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -376,6 +376,10 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, } #endif /* !CONFIG_HUGETLB_PAGE */ +struct page *realmode_pfn_to_page(unsigned long pfn); +int realmode_get_page(struct page *page); +int realmode_put_page(struct page *page); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a90b9c4..7031be3 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -297,5 +297,81 @@ void vmemmap_free(unsigned long start, unsigned long end) { } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ +/* + * We do not have access to the sparsemem vmemmap, so we fallback to + * walking the list of sparsemem blocks which we already maintain for + * the sake of crashdump. In the long run, we might want to maintain + * a tree if performance of that linear walk becomes a problem. + * + * Any of realmode_ functions can fail due to: + * 1) As real sparsemem blocks do not lay in RAM continously (they + * are in virtual address space which is not available in the real mode), + * the requested page struct can be split between blocks so get_page/put_page + * may fail. + * 2) When huge pages are used, the get_page/put_page API will fail + * in real mode as the linked addresses in the page struct are virtual + * too. 
+ * When 1) or 2) takes place, the API returns an error code to cause + * an exit to kernel virtual mode where the operation will be completed. + */ +struct page *realmode_pfn_to_page(unsigned long pfn) +{ + struct vmemmap_backing *vmem_back; + struct page *page; + unsigned long page_size = 1 mmu_psize_defs[mmu_vmemmap_psize].shift; + unsigned long pg_va = (unsigned long) pfn_to_page(pfn); + + for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back-list) { + if (pg_va vmem_back-virt_addr) + continue; + /* Check that page struct is not split between real pages */ + if ((pg_va + sizeof(struct page)) + (vmem_back-virt_addr + page_size)) + return NULL; + + page = (struct page *) (vmem_back-phys + pg_va - + vmem_back-virt_addr); + return page; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(realmode_pfn_to_page); + +#elif defined(CONFIG_FLATMEM) + +struct page *realmode_pfn_to_page(unsigned long pfn) +{ + struct page *page = pfn_to_page(pfn); + return page; +} +EXPORT_SYMBOL_GPL(realmode_pfn_to_page); + +#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */ + +#if defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_FLATMEM) +int realmode_get_page(struct page *page) +{ + if (PageCompound(page)) + return -EAGAIN; + + if (!get_page_unless_zero(page)) + return -EAGAIN; + + return 0; +} +EXPORT_SYMBOL_GPL(realmode_get_page); + +int realmode_put_page(struct page *page) +{ + if (PageCompound(page)) + return -EAGAIN; + +
[PATCH 7/8] KVM: PPC: Add support for IOMMU in-kernel handling
This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT and H_STUFF_TCE requests without passing them to QEMU, which saves time on switching to QEMU and back. Both real and virtual modes are supported. First the kernel tries to handle a TCE request in the real mode, if failed it passes it to the virtual mode to complete the operation. If a virtual mode handler fails, a request is passed to the user mode. This adds a new KVM_CAP_SPAPR_TCE_IOMMU ioctl to associate a virtual PCI bus ID (LIOBN) with an IOMMU group which enables in-kernel handling of IOMMU map/unmap. The external user API support in VFIO is required. Tests show that this patch increases transmission speed from 220MB/s to 750..1020MB/s on 10Gb network (Chelsea CXGB3 10Gb ethernet card). Signed-off-by: Paul Mackerras pau...@samba.org Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: 2013/06/27: * tce_list page is referenced now in order to protect it from accidental invalidation during H_PUT_TCE_INDIRECT execution * added use of the external user VFIO API 2013/06/05: * changed capability number * changed ioctl number * update the doc article number 2013/05/20: * removed get_user() from real mode handlers * kvm_vcpu_arch::tce_tmp usage extended. 
Now real mode handler puts there translated TCEs, tries realmode_get_page() on those and if it fails, it passes control over the virtual mode handler which tries to finish the request handling * kvmppc_lookup_pte() now does realmode_get_page() protected by BUSY bit on a page * The only reason to pass the request to user mode now is when the user mode did not register TCE table in the kernel, in all other cases the virtual mode handler is expected to do the job --- Documentation/virtual/kvm/api.txt | 26 arch/powerpc/include/asm/kvm_host.h |4 + arch/powerpc/include/asm/kvm_ppc.h |2 + arch/powerpc/include/uapi/asm/kvm.h |8 + arch/powerpc/kvm/book3s_64_vio.c| 294 ++- arch/powerpc/kvm/book3s_64_vio_hv.c | 165 arch/powerpc/kvm/powerpc.c | 12 ++ 7 files changed, 509 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 762c703..01b0dc2 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2387,6 +2387,32 @@ slows operations a lot. Unlike other capabilities of this section, this one is always enabled. +4.87 KVM_CREATE_SPAPR_TCE_IOMMU + +Capability: KVM_CAP_SPAPR_TCE_IOMMU +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_create_spapr_tce_iommu (in) +Returns: 0 on success, -1 on error + +struct kvm_create_spapr_tce_iommu { + __u64 liobn; + __u32 iommu_id; + __u32 flags; +}; + +This creates a link between IOMMU group and a hardware TCE (translation +control entry) table. This link lets the host kernel know what IOMMU +group (i.e. TCE table) to use for the LIOBN number passed with +H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE hypercalls. + +In response to a TCE hypercall, the kernel looks for a TCE table descriptor +in the list and handles the hypercall in real or virtual modes if +the descriptor is found. Otherwise the hypercall is passed to the user mode. + +No flag is supported at the moment. + + 5. 
The kvm_run structure diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 3bf407b..716ab18 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -180,6 +180,8 @@ struct kvmppc_spapr_tce_table { struct kvm *kvm; u64 liobn; u32 window_size; + struct iommu_group *grp;/* used for IOMMU groups */ + struct file *vfio_filp; /* used for IOMMU groups */ struct page *pages[0]; }; @@ -611,6 +613,8 @@ struct kvm_vcpu_arch { u64 busy_preempt; unsigned long *tce_tmp;/* TCE cache for TCE_PUT_INDIRECT hcall */ + unsigned long tce_tmp_num; /* Number of handled TCEs in the cache */ + unsigned long tce_reason; /* The reason of switching to the virtmode */ #endif }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e852921b..934e01d 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -133,6 +133,8 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); +extern long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm, + struct kvm_create_spapr_tce_iommu *args); extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table( struct kvm_vcpu *vcpu, unsigned long liobn); extern long kvmppc_emulated_validate_tce(unsigned long tce); diff --git a/arch/powerpc/include/uapi/asm/kvm.h
[PATCH 8/8] KVM: PPC: Add hugepage support for IOMMU in-kernel handling
This adds special support for huge pages (16MB). The reference counting cannot be easily done for such pages in real mode (when MMU is off) so we added a list of huge pages. It is populated in virtual mode and get_page is called just once per a huge page. Real mode handlers check if the requested page is huge and in the list, then no reference counting is done, otherwise an exit to virtual mode happens. The list is released at KVM exit. At the moment the fastest card available for tests uses up to 9 huge pages so walking through this list is not very expensive. However this can change and we may want to optimize this. Signed-off-by: Paul Mackerras pau...@samba.org Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: 2013/06/27: * list of huge pages replaced with hashtable for better performance * spinlock removed from real mode and only protects insertion of new huge page descriptors into the hashtable 2013/06/05: * fixed compile error when CONFIG_IOMMU_API=n 2013/05/20: * the real mode handler now searches for a huge page by gpa (used to be pte) * the virtual mode handler prints warning if it is called twice for the same huge page as the real mode handler is expected to fail just once - when a huge page is not in the list yet. * the huge page is refcounted twice - when added to the hugepage list and when used in the virtual mode hcall handler (can be optimized but it will make the patch less nice). 
--- arch/powerpc/include/asm/kvm_host.h | 25 + arch/powerpc/kvm/book3s_64_vio.c| 95 +-- arch/powerpc/kvm/book3s_64_vio_hv.c | 24 +++-- 3 files changed, 138 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 716ab18..0ad6189 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -30,6 +30,7 @@ #include linux/kvm_para.h #include linux/list.h #include linux/atomic.h +#include linux/hashtable.h #include asm/kvm_asm.h #include asm/processor.h #include asm/page.h @@ -182,9 +183,33 @@ struct kvmppc_spapr_tce_table { u32 window_size; struct iommu_group *grp;/* used for IOMMU groups */ struct file *vfio_filp; /* used for IOMMU groups */ + DECLARE_HASHTABLE(hash_tab, ilog2(64)); /* used for IOMMU groups */ + spinlock_t hugepages_write_lock;/* used for IOMMU groups */ struct page *pages[0]; }; +/* + * The KVM guest can be backed with 16MB pages. + * In this case, we cannot do page counting from the real mode + * as the compound pages are used - they are linked in a list + * with pointers as virtual addresses which are inaccessible + * in real mode. + * + * The code below keeps a 16MB pages list and uses page struct + * in real mode if it is already locked in RAM and inserted into + * the list or switches to the virtual mode where it can be + * handled in a usual manner. 
+ */ +#define KVMPPC_HUGEPAGE_HASH(gpa) hash_32(gpa 24, 32) + +struct kvmppc_iommu_hugepage { + struct hlist_node hash_node; + unsigned long gpa; /* Guest physical address */ + unsigned long hpa; /* Host physical address */ + struct page *page; /* page struct of the very first subpage */ + unsigned long size; /* Huge page size (always 16MB at the moment) */ +}; + struct kvmppc_linear_info { void*base_virt; unsigned longbase_pfn; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index a5d0195..6cedfe9 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -47,6 +47,78 @@ #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) #define ERROR_ADDR ((void *)~(unsigned long)0x0) +#ifdef CONFIG_IOMMU_API +/* Adds a new huge page descriptor to the hashtable */ +static long kvmppc_iommu_hugepage_try_add( + struct kvmppc_spapr_tce_table *tt, + pte_t pte, unsigned long hva, unsigned long gpa, + unsigned long pg_size) +{ + long ret = 0; + struct kvmppc_iommu_hugepage *hp; + struct page *pg; + unsigned key = KVMPPC_HUGEPAGE_HASH(gpa); + + spin_lock(tt-hugepages_write_lock); + hash_for_each_possible_rcu(tt-hash_tab, hp, hash_node, key) { + if (KVMPPC_HUGEPAGE_HASH(hp-gpa) != key) + continue; + if ((gpa hp-gpa) || (gpa = hp-gpa + hp-size)) + continue; + goto unlock_exit; + } + + hva = hva ~(pg_size - 1); + ret = get_user_pages_fast(hva, 1, true/*write*/, pg); + if ((ret != 1) || !pg) { + ret = -EFAULT; + goto unlock_exit; + } + ret = 0; + + hp = kzalloc(sizeof(*hp), GFP_KERNEL); + if (!hp) { + ret = -ENOMEM; + goto unlock_exit; + } + + hp-page = pg; +
[PATCH 6/8] KVM: PPC: Add support for multiple-TCE hcalls
This adds real mode handlers for the H_PUT_TCE_INDIRECT and H_STUFF_TCE hypercalls for QEMU emulated devices such as IBMVIO devices or emulated PCI. These calls allow adding multiple entries (up to 512) into the TCE table in one call which saves time on transition to/from real mode. This adds a tce_tmp cache to kvm_vcpu_arch to save valid TCEs (copied from user and verified) before writing the whole list into the TCE table. This cache will be utilized more in the upcoming VFIO/IOMMU support to continue TCE list processing in the virtual mode in the case if the real mode handler failed for some reason. This adds a guest physical to host real address converter and calls the existing H_PUT_TCE handler. The converting function is going to be fully utilized by upcoming VFIO supporting patches. This also implements the KVM_CAP_PPC_MULTITCE capability, so in order to support the functionality of this patch, QEMU needs to query for this capability and set the hcall-multi-tce hypertas property only if the capability is present, otherwise there will be serious performance degradation. 
Signed-off-by: Paul Mackerras pau...@samba.org Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changelog: 2013/06/27: * fixed clear of BUSY bit in kvmppc_lookup_pte() * H_PUT_TCE_INDIRECT does realmode_get_page() now * KVM_CAP_SPAPR_MULTITCE now depends on CONFIG_PPC_BOOK3S_64 * updated doc 2013/06/05: * fixed mistype about IBMVIO in the commit message * updated doc and moved it to another section * changed capability number 2013/05/21: * added kvm_vcpu_arch::tce_tmp * removed cleanup if put_indirect failed, instead we do not even start writing to TCE table if we cannot get TCEs from the user and they are invalid * kvmppc_emulated_h_put_tce is split to kvmppc_emulated_put_tce and kvmppc_emulated_validate_tce (for the previous item) * fixed bug with failthrough for H_IPI * removed all get_user() from real mode handlers * kvmppc_lookup_pte() added (instead of making lookup_linux_pte public) --- Documentation/virtual/kvm/api.txt | 25 +++ arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/include/asm/kvm_ppc.h | 16 +- arch/powerpc/kvm/book3s_64_vio.c| 123 ++ arch/powerpc/kvm/book3s_64_vio_hv.c | 270 +++ arch/powerpc/kvm/book3s_hv.c| 41 - arch/powerpc/kvm/book3s_hv_rmhandlers.S |6 + arch/powerpc/kvm/book3s_pr_papr.c | 37 - arch/powerpc/kvm/powerpc.c |3 + 9 files changed, 490 insertions(+), 33 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 6365fef..762c703 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2362,6 +2362,31 @@ calls by the guest for that service will be passed to userspace to be handled. +4.86 KVM_CAP_PPC_MULTITCE + +Capability: KVM_CAP_PPC_MULTITCE +Architectures: ppc +Type: vm + +This capability means the kernel is capable of handling hypercalls +H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user +space. This significanly accelerates DMA operations for PPC KVM guests. 
+The user space should expect that its handlers for these hypercalls +are not going to be called. + +In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest, +the user space might have to advertise it for the guest. For example, +IBM pSeries guest starts using them if hcall-multi-tce is present in +the ibm,hypertas-functions device-tree property. + +Without this capability, only H_PUT_TCE is handled by the kernel and +therefore the use of H_PUT_TCE_INDIRECT and H_STUFF_TCE is not recommended +unless the capability is present as passing hypercalls to the userspace +slows operations a lot. + +Unlike other capabilities of this section, this one is always enabled. + + 5. The kvm_run structure diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index af326cd..3bf407b 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -609,6 +609,8 @@ struct kvm_vcpu_arch { spinlock_t tbacct_lock; u64 busy_stolen; u64 busy_preempt; + + unsigned long *tce_tmp;/* TCE cache for TCE_PUT_INDIRECT hcall */ #endif }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index a5287fe..e852921b 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -133,8 +133,20 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); -extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, -unsigned long ioba, unsigned long tce); +extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table( + struct kvm_vcpu *vcpu,
[PATCH 4/8] hashtable: add hash_for_each_possible_rcu_notrace()
This adds hash_for_each_possible_rcu_notrace() which is basically a notrace clone of hash_for_each_possible_rcu() which cannot be used in real mode due to its tracing/debugging capability. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- include/linux/hashtable.h | 15 +++ 1 file changed, 15 insertions(+) diff --git a/include/linux/hashtable.h b/include/linux/hashtable.h index a9df51f..af8b169 100644 --- a/include/linux/hashtable.h +++ b/include/linux/hashtable.h @@ -174,6 +174,21 @@ static inline void hash_del_rcu(struct hlist_node *node) member) /** + * hash_for_each_possible_rcu_notrace - iterate over all possible objects hashing + * to the same bucket in an rcu enabled hashtable in a rcu enabled hashtable + * @name: hashtable to iterate + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + * @key: the key of the objects to iterate over + * + * This is the same as hash_for_each_possible_rcu() except that it does + * not do any RCU debugging or tracing. + */ +#define hash_for_each_possible_rcu_notrace(name, obj, member, key) \ + hlist_for_each_entry_rcu_notrace(obj, name[hash_min(key, HASH_BITS(name))],\ + member) + +/** * hash_for_each_possible_safe - iterate over all possible objects hashing to the * same bucket safe against removals * @name: hashtable to iterate -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/8] vfio: add external user support
VFIO is designed to be used via ioctls on file descriptors returned by VFIO. However in some situations support for an external user is required. The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to use the existing VFIO groups for exclusive access in real/virtual mode in the host kernel to avoid passing map/unmap requests to the user space which would made things pretty slow. The proposed protocol includes: 1. do normal VFIO init stuff such as opening a new container, attaching group(s) to it, setting an IOMMU driver for a container. When IOMMU is set for a container, all groups in it are considered ready to use by an external user. 2. pass a fd of the group we want to accelerate to KVM. KVM calls vfio_group_iommu_id_from_file() to verify if the group is initialized and IOMMU is set for it. The current TCE IOMMU driver marks the whole IOMMU table as busy when IOMMU is set for a container what this prevents other DMA users from allocating from it so it is safe to pass the group to the user space. 3. KVM increases the container users counter via vfio_group_add_external_user(). This prevents the VFIO group from being disposed prior to exiting KVM. 4. When KVM is finished and doing cleanup, it releases the group file and decrements the container users counter. Everything gets released. 5. KVM also keeps the group file as otherwise its fd might have been closed at the moment of KVM finish so vfio_group_del_external_user() call will not be possible. The vfio: Limit group opens patch is also required for the consistency. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- drivers/vfio/vfio.c | 53 +++ 1 file changed, 53 insertions(+) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index c488da5..54192b2 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1370,6 +1370,59 @@ static const struct file_operations vfio_device_fops = { }; /** + * External user API, exported by symbols to be linked dynamically. 
+ */ + +/* Allows an external user (for example, KVM) to lock an IOMMU group */ +static int vfio_group_add_external_user(struct file *filep) +{ + struct vfio_group *group = filep-private_data; + + if (filep-f_op != vfio_group_fops) + return -EINVAL; + + if (!atomic_inc_not_zero(group-container_users)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_group_add_external_user); + +/* Allows an external user (for example, KVM) to unlock an IOMMU group */ +static void vfio_group_del_external_user(struct file *filep) +{ + struct vfio_group *group = filep-private_data; + + BUG_ON(filep-f_op != vfio_group_fops); + + vfio_group_try_dissolve_container(group); +} +EXPORT_SYMBOL_GPL(vfio_group_del_external_user); + +/* + * Checks if a group for the specified file can be used by + * an external user and returns the IOMMU ID if external use is possible. + */ +static int vfio_group_iommu_id_from_file(struct file *filep) +{ + int ret; + struct vfio_group *group = filep-private_data; + + if (WARN_ON(filep-f_op != vfio_group_fops)) + return -EINVAL; + + if (0 == atomic_read(group-container_users) || + !group-container-iommu_driver || + !vfio_group_viable(group)) + return -EINVAL; + + ret = iommu_group_id(group-iommu_group); + + return ret; +} +EXPORT_SYMBOL_GPL(vfio_group_iommu_id_from_file); + +/** * Module/class support */ static char *vfio_devnode(struct device *dev, umode_t *mode) -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/8] KVM: PPC: reserve a capability number for multitce support
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- include/uapi/linux/kvm.h |1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d88c8ee..970b1f5 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -666,6 +666,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_IRQ_MPIC 90 #define KVM_CAP_PPC_RTAS 91 #define KVM_CAP_IRQ_XICS 92 +#define KVM_CAP_SPAPR_MULTITCE 93 #ifdef KVM_CAP_IRQ_ROUTING -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/8] KVM: PPC: reserve a capability and ioctl numbers for realmode VFIO
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- include/uapi/linux/kvm.h |2 ++ 1 file changed, 2 insertions(+) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 970b1f5..0865c01 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -667,6 +667,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_PPC_RTAS 91 #define KVM_CAP_IRQ_XICS 92 #define KVM_CAP_SPAPR_MULTITCE 93 +#define KVM_CAP_SPAPR_TCE_IOMMU 94 #ifdef KVM_CAP_IRQ_ROUTING @@ -923,6 +924,7 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_PPC_ALLOC_HTAB */ #define KVM_PPC_ALLOCATE_HTAB_IOWR(KVMIO, 0xa7, __u32) #define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) +#define KVM_CREATE_SPAPR_TCE_IOMMU _IOW(KVMIO, 0xaf, struct kvm_create_spapr_tce_iommu) /* Available with KVM_CAP_RMA */ #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) /* Available with KVM_CAP_PPC_HTAB_FD */ -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/6 v5] KVM: PPC: exit to user space on ehpriv instruction
On 06/26/2013 01:42 PM, Bharat Bhushan wrote: ehpriv instruction is used for setting software breakpoints by user space. This patch adds support to exit to user space with run-debug have relevant information. As this is the first point we are using run-debug, also defined the run-debug structure. Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- arch/powerpc/include/asm/disassemble.h |4 arch/powerpc/include/uapi/asm/kvm.h| 21 + arch/powerpc/kvm/e500_emulate.c| 27 +++ 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h index 9b198d1..856f8de 100644 --- a/arch/powerpc/include/asm/disassemble.h +++ b/arch/powerpc/include/asm/disassemble.h @@ -77,4 +77,8 @@ static inline unsigned int get_d(u32 inst) return inst 0x; } +static inline unsigned int get_oc(u32 inst) +{ + return (inst 11) 0x7fff; +} #endif /* __ASM_PPC_DISASSEMBLE_H__ */ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 0fb1a6e..ded0607 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -269,7 +269,24 @@ struct kvm_fpu { __u64 fpr[32]; }; +/* + * Defines for h/w breakpoint, watchpoint (read, write or both) and + * software breakpoint. + * These are used as type in KVM_SET_GUEST_DEBUG ioctl and status + * for KVM_DEBUG_EXIT. + */ +#define KVMPPC_DEBUG_NONE 0x0 +#define KVMPPC_DEBUG_BREAKPOINT(1UL 1) +#define KVMPPC_DEBUG_WATCH_WRITE (1UL 2) +#define KVMPPC_DEBUG_WATCH_READ(1UL 3) struct kvm_debug_exit_arch { + __u64 address; + /* +* exiting to userspace because of h/w breakpoint, watchpoint +* (read, write or both) and software breakpoint. +*/ + __u32 status; + __u32 reserved; }; /* for KVM_SET_GUEST_DEBUG */ @@ -281,10 +298,6 @@ struct kvm_guest_debug_arch { * Type denotes h/w breakpoint, read watchpoint, write * watchpoint or watchpoint (both read and write). 
*/ -#define KVMPPC_DEBUG_NONE 0x0 -#define KVMPPC_DEBUG_BREAKPOINT(1UL 1) -#define KVMPPC_DEBUG_WATCH_WRITE (1UL 2) -#define KVMPPC_DEBUG_WATCH_READ(1UL 3) __u32 type; __u32 reserved; } bp[16]; diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index b10a012..dab9d07 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -26,6 +26,8 @@ #define XOP_TLBRE 946 #define XOP_TLBWE 978 #define XOP_TLBILX 18 +#define XOP_EHPRIV 270 +#define EHPRIV_OC_DEBUG 0 As I think the case, OC = 0, is a bit specific since IIRC, if the OC operand is omitted, its equal 0 by default. So I think we should start this OC value from 1 or other magic number. And if possible, we'd better add some comments to describe this to make the OC definition readable. Tiejun #ifdef CONFIG_KVM_E500MC static int dbell2prio(ulong param) @@ -82,6 +84,26 @@ static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb) } #endif +static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + int emulated = EMULATE_DONE; + + switch (get_oc(inst)) { + case EHPRIV_OC_DEBUG: + run-exit_reason = KVM_EXIT_DEBUG; + run-debug.arch.address = vcpu-arch.pc; + run-debug.arch.status = 0; + kvmppc_account_exit(vcpu, DEBUG_EXITS); + emulated = EMULATE_EXIT_USER; + *advance = 0; + break; + default: + emulated = EMULATE_FAIL; + } + return emulated; +} + int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance) { @@ -130,6 +152,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); break; + case XOP_EHPRIV: + emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst, + advance); + break; + default: emulated = EMULATE_FAIL; } -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/6 v5] KVM: PPC: exit to user space on ehpriv instruction
On 06/26/2013 04:44 PM, Bhushan Bharat-R65777 wrote: -Original Message- From: tiejun.chen [mailto:tiejun.c...@windriver.com] Sent: Wednesday, June 26, 2013 12:25 PM To: Bhushan Bharat-R65777 Cc: kvm-ppc@vger.kernel.org; k...@vger.kernel.org; ag...@suse.de; Wood Scott- B07421; b...@kernel.crashing.org; linuxppc-...@lists.ozlabs.org; linux- ker...@vger.kernel.org; mi...@neuling.org; Bhushan Bharat-R65777 Subject: Re: [PATCH 4/6 v5] KVM: PPC: exit to user space on ehpriv instruction On 06/26/2013 01:42 PM, Bharat Bhushan wrote: ehpriv instruction is used for setting software breakpoints by user space. This patch adds support to exit to user space with run-debug have relevant information. As this is the first point we are using run-debug, also defined the run-debug structure. Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- arch/powerpc/include/asm/disassemble.h |4 arch/powerpc/include/uapi/asm/kvm.h| 21 + arch/powerpc/kvm/e500_emulate.c| 27 +++ 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h index 9b198d1..856f8de 100644 --- a/arch/powerpc/include/asm/disassemble.h +++ b/arch/powerpc/include/asm/disassemble.h @@ -77,4 +77,8 @@ static inline unsigned int get_d(u32 inst) return inst 0x; } +static inline unsigned int get_oc(u32 inst) +{ + return (inst 11) 0x7fff; +} #endif /* __ASM_PPC_DISASSEMBLE_H__ */ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 0fb1a6e..ded0607 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -269,7 +269,24 @@ struct kvm_fpu { __u64 fpr[32]; }; +/* + * Defines for h/w breakpoint, watchpoint (read, write or both) and + * software breakpoint. + * These are used as type in KVM_SET_GUEST_DEBUG ioctl and status + * for KVM_DEBUG_EXIT. 
+ */ +#define KVMPPC_DEBUG_NONE 0x0 +#define KVMPPC_DEBUG_BREAKPOINT(1UL 1) +#define KVMPPC_DEBUG_WATCH_WRITE (1UL 2) +#define KVMPPC_DEBUG_WATCH_READ(1UL 3) struct kvm_debug_exit_arch { + __u64 address; + /* +* exiting to userspace because of h/w breakpoint, watchpoint +* (read, write or both) and software breakpoint. +*/ + __u32 status; + __u32 reserved; }; /* for KVM_SET_GUEST_DEBUG */ @@ -281,10 +298,6 @@ struct kvm_guest_debug_arch { * Type denotes h/w breakpoint, read watchpoint, write * watchpoint or watchpoint (both read and write). */ -#define KVMPPC_DEBUG_NONE 0x0 -#define KVMPPC_DEBUG_BREAKPOINT(1UL 1) -#define KVMPPC_DEBUG_WATCH_WRITE (1UL 2) -#define KVMPPC_DEBUG_WATCH_READ(1UL 3) __u32 type; __u32 reserved; } bp[16]; diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index b10a012..dab9d07 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -26,6 +26,8 @@ #define XOP_TLBRE 946 #define XOP_TLBWE 978 #define XOP_TLBILX 18 +#define XOP_EHPRIV 270 +#define EHPRIV_OC_DEBUG 0 As I think the case, OC = 0, is a bit specific since IIRC, if the OC operand is omitted, its equal 0 by default. So I think we should start this OC value from 1 or other magic number. ehpriv instruction is defined to be used as: ehpriv OC // where OC can be 0,1, ... n and in extended for it can be used as ehpriv // With no OC, and here it assumes OC = 0 So OC = 0 is not specific but ehpriv is same as ehpriv 0. Yes, this is just what I mean. I do not think of any special reason to reserve ehpriv and ehpriv 0. So I still prefer we can reserve the 'ehpriv' without OC operand as one simple approach to test or develop something for KVM quickly because its really convenient to trap into the hypervisor only with one 'ehpriv' instruction easily. But I have no further objection if you guys are fine to this ;-) Tiejun Thanks -Bharat And if possible, we'd better add some comments to describe this to make the OC definition readable. 
Tiejun #ifdef CONFIG_KVM_E500MC static int dbell2prio(ulong param) @@ -82,6 +84,26 @@ static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb) } #endif +static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + int emulated = EMULATE_DONE; + + switch (get_oc(inst)) { + case EHPRIV_OC_DEBUG: + run-exit_reason = KVM_EXIT_DEBUG; + run-debug.arch.address = vcpu-arch.pc; + run-debug.arch.status = 0; + kvmppc_account_exit(vcpu, DEBUG_EXITS); + emulated =
Re: [PATCH 3/6 v5] powerpc: export debug registers save function for KVM
Hi, On Wed, 26 Jun 2013 11:12:23 +0530 Bharat Bhushan r65...@freescale.com wrote: diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index 200d763..50b357f 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -30,6 +30,10 @@ extern void enable_kernel_spe(void); extern void giveup_spe(struct task_struct *); extern void load_up_spe(struct task_struct *); +#ifdef CONFIG_PPC_ADV_DEBUG_REGS +extern void switch_booke_debug_regs(struct thread_struct *new_thread); +#endif We usually don't bother guarding function declarations. -- Cheers, Stephen Rothwells...@canb.auug.org.au pgpkE_3z91GtM.pgp Description: PGP signature
[PATCH 0/8 v4] KVM: PPC: IOMMU in-kernel handling
The changes are: 1. rebased on v3.10-rc7 2. removed spinlocks from real mode 3. added security checks between KVM and VFIO MOre details in the individual patch comments. Alexey Kardashevskiy (8): KVM: PPC: reserve a capability number for multitce support KVM: PPC: reserve a capability and ioctl numbers for realmode VFIO vfio: add external user support hashtable: add hash_for_each_possible_rcu_notrace() powerpc: Prepare to support kernel handling of IOMMU map/unmap KVM: PPC: Add support for multiple-TCE hcalls KVM: PPC: Add support for IOMMU in-kernel handling KVM: PPC: Add hugepage support for IOMMU in-kernel handling Documentation/virtual/kvm/api.txt| 51 +++ arch/powerpc/include/asm/kvm_host.h | 31 ++ arch/powerpc/include/asm/kvm_ppc.h | 18 +- arch/powerpc/include/asm/pgtable-ppc64.h |4 + arch/powerpc/include/uapi/asm/kvm.h |8 + arch/powerpc/kvm/book3s_64_vio.c | 506 +- arch/powerpc/kvm/book3s_64_vio_hv.c | 439 -- arch/powerpc/kvm/book3s_hv.c | 41 ++- arch/powerpc/kvm/book3s_hv_rmhandlers.S |6 + arch/powerpc/kvm/book3s_pr_papr.c| 37 ++- arch/powerpc/kvm/powerpc.c | 15 + arch/powerpc/mm/init_64.c| 78 - drivers/vfio/vfio.c | 53 include/linux/hashtable.h| 15 + include/linux/page-flags.h |4 +- include/uapi/linux/kvm.h |3 + 16 files changed, 1279 insertions(+), 30 deletions(-) -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/8] KVM: PPC: Add support for IOMMU in-kernel handling
This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT and H_STUFF_TCE requests without passing them to QEMU, which saves time on switching to QEMU and back. Both real and virtual modes are supported. First the kernel tries to handle a TCE request in the real mode, if failed it passes it to the virtual mode to complete the operation. If the virtual mode handler fails, a request is passed to the user mode. This adds a new KVM_CAP_SPAPR_TCE_IOMMU ioctl to associate a virtual PCI bus ID (LIOBN) with an IOMMU group which enables in-kernel handling of IOMMU map/unmap. The external user API support in VFIO is required. Tests show that this patch increases transmission speed from 220MB/s to 750..1020MB/s on 10Gb network (Chelsea CXGB3 10Gb ethernet card). Signed-off-by: Paul Mackerras pau...@samba.org Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: 2013/06/27: * tce_list page is referenced now in order to protect it from accidental invalidation during H_PUT_TCE_INDIRECT execution * added use of the external user VFIO API 2013/06/05: * changed capability number * changed ioctl number * update the doc article number 2013/05/20: * removed get_user() from real mode handlers * kvm_vcpu_arch::tce_tmp usage extended.
Now real mode handler puts there translated TCEs, tries realmode_get_page() on those and if it fails, it passes control over the virtual mode handler which tries to finish the request handling * kvmppc_lookup_pte() now does realmode_get_page() protected by BUSY bit on a page * The only reason to pass the request to user mode now is when the user mode did not register TCE table in the kernel, in all other cases the virtual mode handler is expected to do the job --- Documentation/virtual/kvm/api.txt | 26 arch/powerpc/include/asm/kvm_host.h |4 + arch/powerpc/include/asm/kvm_ppc.h |2 + arch/powerpc/include/uapi/asm/kvm.h |8 + arch/powerpc/kvm/book3s_64_vio.c| 294 ++- arch/powerpc/kvm/book3s_64_vio_hv.c | 165 arch/powerpc/kvm/powerpc.c | 12 ++ 7 files changed, 509 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 762c703..01b0dc2 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2387,6 +2387,32 @@ slows operations a lot. Unlike other capabilities of this section, this one is always enabled. +4.87 KVM_CREATE_SPAPR_TCE_IOMMU + +Capability: KVM_CAP_SPAPR_TCE_IOMMU +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_create_spapr_tce_iommu (in) +Returns: 0 on success, -1 on error + +struct kvm_create_spapr_tce_iommu { + __u64 liobn; + __u32 iommu_id; + __u32 flags; +}; + +This creates a link between IOMMU group and a hardware TCE (translation +control entry) table. This link lets the host kernel know what IOMMU +group (i.e. TCE table) to use for the LIOBN number passed with +H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE hypercalls. + +In response to a TCE hypercall, the kernel looks for a TCE table descriptor +in the list and handles the hypercall in real or virtual modes if +the descriptor is found. Otherwise the hypercall is passed to the user mode. + +No flag is supported at the moment. + + 5. 
The kvm_run structure diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 3bf407b..716ab18 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -180,6 +180,8 @@ struct kvmppc_spapr_tce_table { struct kvm *kvm; u64 liobn; u32 window_size; + struct iommu_group *grp;/* used for IOMMU groups */ + struct file *vfio_filp; /* used for IOMMU groups */ struct page *pages[0]; }; @@ -611,6 +613,8 @@ struct kvm_vcpu_arch { u64 busy_preempt; unsigned long *tce_tmp;/* TCE cache for TCE_PUT_INDIRECT hcall */ + unsigned long tce_tmp_num; /* Number of handled TCEs in the cache */ + unsigned long tce_reason; /* The reason of switching to the virtmode */ #endif }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e852921b..934e01d 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -133,6 +133,8 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); +extern long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm, + struct kvm_create_spapr_tce_iommu *args); extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table( struct kvm_vcpu *vcpu, unsigned long liobn); extern long kvmppc_emulated_validate_tce(unsigned long tce); diff --git a/arch/powerpc/include/uapi/asm/kvm.h
[PATCH 8/8] KVM: PPC: Add hugepage support for IOMMU in-kernel handling
This adds special support for huge pages (16MB). The reference counting cannot be easily done for such pages in real mode (when MMU is off) so we added a list of huge pages. It is populated in virtual mode and get_page is called just once per a huge page. Real mode handlers check if the requested page is huge and in the list, then no reference counting is done, otherwise an exit to virtual mode happens. The list is released at KVM exit. At the moment the fastest card available for tests uses up to 9 huge pages so walking through this list is not very expensive. However this can change and we may want to optimize this. Signed-off-by: Paul Mackerras pau...@samba.org Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: 2013/06/27: * list of huge pages replaced with hashtable for better performance * spinlock removed from real mode and only protects insertion of new huge pages descriptors into the hashtable 2013/06/05: * fixed compile error when CONFIG_IOMMU_API=n 2013/05/20: * the real mode handler now searches for a huge page by gpa (used to be pte) * the virtual mode handler prints warning if it is called twice for the same huge page as the real mode handler is expected to fail just once - when a huge page is not in the list yet. * the huge page is refcounted twice - when added to the hugepage list and when used in the virtual mode hcall handler (can be optimized but it will make the patch less nice).
--- arch/powerpc/include/asm/kvm_host.h | 25 + arch/powerpc/kvm/book3s_64_vio.c| 95 +-- arch/powerpc/kvm/book3s_64_vio_hv.c | 24 +++-- 3 files changed, 138 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 716ab18..0ad6189 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -30,6 +30,7 @@ #include linux/kvm_para.h #include linux/list.h #include linux/atomic.h +#include linux/hashtable.h #include asm/kvm_asm.h #include asm/processor.h #include asm/page.h @@ -182,9 +183,33 @@ struct kvmppc_spapr_tce_table { u32 window_size; struct iommu_group *grp;/* used for IOMMU groups */ struct file *vfio_filp; /* used for IOMMU groups */ + DECLARE_HASHTABLE(hash_tab, ilog2(64)); /* used for IOMMU groups */ + spinlock_t hugepages_write_lock;/* used for IOMMU groups */ struct page *pages[0]; }; +/* + * The KVM guest can be backed with 16MB pages. + * In this case, we cannot do page counting from the real mode + * as the compound pages are used - they are linked in a list + * with pointers as virtual addresses which are inaccessible + * in real mode. + * + * The code below keeps a 16MB pages list and uses page struct + * in real mode if it is already locked in RAM and inserted into + * the list or switches to the virtual mode where it can be + * handled in a usual manner. 
+ */ +#define KVMPPC_HUGEPAGE_HASH(gpa) hash_32(gpa 24, 32) + +struct kvmppc_iommu_hugepage { + struct hlist_node hash_node; + unsigned long gpa; /* Guest physical address */ + unsigned long hpa; /* Host physical address */ + struct page *page; /* page struct of the very first subpage */ + unsigned long size; /* Huge page size (always 16MB at the moment) */ +}; + struct kvmppc_linear_info { void*base_virt; unsigned longbase_pfn; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index a5d0195..6cedfe9 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -47,6 +47,78 @@ #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) #define ERROR_ADDR ((void *)~(unsigned long)0x0) +#ifdef CONFIG_IOMMU_API +/* Adds a new huge page descriptor to the hashtable */ +static long kvmppc_iommu_hugepage_try_add( + struct kvmppc_spapr_tce_table *tt, + pte_t pte, unsigned long hva, unsigned long gpa, + unsigned long pg_size) +{ + long ret = 0; + struct kvmppc_iommu_hugepage *hp; + struct page *pg; + unsigned key = KVMPPC_HUGEPAGE_HASH(gpa); + + spin_lock(tt-hugepages_write_lock); + hash_for_each_possible_rcu(tt-hash_tab, hp, hash_node, key) { + if (KVMPPC_HUGEPAGE_HASH(hp-gpa) != key) + continue; + if ((gpa hp-gpa) || (gpa = hp-gpa + hp-size)) + continue; + goto unlock_exit; + } + + hva = hva ~(pg_size - 1); + ret = get_user_pages_fast(hva, 1, true/*write*/, pg); + if ((ret != 1) || !pg) { + ret = -EFAULT; + goto unlock_exit; + } + ret = 0; + + hp = kzalloc(sizeof(*hp), GFP_KERNEL); + if (!hp) { + ret = -ENOMEM; + goto unlock_exit; + } + + hp-page = pg; +
[PATCH 5/8] powerpc: Prepare to support kernel handling of IOMMU map/unmap
The current VFIO-on-POWER implementation supports only user mode driven mapping, i.e. QEMU is sending requests to map/unmap pages. However this approach is really slow, so we want to move that to KVM. Since H_PUT_TCE can be extremely performance sensitive (especially with network adapters where each packet needs to be mapped/unmapped) we chose to implement that as a fast hypercall directly in real mode (processor still in the guest context but MMU off). To be able to do that, we need to provide some facilities to access the struct page count within that real mode environment as things like the sparsemem vmemmap mappings aren't accessible. This adds an API to increment/decrement page counter as get_user_pages API used for user mode mapping does not work in the real mode. CONFIG_SPARSEMEM_VMEMMAP and CONFIG_FLATMEM are supported. Reviewed-by: Paul Mackerras pau...@samba.org Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Paul Mackerras pau...@samba.org Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: 2013/06/27: * realmode_get_page() fixed to use get_page_unless_zero(). If failed, the call will be passed from real to virtual mode and safely handled. * added comment to PageCompound() in include/linux/page-flags.h. 
2013/05/20: * PageTail() is replaced by PageCompound() in order to have the same checks for whether the page is huge in realmode_get_page() and realmode_put_page() --- arch/powerpc/include/asm/pgtable-ppc64.h |4 ++ arch/powerpc/mm/init_64.c| 78 +- include/linux/page-flags.h |4 +- 3 files changed, 84 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index e3d55f6f..7b46e5f 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -376,6 +376,10 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, } #endif /* !CONFIG_HUGETLB_PAGE */ +struct page *realmode_pfn_to_page(unsigned long pfn); +int realmode_get_page(struct page *page); +int realmode_put_page(struct page *page); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a90b9c4..7031be3 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -297,5 +297,81 @@ void vmemmap_free(unsigned long start, unsigned long end) { } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ +/* + * We do not have access to the sparsemem vmemmap, so we fallback to + * walking the list of sparsemem blocks which we already maintain for + * the sake of crashdump. In the long run, we might want to maintain + * a tree if performance of that linear walk becomes a problem. + * + * Any of realmode_ functions can fail due to: + * 1) As real sparsemem blocks do not lay in RAM continously (they + * are in virtual address space which is not available in the real mode), + * the requested page struct can be split between blocks so get_page/put_page + * may fail. + * 2) When huge pages are used, the get_page/put_page API will fail + * in real mode as the linked addresses in the page struct are virtual + * too. 
+ * When 1) or 2) takes place, the API returns an error code to cause + * an exit to kernel virtual mode where the operation will be completed. + */ +struct page *realmode_pfn_to_page(unsigned long pfn) +{ + struct vmemmap_backing *vmem_back; + struct page *page; + unsigned long page_size = 1 mmu_psize_defs[mmu_vmemmap_psize].shift; + unsigned long pg_va = (unsigned long) pfn_to_page(pfn); + + for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back-list) { + if (pg_va vmem_back-virt_addr) + continue; + /* Check that page struct is not split between real pages */ + if ((pg_va + sizeof(struct page)) + (vmem_back-virt_addr + page_size)) + return NULL; + + page = (struct page *) (vmem_back-phys + pg_va - + vmem_back-virt_addr); + return page; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(realmode_pfn_to_page); + +#elif defined(CONFIG_FLATMEM) + +struct page *realmode_pfn_to_page(unsigned long pfn) +{ + struct page *page = pfn_to_page(pfn); + return page; +} +EXPORT_SYMBOL_GPL(realmode_pfn_to_page); + +#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */ + +#if defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_FLATMEM) +int realmode_get_page(struct page *page) +{ + if (PageCompound(page)) + return -EAGAIN; + + if (!get_page_unless_zero(page)) + return -EAGAIN; + + return 0; +} +EXPORT_SYMBOL_GPL(realmode_get_page); + +int realmode_put_page(struct page *page) +{ + if (PageCompound(page)) + return -EAGAIN; + +
[PATCH 6/8] KVM: PPC: Add support for multiple-TCE hcalls
This adds real mode handlers for the H_PUT_TCE_INDIRECT and H_STUFF_TCE hypercalls for QEMU emulated devices such as IBMVIO devices or emulated PCI. These calls allow adding multiple entries (up to 512) into the TCE table in one call which saves time on transition to/from real mode. This adds a tce_tmp cache to kvm_vcpu_arch to save valid TCEs (copied from user and verified) before writing the whole list into the TCE table. This cache will be utilized more in the upcoming VFIO/IOMMU support to continue TCE list processing in the virtual mode in the case if the real mode handler failed for some reason. This adds a guest physical to host real address converter and calls the existing H_PUT_TCE handler. The converting function is going to be fully utilized by upcoming VFIO supporting patches. This also implements the KVM_CAP_PPC_MULTITCE capability, so in order to support the functionality of this patch, QEMU needs to query for this capability and set the hcall-multi-tce hypertas property only if the capability is present, otherwise there will be serious performance degradation. 
Signed-off-by: Paul Mackerras pau...@samba.org Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changelog: 2013/06/27: * fixed clear of BUSY bit in kvmppc_lookup_pte() * H_PUT_TCE_INDIRECT does realmode_get_page() now * KVM_CAP_SPAPR_MULTITCE now depends on CONFIG_PPC_BOOK3S_64 * updated doc 2013/06/05: * fixed mistype about IBMVIO in the commit message * updated doc and moved it to another section * changed capability number 2013/05/21: * added kvm_vcpu_arch::tce_tmp * removed cleanup if put_indirect failed, instead we do not even start writing to TCE table if we cannot get TCEs from the user and they are invalid * kvmppc_emulated_h_put_tce is split to kvmppc_emulated_put_tce and kvmppc_emulated_validate_tce (for the previous item) * fixed bug with failthrough for H_IPI * removed all get_user() from real mode handlers * kvmppc_lookup_pte() added (instead of making lookup_linux_pte public) --- Documentation/virtual/kvm/api.txt | 25 +++ arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/include/asm/kvm_ppc.h | 16 +- arch/powerpc/kvm/book3s_64_vio.c| 123 ++ arch/powerpc/kvm/book3s_64_vio_hv.c | 270 +++ arch/powerpc/kvm/book3s_hv.c| 41 - arch/powerpc/kvm/book3s_hv_rmhandlers.S |6 + arch/powerpc/kvm/book3s_pr_papr.c | 37 - arch/powerpc/kvm/powerpc.c |3 + 9 files changed, 490 insertions(+), 33 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 6365fef..762c703 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2362,6 +2362,31 @@ calls by the guest for that service will be passed to userspace to be handled. +4.86 KVM_CAP_PPC_MULTITCE + +Capability: KVM_CAP_PPC_MULTITCE +Architectures: ppc +Type: vm + +This capability means the kernel is capable of handling hypercalls +H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user +space. This significanly accelerates DMA operations for PPC KVM guests. 
+The user space should expect that its handlers for these hypercalls +are not going to be called. + +In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest, +the user space might have to advertise it for the guest. For example, +IBM pSeries guest starts using them if hcall-multi-tce is present in +the ibm,hypertas-functions device-tree property. + +Without this capability, only H_PUT_TCE is handled by the kernel and +therefore the use of H_PUT_TCE_INDIRECT and H_STUFF_TCE is not recommended +unless the capability is present as passing hypercalls to the userspace +slows operations a lot. + +Unlike other capabilities of this section, this one is always enabled. + + 5. The kvm_run structure diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index af326cd..3bf407b 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -609,6 +609,8 @@ struct kvm_vcpu_arch { spinlock_t tbacct_lock; u64 busy_stolen; u64 busy_preempt; + + unsigned long *tce_tmp;/* TCE cache for TCE_PUT_INDIRECT hcall */ #endif }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index a5287fe..e852921b 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -133,8 +133,20 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); -extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, -unsigned long ioba, unsigned long tce); +extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table( + struct kvm_vcpu *vcpu,
[PATCH 4/8] hashtable: add hash_for_each_possible_rcu_notrace()
This adds hash_for_each_possible_rcu_notrace() which is basically a notrace clone of hash_for_each_possible_rcu() which cannot be used in real mode due to its tracing/debugging capability. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- include/linux/hashtable.h | 15 +++ 1 file changed, 15 insertions(+) diff --git a/include/linux/hashtable.h b/include/linux/hashtable.h index a9df51f..af8b169 100644 --- a/include/linux/hashtable.h +++ b/include/linux/hashtable.h @@ -174,6 +174,21 @@ static inline void hash_del_rcu(struct hlist_node *node) member) /** + * hash_for_each_possible_rcu_notrace - iterate over all possible objects hashing + * to the same bucket in an rcu enabled hashtable in a rcu enabled hashtable + * @name: hashtable to iterate + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + * @key: the key of the objects to iterate over + * + * This is the same as hash_for_each_possible_rcu() except that it does + * not do any RCU debugging or tracing. + */ +#define hash_for_each_possible_rcu_notrace(name, obj, member, key) \ + hlist_for_each_entry_rcu_notrace(obj, name[hash_min(key, HASH_BITS(name))],\ + member) + +/** * hash_for_each_possible_safe - iterate over all possible objects hashing to the * same bucket safe against removals * @name: hashtable to iterate -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/8] KVM: PPC: reserve a capability number for multitce support
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- include/uapi/linux/kvm.h |1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d88c8ee..970b1f5 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -666,6 +666,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_IRQ_MPIC 90 #define KVM_CAP_PPC_RTAS 91 #define KVM_CAP_IRQ_XICS 92 +#define KVM_CAP_SPAPR_MULTITCE 93 #ifdef KVM_CAP_IRQ_ROUTING -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/8] vfio: add external user support
VFIO is designed to be used via ioctls on file descriptors returned by VFIO. However in some situations support for an external user is required. The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to use the existing VFIO groups for exclusive access in real/virtual mode in the host kernel to avoid passing map/unmap requests to the user space which would made things pretty slow. The proposed protocol includes: 1. do normal VFIO init stuff such as opening a new container, attaching group(s) to it, setting an IOMMU driver for a container. When IOMMU is set for a container, all groups in it are considered ready to use by an external user. 2. pass a fd of the group we want to accelerate to KVM. KVM calls vfio_group_iommu_id_from_file() to verify if the group is initialized and IOMMU is set for it. The current TCE IOMMU driver marks the whole IOMMU table as busy when IOMMU is set for a container what this prevents other DMA users from allocating from it so it is safe to pass the group to the user space. 3. KVM increases the container users counter via vfio_group_add_external_user(). This prevents the VFIO group from being disposed prior to exiting KVM. 4. When KVM is finished and doing cleanup, it releases the group file and decrements the container users counter. Everything gets released. 5. KVM also keeps the group file as otherwise its fd might have been closed at the moment of KVM finish so vfio_group_del_external_user() call will not be possible. The vfio: Limit group opens patch is also required for the consistency. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- drivers/vfio/vfio.c | 53 +++ 1 file changed, 53 insertions(+) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index c488da5..54192b2 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1370,6 +1370,59 @@ static const struct file_operations vfio_device_fops = { }; /** + * External user API, exported by symbols to be linked dynamically. 
+ */ + +/* Allows an external user (for example, KVM) to lock an IOMMU group */ +static int vfio_group_add_external_user(struct file *filep) +{ + struct vfio_group *group = filep-private_data; + + if (filep-f_op != vfio_group_fops) + return -EINVAL; + + if (!atomic_inc_not_zero(group-container_users)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_group_add_external_user); + +/* Allows an external user (for example, KVM) to unlock an IOMMU group */ +static void vfio_group_del_external_user(struct file *filep) +{ + struct vfio_group *group = filep-private_data; + + BUG_ON(filep-f_op != vfio_group_fops); + + vfio_group_try_dissolve_container(group); +} +EXPORT_SYMBOL_GPL(vfio_group_del_external_user); + +/* + * Checks if a group for the specified file can be used by + * an external user and returns the IOMMU ID if external use is possible. + */ +static int vfio_group_iommu_id_from_file(struct file *filep) +{ + int ret; + struct vfio_group *group = filep-private_data; + + if (WARN_ON(filep-f_op != vfio_group_fops)) + return -EINVAL; + + if (0 == atomic_read(group-container_users) || + !group-container-iommu_driver || + !vfio_group_viable(group)) + return -EINVAL; + + ret = iommu_group_id(group-iommu_group); + + return ret; +} +EXPORT_SYMBOL_GPL(vfio_group_iommu_id_from_file); + +/** * Module/class support */ static char *vfio_devnode(struct device *dev, umode_t *mode) -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html