[PATCH] kvm: external module: compatibility for 2.6.27 hosts with backported hrtimer patches
From: Avi Kivity a...@redhat.com Some 2.6.27 kernels (F10) have backported htrimer patches which interfere with the external module hrtimer backports. Rework the backports to allow them to coexist. Signed-off-by: Avi Kivity a...@redhat.com diff --git a/kernel/external-module-compat-comm.h b/kernel/external-module-compat-comm.h index 27fea15..f2343f6 100644 --- a/kernel/external-module-compat-comm.h +++ b/kernel/external-module-compat-comm.h @@ -584,26 +584,33 @@ static inline int get_user_pages_fast(unsigned long start, int nr_pages, #if LINUX_VERSION_CODE KERNEL_VERSION(2,6,28) -static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 delta) +static inline void kvm_hrtimer_add_expires_ns(struct hrtimer *timer, u64 delta) { timer-expires = ktime_add_ns(timer-expires, delta); } -static inline ktime_t hrtimer_get_expires(struct hrtimer *timer) +static inline ktime_t kvm_hrtimer_get_expires(struct hrtimer *timer) { return timer-expires; } -static inline u64 hrtimer_get_expires_ns(struct hrtimer *timer) +static inline u64 kvm_hrtimer_get_expires_ns(struct hrtimer *timer) { return ktime_to_ns(timer-expires); } -static inline void hrtimer_start_expires(struct hrtimer *timer, int mode) +static inline void kvm_hrtimer_start_expires(struct hrtimer *timer, int mode) { hrtimer_start_p(timer, timer-expires, mode); } +#else + +#define kvm_hrtimer_add_expires_ns hrtimer_add_expires_ns +#define kvm_hrtimer_get_expires hrtimer_get_expires +#define kvm_hrtimer_get_expires_ns hrtimer_get_expires_ns +#define kvm_hrtimer_start_expires hrtimer_start_expires + #endif #if LINUX_VERSION_CODE KERNEL_VERSION(2,6,28) diff --git a/kernel/ia64/hack-module.awk b/kernel/ia64/hack-module.awk index 3dd2260..a26d567 100644 --- a/kernel/ia64/hack-module.awk +++ b/kernel/ia64/hack-module.awk @@ -1,4 +1,6 @@ BEGIN { split(INIT_WORK on_each_cpu smp_call_function \ + hrtimer_add_expires_ns hrtimer_get_expires \ + hrtimer_get_expires_ns hrtimer_start_expires \ request_irq, compat_apis); } 
/MODULE_AUTHOR/ { diff --git a/kernel/x86/hack-module.awk b/kernel/x86/hack-module.awk index f40c972..1c80543 100644 --- a/kernel/x86/hack-module.awk +++ b/kernel/x86/hack-module.awk @@ -1,4 +1,6 @@ BEGIN { split(INIT_WORK tsc_khz desc_struct ldttss_desc64 desc_ptr \ + hrtimer_add_expires_ns hrtimer_get_expires \ + hrtimer_get_expires_ns hrtimer_start_expires \ on_each_cpu relay_open request_irq , compat_apis); } /^int kvm_init\(/ { anon_inodes = 1 } -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM: introduce kvm_read_guest_virt, kvm_write_guest_virt
From: Izik Eidus iei...@redhat.com This commit change the name of emulator_read_std into kvm_read_guest_virt, and add new function name kvm_write_guest_virt that allow writing into a guest virtual address. Signed-off-by: Izik Eidus iei...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index af00b8c..b1e109b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -609,10 +609,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu); void fx_init(struct kvm_vcpu *vcpu); -int emulator_read_std(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu); int emulator_write_emulated(unsigned long addr, const void *val, unsigned int bytes, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 36aa576..3e92230 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1973,10 +1973,8 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, return dev; } -int emulator_read_std(unsigned long addr, -void *val, -unsigned int bytes, -struct kvm_vcpu *vcpu) +int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu) { void *data = val; int r = X86EMUL_CONTINUE; @@ -1984,27 +1982,57 @@ int emulator_read_std(unsigned long addr, while (bytes) { gpa_t gpa = vcpu-arch.mmu.gva_to_gpa(vcpu, addr); unsigned offset = addr (PAGE_SIZE-1); - unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); + unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; if (gpa == UNMAPPED_GVA) { r = X86EMUL_PROPAGATE_FAULT; goto out; } - ret = kvm_read_guest(vcpu-kvm, gpa, data, tocopy); + ret = kvm_read_guest(vcpu-kvm, gpa, data, toread); if (ret 0) { r = X86EMUL_UNHANDLEABLE; goto out; } - bytes -= tocopy; - data += tocopy; - addr += tocopy; + bytes -= toread; + data += toread; + addr += toread; } out: return r; } -EXPORT_SYMBOL_GPL(emulator_read_std); + +int kvm_write_guest_virt(gva_t addr, void *val, 
unsigned int bytes, +struct kvm_vcpu *vcpu) +{ + void *data = val; + int r = X86EMUL_CONTINUE; + + while (bytes) { + gpa_t gpa = vcpu-arch.mmu.gva_to_gpa(vcpu, addr); + unsigned offset = addr (PAGE_SIZE-1); + unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) { + r = X86EMUL_PROPAGATE_FAULT; + goto out; + } + ret = kvm_write_guest(vcpu-kvm, gpa, data, towrite); + if (ret 0) { + r = X86EMUL_UNHANDLEABLE; + goto out; + } + + bytes -= towrite; + data += towrite; + addr += towrite; + } +out: + return r; +} + static int emulator_read_emulated(unsigned long addr, void *val, @@ -2026,8 +2054,8 @@ static int emulator_read_emulated(unsigned long addr, if ((gpa PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (emulator_read_std(addr, val, bytes, vcpu) - == X86EMUL_CONTINUE) + if (kvm_read_guest_virt(addr, val, bytes, vcpu) + == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; @@ -2230,7 +2258,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); printk(KERN_ERR emulation failed (%s) rip %lx %02x %02x %02x %02x\n, context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); @@ -2238,7 +2266,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); static struct x86_emulate_ops emulate_ops = { - .read_std= emulator_read_std, + .read_std= kvm_read_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated
[PATCH] KVM: remove the vmap usage
From: Izik Eidus iei...@redhat.com vmap() on guest pages hides those pages from the Linux mm for an extended (userspace determined) amount of time. Get rid of it. Signed-off-by: Izik Eidus iei...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3e92230..1059ffc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2368,40 +2368,19 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(emulate_instruction); -static void free_pio_guest_pages(struct kvm_vcpu *vcpu) -{ - int i; - - for (i = 0; i ARRAY_SIZE(vcpu-arch.pio.guest_pages); ++i) - if (vcpu-arch.pio.guest_pages[i]) { - kvm_release_page_dirty(vcpu-arch.pio.guest_pages[i]); - vcpu-arch.pio.guest_pages[i] = NULL; - } -} - static int pio_copy_data(struct kvm_vcpu *vcpu) { void *p = vcpu-arch.pio_data; - void *q; + gva_t q = vcpu-arch.pio.guest_gva; unsigned bytes; - int nr_pages = vcpu-arch.pio.guest_pages[1] ? 2 : 1; + int ret; - q = vmap(vcpu-arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, -PAGE_KERNEL); - if (!q) { - free_pio_guest_pages(vcpu); - return -ENOMEM; - } - q += vcpu-arch.pio.guest_page_offset; bytes = vcpu-arch.pio.size * vcpu-arch.pio.cur_count; if (vcpu-arch.pio.in) - memcpy(q, p, bytes); + ret = kvm_write_guest_virt(q, p, bytes, vcpu); else - memcpy(p, q, bytes); - q -= vcpu-arch.pio.guest_page_offset; - vunmap(q); - free_pio_guest_pages(vcpu); - return 0; + ret = kvm_read_guest_virt(q, p, bytes, vcpu); + return ret; } int complete_pio(struct kvm_vcpu *vcpu) @@ -2512,7 +2491,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu-arch.pio.in = in; vcpu-arch.pio.string = 0; vcpu-arch.pio.down = 0; - vcpu-arch.pio.guest_page_offset = 0; vcpu-arch.pio.rep = 0; if (vcpu-run-io.direction == KVM_EXIT_IO_IN) @@ -2540,9 +2518,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, gva_t address, int rep, unsigned port) { unsigned now, in_page; - int i, ret = 0; - 
int nr_pages = 1; - struct page *page; + int ret = 0; struct kvm_io_device *pio_dev; vcpu-run-exit_reason = KVM_EXIT_IO; @@ -2554,7 +2530,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu-arch.pio.in = in; vcpu-arch.pio.string = 1; vcpu-arch.pio.down = down; - vcpu-arch.pio.guest_page_offset = offset_in_page(address); vcpu-arch.pio.rep = rep; if (vcpu-run-io.direction == KVM_EXIT_IO_IN) @@ -2574,15 +2549,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, else in_page = offset_in_page(address) + size; now = min(count, (unsigned long)in_page / size); - if (!now) { - /* -* String I/O straddles page boundary. Pin two guest pages -* so that we satisfy atomicity constraints. Do just one -* transaction to avoid complexity. -*/ - nr_pages = 2; + if (!now) now = 1; - } if (down) { /* * String I/O in reverse. Yuck. Kill the guest, fix later. @@ -2597,15 +2565,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, if (vcpu-arch.pio.cur_count == vcpu-arch.pio.count) kvm_x86_ops-skip_emulated_instruction(vcpu); - for (i = 0; i nr_pages; ++i) { - page = gva_to_page(vcpu, address + i * PAGE_SIZE); - vcpu-arch.pio.guest_pages[i] = page; - if (!page) { - kvm_inject_gp(vcpu, 0); - free_pio_guest_pages(vcpu); - return 1; - } - } + vcpu-arch.pio.guest_gva = address; pio_dev = vcpu_find_pio_dev(vcpu, port, vcpu-arch.pio.cur_count, @@ -2613,7 +2573,11 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, if (!vcpu-arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); - if (ret = 0 pio_dev) { + if (ret == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (ret == 0 pio_dev) { pio_string_write(pio_dev, vcpu); complete_pio(vcpu); if (vcpu-arch.pio.count == 0) diff --git a/include/linux/kvm_types.h
Re: [PATCH][v2] kvm-userspace: Load PCI option ROMs
Liu, Kechao wrote: Hi Avi, Thanks for your comments. I've updated the patch according to them. Please review it. Thank you. Load assigned devices' PCI option ROMs to the RAM of guest OS. And pass the corresponding devfns to BIOS. Looks good. + +/* Write ROM data and devfn to phys_addr */ +cpu_physical_memory_write_rom(0xd + offset, rom, size); +cpu_physical_memory_write_rom(0xd + offset + size, devfn, 1); + How is the last bit performed on real hardware? Obviously the ROM can't have devfn embedded. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/2] remove kvm vmap usage
Izik Eidus wrote: Remove the vmap usage from kvm, this is needed both for ksm and get_user_pages != write. applied, thanks. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[ kvm-Bugs-2474501 ] Migration: Linux x64 SMP guests failures
Bugs item #2474501, was opened at 2008-12-29 11:14 Message generated for change (Tracker Item Submitted) made by Item Submitter You can respond by visiting: https://sourceforge.net/tracker/?func=detailatid=893831aid=2474501group_id=180599 Please note that this message will contain a full copy of the comment thread, including the initial issue submission, for this request, not just the latest update. Category: None Group: None Status: Open Resolution: None Priority: 5 Private: No Submitted By: Technologov (technologov) Assigned to: Nobody/Anonymous (nobody) Summary: Migration: Linux x64 SMP guests failures Initial Comment: Linux x64 SMP guests failed to migrate. Host: Fedora 7 x64, AMD Opteron 2352, KVM-81 Guests: Fedora 9 x64, CentOS 5 x64 Qemu/KVM command: /usr/local/bin/qemu-system-x86_64 -monitor unix:/tmp/centos5-64__smp2_dst.monitor,server,nowait -pidfile /tmp/centos5-64__smp2_dst.pid -name centos5-64__smp2_dst -m 512 -hda /vm/centos5-64.qcow2_centos5-64__smp2 -net nic,vlan=0,macaddr=20:20:20:00:00:02,model=rtl8139 -net tap,vlan=0,ifname=cen_0_5706_02,script=/root/Linstall/git-kvm-autotest-new/client/tests/kvm_runtest/root/build/etc/kvm/qemu-ifup -vnc :27 -smp 2 -incoming tcp:0:5027 /dev/null /tmp/centos5-64__smp2_dst.out 21 -Alexey, 29.12.2008. -- You can respond by visiting: https://sourceforge.net/tracker/?func=detailatid=893831aid=2474501group_id=180599 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH][v2] kvm-userspace: Load PCI option ROMs
Hi Avi, -Original Message- From: Avi Kivity [mailto:a...@redhat.com] Sent: 2008年12月29日 16:29 To: Liu, Kechao Cc: kvm@vger.kernel.org; Shan, Haitao Subject: Re: [PATCH][v2] kvm-userspace: Load PCI option ROMs Liu, Kechao wrote: Hi Avi, Thanks for your comments. I've updated the patch according to them. Please review it. Thank you. Load assigned devices' PCI option ROMs to the RAM of guest OS. And pass the corresponding devfns to BIOS. Looks good. + +/* Write ROM data and devfn to phys_addr */ +cpu_physical_memory_write_rom(0xd + offset, rom, size); +cpu_physical_memory_write_rom(0xd + offset + size, devfn, 1); + How is the last bit performed on real hardware? Obviously the ROM can't have devfn embedded. On a real hardware, BIOS scans PCI devices, loads ROMs and it can get devices' devfns. Here, in an easier way, we load option ROMs in QEMU and thus need to store and pass the devfns. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. Best Regards, Liu, Kechao
Re: [PATCH][v2] kvm-userspace: Load PCI option ROMs
Liu, Kechao wrote: How is the last bit performed on real hardware? Obviously the ROM can't have devfn embedded. On a real hardware, BIOS scans PCI devices, loads ROMs and it can get devices' devfns. Here, in an easier way, we load option ROMs in QEMU and thus need to store and pass the devfns. Well, it may make sense to provide the ROMs as virtual PCI BARs, and have the bios do the work. This way, if some driver relies on remapping the BAR (graphic cards?), it can still work. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 15/15] KVM: Fix racy in kvm_free_assigned_irq
On Monday 29 December 2008 13:42:22 Amit Shah wrote: On Sun, Dec 28, 2008 at 07:24:02PM +0800, Sheng Yang wrote: On Sat, Dec 27, 2008 at 06:06:26PM -0200, Marcelo Tosatti wrote: On Fri, Dec 26, 2008 at 10:30:07AM +0800, Sheng Yang wrote: Thanks to Marcelo's observation, The following code have potential issue: if (cancel_work_sync(assigned_dev-interrupt_work)) kvm_put_kvm(kvm); In fact, cancel_work_sync() would return true either work struct is only scheduled or the callback of work struct is executed. This code only consider the former situation. Why not simply drop the reference inc / dec from irq handler/work function? Sorry, I don't know the answer. After checking the code, I also think it's a little strange to increase refernce count here, and I think we won't suppose work_handler can release the kvm struct. At the time of developing that code, this was my observation: I see from the call chain kvm_put_kvm-...-kvm_arch_destroy_vm, no locks are taken to actually destroy the vm. We can't be in ioctls, sure. But shouldn't the mutex be taken to ensure there's nothing else going on while destroying? At least with the workqueue model, we could be called asynchronously in kernel context and I would have held the mutex and about to inject interrupts while everything is being wiped off underneath. However, the workqueue model tries its best to schedule the work on the same CPU, though we can't use that guarantee to ensure things will be fine. --- So I had to get a ref to the current vm till we had any pending work scheduled. I think I put in comments in the code, but sadly most of my comments we stripped out before the merge. Not quite understand... The free assigned device in the destroy path of VM, so as free irq. And we got cancel_work_sync() in free irq which can sync with the execution of scheduled work. And now before cancel_work_sync(), we disable the interrupt so that no more schedule work happen again. 
So after cancel_work_sync(), everything asynchronous (I think that means the irq handler and the scheduled work here) should quiet down. Or did I miss something? -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: gettimeofday slow in RHEL4 guests
On Monday 29 December 2008 02:38:07 Marcelo Tosatti wrote: On Tue, Nov 25, 2008 at 01:52:59PM +0100, Andi Kleen wrote: But yeah - the remapping of HPET timers to virtual HPET timers sounds pretty tough. I wonder if one could overcome that with a little hardware support though ... For gettimeofday better make TSC work. Even in the best case (no virtualization) it is much faster than HPET because it sits in the CPU, while HPET is far away on the external south bridge. The tsc clock on older Linux 2.6 kernels compensates for lost ticks. The algorithm uses the PIT count (latched) to measure the delay between interrupt generation and handling, and sums that value, on the next interrupt, to the TSC delta. Sheng investigated this problem in the discussions before in-kernel PIT was merged: http://www.mail-archive.com/kvm-de...@lists.sourceforge.net/msg13873.html The algorithm overcompensates for lost ticks and the guest time runs faster than the hosts. There are two issues: 1) A bug in the in-kernel PIT which miscalculates the count value. 2) For the case where more than one interrupt is lost, and later reinjected, the value read from PIT count is meaningless for the purpose of the tsc algorithm. The count is interpreted as the delay until the next interrupt, which is not the case with reinjection. As Sheng mentioned in the thread above, Xen pulls back the TSC value when reinjecting interrupts. VMWare ESX has a notion of virtual TSC, which I believe is similar in this context. For KVM I believe the best immediate solution (for now) is to provide an option to disable reinjection, behaving similarly to real hardware. The advantage is simplicity compared to virtualizing the time sources. The QEMU PIT emulation has a limit on the rate of interrupt reinjection, perhaps something similar should be investigated in the future. The following patch (which contains the bugfix for 1) and disabled reinjection) fixes the severe time drift on RHEL4 with clock=tsc. 
What I'm proposing is to condition reinjection with an option (-kvm-pit-no-reinject or something). I agree that it should go with a user space option to disable rejection, as it's hard to overcome the problem that we delayed interrupt injection... -- regards Yang, Sheng Comments or better ideas? diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index e665d1c..608af7b 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -201,13 +201,16 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) if (!atomic_inc_and_test(pt-pending)) set_bit(KVM_REQ_PENDING_TIMER, vcpu0-requests); + if (atomic_read(pt-pending) 1) + atomic_set(pt-pending, 1); + if (vcpu0 waitqueue_active(vcpu0-wq)) wake_up_interruptible(vcpu0-wq); hrtimer_add_expires_ns(pt-timer, pt-period); pt-scheduled = hrtimer_get_expires_ns(pt-timer); if (pt-period) - ps-channels[0].count_load_time = hrtimer_get_expires(pt-timer); + ps-channels[0].count_load_time = ktime_get(); return (pt-period == 0 ? 0 : 1); } -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: how increase/decrease ram on running vm ?
Xen hypervisor can increase and decrease cpu and ram. will be able this function on kvm/qemu ? В Сбт, 27/12/2008 в 01:47 +0900, Ryota OZAKI пишет: 2008/12/27 Ryota OZAKI ozaki.ry...@gmail.com: Have you tried decreasing memory? AFAIK, current ballooning cannot increase memory. oops, i mean ballooning cannot increase memory over the amount of memory specified in qemu/kvm arguments. Regards, ozaki-r 2008/12/27 Василец Дмитрий d.vasil...@peterhost.ru: i read this , but i haven't balloon in cli. В Птн, 26/12/2008 в 23:25 +0900, Ryota OZAKI пишет: Hi, http://www.linux-kvm.com/content/memory-ballooning-feature-coming-soon-kvm This page might help you. Regards, ozaki-r 2008/12/26 Василец Дмитрий d.vasil...@peterhost.ru: how increase/decrease ram on running vm ? i found virtio_balloon module , but don't know how it work. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: gettimeofday slow in RHEL4 guests
Marcelo Tosatti wrote: The tsc clock on older Linux 2.6 kernels compensates for lost ticks. The algorithm uses the PIT count (latched) to measure the delay between interrupt generation and handling, and sums that value, on the next interrupt, to the TSC delta. Sheng investigated this problem in the discussions before in-kernel PIT was merged: http://www.mail-archive.com/kvm-de...@lists.sourceforge.net/msg13873.html The algorithm overcompensates for lost ticks and the guest time runs faster than the hosts. There are two issues: 1) A bug in the in-kernel PIT which miscalculates the count value. 2) For the case where more than one interrupt is lost, and later reinjected, the value read from PIT count is meaningless for the purpose of the tsc algorithm. The count is interpreted as the delay until the next interrupt, which is not the case with reinjection. As Sheng mentioned in the thread above, Xen pulls back the TSC value when reinjecting interrupts. VMWare ESX has a notion of virtual TSC, which I believe is similar in this context. For KVM I believe the best immediate solution (for now) is to provide an option to disable reinjection, behaving similarly to real hardware. The advantage is simplicity compared to virtualizing the time sources. The QEMU PIT emulation has a limit on the rate of interrupt reinjection, perhaps something similar should be investigated in the future. The following patch (which contains the bugfix for 1) and disabled reinjection) fixes the severe time drift on RHEL4 with clock=tsc. What I'm proposing is to condition reinjection with an option (-kvm-pit-no-reinject or something). Comments or better ideas? 
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index e665d1c..608af7b 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -201,13 +201,16 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) if (!atomic_inc_and_test(pt-pending)) set_bit(KVM_REQ_PENDING_TIMER, vcpu0-requests); + if (atomic_read(pt-pending) 1) + atomic_set(pt-pending, 1); + Replace the atomic_inc() with atomic_set(, 1) instead? One less test, and more important, the logic is scattered less around the source. if (vcpu0 waitqueue_active(vcpu0-wq)) wake_up_interruptible(vcpu0-wq); hrtimer_add_expires_ns(pt-timer, pt-period); pt-scheduled = hrtimer_get_expires_ns(pt-timer); if (pt-period) - ps-channels[0].count_load_time = hrtimer_get_expires(pt-timer); + ps-channels[0].count_load_time = ktime_get(); return (pt-period == 0 ? 0 : 1); } I don't like the idea of punting to the user but looks like we don't have a choice. Hopefully vendors will port kvmclock to these kernels and release them as updates -- time simply doesn't work will with virtualization, especially Linux guests. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 15/15] KVM: Fix racy in kvm_free_assigned_irq
Amit Shah wrote: I see from the call chain kvm_put_kvm-...-kvm_arch_destroy_vm, no locks are taken to actually destroy the vm. We can't be in ioctls, sure. But shouldn't the mutex be taken to ensure there's nothing else going on while destroying? Locks are useless to guard against something happening concurrent with destruction, since we're about to destroy the lock. At least with the workqueue model, we could be called asynchronously in kernel context and I would have held the mutex and about to inject interrupts while everything is being wiped off underneath. However, the workqueue model tries its best to schedule the work on the same CPU, though we can't use that guarantee to ensure things will be fine. --- So I had to get a ref to the current vm till we had any pending work scheduled. I think that's the right thing to do. I think I put in comments in the code, but sadly most of my comments we stripped out before the merge. Pity. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 15/15] KVM: Fix racy in kvm_free_assigned_irq
Sheng Yang wrote: The free assigned device in the destroy path of VM, so as free irq. And we got cancel_work_sync() in free irq which can sync with the execution of scheduled work. And now before cancel_work_sync(), we disable the interrupt so that no more schedule work happen again. So after cancel_work_sync(), everything(I think it's irq handler and schedule work here) asynchronously should quiet down. Or I miss something? Suppose the work_struct gets scheduled, but is delayed somewhere in the scheduler. Some kill -9s the VM, and it starts getting destroyed. cancel_work_sync() can no longer truly cancel the work, so it has to schedule and wait for its completion. So now we have kvm_assigned_dev_interrupt_work_handler() running in a partially destroyed VM. It may work or not, but it's a fragile situation (changing the order of destruction of components will likely break things) and it's easy to avoid by keeping the reference count elevated. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: userspace: Remove duplicated functionality for cpuid processing
Amit Shah wrote: host_cpuid is now available in target-i386/helper.c. Remove the duplicated code now in kvm-specific code. Applied, thanks. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 15/15] KVM: Fix racy in kvm_free_assigned_irq
On Monday 29 December 2008 21:37:52 Avi Kivity wrote: Sheng Yang wrote: The free assigned device in the destroy path of VM, so as free irq. And we got cancel_work_sync() in free irq which can sync with the execution of scheduled work. And now before cancel_work_sync(), we disable the interrupt so that no more schedule work happen again. So after cancel_work_sync(), everything(I think it's irq handler and schedule work here) asynchronously should quiet down. Or I miss something? Suppose the work_struct gets scheduled, but is delayed somewhere in the scheduler. Some kill -9s the VM, and it starts getting destroyed. cancel_work_sync() can no longer truly cancel the work, so it has to schedule and wait for its completion. So now we have kvm_assigned_dev_interrupt_work_handler() running in a partially destroyed VM. It may work or not, but it's a fragile situation (changing the order of destruction of components will likely break things) and it's easy to avoid by keeping the reference count elevated. OK, got it. Thanks for explaining! -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
kvm-82 messages
hi, in kvm-82, among the host's messages I see lines like these (which I didn't see before): kvm: 6413: cpu0 unhandled wrmsr: 0xc0010117 data 0 kvm: 6413: cpu0 unhandled rdmsr: 0xc0010117 kvm: 6413: cpu0 unhandled rdmsr: 0xc0010117 kvm: 6413: cpu0 unhandled wrmsr: 0xc0010117 data 0 -- Levente Si vis pacem para bellum! -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
__purge_vmap_area_lazy crash with CONFIG_PREEMPT_RCU=y
On Wed, Dec 24, 2008 at 04:28:44PM +0100, Andrea Arcangeli wrote: On Wed, Dec 24, 2008 at 02:50:57PM +0200, Avi Kivity wrote: Marcelo Tosatti wrote: The destructor for huge pages uses the backing inode for adjusting hugetlbfs accounting. Hugepage mappings are destroyed by exit_mmap, after mmu_notifier_release, so there are no notifications through unmap_hugepage_range at this point. The hugetlbfs inode can be freed with pages backed by it referenced by the shadow. When the shadow releases its reference, the huge page destructor will access a now freed inode. Implement the release operation for kvm mmu notifiers to release page refs before the hugetlbfs inode is gone. I see this isn't it. Andrea, comments? Yeah, the patch looks good, I talked a bit with Marcelo about this by PM. The issue is that it's not as strightforward as it seems, basically when I implemented the -release handlers and had sptes teardown running before the files were closed (instead of waiting the kvm anon inode release handler to fire) I was getting bugchecks from debug options including preempt=y (certain debug checks only becomes functional with preempt enabled unfortunately), so eventually I removed -release because for kvm -release wasn't useful because no guest mode can run any more by the time mmu notifier -release is invoked, and that avoided the issues with the bugchecks. We'll be using the mmu notifiers -release because it's always called just before the filehandle are destroyed, it's not really about the guest mode or secondary mmu but just an ordering issue with hugetlbfs internals. So in short if no bugcheck triggers this is fine (at least until hugetlbfs provides a way to register some callback to invoke at the start of the hugetlbfs-release handler). 
The only bugcheck I see, which triggers on vanilla kvm upstream with CONFIG_PREEMPT_DEBUG=y and CONFIG_PREEMPT_RCU=y is: general protection fault: [#1] PREEMPT SMP DEBUG_PAGEALLOC4ttyS1: 1 input overrun(s) last sysfs file: /sys/class/net/tap0/address CPU 0 Modules linked in: tun ipt_MASQUERADE iptable_nat nf_nat bridge stp llc nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack xt_tcpudp ipt_REJECT iptable_filter ip_tables x_tables dm_multipath kvm_intel kvm scsi_wait_scan ata_piix libata dm_snapshot dm_zero dm_mirror dm_region_hash dm_log dm_mod shpchp pci_hotplug mptsas mptscsih mptbase scsi_transport_sas uhci_hcd ohci_hcd ehci_hcd Pid: 4768, comm: qemu-system-x86 Not tainted 2.6.28-00165-g4f27e3e-dirty #164 RIP: 0010:[8028a5b6] [8028a5b6] __purge_vmap_area_lazy+0x12c/0x163 RSP: 0018:88021e1f9a38 EFLAGS: 00010202 RAX: 6b6b6b6b6b6b6b6b RBX: 6b6b6b6b6b6b6b2b RCX: 0003 RDX: 80a1dae0 RSI: 880028083980 RDI: 0001 RBP: 88021e1f9a78 R08: 0286 R09: 80a1bf50 R10: 880119c270f8 R11: 88021e1f99b8 R12: 88021e1f9a38 R13: 88021e1f9a90 R14: 88021e1f9a98 R15: 813a FS: () GS:8080d900() knlGS: CS: 0010 DS: 002b ES: 002b CR0: 8005003b CR2: 008d9828 CR3: 00201000 CR4: 26e0 DR0: DR1: DR2: DR3: DR6: 0ff0 DR7: 0400 Process qemu-system-x86 (pid: 4768, threadinfo 88021e1f8000, task 880119c270f8) Stack: 88022bdfd840 880119da11b8 c20011c3 813a 0001 88022ec11c18 88022f061838 88021e1f9aa8 8028ab1d 88021e1f9aa8 c20021976000 Call Trace: [8028ab1d] free_unmap_vmap_area_noflush+0x69/0x70 [8028ab49] remove_vm_area+0x25/0x71 [8028ac54] __vunmap+0x3a/0xca [8028ad35] vfree+0x29/0x2b [a00f98a3] kvm_free_physmem_slot+0x25/0x7c [kvm] [a00f9d75] kvm_free_physmem+0x27/0x36 [kvm] [a00fccb4] kvm_arch_destroy_vm+0xa6/0xda [kvm] [a00f9e11] kvm_put_kvm+0x8d/0xa7 [kvm] [a00fa0e2] kvm_vcpu_release+0x13/0x17 [kvm] [802a1c07] __fput+0xeb/0x1a3 [802a1cd4] fput+0x15/0x17 [8029f26c] filp_close+0x67/0x72 [802378a8] put_files_struct+0x74/0xc8 [80237943] exit_files+0x47/0x4f [80238fe5] do_exit+0x1eb/0x7a7 [80587edf] ? 
_spin_unlock_irq+0x2b/0x51 [80239614] do_group_exit+0x73/0xa0 [80242b10] get_signal_to_deliver+0x30c/0x32c [8020b4d5] ? sysret_signal+0x19/0x29 [8020a80f] do_notify_resume+0x8c/0x851 [8025b811] ? do_futex+0x90/0x92a [80256bd7] ? trace_hardirqs_on_caller+0xf0/0x114 [80587f51] ? _spin_unlock_irqrestore+0x4c/0x68 [8026be5c] ? __rcu_read_unlock+0x92/0x9e [80256bd7] ? trace_hardirqs_on_caller+0xf0/0x114 [80256c08] ? trace_hardirqs_on+0xd/0xf [8024f300] ? getnstimeofday+0x3a/0x96 [8024c4f0] ?
Re: gdbstub: packet reply is too long
Daniel Jacobowitz wrote: On Sun, Dec 21, 2008 at 12:44:04AM +0100, Jan Kiszka wrote: And that means setting current_gdbarch while keeping target_gdbarch - that's where reality (existing gdb code) bites us. Again, I'm not arguing against fixing this, I'm arguing in keeping qemu's workaround until this is done. I will look into the gdb part, but one after the other. No, it does not mean setting current_gdbarch different from target_gdbarch. With the current gdbarch set to a 64-bit one that accurately describes the target, GDB should be able to debug code running in 32-bit mode. If it can't, there are simply bugs in GDB to fix. Well, in the current gdb design, current_gdbarch is consulted when disassembling the code while target_gdbarch defines the register set that is exchanged with the remote stub. If you'd like to reach some solution to this problem, which I've seen come up on the QEMU list a half-dozen times now, please describe how you're using GDB on the g...@sourceware.org mailing list and let's see if we can't fix the GDB bugs. I'm pretty sure that any solution is going to involve always transferring the x86-64 register set, though. I'm pretty sure that the final solution will involve extended x86 register sets in order to inform the frontend about the full target CPU state so that it can set the right current_gdbarch automatically. That's one reason (the other is current/target_gdbarch decoupling) why I see no quick bug fix on the gdb side to actually solve the issue and suggest the reintroduction of the qemu workaround until gdb is enhanced appropriately. But you I right, it's time to start a discussion on the gdb list, hopefully laying the ground for a better x86 low-level support. And Maybe I actually miss some smart intermediate step towards this. Jan signature.asc Description: OpenPGP digital signature
Re: [PATCH 15/15] KVM: Fix racy in kvm_free_assigned_irq
On Mon, Dec 29, 2008 at 08:23:28PM +0800, Sheng Yang wrote: On Monday 29 December 2008 13:42:22 Amit Shah wrote: On Sun, Dec 28, 2008 at 07:24:02PM +0800, Sheng Yang wrote: On Sat, Dec 27, 2008 at 06:06:26PM -0200, Marcelo Tosatti wrote: On Fri, Dec 26, 2008 at 10:30:07AM +0800, Sheng Yang wrote: Thanks to Marcelo's observation, The following code have potential issue: if (cancel_work_sync(assigned_dev-interrupt_work)) kvm_put_kvm(kvm); In fact, cancel_work_sync() would return true either work struct is only scheduled or the callback of work struct is executed. This code only consider the former situation. Why not simply drop the reference inc / dec from irq handler/work function? Sorry, I don't know the answer. After checking the code, I also think it's a little strange to increase refernce count here, and I think we won't suppose work_handler can release the kvm struct. At the time of developing that code, this was my observation: I see from the call chain kvm_put_kvm-...-kvm_arch_destroy_vm, no locks are taken to actually destroy the vm. We can't be in ioctls, sure. But shouldn't the mutex be taken to ensure there's nothing else going on while destroying? At least with the workqueue model, we could be called asynchronously in kernel context and I would have held the mutex and about to inject interrupts while everything is being wiped off underneath. However, the workqueue model tries its best to schedule the work on the same CPU, though we can't use that guarantee to ensure things will be fine. --- So I had to get a ref to the current vm till we had any pending work scheduled. I think I put in comments in the code, but sadly most of my comments we stripped out before the merge. Not quite understand... The free assigned device in the destroy path of VM, so as free irq. And we got cancel_work_sync() in free irq which can sync with the execution of scheduled work. 
And now before cancel_work_sync(), we disable the interrupt so that no more schedule work happen again. So after cancel_work_sync(), everything(I think it's irq handler and schedule work here) asynchronously should quiet down. Or I miss something? That's right. As long as you disable the irq and cancel pending work before freeing the data structures those paths use. There is one remaining issue: kvm_assigned_dev_interrupt_work_handler can re-enable the interrupt for KVM_ASSIGNED_DEV_GUEST_MSI case. Perhaps you need a new flag to indicate shutdown (so the host IRQ won't be reenabled). -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: gettimeofday slow in RHEL4 guests
Avi Kivity wrote: Marcelo Tosatti wrote: The tsc clock on older Linux 2.6 kernels compensates for lost ticks. The algorithm uses the PIT count (latched) to measure the delay between interrupt generation and handling, and sums that value, on the next interrupt, to the TSC delta. Sheng investigated this problem in the discussions before in-kernel PIT was merged: http://www.mail-archive.com/kvm-de...@lists.sourceforge.net/msg13873.html The algorithm overcompensates for lost ticks and the guest time runs faster than the hosts. There are two issues: 1) A bug in the in-kernel PIT which miscalculates the count value. 2) For the case where more than one interrupt is lost, and later reinjected, the value read from PIT count is meaningless for the purpose of the tsc algorithm. The count is interpreted as the delay until the next interrupt, which is not the case with reinjection. As Sheng mentioned in the thread above, Xen pulls back the TSC value when reinjecting interrupts. VMWare ESX has a notion of virtual TSC, which I believe is similar in this context. For KVM I believe the best immediate solution (for now) is to provide an option to disable reinjection, behaving similarly to real hardware. The advantage is simplicity compared to virtualizing the time sources. The QEMU PIT emulation has a limit on the rate of interrupt reinjection, perhaps something similar should be investigated in the future. The following patch (which contains the bugfix for 1) and disabled reinjection) fixes the severe time drift on RHEL4 with clock=tsc. What I'm proposing is to condition reinjection with an option (-kvm-pit-no-reinject or something). Comments or better ideas? 
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index e665d1c..608af7b 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -201,13 +201,16 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) if (!atomic_inc_and_test(pt-pending)) set_bit(KVM_REQ_PENDING_TIMER, vcpu0-requests); +if (atomic_read(pt-pending) 1) +atomic_set(pt-pending, 1); + Replace the atomic_inc() with atomic_set(, 1) instead? One less test, and more important, the logic is scattered less around the source. But having only a pending bit instead of a counter will cause kvm to drop pit irqs on rare high load situations. The disable reinjection option is better. if (vcpu0 waitqueue_active(vcpu0-wq)) wake_up_interruptible(vcpu0-wq); hrtimer_add_expires_ns(pt-timer, pt-period); pt-scheduled = hrtimer_get_expires_ns(pt-timer); if (pt-period) -ps-channels[0].count_load_time = hrtimer_get_expires(pt-timer); +ps-channels[0].count_load_time = ktime_get(); return (pt-period == 0 ? 0 : 1); } I don't like the idea of punting to the user but looks like we don't have a choice. Hopefully vendors will port kvmclock to these kernels and release them as updates -- time simply doesn't work will with virtualization, especially Linux guests. Except for these 'tsc compensate' guest, what are the occasions where the guest writes his tsc? If this is the only case we can disable reinjection once we trap tsc writes. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: gettimeofday slow in RHEL4 guests
Dor Laor wrote: +if (atomic_read(pt-pending) 1) +atomic_set(pt-pending, 1); + Replace the atomic_inc() with atomic_set(, 1) instead? One less test, and more important, the logic is scattered less around the source. But having only a pending bit instead of a counter will cause kvm to drop pit irqs on rare high load situations. The disable reinjection option is better. Both variants disable reinjection. Forcing a counter to 1 every time it exceeds 1 is equivalent to maintaining a bit. In both variants, there is a missing 'if (disable_reinjection)' (Marcelo mentioned this in the original message). Except for these 'tsc compensate' guest, what are the occasions where the guest writes his tsc? If this is the only case we can disable reinjection once we trap tsc writes. I don't think these guests write to the tsc. Rather, they read the tsc and the pit counters and try to correlate. And fail. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: gettimeofday slow in RHEL4 guests
Dor Laor wrote: +if (atomic_read(pt-pending) 1) +atomic_set(pt-pending, 1); + Replace the atomic_inc() with atomic_set(, 1) instead? One less test, and more important, the logic is scattered less around the source. But having only a pending bit instead of a counter will cause kvm to drop pit irqs on rare high load situations. The disable reinjection option is better. Both variants disable reinjection. Forcing a counter to 1 every time it exceeds 1 is equivalent to maintaining a bit. In both variants, there is a missing 'if (disable_reinjection)' (Marcelo mentioned this in the original message). Except for these 'tsc compensate' guest, what are the occasions where the guest writes his tsc? If this is the only case we can disable reinjection once we trap tsc writes. I don't think these guests write to the tsc. Rather, they read the tsc and the pit counters and try to correlate. And fail. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 1/2] KVM: PIT: fix i8254 pending count read
count_load_time assignment is bogus: it's supposed to contain what it means, not the expiration time. Signed-off-by: Marcelo Tosatti mtosa...@redhat.com Index: kvm/arch/x86/kvm/i8254.c === --- kvm.orig/arch/x86/kvm/i8254.c +++ kvm/arch/x86/kvm/i8254.c @@ -207,7 +207,7 @@ static int __pit_timer_fn(struct kvm_kpi hrtimer_add_expires_ns(pt->timer, pt->period); pt->scheduled = hrtimer_get_expires_ns(pt->timer); if (pt->period) - ps->channels[0].count_load_time = hrtimer_get_expires(pt->timer); + ps->channels[0].count_load_time = ktime_get(); return (pt->period == 0 ? 0 : 1); } -- -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 0/2] PIT: optionally disable interrupt reinjection
-- -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 2/2] KVM: PIT: provide an option to disable interrupt reinjection
Certain clocks (such as TSC) in older 2.6 guests overaccount for lost ticks, causing severe time drift. Interrupt reinjection magnifies the problem. Provide an option to disable it. Signed-off-by: Marcelo Tosatti mtosa...@redhat.com Index: kvm/arch/x86/kvm/i8254.c === --- kvm.orig/arch/x86/kvm/i8254.c +++ kvm/arch/x86/kvm/i8254.c @@ -201,6 +201,9 @@ static int __pit_timer_fn(struct kvm_kpi if (!atomic_inc_and_test(pt-pending)) set_bit(KVM_REQ_PENDING_TIMER, vcpu0-requests); + if (pt-no_reinject) + atomic_set(pt-pending, 1); + if (vcpu0 waitqueue_active(vcpu0-wq)) wake_up_interruptible(vcpu0-wq); Index: kvm/arch/x86/kvm/i8254.h === --- kvm.orig/arch/x86/kvm/i8254.h +++ kvm/arch/x86/kvm/i8254.h @@ -9,6 +9,7 @@ struct kvm_kpit_timer { s64 period; /* unit: ns */ s64 scheduled; atomic_t pending; + bool no_reinject; }; struct kvm_kpit_channel_state { Index: kvm/arch/x86/kvm/x86.c === --- kvm.orig/arch/x86/kvm/x86.c +++ kvm/arch/x86/kvm/x86.c @@ -991,6 +991,7 @@ int kvm_dev_ioctl_check_extension(long e case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: case KVM_CAP_SYNC_MMU: + case KVM_CAP_PIT_NO_REINJECT: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1723,6 +1724,12 @@ static int kvm_vm_ioctl_set_pit(struct k return r; } +static int kvm_vm_ioctl_no_reinject(struct kvm *kvm) +{ + kvm-arch.vpit-pit_state.pit_timer.no_reinject = true; + return 0; +} + /* * Get (and clear) the dirty memory log for a memory slot. 
*/ @@ -1920,6 +1927,16 @@ long kvm_arch_vm_ioctl(struct file *filp r = 0; break; } + case KVM_PIT_NO_REINJECT: { + r = -ENXIO; + if (!kvm-arch.vpit) + goto out; + r = kvm_vm_ioctl_no_reinject(kvm); + if (r) + goto out; + r = 0; + break; + } default: ; } Index: kvm/include/linux/kvm.h === --- kvm.orig/include/linux/kvm.h +++ kvm/include/linux/kvm.h @@ -396,6 +396,9 @@ struct kvm_trace_rec { #if defined(CONFIG_X86) #define KVM_CAP_SET_GUEST_DEBUG 23 #endif +#if defined(CONFIG_X86) +#define KVM_CAP_PIT_NO_REINJECT 24 +#endif /* * ioctls for VM fds @@ -429,6 +432,7 @@ struct kvm_trace_rec { struct kvm_assigned_pci_dev) #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \ struct kvm_assigned_irq) +#define KVM_PIT_NO_REINJECT_IO(KVMIO, 0x71) /* * ioctls for vcpu fds -- -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 1/2] libkvm: pit not reinject support
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com Index: kvm-userspace.pit/libkvm/libkvm-x86.c === --- kvm-userspace.pit.orig/libkvm/libkvm-x86.c +++ kvm-userspace.pit/libkvm/libkvm-x86.c @@ -75,6 +75,20 @@ int kvm_create_pit(kvm_context_t kvm) return 0; } +int kvm_pit_no_reinjection(kvm_context_t kvm) +{ +#ifdef KVM_CAP_PIT_NO_REINJECT +int r; + +r = ioctl(kvm->fd, KVM_CHECK_EXTENSION, KVM_CAP_PIT_NO_REINJECT); +if (r > 0) { +r = ioctl(kvm->vm_fd, KVM_PIT_NO_REINJECT); +return r; +} +#endif +return -1; +} + int kvm_arch_create(kvm_context_t kvm, unsigned long phys_mem_bytes, void **vm_mem) { Index: kvm-userspace.pit/libkvm/libkvm.h === --- kvm-userspace.pit.orig/libkvm/libkvm.h +++ kvm-userspace.pit/libkvm/libkvm.h @@ -648,6 +648,8 @@ int kvm_set_pit(kvm_context_t kvm, struc #endif +int kvm_pit_no_reinjection(kvm_context_t kvm); + #ifdef KVM_CAP_VAPIC /*! -- -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 0/2] QEMU/KVM: PIT no interrupt reinjection support
Userspace support for KVM_PIT_NO_REINJECT. -- -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 2/2] QEMU/KVM: provide an option to disable in-kernel PIT int reinjection
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com Index: kvm-userspace.pit/qemu/qemu-kvm.c === --- kvm-userspace.pit.orig/qemu/qemu-kvm.c +++ kvm-userspace.pit/qemu/qemu-kvm.c @@ -11,6 +11,7 @@ int kvm_allowed = 1; int kvm_irqchip = 1; int kvm_pit = 1; +int kvm_pit_no_reinject = 0; int kvm_nested = 0; #include assert.h @@ -795,6 +796,12 @@ int kvm_qemu_create_context(void) r = kvm_arch_qemu_create_context(); if(r 0) kvm_qemu_destroy(); +if (kvm_pit_no_reinject) { +if (kvm_pit_no_reinjection(kvm_context)) { +fprintf(stderr, failure to disable in-kernel PIT reinjection\n); +return -1; +} +} #ifdef TARGET_I386 destroy_region_works = kvm_destroy_memory_region_works(kvm_context); #endif Index: kvm-userspace.pit/qemu/vl.c === --- kvm-userspace.pit.orig/qemu/vl.c +++ kvm-userspace.pit/qemu/vl.c @@ -4071,6 +4071,7 @@ static void help(int exitcode) #endif -no-kvm-irqchip disable KVM kernel mode PIC/IOAPIC/LAPIC\n -no-kvm-pit disable KVM kernel mode PIT\n + -kvm-pit-no-reinject disable KVM kernel mode PIT interrupt reinjection\n -enable-nesting enable support for running a VM inside the VM (AMD only)\n #if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(TARGET_IA64) || defined(__linux__) -pcidevice host=bus:dev.func[,dma=none][,name=string]\n @@ -4202,6 +4203,7 @@ enum { QEMU_OPTION_no_kvm, QEMU_OPTION_no_kvm_irqchip, QEMU_OPTION_no_kvm_pit, +QEMU_OPTION_kvm_pit_no_reinject, #if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(TARGET_IA64) || defined(__linux__) QEMU_OPTION_pcidevice, #endif @@ -4298,6 +4300,7 @@ static const QEMUOption qemu_options[] = #endif { no-kvm-irqchip, 0, QEMU_OPTION_no_kvm_irqchip }, { no-kvm-pit, 0, QEMU_OPTION_no_kvm_pit }, +{ kvm-pit-no-reinject, 0, QEMU_OPTION_kvm_pit_no_reinject }, { enable-nesting, 0, QEMU_OPTION_enable_nesting }, #if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(TARGET_IA64) || defined(__linux__) { pcidevice, HAS_ARG, QEMU_OPTION_pcidevice }, @@ -5267,6 +5270,11 @@ int main(int argc, char 
**argv, char **e kvm_pit = 0; break; } +case QEMU_OPTION_kvm_pit_no_reinject: { +extern int kvm_pit_no_reinject; +kvm_pit_no_reinject = 1; +break; +} case QEMU_OPTION_enable_nesting: { kvm_nested = 1; break; -- -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Serial ATA Support - will it come?
Let's back up here a bit -- When you say no disk was found at all when using SCSI emulation, do you mean the kernel booted and was unable to find the hard drives, or did it not even get that far? The SCSI emulation uses the sym53c8xx driver, which was first developed against 2.0.36; it's been in the kernel long enough that I'm certain whatever kernel you're using supports it (or it wouldn't run a modern userland either). I'm guessing you're running a monolithic kernel with only the drivers you need compiled in, instead of any vendor kernel (as any and every vendor kernel would have this driver available). Frankly, if you want to allow your users to boot their machines on other than your usual hardware, it's only sensible to have a kernel that supports the secondary hardware as well as what you normally provision them -- whether this secondary hardware is physical or virtual. That said, there's a mechanism by which you can cheat: Boot the guest with an externally-provided kernel using -kernel and -append options to kvm. This isn't ideal -- you're no longer going through the guest's bootloader and so lose any settings included there -- but should be good enough for a rescue environment. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] hook cpu running at a higher level.
This patch removes the kvm_enabled() check from cpu-exec.c. This file is highly tcg-specific, and we'll probably want it out when tcg is not compiled in (coming soon, in a theathe near you) Instead, we hook at the main loop level. The amount of code duplication introduced is at worst, acceptable, and I believe it pays. The tcg mainloop is likely to be different from the hypervisors ones, since tcg runs all cpus in lockstep. KVM (and probably xen), will be able to span threads out for its vcpus. Signed-off-by: Glauber Costa glom...@redhat.com --- cpu-exec.c |5 - kvm-all.c | 50 ++ vl.c | 16 ++-- 3 files changed, 60 insertions(+), 11 deletions(-) diff --git a/cpu-exec.c b/cpu-exec.c index ed1545b..be8ceac 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -333,11 +333,6 @@ int cpu_exec(CPUState *env1) } #endif -if (kvm_enabled()) { -kvm_cpu_exec(env); -longjmp(env-jmp_env, 1); -} - next_tb = 0; /* force lookup of first TB */ for(;;) { interrupt_request = env-interrupt_request; diff --git a/kvm-all.c b/kvm-all.c index 11034df..a279d6c 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -663,3 +663,53 @@ int kvm_has_sync_mmu(void) return 0; } + +extern CPUState *cur_cpu; +extern CPUState *next_cpu; + +extern int reset_requested; +extern int shutdown_requested; +extern int powerdown_requested; + +int kvm_main_loop(void) +{ + +int ret, timeout; +CPUState *env; + +cur_cpu = first_cpu; +next_cpu = first_cpu; +env = first_cpu; + +for(;;) { +/* get next cpu */ +cpu_single_env = env; +ret = kvm_cpu_exec(env); +cpu_single_env = NULL; + +if (shutdown_requested) +break; +if (reset_requested) { +reset_requested = 0; +qemu_system_reset(); +ret = EXCP_INTERRUPT; +} +if (powerdown_requested) { +powerdown_requested = 0; +qemu_system_powerdown(); +ret = EXCP_INTERRUPT; +} + +if (ret == EXCP_HALTED) { +timeout = 5000; +} else { +timeout = 0; +} + +main_loop_wait(timeout); +} +cpu_disable_ticks(); +return ret; +} + + diff --git a/vl.c b/vl.c index 0a02151..bcaccc3 100644 --- a/vl.c +++ b/vl.c @@ -248,8 
+248,8 @@ static struct drive_opt { char opt[1024]; } drives_opt[MAX_DRIVES]; -static CPUState *cur_cpu; -static CPUState *next_cpu; +CPUState *cur_cpu; +CPUState *next_cpu; static int event_pending = 1; /* Conversion factor from emulated instructions to virtual clock ticks. */ static int icount_time_shift; @@ -3452,9 +3452,9 @@ typedef struct QEMUResetEntry { } QEMUResetEntry; static QEMUResetEntry *first_reset_entry; -static int reset_requested; -static int shutdown_requested; -static int powerdown_requested; +int reset_requested; +int shutdown_requested; +int powerdown_requested; int qemu_shutdown_requested(void) { @@ -5535,7 +5535,11 @@ int main(int argc, char **argv, char **envp) close(fd); } -main_loop(); +if (kvm_enabled()) +kvm_main_loop(); +else +main_loop(); + quit_timers(); net_cleanup(); -- 1.5.6.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: how increase/decrease ram on running vm ?
Василец Дмитрий wrote: Xen hypervisor can increase and decrease cpu and ram. will be able this function on kvm/qemu ? Last time I was familiar with Xen's balloon driver, it had the exact same limitation as the kvm one. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Nested KVM
On Wed, Dec 24, 2008 at 4:20 AM, Alexander Graf ag...@suse.de wrote: Ugh. Looks like the emulation part is still broken :-(. Please use the attached patch to disable the emulation optimization for now. Avi, could you please apply that patch for kvm-82 too, so we get something working out? I'll take a closer look at what's broken exactly later on. Alex So I am working with the latest git, from today. The emulation error went away and the nested KVM guest partially works. The errors that I am seeing late in the normal guest boot (which seem non-fatal) are: Dec 29 18:33:31 amdbox kernel: [ 1060.446054] bad partial csum: csum=5888/5694 len=80 Dec 29 18:33:33 amdbox kernel: [ 1061.934164] bad partial csum: csum=5888/5694 len=80 Dec 29 18:33:33 amdbox kernel: [ 1062.170127] bad partial csum: csum=5888/5694 len=60 Dec 29 18:33:34 amdbox kernel: [ 1063.419124] bad partial csum: csum=5888/5694 len=270 Dec 29 18:33:35 amdbox kernel: [ 1063.667817] bad partial csum: csum=5888/5694 len=270 Dec 29 18:33:35 amdbox kernel: [ 1063.927839] bad partial csum: csum=5888/5694 len=270 Dec 29 18:33:35 amdbox kernel: [ 1064.126336] bad partial csum: csum=5888/5694 len=252 Dec 29 18:33:35 amdbox kernel: [ 1064.274429] bad partial csum: csum=5888/5694 len=152 Dec 29 18:33:35 amdbox kernel: [ 1064.522702] bad partial csum: csum=5888/5694 len=152 Dec 29 18:33:36 amdbox kernel: [ 1064.776290] bad partial csum: csum=5888/5694 len=152 Dec 29 18:33:38 amdbox kernel: [ 1067.309123] __ratelimit: 4 callbacks suppressed Dec 29 18:33:38 amdbox kernel: [ 1067.309126] bad partial csum: csum=5888/5694 len=252 Dec 29 18:33:39 amdbox kernel: [ 1068.160737] bad partial csum: csum=5888/5694 len=241 Dec 29 18:33:41 amdbox kernel: [ 1070.170049] bad partial csum: csum=5888/5694 len=60 After that I am able to start the nested guest with: sudo qemu-system-x86_64 -hda ubuntu-server.img -cdrom Desktop/ubuntu-8.10-server-amd64.iso The nested guest also has the latest git checkout The nested guest shows the Ubuntu 
install CD welcome and selecting a language and starting the boot process starts a very little bit and the screen goes black. The nested guest doesn't crash, but becomes very unresponsive, can't ping it, can't ssh, etc. It seems like it only runs for a short time before it becomes unresponsive (less than 30 seconds). I can attach to the qemu-system-x86_64 (gdb) where #0 0x7fa8cc4a1482 in select () from /lib/libc.so.6 #1 0x00408bcb in main_loop_wait (timeout=0) at /backup/src/kvm-src/kvm-userspace/qemu/vl.c:3617 #2 0x005160fa in kvm_main_loop () at /backup/src/kvm-src/kvm-userspace/qemu/qemu-kvm.c:599 #3 0x0040d106 in main (argc=value optimized out, argv=0x7fffd58e9f48, envp=value optimized out) at /backup/src/kvm-src/kvm-userspace/qemu/vl.c:3779 After some time, the qemu-system-x86_64 process starts to take between 97 and 100% of the CPU. The base system is still running OK, but no new messages are printed in /var/log/syslog I am sure there are more KVM debugging tricks Any suggestions? Thanks, Todd -- Todd Deshane http://todddeshane.net http://runningxen.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH][v2] kvm-userspace: Load PCI option ROMs
Avi Kivity wrote: Liu, Kechao wrote: How is the last bit performed on real hardware? Obviously the ROM can't have devfn embedded. On a real hardware, BIOS scans PCI devices, loads ROMs and it can get devices' devfns. Here, in an easier way, we load option ROMs in QEMU and thus need to store and pass the devfns. Well, it may make sense to provide the ROMs as virtual PCI BARs, and have the bios do the work. This way, if some driver relies on remapping the BAR (graphic cards?), it can still work. I do not quite understand this. Can you elaborate? Best Regards Shan Haitao-- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 15/15] KVM: Fix racy in kvm_free_assigned_irq
On Monday 29 December 2008 23:20:57 Marcelo Tosatti wrote: On Mon, Dec 29, 2008 at 08:23:28PM +0800, Sheng Yang wrote: On Monday 29 December 2008 13:42:22 Amit Shah wrote: On Sun, Dec 28, 2008 at 07:24:02PM +0800, Sheng Yang wrote: On Sat, Dec 27, 2008 at 06:06:26PM -0200, Marcelo Tosatti wrote: On Fri, Dec 26, 2008 at 10:30:07AM +0800, Sheng Yang wrote: Thanks to Marcelo's observation, The following code have potential issue: if (cancel_work_sync(assigned_dev-interrupt_work)) kvm_put_kvm(kvm); In fact, cancel_work_sync() would return true either work struct is only scheduled or the callback of work struct is executed. This code only consider the former situation. Why not simply drop the reference inc / dec from irq handler/work function? Sorry, I don't know the answer. After checking the code, I also think it's a little strange to increase refernce count here, and I think we won't suppose work_handler can release the kvm struct. At the time of developing that code, this was my observation: I see from the call chain kvm_put_kvm-...-kvm_arch_destroy_vm, no locks are taken to actually destroy the vm. We can't be in ioctls, sure. But shouldn't the mutex be taken to ensure there's nothing else going on while destroying? At least with the workqueue model, we could be called asynchronously in kernel context and I would have held the mutex and about to inject interrupts while everything is being wiped off underneath. However, the workqueue model tries its best to schedule the work on the same CPU, though we can't use that guarantee to ensure things will be fine. --- So I had to get a ref to the current vm till we had any pending work scheduled. I think I put in comments in the code, but sadly most of my comments we stripped out before the merge. Not quite understand... The free assigned device in the destroy path of VM, so as free irq. And we got cancel_work_sync() in free irq which can sync with the execution of scheduled work. 
And now before cancel_work_sync(), we disable the interrupt so that no more schedule work happen again. So after cancel_work_sync(), everything(I think it's irq handler and schedule work here) asynchronously should quiet down. Or I miss something? Thats right. As long as you disable the irq and cancel pending work before freeing the data structures those paths use. There is one remaining issue: kvm_assigned_dev_interrupt_work_handler can re-enable the interrupt for KVM_ASSIGNED_DEV_GUEST_MSI case. Perhaps you need a new flag to indicate shutdown (so the host IRQ won't be reenabled). Is it already covered by disable_irq_no_sync() before cancel_work_sync()? I've noted this in my comment: the irq may be disabled nested(once for MSI and twice for INTx), but I think it's fine for we're going to free it. -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 2/2] QEMU/KVM: provide an option to disable in-kernel PIT int reinjection
On Tuesday 30 December 2008 01:42:35 Marcelo Tosatti wrote: Signed-off-by: Marcelo Tosatti mtosa...@redhat.com -- #if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(TARGET_IA64) || defined(__linux__) { pcidevice, HAS_ARG, QEMU_OPTION_pcidevice }, @@ -5267,6 +5270,11 @@ int main(int argc, char **argv, char **e kvm_pit = 0; break; } +case QEMU_OPTION_kvm_pit_no_reinject: { +extern int kvm_pit_no_reinject; +kvm_pit_no_reinject = 1; +break; +} case QEMU_OPTION_enable_nesting: { kvm_nested = 1; break; Do we need check the conflict of --kvm-pit-no-reinject and --no-kvm-pit/--no- kvm-irqchip? Check it ahead of kvm_qemu_create_context() seems a little better... -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: __purge_vmap_area_lazy crash with CONFIG_PREEMPT_RCU=y
On Tuesday 30 December 2008 01:58:21 Marcelo Tosatti wrote: On Wed, Dec 24, 2008 at 04:28:44PM +0100, Andrea Arcangeli wrote: On Wed, Dec 24, 2008 at 02:50:57PM +0200, Avi Kivity wrote: Marcelo Tosatti wrote: The destructor for huge pages uses the backing inode for adjusting hugetlbfs accounting. Hugepage mappings are destroyed by exit_mmap, after mmu_notifier_release, so there are no notifications through unmap_hugepage_range at this point. The hugetlbfs inode can be freed with pages backed by it referenced by the shadow. When the shadow releases its reference, the huge page destructor will access a now freed inode. Implement the release operation for kvm mmu notifiers to release page refs before the hugetlbfs inode is gone. I see this isn't it. Andrea, comments? Yeah, the patch looks good, I talked a bit with Marcelo about this by PM. The issue is that it's not as strightforward as it seems, basically when I implemented the -release handlers and had sptes teardown running before the files were closed (instead of waiting the kvm anon inode release handler to fire) I was getting bugchecks from debug options including preempt=y (certain debug checks only becomes functional with preempt enabled unfortunately), so eventually I removed -release because for kvm -release wasn't useful because no guest mode can run any more by the time mmu notifier -release is invoked, and that avoided the issues with the bugchecks. We'll be using the mmu notifiers -release because it's always called just before the filehandle are destroyed, it's not really about the guest mode or secondary mmu but just an ordering issue with hugetlbfs internals. So in short if no bugcheck triggers this is fine (at least until hugetlbfs provides a way to register some callback to invoke at the start of the hugetlbfs-release handler). 
The only bugcheck I see, which triggers on vanilla kvm upstream with CONFIG_PREEMPT_DEBUG=y and CONFIG_PREEMPT_RCU=y is: general protection fault: [#1] PREEMPT SMP DEBUG_PAGEALLOC4ttyS1: 1 input overrun(s) last sysfs file: /sys/class/net/tap0/address CPU 0 Modules linked in: tun ipt_MASQUERADE iptable_nat nf_nat bridge stp llc nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack xt_tcpudp ipt_REJECT iptable_filter ip_tables x_tables dm_multipath kvm_intel kvm scsi_wait_scan ata_piix libata dm_snapshot dm_zero dm_mirror dm_region_hash dm_log dm_mod shpchp pci_hotplug mptsas mptscsih mptbase scsi_transport_sas uhci_hcd ohci_hcd ehci_hcd Pid: 4768, comm: qemu-system-x86 Not tainted 2.6.28-00165-g4f27e3e-dirty #164 RIP: 0010:[8028a5b6] [8028a5b6] __purge_vmap_area_lazy+0x12c/0x163 RSP: 0018:88021e1f9a38 EFLAGS: 00010202 RAX: 6b6b6b6b6b6b6b6b RBX: 6b6b6b6b6b6b6b2b RCX: 0003 RDX: 80a1dae0 RSI: 880028083980 RDI: 0001 RBP: 88021e1f9a78 R08: 0286 R09: 80a1bf50 R10: 880119c270f8 R11: 88021e1f99b8 R12: 88021e1f9a38 R13: 88021e1f9a90 R14: 88021e1f9a98 R15: 813a FS: () GS:8080d900() knlGS: CS: 0010 DS: 002b ES: 002b CR0: 8005003b CR2: 008d9828 CR3: 00201000 CR4: 26e0 DR0: DR1: DR2: DR3: DR6: 0ff0 DR7: 0400 Process qemu-system-x86 (pid: 4768, threadinfo 88021e1f8000, task 880119c270f8) Stack: 88022bdfd840 880119da11b8 c20011c3 813a 0001 88022ec11c18 88022f061838 88021e1f9aa8 8028ab1d 88021e1f9aa8 c20021976000 Call Trace: [8028ab1d] free_unmap_vmap_area_noflush+0x69/0x70 [8028ab49] remove_vm_area+0x25/0x71 [8028ac54] __vunmap+0x3a/0xca [8028ad35] vfree+0x29/0x2b [a00f98a3] kvm_free_physmem_slot+0x25/0x7c [kvm] [a00f9d75] kvm_free_physmem+0x27/0x36 [kvm] [a00fccb4] kvm_arch_destroy_vm+0xa6/0xda [kvm] [a00f9e11] kvm_put_kvm+0x8d/0xa7 [kvm] [a00fa0e2] kvm_vcpu_release+0x13/0x17 [kvm] [802a1c07] __fput+0xeb/0x1a3 [802a1cd4] fput+0x15/0x17 [8029f26c] filp_close+0x67/0x72 [802378a8] put_files_struct+0x74/0xc8 [80237943] exit_files+0x47/0x4f [80238fe5] do_exit+0x1eb/0x7a7 [80587edf] ? 
_spin_unlock_irq+0x2b/0x51 [80239614] do_group_exit+0x73/0xa0 [80242b10] get_signal_to_deliver+0x30c/0x32c [8020b4d5] ? sysret_signal+0x19/0x29 [8020a80f] do_notify_resume+0x8c/0x851 [8025b811] ? do_futex+0x90/0x92a [80256bd7] ? trace_hardirqs_on_caller+0xf0/0x114 [80587f51] ? _spin_unlock_irqrestore+0x4c/0x68 [8026be5c] ? __rcu_read_unlock+0x92/0x9e [80256bd7] ? trace_hardirqs_on_caller+0xf0/0x114
[0/3][RESEND] Device assignment code clean up and MSI disable support
I split the former patchset into 3 smaller ones. Here is the first one. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] KVM: Add MSI_ACTION flag for assigned irq
For MSI disable feature later. Notice I changed ABI here, but due to no userspace patch, I think it's OK. Signed-off-by: Sheng Yang sh...@linux.intel.com --- include/linux/kvm.h |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 42f51dc..c24f207 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -546,6 +546,7 @@ struct kvm_assigned_irq { #define KVM_DEV_ASSIGN_ENABLE_IOMMU(1 0) -#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI (1 0) +#define KVM_DEV_IRQ_ASSIGN_MSI_ACTION (1 0) +#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI (1 1) #endif -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/3] KVM: Add support to disable MSI for assigned device
MSI is always enabled by default for msi2intx=1. But if msi2intx=0, we have to disable MSI if guest require to do so. The patch also discard unnecessary msi2intx judgment if guest want to update MSI state. Signed-off-by: Sheng Yang sh...@linux.intel.com --- virt/kvm/kvm_main.c | 12 ++-- 1 files changed, 10 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cd84b3e..111738b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -328,6 +328,15 @@ static int assigned_device_update_msi(struct kvm *kvm, adev-irq_requested_type = ~KVM_ASSIGNED_DEV_GUEST_MSI; adev-guest_irq = airq-guest_irq; adev-ack_notifier.gsi = airq-guest_irq; + } else { + /* +* Guest require to disable device MSI, we disable MSI and +* re-enable INTx by default again. Notice it's only for +* non-msi2intx. +*/ + kvm_free_assigned_irq(kvm, adev); + assigned_device_update_intx(kvm, adev, airq); + return 0; } if (adev-irq_requested_type KVM_ASSIGNED_DEV_HOST_MSI) @@ -399,8 +408,7 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, } } - if ((!msi2intx -(assigned_irq-flags KVM_DEV_IRQ_ASSIGN_ENABLE_MSI)) || + if ((assigned_irq-flags KVM_DEV_IRQ_ASSIGN_MSI_ACTION) || (msi2intx match-dev-msi_enabled)) { #ifdef CONFIG_X86 r = assigned_device_update_msi(kvm, match, assigned_irq); -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] KVM: Use kvm_free_assigned_irq() for free irq
Which is more convenient... Signed-off-by: Sheng Yang sh...@linux.intel.com --- virt/kvm/kvm_main.c | 10 ++ 1 files changed, 2 insertions(+), 8 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ffd261d..cd84b3e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -284,11 +284,7 @@ static int assigned_device_update_intx(struct kvm *kvm, return 0; if (irqchip_in_kernel(kvm)) { - if (!msi2intx - adev-irq_requested_type KVM_ASSIGNED_DEV_HOST_MSI) { - free_irq(adev-host_irq, (void *)kvm); - pci_disable_msi(adev-dev); - } + kvm_free_assigned_irq(kvm, adev); if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -339,9 +335,7 @@ static int assigned_device_update_msi(struct kvm *kvm, if (irqchip_in_kernel(kvm)) { if (!msi2intx) { - if (adev-irq_requested_type - KVM_ASSIGNED_DEV_HOST_INTX) - free_irq(adev-host_irq, (void *)adev); + kvm_free_assigned_irq(kvm, adev); r = pci_enable_msi(adev-dev); if (r) -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/10][v3] GSI-MSG route layer for MSI/MSI-X
Update from v2: Add gsi_msg_pending_bitmap, in order to support MSI-X multiple interrupts. And this one depends on the former Device assignment code clean up and MSI disable support patchset. -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/10] KVM: Improve MSI dispatch function
Prepare to merge with kvm_set_irq(). Signed-off-by: Sheng Yang sh...@linux.intel.com --- virt/kvm/kvm_main.c |8 1 files changed, 4 insertions(+), 4 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3494861..599257e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -87,7 +87,7 @@ static bool kvm_rebooting; #ifdef KVM_CAP_DEVICE_ASSIGNMENT #ifdef CONFIG_X86 -static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) +static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev, u32 gsi) { int vcpu_id; struct kvm_vcpu *vcpu; @@ -99,7 +99,7 @@ static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) BUG_ON(!ioapic); mutex_lock(dev-kvm-gsi_msg_lock); - gsi_msg = kvm_find_gsi_msg(dev-kvm, dev-guest_irq); + gsi_msg = kvm_find_gsi_msg(dev-kvm, gsi); if (!gsi_msg) { printk(KERN_WARNING kvm: fail to find correlated gsi_msg\n); return; @@ -143,7 +143,7 @@ static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) } } #else -static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) {} +static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev, u32 gsi) {} #endif static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, @@ -178,7 +178,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) assigned_dev-guest_irq, 1); else if (assigned_dev-irq_requested_type KVM_ASSIGNED_DEV_GUEST_MSI) { - assigned_device_msi_dispatch(assigned_dev); + assigned_device_msi_dispatch(assigned_dev, assigned_dev-guest_irq); enable_irq(assigned_dev-host_irq); assigned_dev-host_irq_disabled = false; } -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 04/10] KVM: Using ioapic_irqchip() macro for kvm_set_irq
Signed-off-by: Sheng Yang sh...@linux.intel.com --- virt/kvm/irq_comm.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index abfab46..47243ef 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -39,7 +39,7 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) * IOAPIC. So set the bit in both. The guest will ignore * writes to the unused one. */ - kvm_ioapic_set_irq(kvm-arch.vioapic, irq, !!(*irq_state)); + kvm_ioapic_set_irq(ioapic_irqchip(kvm), irq, !!(*irq_state)); #ifdef CONFIG_X86 kvm_pic_set_irq(pic_irqchip(kvm), irq, !!(*irq_state)); #endif -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 10/10] KVM: bit ops for deliver_bitmap
It's also convenient when we extend KVM supported vcpu number in the future. Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/lapic.c |7 --- virt/kvm/ioapic.c| 24 +--- virt/kvm/irq_comm.c | 16 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c1e4935..359e02c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -477,9 +477,10 @@ static void apic_send_ipi(struct kvm_lapic *apic) struct kvm_vcpu *target; struct kvm_vcpu *vcpu; - unsigned long lpr_map = 0; + DECLARE_BITMAP(lpr_map, KVM_MAX_VCPUS); int i; + bitmap_zero(lpr_map, KVM_MAX_VCPUS); apic_debug(icr_high 0x%x, icr_low 0x%x, short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n, @@ -494,7 +495,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) if (vcpu-arch.apic apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { if (delivery_mode == APIC_DM_LOWEST) - set_bit(vcpu-vcpu_id, lpr_map); + set_bit(vcpu-vcpu_id, lpr_map); else __apic_accept_irq(vcpu-arch.apic, delivery_mode, vector, level, trig_mode); @@ -502,7 +503,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) } if (delivery_mode == APIC_DM_LOWEST) { - target = kvm_get_lowest_prio_vcpu(vcpu-kvm, vector, lpr_map); + target = kvm_get_lowest_prio_vcpu(vcpu-kvm, vector, lpr_map); if (target != NULL) __apic_accept_irq(target-arch.apic, delivery_mode, vector, level, trig_mode); diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 164a746..bf83f5e 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -195,7 +195,7 @@ void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) { union kvm_ioapic_redirect_entry entry = ioapic-redirtbl[irq]; - unsigned long deliver_bitmask; + DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS); struct kvm_vcpu *vcpu; int vcpu_id, r = 0; @@ -205,22 +205,24 @@ static int ioapic_deliver(struct kvm_ioapic 
*ioapic, int irq) entry.fields.delivery_mode, entry.fields.vector, entry.fields.trig_mode); - kvm_get_intr_delivery_bitmask(ioapic, entry, deliver_bitmask); - if (!deliver_bitmask) { - ioapic_debug(no target on destination\n); - return 0; - } + bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS); /* Always delivery PIT interrupt to vcpu 0 */ #ifdef CONFIG_X86 if (irq == 0) - deliver_bitmask = 1 0; + set_bit(0, deliver_bitmask); + else #endif + kvm_get_intr_delivery_bitmask(ioapic, entry, deliver_bitmask); + + if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) = KVM_MAX_VCPUS) { + ioapic_debug(no target on destination\n); + return 0; + } - for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { - if (!(deliver_bitmask (1 vcpu_id))) - continue; - deliver_bitmask = ~(1 vcpu_id); + while ((vcpu_id = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS)) +KVM_MAX_VCPUS) { + clear_bit(vcpu_id, deliver_bitmask); vcpu = ioapic-kvm-vcpus[vcpu_id]; if (vcpu) { if (entry.fields.delivery_mode == diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index e74d679..ecda2c1 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -42,7 +42,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, case IOAPIC_LOWEST_PRIORITY: vcpu = kvm_get_lowest_prio_vcpu(ioapic-kvm, entry-fields.vector, deliver_bitmask); - *deliver_bitmask = 1 vcpu-vcpu_id; + set_bit(vcpu-vcpu_id, deliver_bitmask); break; case IOAPIC_FIXED: case IOAPIC_NMI: @@ -63,11 +63,12 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 gsi, int level) struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); struct kvm_gsi_msg *gsi_msg; union kvm_ioapic_redirect_entry entry; - unsigned long deliver_bitmask; + DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS); BUG_ON(!ioapic); #endif + bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS); if (!(gsi KVM_GSI_MSG_MASK)) { int irq = gsi; @@ -111,16 +112,15 @@ void
[PATCH 08/10] KVM: Change API of kvm_ioapic_get_delivery_bitmask
In order to use with bit ops. Signed-off-by: Sheng Yang sh...@linux.intel.com --- virt/kvm/ioapic.c | 17 - virt/kvm/ioapic.h |4 ++-- virt/kvm/irq_comm.c |5 +++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index af9f5de..ebd5ba6 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -153,22 +153,22 @@ static void ioapic_inj_nmi(struct kvm_vcpu *vcpu) kvm_vcpu_kick(vcpu); } -u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, - u8 dest_mode) +void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, +u8 dest_mode, u32 *mask) { - u32 mask = 0; int i; struct kvm *kvm = ioapic-kvm; struct kvm_vcpu *vcpu; ioapic_debug(dest %d dest_mode %d\n, dest, dest_mode); + *mask = 0; if (dest_mode == 0) { /* Physical mode. */ if (dest == 0xFF) { /* Broadcast. */ for (i = 0; i KVM_MAX_VCPUS; ++i) if (kvm-vcpus[i] kvm-vcpus[i]-arch.apic) - mask |= 1 i; - return mask; + *mask |= 1 i; + return; } for (i = 0; i KVM_MAX_VCPUS; ++i) { vcpu = kvm-vcpus[i]; @@ -176,7 +176,7 @@ u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, continue; if (kvm_apic_match_physical_addr(vcpu-arch.apic, dest)) { if (vcpu-arch.apic) - mask = 1 i; + *mask = 1 i; break; } } @@ -187,10 +187,9 @@ u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, continue; if (vcpu-arch.apic kvm_apic_match_logical_addr(vcpu-arch.apic, dest)) - mask |= 1 vcpu-vcpu_id; + *mask |= 1 vcpu-vcpu_id; } - ioapic_debug(mask %x\n, mask); - return mask; + ioapic_debug(mask %x\n, *mask); } static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index ee5b0bd..e107dbb 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -70,7 +70,7 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); void kvm_ioapic_reset(struct 
kvm_ioapic *ioapic); -u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, - u8 dest_mode); +void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, +u8 dest_mode, u32 *mask); #endif diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index d89d8b2..1949587 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -35,8 +35,9 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, { struct kvm_vcpu *vcpu; - *deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, - entry-fields.dest_id, entry-fields.dest_mode); + kvm_ioapic_get_delivery_bitmask(ioapic, entry-fields.dest_id, + entry-fields.dest_mode, + deliver_bitmask); switch (entry-fields.delivery_mode) { case IOAPIC_LOWEST_PRIORITY: vcpu = kvm_get_lowest_prio_vcpu(ioapic-kvm, -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 09/10] KVM: Update intr delivery func to accept unsigned long* bitmap
Would be used with bit ops, and would be easily extended if KVM_MAX_VCPUS is increased. Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/lapic.c |8 include/linux/kvm_host.h |2 +- virt/kvm/ioapic.c|4 ++-- virt/kvm/ioapic.h|4 ++-- virt/kvm/irq_comm.c |6 +++--- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index afac68c..c1e4935 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -403,7 +403,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, - unsigned long bitmap) + unsigned long *bitmap) { int last; int next; @@ -415,7 +415,7 @@ static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, do { if (++next == KVM_MAX_VCPUS) next = 0; - if (kvm-vcpus[next] == NULL || !test_bit(next, bitmap)) + if (kvm-vcpus[next] == NULL || !test_bit(next, bitmap)) continue; apic = kvm-vcpus[next]-arch.apic; if (apic apic_enabled(apic)) @@ -431,7 +431,7 @@ static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, } struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, - unsigned long bitmap) + unsigned long *bitmap) { struct kvm_lapic *apic; @@ -502,7 +502,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) } if (delivery_mode == APIC_DM_LOWEST) { - target = kvm_get_lowest_prio_vcpu(vcpu-kvm, vector, lpr_map); + target = kvm_get_lowest_prio_vcpu(vcpu-kvm, vector, lpr_map); if (target != NULL) __apic_accept_irq(target-arch.apic, delivery_mode, vector, level, trig_mode); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4f92317..fbf102c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -332,7 +332,7 @@ struct kvm_gsi_msg { void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, union kvm_ioapic_redirect_entry *entry, - u32 *deliver_bitmask); + unsigned long *deliver_bitmask); void kvm_set_irq(struct kvm 
*kvm, int irq_source_id, u32 gsi, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); void kvm_register_irq_ack_notifier(struct kvm *kvm, diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index ebd5ba6..164a746 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -154,7 +154,7 @@ static void ioapic_inj_nmi(struct kvm_vcpu *vcpu) } void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, -u8 dest_mode, u32 *mask) +u8 dest_mode, unsigned long *mask) { int i; struct kvm *kvm = ioapic-kvm; @@ -195,7 +195,7 @@ void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) { union kvm_ioapic_redirect_entry entry = ioapic-redirtbl[irq]; - u32 deliver_bitmask; + unsigned long deliver_bitmask; struct kvm_vcpu *vcpu; int vcpu_id, r = 0; diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index e107dbb..c418a7f 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -65,12 +65,12 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) } struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, - unsigned long bitmap); + unsigned long *bitmap); void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, -u8 dest_mode, u32 *mask); +u8 dest_mode, unsigned long *mask); #endif diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 1949587..e74d679 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -31,7 +31,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, union kvm_ioapic_redirect_entry *entry, - u32 *deliver_bitmask) + unsigned long *deliver_bitmask) { struct kvm_vcpu *vcpu; @@ -41,7 +41,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, switch
[PATCH 01/10] KVM: Add a route layer to convert MSI message to GSI
Avi's purpose, to use single kvm_set_irq() to deal with all interrupt, including MSI. So here is it. struct gsi_msg is a mapping from a special gsi(with KVM_GSI_MSG_MASK) to MSI/MSI-X message address/data. Now we support up to 256 gsi_msg mapping, and gsi_msg is allocated by kernel and provide two ioctls to userspace, which is more flexiable. Signed-off-by: Sheng Yang sh...@linux.intel.com --- include/linux/kvm.h | 14 + include/linux/kvm_host.h | 16 ++ virt/kvm/irq_comm.c | 70 ++ virt/kvm/kvm_main.c | 66 +++ 4 files changed, 166 insertions(+), 0 deletions(-) diff --git a/include/linux/kvm.h b/include/linux/kvm.h index c24f207..a75e01f 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -396,6 +396,9 @@ struct kvm_trace_rec { #if defined(CONFIG_X86) #define KVM_CAP_SET_GUEST_DEBUG 23 #endif +#if defined(CONFIG_X86) +#define KVM_CAP_GSI_MSG 24 +#endif /* * ioctls for VM fds @@ -429,6 +432,8 @@ struct kvm_trace_rec { struct kvm_assigned_pci_dev) #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \ struct kvm_assigned_irq) +#define KVM_REQUEST_GSI_MSG _IOWR(KVMIO, 0x71, struct kvm_assigned_gsi_msg) +#define KVM_FREE_GSI_MSG _IOR(KVMIO, 0x72, struct kvm_assigned_gsi_msg) /* * ioctls for vcpu fds @@ -549,4 +554,13 @@ struct kvm_assigned_irq { #define KVM_DEV_IRQ_ASSIGN_MSI_ACTION (1 0) #define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI (1 1) +struct kvm_assigned_gsi_msg { + __u32 gsi; + struct { + __u32 addr_lo; + __u32 addr_hi; + __u32 data; + } msg; +}; + #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d63e9a4..0e5741a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -132,6 +132,10 @@ struct kvm { unsigned long mmu_notifier_seq; long mmu_notifier_count; #endif + struct hlist_head gsi_msg_list; + struct mutex gsi_msg_lock; +#define KVM_NR_GSI_MSG 256 + DECLARE_BITMAP(gsi_msg_bitmap, KVM_NR_GSI_MSG); }; /* The guest did something we don't support. 
*/ @@ -319,6 +323,14 @@ struct kvm_assigned_dev_kernel { struct pci_dev *dev; struct kvm *kvm; }; + +#define KVM_GSI_MSG_MASK0x100ull +struct kvm_gsi_msg { + u32 gsi; + struct msi_msg msg; + struct hlist_node link; +}; + void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); void kvm_register_irq_ack_notifier(struct kvm *kvm, @@ -326,6 +338,10 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm, void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian); int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); +int kvm_update_gsi_msg(struct kvm *kvm, struct kvm_gsi_msg *gsi_msg); +struct kvm_gsi_msg *kvm_find_gsi_msg(struct kvm *kvm, u32 gsi); +void kvm_free_gsi_msg(struct kvm *kvm, struct kvm_gsi_msg *gsi_msg); +void kvm_free_gsi_msg_list(struct kvm *kvm); #ifdef CONFIG_DMAR int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn, diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index aa5d1e5..abfab46 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -99,3 +99,73 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) clear_bit(irq_source_id, kvm-arch.irq_states[i]); clear_bit(irq_source_id, kvm-arch.irq_sources_bitmap); } + +int kvm_update_gsi_msg(struct kvm *kvm, struct kvm_gsi_msg *gsi_msg) +{ + struct kvm_gsi_msg *found_msg, *new_gsi_msg; + int r, gsi; + + mutex_lock(kvm-gsi_msg_lock); + /* Find whether we need a update or a new entry */ + found_msg = kvm_find_gsi_msg(kvm, gsi_msg-gsi); + if (found_msg) + *found_msg = *gsi_msg; + else { + gsi = find_first_zero_bit(kvm-gsi_msg_bitmap, KVM_NR_GSI_MSG); + if (gsi = KVM_NR_GSI_MSG) { + r = -ENOSPC; + goto out; + } + __set_bit(gsi, kvm-gsi_msg_bitmap); + gsi_msg-gsi = gsi | KVM_GSI_MSG_MASK; + new_gsi_msg = kzalloc(sizeof(*new_gsi_msg), GFP_KERNEL); + if (!new_gsi_msg) { + r = -ENOMEM; + goto out; + } + *new_gsi_msg = *gsi_msg; + 
hlist_add_head(new_gsi_msg-link, kvm-gsi_msg_list); + } + r = 0; +out: + mutex_unlock(kvm-gsi_msg_lock); + return r; +} + +/* Call with kvm-gsi_msg_lock hold */ +struct kvm_gsi_msg *kvm_find_gsi_msg(struct kvm *kvm, u32 gsi) +{ + struct kvm_gsi_msg *gsi_msg; + struct hlist_node *n; + + if (!(gsi KVM_GSI_MSG_MASK)) + return NULL; +
[PATCH 06/10] KVM: Split IOAPIC structure
Prepared for reuse ioapic_redir_entry for MSI. Signed-off-by: Sheng Yang sh...@linux.intel.com --- include/linux/kvm_types.h | 17 + virt/kvm/ioapic.c |6 +++--- virt/kvm/ioapic.h | 17 + 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 5f4a18c..46e3d8d 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -52,4 +52,21 @@ struct kvm_pio_request { int rep; }; +union kvm_ioapic_redirect_entry { + u64 bits; + struct { + u8 vector; + u8 delivery_mode:3; + u8 dest_mode:1; + u8 delivery_status:1; + u8 polarity:1; + u8 remote_irr:1; + u8 trig_mode:1; + u8 mask:1; + u8 reserve:7; + u8 reserved[4]; + u8 dest_id; + } fields; +}; + #endif /* __KVM_TYPES_H__ */ diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 23b81cf..ebb2ab5 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -85,7 +85,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) { - union ioapic_redir_entry *pent; + union kvm_ioapic_redirect_entry *pent; pent = ioapic-redirtbl[idx]; @@ -272,7 +272,7 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) { u32 old_irr = ioapic-irr; u32 mask = 1 irq; - union ioapic_redir_entry entry; + union kvm_ioapic_redirect_entry entry; if (irq = 0 irq IOAPIC_NUM_PINS) { entry = ioapic-redirtbl[irq]; @@ -291,7 +291,7 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi, int trigger_mode) { - union ioapic_redir_entry *ent; + union kvm_ioapic_redirect_entry *ent; ent = ioapic-redirtbl[gsi]; diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 49c9581..ee5b0bd 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -40,22 +40,7 @@ struct kvm_ioapic { u32 id; u32 irr; u32 pad; - union ioapic_redir_entry { - u64 bits; - struct { - u8 vector; - u8 delivery_mode:3; - u8 
dest_mode:1; - u8 delivery_status:1; - u8 polarity:1; - u8 remote_irr:1; - u8 trig_mode:1; - u8 mask:1; - u8 reserve:7; - u8 reserved[4]; - u8 dest_id; - } fields; - } redirtbl[IOAPIC_NUM_PINS]; + union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS]; struct kvm_io_device dev; struct kvm *kvm; void (*ack_notifier)(void *opaque, int irq); -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 05/10] KVM: Merge MSI handling to kvm_set_irq
Using kvm_set_irq to handle all interrupt injection. Signed-off-by: Sheng Yang sh...@linux.intel.com --- include/linux/kvm_host.h |2 +- virt/kvm/irq_comm.c | 98 +++--- virt/kvm/kvm_main.c | 77 +++- 3 files changed, 90 insertions(+), 87 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index aa2606b..5b671b6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -330,7 +330,7 @@ struct kvm_gsi_msg { struct hlist_node link; }; -void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); +void kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 gsi, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 47243ef..63cdf01 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -20,28 +20,96 @@ */ #include linux/kvm_host.h + +#ifdef CONFIG_X86 +#include asm/msidef.h +#endif + #include irq.h #include ioapic.h /* This should be called with the kvm-lock mutex held */ -void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) +void kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 gsi, int level) { - unsigned long *irq_state = (unsigned long *)kvm-arch.irq_states[irq]; - - /* Logical OR for level trig interrupt */ - if (level) - set_bit(irq_source_id, irq_state); - else - clear_bit(irq_source_id, irq_state); - - /* Not possible to detect if the guest uses the PIC or the -* IOAPIC. So set the bit in both. The guest will ignore -* writes to the unused one. 
-*/ - kvm_ioapic_set_irq(ioapic_irqchip(kvm), irq, !!(*irq_state)); + unsigned long *irq_state; +#ifdef CONFIG_X86 + int vcpu_id; + struct kvm_vcpu *vcpu; + struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); + struct kvm_gsi_msg *gsi_msg; + int dest_id, vector, dest_mode, trig_mode, delivery_mode; + u32 deliver_bitmask; + + BUG_ON(!ioapic); +#endif + + if (!(gsi KVM_GSI_MSG_MASK)) { + int irq = gsi; + + irq_state = (unsigned long *)kvm-arch.irq_states[irq]; + + /* Logical OR for level trig interrupt */ + if (level) + set_bit(irq_source_id, irq_state); + else + clear_bit(irq_source_id, irq_state); + + /* Not possible to detect if the guest uses the PIC or the +* IOAPIC. So set the bit in both. The guest will ignore +* writes to the unused one. +*/ + kvm_ioapic_set_irq(ioapic, irq, !!(*irq_state)); #ifdef CONFIG_X86 - kvm_pic_set_irq(pic_irqchip(kvm), irq, !!(*irq_state)); + kvm_pic_set_irq(pic_irqchip(kvm), irq, !!(*irq_state)); +#endif + return; + } + +#ifdef CONFIG_X86 + mutex_lock(kvm-gsi_msg_lock); + gsi_msg = kvm_find_gsi_msg(kvm, gsi); + mutex_unlock(kvm-gsi_msg_lock); + if (!gsi_msg) { + printk(KERN_WARNING kvm: fail to find correlated gsi_msg\n); + return; + } + + dest_id = (gsi_msg-msg.address_lo MSI_ADDR_DEST_ID_MASK) +MSI_ADDR_DEST_ID_SHIFT; + vector = (gsi_msg-msg.data MSI_DATA_VECTOR_MASK) +MSI_DATA_VECTOR_SHIFT; + dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT, + (unsigned long *)gsi_msg-msg.address_lo); + trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT, + (unsigned long *)gsi_msg-msg.data); + delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT, + (unsigned long *)gsi_msg-msg.data); + deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, + dest_id, dest_mode); + /* IOAPIC delivery mode value is the same as MSI here */ + switch (delivery_mode) { + case IOAPIC_LOWEST_PRIORITY: + vcpu = kvm_get_lowest_prio_vcpu(ioapic-kvm, vector, + deliver_bitmask); + if (vcpu != NULL) + kvm_apic_set_irq(vcpu, vector, trig_mode); + else + printk(KERN_INFO kvm: null lowest 
priority vcpu!\n); + break; + case IOAPIC_FIXED: + for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { + if (!(deliver_bitmask (1 vcpu_id))) + continue; + deliver_bitmask = ~(1 vcpu_id); + vcpu = ioapic-kvm-vcpus[vcpu_id]; + if (vcpu) + kvm_apic_set_irq(vcpu, vector,
[PATCH 02/10] KVM: Using gsi_msg mapping for MSI device assignment
Convert MSI userspace interface to support gsi_msg mapping(and nobody should be the user of the old interface...). Signed-off-by: Sheng Yang sh...@linux.intel.com --- include/linux/kvm_host.h |1 - virt/kvm/kvm_main.c | 35 ++- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0e5741a..aa2606b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -313,7 +313,6 @@ struct kvm_assigned_dev_kernel { int host_irq; bool host_irq_disabled; int guest_irq; - struct msi_msg guest_msi; #define KVM_ASSIGNED_DEV_GUEST_INTX(1 0) #define KVM_ASSIGNED_DEV_GUEST_MSI (1 1) #define KVM_ASSIGNED_DEV_HOST_INTX (1 8) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 26bccf9..3494861 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -92,20 +92,30 @@ static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) int vcpu_id; struct kvm_vcpu *vcpu; struct kvm_ioapic *ioapic = ioapic_irqchip(dev-kvm); - int dest_id = (dev-guest_msi.address_lo MSI_ADDR_DEST_ID_MASK) -MSI_ADDR_DEST_ID_SHIFT; - int vector = (dev-guest_msi.data MSI_DATA_VECTOR_MASK) -MSI_DATA_VECTOR_SHIFT; - int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT, - (unsigned long *)dev-guest_msi.address_lo); - int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT, - (unsigned long *)dev-guest_msi.data); - int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT, - (unsigned long *)dev-guest_msi.data); + struct kvm_gsi_msg *gsi_msg; + int dest_id, vector, dest_mode, trig_mode, delivery_mode; u32 deliver_bitmask; BUG_ON(!ioapic); + mutex_lock(dev-kvm-gsi_msg_lock); + gsi_msg = kvm_find_gsi_msg(dev-kvm, dev-guest_irq); + if (!gsi_msg) { + printk(KERN_WARNING kvm: fail to find correlated gsi_msg\n); + return; + } + mutex_unlock(dev-kvm-gsi_msg_lock); + + dest_id = (gsi_msg-msg.address_lo MSI_ADDR_DEST_ID_MASK) +MSI_ADDR_DEST_ID_SHIFT; + vector = (gsi_msg-msg.data MSI_DATA_VECTOR_MASK) +MSI_DATA_VECTOR_SHIFT; + 
dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT, + (unsigned long *)gsi_msg-msg.address_lo); + trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT, + (unsigned long *)gsi_msg-msg.data); + delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT, + (unsigned long *)gsi_msg-msg.data); deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, dest_id, dest_mode); /* IOAPIC delivery mode value is the same as MSI here */ @@ -316,17 +326,16 @@ static int assigned_device_update_msi(struct kvm *kvm, { int r; + adev-guest_irq = airq-guest_irq; + if (airq-flags KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) { /* x86 don't care upper address of guest msi message addr */ adev-irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_MSI; adev-irq_requested_type = ~KVM_ASSIGNED_DEV_GUEST_INTX; - adev-guest_msi.address_lo = airq-guest_msi.addr_lo; - adev-guest_msi.data = airq-guest_msi.data; adev-ack_notifier.gsi = -1; } else if (msi2intx) { adev-irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_INTX; adev-irq_requested_type = ~KVM_ASSIGNED_DEV_GUEST_MSI; - adev-guest_irq = airq-guest_irq; adev-ack_notifier.gsi = airq-guest_irq; } else { /* -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 07/10] KVM: Unified the delivery of IOAPIC and MSI
Signed-off-by: Sheng Yang sh...@linux.intel.com --- include/linux/kvm_host.h |3 ++ virt/kvm/ioapic.c| 84 +- virt/kvm/irq_comm.c | 75 3 files changed, 79 insertions(+), 83 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5b671b6..4f92317 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -330,6 +330,9 @@ struct kvm_gsi_msg { struct hlist_node link; }; +void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, + union kvm_ioapic_redirect_entry *entry, + u32 *deliver_bitmask); void kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 gsi, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); void kvm_register_irq_ack_notifier(struct kvm *kvm, diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index ebb2ab5..af9f5de 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -195,75 +195,53 @@ u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) { - u8 dest = ioapic-redirtbl[irq].fields.dest_id; - u8 dest_mode = ioapic-redirtbl[irq].fields.dest_mode; - u8 delivery_mode = ioapic-redirtbl[irq].fields.delivery_mode; - u8 vector = ioapic-redirtbl[irq].fields.vector; - u8 trig_mode = ioapic-redirtbl[irq].fields.trig_mode; + union kvm_ioapic_redirect_entry entry = ioapic-redirtbl[irq]; u32 deliver_bitmask; struct kvm_vcpu *vcpu; int vcpu_id, r = 0; ioapic_debug(dest=%x dest_mode=%x delivery_mode=%x vector=%x trig_mode=%x\n, -dest, dest_mode, delivery_mode, vector, trig_mode); +entry.fields.dest, entry.fields.dest_mode, +entry.fields.delivery_mode, entry.fields.vector, +entry.fields.trig_mode); - deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, dest, - dest_mode); + kvm_get_intr_delivery_bitmask(ioapic, entry, deliver_bitmask); if (!deliver_bitmask) { ioapic_debug(no target on destination\n); return 0; } - switch (delivery_mode) { - case IOAPIC_LOWEST_PRIORITY: - vcpu = kvm_get_lowest_prio_vcpu(ioapic-kvm, vector, - 
deliver_bitmask); + /* Always delivery PIT interrupt to vcpu 0 */ #ifdef CONFIG_X86 - if (irq == 0) - vcpu = ioapic-kvm-vcpus[0]; + if (irq == 0) + deliver_bitmask = 1 0; #endif - if (vcpu != NULL) - r = ioapic_inj_irq(ioapic, vcpu, vector, - trig_mode, delivery_mode); - else - ioapic_debug(null lowest prio vcpu: -mask=%x vector=%x delivery_mode=%x\n, -deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY); - break; - case IOAPIC_FIXED: -#ifdef CONFIG_X86 - if (irq == 0) - deliver_bitmask = 1; -#endif - for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { - if (!(deliver_bitmask (1 vcpu_id))) - continue; - deliver_bitmask = ~(1 vcpu_id); - vcpu = ioapic-kvm-vcpus[vcpu_id]; - if (vcpu) { - r = ioapic_inj_irq(ioapic, vcpu, vector, - trig_mode, delivery_mode); - } - } - break; - case IOAPIC_NMI: - for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { - if (!(deliver_bitmask (1 vcpu_id))) - continue; - deliver_bitmask = ~(1 vcpu_id); - vcpu = ioapic-kvm-vcpus[vcpu_id]; - if (vcpu) + + for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { + if (!(deliver_bitmask (1 vcpu_id))) + continue; + deliver_bitmask = ~(1 vcpu_id); + vcpu = ioapic-kvm-vcpus[vcpu_id]; + if (vcpu) { + if (entry.fields.delivery_mode == + IOAPIC_LOWEST_PRIORITY || + entry.fields.delivery_mode == IOAPIC_FIXED) + r = ioapic_inj_irq(ioapic, vcpu, + entry.fields.vector, + entry.fields.trig_mode, +
Re: [PATCH 0/10][v3] GSI-MSG route layer for MSI/MSI-X
On Tuesday 30 December 2008 13:55:52 Sheng Yang wrote: Update from v2: Add gsi_msg_pending_bitmap, in order to support MSI-X multiple interrupts. And this one depends on the former Device assignment code clean up and MSI disable support patchset. Sorry, a little chaos here. The gsi_msg_pending_bitmap is in the MSI-X patchset... So this one should be almost the same as v2. Just resend. -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[Patch 0/3][v2] Userspace for MSI-X
Change from v2: Move MMIO intercepting to userspace, and add two new ioctls. Because kernel-space MSI-X depends on lots of related patches I sent before, this time I kept it and will wait for them to be checked in. Please help to review, thanks. -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] Add MSI-X related macro to pci.c
Signed-off-by: Sheng Yang sh...@linux.intel.com --- qemu/hw/pci.h |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/qemu/hw/pci.h b/qemu/hw/pci.h index f2a622c..22c5de1 100644 --- a/qemu/hw/pci.h +++ b/qemu/hw/pci.h @@ -87,6 +87,7 @@ typedef struct PCIIORegion { #define PCI_CAPABILITY_CONFIG_MAX_LENGTH 0x60 #define PCI_CAPABILITY_CONFIG_DEFAULT_START_ADDR 0x40 #define PCI_CAPABILITY_CONFIG_MSI_LENGTH 0x10 +#define PCI_CAPABILITY_CONFIG_MSIX_LENGTH 0x10 struct PCIDevice { /* PCI config space */ -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/3] kvm: enable MSI-X capability for assigned device
The most important part here, is we emulate a page of MMIO region using a page of memory. That's because MSI-X table was put in the region and we have to intercept it. Signed-off-by: Sheng Yang sh...@linux.intel.com --- qemu/hw/device-assignment.c | 275 ++- qemu/hw/device-assignment.h |6 + 2 files changed, 276 insertions(+), 5 deletions(-) diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c index 2d3e67e..dc2020a 100644 --- a/qemu/hw/device-assignment.c +++ b/qemu/hw/device-assignment.c @@ -146,6 +146,7 @@ static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num, { AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev); AssignedDevRegion *region = r_dev-v_addrs[region_num]; +PCIRegion *real_region = r_dev-real_device.regions[region_num]; uint32_t old_ephys = region-e_physbase; uint32_t old_esize = region-e_size; int first_map = (region-e_size == 0); @@ -164,10 +165,27 @@ static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num, TARGET_PAGE_ALIGN(old_esize)); } -if (e_size 0) +if (e_size 0) { +/* deal with MSI-X MMIO page */ +if (real_region-base_addr = r_dev-msix_table_addr +real_region-base_addr + real_region-size = +r_dev-msix_table_addr) { +int offset = r_dev-msix_table_addr - real_region-base_addr; +ret = munmap(region-u.r_virtbase + offset, TARGET_PAGE_SIZE); +if (ret == 0) +DEBUG(munmap done, virt_base 0x%p\n, +region-u.r_virtbase + offset); +else { +fprintf(stderr, %s: fail munmap msix table!\n, __func__); +exit(1); +} +cpu_register_physical_memory(e_phys + offset, +TARGET_PAGE_SIZE, r_dev-mmio_index); +} ret = kvm_register_phys_mem(kvm_context, e_phys, region-u.r_virtbase, TARGET_PAGE_ALIGN(e_size), 0); +} if (ret != 0) { fprintf(stderr, %s: Error: create new mapping failed\n, __func__); @@ -570,7 +588,9 @@ void assigned_dev_update_irq(PCIDevice *d) } } -#if defined(KVM_CAP_DEVICE_MSI) defined (KVM_CAP_GSI_MSG) +#ifdef KVM_CAP_GSI_MSG + +#ifdef KVM_CAP_DEVICE_MSI static void 
assigned_dev_update_msi(PCIDevice *pci_dev, unsigned int ctrl_pos) { struct kvm_assigned_irq assigned_irq_data; @@ -610,14 +630,140 @@ static void assigned_dev_update_msi(PCIDevice *pci_dev, unsigned int ctrl_pos) } #endif -void assigned_device_pci_cap_write_config(PCIDevice *pci_dev, uint32_t address, +#ifdef KVM_CAP_DEVICE_MSIX +static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev) +{ +AssignedDevice *adev = container_of(pci_dev, AssignedDevice, dev); +u16 entries_nr = 0, entries_max_nr; +int pos = 0, i, r = 0; +u32 msg_addr, msg_upper_addr, msg_data, msg_ctrl; +struct kvm_assigned_msix_nr msix_nr; +struct kvm_assigned_msix_entry msix_entry; +struct kvm_assigned_gsi_msg gsi_msg; +void *va = adev-msix_table_page; + +if (adev-cap.available ASSIGNED_DEVICE_CAP_MSI) +pos = PCI_CAPABILITY_CONFIG_MSI_LENGTH; +entries_max_nr = pci_dev-cap.config[pos + 2]; +entries_max_nr = PCI_MSIX_TABSIZE; + +/* Get the usable entry number for allocating */ +for (i = 0; i entries_max_nr; i++) { +memcpy(msg_ctrl, va + i * 16 + 12, 4); +/* 0x1 is mask bit for per vector */ +if (msg_ctrl 0x1) +continue; +memcpy(msg_data, va + i * 16 + 8, 4); +/* Ignore unused entry even it's unmasked */ +if (msg_data == 0) +continue; +entries_nr ++; +} + +msix_nr.assigned_dev_id = calc_assigned_dev_id(adev-h_busnr, + (uint8_t)adev-h_devfn); +msix_nr.entry_nr = entries_nr; +r = kvm_set_msix_nr(kvm_context, msix_nr); +if (r != 0) { +fprintf(stderr, fail to set MSI-X entry number for MSIX! 
%s\n, + strerror(-r)); +return r; +} + +msix_entry.assigned_dev_id = msix_nr.assigned_dev_id; +entries_nr = 0; +for (i = 0; i entries_max_nr; i++) { +if (entries_nr = msix_nr.entry_nr) +break; +memcpy(msg_ctrl, va + i * 16 + 12, 4); +if (msg_ctrl 0x1) +continue; +memcpy(msg_data, va + i * 16 + 8, 4); +if (msg_data == 0) +continue; + +memcpy(msg_addr, va + i * 16, 4); +memcpy(msg_upper_addr, va + i * 16 + 4, 4); + +gsi_msg.gsi = 0; +gsi_msg.msg.addr_lo = msg_addr; +gsi_msg.msg.addr_hi = msg_upper_addr; +gsi_msg.msg.data = msg_data; +r = kvm_request_gsi_msg(kvm_context, gsi_msg); +if (r) { +fprintf(stderr, fail to request gsi msg for MSIX! %s\n, +
[PATCH 2/3] kvm: add ioctl KVM_SET_MSIX_ENTRY_NR and KVM_SET_MSIX_ENTRY
Signed-off-by: Sheng Yang sh...@linux.intel.com --- libkvm/libkvm.c | 26 ++ libkvm/libkvm.h |6 ++ 2 files changed, 32 insertions(+), 0 deletions(-) diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c index ddcc929..ad218e1 100644 --- a/libkvm/libkvm.c +++ b/libkvm/libkvm.c @@ -1171,3 +1171,29 @@ int kvm_free_gsi_msg(kvm_context_t kvm, uint32_t gsi) } #endif + +#ifdef KVM_CAP_DEVICE_MSIX +int kvm_set_msix_nr(kvm_context_t kvm, +struct kvm_assigned_msix_nr *msix_nr) +{ +int ret; + +ret = ioctl(kvm-vm_fd, KVM_SET_MSIX_NR, msix_nr); +if (ret 0) +return -errno; + +return ret; +} + +int kvm_set_msix_entry(kvm_context_t kvm, + struct kvm_assigned_msix_entry *entry) +{ +int ret; + +ret = ioctl(kvm-vm_fd, KVM_SET_MSIX_ENTRY, entry); +if (ret 0) +return -errno; + +return ret; +} +#endif diff --git a/libkvm/libkvm.h b/libkvm/libkvm.h index 53e57c5..af704f7 100644 --- a/libkvm/libkvm.h +++ b/libkvm/libkvm.h @@ -725,4 +725,10 @@ int kvm_request_gsi_msg(kvm_context_t kvm, int kvm_free_gsi_msg(kvm_context_t kvm, uint32_t gsi); #endif +#ifdef KVM_CAP_DEVICE_MSIX +int kvm_set_msix_nr(kvm_context_t kvm, +struct kvm_assigned_msix_nr *msix_nr); +int kvm_set_msix_entry(kvm_context_t kvm, + struct kvm_assigned_msix_entry *entry); +#endif #endif -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html