[PATCH] KVM: ia64: Implement a uniform vps interface
From: Xiantao Zhang [EMAIL PROTECTED] A uniform entry, kvm_vps_entry, is added for vps_sync_write/read and vps_resume_handler/guest; it branches to a different PAL service according to the offset. Signed-off-by: Anthony Xu [EMAIL PROTECTED] Signed-off-by: Xiantao Zhang [EMAIL PROTECTED] Signed-off-by: Avi Kivity [EMAIL PROTECTED] diff --git a/arch/ia64/kvm/kvm_minstate.h b/arch/ia64/kvm/kvm_minstate.h index 13980d9..2cc41d1 100644 --- a/arch/ia64/kvm/kvm_minstate.h +++ b/arch/ia64/kvm/kvm_minstate.h @@ -50,27 +50,18 @@ #define PAL_VSA_SYNC_READ \ /* begin to call pal vps sync_read */ \ +{.mii; \ add r25 = VMM_VPD_BASE_OFFSET, r21; \ - adds r20 = VMM_VCPU_VSA_BASE_OFFSET, r21; /* entry point */\ + nop 0x0;\ + mov r24=ip; \ ;; \ +} \ +{.mmb \ + add r24=0x20, r24; \ ld8 r25 = [r25]; /* read vpd base */ \ - ld8 r20 = [r20];\ - ;; \ - add r20 = PAL_VPS_SYNC_READ,r20;\ - ;; \ -{ .mii; \ - nop 0x0;\ - mov r24 = ip; \ - mov b0 = r20; \ + br.cond.sptk kvm_vps_sync_read; /*call the service*/\ ;; \ }; \ -{ .mmb; \ - add r24 = 0x20, r24;\ - nop 0x0;\ - br.cond.sptk b0;/* call the service */ \ - ;; \ -}; - #define KVM_MINSTATE_GET_CURRENT(reg) mov reg=r21 diff --git a/arch/ia64/kvm/optvfault.S b/arch/ia64/kvm/optvfault.S index e4f15d6..f0bf0a8 100644 --- a/arch/ia64/kvm/optvfault.S +++ b/arch/ia64/kvm/optvfault.S @@ -20,6 +20,75 @@ #define ACCE_MOV_TO_PSR #define ACCE_THASH +ENTRY(kvm_vps_entry) + adds r29 = VMM_VCPU_VSA_BASE_OFFSET,r21 + ;; + ld8 r29 = [r29] + ;; + add r29 = r29, r30 + ;; + mov b0 = r29 + br.sptk.many b0 +END(kvm_vps_entry) + +/* + * Inputs: + * r24 : return address + * r25 : vpd + * r29 : scratch + * + */ +GLOBAL_ENTRY(kvm_vps_sync_read) + movl r30 = PAL_VPS_SYNC_READ + ;; + br.sptk.many kvm_vps_entry +END(kvm_vps_sync_read) + +/* + * Inputs: + * r24 : return address + * r25 : vpd + * r29 : scratch + * + */ +GLOBAL_ENTRY(kvm_vps_sync_write) + movl r30 = PAL_VPS_SYNC_WRITE + ;; + br.sptk.many kvm_vps_entry +END(kvm_vps_sync_write) + +/* + * Inputs: + * r23 : pr + * 
r24 : guest b0 + * r25 : vpd + * + */ +GLOBAL_ENTRY(kvm_vps_resume_normal) + movl r30 = PAL_VPS_RESUME_NORMAL + ;; + mov pr=r23,-2 + br.sptk.many kvm_vps_entry +END(kvm_vps_resume_normal) + +/* + * Inputs: + * r23 : pr + * r24 : guest b0 + * r25 : vpd + * r17 : isr + */ +GLOBAL_ENTRY(kvm_vps_resume_handler) + movl r30 = PAL_VPS_RESUME_HANDLER + ;; + ld8 r27=[r25] + shr r17=r17,IA64_ISR_IR_BIT + ;; + dep r27=r17,r27,63,1 // bit 63 of r27 indicate whether enable CFLE + mov pr=r23,-2 + br.sptk.many kvm_vps_entry +END(kvm_vps_resume_handler) + //mov r1=ar3 GLOBAL_ENTRY(kvm_asm_mov_from_ar) #ifndef ACCE_MOV_FROM_AR diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c index 5a33f7e..3417783 100644 --- a/arch/ia64/kvm/process.c +++ b/arch/ia64/kvm/process.c @@ -962,9 +962,9 @@ static void kvm_do_resume_op(struct kvm_vcpu *vcpu) void vmm_transition(struct kvm_vcpu *vcpu) { ia64_call_vsa(PAL_VPS_SAVE, (unsigned long)vcpu->arch.vpd, - 0, 0, 0, 0, 0, 0); + 1, 0, 0, 0, 0, 0); vmm_trampoline(&vcpu->arch.guest, &vcpu->arch.host); ia64_call_vsa(PAL_VPS_RESTORE, (unsigned long)vcpu->arch.vpd, - 0, 0, 0, 0, 0, 0); +
[PATCH] kvm: libkvm: do not use mem_hole anymore.
From: Glauber Costa [EMAIL PROTECTED] Memory holes are totally evil. Right now they work for some basic tests, but have never been stressed enough. Using memory holes leaves open questions like: * what happens if an area being registered spans two slots? * what happens if there is already data in the slots? Also, the code behaves badly if the piece to be removed lies on the boundaries of the current slot. Luckily, we don't really need it. Remove it, and make sure we never hit it. Signed-off-by: Glauber Costa [EMAIL PROTECTED] Signed-off-by: Avi Kivity [EMAIL PROTECTED] diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c index a850caa..c261053 100644 --- a/libkvm/libkvm.c +++ b/libkvm/libkvm.c @@ -439,74 +439,9 @@ int kvm_is_allocated_mem(kvm_context_t kvm, unsigned long phys_start, return 0; } -int kvm_create_mem_hole(kvm_context_t kvm, unsigned long phys_start, - unsigned long len) -{ - int slot; - int r; - struct kvm_userspace_memory_region rmslot; - struct kvm_userspace_memory_region newslot1; - struct kvm_userspace_memory_region newslot2; - - len = (len + PAGE_SIZE - 1) & PAGE_MASK; - - slot = get_intersecting_slot(phys_start); - /* no need to create hole, as there is already hole */ - if (slot == -1) - return 0; - - memset(&rmslot, 0, sizeof(struct kvm_userspace_memory_region)); - memset(&newslot1, 0, sizeof(struct kvm_userspace_memory_region)); - memset(&newslot2, 0, sizeof(struct kvm_userspace_memory_region)); - - rmslot.guest_phys_addr = slots[slot].phys_addr; - rmslot.slot = slot; - - newslot1.guest_phys_addr = slots[slot].phys_addr; - newslot1.memory_size = phys_start - slots[slot].phys_addr; - newslot1.slot = slot; - newslot1.userspace_addr = slots[slot].userspace_addr; - newslot1.flags = slots[slot].flags; - - newslot2.guest_phys_addr = newslot1.guest_phys_addr + - newslot1.memory_size + len; - newslot2.memory_size = slots[slot].phys_addr + - slots[slot].len - newslot2.guest_phys_addr; - newslot2.userspace_addr = newslot1.userspace_addr + - newslot1.memory_size; - 
newslot2.slot = get_free_slot(kvm); - newslot2.flags = newslot1.flags; - - r = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &rmslot); - if (r == -1) { - fprintf(stderr, "kvm_create_mem_hole: %s\n", strerror(errno)); - return -1; - } - free_slot(slot); - - r = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &newslot1); - if (r == -1) { - fprintf(stderr, "kvm_create_mem_hole: %s\n", strerror(errno)); - return -1; - } - register_slot(newslot1.slot, newslot1.guest_phys_addr, - newslot1.memory_size, newslot1.userspace_addr, - newslot1.flags); - - r = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &newslot2); - if (r == -1) { - fprintf(stderr, "kvm_create_mem_hole: %s\n", strerror(errno)); - return -1; - } - register_slot(newslot2.slot, newslot2.guest_phys_addr, - newslot2.memory_size, newslot2.userspace_addr, - newslot2.flags); - return 0; -} - int kvm_register_phys_mem(kvm_context_t kvm, - unsigned long phys_start, void *userspace_addr, - unsigned long len, int log) + unsigned long phys_start, void *userspace_addr, + unsigned long len, int log) { struct kvm_userspace_memory_region memory = { diff --git a/libkvm/libkvm.h b/libkvm/libkvm.h index 79dd769..77fd903 100644 --- a/libkvm/libkvm.h +++ b/libkvm/libkvm.h @@ -457,8 +457,6 @@ void kvm_destroy_phys_mem(kvm_context_t, unsigned long phys_start, int kvm_is_intersecting_mem(kvm_context_t kvm, unsigned long phys_start); int kvm_is_allocated_mem(kvm_context_t kvm, unsigned long phys_start, unsigned long len); -int kvm_create_mem_hole(kvm_context_t kvm, unsigned long phys_start, - unsigned long len); int kvm_register_phys_mem(kvm_context_t kvm, unsigned long phys_start, void *userspace_addr, unsigned long len, int log); diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c index 00840df..3663d38 100644 --- a/qemu/qemu-kvm.c +++ b/qemu/qemu-kvm.c @@ -773,12 +773,13 @@ void kvm_cpu_register_physical_memory(target_phys_addr_t start_addr, r = kvm_is_allocated_mem(kvm_context, start_addr, size); if (r) return; -r = 
kvm_is_intersecting_mem(kvm_context, start_addr); -if (r) -kvm_create_mem_hole(kvm_context, start_addr, size); -r = kvm_register_phys_mem(kvm_context, start_addr, - phys_ram_base + phys_offset, -
[PATCH] kvm: qemu: register mmio slots
From: Glauber Costa [EMAIL PROTECTED] By analysing phys_offset, we know whether a region is an mmio region or not. If it is, we don't want to have kvm caring about it, so just return. Signed-off-by: Glauber Costa [EMAIL PROTECTED] Signed-off-by: Avi Kivity [EMAIL PROTECTED] diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c index 1da253a..cfdf90f 100644 --- a/qemu/qemu-kvm.c +++ b/qemu/qemu-kvm.c @@ -780,6 +780,10 @@ void kvm_cpu_register_physical_memory(target_phys_addr_t start_addr, r = kvm_is_containing_region(kvm_context, start_addr, size); if (r) return; + +if (area_flags >= TLB_MMIO) +return; + r = kvm_register_phys_mem(kvm_context, start_addr, phys_ram_base + phys_offset, size, 0); -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] kvm: qemu: unregister memory area depending on their flags
From: Glauber Costa [EMAIL PROTECTED] Signed-off-by: Glauber Costa [EMAIL PROTECTED] Signed-off-by: Avi Kivity [EMAIL PROTECTED] diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c index f7a7fdd..88d3f5d 100644 --- a/libkvm/libkvm.c +++ b/libkvm/libkvm.c @@ -508,6 +508,18 @@ void kvm_destroy_phys_mem(kvm_context_t kvm, unsigned long phys_start, free_slot(memory.slot); } +void kvm_unregister_memory_area(kvm_context_t kvm, uint64_t phys_addr, unsigned long size) +{ + + int slot = get_container_slot(phys_addr, size); + + if (slot != -1) { + DPRINTF("Unregistering memory region %llx (%lx)\n", phys_addr, size); + kvm_destroy_phys_mem(kvm, phys_addr, size); + return; + } +} + static int kvm_get_map(kvm_context_t kvm, int ioctl_num, int slot, void *buf) { int r; diff --git a/libkvm/libkvm.h b/libkvm/libkvm.h index cb77c6c..14ea93b 100644 --- a/libkvm/libkvm.h +++ b/libkvm/libkvm.h @@ -454,6 +454,9 @@ void *kvm_create_phys_mem(kvm_context_t, unsigned long phys_start, unsigned long len, int log, int writable); void kvm_destroy_phys_mem(kvm_context_t, unsigned long phys_start, unsigned long len); +void kvm_unregister_memory_area(kvm_context_t, uint64_t phys_start, +unsigned long len); + int kvm_is_containing_region(kvm_context_t kvm, unsigned long phys_start, unsigned long size); int kvm_register_phys_mem(kvm_context_t kvm, unsigned long phys_start, void *userspace_addr, diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c index 07cffef..1da253a 100644 --- a/qemu/qemu-kvm.c +++ b/qemu/qemu-kvm.c @@ -768,8 +768,15 @@ void kvm_cpu_register_physical_memory(target_phys_addr_t start_addr, unsigned long phys_offset) { int r = 0; +unsigned long area_flags = phys_offset & ~TARGET_PAGE_MASK; phys_offset &= ~IO_MEM_ROM; + +if (area_flags == IO_MEM_UNASSIGNED) { +kvm_unregister_memory_area(kvm_context, start_addr, size); +return; +} + r = kvm_is_containing_region(kvm_context, start_addr, size); if (r) return; -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a 
message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Remaining passthrough/VT-d tasks list
Hi all, The initial passthrough/VT-d patches are now in kvm; it's time to enhance them and push them into 2.6.28. The following is the list of remaining passthrough/VT-d tasks: - Multiple device assignment (WIP) - MSI support (WIP) - MTRR/PAT support of EPT (WIP) - MTRR/PAT support of shadow (WIP) - Basic FLR support (WIP) (The above tasks are work in progress; some patches have been sent out, others will be sent out in the near future) - Architecture independence (such as x86, IPF) - Shared Interrupt support - Add dummy driver to hide/unbind passthrough device from host kernel If I have omitted some good features or you have some good proposals, please feel free to add them to this list. If you are interested in any of the tasks, please reply to this mail directly and let others know your progress. I appreciate any effort from you! Randy (Weidong) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
On Wednesday 24 September 2008 14:15:15 Han, Weidong wrote: Hi all, The initial passthrough/VT-d patches have been in kvm, it's time to enhance it, and push them into 2.6.28. Some supplements: Following is the remaining passthrough/VT-d tasks list: - Multiple devices assignment (WIP) Weidong is working on this. - MSI support (WIP) - MTRR/PAT support of EPT (WIP) - MTRR/PAT support of shadow (WIP) - Basic FLR support (WIP) Above four are my works. All of them work now. But more job should be done to polish the patches. And the main part of Function Level Reset would be picked by linux-pci. Another thing is we would send out/update above patches before Sept. 28, and hope they can picked by 2.6.28 merge window. Avi, what's your opinion? Of course we would work hard. :) But what's the deadline of merge window? (Above tasks are working in process, some patches have been sent out, others will be sent out in near future) - architecture independent (such as x86, IPF) - Shared Interrupt support I still don't know who would do this. It's very important for VT-d real usable. If nobody interested in it, I would pick it up, but after Oct. 6 (after National Holiday in China). -- regards Yang, Sheng - Add dummy driver to hide/unbind passthrough device from host kernel If I omit some good features or you have some good proposals, please feel free to add them to this list. If you are interest in any tasks, please reply the mail directly and let other guys to know your progress. Appreciate any effort from you! Randy (Weidong) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
* On Wednesday 24 Sep 2008 13:21:25 Han, Weidong wrote: Amit Shah wrote: - Add dummy driver to hide/unbind passthrough device from host kernel This isn't needed; we currently don't assign the device to the guest if we find that a driver is already loaded. I intend to change it to failing guest start altogether in case we find a module already using a device. When a guest exits, we release all the structures and hence even unloading kvm is not needed to reclaim the device on the host side. This task needn't targe 2.6.28. For long term, we need it to make device assignment more user friendly. How is the current scheme not user friendly? Or, how will adding a dummy driver be more user friendly? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
Han, Weidong wrote: Hi all, The initial passthrough/VT-d patches have been in kvm, it's time to enhance it, and push them into 2.6.28. - Shared Interrupt support Shared guest interrupts is a prerequisite for merging into mainline. Without this, device assignment is useless in anything but a benchmark scenario. I won't push device assignment for 2.6.28 without it. Shared host interrupts are a different matter; which one did you mean? -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: Remaining passthrough/VT-d tasks list
Amit Shah wrote: * On Wednesday 24 Sep 2008 13:21:25 Han, Weidong wrote: Amit Shah wrote: - Add dummy driver to hide/unbind passthrough device from host kernel This isn't needed; we currently don't assign the device to the guest if we find that a driver is already loaded. I intend to change it to failing guest start altogether in case we find a module already using a device. When a guest exits, we release all the structures and hence even unloading kvm is not needed to reclaim the device on the host side. This task needn't targe 2.6.28. For long term, we need it to make device assignment more user friendly. How is the current scheme not user friendly? Or, how will adding a dummy driver be more user friendly? We had some discussion on this few months ago. Currently, users need to remove device driver before assignment. If there are more than one same type devices, removing driver makes them cannot work at the same time, even though user just want to assign one of them to guest. Note that not all drivers support unbind function. If we can provide a mechanism to hide single device independently, e.g, implement a dummy driver to own devices that user want to assign to guest. I think it's more friendly to end user than remove/unbind driver manually. Randy (Weidong) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
Yang, Sheng wrote: - MSI support (WIP) - MTRR/PAT support of EPT (WIP) - MTRR/PAT support of shadow (WIP) - Basic FLR support (WIP) Above four are my works. All of them work now. But more job should be done to polish the patches. And the main part of Function Level Reset would be picked by linux-pci. Another thing is we would send out/update above patches before Sept. 28, and hope they can picked by 2.6.28 merge window. Avi, what's your opinion? Of course we would work hard. :) But what's the deadline of merge window? No one knows, but it's very unlikely these features will make it for 2.6.28. To be merged, it is not sufficient for the patches to be ready. They have to undergo some testing in the field. - Shared Interrupt support I still don't know who would do this. It's very important for VT-d real usable. If nobody interested in it, I would pick it up, but after Oct. 6 (after National Holiday in China). Shared host interrupts? What's your plan here? The polarity trick? -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
Han, Weidong wrote: - Add dummy driver to hide/unbind passthrough device from host kernel Maybe this can be implemented at the modprobe/hotplug level. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
On Wednesday 24 September 2008 16:34:22 Avi Kivity wrote: Han, Weidong wrote: Hi all, The initial passthrough/VT-d patches have been in kvm, it's time to enhance it, and push them into 2.6.28. - Shared Interrupt support Shared guest interrupts is a prerequisite for merging into mainline. Without this, device assignment is useless in anything but a benchmark scenario. I won't push device assignment for 2.6.28 without it. Shared host interrupts are a different matter; which one did you mean? Got confused... I think we are talking about share host interrupts, that is pre-assigned device shared IRQ with other devices. Why share guest interrupts is a prerequisite... -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
On Wednesday 24 September 2008 16:38:35 Avi Kivity wrote: Yang, Sheng wrote: - MSI support (WIP) - MTRR/PAT support of EPT (WIP) - MTRR/PAT support of shadow (WIP) - Basic FLR support (WIP) Above four are my works. All of them work now. But more job should be done to polish the patches. And the main part of Function Level Reset would be picked by linux-pci. Another thing is we would send out/update above patches before Sept. 28, and hope they can picked by 2.6.28 merge window. Avi, what's your opinion? Of course we would work hard. :) But what's the deadline of merge window? No one knows, but it's very unlikely these features will make it for 2.6.28. To be merged, it is not sufficient for the patches to be ready. They have to undergo some testing in the field. .. - Shared Interrupt support I still don't know who would do this. It's very important for VT-d real usable. If nobody interested in it, I would pick it up, but after Oct. 6 (after National Holiday in China). Shared host interrupts? What's your plan here? The polarity trick? Yeah, share host interrupts. But haven't got the very clear idea yet. -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
Amit Shah wrote: I'd say we have about 3 weeks to get things in. How do you figure? 2.6.26 was released July 13, we're more than 2.5 months later. Furthermore, I'm not queueing untested patches for 2.6.28 at this time. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: Remaining passthrough/VT-d tasks list
Avi Kivity wrote: Han, Weidong wrote: - Add dummy driver to hide/unbind passthrough device from host kernel Maybe this can be implemented at the modprobe/hotplug level. I think so. Randy (Weidong) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
Han, Weidong wrote: We had some discussion on this few months ago. Currently, users need to remove device driver before assignment. If there are more than one same type devices, removing driver makes them cannot work at the same time, even though user just want to assign one of them to guest. Note that not all drivers support unbind function. If we can provide a mechanism to hide single device independently, e.g, implement a dummy driver to own devices that user want to assign to guest. I think it's more friendly to end user than remove/unbind driver manually. That's a good point -- multiple devices with the same driver. We may need a kernel parameter as well, for built-in drivers. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
Yang, Sheng wrote: Shared guest interrupts is a prerequisite for merging into mainline. Without this, device assignment is useless in anything but a benchmark scenario. I won't push device assignment for 2.6.28 without it. Shared host interrupts are a different matter; which one did you mean? Got confused... I think we are talking about share host interrupts, that is pre-assigned device shared IRQ with other devices. Why share guest interrupts is a prerequisite... We only have three pci interrupts at this point (though this could be easily extended); if you start the guest with a non-trivial number of devices, you will have shared guest interrupts. (of course, when I pointed this out during review, people said it could be done later, then forgot all about it) -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] VT-d: remove useless header inclusion
Han, Weidong wrote: Currently #include linux/intel-iommu.h is not needed in virt/kvm/kvm_main.c. What's more, this inclusion may result in compilation error in other architecture. Applied, but please also fix intel-iommu.h to compile on all archs. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Implement an fd pool to get real AIO with posix-aio
Anthony Liguori wrote: This patch implements a simple fd pool to allow many AIO requests with posix-aio. The result is significantly improved performance (identical to that reported for linux-aio) for both cache=on and cache=off. The fundamental problem with posix-aio is that it limits itself to one thread per-file descriptor. I don't know why this is, but this patch provides a simple mechanism to work around this (duplicating the file descriptor). This isn't a great solution, but it seems like a reasonable intermediate step between posix-aio and a custom thread-pool to replace it. +static int raw_fd_pool_get(BDRVRawState *s) +{ +int i; + +for (i = 0; i < RAW_FD_POOL_SIZE; i++) { +/* already in use */ +if (s->fd_pool[i] != -1) +continue; + +/* try to dup file descriptor */ +s->fd_pool[i] = dup(s->fd); +if (s->fd_pool[i] != -1) +return s->fd_pool[i]; +} + +/* we couldn't dup the file descriptor so just use the main one */ +return s->fd; +} + dup()ing the fd on each request is unnecessary work; would be better to cache the duped fd. Of course, if this is just a stepping stone, it doesn't matter very much. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
On Wednesday 24 September 2008 16:53:15 Avi Kivity wrote: Yang, Sheng wrote: Shared guest interrupts is a prerequisite for merging into mainline. Without this, device assignment is useless in anything but a benchmark scenario. I won't push device assignment for 2.6.28 without it. Shared host interrupts are a different matter; which one did you mean? Got confused... I think we are talking about share host interrupts, that is pre-assigned device shared IRQ with other devices. Why share guest interrupts is a prerequisite... We only have three pci interrupts at this point (though this could be easily extended); if you start the guest with a non-trivial number of devices, you will have shared guest interrupts. (of course, when I pointed this out during review, people said it could be done later, then forgot all about it) .. I think it's a performance issue, not break it? How about do it like Xen side? Try best to avoid the share, extended the pci interrupts, improve hash algorithm. Is there anything else we can do? -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH] VT-d: remove useless header inclusion
Avi Kivity wrote: Han, Weidong wrote: Currently #include <linux/intel-iommu.h> is not needed in virt/kvm/kvm_main.c. What's more, this inclusion may result in compilation error in other architecture. Applied, but please also fix intel-iommu.h to compile on all archs. Avi, The current intel-iommu.h should compile on all archs. On linux-next, they moved the __iommu_clflush_cache() definition to intel-iommu.h, which makes it fail to compile on some archs, such as IA64. Randy (Weidong) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
Han, Weidong wrote: Avi Kivity wrote: Han, Weidong wrote: - Add dummy driver to hide/unbind passthrough device from host kernel Maybe this can be implemented at the modprobe/hotplug level. I think so. I'm not sure now -- after I saw the point about a driver binding to two devices. Perhaps the deeper fix is to separate driver loading from binding to devices (or maybe it's separated already, but not exposed)? -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] VT-d: remove useless header inclusion
Han, Weidong wrote: Avi Kivity wrote: Han, Weidong wrote: Currently #include linux/intel-iommu.h is not needed in virt/kvm/kvm_main.c. What's more, this inclusion may result in compilation error in other architecture. Applied, but please also fix intel-iommu.h to compile on all archs. Avi, Current intel-iommu.h should be compiled on all archs. On linux-next, they moved __iommu_clflush_cache() definition to intel-iomm.h, which results in it cannot pass compilation on some archs, such as IA64. Well, it still wants fixing, even if it is in linux-next only. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch] cap code_gen_buffer_size on ia64
Hi, This one limits the code_gen_buffer_size on ia64, phys_mem_size/4 really gets out of hand when you boot say a 64GB guest. Cheers, Jes Cap code_gen_buffer_size on ia64 - it quickly goes out of hand otherwise when booting larger guests. Signed-off-by: Jes Sorensen [EMAIL PROTECTED] --- qemu/exec.c | 4 ++++ 1 file changed, 4 insertions(+) Index: kvm-userspace.git/qemu/exec.c === --- kvm-userspace.git.orig/qemu/exec.c +++ kvm-userspace.git/qemu/exec.c @@ -443,6 +443,10 @@ start = (void *) 0x6000UL; if (code_gen_buffer_size > (512 * 1024 * 1024)) code_gen_buffer_size = (512 * 1024 * 1024); +#elif defined(__ia64__) + /* cap the mapping, don't want it totally out of hand */ +if (code_gen_buffer_size > (512 * 1024 * 1024)) +code_gen_buffer_size = (512 * 1024 * 1024); #endif code_gen_buffer = mmap(start, code_gen_buffer_size, PROT_WRITE | PROT_READ | PROT_EXEC,
Re: Remaining passthrough/VT-d tasks list
Yang, Sheng wrote: We only have three pci interrupts at this point (though this could be easily extended); if you start the guest with a non-trivial number of devices, you will have shared guest interrupts. (of course, when I pointed this out during review, people said it could be done later, then forgot all about it) . I think it's a performance issue, not break it? How about do it like Xen side? Try best to avoid the share, extended the pci interrupts, improve hash algorithm. Is there anything else we can do? Two separate issues: 1. only three guest pci interrupts That's a performance issue, not correctness. can be fixed by using gsi 16-23 in APIC mode, and by adding another IOAPIC (so we can use gsi 16-47). Anthony Xu posted some patches for this, not sure where this stands, but it was the right approach. 2. shared guest pci interrupts That's a correctness issue. No matter how many interrupts we have, we may have sharing issues. Of course with only three the issue is very pressing since we will get sharing with just a few devices. Currently if two assigned devices share a guest interrupts, or if an emulated device shares an interrupt with an assigned device, things will break. They need to be fixed independently. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] remove kvm_init_ap from qemu code.
Glauber Costa wrote: Call it as a special case for cpu 0 creation. This removes a piece of kvm code from raw qemu. +void kvm_init_new_ap(int cpu, CPUState *env) +{ +if (!cpu) +kvm_init_ap(); + +pthread_create(&vcpu_info[cpu].thread, NULL, ap_main_loop, env); + +while (vcpu_info[cpu].created == 0) + qemu_cond_wait(&qemu_vcpu_cond); +} + kvm_init_ap() is machine-level initialization. It's hacky to call it from cpu-level initialization. Do we have a machine-level initialization hook? [btw, !x makes sense when x is a boolean, pointer, or count. Then it means there is no x or there are no xs. But when x is an index, !x means x equals 0, so you may as well write that. and don't get me started on !strcmp()] -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
On Wednesday 24 September 2008 17:22:53 Avi Kivity wrote: Yang, Sheng wrote: We only have three pci interrupts at this point (though this could be easily extended); if you start the guest with a non-trivial number of devices, you will have shared guest interrupts. (of course, when I pointed this out during review, people said it could be done later, then forgot all about it) . I think it's a performance issue, not break it? How about do it like Xen side? Try best to avoid the share, extended the pci interrupts, improve hash algorithm. Is there anything else we can do? Two separate issues: 1. only three guest pci interrupts That's a performance issue, not correctness. can be fixed by using gsi 16-23 in APIC mode, and by adding another IOAPIC (so we can use gsi 16-47). Anthony Xu posted some patches for this, not sure where this stands, but it was the right approach. 2. shared guest pci interrupts That's a correctness issue. No matter how many interrupts we have, we may have sharing issues. Of course with only three the issue is very pressing since we will get sharing with just a few devices. Currently if two assigned devices share a guest interrupts, or if an emulated device shares an interrupt with an assigned device, things will break. They need to be fixed independently. About the second issue, I don't understand how it would break... Would you please give more details on this? It's a QEmu bug or IOAPIC bug? -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
* On Wednesday 24 Sep 2008 14:08:14 Han, Weidong wrote: Amit Shah wrote: * On Wednesday 24 Sep 2008 13:21:25 Han, Weidong wrote: Amit Shah wrote: - Add dummy driver to hide/unbind passthrough device from host kernel This isn't needed; we currently don't assign the device to the guest if we find that a driver is already loaded. I intend to change it to failing guest start altogether in case we find a module already using a device. When a guest exits, we release all the structures and hence even unloading kvm is not needed to reclaim the device on the host side. This task needn't targe 2.6.28. For long term, we need it to make device assignment more user friendly. How is the current scheme not user friendly? Or, how will adding a dummy driver be more user friendly? We had some discussion on this few months ago. Currently, users need to remove device driver before assignment. If there are more than one same type devices, removing driver makes them cannot work at the same time, even though user just want to assign one of them to guest. Note that not all drivers support unbind function. If we can provide a mechanism to hide single device independently, e.g, implement a dummy driver to own devices that user want to assign to guest. I think it's more friendly to end user than remove/unbind driver manually. This needs a change in the driver core and it definitely won't be solved by having a dummy device. We have to have a way to signal to modules that a particular device will now be owned by a different module, even if the current module thinks it is the sole owner. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
* On Wednesday 24 Sep 2008 14:16:47 Avi Kivity wrote: Amit Shah wrote: I'd say we have about 3 weeks to get things in. How do you figure? 2.6.26 was released July 13, we're more than 2.5 months later. A week for 2.6.28 to open and two weeks for the rc1 window. Furthermore, I'm not queueing untested patches for 2.6.28 at this time. Of course, I'm not advocating this! If they're tested by Intel, we can push them in. Amit -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/9] kvm-x86: Enable NMI Watchdog via in-kernel PIT source
On Tuesday 23 September 2008 23:04:48 Jan Kiszka wrote: Yang, Sheng wrote: On Friday 19 September 2008 20:03:02 Jan Kiszka wrote: LINT0 of the LAPIC can be used to route PIT events as NMI watchdog ticks into the guest. This patch aligns the in-kernel irqchip emulation with the user space irqchip with already supports this feature. The trick is to route PIT interrupts to all LAPIC's LVT0 lines. Rebased patch and slightly polished patch originally posted by Sheng Yang. Signed-off-by: Sheng Yang [EMAIL PROTECTED] Thanks for pick up this patch again! Have you test some Windows guest with this watchdog? Last time I dropped it because it cause BSOD on some version of Windows(IRQ_NOT_EQUAL_OR_LESS). I don't remember the exactly situation there, but you may have a try. Not yet. I always tell my colleagues that I don't need Windows on my desktop, I just need a few VM images - for testing... :) I will try to dig out / generate some image and reproduce the issue you and Gleb see. Hope it will trigger here as well. Anything special required to make Windows use the NMI as watchdog? I don't know if Windows use NMI watchdog. In fact, my original patch just cause Windows BSOD, and I think Windows don't use it(Linux NMI watchdog mechanism is a little tricky one)... -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
Amit Shah wrote: * On Wednesday 24 Sep 2008 14:16:47 Avi Kivity wrote: Amit Shah wrote: I'd say we have about 3 weeks to get things in. How do you figure? 2.6.26 was released July 13, we're more than 2.5 months later. A week for 2.6.28 to open and two weeks for the rc1 window. Furthermore, I'm not queueing untested patches for 2.6.28 at this time. Of course, I'm not advocating this! If they're tested by Intel, we can push them in. No, the patches have to be in my tree some time before the merge window opens. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch] cap code_gen_buffer_size on ia64
Jes Sorensen wrote: Hi, This one limits the code_gen_buffer_size on ia64, phys_mem_size/4 really gets out of hand when you boot say a 64GB guest. ia64 doesn't codegen; why not set it to zero? (and the phys_ram_size / 4 heuristic is ridiculous; code size doesn't scale with guest size) -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch] cap code_gen_buffer_size on ia64
Avi Kivity wrote: ia64 doesn't codegen; why not set it to zero? (and the phys_ram_size / 4 heuristic is ridiculous; code size doesn't scale with guest size) That works too - I didn't really know this part too well, but I hit the problem that I was unable to allocate the space because of 64 bit issues. I'll whip up a patch to disable it for ia64. Cheers, Jes -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/5] bios: 4G updates
Alex Williamson wrote: As requested, here's the follow-on to the 4G MTRR changes split into functional bits. [1/5] Rename variables to reflect what they're really reporting [2/5] Reformat ram_probe() to match the rest of the code [3/5] Add SMBIOS info for memory above 4G [4/5] Fix the SMBIOS type 19 20 range end address [5/5] Optional - switch default MTRR type to WB and only cover MMIO I've taken some liberties renaming and reformatting, if we'd rather not introduce too many extraneous changes, I can drop those. The SMBIOS changes seem to work up to 32767MB, then we hit a limitation in the type 17 table only providing 15bits for the size. We might need to describe multiple virtual DIMMs to get around that, but it's a separate issue. The final patch is optional and switches over to make the variable MTRRs only describe the MMIO hole, leaving the rest of the address space default to WB. I can't say I fully understand the implications of hotplug memory for this scenario. Let me know if there are comments. The patches all look good, however renaming and reformatting will lead to merge headaches later on. We haven't been good at working with bochs bios upstream. Can you peek in bochs upstream and see if it's worth merging? If not, I'll just merge these patches. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/9] coalesce mmio regions with an explicit call
Glauber Costa wrote: You can't coalesce the registers which trigger device action. You'll destroy latency and/or functionality. which kills the goal of getting rid of explicit kvm code. It's a fact that coalescing helps kvm but not qemu. So maybe the solution here is to add calls in qemu to a memory coalescing function that in the raw qemu / kqemu case just don't do anything? That's just word games. s/kvm/qemu/ won't change the fact that this is a kvm specific hook. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch] do not allocate code_gen buffer on ia64
Avi Kivity wrote: Jes Sorensen wrote: Hi, This one limits the code_gen_buffer_size on ia64, phys_mem_size/4 really gets out of hand when you boot say a 64GB guest. ia64 doesn't codegen; why not set it to zero? How about this one then? Jes Do not allocate a code_gen buffer on ia64 given it doesn't support code generation. Signed-off-by: Jes Sorensen [EMAIL PROTECTED] --- qemu/exec.c |4 1 file changed, 4 insertions(+) Index: kvm-userspace.git/qemu/exec.c === --- kvm-userspace.git.orig/qemu/exec.c +++ kvm-userspace.git/qemu/exec.c @@ -407,6 +407,10 @@ static void code_gen_alloc(unsigned long tb_size) { +#ifdef TARGET_IA64 + return; +#endif + #ifdef USE_STATIC_CODE_GEN_BUFFER code_gen_buffer = static_code_gen_buffer; code_gen_buffer_size = DEFAULT_CODE_GEN_BUFFER_SIZE;
Re: [patch] do not allocate code_gen buffer on ia64
Jes Sorensen wrote: Avi Kivity wrote: Jes Sorensen wrote: Hi, This one limits the code_gen_buffer_size on ia64, phys_mem_size/4 really gets out of hand when you boot say a 64GB guest. ia64 doesn't codegen; why not set it to zero? How about this one then? Applied, thanks. Note qemu uses 4 spaces for indentation. Talk to your editor. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch] do not allocate code_gen buffer on ia64
Avi Kivity wrote: Applied, thanks. Note qemu uses 4 spaces for indentation. Talk to your editor. Even when it's a double indentation, i.e. 8 spaces? That's just plain sicko :-( Jes -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM: PIC: enhance IPI avoidance
Marcelo Tosatti wrote: KVM: PIC: enhance IPI avoidance The PIC code makes little effort to avoid kvm_vcpu_kick(), resulting in unnecessary guest exits in some conditions. For example, if the timer interrupt is routed through the IOAPIC, IRR for IRQ 0 will get set but not cleared, since the APIC is handling the acks. This means that everytime an interrupt 16 is triggered, the priority logic will find IRQ0 pending and send an IPI to vcpu0 (in case IRQ0 is not masked, which is Linux's case). Introduce a new variable isr_ack to represent the IRQ's for which the guest has been signalled / cleared the ISR. Use it to avoid more than one IPI per trigger-ack cycle, in addition to the avoidance when ISR is set in get_priority(). Signed-off-by: Marcelo Tosatti [EMAIL PROTECTED] struct kvm_pic *kvm_create_pic(struct kvm *kvm) Index: kvm/arch/x86/kvm/irq.h === --- kvm.orig/arch/x86/kvm/irq.h +++ kvm/arch/x86/kvm/irq.h @@ -42,6 +42,7 @@ struct kvm_kpic_state { u8 irr; /* interrupt request register */ u8 imr; /* interrupt mask register */ u8 isr; /* interrupt service register */ + u8 isr_ack; /* interrupt ack detection */ u8 priority_add;/* highest irq priority */ u8 irq_base; u8 read_reg_select; Needs to be cleared by reset and by register load from userspace, no? -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: Remaining passthrough/VT-d tasks list
Amit Shah wrote: * On Wednesday 24 Sep 2008 14:08:14 Han, Weidong wrote: Amit Shah wrote: * On Wednesday 24 Sep 2008 13:21:25 Han, Weidong wrote: Amit Shah wrote: - Add dummy driver to hide/unbind passthrough device from host kernel This isn't needed; we currently don't assign the device to the guest if we find that a driver is already loaded. I intend to change it to failing guest start altogether in case we find a module already using a device. When a guest exits, we release all the structures and hence even unloading kvm is not needed to reclaim the device on the host side. This task needn't targe 2.6.28. For long term, we need it to make device assignment more user friendly. How is the current scheme not user friendly? Or, how will adding a dummy driver be more user friendly? We had some discussion on this few months ago. Currently, users need to remove device driver before assignment. If there are more than one same type devices, removing driver makes them cannot work at the same time, even though user just want to assign one of them to guest. Note that not all drivers support unbind function. If we can provide a mechanism to hide single device independently, e.g, implement a dummy driver to own devices that user want to assign to guest. I think it's more friendly to end user than remove/unbind driver manually. This needs a change in the driver core and it definitely won't be solved by having a dummy device. We have to have a way to signal to modules that a particular device will now be owned by a different module, even if the current module thinks it is the sole owner. The assigned devices are only owned by the dummy driver. Like Xen, pciback owns the assignable devices via adding option 'pciback.hide=(bus:dev:func)' in grub, that means device(bus:dev:func) driver won't be loaded. Then user can assign these hidden devices. 
Randy (Weidong) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/11] VMX: work around lacking VNMI support
Yang, Sheng wrote: On Tuesday 23 September 2008 17:45:44 Gleb Natapov wrote: On Tue, Sep 23, 2008 at 05:42:02PM +0800, Yang, Sheng wrote: That is exactly what I am using. Run it with SMP hal and do hibernate. Oh... Finally found how to enable that hibernate option And this hibernate works on my virtual_nmi supported box, with smp hal and 2 cpus. However, for this hibernate won't success if there is no NMI support, maybe we can say it's not a regression... I am not saying it's a regression, but it would be nice to have it working :) Yeah, of course. :) OK, I've a 2003 server up and running now, I'm able to reproduce kvm_handle_exit: unexpected, valid vectoring info and exit reason is 0x9 but not via hibernate (it suspends and powers off normally, but then hangs after resume), rather by manually injecting an NMI on CPU0. After Windows' graphical installation phase I had a hanging guest. At the same time I got kvm_handle_exit: unexpected, valid vectoring info and exit reason is 0x9 kvm_handle_exit: Breaking out of NMI-blocked state on VCPU 0 after 1 s timeout in the kernel log as well. Something is borken. Will retest the installation with vanilla KVM. Anyone any ideas on the task switch thing? Just a false positive or an indication for the real problem in that domain? Jan -- Siemens AG, Corporate Technology, CT SE 2 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/11] VMX: work around lacking VNMI support
Jan Kiszka wrote: After Windows' graphical installation phase I had a hanging guest. At the same time I got kvm_handle_exit: unexpected, valid vectoring info and exit reason is 0x9 kvm_handle_exit: Breaking out of NMI-blocked state on VCPU 0 after 1 s timeout in the kernel log as well. Something is borken. Will retest the installation with vanilla KVM. Anyone any ideas on the task switch thing? Just a false positive or an indication for the real problem in that domain? Check the descriptor type for the NMI vector, that should show if it's real or not. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Guest crash with 2.6.27-rc6 (a different one)
Dan Smith wrote: Hi, After hitting the pvclock-related issue, I recompiled my 2.6.27-rc6 kernel without CONFIG_KVM_CLOCK. It stays up far longer, but I see the following guest crash when I stress it (with a source build): Are you using 4K stacks? If so, please try with 8K stacks and report. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/11] VMX: work around lacking VNMI support
On Wed, Sep 24, 2008 at 02:40:18PM +0200, Jan Kiszka wrote: Yang, Sheng wrote: On Tuesday 23 September 2008 17:45:44 Gleb Natapov wrote: On Tue, Sep 23, 2008 at 05:42:02PM +0800, Yang, Sheng wrote: That is exactly what I am using. Run it with SMP hal and do hibernate. Oh... Finally found how to enable that hibernate option And this hibernate works on my virtual_nmi supported box, with smp hal and 2 cpus. However, for this hibernate won't success if there is no NMI support, maybe we can say it's not a regression... I am not saying it's a regression, but it would be nice to have it working :) Yeah, of course. :) OK, I've a 2003 server up and running now, I'm able to reproduce kvm_handle_exit: unexpected, valid vectoring info and exit reason is 0x9 but not via hibernate (it suspends and powers off normally, but then hangs after resume), rather by manually injecting an NMI on CPU0. I found out today that on regular windows 2003 the problem does not exist (on hibernate at least). The image I have was used to run WLK tests (windows logo kit) and this kit changes something in windows kernel to do additional stuff after hibernation and that is where we crash. After Windows' graphical installation phase I had a hanging guest. At the same time I got kvm_handle_exit: unexpected, valid vectoring info and exit reason is 0x9 kvm_handle_exit: Breaking out of NMI-blocked state on VCPU 0 after 1 s timeout in the kernel log as well. Something is borken. Will retest the installation with vanilla KVM. Anyone any ideas on the task switch thing? Just a false positive or an indication for the real problem in that domain? Nothing is broken IMO. The IDT entry for NMI is set up as task gate so we get a task switch exit after NMI injection. 
We should do something like this: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 046a91b..860e66d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2826,10 +2826,20 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) unsigned long exit_qualification; u16 tss_selector; int reason; + struct vcpu_vmx *vmx = to_vmx(vcpu); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); reason = (u32)exit_qualification >> 30; + + if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected && + (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK) == INTR_TYPE_NMI_INTR) { + vcpu->arch.nmi_injected = false; + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + printk(KERN_DEBUG "NMI cause task switch. No need to reinject\n"); + } tss_selector = exit_qualification; return kvm_task_switch(vcpu, tss_selector, reason); @@ -3002,7 +3012,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason != EXIT_REASON_EXCEPTION_NMI && - exit_reason != EXIT_REASON_EPT_VIOLATION)) + exit_reason != EXIT_REASON_EPT_VIOLATION && + exit_reason != EXIT_REASON_TASK_SWITCH)) printk(KERN_WARNING "%s: unexpected, valid vectoring info and exit reason is 0x%x\n", __func__, exit_reason); if (exit_reason < kvm_vmx_max_exit_handlers -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch] do not allocate code_gen buffer on ia64
Avi Kivity wrote: Jes Sorensen wrote: Avi Kivity wrote: Applied, thanks. Note qemu uses 4 spaces for indentation. Talk to your editor. Even when it's a double indentation, i.e. 8 spaces? Yes. :-( Well then we really should add something like this to every file in qemu, since this behavior is so non-standard compared to what any normal editor does by default. /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */ That's just plain sicko :-( It's the only scheme that works 100% reliably. Ehm, maybe, but then that makes 8 space (tab) indentation work 110% reliably :-) Cheers, Jes -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/11] VMX: work around lacking VNMI support
Gleb Natapov wrote: On Wed, Sep 24, 2008 at 02:40:18PM +0200, Jan Kiszka wrote: Yang, Sheng wrote: On Tuesday 23 September 2008 17:45:44 Gleb Natapov wrote: On Tue, Sep 23, 2008 at 05:42:02PM +0800, Yang, Sheng wrote: That is exactly what I am using. Run it with SMP hal and do hibernate. Oh... Finally found how to enable that hibernate option And this hibernate works on my virtual_nmi supported box, with smp hal and 2 cpus. However, for this hibernate won't success if there is no NMI support, maybe we can say it's not a regression... I am not saying it's a regression, but it would be nice to have it working :) Yeah, of course. :) OK, I've a 2003 server up and running now, I'm able to reproduce kvm_handle_exit: unexpected, valid vectoring info and exit reason is 0x9 but not via hibernate (it suspends and powers off normally, but then hangs after resume), rather by manually injecting an NMI on CPU0. I found out today that on regular windows 2003 the problem does not exist (on hibernate at least). The image I have was used to run WLK tests (windows logo kit) and this kit changes something in windows kernel to do additional stuff after hibernation and that is where we crash. Ahh! After Windows' graphical installation phase I had a hanging guest. At the same time I got kvm_handle_exit: unexpected, valid vectoring info and exit reason is 0x9 kvm_handle_exit: Breaking out of NMI-blocked state on VCPU 0 after 1 s timeout in the kernel log as well. Something is borken. Will retest the installation with vanilla KVM. Anyone any ideas on the task switch thing? Just a false positive or an indication for the real problem in that domain? Nothing is broken IMO. The IDT entry for NMI is set up as task gate so we get a task switch exit after NMI injection. Yes, that's what I see here now as well. 
We should do something like this: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 046a91b..860e66d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2826,10 +2826,20 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) unsigned long exit_qualification; u16 tss_selector; int reason; + struct vcpu_vmx *vmx = to_vmx(vcpu); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); reason = (u32)exit_qualification 30; + + if (reason == TASK_SWITCH_GATE vmx-vcpu.arch.nmi_injected + (vmx-idt_vectoring_info VECTORING_INFO_VALID_MASK) + (vmx-idt_vectoring_info VECTORING_INFO_TYPE_MASK) == INTR_TYPE_NMI_INTR) { + vcpu-arch.nmi_injected = false; + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + printk(KERN_DEBUGNMI cause task switch. No need to reinject\n); + } OK, I just think we are not supposed to set GUEST_INTR_STATE_NMI without cpu_has_virtual_nmis(). Otherwise looks reasonable. Have you tested this? Does it make your 2003 power-off? tss_selector = exit_qualification; return kvm_task_switch(vcpu, tss_selector, reason); @@ -3002,7 +3012,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if ((vectoring_info VECTORING_INFO_VALID_MASK) (exit_reason != EXIT_REASON_EXCEPTION_NMI - exit_reason != EXIT_REASON_EPT_VIOLATION)) + exit_reason != EXIT_REASON_EPT_VIOLATION + exit_reason != EXIT_REASON_TASK_SWITCH)) printk(KERN_WARNING %s: unexpected, valid vectoring info and exit reason is 0x%x\n, __func__, exit_reason); Dumping the vectoring info here as well would have accelerated the debugging. I think we should add this. if (exit_reason kvm_vmx_max_exit_handlers -- Gleb. Jan -- Siemens AG, Corporate Technology, CT SE 2 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/11] VMX: work around lacking VNMI support
On Wed, Sep 24, 2008 at 02:56:40PM +0200, Jan Kiszka wrote: We should do something like this: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 046a91b..860e66d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2826,10 +2826,20 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) unsigned long exit_qualification; u16 tss_selector; int reason; + struct vcpu_vmx *vmx = to_vmx(vcpu); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); reason = (u32)exit_qualification >> 30; + + if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected && + (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK) == INTR_TYPE_NMI_INTR) { + vcpu->arch.nmi_injected = false; + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + printk(KERN_DEBUG "NMI cause task switch. No need to reinject\n"); + } OK, I just think we are not supposed to set GUEST_INTR_STATE_NMI without cpu_has_virtual_nmis(). Otherwise looks reasonable. Have you tested this? Does it make your 2003 power-off? It does power-off, but hangs during reboot. Looking at it right now. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/11] VMX: work around lacking VNMI support
Gleb Natapov wrote: On Wed, Sep 24, 2008 at 02:56:40PM +0200, Jan Kiszka wrote: We should do something like this: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 046a91b..860e66d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2826,10 +2826,20 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) unsigned long exit_qualification; u16 tss_selector; int reason; + struct vcpu_vmx *vmx = to_vmx(vcpu); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); reason = (u32)exit_qualification 30; + + if (reason == TASK_SWITCH_GATE vmx-vcpu.arch.nmi_injected + (vmx-idt_vectoring_info VECTORING_INFO_VALID_MASK) + (vmx-idt_vectoring_info VECTORING_INFO_TYPE_MASK) == INTR_TYPE_NMI_INTR) { + vcpu-arch.nmi_injected = false; + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + printk(KERN_DEBUGNMI cause task switch. No need to reinject\n); + } OK, I just think we are not supposed to set GUEST_INTR_STATE_NMI without cpu_has_virtual_nmis(). Otherwise looks reasonable. Have you tested this? Does it make your 2003 power-off? It does power-off, but hands during reboot. Looking at it right now. After completing the RAM read-back from disk? This is where it hangs here. I also briefly played with an XP image, and that one even hard-rebooted at that point. Jan -- Siemens AG, Corporate Technology, CT SE 2 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/11] VMX: work around lacking VNMI support
On Wed, Sep 24, 2008 at 03:11:36PM +0200, Jan Kiszka wrote: OK, I just think we are not supposed to set GUEST_INTR_STATE_NMI without cpu_has_virtual_nmis(). Otherwise looks reasonable. Have you tested this? Does it make your 2003 power-off? It does power-off, but hangs during reboot. Looking at it right now. Yeah, with your patch I'm getting a totally different Blue Screen on spurious manual NMI injection: Hardware Malfunction Call your hardware vendor for support 8) Cool! -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/11] VMX: work around lacking VNMI support
On Wed, Sep 24, 2008 at 04:17:00PM +0300, Gleb Natapov wrote: On Wed, Sep 24, 2008 at 03:11:36PM +0200, Jan Kiszka wrote: OK, I just think we are not supposed to set GUEST_INTR_STATE_NMI without cpu_has_virtual_nmis(). Otherwise looks reasonable. Have you tested this? Does it make your 2003 power-off? It does power-off, but hangs during reboot. Looking at it right now. Yeah, with your patch I'm getting a totally different Blue Screen on spurious manual NMI injection: Hardware Malfunction Call your hardware vendor for support 8) Cool! BTW, this is also needed in case the CPU was reset before it had a chance to handle the NMI: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5927b79..106e16d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2234,6 +2234,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) } vmx->vcpu.arch.rmode.active = 0; + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = false; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/11] VMX: work around lacking VNMI support
On Wed, Sep 24, 2008 at 03:08:04PM +0200, Jan Kiszka wrote: It does power-off, but hangs during reboot. Looking at it right now. After completing the RAM read-back from disk? This is where it hangs Much earlier. BIOS hangs because CPU1 ignores SIPI. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/11] VMX: work around lacking VNMI support
Gleb Natapov wrote: On Wed, Sep 24, 2008 at 03:08:04PM +0200, Jan Kiszka wrote: It does power-off, but hands during reboot. Looking at it right now. After completing the RAM read-back from disk? This is where it hangs Much earlier. BIOS hangs because CPU1 ignores SIPI. That sounds like an APIC state reset issue. BTW, I'm getting tons of Ignoring de-assert INIT to vcpu 0 Ignoring de-assert INIT to vcpu 1 Ignoring de-assert INIT to vcpu 0 Ignoring de-assert INIT to vcpu 1 Ignoring de-assert INIT to vcpu 1 SIPI to vcpu 1 vector 0x24 vcpu 1 received sipi with vector # 24 SIPI to vcpu 1 vector 0x24 SIPI to vcpu 1 vector 0x24 Ignoring de-assert INIT to vcpu 0 Ignoring de-assert INIT to vcpu 1 SIPI to vcpu 1 vector 0x24 on my box with SMP guests (Linux and Windows). Do they all point to yet incomplete emulations, or are they just far too verbose? Jan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/11] VMX: work around lacking VNMI support
On Wed, Sep 24, 2008 at 03:33:13PM +0200, Jan Kiszka wrote: Gleb Natapov wrote: On Wed, Sep 24, 2008 at 03:08:04PM +0200, Jan Kiszka wrote: It does power-off, but hands during reboot. Looking at it right now. After completing the RAM read-back from disk? This is where it hangs Much earlier. BIOS hangs because CPU1 ignores SIPI. That sounds like an APIC state reset issue. BTW, I'm getting tons of Ignoring de-assert INIT to vcpu 0 Ignoring de-assert INIT to vcpu 1 Ignoring de-assert INIT to vcpu 0 Ignoring de-assert INIT to vcpu 1 Ignoring de-assert INIT to vcpu 1 SIPI to vcpu 1 vector 0x24 vcpu 1 received sipi with vector # 24 SIPI to vcpu 1 vector 0x24 SIPI to vcpu 1 vector 0x24 Ignoring de-assert INIT to vcpu 0 Ignoring de-assert INIT to vcpu 1 SIPI to vcpu 1 vector 0x24 on my box with SMP guests (Linux and Windows). Do they all point to yet incomplete emulations, or are they just far too verbose? Not sure about de-assert messages but others are fine. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/11] VMX: work around lacking VNMI support
Jan Kiszka wrote: Gleb Natapov wrote: On Wed, Sep 24, 2008 at 03:08:04PM +0200, Jan Kiszka wrote: It does power-off, but hands during reboot. Looking at it right now. After completing the RAM read-back from disk? This is where it hangs Much earlier. BIOS hangs because CPU1 ignores SIPI. That sounds like an APIC state reset issue. BTW, I'm getting tons of Ignoring de-assert INIT to vcpu 0 Ignoring de-assert INIT to vcpu 1 Ignoring de-assert INIT to vcpu 0 Ignoring de-assert INIT to vcpu 1 Ignoring de-assert INIT to vcpu 1 SIPI to vcpu 1 vector 0x24 vcpu 1 received sipi with vector # 24 SIPI to vcpu 1 vector 0x24 SIPI to vcpu 1 vector 0x24 Ignoring de-assert INIT to vcpu 0 Ignoring de-assert INIT to vcpu 1 SIPI to vcpu 1 vector 0x24 on my box with SMP guests (Linux and Windows). Do they all point to yet incomplete emulations, or are they just far too verbose? 'Ignoring' sounds incomplete, but there's no real need to print it. The rest is just noise. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/11] VMX: work around lacking VNMI support
Gleb Natapov wrote: On Wed, Sep 24, 2008 at 04:17:00PM +0300, Gleb Natapov wrote: On Wed, Sep 24, 2008 at 03:11:36PM +0200, Jan Kiszka wrote: OK, I just think we are not supposed to set GUEST_INTR_STATE_NMI without cpu_has_virtual_nmis(). Otherwise looks reasonable. Have you tested this? Does it make your 2003 power-off? It does power-off, but hands during reboot. Looking at it right now. Yeah, with your patch I'm getting a totally different Blue Screen on spurious manual NMI injection: Hardware Malfunction Call your hardware vendor for support 8) Cool! BTW I have this is also needed in case CPU was reseted before it had a chance to handle NMI: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5927b79..106e16d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2234,6 +2234,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) } vmx-vcpu.arch.rmode.active = 0; + vcpu-arch.nmi_pending = false; + vcpu-arch.nmi_injected = false; vmx-vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(vmx-vcpu, 0); Good point. Same will be required for soft_vnmi_blocked. Will include this as well as your other patch in an update of my NMI series. Jan -- Siemens AG, Corporate Technology, CT SE 2 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/4] Allow enabling kvm_trace on external module
Eduardo Habkost wrote: This series adds compat code to allow enabling kvm_trace when building KVM as an external module on older kernels. The most hackish part is the last patch, that adds --with-kvm-trace to configure and adds an include to a generated file on kernel/x86/Kbuild. It doesn't look pretty, so suggestions on how to make this better are welcome. Applied all, thanks. We could improve Kbuild by having kernel/Makefile include config.mak and pass some variable to Kbuild somehow, but diving into Kbuild isn't my idea of a week well spent. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/4] relay_open() compat
Eduardo Habkost wrote: @@ -73,6 +73,8 @@ BEGIN { split(INIT_WORK tsc_khz desc_struct ldttss_desc64 desc_ptr \ } } +{ sub(/relay_open/, kvm_relay_open) } + I moved this bit into the compat_apis variable. Hope it still works. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/11] VMX: work around lacking VNMI support
Gleb Natapov wrote: On Wed, Sep 24, 2008 at 03:08:04PM +0200, Jan Kiszka wrote: It does power-off, but hands during reboot. Looking at it right now. After completing the RAM read-back from disk? This is where it hangs Much earlier. BIOS hangs because CPU1 ignores SIPI. Apropos APIC: I also have this with Win2003 in my kernel log: apic write: bad size=1 fee00030 Jan -- Siemens AG, Corporate Technology, CT SE 2 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Implement an fd pool to get real AIO with posix-aio
Avi Kivity wrote: Anthony Liguori wrote: dup()ing the fd on each request is unnecessary work; would be better to cache the duped fd. Yeah, I was concerned about this too. Ryan reran the fio benchmark and the submission latency and completion latency were identical to the linux-aio patches. That suggests that the overhead of dup() is lost in the noise. Since this is simpler and keeps the number of open file descriptors as low as possible, I was happy about that. Regards, Anthony Liguori Of course, if this is just a stepping stone, it doesn't matter very much. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/11] VMX: work around lacking VNMI support
Jan Kiszka wrote: Gleb Natapov wrote: On Wed, Sep 24, 2008 at 03:08:04PM +0200, Jan Kiszka wrote: It does power-off, but hands during reboot. Looking at it right now. After completing the RAM read-back from disk? This is where it hangs Much earlier. BIOS hangs because CPU1 ignores SIPI. Apropos APIC: I also have this with Win2003 in my kernel log: apic write: bad size=1 fee00030 Yes Windows ignores the specs here (which want 4-byte accesses). We should probably drop this printk as well. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM: PIC: enhance IPI avoidance
On Wed, Sep 24, 2008 at 03:19:47PM +0300, Avi Kivity wrote: Index: kvm/arch/x86/kvm/irq.h === --- kvm.orig/arch/x86/kvm/irq.h +++ kvm/arch/x86/kvm/irq.h @@ -42,6 +42,7 @@ struct kvm_kpic_state { u8 irr; /* interrupt request register */ u8 imr; /* interrupt mask register */ u8 isr; /* interrupt service register */ +u8 isr_ack; /* interrupt ack detection */ u8 priority_add;/* highest irq priority */ u8 irq_base; u8 read_reg_select; Needs to be cleared by reset @@ -213,6 +214,7 @@ void kvm_pic_reset(struct kvm_kpic_state s-irr = 0; s-imr = 0; s-isr = 0; + s-isr_ack = 0xff; s-priority_add = 0; s-irq_base = 0; s-read_reg_select = 0; and by register load from userspace, no? Isnt that responsability of the guest? Unacked IOAPIC interrupts are not cleared on register load, are they? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: Remaining passthrough/VT-d tasks list
Avi Kivity wrote: Amit Shah wrote: * On Wednesday 24 Sep 2008 14:16:47 Avi Kivity wrote: Amit Shah wrote: I'd say we have about 3 weeks to get things in. How do you figure? 2.6.26 was released July 13, we're more than 2.5 months later. A week for 2.6.28 to open and two weeks for the rc1 window. Furthermore, I'm not queueing untested patches for 2.6.28 at this time. Of course, I'm not advocating this! If they're tested by Intel, we can push them in. No, the patches have to be in my tree some time before the merge window opens. I agree patches need sufficient testing before merge to mainline. Anyway, let's try best to improve passthrough/VT-d code quality and make it stable asap. Randy (Weidong) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/11] VMX: work around lacking VNMI support
On Wed, Sep 24, 2008 at 04:02:36PM +0300, Gleb Natapov wrote: On Wed, Sep 24, 2008 at 02:56:40PM +0200, Jan Kiszka wrote: We should do something like this: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 046a91b..860e66d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2826,10 +2826,20 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) unsigned long exit_qualification; u16 tss_selector; int reason; + struct vcpu_vmx *vmx = to_vmx(vcpu); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); reason = (u32)exit_qualification 30; + + if (reason == TASK_SWITCH_GATE vmx-vcpu.arch.nmi_injected + (vmx-idt_vectoring_info VECTORING_INFO_VALID_MASK) + (vmx-idt_vectoring_info VECTORING_INFO_TYPE_MASK) == INTR_TYPE_NMI_INTR) { + vcpu-arch.nmi_injected = false; + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + printk(KERN_DEBUGNMI cause task switch. No need to reinject\n); + } OK, I just think we are not supposed to set GUEST_INTR_STATE_NMI without cpu_has_virtual_nmis(). Otherwise looks reasonable. Have you tested this? Does it make your 2003 power-off? It does power-off, but hangs during reboot. Looking at it right now. OK. The hang is a bug in qemu. Apic reset function marks CPU 1 as halted and the CPU never enters kernel again. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM: PIC: enhance IPI avoidance
Marcelo Tosatti wrote: and by register load from userspace, no? Isn't that the responsibility of the guest? I'm talking about a restore to previous state scenario. In this case we want to disable any IPI avoidance in case it avoids a needed IPI. Unacked IOAPIC interrupts are not cleared on register load, are they? Good question. I don't know if they should or shouldn't. But that's a different question. isr_ack is not guest visible, so nothing is lost from clearing it, but we can fail if we don't clear it. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/7] qemu: Include hw.h in qemu/hw/isa.h to fix compile issues
Amit Shah wrote: * On Tuesday 23 Sep 2008 21:43:44 Anthony Liguori wrote: Amit Shah wrote: Signed-off-by: Amit Shah [EMAIL PROTECTED] --- qemu/hw/isa.h |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/qemu/hw/isa.h b/qemu/hw/isa.h index 222e4f3..e4a1326 100644 --- a/qemu/hw/isa.h +++ b/qemu/hw/isa.h @@ -2,6 +2,8 @@ #define HW_ISA_H /* ISA bus */ +#include hw.h + extern target_phys_addr_t isa_mem_base; int register_ioport_read(int start, int length, int size, What compile issues? register_ioport_read* and register_ioport_write* functions cause a lot of this. You could also address this by including hw.h before including isa.h. Basically, everything should include qemu-common.h and anything that's implemented emulated hardware should include hw.h before including anything else. It's not perfect, but it's how things are right now. Regards, Anthony Liguori -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests
Amit Shah wrote: * On Tuesday 23 Sep 2008 22:00:32 Anthony Liguori wrote: Amit Shah wrote: diff --git a/qemu/Makefile.target b/qemu/Makefile.target index 72f3db8..40eb273 100644 --- a/qemu/Makefile.target +++ b/qemu/Makefile.target @@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o +OBJS+= device-assignment.o This needs to be conditional on at least linux hosts, but probably also kvm support. I didn't see any other file that's doing it. So I added this conditional in vl.c by having a #if defined(__linux__). That's how usb-linux.c does it as well. Is there a better way? aio and compatfd currently do it this way. block-raw-win32 and block-raw-posix are this way. We're slowly moving things away from #ifdef #else #endif to conditional compilation. Not the whole functionality needs kvm support. This should be able to work even without kvm (for example, when the guest is 1:1 mapped in the host address space). KVM is needed for interrupt remapping though. That's something I don't see happening for normal userspace any time soon. + /* FIXME: Add support for emulated MMIO for non-kvm guests */ + if (kvm_enabled()) { This doesn't work at all if kvm isn't enabled right? You should probably bail out in the init if kvm isn't enabled. If this whole file is included conditionally based on KVM support, then you don't have to worry about using kvm_enabled() guards to conditionally compile out code. Non-kvm support is currently broken and should be fixed, but that can happen after we get this merged. But it would take bouncing interrupts to userspace? I don't think that will ever happen upstream personally. At any rate, there's no point in even trying to support something like that until progress is made upstream on this front. 
I can temporarily add a check for kvm_enabled and bail out. + sprintf(dir, /sys/bus/pci/devices/:%02x:%02x.%x/, + r_bus, r_dev, r_func); snprintf() It's guarded by the %02x modifiers; so this doesn't depend on user input. strcpy or sprintf should never be used. It doesn't matter if it's safe in a particular instance. There are safer functions to use (like snprintf). All it takes is for someone to come along and change the /sys/bus path to be larger without adjusting the buffer size and everything goes to hell. It's inherently brittle. + fprintf(stderr, Registered host PCI device %02x:%02x.%1x + (\%s\) as guest device %02x:%02x.%1x\n, + r_bus, r_dev, r_func, e_dev_name, + pci_bus_num(e_bus), e_device, r_func); Please don't fprintf() unconditionally. OK; however, a vmdk file open does that so I though it was alright to do it. I obviously don't use vmdk or else I would have removed that by now :-) A lot more checks are needed here to see if things can succeed. We definitely should bail out if they can't. Bailing out is done in the out: label below. What else do you think can fail? I've taken care of all the cases that do fail IMO. + return pci_dev; +out: + pci_unregister_device(pci_dev-dev); + return NULL; +} +/* + * Syntax to assign device: + * + * -pcidevice dev=bus:dev.func,dma=dma + * + * Example: + * -pcidevice host=00:13.0,dma=pvdma + * + * dma can currently only be 'none' to disable iommu support. Does it actually work if you disable iommu support? If the guest is 1:1 mapped. You mean with Andrea's reserved ram patches? +#include sys/mman.h Don't think this is needed here. We use mmap(), so this is needed. Ah. +/* Initialize assigned devices */ +if (pci_enabled) { +int r = -1; +do { +init_assigned_device(pci_bus, r); Why pass r by reference instead of just returning it? At any rate, you should detect when this fails and gracefully terminate QEMU. 'r' is the count of the number of assigned devices -- mostly needed because we have the data stored in an array. 
If we migrate to a list, this can be relaxed. ATM, I start the guest without assigning the device. I haven't figured out a way to gracefully terminate qemu yet. In the case of hot plug, you fail the hot plug. If you start with device assignment, just doing an exit would be sufficient. +#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__) + case QEMU_OPTION_pcidevice: + add_assigned_device(optarg); You should copy into an array, then in pc.c, iterate through the array and call into add_assigned_device. Is there any benefit in doing this? We're moving the iterate out of vl.c to pc.c and both will happen
Re: [PATCH 6/7] KVM/userspace: Build vtd.c for Intel IOMMU support
Amit Shah wrote: * On Tuesday 23 Sep 2008 22:01:10 Anthony Liguori wrote: Amit Shah wrote: Signed-off-by: Amit Shah [EMAIL PROTECTED] --- kernel/x86/Kbuild |3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/kernel/x86/Kbuild b/kernel/x86/Kbuild index 8dc0483..a4cd00c 100644 --- a/kernel/x86/Kbuild +++ b/kernel/x86/Kbuild @@ -5,6 +5,9 @@ kvm-objs := kvm_main.o x86.o mmu.o x86_emulate.o ../anon_inodes.o irq.o i8259.o ifeq ($(CONFIG_KVM_TRACE),y) kvm-objs += kvm_trace.o endif +ifeq ($(CONFIG_DMAR),y) +kvm-objs += vtd.o +endif kvm-intel-objs := vmx.o vmx-debug.o ../external-module-compat.o kvm-amd-objs := svm.o ../external-module-compat.o Where's the file come from? Already in the kernel tree -- arch/x86/kvm/vtd.c So this is independent of the rest of the series? Any reason not to commit this Avi? Regards, Anthony Liguori -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining passthrough/VT-d tasks list
Avi Kivity wrote: Han, Weidong wrote: - Add dummy driver to hide/unbind passthrough device from host kernel Maybe this can be implemented at the modprobe/hotplug level. Wouldn't you just blacklist the devices in the host and call it a day? Regards, Anthony Liguori -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch] do not allocate code_gen buffer on ia64
Avi Kivity wrote: Jes Sorensen wrote: Avi Kivity wrote: Jes Sorensen wrote: Hi, This one limits the code_gen_buffer_size on ia64, phys_mem_size/4 really gets out of hand when you boot say a 64GB guest. ia64 doesn't codegen; why not set it to zero? How about this one then? Applied, thanks. Note qemu uses 4 spaces for indentation. Talk to your editor. It would seem better to replace #ifdef TARGET_IA64 with if (kvm_enabled()). If QEMU ever got proper ia64 target support, that's going to be an ugly bug to find. Regards, Anthony Liguori -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: Remaining passthrough/VT-d tasks list
Avi Kivity wrote: Han, Weidong wrote: Hi all, The initial passthrough/VT-d patches have been in kvm, it's time to enhance it, and push them into 2.6.28. - Shared Interrupt support Shared guest interrupts is a prerequisite for merging into mainline. Without this, device assignment is useless in anything but a benchmark scenario. I won't push device assignment for 2.6.28 without it. Shared host interrupts are a different matter; which one did you mean? Avi: How about we think about it another way? The top usage model of IOMMU is SR-IOV in my mind, at least for the enterprise usage model. We are pushing the SR-IOV patch for 2.6.28, and are continuously polishing the patch. Even if it missed the 2.6.28 merge window (unlikely?), we would be able to ask OSVs to take the SR-IOV patch separately before code freeze since it is very small, but it is hard to ask for taking whole IOMMU patches. In Xen side, IOMMU is there, MSI-x is there, so SR-IOV patch is the only one missing to enable SR-IOV. In KVM side, very likely we can get MSI patch done soon before the Chinese holiday, and we of course will spend tons of effort on quality too. Should we target this? If yes, we put MSI patch and push 2.6.28 as 1st priority. We would be able to see next major release of VMM using KVM have HW IO virtualization technology: Close to native performance, no sacrifice of IO sharing, minimal CPU utilization etc. For legacy PCI pass thru support, we can continue to improve it too. Thanks, eddie -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch] do not allocate code_gen buffer on ia64
Anthony Liguori wrote: It would seem better to replace #ifdef TARGET_IA64 with if (kvm_enabled()). Right. Committed. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/5] bios: 4G updates
On Wed, 2008-09-24 at 14:07 +0300, Avi Kivity wrote: The patches all look good, however renaming and reformatting will lead to merge headaches later on. We haven't been good at working with bochs bios upstream. Can you peek in bochs upstream and see if it's worth merging? If not, I'll just merge these patches. I'll take a look. It seemed like they added support for putting the ACPI processor objects in an SSDT last I checked, but the AML for their processors is fairly trivial. I'll see if there's anything else worthwhile. Thanks, Alex -- Alex Williamson HP Open Source Linux Org. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM for Sparc?
On 9/23/08, David Miller [EMAIL PROTECTED] wrote: From: Blue Swirl [EMAIL PROTECTED] Date: Tue, 23 Sep 2008 18:28:06 +0300 On 9/22/08, David Miller [EMAIL PROTECTED] wrote: As he mentioned, the V8 rett instruction causes problems on V9 chips. An opcode which was a V8 privileged instruction, rett, got reused as a non-privileged instruction in V9, for return. There are others: rdtbr/flushw and stdfq/stqf. Also any ASI 0x80 accesses are unprivileged on V9, though that shouldn't be a problem since all ASIs used on V8 were 0x80. And of course MMUs are incompatible. Thanks for the list. I sent a message to someone who I think might have been responsible for these architectural design decisions, letting them know what problems it is causing :-) So booting a 32-bit kernel on a 64-bit cpu is going to be challenging, at best. Maybe it would be possible to run V8 userspace with full speed acceleration on V9 and use translation only for kernel code? Yes, that should work. BTW, there is another area related the ASIs. Trap numbers. Even through V9, traps only up to 0x7f are valid. But sun4v extended V9 to allow trap numbers >= 0x80, mostly these are used for hypervisor calls. The trap number field of the instruction is just extended one more bit higher to accommodate this. I see, also Qemu needs to use one more bit then. Does this mean that even V8 code written specially may use these traps to call hypervisor? Then we would need to catch these, maybe with the some assistance from the hypervisor. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM for Sparc?
On 9/23/08, David Miller [EMAIL PROTECTED] wrote: From: Blue Swirl [EMAIL PROTECTED] Date: Tue, 23 Sep 2008 18:34:12 +0300 On 9/23/08, David Miller [EMAIL PROTECTED] wrote: Sun4v systems come with Sun's hypervisor. Linux simply runs on top of that, whether as a host or a guest. The hypervisor source is opensource and we could technically make changes to it, but it isn't very practical. Do you mean OpenxVM? I think Sun has not published the hypervisor part yet. No, I mean OpenSPARC. The full hypervisor source code for sun4v is in the tarball. http://www.opensparc.net/ The code is there for both Niagara-T1 and Niagara-T2, for example for Niagara-T1 click on Get The Source -- OpenSPARC T1 -- Downloads Then on that page you want OpenSPARC T1 Download for Architecture and Performance Modeling Tools. It includes the full hypervisor and even the openboot PROM source code. I'm not sure, but I think that hypervisor is not a real hypervisor like the PROM version but one specially designed for Legion emulator. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/4] Allow enabling kvm_trace on external module
On Wed, Sep 24, 2008 at 04:48:35PM +0300, Avi Kivity wrote: Eduardo Habkost wrote: This series adds compat code to allow enabling kvm_trace when building KVM as an external module on older kernels. The most hackish part is the last patch, that adds --with-kvm-trace to configure and adds an include to a generated file on kernel/x86/Kbuild. It doesn't look pretty, so suggestions on how to make this better are welcome. Applied all, thanks. We could improve Kbuild by having kernel/Makefile include config.mak and pass some variable to Kbuild somehow, but diving into Kbuild isn't my idea of a week well spent. Oops. I've just noticed I broke './configure --with-patched-kernel'. Fix below. --- From: Eduardo Habkost [EMAIL PROTECTED] Date: Wed, 24 Sep 2008 14:11:42 -0300 Subject: Always generate config.kbuild When implementing --with-kvm-trace, I supposed make would never enter the 'kernel' directory when compiling with --with-patched-kernel. I was wrong and broke --with-patched-kernel. Change configure to always generate config.kbuild on the kernel directory. Otherwise make will explode on 'make header-sync', that runs even when --with-patched-kernel was used. Signed-off-by: Eduardo Habkost [EMAIL PROTECTED] --- configure |2 -- 1 files changed, 0 insertions(+), 2 deletions(-) diff --git a/configure b/configure index 78c2f9c..3b27364 100755 --- a/configure +++ b/configure @@ -137,8 +137,6 @@ LD=$cross_prefix$ld OBJCOPY=$cross_prefix$objcopy EOF -if [ -n $want_module ];then cat EOF kernel/config.kbuild CONFIG_KVM_TRACE=$kvm_trace EOF -fi -- 1.5.5.GIT -- Eduardo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM for Sparc?
On 9/24/08, Blue Swirl [EMAIL PROTECTED] wrote: On 9/23/08, David Miller [EMAIL PROTECTED] wrote: From: Blue Swirl [EMAIL PROTECTED] Date: Tue, 23 Sep 2008 18:28:06 +0300 On 9/22/08, David Miller [EMAIL PROTECTED] wrote: As he mentioned, the V8 rett instruction causes problems on V9 chips. An opcode which was a V8 privileged instruction, rett, got reused as a non-privileged instruction in V9, for return. There are others: rdtbr/flushw and stdfq/stqf. Also any ASI 0x80 accesses are unprivileged on V9, though that shouldn't be a problem since all ASIs used on V8 were 0x80. And of course MMUs are incompatible. Thanks for the list. I sent a message to someone who I think might have been responsible for these architectual design decisions, letting them know what problems it is causing :-) So booting a 32-bit kernel on a 64-bit cpu is going to be challenging, at best. Maybe it would be possible to run V8 userspace with full speed acceleration on V9 and use translation only for kernel code? Yes, that should work. BTW, there is another area related the ASIs. Trap numbers. Even through V9, traps only up to 0x7f are valid. But sun4v extended V9 to allow trap numbers = 0x80, mostly these are used for hypervisor calls. The trap number field of the instruction is just extended one more bit higher to accomodate this. I see, also Qemu needs to use one more bit then. Does this mean that even V8 code written specially may use these traps to call hypervisor? Then we would need to catch these, maybe with the some assistance from the hypervisor. Now I found the relevant part in the manuals. The extra sun4v bit is not taken into account from user mode, so we can't catch privileged to hyperprivileged mode traps easily. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM for Sparc?
From: Blue Swirl [EMAIL PROTECTED] Date: Wed, 24 Sep 2008 20:20:33 +0300 I see, also Qemu needs to use one more bit then. Does this mean that even V8 code written specially may use these traps to call hypervisor? No, V8 code should not set the extra bit. Only V9 code on a processor which is hypervisor capable should do this. And the hypervisor calls can only be invoked from privileged mode. This is all described in the UltraSPARC Architecture 2005 Specification (Hyperprivileged Edition) at: http://www.opensparc.net/opensparc-t1/index.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM for Sparc?
From: Blue Swirl [EMAIL PROTECTED] Date: Wed, 24 Sep 2008 20:22:45 +0300 I'm not sure, but I think that hypervisor is not a real hypervisor like the PROM version but one specially designed for Legion emulator. No, it's the real deal. All the real hardware device support is there. And Legion simulates a real, full Niagara system. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM for Sparc?
From: Blue Swirl [EMAIL PROTECTED] Date: Wed, 24 Sep 2008 21:06:21 +0300 Now I found the relevant part in the manuals. The extra sun4v bit is not taken into account from user mode, so we can't catch privileged to hyperprivileged mode traps easily. That's right, the top bit is ignored in user mode. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM: PIC: enhance IPI avoidance
On Wed, Sep 24, 2008 at 05:49:37PM +0300, Avi Kivity wrote: Marcelo Tosatti wrote: and by register load from userspace, no? Isn't that the responsibility of the guest? I'm talking about a restore to previous state scenario. In this case we want to disable any IPI avoidance in case it avoids a needed IPI. Unacked IOAPIC interrupts are not cleared on register load, are they? Good question. I don't know if they should or shouldn't. But that's a different question. isr_ack is not guest visible, so nothing is lost from clearing it, but we can fail if we don't clear it. True. Any other potential problem you could think of? KVM: PIC: enhance IPI avoidance The PIC code makes little effort to avoid kvm_vcpu_kick(), resulting in unnecessary guest exits in some conditions. For example, if the timer interrupt is routed through the IOAPIC, IRR for IRQ 0 will get set but not cleared, since the APIC is handling the acks. This means that every time an interrupt < 16 is triggered, the priority logic will find IRQ0 pending and send an IPI to vcpu0 (in case IRQ0 is not masked, which is Linux's case). Introduce a new variable isr_ack to represent the IRQs for which the guest has been signalled / cleared the ISR. Use it to avoid more than one IPI per trigger-ack cycle, in addition to the avoidance when ISR is set in get_priority(). 
Index: kvm/arch/x86/kvm/i8259.c === --- kvm.orig/arch/x86/kvm/i8259.c +++ kvm/arch/x86/kvm/i8259.c @@ -33,6 +33,14 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s-isr = ~(1 irq); + s-isr_ack |= (1 irq); +} + +void kvm_pic_clear_isr_ack(struct kvm *kvm) +{ + struct kvm_pic *s = pic_irqchip(kvm); + s-pics[0].isr_ack = 0xff; + s-pics[1].isr_ack = 0xff; } /* @@ -213,6 +221,7 @@ void kvm_pic_reset(struct kvm_kpic_state s-irr = 0; s-imr = 0; s-isr = 0; + s-isr_ack = 0xff; s-priority_add = 0; s-irq_base = 0; s-read_reg_select = 0; @@ -444,10 +453,14 @@ static void pic_irq_request(void *opaque { struct kvm *kvm = opaque; struct kvm_vcpu *vcpu = kvm-vcpus[0]; + struct kvm_pic *s = pic_irqchip(kvm); + int irq = pic_get_irq(s-pics[0]); - pic_irqchip(kvm)-output = level; - if (vcpu) + s-output = level; + if (vcpu level (s-pics[0].isr_ack (1 irq))) { + s-pics[0].isr_ack = ~(1 irq); kvm_vcpu_kick(vcpu); + } } struct kvm_pic *kvm_create_pic(struct kvm *kvm) Index: kvm/arch/x86/kvm/irq.h === --- kvm.orig/arch/x86/kvm/irq.h +++ kvm/arch/x86/kvm/irq.h @@ -42,6 +42,7 @@ struct kvm_kpic_state { u8 irr; /* interrupt request register */ u8 imr; /* interrupt mask register */ u8 isr; /* interrupt service register */ + u8 isr_ack; /* interrupt ack detection */ u8 priority_add;/* highest irq priority */ u8 irq_base; u8 read_reg_select; @@ -70,6 +71,7 @@ struct kvm_pic *kvm_create_pic(struct kv void kvm_pic_set_irq(void *opaque, int irq, int level); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); +void kvm_pic_clear_isr_ack(struct kvm *kvm); static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) { Index: kvm/arch/x86/kvm/x86.c === --- kvm.orig/arch/x86/kvm/x86.c +++ kvm/arch/x86/kvm/x86.c @@ -3963,6 +3963,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct pr_debug(Set back pending irq %d\n, pending_vec); } + kvm_pic_clear_isr_ack(vcpu-kvm); } kvm_set_segment(vcpu, sregs-cs, VCPU_SREG_CS); -- To unsubscribe from this list: send the line 
unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
VPN connection from Windows 2000 guest to remote server.
Good morning, Is it possible to establish a PPTP connection from a guest running Windows 2000 Professional SP4 to a remote machine running Windows Server 2003 using user-mode networking? It appears to me that the problem is that GRE protocol packets are not being successfully transmitted. I can create a PPTP connection to the VPN server from the host machine and, using iptraf on the external interface, I see GRE packets being transmitted. However, when I initiate the connection from the guest machine, using iptraf, I see the initial connection using TCP to port 1723 on the server but no GRE protocol packets are detected. Thanks for your help. Until next time, Colin. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests
On Tuesday 23 September 2008 22:54:53 Amit Shah wrote: From: Or Sagi [EMAIL PROTECTED] From: Nir Peleg [EMAIL PROTECTED] From: Amit Shah [EMAIL PROTECTED] From: Ben-Ami Yassour [EMAIL PROTECTED] From: Weidong Han [EMAIL PROTECTED] From: Glauber de Oliveira Costa [EMAIL PROTECTED] With this patch, we can assign a device on the host machine to a guest. A new command-line option, -pcidevice is added. For example, to invoke it for a device sitting at PCI bus:dev.fn 04:08.0, use this: -pcidevice host=04:08.0 * The host driver for the device, if any, is to be removed before assigning the device (else device assignment will fail). * A device that shares IRQ with another host device cannot currently be assigned. This works only with the in-kernel irqchip method; to use the userspace irqchip, a kernel module (irqhook) and some extra changes are needed. Hi Amit I am afraid I got this when I try to enable VT-d. create_userspace_phys_mem: Invalid argument assigned_dev_iomem_map: Error: create new mapping failed Can you have a look at it? (and the patch you sent to Weidong doesn't have this problem.) Thanks. 
-- regards Yang, Sheng Signed-off-by: Amit Shah [EMAIL PROTECTED] --- qemu/Makefile.target|1 + qemu/hw/device-assignment.c | 665 +++ qemu/hw/device-assignment.h | 93 ++ qemu/hw/pc.c|9 + qemu/hw/pci.c |7 + qemu/vl.c | 18 ++ 6 files changed, 793 insertions(+), 0 deletions(-) create mode 100644 qemu/hw/device-assignment.c create mode 100644 qemu/hw/device-assignment.h diff --git a/qemu/Makefile.target b/qemu/Makefile.target index 72f3db8..40eb273 100644 --- a/qemu/Makefile.target +++ b/qemu/Makefile.target @@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o +OBJS+= device-assignment.o ifeq ($(USE_KVM_PIT), 1) OBJS+= i8254-kvm.o endif diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c new file mode 100644 index 000..e70daf2 --- /dev/null +++ b/qemu/hw/device-assignment.c @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2007, Neocleus Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * + * Assign a PCI device from the host to a guest VM. + * + * Adapted for KVM by Qumranet. 
+ * + * Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED]) + * Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED]) + * Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED]) + * Copyright (C) 2008, Red Hat, Amit Shah ([EMAIL PROTECTED]) + */ +#include stdio.h +#include sys/io.h +#include qemu-kvm.h +#include linux/kvm_para.h +#include device-assignment.h + +/* From linux/ioport.h */ +#define IORESOURCE_IO 0x0100 /* Resource type */ +#define IORESOURCE_MEM 0x0200 +#define IORESOURCE_IRQ 0x0400 +#define IORESOURCE_DMA 0x0800 +#define IORESOURCE_PREFETCH0x1000 /* No side effects */ + +/* #define DEVICE_ASSIGNMENT_DEBUG */ + +#ifdef DEVICE_ASSIGNMENT_DEBUG +#define DEBUG(fmt, args...) fprintf(stderr, %s: fmt, __func__ , ## args) +#else +#define DEBUG(fmt, args...) +#endif + +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr, + uint32_t value) +{ + AssignedDevRegion *r_access = (AssignedDevRegion *)opaque; + uint32_t r_pio = (unsigned long)r_access-r_virtbase + + (addr - r_access-e_physbase); + + if (r_access-debug DEVICE_ASSIGNMENT_DEBUG_PIO) { + fprintf(stderr, %s: r_pio=%08x e_physbase=%08x +r_virtbase=%08lx value=%08x\n, + __func__, r_pio, (int)r_access-e_physbase, + (unsigned long)r_access-r_virtbase, value); + } + iopl(3); + outb(value, r_pio); +} + +static void assigned_dev_ioport_writew(void *opaque, uint32_t addr, + uint32_t value) +{ + AssignedDevRegion *r_access = (AssignedDevRegion *)opaque; + uint32_t r_pio =
Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests
On Thursday 25 September 2008 12:54:46 Yang, Sheng wrote: On Tuesday 23 September 2008 22:54:53 Amit Shah wrote: From: Or Sagi [EMAIL PROTECTED] From: Nir Peleg [EMAIL PROTECTED] From: Amit Shah [EMAIL PROTECTED] From: Ben-Ami Yassour [EMAIL PROTECTED] From: Weidong Han [EMAIL PROTECTED] From: Glauber de Oliveira Costa [EMAIL PROTECTED] With this patch, we can assign a device on the host machine to a guest. A new command-line option, -pcidevice is added. For example, to invoke it for a device sitting at PCI bus:dev.fn 04:08.0, use this: -pcidevice host=04:08.0 * The host driver for the device, if any, is to be removed before assigning the device (else device assignment will fail). * A device that shares IRQ with another host device cannot currently be assigned. This works only with the in-kernel irqchip method; to use the userspace irqchip, a kernel module (irqhook) and some extra changes are needed. Hi Amit I am afraid I got this when I try to enable VT-d. create_userspace_phys_mem: Invalid argument assigned_dev_iomem_map: Error: create new mapping failed Can you have a look at it? (and the patch you sent to Weidong doesn't have this problem.) Oh, Weidong's patch [PATCH] VT-d: Fix iommu map page for mmio pages fixes it. -- regards Yang, Sheng Thanks. 
-- regards Yang, Sheng Signed-off-by: Amit Shah [EMAIL PROTECTED] --- qemu/Makefile.target|1 + qemu/hw/device-assignment.c | 665 +++ qemu/hw/device-assignment.h | 93 ++ qemu/hw/pc.c|9 + qemu/hw/pci.c |7 + qemu/vl.c | 18 ++ 6 files changed, 793 insertions(+), 0 deletions(-) create mode 100644 qemu/hw/device-assignment.c create mode 100644 qemu/hw/device-assignment.h diff --git a/qemu/Makefile.target b/qemu/Makefile.target index 72f3db8..40eb273 100644 --- a/qemu/Makefile.target +++ b/qemu/Makefile.target @@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o +OBJS+= device-assignment.o ifeq ($(USE_KVM_PIT), 1) OBJS+= i8254-kvm.o endif diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c new file mode 100644 index 000..e70daf2 --- /dev/null +++ b/qemu/hw/device-assignment.c @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2007, Neocleus Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * + * Assign a PCI device from the host to a guest VM. + * + * Adapted for KVM by Qumranet. 
+ * + * Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED]) + * Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED]) + * Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED]) + * Copyright (C) 2008, Red Hat, Amit Shah ([EMAIL PROTECTED]) + */ +#include stdio.h +#include sys/io.h +#include qemu-kvm.h +#include linux/kvm_para.h +#include device-assignment.h + +/* From linux/ioport.h */ +#define IORESOURCE_IO 0x0100 /* Resource type */ +#define IORESOURCE_MEM 0x0200 +#define IORESOURCE_IRQ 0x0400 +#define IORESOURCE_DMA 0x0800 +#define IORESOURCE_PREFETCH0x1000 /* No side effects */ + +/* #define DEVICE_ASSIGNMENT_DEBUG */ + +#ifdef DEVICE_ASSIGNMENT_DEBUG +#define DEBUG(fmt, args...) fprintf(stderr, %s: fmt, __func__ , ## args) +#else +#define DEBUG(fmt, args...) +#endif + +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr, + uint32_t value) +{ + AssignedDevRegion *r_access = (AssignedDevRegion *)opaque; + uint32_t r_pio = (unsigned long)r_access-r_virtbase + + (addr - r_access-e_physbase); + + if (r_access-debug DEVICE_ASSIGNMENT_DEBUG_PIO) { + fprintf(stderr, %s: r_pio=%08x e_physbase=%08x +r_virtbase=%08lx value=%08x\n, + __func__, r_pio, (int)r_access-e_physbase, + (unsigned long)r_access-r_virtbase, value); +