[COMMIT master] libkvm: make kvm_create_pit static
From: Michael S. Tsirkin m...@redhat.com libkvm-x86.c:55: warning: no previous prototype for ‘kvm_create_pit’ Signed-off-by: Michael S. Tsirkin m...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com diff --git a/kvm/libkvm/libkvm-x86.c b/kvm/libkvm/libkvm-x86.c index 2fc4fce..df8cc81 100644 --- a/kvm/libkvm/libkvm-x86.c +++ b/kvm/libkvm/libkvm-x86.c @@ -52,7 +52,7 @@ static int kvm_init_tss(kvm_context_t kvm) return 0; } -int kvm_create_pit(kvm_context_t kvm) +static int kvm_create_pit(kvm_context_t kvm) { #ifdef KVM_CAP_PIT int r; -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[COMMIT master] Fix loading extboot option rom
From: Avi Kivity a...@redhat.com The buffer that is used to store the extboot filename is later overwritten by the vga rom loading code. Use strdup() to keep our filename. Signed-off-by: Avi Kivity a...@redhat.com diff --git a/hw/pc.c b/hw/pc.c index db34f53..4b17b9c 100644 --- a/hw/pc.c +++ b/hw/pc.c @@ -963,7 +963,7 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size, if (extboot_drive != -1) { snprintf(buf, sizeof(buf), %s/%s, bios_dir, EXTBOOT_FILENAME); -option_rom[nb_option_roms++] = buf; +option_rom[nb_option_roms++] = strdup(buf); } option_rom_offset = qemu_ram_alloc(0x2); -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[COMMIT master] Present kvm with correct apic phys id.
From: Glauber Costa glom...@redhat.com KVM will shift the value at addr 0x20 (APIC_ID) by 24 bits before actually using it. We currently load phys_id as s->id. After being shifted by 24 bits, it will result in a meaningless value. We should really be doing s->id << 24, which, after being shifted, will lead to the correct value. This is for the load function; save has the inverse problem. Signed-off-by: Glauber Costa glom...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com diff --git a/hw/apic.c b/hw/apic.c index 8c059f6..466fb7e 100644 --- a/hw/apic.c +++ b/hw/apic.c @@ -835,7 +835,7 @@ static void kvm_kernel_lapic_save_to_user(APICState *s) kvm_get_lapic(kvm_context, s->cpu_env->cpu_index, kapic); -s->id = kapic_reg(kapic, 0x2); +s->id = kapic_reg(kapic, 0x2) >> 24; s->tpr = kapic_reg(kapic, 0x8); s->arb_id = kapic_reg(kapic, 0x9); s->log_dest = kapic_reg(kapic, 0xd) >> 24; @@ -868,7 +868,7 @@ static void kvm_kernel_lapic_load_from_user(APICState *s) int i; memset(klapic, 0, sizeof apic); -kapic_set_reg(klapic, 0x2, s->id); +kapic_set_reg(klapic, 0x2, s->id << 24); kapic_set_reg(klapic, 0x8, s->tpr); kapic_set_reg(klapic, 0xd, s->log_dest << 24); kapic_set_reg(klapic, 0xe, s->dest_mode << 28 | 0x0fff); -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[COMMIT master] Remove the dependency for phys_ram_base for ipf.c
From: Jes Sorensen j...@sgi.com Fix ia64 code to use copy_physical_memory_{read,write} in hob and nvram code, removing dependencies of qemu_get_ram_ptr() usage. This results in cleaned up APIs and removal of unnecessary global variables. Signed-off-by: Jes Sorensen j...@sgi.com Signed-off-by: Avi Kivity a...@redhat.com diff --git a/hw/ipf.c b/hw/ipf.c index 248b01d..d051666 100644 --- a/hw/ipf.c +++ b/hw/ipf.c @@ -54,8 +54,6 @@ static fdctrl_t *floppy_controller; static RTCState *rtc_state; static PCIDevice *i440fx_state; -void *gfw_start; - static uint32_t ipf_to_legacy_io(target_phys_addr_t addr) { return (uint32_t)(((addr0x3ff) 12 2)|((addr) 0x3)); @@ -455,15 +453,12 @@ static void ipf_init1(ram_addr_t ram_size, int vga_ram_size, if (kvm_enabled()) { unsigned long image_size; uint8_t *image = NULL; -target_phys_addr_t fw_image_start; -unsigned long nvram_addr = 0; +unsigned long nvram_addr; unsigned long nvram_fd = 0; unsigned long type = READ_FROM_NVRAM; unsigned long i = 0; - -ram_addr = qemu_ram_alloc(GFW_SIZE); -gfw_start = qemu_get_ram_ptr(ram_addr); -cpu_register_physical_memory(GFW_START, GFW_SIZE, ram_addr); +unsigned long fw_offset; +ram_addr_t fw_mem = qemu_ram_alloc(GFW_SIZE); snprintf(buf, sizeof(buf), %s/%s, bios_dir, FW_FILENAME); image = read_image(buf, image_size ); @@ -472,26 +467,27 @@ static void ipf_init1(ram_addr_t ram_size, int vga_ram_size, fprintf(stderr, Please check Guest firmware at %s\n, buf); exit(1); } +fw_offset = GFW_START + GFW_SIZE - image_size; -/* Load Guest Firmware to the proper postion. 
*/ -fw_image_start = GFW_START + GFW_SIZE - image_size; -cpu_physical_memory_write(fw_image_start, image, image_size); -free(image); +cpu_register_physical_memory(GFW_START, GFW_SIZE, fw_mem); +cpu_physical_memory_write(fw_offset, image, image_size); +free(image); if (nvram) { nvram_addr = NVRAM_START; nvram_fd = kvm_ia64_nvram_init(type); if (nvram_fd != -1) { -kvm_ia64_copy_from_nvram_to_GFW(nvram_fd, gfw_start); +kvm_ia64_copy_from_nvram_to_GFW(nvram_fd); close(nvram_fd); } i = atexit((void *)kvm_ia64_copy_from_GFW_to_nvram); if (i != 0) fprintf(stderr, cannot set exit function\n); -} -kvm_ia64_build_hob(ram_size + above_4g_mem_size, smp_cpus, - gfw_start, nvram_addr); +} else +nvram_addr = 0; + +kvm_ia64_build_hob(ram_size + above_4g_mem_size, smp_cpus, nvram_addr); } /*Register legacy io address space, size:64M*/ @@ -512,17 +508,15 @@ static void ipf_init1(ram_addr_t ram_size, int vga_ram_size, } if (cirrus_vga_enabled) { -if (pci_enabled) { +if (pci_enabled) pci_cirrus_vga_init(pci_bus, vga_ram_size); -} else { +else isa_cirrus_vga_init(vga_ram_size); -} } else { -if (pci_enabled) { +if (pci_enabled) pci_vga_init(pci_bus, vga_ram_size, 0, 0); -} else { +else isa_vga_init(vga_ram_size); -} } rtc_state = rtc_init(0x70, i8259[8], 2000); diff --git a/target-ia64/firmware.c b/target-ia64/firmware.c index ba16bd8..79f8464 100644 --- a/target-ia64/firmware.c +++ b/target-ia64/firmware.c @@ -91,12 +91,11 @@ static int add_nvram_hob(void *hob_buf, unsigned long nvram_addr); static int build_hob(void *hob_buf, unsigned long hob_buf_size, unsigned long dom_mem_size, unsigned long vcpus, unsigned long nvram_addr); -static int load_hob(void *hob_buf, -unsigned long dom_mem_size, void *hob_start); +static int load_hob(void *hob_buf, unsigned long dom_mem_size); int kvm_ia64_build_hob(unsigned long memsize, unsigned long vcpus, - void* fw_start, unsigned long nvram_addr) + unsigned long nvram_addr) { char *hob_buf; @@ -111,7 +110,8 @@ kvm_ia64_build_hob(unsigned long 
memsize, unsigned long vcpus, Hob_Output(Could not build hob); return -1; } -if (load_hob(hob_buf, memsize, fw_start + HOB_OFFSET) 0) { + +if (load_hob(hob_buf, memsize) 0) { free(hob_buf); Hob_Output(Could not load hob); return -1; @@ -249,7 +249,7 @@ err_out: return -1; } static int -load_hob(void *hob_buf, unsigned long dom_mem_size, void *hob_start) +load_hob(void *hob_buf, unsigned long dom_mem_size) { int hob_size; @@ -263,7 +263,9 @@ load_hob(void *hob_buf, unsigned long dom_mem_size, void *hob_start) Hob_Output(No enough memory for hob data); return
[COMMIT master] Build extboot
From: Avi Kivity a...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com diff --git a/Makefile b/Makefile index 157b616..6ce206a 100644 --- a/Makefile +++ b/Makefile @@ -411,3 +411,14 @@ tarbin: # Include automatically generated dependency files -include $(wildcard *.d audio/*.d slirp/*.d) + +.PHONY: kvm/extboot + +all: kvm/extboot + +kvm/extboot: + $(MAKE) -C $@ + if ! [ -f pc-bios/extboot.bin ] \ + || ! cmp -s pc-bios/extboot.bin $@/extboot.bin; then \ + cp $@/extboot.bin pc-bios/extboot.bin; \ + fi -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[COMMIT master] Fix extboot merge
From: Avi Kivity a...@redhat.com Last qemu merge broke extboot completely. Instead of reading the command, extboot corrupted the stack. Instead of writing back the geometry, extboot wrote nothing. Fix by reading the command correctly and writing back the results. Signed-off-by: Avi Kivity a...@redhat.com diff --git a/hw/extboot.c b/hw/extboot.c index f66b6c5..b91d54f 100644 --- a/hw/extboot.c +++ b/hw/extboot.c @@ -81,8 +81,8 @@ static void extboot_write_cmd(void *opaque, uint32_t addr, uint32_t value) int blen = 0; void *buf = NULL; -cpu_physical_memory_read((value 0x) 4, (uint8_t *)buf, - sizeof(buf)); +cpu_physical_memory_read((value 0x) 4, (uint8_t *)cmd, + sizeof(cmd)); if (cmd.type == 0x01 || cmd.type == 0x02) { pa = cmd.xfer.segment * 16 + cmd.xfer.offset; @@ -98,7 +98,6 @@ static void extboot_write_cmd(void *opaque, uint32_t addr, uint32_t value) cmd.query_geometry.heads = heads; cmd.query_geometry.sectors = sectors; cmd.query_geometry.nb_sectors = nb_sectors; - cpu_physical_memory_set_dirty((value 0x) 4); break; case 0x01: err = bdrv_read(bs, cmd.xfer.sector, buf, cmd.xfer.nb_sectors); @@ -118,6 +117,8 @@ static void extboot_write_cmd(void *opaque, uint32_t addr, uint32_t value) break; } +cpu_physical_memory_write((value 0x) 4, (uint8_t *)cmd, + sizeof(cmd)); if (buf) qemu_free(buf); } -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[COMMIT master] KVM: SVM: Fix cross vendor migration issue in segment descriptor
From: Andre Przywara andre.przyw...@amd.com On AMD CPUs sometimes the DB bit in the stack segment descriptor is left as 1, although the whole segment has been made unusable. Clear it here to pass an Intel VMX entry check when cross vendor migrating. Signed-off-by: Andre Przywara andre.przyw...@amd.com Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1647e81..61453e6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -804,6 +804,15 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, if (!var-unusable) var-type |= 0x1; break; + case VCPU_SREG_SS: + /* On AMD CPUs sometimes the DB bit in the segment +* descriptor is left as 1, although the whole segment has +* been made unusable. Clear it here to pass an Intel VMX +* entry check when cross vendor migrating. +*/ + if (var-unusable) + var-db = 0; + break; } } -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[COMMIT master] KVM: Drop request_nmi from stats
From: Jan Kiszka jan.kis...@siemens.com The stats entry request_nmi is no longer used as the related user space interface was dropped. So clean it up. Signed-off-by: Jan Kiszka jan.kis...@siemens.com Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8e680c3..5322ee6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -442,7 +442,6 @@ struct kvm_vcpu_stat { u32 halt_exits; u32 halt_wakeup; u32 request_irq_exits; - u32 request_nmi_exits; u32 irq_exits; u32 host_state_reload; u32 efer_reload; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2d7082c..308d8e9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -91,7 +91,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { halt_wakeup, VCPU_STAT(halt_wakeup) }, { hypercalls, VCPU_STAT(hypercalls) }, { request_irq, VCPU_STAT(request_irq_exits) }, - { request_nmi, VCPU_STAT(request_nmi_exits) }, { irq_exits, VCPU_STAT(irq_exits) }, { host_state_reload, VCPU_STAT(host_state_reload) }, { efer_reload, VCPU_STAT(efer_reload) }, -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/9] Do not allow interrupt injection from userspace if there is a pending event.
Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/kvm/x86.c |5 +++-- 1 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2d7082c..12ab1cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3080,8 +3080,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu, kvm_run-ready_for_interrupt_injection = 1; else kvm_run-ready_for_interrupt_injection = - (kvm_arch_interrupt_allowed(vcpu) -!kvm_cpu_has_interrupt(vcpu)); + kvm_arch_interrupt_allowed(vcpu) + !kvm_cpu_has_interrupt(vcpu) + !kvm_event_needs_reinjection(vcpu); } static void vapic_enter(struct kvm_vcpu *vcpu) -- 1.6.2.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/9] [SVM] skip_emulated_instruction() decodes an instruction if its size is not known
Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/kvm/svm.c | 11 +-- 1 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c1ef2b9..14cdfce 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -207,7 +207,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); if (!svm-next_rip) { - printk(KERN_DEBUG %s: NOP\n, __func__); + if (emulate_instruction(vcpu, vcpu-run, 0, 0, EMULTYPE_SKIP) != + EMULATE_DONE) + printk(KERN_DEBUG %s: NOP\n, __func__); return; } if (svm-next_rip - kvm_rip_read(vcpu) MAX_INST_SIZE) @@ -1836,11 +1838,8 @@ static int task_switch_interception(struct vcpu_svm *svm, if (reason != TASK_SWITCH_GATE || int_type == SVM_EXITINTINFO_TYPE_SOFT || (int_type == SVM_EXITINTINFO_TYPE_EXEPT -(int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { - if (emulate_instruction(svm-vcpu, kvm_run, 0, 0, - EMULTYPE_SKIP) != EMULATE_DONE) - return 0; - } +(int_vec == OF_VECTOR || int_vec == BP_VECTOR))) + skip_emulated_instruction(svm-vcpu); return kvm_task_switch(svm-vcpu, tss_selector, reason); } -- 1.6.2.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/9] Remove irq_pending bitmap
Only one interrupt vector can be injected from userspace irqchip at any given time so no need to store it in a bitmap. Put it into interrupt queue directly. Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/include/asm/kvm_host.h |2 -- arch/x86/kvm/irq.c |4 ++-- arch/x86/kvm/x86.c | 38 +++--- arch/x86/kvm/x86.h | 12 4 files changed, 13 insertions(+), 43 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8e680c3..cc892f5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -266,8 +266,6 @@ struct kvm_mmu { struct kvm_vcpu_arch { u64 host_tsc; - unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ - DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); /* * rip and regs accesses must go through * kvm_{register,rip}_{read,write} functions. diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 11c2757..96dfbb6 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -50,7 +50,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) struct kvm_pic *s; if (!irqchip_in_kernel(v-kvm)) - return v-arch.irq_summary; + return v-arch.interrupt.pending; if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ if (kvm_apic_accept_pic_intr(v)) { @@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) int vector; if (!irqchip_in_kernel(v-kvm)) - return kvm_pop_irq(v); + return v-arch.interrupt.nr; vector = kvm_get_apic_interrupt(v); /* APIC */ if (vector == -1) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 12ab1cc..4596927 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1424,8 +1424,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -ENXIO; vcpu_load(vcpu); - set_bit(irq-irq, vcpu-arch.irq_pending); - set_bit(irq-irq / BITS_PER_LONG, vcpu-arch.irq_summary); + kvm_queue_interrupt(vcpu, irq-irq); vcpu_put(vcpu); @@ -3562,12 +3561,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs-efer = vcpu-arch.shadow_efer; 
sregs-apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu-kvm)) - memset(sregs-interrupt_bitmap, 0, - sizeof sregs-interrupt_bitmap); - else - memcpy(sregs-interrupt_bitmap, vcpu-arch.irq_pending, - sizeof sregs-interrupt_bitmap); + memset(sregs-interrupt_bitmap, 0, sizeof sregs-interrupt_bitmap); if (vcpu-arch.interrupt.pending) set_bit(vcpu-arch.interrupt.nr, @@ -4037,7 +4031,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { int mmu_reset_needed = 0; - int i, pending_vec, max_bits; + int pending_vec, max_bits; struct descriptor_table dt; vcpu_load(vcpu); @@ -4079,24 +4073,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); - if (!irqchip_in_kernel(vcpu-kvm)) { - memcpy(vcpu-arch.irq_pending, sregs-interrupt_bitmap, - sizeof vcpu-arch.irq_pending); - vcpu-arch.irq_summary = 0; - for (i = 0; i ARRAY_SIZE(vcpu-arch.irq_pending); ++i) - if (vcpu-arch.irq_pending[i]) - __set_bit(i, vcpu-arch.irq_summary); - } else { - max_bits = (sizeof sregs-interrupt_bitmap) 3; - pending_vec = find_first_bit( - (const unsigned long *)sregs-interrupt_bitmap, - max_bits); - /* Only pending external irq is handled here */ - if (pending_vec max_bits) { - kvm_queue_interrupt(vcpu, pending_vec); - pr_debug(Set back pending irq %d\n, pending_vec); - } - kvm_pic_clear_isr_ack(vcpu-kvm); + max_bits = (sizeof sregs-interrupt_bitmap) 3; + pending_vec = find_first_bit( + (const unsigned long *)sregs-interrupt_bitmap, max_bits); + if (pending_vec max_bits) { + kvm_queue_interrupt(vcpu, pending_vec); + pr_debug(Set back pending irq %d\n, pending_vec); + if (irqchip_in_kernel(vcpu-kvm)) + kvm_pic_clear_isr_ack(vcpu-kvm); } kvm_set_segment(vcpu, sregs-cs, VCPU_SREG_CS); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 21203d4..c1f1a8c 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -19,18 +19,6 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
[PATCH 5/9] [VMX] Do not re-execute INTn instruction.
Re-inject event instead. This is what Intel suggest. Also use correct instruction length when re-injecting soft fault/interrupt. Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/include/asm/kvm_host.h |5 - arch/x86/kvm/svm.c |6 +++--- arch/x86/kvm/vmx.c | 29 ++--- arch/x86/kvm/x86.c | 13 - arch/x86/kvm/x86.h |9 - 5 files changed, 45 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index cc892f5..fea0429 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -319,6 +319,8 @@ struct kvm_vcpu_arch { struct kvm_pio_request pio; void *pio_data; + u8 event_exit_inst_len; + struct kvm_queued_exception { bool pending; bool has_error_code; @@ -328,6 +330,7 @@ struct kvm_vcpu_arch { struct kvm_queued_interrupt { bool pending; + bool soft; u8 nr; } interrupt; @@ -510,7 +513,7 @@ struct kvm_x86_ops { void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); - void (*set_irq)(struct kvm_vcpu *vcpu, int vec); + void (*set_irq)(struct kvm_vcpu *vcpu, int vec, bool soft); void (*set_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 14cdfce..d5173a2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2284,7 +2284,7 @@ static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr) SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } -static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) +static void svm_set_irq(struct kvm_vcpu *vcpu, int irq, bool soft) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2392,7 +2392,7 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) case SVM_EXITINTINFO_TYPE_EXEPT: /* In case of software exception do not reinject an exception vector, but re-execute and instruction instead */ - if (vector == BP_VECTOR || vector == OF_VECTOR) + if 
(kvm_exception_is_soft(vector)) break; if (exitintinfo SVM_EXITINTINFO_VALID_ERR) { u32 err = svm-vmcb-control.exit_int_info_err; @@ -2402,7 +2402,7 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) kvm_queue_exception(svm-vcpu, vector); break; case SVM_EXITINTINFO_TYPE_INTR: - kvm_queue_interrupt(svm-vcpu, vector); + kvm_queue_interrupt(svm-vcpu, vector, false); break; default: break; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a9b30e6..092a3ee 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -779,8 +779,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, return; } - if (nr == BP_VECTOR || nr == OF_VECTOR) { - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + if (kvm_exception_is_soft(nr)) { + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, +vmx-vcpu.arch.event_exit_inst_len); intr_info |= INTR_TYPE_SOFT_EXCEPTION; } else intr_info |= INTR_TYPE_HARD_EXCEPTION; @@ -2429,9 +2430,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); } -static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) +static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq, bool soft) { struct vcpu_vmx *vmx = to_vmx(vcpu); + uint32_t intr; KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); @@ -2446,8 +2448,14 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) kvm_rip_write(vcpu, vmx-rmode.irq.rip - 1); return; } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); + intr = irq | INTR_INFO_VALID_MASK; + if (soft) { + intr |= INTR_TYPE_SOFT_INTR; + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, +vmx-vcpu.arch.event_exit_inst_len); + } else + intr |= INTR_TYPE_EXT_INTR; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); } static void vmx_inject_nmi(struct kvm_vcpu *vcpu) @@ -3008,6 +3016,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) GUEST_INTR_STATE_NMI);
[PATCH 1/9] Unprotect a page if #PF happens during NMI injection.
It is done for exception and interrupt already. Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/kvm/svm.c |3 +-- arch/x86/kvm/vmx.c |2 +- arch/x86/kvm/x86.h |6 ++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8f411ff..c1ef2b9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1090,8 +1090,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) if (npt_enabled) svm_flush_tlb(svm-vcpu); else { - if (svm-vcpu.arch.interrupt.pending || - svm-vcpu.arch.exception.pending) + if (kvm_event_needs_reinjection(svm-vcpu)) kvm_mmu_unprotect_page_virt(svm-vcpu, fault_address); } return kvm_mmu_page_fault(svm-vcpu, fault_address, error_code); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e8a5649..a9b30e6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2599,7 +2599,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) cr2 = vmcs_readl(EXIT_QUALIFICATION); KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, (u32)((u64)cr2 32), handler); - if (vcpu-arch.interrupt.pending || vcpu-arch.exception.pending) + if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 39350b2..21203d4 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -30,4 +30,10 @@ static inline u8 kvm_pop_irq(struct kvm_vcpu *vcpu) clear_bit(word_index, vcpu-arch.irq_summary); return irq; } + +static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu) +{ + return vcpu-arch.exception.pending || vcpu-arch.interrupt.pending || + vcpu-arch.nmi_injected; +} #endif -- 1.6.2.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 8/9] Replace pending exception by PF if it happens serially.
Replace the previous exception with a new one in the hope that instruction re-execution will regenerate the lost exception. Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/kvm/x86.c | 19 --- 1 files changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4ba00ab..a869b89 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -177,16 +177,21 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, ++vcpu-stat.pf_guest; if (vcpu-arch.exception.pending) { - if (vcpu-arch.exception.nr == PF_VECTOR) { - printk(KERN_DEBUG kvm: inject_page_fault: -double fault 0x%lx\n, addr); - vcpu-arch.exception.nr = DF_VECTOR; - vcpu-arch.exception.error_code = 0; - } else if (vcpu-arch.exception.nr == DF_VECTOR) { + switch(vcpu-arch.exception.nr) { + case DF_VECTOR: /* triple fault - shutdown */ set_bit(KVM_REQ_TRIPLE_FAULT, vcpu-requests); + case PF_VECTOR: + vcpu-arch.exception.nr = DF_VECTOR; + vcpu-arch.exception.error_code = 0; + return; + default: + /* replace previous exception with a new one in a hope + that instruction re-execution will regenerate lost + exception */ + vcpu-arch.exception.pending = false; + break; } - return; } vcpu-arch.cr2 = addr; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); -- 1.6.2.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/9] IRQ/NMI window should always be requested.
Currently they are not requested if there is pending exception. Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/kvm/x86.c | 30 -- 1 files changed, 12 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 023842b..bce49da 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3127,8 +3127,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) kvm_x86_ops-update_cr8_intercept(vcpu, tpr, max_irr); } -static void inject_irq(struct kvm_vcpu *vcpu) +static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { + if (vcpu-guest_debug KVM_GUESTDBG_SINGLESTEP) + kvm_x86_ops-drop_interrupt_shadow(vcpu); + /* try to reinject previous events if any */ if (vcpu-arch.nmi_injected) { kvm_x86_ops-set_nmi(vcpu); @@ -3158,26 +3161,11 @@ static void inject_irq(struct kvm_vcpu *vcpu) } } -static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - bool req_int_win = !irqchip_in_kernel(vcpu-kvm) - kvm_run-request_interrupt_window; - - if (vcpu-guest_debug KVM_GUESTDBG_SINGLESTEP) - kvm_x86_ops-drop_interrupt_shadow(vcpu); - - inject_irq(vcpu); - - /* enable NMI/IRQ window open exits if needed */ - if (vcpu-arch.nmi_pending) - kvm_x86_ops-enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) - kvm_x86_ops-enable_irq_window(vcpu); -} - static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; + bool req_int_win = !irqchip_in_kernel(vcpu-kvm) + kvm_run-request_interrupt_window; if (vcpu-requests) if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, vcpu-requests)) @@ -3235,6 +3223,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) else inject_pending_irq(vcpu, kvm_run); + /* enable NMI/IRQ window open exits if needed */ + if (vcpu-arch.nmi_pending) + kvm_x86_ops-enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops-enable_irq_window(vcpu); + if (kvm_lapic_enabled(vcpu)) { if 
(!vcpu-arch.apic-vapic_addr) update_cr8_intercept(vcpu); -- 1.6.2.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 9/9] [SVM] inject NMI after IRET from a previous NMI, not before.
If NMI is received during handling of another NMI it should be injected immediately after IRET from previous NMI handler, but SVM intercept IRET before instruction execution so we can't inject pending NMI at this point and there is not way to request exit when NMI window opens. This patch fix SVM code to open NMI window after IRET by single stepping over IRET instruction. Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/include/asm/kvm_host.h |2 + arch/x86/kvm/svm.c | 62 +++--- 2 files changed, 52 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fea0429..bcd0857 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -358,6 +358,7 @@ struct kvm_vcpu_arch { unsigned int time_offset; struct page *time_page; + bool singlestep; /* guest is single stepped by KVM */ bool nmi_pending; bool nmi_injected; @@ -772,6 +773,7 @@ enum { #define HF_HIF_MASK(1 1) #define HF_VINTR_MASK (1 2) #define HF_NMI_MASK(1 3) +#define HF_IRET_MASK (1 4) /* * Hardware virtualization extension instructions may fault if a diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d5173a2..bf10991 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -933,15 +933,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, } -static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +static void update_db_intercept(struct kvm_vcpu *vcpu) { - int old_debug = vcpu-guest_debug; struct vcpu_svm *svm = to_svm(vcpu); - vcpu-guest_debug = dbg-control; - svm-vmcb-control.intercept_exceptions = ~((1 DB_VECTOR) | (1 BP_VECTOR)); + + if (vcpu-arch.singlestep) + svm-vmcb-control.intercept_exceptions |= (1 DB_VECTOR); + if (vcpu-guest_debug KVM_GUESTDBG_ENABLE) { if (vcpu-guest_debug (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) @@ -952,6 +953,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1 BP_VECTOR; } else vcpu-guest_debug = 0; +} + 
+static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +{ + int old_debug = vcpu-guest_debug; + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu-guest_debug = dbg-control; + + update_db_intercept(vcpu); if (vcpu-guest_debug KVM_GUESTDBG_USE_HW_BP) svm-vmcb-save.dr7 = dbg-arch.debugreg[7]; @@ -1101,14 +1112,30 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { if (!(svm-vcpu.guest_debug - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + !svm-vcpu.arch.singlestep) { kvm_queue_exception(svm-vcpu, DB_VECTOR); return 1; } - kvm_run-exit_reason = KVM_EXIT_DEBUG; - kvm_run-debug.arch.pc = svm-vmcb-save.cs.base + svm-vmcb-save.rip; - kvm_run-debug.arch.exception = DB_VECTOR; - return 0; + + if (svm-vcpu.arch.singlestep) { + svm-vcpu.arch.singlestep = false; + if (!(svm-vcpu.guest_debug KVM_GUESTDBG_SINGLESTEP)) + svm-vmcb-save.rflags = + ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + update_db_intercept(to_svm(svm)); + } + + if (svm-vcpu.guest_debug + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){ + kvm_run-exit_reason = KVM_EXIT_DEBUG; + kvm_run-debug.arch.pc = + svm-vmcb-save.cs.base + svm-vmcb-save.rip; + kvm_run-debug.arch.exception = DB_VECTOR; + return 0; + } + + return 1; } static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) @@ -1855,7 +1882,7 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { ++svm-vcpu.stat.nmi_window_exits; svm-vmcb-control.intercept = ~(1UL INTERCEPT_IRET); - svm-vcpu.arch.hflags = ~HF_NMI_MASK; + svm-vcpu.arch.hflags |= HF_IRET_MASK; return 1; } @@ -2331,8 +2358,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm-vmcb-control.int_state SVM_INTERRUPT_SHADOW_MASK) - enable_irq_window(vcpu); + if ((svm-vcpu.arch.hflags (HF_NMI_MASK | HF_IRET_MASK)) + == 
HF_NMI_MASK) + return; /* IRET will cause a vm exit */ + + /* Something prevents NMI from been injected. Single step over + possible problem (IRET or exception injection or
Re: Paravirtualisation or not?
If a set of drivers essentially implementing the virtio framework (virtio_pci, virtio_ring, virtio queues) were available for windows, that would be *really* neat. I haven't tried them myself but I think this will give you virtio-net for Windows: http://sourceforge.net/project/showfiles.php?group_id=180599&package_id=267943 More information: http://www.linux-kvm.com/content/tip-how-setup-windows-guest-paravirtual-network-drivers Stefan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 9/9] [SVM] inject NMI after IRET from a previous NMI, not before.
Gleb Natapov wrote: If NMI is received during handling of another NMI it should be injected immediately after IRET from previous NMI handler, but SVM intercept IRET before instruction execution so we can't inject pending NMI at this point and there is not way to request exit when NMI window opens. This patch fix SVM code to open NMI window after IRET by single stepping over IRET instruction. Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/include/asm/kvm_host.h |2 + arch/x86/kvm/svm.c | 62 +++--- 2 files changed, 52 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fea0429..bcd0857 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -358,6 +358,7 @@ struct kvm_vcpu_arch { unsigned int time_offset; struct page *time_page; + bool singlestep; /* guest is single stepped by KVM */ bool nmi_pending; bool nmi_injected; @@ -772,6 +773,7 @@ enum { #define HF_HIF_MASK (1 1) #define HF_VINTR_MASK(1 2) #define HF_NMI_MASK (1 3) +#define HF_IRET_MASK (1 4) /* * Hardware virtualization extension instructions may fault if a diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d5173a2..bf10991 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -933,15 +933,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, } -static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +static void update_db_intercept(struct kvm_vcpu *vcpu) { - int old_debug = vcpu-guest_debug; struct vcpu_svm *svm = to_svm(vcpu); - vcpu-guest_debug = dbg-control; - svm-vmcb-control.intercept_exceptions = ~((1 DB_VECTOR) | (1 BP_VECTOR)); + + if (vcpu-arch.singlestep) + svm-vmcb-control.intercept_exceptions |= (1 DB_VECTOR); + if (vcpu-guest_debug KVM_GUESTDBG_ENABLE) { if (vcpu-guest_debug (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) @@ -952,6 +953,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1 BP_VECTOR; } else 
vcpu-guest_debug = 0; +} + +static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +{ + int old_debug = vcpu-guest_debug; + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu-guest_debug = dbg-control; + + update_db_intercept(vcpu); if (vcpu-guest_debug KVM_GUESTDBG_USE_HW_BP) svm-vmcb-save.dr7 = dbg-arch.debugreg[7]; @@ -1101,14 +1112,30 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { if (!(svm-vcpu.guest_debug - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + !svm-vcpu.arch.singlestep) { kvm_queue_exception(svm-vcpu, DB_VECTOR); return 1; } - kvm_run-exit_reason = KVM_EXIT_DEBUG; - kvm_run-debug.arch.pc = svm-vmcb-save.cs.base + svm-vmcb-save.rip; - kvm_run-debug.arch.exception = DB_VECTOR; - return 0; + + if (svm-vcpu.arch.singlestep) { + svm-vcpu.arch.singlestep = false; + if (!(svm-vcpu.guest_debug KVM_GUESTDBG_SINGLESTEP)) + svm-vmcb-save.rflags = + ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + update_db_intercept(to_svm(svm)); + } + + if (svm-vcpu.guest_debug + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){ + kvm_run-exit_reason = KVM_EXIT_DEBUG; + kvm_run-debug.arch.pc = + svm-vmcb-save.cs.base + svm-vmcb-save.rip; + kvm_run-debug.arch.exception = DB_VECTOR; + return 0; + } + + return 1; } static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) @@ -1855,7 +1882,7 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { ++svm-vcpu.stat.nmi_window_exits; svm-vmcb-control.intercept = ~(1UL INTERCEPT_IRET); - svm-vcpu.arch.hflags = ~HF_NMI_MASK; + svm-vcpu.arch.hflags |= HF_IRET_MASK; return 1; } @@ -2331,8 +2358,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm-vmcb-control.int_state SVM_INTERRUPT_SHADOW_MASK) - enable_irq_window(vcpu); + if ((svm-vcpu.arch.hflags (HF_NMI_MASK | 
HF_IRET_MASK)) + == HF_NMI_MASK) + return; /* IRET will cause a vm exit */ + + /* Something prevents NMI from been injected. Single step over +possible problem (IRET
Re: Paravirtualisation or not?
On Tue, May 5, 2009 at 11:37 AM, Stefan Hajnoczi stefa...@gmail.com wrote: If a set of drivers essentially implementing the virtio framework (virtio_pci, virtio_ring, virtio queues) were available for windows, that would be *really* neat. I haven't tried them myself but I think this will give you virtio-net for Windows: http://sourceforge.net/project/showfiles.php?group_id=180599&package_id=267943 More information: http://www.linux-kvm.com/content/tip-how-setup-windows-guest-paravirtual-network-drivers Hi Stefan :) Sure, closed-source virtio-net drivers exist (in fact there is a newer version than the one you linked; I think it is 12/2008, distributed as an ISO). The point (and the advantage of Xen in this area) is that Xen provides the source too under GPL. Even if there was source available for the virtio framework only (and not net at all) it would still be useful to others wanting to write virtio drivers for windows. It is harder for a third party to do this job because you would have to make the decision to either use the Windows DDK and samples (which means you can't release under GPL and thus you can't reuse or even look at the current virtio implementations) or use GPL and the current linux virtio code as a base but in this case you can forget DDK and the samples (at least that is my understanding). Cheers, Pantelis -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/1] qemu-kvm: virtio-net: Re-instate GSO code removed upstream
This commit: commit 559a8f45f34cc50d1a60b4f67a06614d506b2e01 Subject: Remove stray GSO code from virtio_net (Mark McLoughlin) Removed some GSO code from upstream qemu.git, but it needs to be re-instated in qemu-kvm.git. Reported-by: Sridhar Samudrala s...@us.ibm.com Signed-off-by: Mark McLoughlin mar...@redhat.com --- hw/virtio-net.c |5 + 1 files changed, 5 insertions(+), 0 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index ac8e030..e5d7add 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -424,6 +424,11 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size) if (n-promisc) return 1; +#ifdef TAP_VNET_HDR +if (tap_has_vnet_hdr(n-vc-vlan-first_client)) +ptr += sizeof(struct virtio_net_hdr); +#endif + if (!memcmp(ptr[12], vlan, sizeof(vlan))) { int vid = be16_to_cpup((uint16_t *)(ptr + 14)) 0xfff; if (!(n-vlans[vid 5] (1U (vid 0x1f -- 1.6.0.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvm] virtio-net not working with the latest qemu-kvm git
On Mon, 2009-05-04 at 11:44 -0600, Alex Williamson wrote: On Mon, 2009-05-04 at 09:50 -0700, Sridhar Samudrala wrote: When I moved to the latest qemu-kvm git tree from kvm-85, I noticed that networking stopped working between the host and the guest. It started working when I put the device in promiscuous mode by running tcpdump in the background on the guest. After browsing through the recent patches, I found that the following commit is causing the regression. Remove stray GSO code from virtio_net (Mark McLoughlin) http://git.kernel.org/?p=virt/kvm/qemu-kvm.git;a=commitdiff;h=559a8f45f34cc50d1a60b4f67a06614d506b2e01 The comment doesn't seem to match with the code that is removed with this patch. Yep, I agree, the removed code is not bogus. We have to skip the vnet header to get to the ethernet header, which we do the filtering on. The code was removed in upstream qemu.git, but we need it re-instated in qemu-kvm.git. Just sent a patch to do that. Thanks, Mark. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 9/9] [SVM] inject NMI after IRET from a previous NMI, not before.
On Tue, May 05, 2009 at 10:45:20AM +0200, Jan Kiszka wrote: @@ -2331,8 +2358,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm-vmcb-control.int_state SVM_INTERRUPT_SHADOW_MASK) - enable_irq_window(vcpu); + if ((svm-vcpu.arch.hflags (HF_NMI_MASK | HF_IRET_MASK)) + == HF_NMI_MASK) + return; /* IRET will cause a vm exit */ + + /* Something prevents NMI from been injected. Single step over + possible problem (IRET or exception injection or interrupt + shadow) */ + vcpu-arch.singlestep = true; + svm-vmcb-save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); Can you single-step like this out of an IRQ handler? I mean, IRET will restore the flags from the stack, and those settings should be overwritten. Or am I missing something? It seems to be working :) Shouldn't CPU checks single step before executing IRET and thus using old flags value? It is interesting to check what rflag value is immediately after IRET. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 9/9] [SVM] inject NMI after IRET from a previous NMI, not before.
Gleb Natapov wrote: On Tue, May 05, 2009 at 10:45:20AM +0200, Jan Kiszka wrote: @@ -2331,8 +2358,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm-vmcb-control.int_state SVM_INTERRUPT_SHADOW_MASK) - enable_irq_window(vcpu); + if ((svm-vcpu.arch.hflags (HF_NMI_MASK | HF_IRET_MASK)) + == HF_NMI_MASK) + return; /* IRET will cause a vm exit */ + + /* Something prevents NMI from been injected. Single step over + possible problem (IRET or exception injection or interrupt + shadow) */ + vcpu-arch.singlestep = true; + svm-vmcb-save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); Can you single-step like this out of an IRQ handler? I mean, IRET will restore the flags from the stack, and those settings should be overwritten. Or am I missing something? It seems to be working :) Shouldn't CPU checks single step before executing IRET and thus using old flags value? It is interesting to check what rflag value is immediately after IRET. Hmm, guess I have to re-read some manuals. But regarding rflags-after-iret, I think it should be cleared due to that restoring from the stack. Jan -- Siemens AG, Corporate Technology, CT SE 2 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 9/9] [SVM] inject NMI after IRET from a previous NMI, not before.
On Tue, May 05, 2009 at 11:25:13AM +0200, Jan Kiszka wrote: Gleb Natapov wrote: On Tue, May 05, 2009 at 10:45:20AM +0200, Jan Kiszka wrote: @@ -2331,8 +2358,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm-vmcb-control.int_state SVM_INTERRUPT_SHADOW_MASK) - enable_irq_window(vcpu); + if ((svm-vcpu.arch.hflags (HF_NMI_MASK | HF_IRET_MASK)) + == HF_NMI_MASK) + return; /* IRET will cause a vm exit */ + + /* Something prevents NMI from been injected. Single step over +possible problem (IRET or exception injection or interrupt +shadow) */ + vcpu-arch.singlestep = true; + svm-vmcb-save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); Can you single-step like this out of an IRQ handler? I mean, IRET will restore the flags from the stack, and those settings should be overwritten. Or am I missing something? It seems to be working :) Shouldn't CPU checks single step before executing IRET and thus using old flags value? It is interesting to check what rflag value is immediately after IRET. Hmm, guess I have to re-read some manuals. But regarding rflags-after-iret, I think it should be cleared due to that restoring from the stack. Just re-tested this once more. DB is intercepted after IRET and TF/RF is cleared already. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 9/9] [SVM] inject NMI after IRET from a previous NMI, not before.
I noticed a small bug in previous patch. Use this one instead. (change update_db_intercept(to_svm(svm)) - update_db_intercept(svm-vcpu)) If NMI is received during handling of another NMI it should be injected immediately after IRET from previous NMI handler, but SVM intercept IRET before instruction execution so we can't inject pending NMI at this point and there is not way to request exit when NMI window opens. This patch fix SVM code to open NMI window after IRET by single stepping over IRET instruction. Signed-off-by: Gleb Natapov g...@redhat.com diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fea0429..bcd0857 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -358,6 +358,7 @@ struct kvm_vcpu_arch { unsigned int time_offset; struct page *time_page; + bool singlestep; /* guest is single stepped by KVM */ bool nmi_pending; bool nmi_injected; @@ -772,6 +773,7 @@ enum { #define HF_HIF_MASK(1 1) #define HF_VINTR_MASK (1 2) #define HF_NMI_MASK(1 3) +#define HF_IRET_MASK (1 4) /* * Hardware virtualization extension instructions may fault if a diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d5173a2..5c00258 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -933,15 +933,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, } -static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +static void update_db_intercept(struct kvm_vcpu *vcpu) { - int old_debug = vcpu-guest_debug; struct vcpu_svm *svm = to_svm(vcpu); - vcpu-guest_debug = dbg-control; - svm-vmcb-control.intercept_exceptions = ~((1 DB_VECTOR) | (1 BP_VECTOR)); + + if (vcpu-arch.singlestep) + svm-vmcb-control.intercept_exceptions |= (1 DB_VECTOR); + if (vcpu-guest_debug KVM_GUESTDBG_ENABLE) { if (vcpu-guest_debug (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) @@ -952,6 +953,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1 BP_VECTOR; } else vcpu-guest_debug 
= 0; +} + +static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +{ + int old_debug = vcpu-guest_debug; + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu-guest_debug = dbg-control; + + update_db_intercept(vcpu); if (vcpu-guest_debug KVM_GUESTDBG_USE_HW_BP) svm-vmcb-save.dr7 = dbg-arch.debugreg[7]; @@ -1101,14 +1112,30 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { if (!(svm-vcpu.guest_debug - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + !svm-vcpu.arch.singlestep) { kvm_queue_exception(svm-vcpu, DB_VECTOR); return 1; } - kvm_run-exit_reason = KVM_EXIT_DEBUG; - kvm_run-debug.arch.pc = svm-vmcb-save.cs.base + svm-vmcb-save.rip; - kvm_run-debug.arch.exception = DB_VECTOR; - return 0; + + if (svm-vcpu.arch.singlestep) { + svm-vcpu.arch.singlestep = false; + if (!(svm-vcpu.guest_debug KVM_GUESTDBG_SINGLESTEP)) + svm-vmcb-save.rflags = + ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + update_db_intercept(svm-vcpu); + } + + if (svm-vcpu.guest_debug + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){ + kvm_run-exit_reason = KVM_EXIT_DEBUG; + kvm_run-debug.arch.pc = + svm-vmcb-save.cs.base + svm-vmcb-save.rip; + kvm_run-debug.arch.exception = DB_VECTOR; + return 0; + } + + return 1; } static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) @@ -1855,7 +1882,7 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { ++svm-vcpu.stat.nmi_window_exits; svm-vmcb-control.intercept = ~(1UL INTERCEPT_IRET); - svm-vcpu.arch.hflags = ~HF_NMI_MASK; + svm-vcpu.arch.hflags |= HF_IRET_MASK; return 1; } @@ -2331,8 +2358,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm-vmcb-control.int_state SVM_INTERRUPT_SHADOW_MASK) - enable_irq_window(vcpu); + if ((svm-vcpu.arch.hflags (HF_NMI_MASK | HF_IRET_MASK)) + == 
HF_NMI_MASK) + return; /* IRET will cause a vm exit */ + + /* Something prevents NMI from been injected. Single step over + possible problem (IRET or exception injection or interrupt + shadow) */ +
Re: qemu/hw/device-assignment: questions about msix_table_page
On Mon, Apr 27, 2009 at 10:30:17PM +0800, Sheng Yang wrote: If the guest can write to the real device MSI-X table directly, it would cause chaos in interrupt delivery, because what the guest sees is totally different from what the host sees... Obviously. Thanks, What's the reason that this page is unmapped from the qemu memory space? Specifically what do these lines do: int offset = r_dev->msix_table_addr - real_region->base_addr; ret = munmap(region->u.r_virtbase + offset, TARGET_PAGE_SIZE); -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: qemu/hw/device-assignment: questions about msix_table_page
On Tue, May 05, 2009 at 12:51:36PM +0300, Michael S. Tsirkin wrote: On Mon, Apr 27, 2009 at 10:30:17PM +0800, Sheng Yang wrote: If guest can write to the real device MSI-X table directly, it would cause chaos on interrupt delivery, for what guest see is totally different with what's host see... Obviously. Thanks, What's the reason that this page is unmapped from the qemu memory space? Specifically what do these lines do: int offset = r_dev-msix_table_addr - real_region-base_addr; ret = munmap(region-u.r_virtbase + offset, TARGET_PAGE_SIZE); I believe this allows accesses to this page (the MSI-X table), which is part of the guest address space (through kvm memory slots), to be trapped by qemu. Since there is no actual page in this guest address, KVM treats accesses as MMIO and forwards them to QEMU. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?
The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have been merged for 2.6.30. However, I note that PCI spec allows devices to support multiple vectors with MSI as well (support will be in linux 2.6.30). Even though qemu for now only uses a single vector with MSI, it would seem that it's better to make the kernel/user interface generic straight away rather than add more ioctls later. What do you think? It might not be too late to fix this for 2.6.30. -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: qemu/hw/device-assignment: questions about msix_table_page
On Tue, May 05, 2009 at 01:34:50PM +0300, Michael S. Tsirkin wrote: On Tue, May 05, 2009 at 07:19:45AM -0300, Marcelo Tosatti wrote: On Tue, May 05, 2009 at 12:51:36PM +0300, Michael S. Tsirkin wrote: On Mon, Apr 27, 2009 at 10:30:17PM +0800, Sheng Yang wrote: If guest can write to the real device MSI-X table directly, it would cause chaos on interrupt delivery, for what guest see is totally different with what's host see... Obviously. Thanks, What's the reason that this page is unmapped from the qemu memory space? Specifically what do these lines do: int offset = r_dev-msix_table_addr - real_region-base_addr; ret = munmap(region-u.r_virtbase + offset, TARGET_PAGE_SIZE); I believe this allows accesses to this page (the MSI-X table), which is part of the guest address space (through kvm memory slots), to be trapped by qemu. Since there is no actual page in this guest address, KVM treats accesses as MMIO and forwards them to QEMU. I thought about this too. But why is this necessary for assigned MSI-X but not for emulated devices such as e.g. e1000? All e1000 does seems to be cpu_register_physical_memory ... Because there is no registered (kvm) memory slot for the range which e1000 registers its MMIO? Not sure about the address of the MSI-X table page, but you could achieve the same effect by splitting the slot which it lives in two, with a 1 page hole between them. BTW this is why you can't map the MSI-X table page directly, you want accesses to be trapped. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: qemu/hw/device-assignment: questions about msix_table_page
On Tue, May 05, 2009 at 07:19:45AM -0300, Marcelo Tosatti wrote: On Tue, May 05, 2009 at 12:51:36PM +0300, Michael S. Tsirkin wrote: On Mon, Apr 27, 2009 at 10:30:17PM +0800, Sheng Yang wrote: If guest can write to the real device MSI-X table directly, it would cause chaos on interrupt delivery, for what guest see is totally different with what's host see... Obviously. Thanks, What's the reason that this page is unmapped from the qemu memory space? Specifically what do these lines do: int offset = r_dev-msix_table_addr - real_region-base_addr; ret = munmap(region-u.r_virtbase + offset, TARGET_PAGE_SIZE); I believe this allows accesses to this page (the MSI-X table), which is part of the guest address space (through kvm memory slots), to be trapped by qemu. Since there is no actual page in this guest address, KVM treats accesses as MMIO and forwards them to QEMU. I thought about this too. But why is this necessary for assigned MSI-X but not for emulated devices such as e.g. e1000? All e1000 does seems to be cpu_register_physical_memory ... -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?
On Tue, May 05, 2009 at 01:30:28PM +0300, Michael S. Tsirkin wrote: The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have been merged for 2.6.30. However, I note that PCI spec allows devices to support multiple vectors with MSI as well (support will be in linux 2.6.30). Even though qemu for now only uses a single vector with MSI, it would seem that it's better to make the kernel/user interface generic straight away rather than add more ioctls later. What do you think? It might not be too late to fix this for 2.6.30. Can't you use more than one KVM_ASSIGN_SET_MSIX_ENTRY call per assigned device? If you can't, it would be better to change the ioctls before 2.6.30 is released, IMO. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?
On Tue, May 05, 2009 at 08:04:15AM -0300, Marcelo Tosatti wrote: On Tue, May 05, 2009 at 01:30:28PM +0300, Michael S. Tsirkin wrote: The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have been merged for 2.6.30. However, I note that PCI spec allows devices to support multiple vectors with MSI as well (support will be in linux 2.6.30). Even though qemu for now only uses a single vector with MSI, it would seem that it's better to make the kernel/user interface generic straight away rather than add more ioctls later. What do you think? It might not be too late to fix this for 2.6.30. Can't you use more than one KVM_ASSIGN_SET_MSIX_ENTRY call per assigned device? Sure, but only one KVM_ASSIGN_SET_MSIX_NR. If you can't, it would be better to change the ioctls before 2.6.30 is release IMO. -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM x86_64 with SR-IOV..? (device passthrough with LIO-Target v3.0)
On Tue, 2009-05-05 at 03:43 -0700, Nicholas A. Bellinger wrote: On Tue, 2009-05-05 at 09:42 +0800, Yu Zhao wrote: Hi, The VF also works in the host if the VF driver is programed properly. So it would be easier to develop the VF driver in the host and then verify the VF driver in the guest. BTW, I didn't see the SR-IOV is enabled in your dmesg, did you select the CONFIG_PCI_IOV in the kernel .config? Thanks, Yu Greetings Yu and Sheng, So the original attachment was for the v2.6.29-fc11 host kernel output, I ended up jumping to v2.6.30-rc3 (and making sure CONFIG_PCI_IOV was enabled) for KVM host with kvm-85 and now things are looking quite stable for me. So far I have been able to successfully push LIO-Target v3.0 traffic *inside* a v2.6.29.2 KVM guest via the onboard e1000e (02:00.0) port from another Linux/iSCSI Initiator machine using a Intel 1 Gb/sec port. I am running badblocks tests to iSCSI Logical Units for RAMDISK_DR and FILEIO storage objects (in the KVM Guest), and they are passing validation and I am seeing ~500 Mb/sec of throughput and very low CPU usage in the KVM guests. Ok I am seeing another issue with the e1000e port on 02:00.0..: As i start to push multiple badblocks tests RAMDISK_DR iSCSI Logical units into KVM Guest running LIO v2.6.29.2 from the external Linux/iSCSI Initiator machine, after about 100 GB of iSCSI traffic, I see the following exception in KVM host v2.6.30-rc3: DRHD: handling fault status reg 2 DMAR:[DMA Write] Request device [02:00.0] fault addr 7fc958b01 DMAR:[fault reason 04] Access beyond MGAW pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 60 for MSI/MSI-X pci-stub :02:00.0: irq 61 for MSI/MSI-X I am able to restart the LIO-Target KVM Guest and the Linux/iSCSI Initiators are able to reconnect.. Wow, very cool.. Not sure if this is a bug in the target_core_mod RAMDISK_DR subsystem plugin (mapping struct iovec to internally allocated struct page) or what. 
I will have to look at the DMAR code to understand what this exception means.. --nab One issue I did notice while using the pci-stub method of device-assignment with same e1000 port (02:00.0) was while using an iSCSI Initiator (Open-iSCSI) on the KVM Host machine and doing sustained traffic into the LIO-Target KVM Guest on the same local KVM host to max out traffic between the other onboard e1000e port (03.00.0), I see the following: pci-stub :02:00.0: PCI INT A - GSI 17 (level, low) - IRQ 17 assign device: host bdf = 2:0:0 pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 60 for MSI/MSI-X pci-stub :02:00.0: irq 61 for MSI/MSI-X scsi4 : iSCSI Initiator over TCP/IP scsi 4:0:0:0: Direct-Access LIO-ORG RAMDISK-DR 3.0 PQ: 0 ANSI: 5 sd 4:0:0:0: Attached scsi generic sg1 type 0 scsi 4:0:0:1: Direct-Access LIO-ORG RAMDISK-DR 3.0 PQ: 0 ANSI: 5 sd 4:0:0:1: Attached scsi generic sg2 type 0 sd 4:0:0:0: [sdb] 262144 512-byte hardware sectors: (134 MB/128 MiB) sd 4:0:0:1: [sdc] 262144 512-byte hardware sectors: (134 MB/128 MiB) sd 4:0:0:0: [sdb] Write Protect is off sd 4:0:0:0: [sdb] Mode Sense: 2f 00 00 00 sd 4:0:0:1: [sdc] Write Protect is off sd 4:0:0:1: [sdc] Mode Sense: 2f 00 00 00 sd 4:0:0:0: [sdb] Write cache: disabled, read cache: enabled, doesn't support DPO or FUA sd 4:0:0:1: [sdc] Write cache: disabled, read cache: enabled, doesn't support DPO or FUA sdb:6 sdc: unknown partition table sd 4:0:0:0: [sdb] Attached SCSI disk unknown partition table sd 4:0:0:1: [sdc] Attached SCSI disk [ cut here ] WARNING: at kernel/irq/manage.c:260 enable_irq+0x36/0x50() Hardware name: empty Unbalanced enable for IRQ 59 Modules linked in: ipt_REJECT xt_tcpudp bridge stp sunrpc iptable_filter ip_tables xt_state nf_conntrack ip6table_filter ip6_tables x_tables ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr 
ipv6 iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi cpufreq_ondemand acpi_cpufreq freq_table ext3 jbd loop dm_multipath scsi_dh kvm_intel kvm uinput i2c_i801 firewire_ohci joydev firewire_core sg i2c_core 8250_pnp crc_itu_t e1000e 8250 serial_core rtc_cmos pcspkr serio_raw rtc_core rtc_lib button sd_mod dm_snapshot dm_zero dm_mirror dm_region_hash dm_log dm_mod uhci_hcd ohci_hcd ehci_hcd ata_piix libata scsi_mod [last unloaded: microcode] Pid: 51, comm: events/0 Tainted: GW 2.6.30-rc3 #11 Call Trace: [80235fee] ? warn_slowpath+0xcb/0xe8 [80253a7c] ? generic_exec_single+0x6a/0x88 [8022acec] ? update_curr+0x67/0xeb [a0198748] ? vcpu_kick_intr+0x0/0x1 [kvm] [8020a5d8] ? __switch_to+0xb6/0x274 [8022b70a] ? __dequeue_entity+0x1b/0x2f [a01ac7e4] ?
Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?
Michael S. Tsirkin wrote: The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have been merged for 2.6.30. However, I note that PCI spec allows devices to support multiple vectors with MSI as well (support will be in linux 2.6.30). Even though qemu for now only uses a single vector with MSI, it would seem that it's better to make the kernel/user interface generic straight away rather than add more ioctls later. What do you think? It might not be too late to fix this for 2.6.30. +1 signature.asc Description: OpenPGP digital signature
Re: qemu/hw/device-assignment: questions about msix_table_page
On Tue, May 05, 2009 at 07:49:10AM -0300, Marcelo Tosatti wrote: On Tue, May 05, 2009 at 01:34:50PM +0300, Michael S. Tsirkin wrote: On Tue, May 05, 2009 at 07:19:45AM -0300, Marcelo Tosatti wrote: On Tue, May 05, 2009 at 12:51:36PM +0300, Michael S. Tsirkin wrote: On Mon, Apr 27, 2009 at 10:30:17PM +0800, Sheng Yang wrote: If guest can write to the real device MSI-X table directly, it would cause chaos on interrupt delivery, for what guest see is totally different with what's host see... Obviously. Thanks, What's the reason that this page is unmapped from the qemu memory space? Specifically what do these lines do: int offset = r_dev-msix_table_addr - real_region-base_addr; ret = munmap(region-u.r_virtbase + offset, TARGET_PAGE_SIZE); I believe this allows accesses to this page (the MSI-X table), which is part of the guest address space (through kvm memory slots), to be trapped by qemu. Since there is no actual page in this guest address, KVM treats accesses as MMIO and forwards them to QEMU. I thought about this too. But why is this necessary for assigned MSI-X but not for emulated devices such as e.g. e1000? All e1000 does seems to be cpu_register_physical_memory ... Because there is no registered (kvm) memory slot for the range which e1000 registers its MMIO? ret = kvm_register_phys_mem(kvm_context, e_phys, region-u.r_virtbase, TARGET_PAGE_ALIGN(e_size), 0); is what creates this slot, correct? Not sure about the address of the MSI-X table page, but you could achieve the same effect by splitting the slot which it lives in two, with a 1 page hole between them. BTW this is why you can't map the MSI-X table page directly, you want accesses to be trapped. -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?
Michael S. Tsirkin wrote: On Tue, May 05, 2009 at 08:04:15AM -0300, Marcelo Tosatti wrote: On Tue, May 05, 2009 at 01:30:28PM +0300, Michael S. Tsirkin wrote: The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have been merged for 2.6.30. However, I note that PCI spec allows devices to support multiple vectors with MSI as well (support will be in linux 2.6.30). Even though qemu for now only uses a single vector with MSI, it would seem that it's better to make the kernel/user interface generic straight away rather than add more ioctls later. What do you think? It might not be too late to fix this for 2.6.30. Can't you use more than one KVM_ASSIGN_SET_MSIX_ENTRY call per assigned device? Sure, but only one KVM_ASSIGN_SET_MSIX_NR. MSIX_NR is the size of the table, while MSIX_ENTRY updates a single entry, if I read the code correctly. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] kvm: fix comment on locking
__kvm_set_memory_region callers must (and do) take slots_lock, not mmap_sem, for write. Fix the comment to match this reality. Signed-off-by: Michael S. Tsirkin m...@redhat.com --- virt/kvm/kvm_main.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 605697e..060d86c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -911,7 +911,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) * * Discontiguous memory is allowed, mostly for framebuffers. * - * Must be called holding mmap_sem for write. + * Must be called holding slots_lock for write. */ int __kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, -- 1.6.3.rc3.1.g830204 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?
On Tue, May 05, 2009 at 02:57:10PM +0300, Avi Kivity wrote: Michael S. Tsirkin wrote: On Tue, May 05, 2009 at 08:04:15AM -0300, Marcelo Tosatti wrote: On Tue, May 05, 2009 at 01:30:28PM +0300, Michael S. Tsirkin wrote: The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have been merged for 2.6.30. However, I note that PCI spec allows devices to support multiple vectors with MSI as well (support will be in linux 2.6.30). Even though qemu for now only uses a single vector with MSI, it would seem that it's better to make the kernel/user interface generic straight away rather than add more ioctls later. What do you think? It might not be too late to fix this for 2.6.30. Can't you use more than one KVM_ASSIGN_SET_MSIX_ENTRY call per assigned device? Sure, but only one KVM_ASSIGN_SET_MSIX_NR. MSIX_NR is the size of the table, while MSIX_ENTRY updates a single entry, if I read the code correctly. Right. So we'll need something like this for MSI as well. Actually maybe MSIX_NR MSIX_ENTRY should be renamed to MSI_NR / MSI_ENTRY and changed to do the right thing depending on the IRQ type? -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[ kvm-Bugs-2787205 ] Video: KVM graphics performance dropped
Bugs item #2787205, was opened at 2009-05-05 15:02 Message generated for change (Tracker Item Submitted) made by technologov You can respond by visiting: https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2787205&group_id=180599 Please note that this message will contain a full copy of the comment thread, including the initial issue submission, for this request, not just the latest update. Category: None Group: None Status: Open Resolution: None Priority: 8 Private: No Submitted By: Technologov (technologov) Assigned to: Nobody/Anonymous (nobody) Summary: Video: KVM graphics performance dropped Initial Comment: Starting with KVM-84 the video performance dropped to a turtle speed when using remote X11 SDL rendering. KVM-85 is even worse, because in addition to slow speed, it adds flickering. KVM is basically useless when working over remote X11/SDL. -Alexey, 5.5.2009. -- You can respond by visiting: https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2787205&group_id=180599 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?
Michael S. Tsirkin wrote: On Tue, May 05, 2009 at 02:57:10PM +0300, Avi Kivity wrote: Michael S. Tsirkin wrote: On Tue, May 05, 2009 at 08:04:15AM -0300, Marcelo Tosatti wrote: On Tue, May 05, 2009 at 01:30:28PM +0300, Michael S. Tsirkin wrote: The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have been merged for 2.6.30. However, I note that PCI spec allows devices to support multiple vectors with MSI as well (support will be in linux 2.6.30). Even though qemu for now only uses a single vector with MSI, it would seem that it's better to make the kernel/user interface generic straight away rather than add more ioctls later. What do you think? It might not be too late to fix this for 2.6.30. Can't you use more than one KVM_ASSIGN_SET_MSIX_ENTRY call per assigned device? Sure, but only one KVM_ASSIGN_SET_MSIX_NR. MSIX_NR is the size of the table, while MSIX_ENTRY updates a single entry, if I read the code correctly. Right. So we'll need something like this for MSI as well. Actually maybe MSIX_NR MSIX_ENTRY should be renamed to MSI_NR / MSI_ENTRY and changed to do the right thing depending on the IRQ type? Works for me. Sheng, is there a reason why it wasn't done like this? btw, it could be further simplified by using irqfd. Instead of the host device tying directly into kvm, it could just trigger an eventfd; and we could terminate the eventfd either in kvm (irqfd) or in qemu. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?
On Tue, May 05, 2009 at 03:08:40PM +0300, Avi Kivity wrote: Michael S. Tsirkin wrote: On Tue, May 05, 2009 at 02:57:10PM +0300, Avi Kivity wrote: Michael S. Tsirkin wrote: On Tue, May 05, 2009 at 08:04:15AM -0300, Marcelo Tosatti wrote: On Tue, May 05, 2009 at 01:30:28PM +0300, Michael S. Tsirkin wrote: The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have been merged for 2.6.30. However, I note that PCI spec allows devices to support multiple vectors with MSI as well (support will be in linux 2.6.30). Even though qemu for now only uses a single vector with MSI, it would seem that it's better to make the kernel/user interface generic straight away rather than add more ioctls later. What do you think? It might not be too late to fix this for 2.6.30. Can't you use more than one KVM_ASSIGN_SET_MSIX_ENTRY call per assigned device? Sure, but only one KVM_ASSIGN_SET_MSIX_NR. MSIX_NR is the size of the table, while MSIX_ENTRY updates a single entry, if I read the code correctly. Right. So we'll need something like this for MSI as well. Actually maybe MSIX_NR MSIX_ENTRY should be renamed to MSI_NR / MSI_ENTRY and changed to do the right thing depending on the IRQ type? Works for me. Sheng, is there a reason why it wasn't done like this? btw, it could be further simplified by using irqfd. Instead of the host device tying directly into kvm, it could just trigger an eventfd; and we could terminate the eventfd either in kvm (irqfd) or in qemu. This probably is outside the scope for 2.6.30 :) -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Serialize qcow2 writes
Commit 641636d (qcow2 corruption: Fix alloc_cluster_link_l2; 4df8f71 on stable-0.10) exposes a bug with concurrent allocating qcow2 writes: the writes will trigger a call to free_any_clusters() and corrupt the image. As a temporary workaround until a real fix is written, this patch serializes writes to avoid the issue. With this, I can install Fedora 10 on a virtio disk. Signed-off-by: Avi Kivity a...@redhat.com --- block-qcow2.c | 27 ++- 1 files changed, 26 insertions(+), 1 deletions(-) diff --git a/block-qcow2.c b/block-qcow2.c index 1f33125..6685915 100644 --- a/block-qcow2.c +++ b/block-qcow2.c @@ -157,6 +157,8 @@ typedef struct BDRVQcowState { int snapshots_size; int nb_snapshots; QCowSnapshot *snapshots; +int write_in_progress; +TAILQ_HEAD(QCow2DeferredWrites, QCowAIOCB) deferred_writes; } BDRVQcowState; static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset); @@ -371,6 +373,9 @@ static int qcow_open(BlockDriverState *bs, const char *filename, int flags) if (qcow_read_snapshots(bs) 0) goto fail; +s-write_in_progress = 0; +TAILQ_INIT(s-deferred_writes); + #ifdef DEBUG_ALLOC check_refcounts(bs); #endif @@ -1274,6 +1279,7 @@ typedef struct QCowAIOCB { QEMUIOVector hd_qiov; QEMUBH *bh; QCowL2Meta l2meta; +TAILQ_ENTRY(QCowAIOCB) deferred_writes_link; } QCowAIOCB; static void qcow_aio_read_cb(void *opaque, int ret); @@ -1439,6 +1445,8 @@ static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs, return acb-common; } +static void fire_deferred_writes(BDRVQcowState *s); + static void qcow_aio_write_cb(void *opaque, int ret) { QCowAIOCB *acb = opaque; @@ -1509,6 +1517,21 @@ done: qemu_vfree(acb-orig_buf); acb-common.cb(acb-common.opaque, ret); qemu_aio_release(acb); + +s-write_in_progress = 0; +fire_deferred_writes(s); +} + +static void fire_deferred_writes(BDRVQcowState *s) +{ +QCowAIOCB *acb; + +if (!s-write_in_progress !TAILQ_EMPTY(s-deferred_writes)) { +s-write_in_progress = 1; +acb = TAILQ_FIRST(s-deferred_writes); 
+TAILQ_REMOVE(s-deferred_writes, acb, deferred_writes_link); +qcow_aio_write_cb(acb, 0); +} } static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs, @@ -1524,7 +1547,9 @@ static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs, if (!acb) return NULL; -qcow_aio_write_cb(acb, 0); +TAILQ_INSERT_TAIL(s-deferred_writes, acb, deferred_writes_link); +fire_deferred_writes(s); + return acb-common; } -- 1.6.1.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: qemu/hw/device-assignment: questions about msix_table_page
On Tue, May 05, 2009 at 07:49:10AM -0300, Marcelo Tosatti wrote: On Tue, May 05, 2009 at 01:34:50PM +0300, Michael S. Tsirkin wrote: On Tue, May 05, 2009 at 07:19:45AM -0300, Marcelo Tosatti wrote: On Tue, May 05, 2009 at 12:51:36PM +0300, Michael S. Tsirkin wrote: On Mon, Apr 27, 2009 at 10:30:17PM +0800, Sheng Yang wrote: If guest can write to the real device MSI-X table directly, it would cause chaos on interrupt delivery, for what guest see is totally different with what's host see... Obviously. Thanks, What's the reason that this page is unmapped from the qemu memory space? Specifically what do these lines do: int offset = r_dev-msix_table_addr - real_region-base_addr; ret = munmap(region-u.r_virtbase + offset, TARGET_PAGE_SIZE); I believe this allows accesses to this page (the MSI-X table), which is part of the guest address space (through kvm memory slots), to be trapped by qemu. Since there is no actual page in this guest address, KVM treats accesses as MMIO and forwards them to QEMU. I thought about this too. But why is this necessary for assigned MSI-X but not for emulated devices such as e.g. e1000? All e1000 does seems to be cpu_register_physical_memory ... Because there is no registered (kvm) memory slot for the range which e1000 registers its MMIO? Not sure about the address of the MSI-X table page, but you could achieve the same effect by splitting the slot which it lives in two, with a 1 page hole between them. You could also move the emulated MSI-X table, sticking it on top of the existing BAR. Since PCI config includes the pointer to the table, a driver that reads this pointer will continue to work. Of course, there's no guarantee that guest drivers don't just hard-code this offset. BTW this is why you can't map the MSI-X table page directly, you want accesses to be trapped. BTW current design won't work if the base page size is 4K, will it? The hole covers a page, so you'll get faults outside the MSI-X table. 
-- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?
On Tue, May 05, 2009 at 03:08:40PM +0300, Avi Kivity wrote: Michael S. Tsirkin wrote: On Tue, May 05, 2009 at 02:57:10PM +0300, Avi Kivity wrote: Michael S. Tsirkin wrote: On Tue, May 05, 2009 at 08:04:15AM -0300, Marcelo Tosatti wrote: On Tue, May 05, 2009 at 01:30:28PM +0300, Michael S. Tsirkin wrote: The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have been merged for 2.6.30. However, I note that PCI spec allows devices to support multiple vectors with MSI as well (support will be in linux 2.6.30). Even though qemu for now only uses a single vector with MSI, it would seem that it's better to make the kernel/user interface generic straight away rather than add more ioctls later. What do you think? It might not be too late to fix this for 2.6.30. Can't you use more than one KVM_ASSIGN_SET_MSIX_ENTRY call per assigned device? Sure, but only one KVM_ASSIGN_SET_MSIX_NR. MSIX_NR is the size of the table, while MSIX_ENTRY updates a single entry, if I read the code correctly. Right. So we'll need something like this for MSI as well. Actually maybe MSIX_NR MSIX_ENTRY should be renamed to MSI_NR / MSI_ENTRY and changed to do the right thing depending on the IRQ type? Works for me. Sheng, is there a reason why it wasn't done like this? btw, it could be further simplified by using irqfd. Instead of the host device tying directly into kvm, it could just trigger an eventfd; and we could terminate the eventfd either in kvm (irqfd) or in qemu. If you are going wild, you could then split this code out from kvm into something like a UIO driver. E.g. qemu could then in theory support assigned devices even without VT-d hardware support in CPU. -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 0/3] generic hypercall support
(Applies to Linus' tree, b4348f32dae3cb6eb4bc21c7ed8f76c0b11e9d6a) Please see patch 1/3 for a description. This has been tested with a KVM guest on x86_64 and appears to work properly. Comments, please. -Greg --- Gregory Haskins (3): kvm: add pv_cpu_ops.hypercall support to the guest x86: add generic hypercall support add generic hypercall support arch/Kconfig |3 + arch/x86/Kconfig |1 arch/x86/include/asm/paravirt.h | 13 ++ arch/x86/include/asm/processor.h |6 +++ arch/x86/kernel/kvm.c| 22 ++ include/linux/hypercall.h| 83 ++ 6 files changed, 128 insertions(+), 0 deletions(-) create mode 100644 include/linux/hypercall.h -- Signature -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 1/3] add generic hypercall support
We add a generic hypercall() mechanism for use by IO code which is compatible with a variety of hypervisors, but which prefers to use hypercalls over other types of hypervisor traps for performance and/or feature reasons. For instance, consider an emulated PCI device in KVM. Today we can chose to do IO over MMIO or PIO infrastructure, but they each have their own distinct disadvantages: *) MMIO causes a page-fault, which must be decoded by the hypervisor and is therefore fairly expensive. *) PIO is more direct than MMIO, but it poses other problems such as: a) can have a small limited address space (x86 is 2^16) b) is a narrow-band interface (one 8, 16, 32, 64 bit word at a time) c) not available on all archs (PCI mentions ppc as problematic) and is therefore recommended to avoid. Hypercalls, on the other hand, offer a direct access path like PIOs, yet do not suffer the same drawbacks such as a limited address space or a narrow-band interface. Hypercalls are much more friendly to software to software interaction since we can pack multiple registers in a way that is natural and simple for software to utilize. The problem with hypercalls today is that there is no generic support. There is various support for hypervisor specific implementations (for instance, see kvm_hypercall0() in arch/x86/include/asm/kvm_para.h). This makes it difficult to implement a device that is hypervisor agnostic since it would not only need to know the hypercall ABI, but also which platform specific function call it should make. If we can convey a dynamic binding to a specific hypercall vector in a generic way (out of the scope of this patch series), then an IO driver could utilize that dynamic binding to communicate without requiring hypervisor specific knowledge. 
Therefore, we implement a system wide hypercall() interface based on a variable length list of unsigned longs (representing registers to pack) and expect that various arch/hypervisor implementations can fill in the details, if supported. This is expected to be done as part of the pv_ops infrastructure, which is the natural hook-point for hypervisor specific code. Note, however, that the generic hypercall() interface does not require the implementation to use pv_ops if so desired. Example use case: -- Consider a PCI device X. It can already advertise MMIO/PIO regions via its BAR infrastructure. With this new model it could also advertise a hypercall vector in its device-specific upper configuration space. (The allocation and assignment of this vector on the backend is beyond the scope of this series). The guest-side driver for device X would sense (via something like a feature-bit) if the hypercall was available and valid, read the value with a configuration cycle, and proceed to ignore the BARs in favor of using the hypercall() interface. Signed-off-by: Gregory Haskins ghask...@novell.com --- include/linux/hypercall.h | 83 + 1 files changed, 83 insertions(+), 0 deletions(-) create mode 100644 include/linux/hypercall.h diff --git a/include/linux/hypercall.h b/include/linux/hypercall.h new file mode 100644 index 000..c8a1492 --- /dev/null +++ b/include/linux/hypercall.h @@ -0,0 +1,83 @@ +/* + * Copyright 2009 Novell. All Rights Reserved. + * + * Author: + * Gregory Haskins ghask...@novell.com + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef _LINUX_HYPERCALL_H +#define _LINUX_HYPERCALL_H + +#ifdef CONFIG_HAVE_HYPERCALL + +long hypercall(unsigned long nr, unsigned long *args, size_t count); + +#else + +static inline long +hypercall(unsigned long nr, unsigned long *args, size_t count) +{ + return -EINVAL; +} + +#endif /* CONFIG_HAVE_HYPERCALL */ + +#define hypercall0(nr) hypercall(nr, NULL, 0) +#define hypercall1(nr, a1) \ + ({ \ + unsigned long __args[] = { a1, }; \ + long __ret; \ + __ret = hypercall(nr, __args, ARRAY_SIZE(__args)); \ + __ret; \ + }) +#define hypercall2(nr, a1, a2) \ + ({
[RFC PATCH 3/3] kvm: add pv_cpu_ops.hypercall support to the guest
Signed-off-by: Gregory Haskins ghask...@novell.com --- arch/x86/kernel/kvm.c | 22 ++ 1 files changed, 22 insertions(+), 0 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 33019dd..d299ed5 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -50,6 +50,26 @@ static void kvm_io_delay(void) { } +static long _kvm_hypercall(unsigned long nr, + unsigned long *args, + size_t count) +{ + switch (count) { + case 0: + return kvm_hypercall0(nr); + case 1: + return kvm_hypercall1(nr, args[0]); + case 2: + return kvm_hypercall2(nr, args[0], args[1]); + case 3: + return kvm_hypercall3(nr, args[0], args[1], args[2]); + case 4: + return kvm_hypercall4(nr, args[0], args[1], args[2], args[3]); + default: + return -EINVAL; + } +} + static void kvm_mmu_op(void *buffer, unsigned len) { int r; @@ -207,6 +227,8 @@ static void paravirt_ops_setup(void) if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; + pv_cpu_ops.hypercall = _kvm_hypercall; + if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) { pv_mmu_ops.set_pte = kvm_set_pte; pv_mmu_ops.set_pte_at = kvm_set_pte_at; -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 2/3] x86: add generic hypercall support
This adds a hypercall() vector to x86 pv_cpu_ops to be optionally filled in by a hypervisor driver as it loads its other pv_ops components. We also declare x86 as CONFIG_HAVE_HYPERCALL to enable the generic hypercall code whenever the user builds for x86. Signed-off-by: Gregory Haskins ghask...@novell.com --- arch/Kconfig |3 +++ arch/x86/Kconfig |1 + arch/x86/include/asm/paravirt.h | 13 + arch/x86/include/asm/processor.h |6 ++ 4 files changed, 23 insertions(+), 0 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 78a35e9..239b658 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -112,3 +112,6 @@ config HAVE_DMA_API_DEBUG config HAVE_DEFAULT_NO_SPIN_MUTEXES bool + +config HAVE_HYPERCALL +bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index df9e885..3c609cf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -46,6 +46,7 @@ config X86 select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA + select HAVE_HYPERCALL config ARCH_DEFCONFIG string diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 378e369..ed22c84 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -6,6 +6,7 @@ #ifdef CONFIG_PARAVIRT #include asm/pgtable_types.h #include asm/asm.h +#include asm/errno.h /* Bitmask of what can be clobbered: usually at least eax. 
*/ #define CLBR_NONE 0 @@ -203,6 +204,8 @@ struct pv_cpu_ops { void (*swapgs)(void); + long (*hypercall)(unsigned long nr, unsigned long *args, size_t count); + struct pv_lazy_ops lazy_mode; }; @@ -723,6 +726,16 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx); } +static inline long hypercall(unsigned long nr, +unsigned long *args, +size_t count) +{ + if (!pv_cpu_ops.hypercall) + return -EINVAL; + + return pv_cpu_ops.hypercall(nr, args, count); +} + /* * These special macros can be used to get or set a debugging register */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c2cceae..8fa988d 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -570,6 +570,12 @@ static inline void native_swapgs(void) #define __cpuidnative_cpuid #define paravirt_enabled() 0 +static inline long +hypercall(unsigned long nr, unsigned long *args, size_t count) +{ + return -EINVAL; +} + /* * These special macros can be used to get or set a debugging register */ -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?
Michael S. Tsirkin wrote: Works for me. Sheng, is there a reason why it wasn't done like this? btw, it could be further simplified by using irqfd. Instead of the host device tying directly into kvm, it could just trigger an eventfd; and we could terminate the eventfd either in kvm (irqfd) or in qemu. If you are going wild, you could then split this code out from kvm into something like a UIO driver. E.g. qemu could then in theory support assigned devices even without VT-d hardware support in CPU. That's my thinking. PCI interrupts don't work because we need to do some hacky stuff in there, but MSI should. Oh, and we could improve UIO support for interrupts when using MSI, since there's no need to acknowledge the interrupt. Suppose we can tell the kernel to signal an eventfd whenever an MSI fires. We then ask kvm for an irqfd, and give that irqfd to the kernel for the MSI. Voila, we assign an interrupt from userspace, without the device or kvm knowing anything about it. Like you say, we can assign the device to pure qemu, or to a userspace driver. Beautiful, I finally found something to replace my old Lego set. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/3] generic hypercall support
Gregory Haskins wrote: (Applies to Linus' tree, b4348f32dae3cb6eb4bc21c7ed8f76c0b11e9d6a) Please see patch 1/3 for a description. This has been tested with a KVM guest on x86_64 and appears to work properly. Comments, please. What about the hypercalls in include/asm/kvm_para.h? In general, hypercalls cannot be generic since each hypervisor implements its own ABI. The abstraction needs to be at a higher level (pv_ops is such a level). -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/3] generic hypercall support
Avi Kivity wrote: Gregory Haskins wrote: (Applies to Linus' tree, b4348f32dae3cb6eb4bc21c7ed8f76c0b11e9d6a) Please see patch 1/3 for a description. This has been tested with a KVM guest on x86_64 and appears to work properly. Comments, please. What about the hypercalls in include/asm/kvm_para.h? In general, hypercalls cannot be generic since each hypervisor implements its own ABI. Please see the prologue to 1/3. It's all described there, including a use case which I think answers your questions. If there is still ambiguity, let me know. The abstraction needs to be at a higher level (pv_ops is such a level). Yep, agreed. That's exactly what this series is doing, actually. -Greg signature.asc Description: OpenPGP digital signature
Re: [PATCH] deal with interrupt shadow state for emulated instruction
Hmm, if the guest runs an infinite emulated 'mov ss', it will keep toggling the MOV_SS bit, but STI will remain set, so we'll never allow an interrupt into the guest kernel. We have no choice but returning both flags, since svm does not differentiate between them. But see below for an alternative path that makes it a non-issue. diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index d2664fc..797d41f 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1618,6 +1618,16 @@ special_insn: int err; sel = c-src.val; +if (c-modrm_reg == VCPU_SREG_SS) { +u32 int_shadow = +kvm_x86_ops-get_interrupt_shadow(ctxt-vcpu); +/* See sti emulation for an explanation of this */ +if ((int_shadow X86_SHADOW_INT_MOV_SS)) +ctxt-interruptibility = ~X86_SHADOW_INT_MOV_SS; +else +ctxt-interruptibility |= X86_SHADOW_INT_MOV_SS; +} ^= =p \o/ After re-reading this, masking the flags in here makes no sense. I am moving to an approach in which I do if (!(int_shadow X86_SHADOW_INT_MOV_SS)) ctxt-interruptibility = X86_SHADOW_INT_MOV_SS; Since if the next instruction is an sti, it is certainly not an sti; sti instruction (the current is mov ss, after all). So we should mask it anyway. This also solves nicely the problem you raised at svm.c. @@ -1846,10 +1856,23 @@ special_insn: ctxt-eflags = ~X86_EFLAGS_IF; c-dst.type = OP_NONE; /* Disable writeback. */ break; -case 0xfb: /* sti */ +case 0xfb: { /* sti */ +u32 int_shadow = kvm_x86_ops-get_interrupt_shadow(ctxt-vcpu); +/* + * an sti; sti; sequence only disable interrupts for the first + * instruction. So, if the last instruction, be it emulated or + * not, left the system with the INT_STI flag enabled, it + * means that the last instruction is an sti. 
We should not + * leave the flag on in this case + */ +if ((int_shadow X86_SHADOW_INT_STI)) +ctxt-interruptibility = ~X86_SHADOW_INT_STI; +else +ctxt-interruptibility |= X86_SHADOW_INT_STI; ^= ditto -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/3] generic hypercall support
Gregory Haskins wrote: Avi Kivity wrote: Gregory Haskins wrote: (Applies to Linus' tree, b4348f32dae3cb6eb4bc21c7ed8f76c0b11e9d6a) Please see patch 1/3 for a description. This has been tested with a KVM guest on x86_64 and appears to work properly. Comments, please. What about the hypercalls in include/asm/kvm_para.h? In general, hypercalls cannot be generic since each hypervisor implements its own ABI. Please see the prologue to 1/3. Its all described there, including a use case which I think answers your questions. If there is still ambiguity, let me know. Yeah, sorry. The abstraction needs to be at a higher level (pv_ops is such a level). Yep, agreed. Thats exactly what this series is doing, actually. No, it doesn't. It makes making hypercalls a pv_op, but hypervisors don't implement the same ABI. pv_ops all _use_ hypercalls to implement higher level operations, like set_pte (probably the only place set_pte can be considered a high level operation). In this case, the higher level event could be hypervisor_dynamic_event(number); each pv_ops implementation would use its own hypercalls to implement that. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/3] generic hypercall support
Avi Kivity wrote: Gregory Haskins wrote: Avi Kivity wrote: Gregory Haskins wrote: (Applies to Linus' tree, b4348f32dae3cb6eb4bc21c7ed8f76c0b11e9d6a) Please see patch 1/3 for a description. This has been tested with a KVM guest on x86_64 and appears to work properly. Comments, please. What about the hypercalls in include/asm/kvm_para.h? In general, hypercalls cannot be generic since each hypervisor implements its own ABI. Please see the prologue to 1/3. Its all described there, including a use case which I think answers your questions. If there is still ambiguity, let me know. Yeah, sorry. The abstraction needs to be at a higher level (pv_ops is such a level). Yep, agreed. Thats exactly what this series is doing, actually. No, it doesn't. It makes making hypercalls a pv_op, but hypervisors don't implement the same ABI. Yes, that is true, but I think the issue right now is more of semantics. I think we are on the same page. So you would never have someone making a generic hypercall(KVM_HC_MMU_OP). I agree. What I am proposing here is more akin to PIO-BAR + iowrite()/ioread(). E.g. the infrastructure sets up the addressing (where in PIO this is literally an address, and for hypercalls this is a vector), but the device defines the ABI at that address. So its really the device end-point that is defining the ABI here, not the hypervisor (per se) and thats why I thought its ok to declare these generic. But to your point below... pv_ops all _use_ hypercalls to implement higher level operations, like set_pte (probably the only place set_pte can be considered a high level operation). In this case, the higher level event could be hypervisor_dynamic_event(number); each pv_ops implementation would use its own hypercalls to implement that. I see. I had designed it slightly different where KVM could assign any top level vector it wanted and thus that drove the guest-side interface you see here to be more generic hypercall. 
However, I think your proposal is perfectly fine too and it makes sense to more narrowly focus these calls as specifically dynamic...as thats the only vectors that we could technically use like this anyway. So rather than allocate a top-level vector, I will add KVM_HC_DYNAMIC to kvm_para.h, and I will change the interface to follow suit (something like s/hypercall/dynhc). Sound good? Thanks, Avi, -Greg signature.asc Description: OpenPGP digital signature
Re: [KVM PATCH v4 0/2] irqfd
On Mon, 4 May 2009, Gregory Haskins wrote: (Applies to kvm.git:7da2e3ba, plus you will also need Davide Libenzi's eventfd_file_create() patch, which you can find here: http://www.mail-archive.com/kvm@vger.kernel.org/msg13923.html Ping me back if Al acks the irqfd thing, that I'll take a better look at the patch above and make an official post. Without any users, I'd rather leave the current code as is. - Davide -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/3] generic hypercall support
Gregory Haskins wrote: So rather than allocate a top-level vector, I will add KVM_HC_DYNAMIC to kvm_para.h, and I will change the interface to follow suit (something like s/hypercall/dynhc). Sound good? A small ramification of this change will be that I will need to do something like add a feature-bit to cpuid for detecting if HC_DYNAMIC is supported on the backend or not. The current v1 design doesn't suffer from this requirement because the presence of the dynamic vector itself is enough to know its supported. I like Avi's proposal enough to say that its worth this minor inconvenience, but FYI I will have to additionally submit a userspace patch for v2 if we go this route. -Greg signature.asc Description: OpenPGP digital signature
Re: [KVM PATCH v4 0/2] irqfd
Davide Libenzi wrote: On Mon, 4 May 2009, Gregory Haskins wrote: (Applies to kvm.git:7da2e3ba, plus you will also need Davide Libenzi's eventfd_file_create() patch, which you can find here: http://www.mail-archive.com/kvm@vger.kernel.org/msg13923.html Ping me back if Al acks the irqfd thing, that I'll take a better look at the patch above and make an official post. Without any users, I'd rather leave the current code as is. Will do, Davide. Thank you. -Greg signature.asc Description: OpenPGP digital signature
[PATCH 0/6] kvm-s390: collection of kvm-s390 fixes
From: Christian Ehrhardt ehrha...@de.ibm.com This is a collection of fixes for kvm-s390 that originate from several tests made in the last few months. They have now been tested for a while and should be ready to be merged. All six patches were created by either Carsten Otte or Christian Borntraeger. I'm just the one stumbling across the filled patch queue and cleaning them up for submission. The patches themselves have proper tags to account creator etc. *not sending patches a few weeks makes somewhat forgetful - I beg a pardon from all on cc that got it two times now after adding the kvm list this time. Patches included: [PATCH 1/6] kvm-s390: Fix memory slot versus run [PATCH 2/6] kvm-s390: use hrtimer for clock wakeup from idle [PATCH 3/6] kvm-s390: optimize float int lock: spin_lock_bh --> spin_lock [PATCH 4/6] kvm-s390: Unlink vcpu on destroy [PATCH 5/6] kvm-s390: Sanity check on validity intercept [PATCH 6/6] kvm-s390: Verify memory in kvm run Overall-Diffstat: arch/s390/include/asm/kvm_host.h |5 ++- arch/s390/kvm/intercept.c| 28 --- arch/s390/kvm/interrupt.c| 55 --- arch/s390/kvm/kvm-s390.c | 50 --- arch/s390/kvm/kvm-s390.h |4 ++ arch/s390/kvm/priv.c |4 +- arch/s390/kvm/sigp.c | 16 +-- 7 files changed, 110 insertions(+), 52 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/6] kvm-s390: optimize float int lock: spin_lock_bh --> spin_lock
From: Christian Borntraeger borntrae...@de.ibm.com The floating interrupt lock is only taken in process context. We can replace all spin_lock_bh with standard spin_lock calls. Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com --- arch/s390/kvm/interrupt.c | 20 ++-- arch/s390/kvm/kvm-s390.c |4 ++-- arch/s390/kvm/priv.c |4 ++-- arch/s390/kvm/sigp.c | 16 4 files changed, 22 insertions(+), 22 deletions(-) Index: kvm/arch/s390/kvm/interrupt.c === --- kvm.orig/arch/s390/kvm/interrupt.c +++ kvm/arch/s390/kvm/interrupt.c @@ -301,13 +301,13 @@ int kvm_cpu_has_interrupt(struct kvm_vcp } if ((!rc) atomic_read(fi-active)) { - spin_lock_bh(fi-lock); + spin_lock(fi-lock); list_for_each_entry(inti, fi-list, list) if (__interrupt_is_deliverable(vcpu, inti)) { rc = 1; break; } - spin_unlock_bh(fi-lock); + spin_unlock(fi-lock); } if ((!rc) (vcpu-arch.sie_block-ckc @@ -368,7 +368,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu hrtimer_start(vcpu-arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL); VCPU_EVENT(vcpu, 5, enabled wait via clock comparator: %llx ns, sltime); no_timer: - spin_lock_bh(vcpu-arch.local_int.float_int-lock); + spin_lock(vcpu-arch.local_int.float_int-lock); spin_lock_bh(vcpu-arch.local_int.lock); add_wait_queue(vcpu-arch.local_int.wq, wait); while (list_empty(vcpu-arch.local_int.list) @@ -377,18 +377,18 @@ no_timer: !signal_pending(current)) { set_current_state(TASK_INTERRUPTIBLE); spin_unlock_bh(vcpu-arch.local_int.lock); - spin_unlock_bh(vcpu-arch.local_int.float_int-lock); + spin_unlock(vcpu-arch.local_int.float_int-lock); vcpu_put(vcpu); schedule(); vcpu_load(vcpu); - spin_lock_bh(vcpu-arch.local_int.float_int-lock); + spin_lock(vcpu-arch.local_int.float_int-lock); spin_lock_bh(vcpu-arch.local_int.lock); } __unset_cpu_idle(vcpu); __set_current_state(TASK_RUNNING); remove_wait_queue(vcpu-wq, wait); spin_unlock_bh(vcpu-arch.local_int.lock); - spin_unlock_bh(vcpu-arch.local_int.float_int-lock); + spin_unlock(vcpu-arch.local_int.float_int-lock); 
hrtimer_try_to_cancel(vcpu-arch.ckc_timer); return 0; } @@ -455,7 +455,7 @@ void kvm_s390_deliver_pending_interrupts if (atomic_read(fi-active)) { do { deliver = 0; - spin_lock_bh(fi-lock); + spin_lock(fi-lock); list_for_each_entry_safe(inti, n, fi-list, list) { if (__interrupt_is_deliverable(vcpu, inti)) { list_del(inti-list); @@ -466,7 +466,7 @@ void kvm_s390_deliver_pending_interrupts } if (list_empty(fi-list)) atomic_set(fi-active, 0); - spin_unlock_bh(fi-lock); + spin_unlock(fi-lock); if (deliver) { __do_deliver_interrupt(vcpu, inti); kfree(inti); @@ -531,7 +531,7 @@ int kvm_s390_inject_vm(struct kvm *kvm, mutex_lock(kvm-lock); fi = kvm-arch.float_int; - spin_lock_bh(fi-lock); + spin_lock(fi-lock); list_add_tail(inti-list, fi-list); atomic_set(fi-active, 1); sigcpu = find_first_bit(fi-idle_mask, KVM_MAX_VCPUS); @@ -548,7 +548,7 @@ int kvm_s390_inject_vm(struct kvm *kvm, if (waitqueue_active(li-wq)) wake_up_interruptible(li-wq); spin_unlock_bh(li-lock); - spin_unlock_bh(fi-lock); + spin_unlock(fi-lock); mutex_unlock(kvm-lock); return 0; } Index: kvm/arch/s390/kvm/kvm-s390.c === --- kvm.orig/arch/s390/kvm/kvm-s390.c +++ kvm/arch/s390/kvm/kvm-s390.c @@ -323,11 +323,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(st spin_lock_init(vcpu-arch.local_int.lock); INIT_LIST_HEAD(vcpu-arch.local_int.list); vcpu-arch.local_int.float_int = kvm-arch.float_int; - spin_lock_bh(kvm-arch.float_int.lock); + spin_lock(kvm-arch.float_int.lock); kvm-arch.float_int.local_int[id] = vcpu-arch.local_int; init_waitqueue_head(vcpu-arch.local_int.wq); vcpu-arch.local_int.cpuflags = vcpu-arch.sie_block-cpuflags; - spin_unlock_bh(kvm-arch.float_int.lock); + spin_unlock(kvm-arch.float_int.lock); rc =
[PATCH 1/6] kvm-s390: Fix memory slot versus run
From: Carsten Otte co...@de.ibm.com This patch fixes an incorrectness in the kvm backend for s390. In case virtual cpus are being created before the corresponding memory slot is being registered, we need to update the sie control blocks for the virtual cpus. In order to do that, we use the vcpu-mutex to lock out kvm_run and friends. This way we can ensure a consistent update of the memory for the entire smp configuration. Reported-by: Mijo Safradin m...@linux.vnet.ibm.com Signed-off-by: Carsten Otte co...@de.ibm.com --- arch/s390/kvm/kvm-s390.c | 24 1 file changed, 20 insertions(+), 4 deletions(-) Index: kvm/arch/s390/kvm/kvm-s390.c === --- kvm.orig/arch/s390/kvm/kvm-s390.c +++ kvm/arch/s390/kvm/kvm-s390.c @@ -657,6 +657,8 @@ int kvm_arch_set_memory_region(struct kv struct kvm_memory_slot old, int user_alloc) { + int i; + /* A few sanity checks. We can have exactly one memory slot which has to start at guest virtual zero and which has to be located at a page boundary in userland and which has to end at a page boundary. @@ -676,13 +678,27 @@ int kvm_arch_set_memory_region(struct kv if (mem-memory_size (PAGE_SIZE - 1)) return -EINVAL; + /* lock all vcpus */ + for (i = 0; i KVM_MAX_VCPUS; ++i) { + if (kvm-vcpus[i]) + mutex_lock(kvm-vcpus[i]-mutex); + } + kvm-arch.guest_origin = mem-userspace_addr; kvm-arch.guest_memsize = mem-memory_size; - /* FIXME: we do want to interrupt running CPUs and update their memory - configuration now to avoid race conditions. But hey, changing the - memory layout while virtual CPUs are running is usually bad - programming practice. 
*/ + /* update sie control blocks, and unlock all vcpus */ + for (i = 0; i KVM_MAX_VCPUS; ++i) { + if (kvm-vcpus[i]) { + kvm-vcpus[i]-arch.sie_block-gmsor = + kvm-arch.guest_origin; + kvm-vcpus[i]-arch.sie_block-gmslm = + kvm-arch.guest_memsize + + kvm-arch.guest_origin + + VIRTIODESCSPACE - 1ul; + mutex_unlock(kvm-vcpus[i]-mutex); + } + } return 0; } -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/6] kvm-s390: use hrtimer for clock wakeup from idle
From: Christian Borntraeger borntrae...@de.ibm.com This patch reworks the s390 clock comparator wakeup to hrtimer. The clock comparator is a per-cpu value that is compared against the TOD clock. If ckc = TOD an external interrupt 1004 is triggered. Since the clock comparator and the TOD clock have a much higher resolution than jiffies we should use hrtimers to trigger the wakeup. This speeds up guest nanosleep for small values. Since hrtimers callbacks run in hard-irq context, I added a tasklet to do the actual work with enabled interrupts. Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com Signed-off-by: Carsten Otte co...@de.ibm.com --- include/asm/kvm_host.h |5 - kvm/interrupt.c| 35 +-- kvm/kvm-s390.c |7 +-- kvm/kvm-s390.h |4 +++- 4 files changed, 37 insertions(+), 14 deletions(-) Index: kvm/arch/s390/include/asm/kvm_host.h === --- kvm.orig/arch/s390/include/asm/kvm_host.h 2009-05-05 15:58:45.0 +0200 +++ kvm/arch/s390/include/asm/kvm_host.h2009-05-05 16:16:49.0 +0200 @@ -13,6 +13,8 @@ #ifndef ASM_KVM_HOST_H #define ASM_KVM_HOST_H +#include linux/hrtimer.h +#include linux/interrupt.h #include linux/kvm_host.h #include asm/debug.h #include asm/cpuid.h @@ -210,7 +212,8 @@ s390_fp_regs guest_fpregs; unsigned int guest_acrs[NUM_ACRS]; struct kvm_s390_local_interrupt local_int; - struct timer_list ckc_timer; + struct hrtimerckc_timer; + struct tasklet_struct tasklet; union { cpuid_t cpu_id; u64 stidp_data; Index: kvm/arch/s390/kvm/interrupt.c === --- kvm.orig/arch/s390/kvm/interrupt.c 2009-05-05 15:58:45.0 +0200 +++ kvm/arch/s390/kvm/interrupt.c 2009-05-05 16:18:02.0 +0200 @@ -12,6 +12,8 @@ #include asm/lowcore.h #include asm/uaccess.h +#include linux/hrtimer.h +#include linux/interrupt.h #include linux/kvm_host.h #include linux/signal.h #include kvm-s390.h @@ -361,12 +363,12 @@ return 0; } - sltime = (vcpu-arch.sie_block-ckc - now) / (0xf424ul / HZ) + 1; + sltime = (vcpu-arch.sie_block-ckc - now)/4096*1000; - vcpu-arch.ckc_timer.expires = jiffies + sltime; - 
- add_timer(vcpu-arch.ckc_timer); - VCPU_EVENT(vcpu, 5, enabled wait timer:%llx jiffies, sltime); + hrtimer_start(vcpu-arch.ckc_timer, ktime_set(0, sltime), + HRTIMER_MODE_REL); + VCPU_EVENT(vcpu, 5, enabled wait via clock comparator: %llx ns, + sltime); no_timer: spin_lock_bh(vcpu-arch.local_int.float_int-lock); spin_lock_bh(vcpu-arch.local_int.lock); @@ -389,21 +391,34 @@ remove_wait_queue(vcpu-wq, wait); spin_unlock_bh(vcpu-arch.local_int.lock); spin_unlock_bh(vcpu-arch.local_int.float_int-lock); - del_timer(vcpu-arch.ckc_timer); + hrtimer_try_to_cancel(vcpu-arch.ckc_timer); return 0; } -void kvm_s390_idle_wakeup(unsigned long data) +void kvm_s390_tasklet(unsigned long parm) { - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + struct kvm_vcpu *vcpu = (struct kvm_vcpu *) parm; - spin_lock_bh(vcpu-arch.local_int.lock); + spin_lock(vcpu-arch.local_int.lock); vcpu-arch.local_int.timer_due = 1; if (waitqueue_active(vcpu-arch.local_int.wq)) wake_up_interruptible(vcpu-arch.local_int.wq); - spin_unlock_bh(vcpu-arch.local_int.lock); + spin_unlock(vcpu-arch.local_int.lock); } +/* + * low level hrtimer wake routine. Because this runs in hardirq context + * we schedule a tasklet to do the real work. 
+ */ +enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) +{ + struct kvm_vcpu *vcpu; + + vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer); + tasklet_schedule(vcpu-arch.tasklet); + + return HRTIMER_NORESTART; +} void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) { Index: kvm/arch/s390/kvm/kvm-s390.c === --- kvm.orig/arch/s390/kvm/kvm-s390.c 2009-05-05 16:16:48.0 +0200 +++ kvm/arch/s390/kvm/kvm-s390.c2009-05-05 16:16:49.0 +0200 @@ -15,6 +15,7 @@ #include linux/compiler.h #include linux/err.h #include linux/fs.h +#include linux/hrtimer.h #include linux/init.h #include linux/kvm.h #include linux/kvm_host.h @@ -286,8 +287,10 @@ vcpu-arch.sie_block-gmsor = vcpu-kvm-arch.guest_origin; vcpu-arch.sie_block-ecb = 2; vcpu-arch.sie_block-eca = 0xC1002001U; - setup_timer(vcpu-arch.ckc_timer, kvm_s390_idle_wakeup, -(unsigned long) vcpu); + hrtimer_init(vcpu-arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); +
[PATCH 5/6] kvm-s390: Sanity check on validity intercept
From: Carsten Otte co...@de.ibm.com This patch adds a sanity check for the content of the guest prefix register content before faulting in the cpu lowcore that it refers to. The guest might end up in an endless loop where SIE complains about missing lowcore with incorrect content of the prefix register without this fix. Reported-by: Mijo Safradin m...@linux.vnet.ibm.com Signed-off-by: Carsten Otte co...@de.ibm.com --- arch/s390/kvm/intercept.c | 28 ++-- 1 file changed, 18 insertions(+), 10 deletions(-) Index: kvm/arch/s390/kvm/intercept.c === --- kvm.orig/arch/s390/kvm/intercept.c +++ kvm/arch/s390/kvm/intercept.c @@ -154,17 +154,25 @@ static int handle_stop(struct kvm_vcpu * static int handle_validity(struct kvm_vcpu *vcpu) { int viwhy = vcpu-arch.sie_block-ipb 16; + int rc; + vcpu-stat.exit_validity++; - if (viwhy == 0x37) { - fault_in_pages_writeable((char __user *) -vcpu-kvm-arch.guest_origin + -vcpu-arch.sie_block-prefix, -PAGE_SIZE); - return 0; - } - VCPU_EVENT(vcpu, 2, unhandled validity intercept code %d, - viwhy); - return -ENOTSUPP; + if ((viwhy == 0x37) (vcpu-arch.sie_block-prefix + = vcpu-kvm-arch.guest_memsize - 2*PAGE_SIZE)){ + rc = fault_in_pages_writeable((char __user *) +vcpu-kvm-arch.guest_origin + +vcpu-arch.sie_block-prefix, +2*PAGE_SIZE); + if (rc) + /* user will receive sigsegv, exit to user */ + rc = -ENOTSUPP; + } else + rc = -ENOTSUPP; + + if (rc) + VCPU_EVENT(vcpu, 2, unhandled validity intercept code %d, + viwhy); + return rc; } static int handle_instruction(struct kvm_vcpu *vcpu) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/6] kvm-s390: Unlink vcpu on destroy
From: Carsten Otte co...@de.ibm.com This patch makes sure we do unlink a vcpu's sie control block from the system control area in kvm_arch_vcpu_destroy. This prevents illegal accesses to the sie control block from other virtual cpus after free. Reported-by: Mijo Safradin m...@linux.vnet.ibm.com Signed-off-by: Carsten Otte co...@de.ibm.com --- arch/s390/kvm/kvm-s390.c |9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) Index: kvm/arch/s390/kvm/kvm-s390.c === --- kvm.orig/arch/s390/kvm/kvm-s390.c +++ kvm/arch/s390/kvm/kvm-s390.c @@ -195,6 +195,9 @@ out_nokvm: void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 3, %s, free cpu); + if (vcpu-kvm-arch.sca-cpu[vcpu-vcpu_id].sda == + (__u64) vcpu-arch.sie_block) + vcpu-kvm-arch.sca-cpu[vcpu-vcpu_id].sda = 0; free_page((unsigned long)(vcpu-arch.sie_block)); kvm_vcpu_uninit(vcpu); kfree(vcpu); @@ -307,8 +310,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(st vcpu-arch.sie_block-icpua = id; BUG_ON(!kvm-arch.sca); - BUG_ON(kvm-arch.sca-cpu[id].sda); - kvm-arch.sca-cpu[id].sda = (__u64) vcpu-arch.sie_block; + if (!kvm-arch.sca-cpu[id].sda) + kvm-arch.sca-cpu[id].sda = (__u64) vcpu-arch.sie_block; + else + BUG_ON(!kvm-vcpus[id]); /* vcpu does already exist */ vcpu-arch.sie_block-scaoh = (__u32)(((__u64)kvm-arch.sca) 32); vcpu-arch.sie_block-scaol = (__u32)(__u64)kvm-arch.sca; -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/6] kvm-s390: Verify memory in kvm run
From: Carsten Otte co...@de.ibm.com This check verifies that the guest we're trying to run in KVM_RUN has some memory assigned to it. It enters an endless exception loop if this is not the case. Reported-by: Mijo Safradin m...@linux.vnet.ibm.com Signed-off-by: Carsten Otte co...@de.ibm.com --- arch/s390/kvm/kvm-s390.c |6 ++ 1 file changed, 6 insertions(+) Index: kvm/arch/s390/kvm/kvm-s390.c === --- kvm.orig/arch/s390/kvm/kvm-s390.c +++ kvm/arch/s390/kvm/kvm-s390.c @@ -478,6 +478,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_v vcpu_load(vcpu); + /* verify, that memory has been registered */ + if (!vcpu-kvm-arch.guest_memsize) { + vcpu_put(vcpu); + return -EINVAL; + } + if (vcpu-sigset_active) sigprocmask(SIG_SETMASK, vcpu-sigset, sigsaved); -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/3] generic hypercall support
Gregory Haskins wrote: I see. I had designed it slightly different where KVM could assign any top level vector it wanted and thus that drove the guest-side interface you see here to be more generic hypercall. However, I think your proposal is perfectly fine too and it makes sense to more narrowly focus these calls as specifically dynamic...as thats the only vectors that we could technically use like this anyway. So rather than allocate a top-level vector, I will add KVM_HC_DYNAMIC to kvm_para.h, and I will change the interface to follow suit (something like s/hypercall/dynhc). Sound good? Yeah. Another couple of points: - on the host side, we'd rig this to hit an eventfd. Nothing stops us from rigging pio to hit an eventfd as well, giving us kernel handling for pio trigger points. - pio actually has an advantage over hypercalls with nested guests. Since hypercalls don't have an associated port number, the lowermost hypervisor must interpret a hypercall as going to a guest's hypervisor, and not any lower-level hypervisors. What it boils down to is that you cannot use device assignment to give a guest access to a virtio/vbus device from a lower level hypervisor. (Bah, that's totally unreadable. What I want is instead of hypervisor[eth0/virtio-server] intermediate[virtio-driver/virtio-server] guest[virtio-driver] do hypervisor[eth0/virtio-server] intermediate[assign virtio device] guest[virtio-driver] well, it's probably still unreadable) -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [KVM PATCH v4 2/2] kvm: add support for irqfd via eventfd-notification interface
Gregory Haskins wrote: KVM provides a complete virtual system environment for guests, including support for injecting interrupts modeled after the real exception/interrupt facilities present on the native platform (such as the IDT on x86). Virtual interrupts can come from a variety of sources (emulated devices, pass-through devices, etc) but all must be injected to the guest via the KVM infrastructure. This patch adds a new mechanism to inject a specific interrupt to a guest using a decoupled eventfd mechanism: Any legal signal on the irqfd (using eventfd semantics from either userspace or kernel) will translate into an injected interrupt in the guest at the next available interrupt window. +struct kvm_irqfd { + __u32 gsi; + __u32 flags; +}; + Please add some reserved space here. +int +kvm_irqfd(struct kvm *kvm, int gsi, int flags) +{ + struct _irqfd *irqfd; + struct file *file = NULL; + int fd = -1; + int ret; + + irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); + if (!irqfd) + return -ENOMEM; + + irqfd->kvm = kvm; You need to increase the refcount on struct kvm here. Otherwise evil userspace will create an irqfd, close the vm and vcpu fds, and inject an interrupt. Otherwise, looks good. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: Question about KVM and PC speaker
Hello Jan, Am 04.05.2009 23:00, schrieb Jan Kiszka: Samuel Thibault wrote: Jan Kiszka, le Mon 04 May 2009 22:29:39 +0200, a écrit : When I boot the VM from the Lenny CD, there is no audible signal tone. Hmm, I successfully tested with '-soundhw pcspk' + my patches or -no-kvm-pit. There is probably a different, unrelated issue with your setup. Remember that the BIOS support for beeps is probably still missing. Simon, you should also test beeps from an installed Linux guest. Yeah, I forgot... :) Indeed, I only tested the console beep of a fully installed Linux guest. In this surrounding, the beep works fine at my place as well. However, the beep worked without the patching of the sources when I entered -no-kvm-pit. For me as a user there is no noticeable change with the patch. Is BIOS support being worked on right now? Thanks in advance for the further help. Simon -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 1/3] add generic hypercall support
On Tue, 2009-05-05 at 09:24 -0400, Gregory Haskins wrote: *) PIO is more direct than MMIO, but it poses other problems such as: a) can have a small limited address space (x86 is 2^16) b) is a narrow-band interface (one 8, 16, 32, 64 bit word at a time) c) not available on all archs (PCI mentions ppc as problematic) and is therefore recommended to avoid. Side note: I don't know what PCI has to do with this, and "problematic" isn't the word I would use. ;) As far as I know, x86 is the only still-alive architecture that implements instructions for a separate IO space (not even ia64 does). -- Hollis Blanchard IBM Linux Technology Center -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/4] BIOS changes for configuring irq0-inti2 override
Beth Kon wrote: These patches resolve the irq0-inti2 override issue, and get the hpet working on kvm. I've found a problem with these patches. I'll resubmit shortly. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [KVM PATCH v4 2/2] kvm: add support for irqfd via eventfd-notification interface
Avi Kivity wrote: Gregory Haskins wrote: KVM provides a complete virtual system environment for guests, including support for injecting interrupts modeled after the real exception/interrupt facilities present on the native platform (such as the IDT on x86). Virtual interrupts can come from a variety of sources (emulated devices, pass-through devices, etc) but all must be injected to the guest via the KVM infrastructure. This patch adds a new mechanism to inject a specific interrupt to a guest using a decoupled eventfd mechanism: Any legal signal on the irqfd (using eventfd semantics from either userspace or kernel) will translate into an injected interrupt in the guest at the next available interrupt window. +struct kvm_irqfd { +__u32 gsi; +__u32 flags; +}; + Please add some reserved space here. Ack. Any rule of thumb here? How about a __u8 pad[16] ? +int +kvm_irqfd(struct kvm *kvm, int gsi, int flags) +{ +struct _irqfd *irqfd; +struct file *file = NULL; +int fd = -1; +int ret; + +irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); +if (!irqfd) +return -ENOMEM; + +irqfd->kvm = kvm; You need to increase the refcount on struct kvm here. Otherwise evil userspace will create an irqfd, close the vm and vcpu fds, and inject an interrupt. Good catch. Will fix. Thanks Avi, -Greg signature.asc Description: OpenPGP digital signature
Re: [KVM PATCH v4 2/2] kvm: add support for irqfd via eventfd-notification interface
Gregory Haskins wrote: +struct kvm_irqfd { +__u32 gsi; +__u32 flags; +}; + Please add some reserved space here. Ack. Any rule of thumb here? How about a __u8 pad[16] ? I'd round it up so the whole thing is 32 bytes (not that it matters). -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM x86_64 with SR-IOV..? (device passthrough with LIO-Target v3.0)
On Tue, 2009-05-05 at 04:28 -0700, Nicholas A. Bellinger wrote: On Tue, 2009-05-05 at 03:43 -0700, Nicholas A. Bellinger wrote: On Tue, 2009-05-05 at 09:42 +0800, Yu Zhao wrote: Hi, The VF also works in the host if the VF driver is programed properly. So it would be easier to develop the VF driver in the host and then verify the VF driver in the guest. BTW, I didn't see the SR-IOV is enabled in your dmesg, did you select the CONFIG_PCI_IOV in the kernel .config? Thanks, Yu Greetings Yu and Sheng, So the original attachment was for the v2.6.29-fc11 host kernel output, I ended up jumping to v2.6.30-rc3 (and making sure CONFIG_PCI_IOV was enabled) for KVM host with kvm-85 and now things are looking quite stable for me. So far I have been able to successfully push LIO-Target v3.0 traffic *inside* a v2.6.29.2 KVM guest via the onboard e1000e (02:00.0) port from another Linux/iSCSI Initiator machine using a Intel 1 Gb/sec port. I am running badblocks tests to iSCSI Logical Units for RAMDISK_DR and FILEIO storage objects (in the KVM Guest), and they are passing validation and I am seeing ~500 Mb/sec of throughput and very low CPU usage in the KVM guests. Ok I am seeing another issue with the e1000e port on 02:00.0..: As i start to push multiple badblocks tests RAMDISK_DR iSCSI Logical units into KVM Guest running LIO v2.6.29.2 from the external Linux/iSCSI Initiator machine, after about 100 GB of iSCSI traffic, I see the following exception in KVM host v2.6.30-rc3: DRHD: handling fault status reg 2 DMAR:[DMA Write] Request device [02:00.0] fault addr 7fc958b01 DMAR:[fault reason 04] Access beyond MGAW pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 60 for MSI/MSI-X pci-stub :02:00.0: irq 61 for MSI/MSI-X I am able to restart the LIO-Target KVM Guest and the Linux/iSCSI Initiators are able to reconnect.. Wow, very cool.. 
Not sure if this is a bug in the target_core_mod RAMDISK_DR subsystem plugin (mapping struct iovec to internally allocated struct page) or what. I will have to look at the DMAR code to understand what this exception means.. Greetings Yu, Sheng and Co, So I have been making progress this morning.. So far, I have hooked up a LSI mpt-function PCIe SAS adapter into the KVM guest with a Sandisk SATA SSD 32 GB drive. It is using MSI interrupts (not MSI-X) and I am able to push ~70 MB/sec from a 2nd Linux/iSCSI Initiator machine (running Open-iSCSI) with the 1500 byte MTUs on e1000e ports from within the KVM guest. The interesting thing is that I am having to use IBLOCK export (using submit_bio(), and complete emulation of SCSI control path) for SATA SSD in order to get I/O running stable. Using the pSCSI export I am getting immediate exceptions from scsi_execute_async() in the v2.6.29.2 KVM guest.. Using a 2nd SAS disk I am able to use target_core_mod/pSCSI export and push badblocks and LTP disktest traffic however.. 
Here is a bit about the the setup looks, *) Linux/iSCSI Initiator node accessing KVM Guest LIO-Target v3.0 storage: subjekt:~# lsscsi [6:0:0:0]diskATA ST3250820AS 3.AA /dev/sda [10:0:0:0] cd/dvd PIONEER DVD-ROM DVD-305 1.06 /dev/scd1 [18:0:0:0] cd/dvd TOSHIBA DVD/HD X807616 MC08 /dev/scd2 [32:0:0:0] diskLIO-ORG RAMDISK-DR 3.0 /dev/sdb [32:0:0:1] diskLIO-ORG RAMDISK-DR 3.0 /dev/sdc [32:0:0:2] diskLIO-ORG FILEIO 3.0 /dev/sdd [32:0:0:3] diskLIO-ORG IBLOCK 3.0 /dev/sde subjekt:~# sg_inq -i /dev/sde VPD INQUIRY: Device Identification page Designation descriptor number 1, descriptor length: 20 id_type: NAA, code_set: Binary associated with the addressed logical unit NAA 6, IEEE Company_id: 0x1405 Vendor Specific Identifier: 0xa97e4ce21 Vendor Specific Identifier Extension: 0xc0711de829b000c2 [0x6001405a97e4ce21c0711de829b000c2] Designation descriptor number 2, descriptor length: 52 id_type: T10 vendor identification, code_set: ASCII associated with the addressed logical unit vendor id: LIO-ORG vendor specific: IBLOCK:a97e4ce21c0711de829b000c2943d57b Designation descriptor number 3, descriptor length: 8 transport: Internet SCSI (iSCSI) id_type: Relative target port, code_set: Binary associated with the target port Relative target port: 0x1 Designation descriptor number 4, descriptor length: 8 transport: Internet SCSI (iSCSI) id_type: Target port group, code_set: Binary associated with the target port Target port group: 0x0 Designation descriptor number 5, descriptor length: 8 id_type: Logical unit group, code_set: Binary associated with the addressed logical unit Logical unit group: 0x0 Designation descriptor number 6, descriptor length: 80 transport: Internet SCSI (iSCSI) id_type: SCSI name string, code_set: UTF-8
Re: [KVM PATCH v4 2/2] kvm: add support for irqfd via eventfd-notification interface
Avi Kivity wrote: Gregory Haskins wrote: +int +kvm_irqfd(struct kvm *kvm, int gsi, int flags) +{ +struct _irqfd *irqfd; +struct file *file = NULL; +int fd = -1; +int ret; + +irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); +if (!irqfd) +return -ENOMEM; + +irqfd-kvm = kvm; You need to increase the refcount on struct kvm here. Otherwise evil userspace will create an irqfd, close the vm and vcpu fds, and inject an interrupt. I just reviewed the code in prep for v5, and now I remember why I didnt take a reference: I designed it the opposite direction: the vm-fd owns a reference to the irqfd, and will decouple the kvm context from the eventfd on shutdown (see kvm_irqfd_release()). I still need to spin a v5 regardless in order to add the padding as previously discussed. But let me know if you still see any holes in light of this alternate object lifetime approach I am using. -Greg signature.asc Description: OpenPGP digital signature
Re: [KVM PATCH v4 2/2] kvm: add support for irqfd via eventfd-notification interface
Gregory Haskins wrote: Avi Kivity wrote: Gregory Haskins wrote: +int +kvm_irqfd(struct kvm *kvm, int gsi, int flags) +{ +struct _irqfd *irqfd; +struct file *file = NULL; +int fd = -1; +int ret; + +irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); +if (!irqfd) +return -ENOMEM; + +irqfd-kvm = kvm; You need to increase the refcount on struct kvm here. Otherwise evil userspace will create an irqfd, close the vm and vcpu fds, and inject an interrupt. I just reviewed the code in prep for v5, and now I remember why I didnt take a reference: I designed it the opposite direction: the vm-fd owns a reference to the irqfd, and will decouple the kvm context from the eventfd on shutdown (see kvm_irqfd_release()). I still need to spin a v5 regardless in order to add the padding as previously discussed. But let me know if you still see any holes in light of this alternate object lifetime approach I am using. Right, irqfd_release works. But I think refcounting is simpler, since we already kvm_get_kvm() and kvm_put_kvm(), and you wouldn't need the irqfd list. On the other hand, I'm not sure you get a callback from eventfd on close(), so refcounting may not be implementable. Drat, irqfd_release doesn't work. You reference kvm-lock in irqfd_inject without taking any locks. btw, there's still your original idea of creating the eventfd in userspace and passing it down. That would be workable if we can see a way to both signal the eventfd and get called back in irq context. Maybe that's preferable to what we're doing here, but we need to see how it would work. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [KVM PATCH v4 2/2] kvm: add support for irqfd via eventfd-notification interface
Avi Kivity wrote: Gregory Haskins wrote: Avi Kivity wrote: Gregory Haskins wrote: +int +kvm_irqfd(struct kvm *kvm, int gsi, int flags) +{ +struct _irqfd *irqfd; +struct file *file = NULL; +int fd = -1; +int ret; + +irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); +if (!irqfd) +return -ENOMEM; + +irqfd-kvm = kvm; You need to increase the refcount on struct kvm here. Otherwise evil userspace will create an irqfd, close the vm and vcpu fds, and inject an interrupt. I just reviewed the code in prep for v5, and now I remember why I didnt take a reference: I designed it the opposite direction: the vm-fd owns a reference to the irqfd, and will decouple the kvm context from the eventfd on shutdown (see kvm_irqfd_release()). I still need to spin a v5 regardless in order to add the padding as previously discussed. But let me know if you still see any holes in light of this alternate object lifetime approach I am using. Right, irqfd_release works. But I think refcounting is simpler, since we already kvm_get_kvm() and kvm_put_kvm(), and you wouldn't need the irqfd list. On the other hand, I'm not sure you get a callback from eventfd on close(), so refcounting may not be implementable. ;) Drat, irqfd_release doesn't work. You reference kvm-lock in irqfd_inject without taking any locks. I *think* this is ok, tho. I remove myself from the waitq, and then flush any potentially scheduled deferred work before returning. This all happens synchronously to the vm_release() code when the vm-fd is bring dropped, but before we actually release the struct kvm*. Therefore, I think kvm-lock is guaranteed to remain valid for the duration of the irqfd_release(), and we guarantee it wont be accessed after the irqfd_release() completes. Or do you have a different concern? On this topic of proper ref counts, though I wonder if I need an extra fget() in there. 
I presume that the evenfd_file_create() returns with only a single reference, which presumably I am handing one to userspace, and one to the irqfd which is broken. Or does fd_install() bump that for me (doesnt look like it)? Al, Davide, any comments? btw, there's still your original idea of creating the eventfd in userspace and passing it down. That would be workable if we can see a way to both signal the eventfd and get called back in irq context. Maybe that's preferable to what we're doing here, but we need to see how it would work. We can do that, but I don't see it as changing the general problem here. However, I think if you find that the above comments about the kvm-lock w.r.t. irqfd_release() are ok, we don't need to worry about it. If you prefer the userspace allocation of eventfd() for other reasons, we can easily go back to that model as well...but its not strictly necessary for this particular issue iiuc. -Greg signature.asc Description: OpenPGP digital signature
[PATCH] deal with interrupt shadow state for emulated instruction
we currently unblock shadow interrupt state when we skip an instruction, but failing to do so when we actually emulate one. This blocks interrupts in key instruction blocks, in particular sti; hlt; sequences If the instruction emulated is an sti, we have to block shadow interrupts. The same goes for mov ss. pop ss also needs it, but we don't currently emulate it. Without this patch, I cannot boot gpxe option roms at vmx machines. This is described at https://bugzilla.redhat.com/show_bug.cgi?id=494469 Signed-off-by: Glauber Costa glom...@redhat.com CC: H. Peter Anvin h...@zytor.com CC: Avi Kivity a...@redhat.com --- arch/x86/include/asm/kvm_host.h|2 + arch/x86/include/asm/kvm_x86_emulate.h |6 arch/x86/kvm/svm.c | 25 +++- arch/x86/kvm/vmx.c | 49 ++-- arch/x86/kvm/x86.c |7 - arch/x86/kvm/x86_emulate.c | 21 +- 6 files changed, 98 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8e680c3..a49d07b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -510,6 +510,8 @@ struct kvm_x86_ops { void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); + void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); + u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); void (*set_irq)(struct kvm_vcpu *vcpu, int vec); diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h index 6a15973..b7ed2c4 100644 --- a/arch/x86/include/asm/kvm_x86_emulate.h +++ b/arch/x86/include/asm/kvm_x86_emulate.h @@ -143,6 +143,9 @@ struct decode_cache { struct fetch_cache fetch; }; +#define X86_SHADOW_INT_MOV_SS 1 +#define X86_SHADOW_INT_STI 2 + struct x86_emulate_ctxt { /* Register state before/after emulation. 
*/ struct kvm_vcpu *vcpu; @@ -152,6 +155,9 @@ struct x86_emulate_ctxt { int mode; u32 cs_base; + /* interruptibility state, as a result of execution of STI or MOV SS */ + int interruptibility; + /* decode cache */ struct decode_cache decode; }; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ef43a18..4941dea 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -202,6 +202,27 @@ static int is_external_interrupt(u32 info) return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); } +static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u32 ret = 0; + + if (svm-vmcb-control.int_state SVM_INTERRUPT_SHADOW_MASK) + ret |= (X86_SHADOW_INT_STI X86_SHADOW_INT_MOV_SS); + return ret; +} + +static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (mask == 0) + svm-vmcb-control.int_state = ~SVM_INTERRUPT_SHADOW_MASK; + else + svm-vmcb-control.int_state |= SVM_INTERRUPT_SHADOW_MASK; + +} + static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -215,7 +236,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) __func__, kvm_rip_read(vcpu), svm-next_rip); kvm_rip_write(vcpu, svm-next_rip); - svm-vmcb-control.int_state = ~SVM_INTERRUPT_SHADOW_MASK; + svm_set_interrupt_shadow(vcpu, 0); } static int has_svm(void) @@ -2637,6 +2658,8 @@ static struct kvm_x86_ops svm_x86_ops = { .run = svm_vcpu_run, .handle_exit = handle_exit, .skip_emulated_instruction = skip_emulated_instruction, + .set_interrupt_shadow= svm_set_interrupt_shadow, + .get_interrupt_shadow = svm_get_interrupt_shadow, .patch_hypercall = svm_patch_hypercall, .set_irq = svm_set_irq, .set_nmi = svm_inject_nmi, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e8a5649..bbfe894 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -736,23 +736,52 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 
vmcs_writel(GUEST_RFLAGS, rflags); } +static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) +{ + u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + int ret = 0; + + if (interruptibility GUEST_INTR_STATE_STI) + ret |= X86_SHADOW_INT_STI; + if (interruptibility GUEST_INTR_STATE_MOV_SS) + ret |= X86_SHADOW_INT_MOV_SS; + + return ret; +} + +static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + u32
RE: [PATCH] reserved-ram for pci-passthrough without VT-d capable hardware
Andrea, Thanks for your answers. I already patched the kernel and kvm (including rombios). The host boots up and the memory mapping is as explained in the patch. Now I am trying to launch a vm using memory mapping but it hangs after opening the sdl windows and before showing the bios messages. I am running qemu command from a console in the host that is running X and the command line is the following: Qemu-system-x86_64 -hda ./dm.img -cdrom /dev/sr0 -m 32 -reserved-ram -boot d - Is this command line correct? - Should I run the vm without having started the X in the host machine? - What should I see after starting the vm? Should the vm take ownership of the video card? Thanks, Pablo -Original Message- From: Andrea Arcangeli [mailto:aarca...@redhat.com] Sent: Tuesday, April 28, 2009 3:06 PM To: Passera, Pablo R Cc: kvm@vger.kernel.org Subject: Re: [PATCH] reserved-ram for pci-passthrough without VT-d capable hardware On Tue, Apr 28, 2009 at 07:35:26AM -0600, Passera, Pablo R wrote: - Against which kernel version was this patch generated? I don't remember exactly (I was just using an upstream hg checkout and I didn't record its hash value) but I think you can go back to when e820.c was still shared and it'll likely apply and work. - Did you try this on a 32 or 64 bits system? I only tested it on 64bit but there's no reason why it shouldn't work on 32bit too. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/4] BIOS changes for configuring irq0-inti2 override
Beth Kon wrote: @@ -477,6 +480,7 @@ void wrmsr_smp(uint32_t index, uint64_t val) #define QEMU_CFG_SIGNATURE 0x00 #define QEMU_CFG_ID 0x01 #define QEMU_CFG_UUID 0x02 +#define QEMU_CFG_IRQ0_OVERRIDE 0x0e Small thing to consider before you resubmit: In his patch read-additional-acpi-tables-from-a-vm.patch Gleb introduced: #define QEMU_CFG_ARCH_LOCAL 0x8000 #define QEMU_CFG_ACPI_TABLES (QEMU_CFG_ARCH_LOCAL + 0) I think the idea behind this was to separate the generic part from arch specific. The IRQ0 override seems to be arch specific (x86 only?) just like the ACPI tables, right? - Sebastian -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: Question about KVM and PC speaker
Simon Bienlein wrote: Is a support for BIOS worked on right now? The vgabios (vgabios.c) has a FIXME should beep. Volker, do you plan to fix this? Which frequency should be used for the beep? Which delay? Getting a delay using inb(0x61) & 0x10 is still a no go on qemu, right? - Sebastian -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/3] generic hypercall support
* Gregory Haskins (gregory.hask...@gmail.com) wrote: So you would never have someone making a generic hypercall(KVM_HC_MMU_OP). I agree. Which is why I think the interface proposal you've made is wrong. There are already hypercall interfaces w/ specific ABI and semantic meaning (which are typically called directly/indirectly from an existing pv op hook). But a free-form hypercall(unsigned long nr, unsigned long *args, size_t count) means hypercall number and arg list must be the same in order for code to call hypercall() in a hypervisor agnostic way. The pv_ops level needs to have semantic meaning, not a free form hypercall multiplexor. thanks, -chris -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: qemu/hw/device-assignment: questions about msix_table_page
On Tuesday 05 May 2009 20:46:04 Michael S. Tsirkin wrote: On Tue, May 05, 2009 at 07:49:10AM -0300, Marcelo Tosatti wrote: On Tue, May 05, 2009 at 01:34:50PM +0300, Michael S. Tsirkin wrote: On Tue, May 05, 2009 at 07:19:45AM -0300, Marcelo Tosatti wrote: On Tue, May 05, 2009 at 12:51:36PM +0300, Michael S. Tsirkin wrote: On Mon, Apr 27, 2009 at 10:30:17PM +0800, Sheng Yang wrote: If guest can write to the real device MSI-X table directly, it would cause chaos on interrupt delivery, for what guest see is totally different with what's host see... Obviously. Thanks, What's the reason that this page is unmapped from the qemu memory space? Specifically what do these lines do: int offset = r_dev-msix_table_addr - real_region-base_addr; ret = munmap(region-u.r_virtbase + offset, TARGET_PAGE_SIZE); I believe this allows accesses to this page (the MSI-X table), which is part of the guest address space (through kvm memory slots), to be trapped by qemu. Since there is no actual page in this guest address, KVM treats accesses as MMIO and forwards them to QEMU. I thought about this too. But why is this necessary for assigned MSI-X but not for emulated devices such as e.g. e1000? All e1000 does seems to be cpu_register_physical_memory ... Because there is no registered (kvm) memory slot for the range which e1000 registers its MMIO? Not sure about the address of the MSI-X table page, but you could achieve the same effect by splitting the slot which it lives in two, with a 1 page hole between them. You could also move the emulated MSI-X table, sticking it on top of the existing BAR. Since PCI config includes the pointer to the table, a driver that reads this pointer will continue to work. One BAR can contain more than a MSI-X table... The PCI spec only said the other information should be page aligned and can't in the same page of MSI-X table(except PBA). I think this method make thing more complicate, we don't want to and can't trap other informations in the same BAR... 
Of course, there's no guarantee that guest drivers don't just hard-code this offset. I think this mostly won't happen. BTW this is why you can't map the MSI-X table page directly, you want accesses to be trapped. BTW current design won't work if the base page size is 4K, will it? The hole covers a page, so you'll get faults outside the MSI-X table. Yes. One entry for MSI-X is 16 bytes, one page can contain 256 entries. Well, I haven't seen a device get more than 100 entries, but for this limitation, maybe we should limit MSI-X max entries to 256 (rather than 512 entries now) temporarily... -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: PCI device assignment over KVM
On Wednesday 06 May 2009 08:42:37 Tong Liu wrote: Hi Sheng, I have installed latest KVM-85 release and the Failed to assign irq error goes away. However my device kernel driver on guest OS (RHEL5u2) can't query my card successfully. Our developer said there is memory mapping error during KVM device assignment. Could your developer elaborate it? I am providing all the steps I have done and captures the output as pictures here. Can you help take a look and see if really my device is not supported by KVM yet? Every other things looks fine. I think the key point should be that memory mapping error. -- regards Yang, Sheng Step 1: Before device assignment, /proc/interrupts has my device which is not sharing IRQ with others due to MSI-x enabled Picture 1.jpg. Even though dmesg shows it is using IRQ 18 during boot but /proc shows IRQ 58, etc, I guess it is changed to that number by MSI-X after boot. Step 2: Unbinding my device :01:00.0 from host and start kvm guest Picture 2.jpg, some phy mem error reported but I am assuming it is not critical At this moment, /proc/interrupts on local machine is changed. My device is disppeared and another entry is created: kvm_assigned_intx_device as picture 3.jpg shown And dmesg didn't show obvious errors too, it shows pci device assigned. Picture 4.jpg Log into guest OS(RHEL5u2), run dmesg, it shows errors about my device which means it is not passed successfully. Picture 5.jpg I tried other device :06:00.1 which is OK. Regards Tong -Original Message- From: Sheng Yang [mailto:sh...@linux.intel.com] Sent: Monday, May 04, 2009 9:35 PM To: Tong Liu Subject: Re: PCI device assignment over KVM On Tuesday 05 May 2009 12:06:09 Tong Liu wrote: BTW, as you mentioned, after boot up, 06:00.0 got IRQ 56 because it's MSI enabled. And 01:00.0 is using IRQ18, actually they are not sharing IRQ with any others. 
(Even though 06:00.0 was using IRQ 18 with 01:00.0 during boot but it is converted to 56 after boot up, so nothing shared) Why is KVM still trying to enable 01:00.0 INTx (dmesg error I put in the first email) if 01:00.0 is not sharing IRQ with any other after boot? You can use cat /proc/interrupts to know if there are other IRQ handler for IRQ 18. Also lspci -v. If you are sure that there is no sharing interrupt for IRQ 18, you may need to look into INTx enabling part of KVM (virt/kvm/kvm_main.c:kvm_vm_ioctl_assign_irq()) to know what's happening exactly. And please try latest KVM and qemu-kvm as well(or kvm-85 release). The your dmesg show that the version of your KVM is old - no thing like failed to enable INTx device! in current code now. We rework the framework two monthes ago. -- regards Yang, Sheng Thanks Tong -Original Message- From: Sheng Yang [mailto:sh...@linux.intel.com] Sent: Monday, May 04, 2009 6:26 PM To: Tong Liu Subject: Re: PCI device assignment over KVM On Tuesday 05 May 2009 05:32:32 Tong Liu wrote: Hi Sheng, My system has VT-d support and I want to assign one PCI-E card to guest OS. I have an issue with PCI device assignment over KVM. I am using latest kernerl 2.6.30-rc4. Here are the steps I have done: 1. Unbind PCI device 01:00.0 from host using steps documented on KVM webiste. http://www.linux-kvm.org/page/How_to_assign_devices_with_VT-d_in_KVM 2. Then run the following command: qemu-system-x86_64 -m 4096 -boot c -net none -hda vdisk.img -pcidevice host=01:00.0 It reports errors: Assign_irq: deassign: Invalid argument Failed to assign irq for 01:00.0: Input/output error Perhaps you are assigning a device that shares an IRQ with another device? Failed to deassign device 01:00.0 : Invalid argument In dmesg it shows the following error: pci-stub :01:00.0: PCI INT A - GSI 18 (level, low) - IRQ 18 assign device: host bdf = 1:0:0 kvm: failed to enable INTx device! 
pci-stub :01:00.0: PCI INT A disabled kvm_vm_ioctl_deassign_device: device hasn't been assigned before, so cannot be deassigned Info for my device :01:00.0: lspci -t: -[:00]-+-00.0 +-05.0-[:01]00.0 +-09.0-[:02-07]--+-00.0-[:03-06]--+-00.0-[:04]-- +-01.0-[:05]-- \-02.0-[:06]--+-00.0 \-00.1 |\-00.3-[:07]-- lspci -v: 01:00.0 InfiniBand: Mellanox Technologies MT26428 [ConnectX IB QDR, PCIe 2.0 5GT/s] (rev a0) Subsystem: Mellanox Technologies Unknown device 0005 Flags: fast devsel, IRQ 18 Memory at 9930 (64-bit, non-prefetchable) [size=1M] Memory at 9800 (64-bit, prefetchable) [size=8M] Capabilities: [40] Power Management version 3 Capabilities: [48] Vital Product Data Capabilities: [9c] MSI-X: Enable+ Mask- TabSize=256 Capabilities: [60] Express
Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?
On Tuesday 05 May 2009 20:08:40 Avi Kivity wrote: Michael S. Tsirkin wrote: On Tue, May 05, 2009 at 02:57:10PM +0300, Avi Kivity wrote: Michael S. Tsirkin wrote: On Tue, May 05, 2009 at 08:04:15AM -0300, Marcelo Tosatti wrote: On Tue, May 05, 2009 at 01:30:28PM +0300, Michael S. Tsirkin wrote: The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have been merged for 2.6.30. However, I note that PCI spec allows devices to support multiple vectors with MSI as well (support will be in linux 2.6.30). Well, one question: when did them merged? IIRC, MSI-X related things are still pending for 2.6.31... :) Even though qemu for now only uses a single vector with MSI, it would seem that it's better to make the kernel/user interface generic straight away rather than add more ioctls later. What do you think? It might not be too late to fix this for 2.6.30. Can't you use more than one KVM_ASSIGN_SET_MSIX_ENTRY call per assigned device? Sure, but only one KVM_ASSIGN_SET_MSIX_NR. MSIX_NR is the size of the table, while MSIX_ENTRY updates a single entry, if I read the code correctly. Right. So we'll need something like this for MSI as well. Actually maybe MSIX_NR MSIX_ENTRY should be renamed to MSI_NR / MSI_ENTRY and changed to do the right thing depending on the IRQ type? Works for me. Sheng, is there a reason why it wasn't done like this? No, I think it's fine. Also some related structure should be modified. And one flag field should be add to kvm_assigned_msix_nr and kvm_assigned_msix_entry(using padding ones) to indicate the interrupt type, for we can't determined the irq type by device's status at that time. -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM x86_64 with SR-IOV..? (device passthrough with LIO-Target v3.0)
On Tuesday 05 May 2009 18:43:46 Nicholas A. Bellinger wrote: On Tue, 2009-05-05 at 09:42 +0800, Yu Zhao wrote: Hi, The VF also works in the host if the VF driver is programed properly. So it would be easier to develop the VF driver in the host and then verify the VF driver in the guest. BTW, I didn't see the SR-IOV is enabled in your dmesg, did you select the CONFIG_PCI_IOV in the kernel .config? Thanks, Yu Greetings Yu and Sheng, So the original attachment was for the v2.6.29-fc11 host kernel output, I ended up jumping to v2.6.30-rc3 (and making sure CONFIG_PCI_IOV was enabled) for KVM host with kvm-85 and now things are looking quite stable for me. So far I have been able to successfully push LIO-Target v3.0 traffic *inside* a v2.6.29.2 KVM guest via the onboard e1000e (02:00.0) port from another Linux/iSCSI Initiator machine using a Intel 1 Gb/sec port. I am running badblocks tests to iSCSI Logical Units for RAMDISK_DR and FILEIO storage objects (in the KVM Guest), and they are passing validation and I am seeing ~500 Mb/sec of throughput and very low CPU usage in the KVM guests. 
One issue I did notice while using the pci-stub method of device-assignment with same e1000 port (02:00.0) was while using an iSCSI Initiator (Open-iSCSI) on the KVM Host machine and doing sustained traffic into the LIO-Target KVM Guest on the same local KVM host to max out traffic between the other onboard e1000e port (03.00.0), I see the following: pci-stub :02:00.0: PCI INT A - GSI 17 (level, low) - IRQ 17 assign device: host bdf = 2:0:0 pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 60 for MSI/MSI-X pci-stub :02:00.0: irq 61 for MSI/MSI-X scsi4 : iSCSI Initiator over TCP/IP scsi 4:0:0:0: Direct-Access LIO-ORG RAMDISK-DR 3.0 PQ: 0 ANSI: 5 sd 4:0:0:0: Attached scsi generic sg1 type 0 scsi 4:0:0:1: Direct-Access LIO-ORG RAMDISK-DR 3.0 PQ: 0 ANSI: 5 sd 4:0:0:1: Attached scsi generic sg2 type 0 sd 4:0:0:0: [sdb] 262144 512-byte hardware sectors: (134 MB/128 MiB) sd 4:0:0:1: [sdc] 262144 512-byte hardware sectors: (134 MB/128 MiB) sd 4:0:0:0: [sdb] Write Protect is off sd 4:0:0:0: [sdb] Mode Sense: 2f 00 00 00 sd 4:0:0:1: [sdc] Write Protect is off sd 4:0:0:1: [sdc] Mode Sense: 2f 00 00 00 sd 4:0:0:0: [sdb] Write cache: disabled, read cache: enabled, doesn't support DPO or FUA sd 4:0:0:1: [sdc] Write cache: disabled, read cache: enabled, doesn't support DPO or FUA sdb:6 sdc: unknown partition table sd 4:0:0:0: [sdb] Attached SCSI disk unknown partition table sd 4:0:0:1: [sdc] Attached SCSI disk [ cut here ] WARNING: at kernel/irq/manage.c:260 enable_irq+0x36/0x50() Hardware name: empty Unbalanced enable for IRQ 59 Modules linked in: ipt_REJECT xt_tcpudp bridge stp sunrpc iptable_filter ip_tables xt_state nf_conntrack ip6table_filter ip6_tables x_tables ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr ipv6 iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi cpufreq_ondemand acpi_cpufreq 
freq_table ext3 jbd loop dm_multipath scsi_dh kvm_intel kvm uinput i2c_i801 firewire_ohci joydev firewire_core sg i2c_core 8250_pnp crc_itu_t e1000e 8250 serial_core rtc_cmos pcspkr serio_raw rtc_core rtc_lib button sd_mod dm_snapshot dm_zero dm_mirror dm_region_hash dm_log dm_mod uhci_hcd ohci_hcd ehci_hcd ata_piix libata scsi_mod [last unloaded: microcode] Pid: 51, comm: events/0 Tainted: GW 2.6.30-rc3 #11 Call Trace: [80235fee] ? warn_slowpath+0xcb/0xe8 [80253a7c] ? generic_exec_single+0x6a/0x88 [8022acec] ? update_curr+0x67/0xeb [a0198748] ? vcpu_kick_intr+0x0/0x1 [kvm] [8020a5d8] ? __switch_to+0xb6/0x274 [8022b70a] ? __dequeue_entity+0x1b/0x2f [a01ac7e4] ? kvm_irq_delivery_to_apic+0xb3/0xf7 [kvm] [a01aa4d4] ? __apic_accept_irq+0x15a/0x173 [kvm] [a01ac883] ? kvm_set_msi+0x5b/0x60 [kvm] [80266d97] ? enable_irq+0x36/0x50 [a0195ab5] ? kvm_assigned_dev_interrupt_work_handler+0x6d/0xbc [kvm] [802449fa] ? worker_thread+0x182/0x223 [8024820b] ? autoremove_wake_function+0x0/0x2a [80244878] ? worker_thread+0x0/0x223 [80244878] ? worker_thread+0x0/0x223 [80247e72] ? kthread+0x54/0x7e [8020cb0a] ? child_rip+0xa/0x20 [804d0af5] ? _spin_lock+0x5/0x8 [80247e1e] ? kthread+0x0/0x7e [8020cb00] ? child_rip+0x0/0x20 ---[ end trace 3fbc2dd20bf89ef1 ]--- connection1:0: ping timeout of 5 secs expired, last rx 4295286327, last ping 4295285518, now 4295286768 connection1:0: detected conn error (1011) Attached are the v2.6.30-rc3 KVM host and v2.6.29.2 KVM guest dmesg output. When the 'Unbalanced enable for IRQ
Re: KVM x86_64 with SR-IOV..? (device passthrough with LIO-Target v3.0)
On Tuesday 05 May 2009 19:28:15 Nicholas A. Bellinger wrote: On Tue, 2009-05-05 at 03:43 -0700, Nicholas A. Bellinger wrote: On Tue, 2009-05-05 at 09:42 +0800, Yu Zhao wrote: Hi, The VF also works in the host if the VF driver is programed properly. So it would be easier to develop the VF driver in the host and then verify the VF driver in the guest. BTW, I didn't see the SR-IOV is enabled in your dmesg, did you select the CONFIG_PCI_IOV in the kernel .config? Thanks, Yu Greetings Yu and Sheng, So the original attachment was for the v2.6.29-fc11 host kernel output, I ended up jumping to v2.6.30-rc3 (and making sure CONFIG_PCI_IOV was enabled) for KVM host with kvm-85 and now things are looking quite stable for me. So far I have been able to successfully push LIO-Target v3.0 traffic *inside* a v2.6.29.2 KVM guest via the onboard e1000e (02:00.0) port from another Linux/iSCSI Initiator machine using a Intel 1 Gb/sec port. I am running badblocks tests to iSCSI Logical Units for RAMDISK_DR and FILEIO storage objects (in the KVM Guest), and they are passing validation and I am seeing ~500 Mb/sec of throughput and very low CPU usage in the KVM guests. Ok I am seeing another issue with the e1000e port on 02:00.0..: As i start to push multiple badblocks tests RAMDISK_DR iSCSI Logical units into KVM Guest running LIO v2.6.29.2 from the external Linux/iSCSI Initiator machine, after about 100 GB of iSCSI traffic, I see the following exception in KVM host v2.6.30-rc3: DRHD: handling fault status reg 2 DMAR:[DMA Write] Request device [02:00.0] fault addr 7fc958b01 DMAR:[fault reason 04] Access beyond MGAW This means the fault address is too big It's got 51 bits width which is far beyond the physical address limit of current IA32e(48 bits). Don't know how you can get this... 
-- regards Yang, Sheng pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 60 for MSI/MSI-X pci-stub :02:00.0: irq 61 for MSI/MSI-X I am able to restart the LIO-Target KVM Guest and the Linux/iSCSI Initiators are able to reconnect.. Wow, very cool.. Not sure if this is a bug in the target_core_mod RAMDISK_DR subsystem plugin (mapping struct iovec to internally allocated struct page) or what. I will have to look at the DMAR code to understand what this exception means.. --nab One issue I did notice while using the pci-stub method of device-assignment with same e1000 port (02:00.0) was while using an iSCSI Initiator (Open-iSCSI) on the KVM Host machine and doing sustained traffic into the LIO-Target KVM Guest on the same local KVM host to max out traffic between the other onboard e1000e port (03.00.0), I see the following: pci-stub :02:00.0: PCI INT A - GSI 17 (level, low) - IRQ 17 assign device: host bdf = 2:0:0 pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 60 for MSI/MSI-X pci-stub :02:00.0: irq 61 for MSI/MSI-X scsi4 : iSCSI Initiator over TCP/IP scsi 4:0:0:0: Direct-Access LIO-ORG RAMDISK-DR 3.0 PQ: 0 ANSI: 5 sd 4:0:0:0: Attached scsi generic sg1 type 0 scsi 4:0:0:1: Direct-Access LIO-ORG RAMDISK-DR 3.0 PQ: 0 ANSI: 5 sd 4:0:0:1: Attached scsi generic sg2 type 0 sd 4:0:0:0: [sdb] 262144 512-byte hardware sectors: (134 MB/128 MiB) sd 4:0:0:1: [sdc] 262144 512-byte hardware sectors: (134 MB/128 MiB) sd 4:0:0:0: [sdb] Write Protect is off sd 4:0:0:0: [sdb] Mode Sense: 2f 00 00 00 sd 4:0:0:1: [sdc] Write Protect is off sd 4:0:0:1: [sdc] Mode Sense: 2f 00 00 00 sd 4:0:0:0: [sdb] Write cache: disabled, read cache: enabled, doesn't support DPO or FUA sd 4:0:0:1: [sdc] Write cache: disabled, read cache: enabled, doesn't support DPO or FUA sdb:6 sdc: unknown partition table sd 4:0:0:0: 
[sdb] Attached SCSI disk unknown partition table sd 4:0:0:1: [sdc] Attached SCSI disk [ cut here ] WARNING: at kernel/irq/manage.c:260 enable_irq+0x36/0x50() Hardware name: empty Unbalanced enable for IRQ 59 Modules linked in: ipt_REJECT xt_tcpudp bridge stp sunrpc iptable_filter ip_tables xt_state nf_conntrack ip6table_filter ip6_tables x_tables ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr ipv6 iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi cpufreq_ondemand acpi_cpufreq freq_table ext3 jbd loop dm_multipath scsi_dh kvm_intel kvm uinput i2c_i801 firewire_ohci joydev firewire_core sg i2c_core 8250_pnp crc_itu_t e1000e 8250 serial_core rtc_cmos pcspkr serio_raw rtc_core rtc_lib button sd_mod dm_snapshot dm_zero dm_mirror dm_region_hash dm_log dm_mod uhci_hcd ohci_hcd ehci_hcd ata_piix libata scsi_mod [last unloaded: microcode] Pid: 51, comm: events/0 Tainted: GW 2.6.30-rc3 #11
Re: KVM x86_64 with SR-IOV..? (device passthrough with LIO-Target v3.0)
On Wednesday 06 May 2009 01:45:47 Nicholas A. Bellinger wrote: On Tue, 2009-05-05 at 04:28 -0700, Nicholas A. Bellinger wrote: On Tue, 2009-05-05 at 03:43 -0700, Nicholas A. Bellinger wrote: On Tue, 2009-05-05 at 09:42 +0800, Yu Zhao wrote: Hi, The VF also works in the host if the VF driver is programed properly. So it would be easier to develop the VF driver in the host and then verify the VF driver in the guest. BTW, I didn't see the SR-IOV is enabled in your dmesg, did you select the CONFIG_PCI_IOV in the kernel .config? Thanks, Yu Greetings Yu and Sheng, So the original attachment was for the v2.6.29-fc11 host kernel output, I ended up jumping to v2.6.30-rc3 (and making sure CONFIG_PCI_IOV was enabled) for KVM host with kvm-85 and now things are looking quite stable for me. So far I have been able to successfully push LIO-Target v3.0 traffic *inside* a v2.6.29.2 KVM guest via the onboard e1000e (02:00.0) port from another Linux/iSCSI Initiator machine using a Intel 1 Gb/sec port. I am running badblocks tests to iSCSI Logical Units for RAMDISK_DR and FILEIO storage objects (in the KVM Guest), and they are passing validation and I am seeing ~500 Mb/sec of throughput and very low CPU usage in the KVM guests. Ok I am seeing another issue with the e1000e port on 02:00.0..: As i start to push multiple badblocks tests RAMDISK_DR iSCSI Logical units into KVM Guest running LIO v2.6.29.2 from the external Linux/iSCSI Initiator machine, after about 100 GB of iSCSI traffic, I see the following exception in KVM host v2.6.30-rc3: DRHD: handling fault status reg 2 DMAR:[DMA Write] Request device [02:00.0] fault addr 7fc958b01 DMAR:[fault reason 04] Access beyond MGAW pci-stub :02:00.0: irq 59 for MSI/MSI-X pci-stub :02:00.0: irq 60 for MSI/MSI-X pci-stub :02:00.0: irq 61 for MSI/MSI-X I am able to restart the LIO-Target KVM Guest and the Linux/iSCSI Initiators are able to reconnect.. Wow, very cool.. 
Not sure if this is a bug in the target_core_mod RAMDISK_DR subsystem plugin (mapping struct iovec to internally allocated struct page) or what. I will have to look at the DMAR code to understand what this exception means.. Greetings Yu, Sheng and Co, So I have been making progress this morning.. So far, I have hooked up a LSI mpt-function PCIe SAS adapter into the KVM guest with a Sandisk SATA SSD 32 GB drive. It is using MSI interrupts (not MSI-X) and I am able to push ~70 MB/sec from a 2nd Linux/iSCSI Initiator machine (running Open-iSCSI) with the 1500 byte MTUs on e1000e ports from within the KVM guest. Can MSI-X not be enabled, or does the device only have MSI capability? Just curious... The interesting thing is that I am having to use IBLOCK export (using submit_bio(), and complete emulation of SCSI control path) for SATA SSD in order to get I/O running stable. Using the pSCSI export I am getting immediate exceptions from scsi_execute_async() in the v2.6.29.2 KVM guest.. I didn't see the exception in the log below... (And it is buried in iSCSI log output I can't understand. Looking forward to help from others...) Does anything notable show on the host side? I think the target now is to get pSCSI working well? BTW: Maybe you can try the patch from Marcelo titled [patch 0/4] use smp_send_reschedule in vcpu_kick / assigned dev host intx race fix. -- regards Yang, Sheng Using a 2nd SAS disk I am able to use target_core_mod/pSCSI export and push badblocks and LTP disktest traffic however.. 
Here is a bit about the the setup looks, *) Linux/iSCSI Initiator node accessing KVM Guest LIO-Target v3.0 storage: subjekt:~# lsscsi [6:0:0:0]diskATA ST3250820AS 3.AA /dev/sda [10:0:0:0] cd/dvd PIONEER DVD-ROM DVD-305 1.06 /dev/scd1 [18:0:0:0] cd/dvd TOSHIBA DVD/HD X807616 MC08 /dev/scd2 [32:0:0:0] diskLIO-ORG RAMDISK-DR 3.0 /dev/sdb [32:0:0:1] diskLIO-ORG RAMDISK-DR 3.0 /dev/sdc [32:0:0:2] diskLIO-ORG FILEIO 3.0 /dev/sdd [32:0:0:3] diskLIO-ORG IBLOCK 3.0 /dev/sde subjekt:~# sg_inq -i /dev/sde VPD INQUIRY: Device Identification page Designation descriptor number 1, descriptor length: 20 id_type: NAA, code_set: Binary associated with the addressed logical unit NAA 6, IEEE Company_id: 0x1405 Vendor Specific Identifier: 0xa97e4ce21 Vendor Specific Identifier Extension: 0xc0711de829b000c2 [0x6001405a97e4ce21c0711de829b000c2] Designation descriptor number 2, descriptor length: 52 id_type: T10 vendor identification, code_set: ASCII associated with the addressed logical unit vendor id: LIO-ORG vendor specific: IBLOCK:a97e4ce21c0711de829b000c2943d57b Designation descriptor number 3, descriptor length: 8 transport: Internet SCSI (iSCSI) id_type: Relative target port, code_set:
Re: [RFC PATCH 0/3] generic hypercall support
Chris Wright wrote: * Gregory Haskins (gregory.hask...@gmail.com) wrote: So you would never have someone making a generic hypercall(KVM_HC_MMU_OP). I agree. Which is why I think the interface proposal you've made is wrong. I respectfully disagree. It's only wrong in that the name chosen for the interface was perhaps too broad/vague. I still believe the concept is sound, and the general layering is appropriate. There are already hypercall interfaces w/ specific ABI and semantic meaning (which are typically called directly/indirectly from an existing pv op hook). Yes, these are different, thus the new interface. But a free-form hypercall(unsigned long nr, unsigned long *args, size_t count) means hypercall number and arg list must be the same in order for code to call hypercall() in a hypervisor agnostic way. Yes, and that is exactly the intention. I think it's perhaps the point you are missing. I am well aware that historically the things we do over a hypercall interface would inherently have meaning only to a specific hypervisor (e.g. KVM_HC_MMU_OPS (vector 2) via kvm_hypercall()). However, this doesn't in any way imply that it is the only use for the general concept. It's just the only way they have been exploited to date. While I acknowledge that the hypervisor certainly must be coordinated with their use, in their essence hypercalls are just another form of IO joining the ranks of things like MMIO and PIO. This is an attempt to bring them out of the bowels of CONFIG_PARAVIRT to make them a first class citizen. The thing I am building here is really not a general hypercall in the broad sense. Rather, it's a subset of the hypercall vector namespace. It is designed specifically for dynamically binding a synchronous call() interface to things like virtual devices, and it is therefore these virtual device models that define the particular ABI within that namespace. Thus the ABI in question is explicitly independent of the underlying hypervisor. 
I therefore stand by the proposed design to have this interface described above the hypervisor support layer (i.e. pv_ops) (albeit with perhaps a better name like dynamic hypercall as per my later discussion with Avi). Consider PIO: The hypervisor (or hardware) and OS negotiate a port address, but the two end-points are the driver and the device-model (or real device). The driver doesn't have to say: if (kvm) kvm_iowrite32(addr, ..); else if (lguest) lguest_iowrite32(addr, ...); else native_iowrite32(addr, ...); Instead, it just says iowrite32(addr, ...); and the address is used to route the message appropriately by the platform. The ABI of that message, however, is specific to the driver/device and is not interpreted by kvm/lguest/native-hw infrastructure on the way. Today, there is no equivalent of a platform agnostic iowrite32() for hypercalls so the driver would look like the pseudocode above except substitute with kvm_hypercall(), lguest_hypercall(), etc. The proposal is to allow the hypervisor to assign a dynamic vector to resources in the backend and convey this vector to the guest (such as in PCI config-space as mentioned in my example use-case). This provides the address negotiation function that would normally be done for something like a pio port-address. The hypervisor agnostic driver can then use this globally recognized address-token coupled with other device-private ABI parameters to communicate with the device. This can all occur without the core hypervisor needing to understand the details beyond the addressing. What this means to our interface design is that the only thing the hypervisor really cares about is the first nr parameter. This acts as our address-token. The optional/variable list of args is just payload as far as the core infrastructure is concerned and are coupled only to our device ABI. They were chosen to be an array of ulongs (vs something like vargs) to reflect the fact that hypercalls are typically passed by packing registers. 
Hope this helps, -Greg signature.asc Description: OpenPGP digital signature
[KVM-AUTOTEST][PATCH] timedrift support
Hello everyone, I like to submit patch to add a new function for 'time drift check' for guest running on KVM. The TimeDrift design logic is below: 1. Set the host as the NTP server 2. Guest only sync it's clock with host *once* when it booted up. * if the offset value of ntpdate large than 1 sec, the guest will sync the clock with host. * if the offset value of ntpdate less than 1 sec, the guest doesn't need sync it's clock with host. 3. Then the cpu stress testing will running on guest. * a C program will give the real load to guest cpu 4.when the cpustress testing finished. running the commandline ntpdate -q host-ip totally 20 times on guest to query the time from host and judge whether the guest clock has drift or not. The details of my patch is attached. thanks. Bear. diff -urN kvm_runtest_2.bak/cpu_stress.c kvm_runtest_2/cpu_stress.c --- kvm_runtest_2.bak/cpu_stress.c 1969-12-31 19:00:00.0 -0500 +++ kvm_runtest_2/cpu_stress.c 2009-05-05 22:35:34.0 -0400 @@ -0,0 +1,61 @@ +#define _GNU_SOURCE +#include stdio.h +#include pthread.h +#include sched.h +#include stdlib.h +#include fcntl.h +#include math.h +#include unistd.h + +#define MAX_CPUS 256 +#define BUFFSIZE 1024 + + +void worker_child(int cpu) +{ + int cur_freq; + int min_freq; + int max_freq; + int last_freq; + cpu_set_t mask; + int i; + double x; +int d = 0; + /* + * bind this thread to the specified cpu + */ + CPU_ZERO(mask); + CPU_SET(cpu, mask); + sched_setaffinity(0, CPU_SETSIZE, mask); + + while (d++ != 50) { + for (i=0; i10; i++) +x = sqrt(x); + } + + _exit(0); + +} + + +main() { + cpu_set_t mask; + int i; + int code; + + if (sched_getaffinity(0, CPU_SETSIZE, mask) 0){ + perror (sched_getaffinity); + exit(1); + } + + for (i=0; iCPU_SETSIZE; i++) + if (CPU_ISSET(i, mask)){ + printf (CPU%d\n,i); + if (fork() == 0) +worker_child(i); + } + + + wait(code); + exit (WEXITSTATUS(code)); +} diff -urN kvm_runtest_2.bak/kvm_runtest_2.py kvm_runtest_2/kvm_runtest_2.py --- kvm_runtest_2.bak/kvm_runtest_2.py 
2009-04-29 06:17:29.0 -0400 +++ kvm_runtest_2/kvm_runtest_2.py 2009-04-29 08:06:32.0 -0400 @@ -36,6 +36,8 @@ autotest: test_routine(kvm_tests, run_autotest), kvm_install: test_routine(kvm_install, run_kvm_install), linux_s3: test_routine(kvm_tests, run_linux_s3), +ntp_server_setup: test_routine(kvm_tests, run_ntp_server_setup), +timedrift:test_routine(kvm_tests, run_timedrift), } # Make it possible to import modules from the test's bindir diff -urN kvm_runtest_2.bak/kvm_tests.cfg.sample kvm_runtest_2/kvm_tests.cfg.sample --- kvm_runtest_2.bak/kvm_tests.cfg.sample 2009-04-29 06:17:29.0 -0400 +++ kvm_runtest_2/kvm_tests.cfg.sample 2009-04-29 08:09:36.0 -0400 @@ -81,6 +81,10 @@ - linux_s3: install setup type = linux_s3 +- ntp_server_setup: +type = ntp_server_setup +- timedrift: ntp_server_setup +type = timedrift # NICs variants: - @rtl8139: diff -urN kvm_runtest_2.bak/kvm_tests.py kvm_runtest_2/kvm_tests.py --- kvm_runtest_2.bak/kvm_tests.py 2009-04-29 06:17:29.0 -0400 +++ kvm_runtest_2/kvm_tests.py 2009-05-05 23:45:57.0 -0400 @@ -394,3 +394,235 @@ kvm_log.info(VM resumed after S3) session.close() + +def run_ntp_server_setup(test, params, env): + +NTP server configuration and related network file modification + +kvm_log.debug(run ntp server setup) +status = 1 +# stop firewall for NTP server if it is running. 
+status = os.system(/etc/init.d/iptables status) +if status == 0: +os.system(/etc/init.d/iptables stop) +status = 1 + +# prevent dhcp client modify the ntp.conf +kvm_log.info(prevent dhcp client modify the ntp.conf) + +config_file = /etc/sysconfig/network +network_file = open(/etc/sysconfig/network, a) +string = PEERNTP=no + +if os.system(grep %s %s % (string, config_file)): +network_file.writelines(str(string)+'\n') + +network_file.close() + +# start ntp server on host +kvm_log.info(backup ntp config file) + +ntp_filename = os.path.expanduser(/etc/ntp.conf) +# backup ntp config file +backup_bootloader_filename = ntp_filename + _backup +if os.path.exists(ntp_filename): +os.rename(ntp_filename, backup_bootloader_filename) + +status = os.system(/etc/init.d/ntpd status) +if status == 0: +os.system(/etc/init.d/ntpd stop) +status = 1 + +kvm_log.info(start ntp server on host) + +ntp_cmd = ''' +echo restrict default kod nomodify notrap nopeer noquery /etc/ntp.conf;\ +echo restrict 127.0.0.1 /etc/ntp.conf;\ +echo driftfile /var/lib/ntp/drift /etc/ntp.conf;\ +echo keys /etc/ntp/keys /etc/ntp.conf;\ +echo
Re: [patch 0/4] use smp_send_reschedule in vcpu_kick / assigned dev host intx race fix
On Thursday 30 April 2009 09:59:56 Sheng Yang wrote: On Thursday 30 April 2009 08:56:57 Sheng Yang wrote: On Thursday 30 April 2009 01:47:57 Marcelo Tosatti wrote: On Tue, Apr 28, 2009 at 03:08:46PM +0800, Sheng Yang wrote: Ack all. This also solved one bug I had on hand. Thanks! I observed one point: the performance of a high-workload interrupt source (e.g. a 10 gigabit oplin card) dropped dramatically with the smp_send_reschedule() method... In one environment (where the speed of the oplin card is also limited by cpu performance), using smp_call_function_single() can stably get more than 1G bit/s (native got 1.2G), but smp_send_reschedule() can only get around 600M bit/s... And the rescheduling interrupt number is about 2000/second per cpu. And the interrupt rate is about tens of thousands per second for the device. Anyway, this method is more elegant and correct. There is still room for optimization - but of course, correctness is the first priority. Are you using the compat code or a kvm.git kernel? Can you remove only the last patch (the spinlock) to confirm it's the cause of the slowdown? I am using kvm.git. I said this because I tried the old version of the patch (which has a warning) and it would get more than 1G/sec. I'd like to take a close look at what's happened. Still ACK this patchset. And sorry, my memory was mixed up... The old version of the patch and this one offered the same performance. So the problem is not here. I got more than 1g per second in one of my own experiments. Disable/enable irq is meant to be used with level interrupts to prevent the device from sending the interrupt again after the kernel handler returns, but it does not apply to MSI/MSI-X. Though some interrupts may be merged into one, AFAIK the driver can handle it well. My experiment is to discard disable/enable IRQ for MSI/MSI-X, which then gets much better performance for the oplin card, 2x compared with the disable/enable one. I will prepare a patch for it. Hi Avi, Is there any issue blocking this patchset? Thanks! 
-- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/9] Remove irq_pending bitmap
On Tuesday 05 May 2009 16:14:29 Gleb Natapov wrote: Only one interrupt vector can be injected from userspace irqchip at any given time so no need to store it in a bitmap. Put it into interrupt queue directly. Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/include/asm/kvm_host.h |2 -- arch/x86/kvm/irq.c |4 ++-- arch/x86/kvm/x86.c | 38 +++--- arch/x86/kvm/x86.h | 12 4 files changed, 13 insertions(+), 43 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8e680c3..cc892f5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -266,8 +266,6 @@ struct kvm_mmu { struct kvm_vcpu_arch { u64 host_tsc; - unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ - DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); /* * rip and regs accesses must go through * kvm_{register,rip}_{read,write} functions. diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 11c2757..96dfbb6 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -50,7 +50,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) struct kvm_pic *s; if (!irqchip_in_kernel(v-kvm)) - return v-arch.irq_summary; + return v-arch.interrupt.pending; if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ if (kvm_apic_accept_pic_intr(v)) { @@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) int vector; if (!irqchip_in_kernel(v-kvm)) - return kvm_pop_irq(v); + return v-arch.interrupt.nr; vector = kvm_get_apic_interrupt(v); /* APIC */ if (vector == -1) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 12ab1cc..4596927 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1424,8 +1424,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -ENXIO; vcpu_load(vcpu); - set_bit(irq-irq, vcpu-arch.irq_pending); - set_bit(irq-irq / BITS_PER_LONG, vcpu-arch.irq_summary); + kvm_queue_interrupt(vcpu, irq-irq); vcpu_put(vcpu); @@ -3562,12 +3561,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu 
*vcpu, sregs-efer = vcpu-arch.shadow_efer; sregs-apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu-kvm)) - memset(sregs-interrupt_bitmap, 0, -sizeof sregs-interrupt_bitmap); ? When did we discard the saving of pending interrupt for irqchip_in_kernel? - else - memcpy(sregs-interrupt_bitmap, vcpu-arch.irq_pending, -sizeof sregs-interrupt_bitmap); + memset(sregs-interrupt_bitmap, 0, sizeof sregs-interrupt_bitmap); No need to save any pending interrupts? Did I miss anything? if (vcpu-arch.interrupt.pending) set_bit(vcpu-arch.interrupt.nr, @@ -4037,7 +4031,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { int mmu_reset_needed = 0; - int i, pending_vec, max_bits; + int pending_vec, max_bits; struct descriptor_table dt; vcpu_load(vcpu); @@ -4079,24 +4073,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); - if (!irqchip_in_kernel(vcpu-kvm)) { - memcpy(vcpu-arch.irq_pending, sregs-interrupt_bitmap, -sizeof vcpu-arch.irq_pending); - vcpu-arch.irq_summary = 0; - for (i = 0; i ARRAY_SIZE(vcpu-arch.irq_pending); ++i) - if (vcpu-arch.irq_pending[i]) - __set_bit(i, vcpu-arch.irq_summary); - } else { - max_bits = (sizeof sregs-interrupt_bitmap) 3; - pending_vec = find_first_bit( - (const unsigned long *)sregs-interrupt_bitmap, - max_bits); - /* Only pending external irq is handled here */ - if (pending_vec max_bits) { - kvm_queue_interrupt(vcpu, pending_vec); - pr_debug(Set back pending irq %d\n, pending_vec); - } - kvm_pic_clear_isr_ack(vcpu-kvm); + max_bits = (sizeof sregs-interrupt_bitmap) 3; If interrupt_bitmap is always zero as above, why we got this... For compatible? 
-- regards Yang, Sheng + pending_vec = find_first_bit( + (const unsigned long *)sregs-interrupt_bitmap, max_bits); + if (pending_vec max_bits) { + kvm_queue_interrupt(vcpu, pending_vec); + pr_debug(Set back pending irq %d\n, pending_vec); + if (irqchip_in_kernel(vcpu-kvm)) + kvm_pic_clear_isr_ack(vcpu-kvm); } kvm_set_segment(vcpu, sregs-cs, VCPU_SREG_CS); diff --git a/arch/x86/kvm/x86.h