[COMMIT master] Rename KVM_UPSTREAM to OBSOLETE_KVM_IMPL
From: Avi Kivity a...@redhat.com The symbol KVM_UPSTREAM is used to mark sections of code that are part of the upstream kvm implemetation that is not used in qemu-kvm. However the name becomes ambiguous if qemu-kvm is merged upstream. Rename the symbol to avoid confusion. Signed-off-by: Avi Kivity a...@redhat.com diff --git a/cpus.c b/cpus.c index c545a62..99c04d1 100644 --- a/cpus.c +++ b/cpus.c @@ -299,7 +299,7 @@ void qemu_notify_event(void) } } -#if defined(KVM_UPSTREAM) || !defined(CONFIG_KVM) +#if defined(OBSOLETE_KVM_IMPL) || !defined(CONFIG_KVM) void qemu_mutex_lock_iothread(void) {} void qemu_mutex_unlock_iothread(void) {} #endif diff --git a/kvm-all.c b/kvm-all.c index 4ff75c4..d4b0861 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -41,7 +41,7 @@ do { } while (0) #endif -#ifdef KVM_UPSTREAM +#ifdef OBSOLETE_KVM_IMPL typedef struct KVMSlot { @@ -156,7 +156,7 @@ static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot) return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, mem); } -#ifdef KVM_UPSTREAM +#ifdef OBSOLETE_KVM_IMPL static void kvm_reset_vcpu(void *opaque) { CPUState *env = opaque; @@ -176,7 +176,7 @@ int kvm_pit_in_kernel(void) } -#ifdef KVM_UPSTREAM +#ifdef OBSOLETE_KVM_IMPL int kvm_init_vcpu(CPUState *env) { KVMState *s = kvm_state; @@ -594,7 +594,7 @@ void kvm_cpu_register_phys_memory_client(void) cpu_register_phys_memory_client(kvm_cpu_phys_memory_client); } -#ifdef KVM_UPSTREAM +#ifdef OBSOLETE_KVM_IMPL int kvm_init(int smp_cpus) { @@ -816,7 +816,7 @@ void kvm_flush_coalesced_mmio_buffer(void) #endif } -#ifdef KVM_UPSTREAM +#ifdef OBSOLETE_KVM_IMPL static void do_kvm_cpu_synchronize_state(void *_env) { @@ -1038,7 +1038,7 @@ int kvm_has_debugregs(void) return kvm_state-debugregs; } -#ifdef KVM_UPSTREAM +#ifdef OBSOLETE_KVM_IMPL int kvm_has_xsave(void) { return kvm_state-xsave; @@ -1069,10 +1069,10 @@ void kvm_setup_guest_memory(void *start, size_t size) } #ifdef KVM_CAP_SET_GUEST_DEBUG -#ifndef KVM_UPSTREAM +#ifndef OBSOLETE_KVM_IMPL #define run_on_cpu on_vcpu static void on_vcpu(CPUState *env, void (*func)(void *data), void *data); -#endif /* !KVM_UPSTREAM */ +#endif /* !OBSOLETE_KVM_IMPL */ struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env, target_ulong pc) diff --git a/kvm.h b/kvm.h index d321fce..56236ae 100644 --- a/kvm.h +++ b/kvm.h @@ -31,13 +31,13 @@ extern int kvm_allowed; #define kvm_enabled() (0) #endif -#ifdef KVM_UPSTREAM +#ifdef OBSOLETE_KVM_IMPL struct kvm_run; /* external API */ int kvm_init(int smp_cpus); -#endif /* KVM_UPSTREAM */ +#endif /* OBSOLETE_KVM_IMPL */ int kvm_has_sync_mmu(void); int kvm_has_vcpu_events(void); @@ -96,7 +96,7 @@ int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run); int kvm_arch_pre_run(CPUState *env, struct kvm_run *run); -#ifdef KVM_UPSTREAM +#ifdef OBSOLETE_KVM_IMPL int kvm_arch_process_irqchip_events(CPUState *env); #endif diff --git a/target-i386/kvm.c b/target-i386/kvm.c index b00e80d..f4fc063 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -188,7 +188,7 @@ int kvm_arch_init_vcpu(CPUState *env) return r; } -#ifdef KVM_UPSTREAM +#ifdef OBSOLETE_KVM_IMPL env-mp_state = KVM_MP_STATE_RUNNABLE; @@ -304,7 +304,7 @@ void kvm_arch_reset_vcpu(CPUState *env) env-mp_state = KVM_MP_STATE_RUNNABLE; } } -#ifdef KVM_UPSTREAM +#ifdef OBSOLETE_KVM_IMPL static int kvm_has_msr_star(CPUState *env) { @@ -644,7 +644,7 @@ static void kvm_msr_entry_set(struct kvm_msr_entry *entry, entry-data = value; } -#ifdef KVM_UPSTREAM +#ifdef OBSOLETE_KVM_IMPL static int kvm_put_msrs(CPUState *env, int level) { struct { @@ -1104,7 +1104,7 @@ static int kvm_get_debugregs(CPUState *env) return 0; } -#ifdef KVM_UPSTREAM +#ifdef OBSOLETE_KVM_IMPL int kvm_arch_put_registers(CPUState *env, int level) { int ret; @@ -1242,7 +1242,7 @@ int kvm_arch_post_run(CPUState *env, struct kvm_run *run) return 0; } -#ifdef KVM_UPSTREAM +#ifdef OBSOLETE_KVM_IMPL int kvm_arch_process_irqchip_events(CPUState *env) { diff --git a/vl.c b/vl.c index 22a3616..378a176 100644 --- a/vl.c +++ b/vl.c @@ -2466,7 +2466,7 @@ int main(int argc, char **argv, char **envp) case QEMU_OPTION_smbios: do_smbios_option(optarg); break; -#ifdef KVM_UPSTREAM +#ifdef OBSOLETE_KVM_IMPL case QEMU_OPTION_enable_kvm: kvm_allowed = 1; #endif @@ -2803,7 +2803,7 @@ int main(int argc, char **argv, char **envp) if (kvm_allowed) { int ret = kvm_init(smp_cpus); if (ret 0) { -#if defined(KVM_UPSTREAM) || defined(CONFIG_NO_CPU_EMULATION) +#if defined(OBSOLETE_KVM_IMPL) || defined(CONFIG_NO_CPU_EMULATION) if (!kvm_available()) { printf(KVM not supported for this target\n); } else { -- To unsubscribe
[COMMIT master] KVM: VMX: Split up vmx_complete_interrupts()
From: Avi Kivity a...@redhat.com vmx_complete_interrupts() does too much, split it up: - vmx_vcpu_run() gets the cache important vmcs fields part - a new vmx_complete_atomic_exit() gets the parts that must be done atomically - a new vmx_recover_nmi_blocking() does what its name says - vmx_complete_interrupts() retains the event injection recovery code This helps in reducing the work done in atomic context. Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 291f99c..568f936 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -125,6 +125,7 @@ struct vcpu_vmx { unsigned long host_rsp; int launched; u8fail; + u32 exit_intr_info; u32 idt_vectoring_info; struct shared_msr_entry *guest_msrs; int nmsrs; @@ -3781,18 +3782,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) vmcs_write32(TPR_THRESHOLD, irr); } -static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { - u32 exit_intr_info; - u32 idt_vectoring_info = vmx-idt_vectoring_info; - bool unblock_nmi; - u8 vector; - int type; - bool idtv_info_valid; - - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - vmx-exit_reason = vmcs_read32(VM_EXIT_REASON); + u32 exit_intr_info = vmx-exit_intr_info; /* Handle machine checks before interrupts are enabled */ if ((vmx-exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) @@ -3807,8 +3799,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) asm(int $2); kvm_after_handle_nmi(vmx-vcpu); } +} - idtv_info_valid = idt_vectoring_info VECTORING_INFO_VALID_MASK; +static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) +{ + u32 exit_intr_info = vmx-exit_intr_info; + bool unblock_nmi; + u8 vector; + bool idtv_info_valid; + + idtv_info_valid = vmx-idt_vectoring_info VECTORING_INFO_VALID_MASK; if (cpu_has_virtual_nmis()) { unblock_nmi = (exit_intr_info INTR_INFO_UNBLOCK_NMI) != 0; @@ -3830,6 +3830,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) } else if (unlikely(vmx-soft_vnmi_blocked)) vmx-vnmi_blocked_time += ktime_to_ns(ktime_sub(ktime_get(), vmx-entry_time)); +} + +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +{ + u32 idt_vectoring_info = vmx-idt_vectoring_info; + u8 vector; + int type; + bool idtv_info_valid; + + idtv_info_valid = idt_vectoring_info VECTORING_INFO_VALID_MASK; vmx-vcpu.arch.nmi_injected = false; kvm_clear_exception_queue(vmx-vcpu); @@ -4042,6 +4052,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) asm(mov %0, %%ds; mov %0, %%es : : r(__USER_DS)); vmx-launched = 1; + vmx-exit_reason = vmcs_read32(VM_EXIT_REASON); + vmx-exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + vmx_complete_atomic_exit(vmx); + vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); } -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[COMMIT master] KVM: VMX: Move real-mode interrupt injection fixup to vmx_complete_interrupts()
From: Avi Kivity a...@redhat.com This allows reuse of vmx_complete_interrupts() for cancelling injections. Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 568f936..c10d700 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -182,6 +182,7 @@ static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); +static void fixup_rmode_irq(struct vcpu_vmx *vmx); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -3834,11 +3835,15 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { - u32 idt_vectoring_info = vmx-idt_vectoring_info; + u32 idt_vectoring_info; u8 vector; int type; bool idtv_info_valid; + if (vmx-rmode.irq.pending) + fixup_rmode_irq(vmx); + + idt_vectoring_info = vmx-idt_vectoring_info; idtv_info_valid = idt_vectoring_info VECTORING_INFO_VALID_MASK; vmx-vcpu.arch.nmi_injected = false; @@ -4046,8 +4051,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu-arch.regs_dirty = 0; vmx-idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if (vmx-rmode.irq.pending) - fixup_rmode_irq(vmx); asm(mov %0, %%ds; mov %0, %%es : : r(__USER_DS)); vmx-launched = 1; -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[COMMIT master] KVM: VMX: Parameterize vmx_complete_interrupts() for both exit and entry
From: Avi Kivity a...@redhat.com Currently vmx_complete_interrupts() can decode event information from vmx exit fields into the generic kvm event queues. Make it able to decode the information from the entry fields as well by parametrizing it. Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c10d700..108dbaf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -182,7 +182,7 @@ static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static void fixup_rmode_irq(struct vcpu_vmx *vmx); +static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -3833,17 +3833,18 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) ktime_to_ns(ktime_sub(ktime_get(), vmx-entry_time)); } -static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, + u32 idt_vectoring_info, + int instr_len_field, + int error_code_field) { - u32 idt_vectoring_info; u8 vector; int type; bool idtv_info_valid; if (vmx-rmode.irq.pending) - fixup_rmode_irq(vmx); + fixup_rmode_irq(vmx, idt_vectoring_info); - idt_vectoring_info = vmx-idt_vectoring_info; idtv_info_valid = idt_vectoring_info VECTORING_INFO_VALID_MASK; vmx-vcpu.arch.nmi_injected = false; @@ -3871,18 +3872,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) break; case INTR_TYPE_SOFT_EXCEPTION: vmx-vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs_read32(instr_len_field); /* fall through */ case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info VECTORING_INFO_DELIVER_CODE_MASK) { - u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); + u32 err = vmcs_read32(error_code_field); kvm_queue_exception_e(vmx-vcpu, vector, err); } else kvm_queue_exception(vmx-vcpu, vector); break; case INTR_TYPE_SOFT_INTR: vmx-vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs_read32(instr_len_field); /* fall through */ case INTR_TYPE_EXT_INTR: kvm_queue_interrupt(vmx-vcpu, vector, @@ -3893,24 +3894,31 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) } } +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +{ + __vmx_complete_interrupts(vmx, vmx-idt_vectoring_info, + VM_EXIT_INSTRUCTION_LEN, + IDT_VECTORING_ERROR_CODE); +} + /* * Failure to inject an interrupt should give us the information * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs * when fetching the interrupt redirection bitmap in the real-mode * tss, this doesn't happen. So we do it ourselves. */ -static void fixup_rmode_irq(struct vcpu_vmx *vmx) +static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info) { vmx-rmode.irq.pending = 0; if (kvm_rip_read(vmx-vcpu) + 1 != vmx-rmode.irq.rip) return; kvm_rip_write(vmx-vcpu, vmx-rmode.irq.rip); - if (vmx-idt_vectoring_info VECTORING_INFO_VALID_MASK) { - vmx-idt_vectoring_info = ~VECTORING_INFO_TYPE_MASK; - vmx-idt_vectoring_info |= INTR_TYPE_EXT_INTR; + if (*idt_vectoring_info VECTORING_INFO_VALID_MASK) { + *idt_vectoring_info = ~VECTORING_INFO_TYPE_MASK; + *idt_vectoring_info |= INTR_TYPE_EXT_INTR; return; } - vmx-idt_vectoring_info = + *idt_vectoring_info = VECTORING_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | vmx-rmode.irq.vector; -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[COMMIT master] KVM: Check for pending events before attempting injection
From: Avi Kivity a...@redhat.com Instead of blindly attempting to inject an event before each guest entry, check for a possible event first in vcpu-requests. Sites that can trigger event injection are modified to set KVM_REQ_EVENT: - interrupt, nmi window opening - ppr updates - i8259 output changes - local apic irr changes - rflags updates - gif flag set - event set on exit This improves non-injecting entry performance, and sets the stage for non-atomic injection. Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 6e77471..ab1bb8f 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -67,6 +67,7 @@ static void pic_unlock(struct kvm_pic *s) if (!found) return; + kvm_make_request(KVM_REQ_EVENT, found); kvm_vcpu_kick(found); } } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 77d8c0f..c6f2f15 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -259,9 +259,10 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) static void apic_update_ppr(struct kvm_lapic *apic) { - u32 tpr, isrv, ppr; + u32 tpr, isrv, ppr, old_ppr; int isr; + old_ppr = apic_get_reg(apic, APIC_PROCPRI); tpr = apic_get_reg(apic, APIC_TASKPRI); isr = apic_find_highest_isr(apic); isrv = (isr != -1) ? isr : 0; @@ -274,7 +275,10 @@ static void apic_update_ppr(struct kvm_lapic *apic) apic_debug(vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x, apic, ppr, isr, isrv); - apic_set_reg(apic, APIC_PROCPRI, ppr); + if (old_ppr != ppr) { + apic_set_reg(apic, APIC_PROCPRI, ppr); + kvm_make_request(KVM_REQ_EVENT, apic-vcpu); + } } static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) @@ -391,6 +395,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; } + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); break; @@ -416,6 +421,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, INIT on a runnable vcpu %d\n, vcpu-vcpu_id); vcpu-arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } else { apic_debug(Ignoring de-assert INIT to vcpu %d\n, @@ -430,6 +436,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, result = 1; vcpu-arch.sipi_vector = vector; vcpu-arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } break; @@ -475,6 +482,7 @@ static void apic_set_eoi(struct kvm_lapic *apic) trigger_mode = IOAPIC_EDGE_TRIG; if (!(apic_get_reg(apic, APIC_SPIV) APIC_SPIV_DIRECTED_EOI)) kvm_ioapic_update_eoi(apic-vcpu-kvm, vector, trigger_mode); + kvm_make_request(KVM_REQ_EVENT, apic-vcpu); } static void apic_send_ipi(struct kvm_lapic *apic) @@ -1152,6 +1160,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) update_divide_count(apic); start_apic_timer(apic); apic-irr_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index eeb08d6..bc317eb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2371,6 +2371,7 @@ static int stgi_interception(struct vcpu_svm *svm) svm-next_rip = kvm_rip_read(svm-vcpu) + 3; skip_emulated_instruction(svm-vcpu); + kvm_make_request(KVM_REQ_EVENT, svm-vcpu); enable_gif(svm); @@ -2763,6 +2764,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm-vcpu.run; + kvm_make_request(KVM_REQ_EVENT, svm-vcpu); svm_clear_vintr(svm); svm-vmcb-control.int_ctl = ~V_IRQ_MASK; /* @@ -3209,8 +3211,10 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) svm-int3_injected = 0; - if (svm-vcpu.arch.hflags HF_IRET_MASK) + if (svm-vcpu.arch.hflags HF_IRET_MASK) { svm-vcpu.arch.hflags = ~(HF_NMI_MASK | HF_IRET_MASK); + kvm_make_request(KVM_REQ_EVENT, svm-vcpu); + } svm-vcpu.arch.nmi_injected = false; kvm_clear_exception_queue(svm-vcpu); @@ -3219,6 +3223,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) if (!(exitintinfo SVM_EXITINTINFO_VALID)) return; +
[COMMIT master] KVM: Non-atomic interrupt injection
From: Avi Kivity a...@redhat.com Change the interrupt injection code to work from preemptible, interrupts enabled context. This works by adding a -cancel_injection() operation that undoes an injection in case we were not able to actually enter the guest (this condition could never happen with atomic injection). Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3a00741..02f9780 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -552,6 +552,7 @@ struct kvm_x86_ops { void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code, bool reinject); + void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu); int (*nmi_allowed)(struct kvm_vcpu *vcpu); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bc317eb..43f5558 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3261,6 +3261,17 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) } } +static void svm_cancel_injection(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = svm-vmcb-control; + + control-exit_int_info = control-event_inj; + control-exit_int_info_err = control-event_inj_err; + control-event_inj = 0; + svm_complete_interrupts(svm); +} + #ifdef CONFIG_X86_64 #define R r #else @@ -3626,6 +3637,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_irq = svm_set_irq, .set_nmi = svm_inject_nmi, .queue_exception = svm_queue_exception, + .cancel_injection = svm_cancel_injection, .interrupt_allowed = svm_interrupt_allowed, .nmi_allowed = svm_nmi_allowed, .get_nmi_mask = svm_get_nmi_mask, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 108dbaf..199fa7e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3901,6 +3901,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) IDT_VECTORING_ERROR_CODE); } +static void vmx_cancel_injection(struct kvm_vcpu *vcpu) +{ + __vmx_complete_interrupts(to_vmx(vcpu), + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + VM_ENTRY_INSTRUCTION_LEN, + VM_ENTRY_EXCEPTION_ERROR_CODE); + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); +} + /* * Failure to inject an interrupt should give us the information * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs @@ -4354,6 +4364,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_irq = vmx_inject_irq, .set_nmi = vmx_inject_nmi, .queue_exception = vmx_queue_exception, + .cancel_injection = vmx_cancel_injection, .interrupt_allowed = vmx_interrupt_allowed, .nmi_allowed = vmx_nmi_allowed, .get_nmi_mask = vmx_get_nmi_mask, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e719803..a465bd2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5005,7 +5005,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) int r; bool req_int_win = !irqchip_in_kernel(vcpu-kvm) vcpu-run-request_interrupt_window; - bool req_event; if (vcpu-requests) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) @@ -5041,6 +5040,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(r)) goto out; + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + inject_pending_event(vcpu); + + /* enable NMI/IRQ window open exits if needed */ + if (vcpu-arch.nmi_pending) + kvm_x86_ops-enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops-enable_irq_window(vcpu); + + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } + } + preempt_disable(); kvm_x86_ops-prepare_guest_switch(vcpu); @@ -5053,35 +5067,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) local_irq_disable(); - req_event = kvm_check_request(KVM_REQ_EVENT, vcpu); - if (!atomic_read(vcpu-guest_mode) || vcpu-requests || need_resched() || signal_pending(current)) { - if (req_event) - kvm_make_request(KVM_REQ_EVENT, vcpu); atomic_set(vcpu-guest_mode, 0); smp_wmb(); local_irq_enable(); preempt_enable(); + kvm_x86_ops-cancel_injection(vcpu); r = 1; goto
[COMMIT master] KVM: VMX: Move fixup_rmode_irq() to avoid forward declaration
From: Avi Kivity a...@redhat.com No code changes. Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 199fa7e..8ef6199 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -182,7 +182,6 @@ static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -3833,6 +3832,29 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) ktime_to_ns(ktime_sub(ktime_get(), vmx-entry_time)); } +/* + * Failure to inject an interrupt should give us the information + * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs + * when fetching the interrupt redirection bitmap in the real-mode + * tss, this doesn't happen. So we do it ourselves. + */ +static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info) +{ + vmx-rmode.irq.pending = 0; + if (kvm_rip_read(vmx-vcpu) + 1 != vmx-rmode.irq.rip) + return; + kvm_rip_write(vmx-vcpu, vmx-rmode.irq.rip); + if (*idt_vectoring_info VECTORING_INFO_VALID_MASK) { + *idt_vectoring_info = ~VECTORING_INFO_TYPE_MASK; + *idt_vectoring_info |= INTR_TYPE_EXT_INTR; + return; + } + *idt_vectoring_info = + VECTORING_INFO_VALID_MASK + | INTR_TYPE_EXT_INTR + | vmx-rmode.irq.vector; +} + static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, u32 idt_vectoring_info, int instr_len_field, @@ -3911,29 +3933,6 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu) vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } -/* - * Failure to inject an interrupt should give us the information - * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs - * when fetching the interrupt redirection bitmap in the real-mode - * tss, this doesn't happen. So we do it ourselves. - */ -static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info) -{ - vmx-rmode.irq.pending = 0; - if (kvm_rip_read(vmx-vcpu) + 1 != vmx-rmode.irq.rip) - return; - kvm_rip_write(vmx-vcpu, vmx-rmode.irq.rip); - if (*idt_vectoring_info VECTORING_INFO_VALID_MASK) { - *idt_vectoring_info = ~VECTORING_INFO_TYPE_MASK; - *idt_vectoring_info |= INTR_TYPE_EXT_INTR; - return; - } - *idt_vectoring_info = - VECTORING_INFO_VALID_MASK - | INTR_TYPE_EXT_INTR - | vmx-rmode.irq.vector; -} - #ifdef CONFIG_X86_64 #define R r #define Q q -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[COMMIT master] KVM: MMU: Don't track nested fault info in error-code
From: Joerg Roedel joerg.roe...@amd.com This patch moves the detection whether a page-fault was nested or not out of the error code and moves it into a separate variable in the fault struct. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 02f9780..8c5779d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -322,6 +322,7 @@ struct kvm_vcpu_arch { struct { u64 address; unsigned error_code; + bool nested; } fault; /* only needed in kvm_pv_mmu_op() path, but it's hot so diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 513abbb..7086ca8 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -47,7 +47,6 @@ #define PFERR_USER_MASK (1U 2) #define PFERR_RSVD_MASK (1U 3) #define PFERR_FETCH_MASK (1U 4) -#define PFERR_NESTED_MASK (1U 31) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a465bd2..a51635e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -342,18 +342,12 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu) void kvm_propagate_fault(struct kvm_vcpu *vcpu) { - u32 nested, error; - - error = vcpu-arch.fault.error_code; - nested = error PFERR_NESTED_MASK; - error = error ~PFERR_NESTED_MASK; - - vcpu-arch.fault.error_code = error; - - if (mmu_is_nested(vcpu) !nested) + if (mmu_is_nested(vcpu) !vcpu-arch.fault.nested) vcpu-arch.nested_mmu.inject_page_fault(vcpu); else vcpu-arch.mmu.inject_page_fault(vcpu); + + vcpu-arch.fault.nested = false; } void kvm_inject_nmi(struct kvm_vcpu *vcpu) @@ -3524,7 +3518,7 @@ static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) access |= PFERR_USER_MASK; t_gpa = vcpu-arch.mmu.gva_to_gpa(vcpu, gpa, access, error); if (t_gpa == UNMAPPED_GVA) - vcpu-arch.fault.error_code |= PFERR_NESTED_MASK; + vcpu-arch.fault.nested = true; return t_gpa; } -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: PATCH: virtio_console: Fix poll blocking even though there is data to read
On Wed, 15 Sep 2010 11:16:24 pm Amit Shah wrote: On (Wed) Sep 15 2010 [15:37:21], Hans de Goede wrote: --- linux-2.6.35.x86_64/drivers/char/virtio_console.c~2010-08-02 00:11:14.0 +0200 +++ linux-2.6.35.x86_64/drivers/char/virtio_console.c 2010-09-15 13:39:29.043505000 +0200 @@ -642,7 +642,7 @@ static unsigned int port_fops_poll(struc poll_wait(filp,port-waitqueue, wait); ret = 0; - if (port-inbuf) + if (!will_read_block(port)) Looks correct, but this should be if (port_has_data(port)) instead. That certainly works for me (as in will still fix the bug I'm hitting), but quoting from man 2 select: Three independent sets of file descriptors are watched. Those listed in readfds will be watched to see if characters become available for reading (more precisely, to see if a read will not block; in particu‐ lar, a file descriptor is also ready on end-of-file) Notice the a file descriptor is also ready on end-of-file, and port_fops_read treats the host not being connected as eof (it returns 0 in that case). So from an API pov I'm not sure what is correct. poll(2) says: POLLIN There is data to read. That makes it simple. That's a documentation bug. On a pipe, POLLIN is set when the other end closes. read() then returns 0 immediately. poll() sets POLLIN when read() won't block, and people count on it. Let's do that, please. Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] vhost-net: fix range checking in mrg bufs case
Tested-by: Jason Wang jasow...@redhat.com - Michael S. Tsirkin m...@redhat.com wrote: In mergeable buffer case, we use headcount, log_num and seg as indexes in same-size arrays, and we know that headcount = seg and log_num equals either 0 or seg. Therefore, the right thing to do is range-check seg, not headcount as we do now: these will be different if guest chains s/g descriptors (this does not happen now, but we can not trust the guest). Long term, we should add BUG_ON checks to verify two other indexes are what we think they should be. Reported-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- Dave, I'll queue this on my tree, no need to bother. drivers/vhost/net.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 6400cd5..f095de6 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -245,7 +245,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq, int r, nlogs = 0; while (datalen 0) { - if (unlikely(headcount = VHOST_NET_MAX_SG)) { + if (unlikely(seg = VHOST_NET_MAX_SG)) { r = -ENOBUFS; goto err; } -- 1.7.3.rc1.5.ge5969 ___ Virtualization mailing list virtualizat...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: PATCH: virtio_console: Fix poll blocking even though there is data to read
On (Thu) Sep 16 2010 [15:32:54], Rusty Russell wrote: On Wed, 15 Sep 2010 11:16:24 pm Amit Shah wrote: On (Wed) Sep 15 2010 [15:37:21], Hans de Goede wrote: --- linux-2.6.35.x86_64/drivers/char/virtio_console.c~ 2010-08-02 00:11:14.0 +0200 +++ linux-2.6.35.x86_64/drivers/char/virtio_console.c 2010-09-15 13:39:29.043505000 +0200 @@ -642,7 +642,7 @@ static unsigned int port_fops_poll(struc poll_wait(filp,port-waitqueue, wait); ret = 0; - if (port-inbuf) + if (!will_read_block(port)) Looks correct, but this should be if (port_has_data(port)) instead. That certainly works for me (as in will still fix the bug I'm hitting), but quoting from man 2 select: Three independent sets of file descriptors are watched. Those listed in readfds will be watched to see if characters become available for reading (more precisely, to see if a read will not block; in particu‐ lar, a file descriptor is also ready on end-of-file) Notice the a file descriptor is also ready on end-of-file, and port_fops_read treats the host not being connected as eof (it returns 0 in that case). So from an API pov I'm not sure what is correct. poll(2) says: POLLIN There is data to read. That makes it simple. That's a documentation bug. On a pipe, POLLIN is set when the other end closes. read() then returns 0 immediately. Currently we don't set POLLIN when host goes down. I'll do a second patch for that. poll() sets POLLIN when read() won't block, and people count on it. Yes, that's the behaviour with Hans's new patch as well -- that's not changing. The will_read_block() function (and the comment on top of it) are causing this confusion: /* The condition that must be true for polling to end */ static bool will_read_block(struct port *port) { if (!port-guest_connected) { /* Port got hot-unplugged. Let's exit. */ return false; } return !port_has_data(port) port-host_connected; } This function is only called to unblock a blocking read() call. So the comment there has to be changed to read 'waiting' to end instead of 'polling' to end. read() does return 0 immediately when the other end is not connected (and there's no data to read). In effect, we need: - Hans's patch - a patch to set POLLIN when host goes down (in addition to POLLHUP and SIGIO) - a patch to change the comment for will_read_block. Thanks, Amit -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, Sep 16, 2010 at 09:55:38AM +0800, Alec Joseph Rivera wrote: That should give us a clue on what's going wrong. I did the steps you mentioned. Here's the gdb transcript: $ gdb k12_1009 -ex 'target remote localhost:1234' What is k12_1009? Are you sure this is vmlinux? BTW what version on kvm/qemu are you using. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, 2010-09-16 at 08:39 +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 09:55:38AM +0800, Alec Joseph Rivera wrote: That should give us a clue on what's going wrong. I did the steps you mentioned. Here's the gdb transcript: $ gdb k12_1009 -ex 'target remote localhost:1234' What is k12_1009? Are you sure this is vmlinux? Hi Gleb: Yes, k12_1009 is the kernel image that isolinux loads. -- isolinux.cfg DEFAULT 1 TIMEOUT 100 PROMPT 1 DISPLAY isolinux.txt LABEL 1 KERNEL k12_1009 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram LABEL 2 KERNEL k12_1009 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram BOOT_SUPPORT=1 -- file test $ file k12_1009 k12_1009: Linux kernel x86 boot executable bzImage, version 2.6.16.54-0.2.5-bigsmp (ge...@b, RO-rootFS, root_dev 0x301, swap_dev 0x1, Normal VGA BTW what version on kvm/qemu are you using. qemu's version is 0.12.5. modinfo kvm doesn't reveal any version string though, however the kernel is 2.6.35 from Arch. Question: from what I've read so far kvm versions are of 'kvm-xx' where xx is a number, on sourceforge there is kvm-kmod-{kernel-version}, am I correct in assuming and saying then that my kvm version is 2.6.35? Thanks... -- Gleb. -- --- Follow me: http://twitter.com/agirivera Invite as a friend: http://www.facebook.com/agirivera -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, Sep 16, 2010 at 03:52:17PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 08:39 +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 09:55:38AM +0800, Alec Joseph Rivera wrote: That should give us a clue on what's going wrong. I did the steps you mentioned. Here's the gdb transcript: $ gdb k12_1009 -ex 'target remote localhost:1234' What is k12_1009? Are you sure this is vmlinux? Hi Gleb: Yes, k12_1009 is the kernel image that isolinux loads. -- isolinux.cfg DEFAULT 1 TIMEOUT 100 PROMPT 1 DISPLAY isolinux.txt LABEL 1 KERNEL k12_1009 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram LABEL 2 KERNEL k12_1009 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram BOOT_SUPPORT=1 -- file test $ file k12_1009 k12_1009: Linux kernel x86 boot executable bzImage, version 2.6.16.54-0.2.5-bigsmp (ge...@b, RO-rootFS, root_dev 0x301, swap_dev 0x1, Normal VGA This is bzImage, You need corresponding vmlinux bzImage was created from. BTW what version on kvm/qemu are you using. qemu's version is 0.12.5. modinfo kvm doesn't reveal any version string though, however the kernel is 2.6.35 from Arch. Question: from what I've read so far kvm versions are of 'kvm-xx' where xx is a number, on sourceforge there is kvm-kmod-{kernel-version}, am I correct in assuming and saying then that my kvm version is 2.6.35? kvm-xx is deprecated. So when asked what kvm are you using you should provide your kernel version like you did. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, 2010-09-16 at 09:56 +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 03:52:17PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 08:39 +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 09:55:38AM +0800, Alec Joseph Rivera wrote: That should give us a clue on what's going wrong. I did the steps you mentioned. Here's the gdb transcript: $ gdb k12_1009 -ex 'target remote localhost:1234' What is k12_1009? Are you sure this is vmlinux? Hi Gleb: Yes, k12_1009 is the kernel image that isolinux loads. -- isolinux.cfg DEFAULT 1 TIMEOUT 100 PROMPT 1 DISPLAY isolinux.txt LABEL 1 KERNEL k12_1009 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram LABEL 2 KERNEL k12_1009 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram BOOT_SUPPORT=1 -- file test $ file k12_1009 k12_1009: Linux kernel x86 boot executable bzImage, version 2.6.16.54-0.2.5-bigsmp (ge...@b, RO-rootFS, root_dev 0x301, swap_dev 0x1, Normal VGA This is bzImage, You need corresponding vmlinux bzImage was created from. Oh I see. Thanks for pointing that out. Unfortunately, I don't think I can get that image. Either I ask someone from Novell or IBM. I'll try crawling the system's filesystem, maybe I can find it there... Will keep you updated. BTW what version on kvm/qemu are you using. qemu's version is 0.12.5. modinfo kvm doesn't reveal any version string though, however the kernel is 2.6.35 from Arch. Question: from what I've read so far kvm versions are of 'kvm-xx' where xx is a number, on sourceforge there is kvm-kmod-{kernel-version}, am I correct in assuming and saying then that my kvm version is 2.6.35? kvm-xx is deprecated. So when asked what kvm are you using you should provide your kernel version like you did. -- Gleb. -- --- Follow me: http://twitter.com/agirivera Invite as a friend: http://www.facebook.com/agirivera -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Wed, 2010-09-15 at 22:37 +0400, Michael Tokarev wrote: Alec Joseph Rivera wrote: [] Inode-cache hash table entries: 65536 (order: 6, 262144 bytes) Memory: 1022272k/1048564k available (1671k kernel code, 25488k reserved, 890k data, 200k init, 131064k highmem) Checking if this processor honours the WP bit even in supervisor mode... Ok. -- end Hm. There was some error with earlier kernels and kvm that resulted in exactly this behavour. Maybe even a (guest) kernel bug.. no? My memory is vague in this area, but it tells there was something... Hi Michael: I suspect has something to do with loops (re: bogomips calibration) or timers (re: calibration + pci probing). Will verify this when I get the time to read the source. Maybe someone more familiar with this kernel area can shed some light on the topic too...? :-) only if I'm not stretching too much.. /mjt -- --- Follow me: http://twitter.com/agirivera Invite as a friend: http://www.facebook.com/agirivera -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [RFC PATCH 2/2] macvtap: TX zero copy between guest and host kernel
From: Michael S. Tsirkin [mailto:m...@redhat.com] Sent: Wednesday, September 15, 2010 5:59 PM To: Xin, Xiaohui Cc: Shirley Ma; Arnd Bergmann; Avi Kivity; David Miller; net...@vger.kernel.org; kvm@vger.kernel.org; linux-ker...@vger.kernel.org Subject: Re: [RFC PATCH 2/2] macvtap: TX zero copy between guest and host kernel On Wed, Sep 15, 2010 at 10:46:02AM +0800, Xin, Xiaohui wrote: From: Michael S. Tsirkin [mailto:m...@redhat.com] Sent: Wednesday, September 15, 2010 12:30 AM To: Shirley Ma Cc: Arnd Bergmann; Avi Kivity; Xin, Xiaohui; David Miller; net...@vger.kernel.org; kvm@vger.kernel.org; linux-ker...@vger.kernel.org Subject: Re: [RFC PATCH 2/2] macvtap: TX zero copy between guest and host kernel On Tue, Sep 14, 2010 at 09:00:25AM -0700, Shirley Ma wrote: On Tue, 2010-09-14 at 17:22 +0200, Michael S. Tsirkin wrote: I would expect this to hurt performance significantly. We could do this for asynchronous requests only to avoid the slowdown. Is kiocb in sendmsg helpful here? It is not used now. Shirley Precisely. This is what the patch from Xin Xiaohui does. That code already seems to do most of what you are trying to do, right? The main thing missing seems to be macvtap integration, so that we can fall back on data copy if zero copy is unavailable? How hard would it be to basically link the mp and macvtap modules together to get us this functionality? Anyone? Michael, Is to support macvtap with zero-copy through mp device the functionality you mentioned above? I have trouble parsing the above question. At some point Arnd suggested that the mp device functionality would fit nicely as part of the macvtap driver. It seems to make sense superficially, the advantage if it worked would be that we would get zero copy (mostly) transparently. Do you agree with this goal? I would say yes. Before Shirley Ma has suggested to move the zero-copy functionality into tun/tap device or macvtap device. How do you think about that? I suspect there will be a lot of duplicate code in that three drivers except we can extract code of zero-copy into kernel APIs and vhost APIs. tap would be very hard at this point as it does not bind to a device. macvtap might work, we mainly need to figure out a way to detect that device can do zero copy so the right mode is used. I think a first step could be to simply link mp code into macvtap module, pass necessary ioctls on, then move some code around as necessary. This might get rid of code duplication nicely. I'll look into this to see how much effort would be. Do you think that's worth to do and help current process which is blocked too long than I expected? I think it's nice to have. And if done hopefully this will get the folk working on the macvtap driver to review the code, which will help find all issues faster. I think if you post some performance numbers, this will also help get people excited and looking at the code. The performance data I have posted before is compared with raw socket on vhost-net. But currently, the raw socket backend is removed from the qemu side. So I can only compare with tap on vhost-net. But unfortunately, I missed something that I even can't bring it up. I was blocked by this for a time. I also don't see the process as completely blocked, each review round points out more issues: we aren't going back and forth changing same lines again and again, are we? One thing that might help is increase the frequency of updates, try sending them out sooner. On the other hand 10 new patches each revision is a lot: if there is a part of patchset that has stabilised you can split it out, post once and keep posting the changing part separately. I hope these suggestions help. Thanks, Michael! -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, Sep 16, 2010 at 4:17 PM, Alec Joseph Rivera eij...@gmail.com wrote: On Wed, 2010-09-15 at 22:37 +0400, Michael Tokarev wrote: Alec Joseph Rivera wrote: [] Inode-cache hash table entries: 65536 (order: 6, 262144 bytes) Memory: 1022272k/1048564k available (1671k kernel code, 25488k reserved, 890k data, 200k init, 131064k highmem) Checking if this processor honours the WP bit even in supervisor mode... Ok. -- end Hm. There was some error with earlier kernels and kvm that resulted in exactly this behavour. Maybe even a (guest) kernel bug.. no? My memory is vague in this area, but it tells there was something... Hi Michael: I suspect has something to do with loops (re: bogomips calibration) or timers (re: calibration + pci probing). Will verify this when I get the time to read the source. Maybe someone more familiar with this kernel area can shed some light on the topic too...? :-) only if I'm not stretching too much.. boot with param lpj=xxx to avoid calibration copy xxx value from your normal boot kernel messages /mjt -- --- Follow me: http://twitter.com/agirivera Invite as a friend: http://www.facebook.com/agirivera -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- Regards dave -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 11:02:56AM +0200, Gleb Natapov wrote: On Wed, Sep 15, 2010 at 08:54:17PM +0200, Michael S. Tsirkin wrote: To avoid bouncing irqfd injection out to workqueue context, we must support injecting irqs from local interrupt context. Doing this seems to only require disabling irqs locally. RFC, completely untested, x86 only. Thoughts? We do not want to disable irqs for a long time and some of code paths under lock involve looping over all cpus. For MSI injection path is lockless and this is the only case that matters, MSI only appeared in rhel6, older guests still use level interrupts. Which paths require looping over all cpus? Do PCI interrupts need this? so inject irqs from local interrupt context if it is MSI or from workqueue otherwise. OK, but this would need a rework of irqfd code as it is currently unaware of interrupt type. Signed-off-by: Michael S. Tsirkin m...@redhat.com --- arch/x86/kvm/i8259.c | 12 virt/kvm/eventfd.c | 30 +- virt/kvm/ioapic.c| 34 -- 3 files changed, 41 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 8d10c06..294fa36 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -198,8 +198,10 @@ void kvm_pic_update_irq(struct kvm_pic *s) int kvm_pic_set_irq(void *opaque, int irq, int level) { struct kvm_pic *s = opaque; + unsigned long flags; int ret = -1; + local_irq_save(flags); pic_lock(s); if (irq = 0 irq PIC_NUM_PINS) { ret = pic_set_irq1(s-pics[irq 3], irq 7, level); @@ -208,6 +210,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) s-pics[irq 3].imr, ret == 0); } pic_unlock(s); + local_irq_restore(flags); return ret; } @@ -235,8 +238,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq) int kvm_pic_read_irq(struct kvm *kvm) { int irq, irq2, intno; + unsigned long flags; struct kvm_pic *s = pic_irqchip(kvm); + local_irq_save(flags); pic_lock(s); irq = pic_get_irq(s-pics[0]); if (irq = 0) { @@ -263,6 +268,7 @@ int kvm_pic_read_irq(struct kvm *kvm) } pic_update_irq(s); pic_unlock(s); + local_irq_restore(flags); return intno; } @@ -476,6 +482,7 @@ static int picdev_write(struct kvm_io_device *this, { struct kvm_pic *s = to_pic(this); unsigned char data = *(unsigned char *)val; + unsigned long flags; if (!picdev_in_range(addr)) return -EOPNOTSUPP; @@ -484,6 +491,7 @@ static int picdev_write(struct kvm_io_device *this, printk(KERN_ERR PIC: non byte write\n); return 0; } + local_irq_save(flags); pic_lock(s); switch (addr) { case 0x20: @@ -498,6 +506,7 @@ static int picdev_write(struct kvm_io_device *this, break; } pic_unlock(s); + local_irq_restore(flags); return 0; } @@ -506,6 +515,7 @@ static int picdev_read(struct kvm_io_device *this, { struct kvm_pic *s = to_pic(this); unsigned char data = 0; + unsigned long flags; if (!picdev_in_range(addr)) return -EOPNOTSUPP; @@ -514,6 +524,7 @@ static int picdev_read(struct kvm_io_device *this, printk(KERN_ERR PIC: non byte read\n); return 0; } + local_irq_save(flags); pic_lock(s); switch (addr) { case 0x20: @@ -529,6 +540,7 @@ static int picdev_read(struct kvm_io_device *this, } *(unsigned char *)val = data; pic_unlock(s); + local_irq_restore(flags); return 0; } diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 66cf65b..30ede90 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -50,22 +50,11 @@ struct _irqfd { struct list_head list; poll_tablept; wait_queue_t wait; - struct work_structinject; struct work_structshutdown; }; static struct workqueue_struct *irqfd_cleanup_wq; -static void -irqfd_inject(struct work_struct *work) -{ - struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); - struct kvm *kvm = irqfd-kvm; - - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 1); - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 0); -} - /* * Race-free decouple logic (ordering is critical) */ @@ -82,12 +71,6 @@ irqfd_shutdown(struct work_struct *work) eventfd_ctx_remove_wait_queue(irqfd-eventfd, irqfd-wait, cnt); /* -* We know no new events will be scheduled at this point, so block -* until all previously outstanding events have completed -*/ - flush_work(irqfd-inject); - -
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 11:10:55AM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 11:02:56AM +0200, Gleb Natapov wrote: On Wed, Sep 15, 2010 at 08:54:17PM +0200, Michael S. Tsirkin wrote: To avoid bouncing irqfd injection out to workqueue context, we must support injecting irqs from local interrupt context. Doing this seems to only require disabling irqs locally. RFC, completely untested, x86 only. Thoughts? We do not want to disable irqs for a long time and some of code paths under lock involve looping over all cpus. For MSI injection path is lockless and this is the only case that matters, MSI only appeared in rhel6, older guests still use level interrupts. So they are already slow for other reasons. Which paths require looping over all cpus? Do PCI interrupts need this? All interrupts need it. IOAPIC has a loop to find dst cpu for interrupt. Pic has a loop to find cpu in virtual wire mode. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] dma_rw.h (was Re: [PATCH 0/7] AMD IOMMU emulation patchset v4)
On Thu, Sep 16, 2010 at 10:06:16AM +0300, Eduard - Gabriel Munteanu wrote: On Mon, Sep 13, 2010 at 10:01:20PM +0200, Michael S. Tsirkin wrote: So I think the following will give the idea of what an API might look like that will let us avoid the scary hacks in e.g. the ide layer and other generic layers that need to do DMA, without either binding us to pci, adding more complexity with callbacks, or losing type safety with casts and void*. Basically we have DMADevice that we can use container_of on to get a PCIDevice from, and DMAMmu that will get instanciated in a specific MMU. This is not complete code - just a header - I might complete this later if/when there's interest or hopefully someone interested in iommu emulation will. Hi, I personally like this approach better. It also seems to make poisoning cpu_physical_memory_*() easier if we convert every device to this API. We could then ban cpu_physical_memory_*(), perhaps by requiring a #define and #ifdef-ing those declarations. Notes: the IOMMU_PERM_RW code seem unused, so I replaced this with plain is_write. Is it ever useful? The original idea made provisions for stuff like full R/W memory maps. In that case cpu_physical_memory_map() would call the translation / checking function with perms == IOMMU_PERM_RW. That's not there yet so it can be removed at the moment, especially since it only affects these helpers. Also, I'm not sure if there are other sorts of accesses besides reads and writes we want to check or translate. It seems that invalidate callback should be able to get away with just a device, so I switched to that from a void pointer for type safety. Seems enough for the users I saw. I think this makes matters too complicated. Normally, a single DMADevice should be embedded within a busDevice, No, DMADevice is a device that does DMA. So e.g. a PCI device would embed one. Remember, traslations are per device, right? DMAMmu is part of the iommu object. so doing this makes it really hard to invalidate a specific map when there are more of them. It forces device code to act as a bus, provide fake 'DMADevice's for each map and dispatch translation to the real DMATranslateFunc. I see no other way. If you really want more type-safety (although I think this is a case of a true opaque identifying something only device code understands), I have another proposal: have a DMAMap embedded in the opaque. Example from dma-helpers.c: typedef struct { DMADevice *owner; [...] } DMAMap; typedef struct { [...] DMAMap map; [...] } DMAAIOCB; /* The callback. */ static void dma_bdrv_cancel(DMAMap *map) { DMAAIOCB *dbs = container_of(map, DMAAIOCB, map); [...] } The upside is we only need to pass the DMAMap. That can also contain details of the actual map in case the device wants to release only the relevant range and remap the rest. Fine. Or maybe DMAAIOCB (just make some letters lower case: DMAIocb?). Everyone will use it anyway, right? I saw devices do stl_le_phys and such, these might need to be wrapped as well. stl_le_phys() is defined and used only by hw/eepro100.c. That's already dealt with by converting the device. I see. Need to get around to adding some prefix to it to make this clear. Thanks, Eduard Signed-off-by: Michael S. Tsirkin m...@redhat.com --- diff --git a/hw/dma_rw.h b/hw/dma_rw.h new file mode 100644 index 000..d63fd17 --- /dev/null +++ b/hw/dma_rw.h @@ -0,0 +1,122 @@ +#ifndef DMA_RW_H +#define DMA_RW_H + +#include qemu-common.h + +/* We currently only have pci mmus, but using + a generic type makes it possible to use this + e.g. from the generic ide code without callbacks. */ +typedef uint64_t dma_addr_t; + +typedef struct DMAMmu DMAMmu; +typedef struct DMADevice DMADevice; + +typedef int DMATranslateFunc(DMAMmu *mmu, + DMADevice *dev, + dma_addr_t addr, + dma_addr_t *paddr, + dma_addr_t *len, + int is_write); + +typedef int DMAInvalidateMapFunc(DMADevice *); +struct DMAMmu { + /* invalidate, etc. */ + DmaTranslateFunc *translate; +}; + +struct DMADevice { + DMAMmu *mmu; + DMAInvalidateMapFunc *invalidate; +}; + +void dma_device_init(DMADevice *, DMAMmu *, DMAInvalidateMapFunc *); + +static inline void dma_memory_rw(DMADevice *dev, +dma_addr_t addr, +void *buf, +uint32_t len, +int is_write) +{ +uint32_t plen; +/* Fast-path non-iommu. + * More importantly, makes it obvious what this function does. */ +if (!dev-mmu) { + cpu_physical_memory_rw(paddr, buf, plen, is_write);
Re: [Qemu-devel] [PATCH RFC] dma_rw.h (was Re: [PATCH 0/7] AMD IOMMU emulation patchset v4)
On Mon, Sep 13, 2010 at 03:45:34PM -0500, Anthony Liguori wrote: On 09/13/2010 03:01 PM, Michael S. Tsirkin wrote: So I think the following will give the idea of what an API might look like that will let us avoid the scary hacks in e.g. the ide layer and other generic layers that need to do DMA, without either binding us to pci, adding more complexity with callbacks, or losing type safety with casts and void*. Basically we have DMADevice that we can use container_of on to get a PCIDevice from, and DMAMmu that will get instanciated in a specific MMU. This is not complete code - just a header - I might complete this later if/when there's interest or hopefully someone interested in iommu emulation will. Notes: the IOMMU_PERM_RW code seem unused, so I replaced this with plain is_write. Is it ever useful? It seems that invalidate callback should be able to get away with just a device, so I switched to that from a void pointer for type safety. Seems enough for the users I saw. I saw devices do stl_le_phys and such, these might need to be wrapped as well. Signed-off-by: Michael S. Tsirkinm...@redhat.com One of the troubles with an interface like this is that I'm not sure a generic model universally works. For instance, I know some PCI busses do transparent byte swapping. For this to work, there has to be a notion of generic memory reads/writes vs. reads of a 32-bit, 16-bit, and 8-bit value. With a generic API, we lose the flexibility to do this type of bus interface. Regards, Anthony Liguori Surely only PCI root can do such tricks. Anyway, I suspect what you refer to is byte swapping of config cycles and similar IO done by driver. If a bus byteswapped a DMA transaction, this basically breaks DMA as driver will have to go and fix up all data before passing it up to the OS. Right? We'd have to add more wrappers to emulate such insanity, as MMU intentionally only handles translation. --- diff --git a/hw/dma_rw.h b/hw/dma_rw.h new file mode 100644 index 000..d63fd17 --- /dev/null +++ b/hw/dma_rw.h @@ -0,0 +1,122 @@ +#ifndef DMA_RW_H +#define DMA_RW_H + +#include qemu-common.h + +/* We currently only have pci mmus, but using + a generic type makes it possible to use this + e.g. from the generic ide code without callbacks. */ +typedef uint64_t dma_addr_t; + +typedef struct DMAMmu DMAMmu; +typedef struct DMADevice DMADevice; + +typedef int DMATranslateFunc(DMAMmu *mmu, + DMADevice *dev, + dma_addr_t addr, + dma_addr_t *paddr, + dma_addr_t *len, + int is_write); + +typedef int DMAInvalidateMapFunc(DMADevice *); +struct DMAMmu { +/* invalidate, etc. */ +DmaTranslateFunc *translate; +}; + +struct DMADevice { +DMAMmu *mmu; +DMAInvalidateMapFunc *invalidate; +}; + +void dma_device_init(DMADevice *, DMAMmu *, DMAInvalidateMapFunc *); + +static inline void dma_memory_rw(DMADevice *dev, + dma_addr_t addr, + void *buf, + uint32_t len, + int is_write) +{ +uint32_t plen; +/* Fast-path non-iommu. + * More importantly, makes it obvious what this function does. */ +if (!dev-mmu) { +cpu_physical_memory_rw(paddr, buf, plen, is_write); +return; +} +while (len) { +err = dev-mmu-translate(iommu, dev, addr,paddr,plen, is_write); +if (err) { +return; +} + +/* The translation might be valid for larger regions. */ +if (plen len) { +plen = len; +} + +cpu_physical_memory_rw(paddr, buf, plen, is_write); + +len -= plen; +addr += plen; +buf += plen; +} +} + +void *dma_memory_map(DMADevice *dev, +dma_addr_t addr, +uint32_t *len, +int is_write); +void dma_memory_unmap(DMADevice *dev, + void *buffer, + uint32_t len, + int is_write, + uint32_t access_len); + + ++#define DEFINE_DMA_LD(suffix, size) \ ++uint##size##_t dma_ld##suffix(DMADevice *dev, dma_addr_t addr)\ ++{ \ ++int err; \ ++target_phys_addr_t paddr, plen; \ ++if (!dev-mmu) { \ ++return ld##suffix##_phys(addr, val); \ ++} \ ++
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 11:25:53AM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 11:10:55AM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 11:02:56AM +0200, Gleb Natapov wrote: On Wed, Sep 15, 2010 at 08:54:17PM +0200, Michael S. Tsirkin wrote: To avoid bouncing irqfd injection out to workqueue context, we must support injecting irqs from local interrupt context. Doing this seems to only require disabling irqs locally. RFC, completely untested, x86 only. Thoughts? We do not want to disable irqs for a long time and some of code paths under lock involve looping over all cpus. For MSI injection path is lockless and this is the only case that matters, MSI only appeared in rhel6, older guests still use level interrupts. So they are already slow for other reasons. They are slower than MSI, but I do not want irqfd to be slower than ioctls. Which paths require looping over all cpus? Do PCI interrupts need this? All interrupts need it. IOAPIC has a loop to find dst cpu for interrupt. Pic has a loop to find cpu in virtual wire mode. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, 2010-09-16 at 16:32 +0800, Dave Young wrote: On Thu, Sep 16, 2010 at 4:17 PM, Alec Joseph Rivera eij...@gmail.com wrote: On Wed, 2010-09-15 at 22:37 +0400, Michael Tokarev wrote: Alec Joseph Rivera wrote: [] Inode-cache hash table entries: 65536 (order: 6, 262144 bytes) Memory: 1022272k/1048564k available (1671k kernel code, 25488k reserved, 890k data, 200k init, 131064k highmem) Checking if this processor honours the WP bit even in supervisor mode... Ok. -- end Hm. There was some error with earlier kernels and kvm that resulted in exactly this behavour. Maybe even a (guest) kernel bug.. no? My memory is vague in this area, but it tells there was something... Hi Michael: I suspect has something to do with loops (re: bogomips calibration) or timers (re: calibration + pci probing). Will verify this when I get the time to read the source. Maybe someone more familiar with this kernel area can shed some light on the topic too...? :-) only if I'm not stretching too much.. boot with param lpj=xxx to avoid calibration copy xxx value from your normal boot kernel messages Hi Dave: Crossing out calibration, added the lpj parameter (and combinations with -no-hpet and -no-kvm-irqchip) but still hangs after WP bit checking. /mjt -- --- Follow me: http://twitter.com/agirivera Invite as a friend: http://www.facebook.com/agirivera -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- --- Follow me: http://twitter.com/agirivera Invite as a friend: http://www.facebook.com/agirivera -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote: On 09/16/2010 11:25 AM, Gleb Natapov wrote: MSI only appeared in rhel6, older guests still use level interrupts. So they are already slow for other reasons. Exactly, for example they need to exit to userspace to ack the interrupt. That's far slower than the workqueue. Well, this is not exactly comparable: you might get same irq asserted multiple times and only deasserted once. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 2/2] macvtap: TX zero copy between guest and host kernel
On Thu, Sep 16, 2010 at 04:18:10PM +0800, Xin, Xiaohui wrote: From: Michael S. Tsirkin [mailto:m...@redhat.com] Sent: Wednesday, September 15, 2010 5:59 PM To: Xin, Xiaohui Cc: Shirley Ma; Arnd Bergmann; Avi Kivity; David Miller; net...@vger.kernel.org; kvm@vger.kernel.org; linux-ker...@vger.kernel.org Subject: Re: [RFC PATCH 2/2] macvtap: TX zero copy between guest and host kernel On Wed, Sep 15, 2010 at 10:46:02AM +0800, Xin, Xiaohui wrote: From: Michael S. Tsirkin [mailto:m...@redhat.com] Sent: Wednesday, September 15, 2010 12:30 AM To: Shirley Ma Cc: Arnd Bergmann; Avi Kivity; Xin, Xiaohui; David Miller; net...@vger.kernel.org; kvm@vger.kernel.org; linux-ker...@vger.kernel.org Subject: Re: [RFC PATCH 2/2] macvtap: TX zero copy between guest and host kernel On Tue, Sep 14, 2010 at 09:00:25AM -0700, Shirley Ma wrote: On Tue, 2010-09-14 at 17:22 +0200, Michael S. Tsirkin wrote: I would expect this to hurt performance significantly. We could do this for asynchronous requests only to avoid the slowdown. Is kiocb in sendmsg helpful here? It is not used now. Shirley Precisely. This is what the patch from Xin Xiaohui does. That code already seems to do most of what you are trying to do, right? The main thing missing seems to be macvtap integration, so that we can fall back on data copy if zero copy is unavailable? How hard would it be to basically link the mp and macvtap modules together to get us this functionality? Anyone? Michael, Is to support macvtap with zero-copy through mp device the functionality you mentioned above? I have trouble parsing the above question. At some point Arnd suggested that the mp device functionality would fit nicely as part of the macvtap driver. It seems to make sense superficially, the advantage if it worked would be that we would get zero copy (mostly) transparently. Do you agree with this goal? I would say yes. In that case, it's a blocker for upstream merge because this change affects userspace. Before Shirley Ma has suggested to move the zero-copy functionality into tun/tap device or macvtap device. How do you think about that? I suspect there will be a lot of duplicate code in that three drivers except we can extract code of zero-copy into kernel APIs and vhost APIs. tap would be very hard at this point as it does not bind to a device. macvtap might work, we mainly need to figure out a way to detect that device can do zero copy so the right mode is used. I think a first step could be to simply link mp code into macvtap module, pass necessary ioctls on, then move some code around as necessary. This might get rid of code duplication nicely. I'll look into this to see how much effort would be. Do you think that's worth to do and help current process which is blocked too long than I expected? I think it's nice to have. And if done hopefully this will get the folk working on the macvtap driver to review the code, which will help find all issues faster. I think if you post some performance numbers, this will also help get people excited and looking at the code. The performance data I have posted before is compared with raw socket on vhost-net. But currently, the raw socket backend is removed from the qemu side. So I can only compare with tap on vhost-net. But unfortunately, I missed something that I even can't bring it up. I was blocked by this for a time. Hey, maybe you are seeing the bug that was reported recently. Could you try tcpdump -i on the tap interface in host and ethX on guest and tell me what you see? If you see packet in guest but not in host, could you try adding printks in vhost handle_tx to see whether it gets called and if yes where it fails? I also don't see the process as completely blocked, each review round points out more issues: we aren't going back and forth changing same lines again and again, are we? One thing that might help is increase the frequency of updates, try sending them out sooner. On the other hand 10 new patches each revision is a lot: if there is a part of patchset that has stabilised you can split it out, post once and keep posting the changing part separately. I hope these suggestions help. Thanks, Michael! -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, 2010-09-16 at 16:03 +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 09:56 +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 03:52:17PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 08:39 +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 09:55:38AM +0800, Alec Joseph Rivera wrote: That should give us a clue on what's going wrong. I did the steps you mentioned. Here's the gdb transcript: $ gdb k12_1009 -ex 'target remote localhost:1234' What is k12_1009? Are you sure this is vmlinux? Hi Gleb: Yes, k12_1009 is the kernel image that isolinux loads. -- isolinux.cfg DEFAULT 1 TIMEOUT 100 PROMPT 1 DISPLAY isolinux.txt LABEL 1 KERNEL k12_1009 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram LABEL 2 KERNEL k12_1009 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram BOOT_SUPPORT=1 -- file test $ file k12_1009 k12_1009: Linux kernel x86 boot executable bzImage, version 2.6.16.54-0.2.5-bigsmp (ge...@b, RO-rootFS, root_dev 0x301, swap_dev 0x1, Normal VGA This is bzImage, You need corresponding vmlinux bzImage was created from. Oh I see. Thanks for pointing that out. Unfortunately, I don't think I can get that image. Either I ask someone from Novell or IBM. I'll try crawling the system's filesystem, maybe I can find it there... Will keep you updated. No vmlinux/z image found on cd or an installed system :-( BTW what version on kvm/qemu are you using. qemu's version is 0.12.5. modinfo kvm doesn't reveal any version string though, however the kernel is 2.6.35 from Arch. Question: from what I've read so far kvm versions are of 'kvm-xx' where xx is a number, on sourceforge there is kvm-kmod-{kernel-version}, am I correct in assuming and saying then that my kvm version is 2.6.35? kvm-xx is deprecated. So when asked what kvm are you using you should provide your kernel version like you did. -- Gleb. -- --- Follow me: http://twitter.com/agirivera Invite as a friend: http://www.facebook.com/agirivera -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote: On 09/16/2010 11:25 AM, Gleb Natapov wrote: MSI only appeared in rhel6, older guests still use level interrupts. So they are already slow for other reasons. Exactly, for example they need to exit to userspace to ack the interrupt. That's far slower than the workqueue. Well, this is not exactly comparable: you might get same irq asserted multiple times and only deasserted once. Are we talking about level interrupts? Why would you assert level triggered interrupt multiple times before deasserting it? -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, Sep 16, 2010 at 06:11:36PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 16:03 +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 09:56 +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 03:52:17PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 08:39 +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 09:55:38AM +0800, Alec Joseph Rivera wrote: That should give us a clue on what's going wrong. I did the steps you mentioned. Here's the gdb transcript: $ gdb k12_1009 -ex 'target remote localhost:1234' What is k12_1009? Are you sure this is vmlinux? Hi Gleb: Yes, k12_1009 is the kernel image that isolinux loads. -- isolinux.cfg DEFAULT 1 TIMEOUT 100 PROMPT 1 DISPLAY isolinux.txt LABEL 1 KERNEL k12_1009 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram LABEL 2 KERNEL k12_1009 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram BOOT_SUPPORT=1 -- file test $ file k12_1009 k12_1009: Linux kernel x86 boot executable bzImage, version 2.6.16.54-0.2.5-bigsmp (ge...@b, RO-rootFS, root_dev 0x301, swap_dev 0x1, Normal VGA This is bzImage, You need corresponding vmlinux bzImage was created from. Oh I see. Thanks for pointing that out. Unfortunately, I don't think I can get that image. Either I ask someone from Novell or IBM. I'll try crawling the system's filesystem, maybe I can find it there... Will keep you updated. No vmlinux/z image found on cd or an installed system :-( It may be hidden somewhere in debug rpm. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote: On 09/16/2010 11:25 AM, Gleb Natapov wrote: MSI only appeared in rhel6, older guests still use level interrupts. So they are already slow for other reasons. Exactly, for example they need to exit to userspace to ack the interrupt. That's far slower than the workqueue. Well, this is not exactly comparable: you might get same irq asserted multiple times and only deasserted once. Are we talking about level interrupts? Why would you assert level triggered interrupt multiple times before deasserting it? User of irqfd has no way to know what current interrupt level is. So it has to keep asserting. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 12:13:39PM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote: On 09/16/2010 11:25 AM, Gleb Natapov wrote: MSI only appeared in rhel6, older guests still use level interrupts. So they are already slow for other reasons. Exactly, for example they need to exit to userspace to ack the interrupt. That's far slower than the workqueue. Well, this is not exactly comparable: you might get same irq asserted multiple times and only deasserted once. Are we talking about level interrupts? Why would you assert level triggered interrupt multiple times before deasserting it? User of irqfd has no way to know what current interrupt level is. So it has to keep asserting. Why can't it keep track of current level? -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
Alec Joseph Rivera wrote: On Wed, 2010-09-15 at 20:15 +0200, Alexander Graf wrote: On 15.09.2010, at 20:07, Alec Joseph Rivera wrote: On Wed, 2010-09-15 at 19:51 +0200, Alexander Graf wrote: On 15.09.2010, at 19:48, Alec Joseph Rivera wrote: On Wed, 2010-09-15 at 19:28 +0200, Alexander Graf wrote: On 15.09.2010, at 18:53, Alec Joseph Rivera wrote: Hi all: I'm Alec Joseph Rivera, from the Philippines and new on this list. Anyway, I'm trying to run Lotus Foundations on kvm and it just hangs on bootup. It stops right after: Checking if this processor honours the WP bit even in supervisor mode... Ok. There's no more output after that. My invocation line is: $ qemu-kvm -cpu host -m 1G -cdrom lfs.iso Your host is probably too new for this old guest. -cpu host directly passes through this host cpu's identifiers on which the guest might choke. Please try again without -cpu. Tried without -cpu, still hangs after the WP bit checking... $ qemu-kvm -m 1G -cdrom lfs.iso I was scanning the changelogs and read something about a -no-kernel-irqchip. The man page doesn't say anything about it but will try this one too.. Please try: $ qemu-kvm -m 1G -cdrom lfs.iso -serial stdio Then when it shows the bootloader, add console=ttyS0 to the kernel command line. That should give all the debugging output necessary. Already done this one too (I've read it from your conversations with a Peter guy if I'm not mistaken). It just stops with no panic messages right after the mentioned checking part, which made me dig after timers (bogomips calibration should be next I believe). But I've gotten nowhere so far... Also, there's no -no-kernel-irqchip parameter, must have been a typo for -no-kvm-irqchip. -- begin kernel messages Linux version 2.6.16.54-0.2.5-bigsmp (ge...@buildhost) (gcc version 4.1.2 20070115 (prerelease) (SUSE Linux)) #1 SMP Mon Jan 21 08:29:51 EST 2008 Took me quite a while to find that kernel in the graveyard :). It's a SLES10 SP1 kernel from the update repo at one specific point in time (more updates came after that one). BIOS-provided physical RAM map: BIOS-e820: - 0009f400 (usable) BIOS-e820: 0009f400 - 000a (reserved) BIOS-e820: 000f - 0010 (reserved) BIOS-e820: 0010 - 3fffd000 (usable) BIOS-e820: 3fffd000 - 4000 (reserved) BIOS-e820: fffbc000 - 0001 (reserved) 127MB HIGHMEM available. 895MB LOWMEM available. found SMP MP-table at 000f8990 DMI 2.4 present. Using APIC driver default ACPI: PM-Timer IO Port: 0xb008 ACPI: LAPIC (acpi_id[0x00] lapic_id[0x00] enabled) Processor #0 15:11 APIC version 20 ACPI: LAPIC (acpi_id[0x01] lapic_id[0x01] enabled) Processor #1 15:11 APIC version 20 Overriding APIC driver with bigsmp ACPI: IOAPIC (id[0x02] address[0xfec0] gsi_base[0]) IOAPIC[0]: apic_id 2, version 17, address 0xfec0, GSI 0-23 ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 5 global_irq 5 high level) ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 high level) ACPI: INT_SRC_OVR (bus 0 bus_irq 10 global_irq 10 high level) ACPI: INT_SRC_OVR (bus 0 bus_irq 11 global_irq 11 high level) Enabling APIC mode: Physflat. Using 1 I/O APICs ACPI: HPET id: 0x8086a201 base: 0xfed0 Using ACPI (MADT) for SMP configuration information Allocating PCI resources starting at 5000 (gap: 4000:bffbc000) Built 1 zonelists Kernel command line: ramdisk_size=32768 initrd=initrd root=/dev/ram BOOT_IMAGE=k12_1009 console=ttyS0 Enabling fast FPU save and restore... done. Enabling unmasked SIMD FPU exception support... done. Initializing CPU#0 PID hash table entries: 4096 (order: 12, 65536 bytes) Console: colour VGA+ 80x25 Dentry cache hash table entries: 131072 (order: 7, 524288 bytes) Inode-cache hash table entries: 65536 (order: 6, 262144 bytes) Memory: 1022272k/1048564k available (1671k kernel code, 25488k reserved, 890k data, 200k init, 131064k highmem) Checking if this processor honours the WP bit even in supervisor mode... Ok. Interesting. Mind to check if you find a the vmlinux binary for that kernel on the cd? Maybe it's hidden inside an rpm. Then you can use: $ qemu-kvm -s -cdrom ... $ gdb vmlinux -ex 'target remote localhost:1234' (gdb) bt That should give us a clue on what's going wrong. I did the steps you mentioned. Here's the gdb transcript: $ gdb k12_1009 -ex 'target remote localhost:1234' GNU gdb (GDB) 7.2 Copyright (C) 2010 Free Software Foundation, Inc. License GPLv3+: GNU GPL version 3 or later http://gnu.org/licenses/gpl.html This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. Type show copying and show
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 12:20:47PM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 12:13:39PM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote: On 09/16/2010 11:25 AM, Gleb Natapov wrote: MSI only appeared in rhel6, older guests still use level interrupts. So they are already slow for other reasons. Exactly, for example they need to exit to userspace to ack the interrupt. That's far slower than the workqueue. Well, this is not exactly comparable: you might get same irq asserted multiple times and only deasserted once. Are we talking about level interrupts? Why would you assert level triggered interrupt multiple times before deasserting it? User of irqfd has no way to know what current interrupt level is. So it has to keep asserting. Why can't it keep track of current level? This breaks the model: eventfd user is unaware of PCI, levels and such: it just signals the event. Remember that asserts are done from e.g. vhost-net, deasserts need to be handled by qemu. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 12:44:55PM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 12:20:47PM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 12:13:39PM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote: On 09/16/2010 11:25 AM, Gleb Natapov wrote: MSI only appeared in rhel6, older guests still use level interrupts. So they are already slow for other reasons. Exactly, for example they need to exit to userspace to ack the interrupt. That's far slower than the workqueue. Well, this is not exactly comparable: you might get same irq asserted multiple times and only deasserted once. Are we talking about level interrupts? Why would you assert level triggered interrupt multiple times before deasserting it? User of irqfd has no way to know what current interrupt level is. So it has to keep asserting. Why can't it keep track of current level? This breaks the model: eventfd user is unaware of PCI, levels and such: it just signals the event. Remember that asserts are done from e.g. vhost-net, deasserts need to be handled by qemu. eventfd user implements HW and it knows exactly what type of interrupt this HW generates. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, 2010-09-16 at 13:02 +0200, Joerg Roedel wrote: On Thu, Sep 16, 2010 at 12:27:29PM +0200, Alexander Graf wrote: Alec Joseph Rivera wrote: Program received signal SIGINT, Interrupt. 0xc015405f in ?? () (gdb) bt #0 0xc015405f in ?? () 0xc015405a __pte_alloc_kernel+82:mov(%edi),%eax 0xc015405c __pte_alloc_kernel+84:mov0x4(%edi),%edx 0xc015405f __pte_alloc_kernel+87:lock cmpxchg8b (%edi) 0xc0154063 __pte_alloc_kernel+91:jne0xc015405a __pte_alloc_kernel+82 This looks like a bug I also have seen recently on 32 bit host-kvm. The instruction emulation for 'lock cmpxchg8b' was broken so that the rip was not advanced and the guest just iterated over this instruction again and again. I thought this was fixed with the latest kvm updates in 2.6.36. Will wait for a 2.6.36 kernel then or compile a .36-rc myself on free time :-) Thanks, will update the list when either comes. Joerg -- --- Follow me: http://twitter.com/agirivera Invite as a friend: http://www.facebook.com/agirivera -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, 2010-09-16 at 12:14 +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 06:11:36PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 16:03 +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 09:56 +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 03:52:17PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 08:39 +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 09:55:38AM +0800, Alec Joseph Rivera wrote: That should give us a clue on what's going wrong. I did the steps you mentioned. Here's the gdb transcript: $ gdb k12_1009 -ex 'target remote localhost:1234' What is k12_1009? Are you sure this is vmlinux? Hi Gleb: Yes, k12_1009 is the kernel image that isolinux loads. -- isolinux.cfg DEFAULT 1 TIMEOUT 100 PROMPT 1 DISPLAY isolinux.txt LABEL 1 KERNEL k12_1009 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram LABEL 2 KERNEL k12_1009 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram BOOT_SUPPORT=1 -- file test $ file k12_1009 k12_1009: Linux kernel x86 boot executable bzImage, version 2.6.16.54-0.2.5-bigsmp (ge...@b, RO-rootFS, root_dev 0x301, swap_dev 0x1, Normal VGA This is bzImage, You need corresponding vmlinux bzImage was created from. Oh I see. Thanks for pointing that out. Unfortunately, I don't think I can get that image. Either I ask someone from Novell or IBM. I'll try crawling the system's filesystem, maybe I can find it there... Will keep you updated. No vmlinux/z image found on cd or an installed system :-( It may be hidden somewhere in debug rpm. Sadly, Lotus Foundations doesn't supply the debug rpms :-( -- Gleb. -- --- Follow me: http://twitter.com/agirivera Invite as a friend: http://www.facebook.com/agirivera -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] dma_rw.h (was Re: [PATCH 0/7] AMD IOMMU emulation patchset v4)
On Thu, Sep 16, 2010 at 11:20:43AM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 10:06:16AM +0300, Eduard - Gabriel Munteanu wrote: [snip] No, DMADevice is a device that does DMA. So e.g. a PCI device would embed one. Remember, traslations are per device, right? DMAMmu is part of the iommu object. I agree, all I'm saying is a DMADevice is insufficient to invalidate one of the many maps a given device holds, at least without resorting to horrible tricks or destroying them all. so doing this makes it really hard to invalidate a specific map when there are more of them. It forces device code to act as a bus, provide fake 'DMADevice's for each map and dispatch translation to the real DMATranslateFunc. I see no other way. If you really want more type-safety (although I think this is a case of a true opaque identifying something only device code understands), I have another proposal: have a DMAMap embedded in the opaque. Example from dma-helpers.c: typedef struct { DMADevice *owner; [...] } DMAMap; typedef struct { [...] DMAMap map; [...] } DMAAIOCB; /* The callback. */ static void dma_bdrv_cancel(DMAMap *map) { DMAAIOCB *dbs = container_of(map, DMAAIOCB, map); [...] } The upside is we only need to pass the DMAMap. That can also contain details of the actual map in case the device wants to release only the relevant range and remap the rest. Fine. Or maybe DMAAIOCB (just make some letters lower case: DMAIocb?). Everyone will use it anyway, right? No, you misunderstood me. DMAAIOCB is already there in dma-helpers.c, it's the opaque I used and it was already there before my patches. The idea was to define DMAMap and embed it in DMAAIOCB, so we can upcast from the former to the latter (which is what we actually need). IDE DMA uses that, but other devices would use whatever they want, even nothing except the DMAMap. The only requirement is that the container struct allows device code to acknowledge the invalidation (in case of AIO, we simply kill that thread and release resources). Well, perhaps DMAMap isn't the best name, but you get the idea. I saw devices do stl_le_phys and such, these might need to be wrapped as well. stl_le_phys() is defined and used only by hw/eepro100.c. That's already dealt with by converting the device. I see. Need to get around to adding some prefix to it to make this clear. Thanks, Eduard [snip] Eduard -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 12:53:52PM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 12:54:03PM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 12:44:55PM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 12:20:47PM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 12:13:39PM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote: On 09/16/2010 11:25 AM, Gleb Natapov wrote: MSI only appeared in rhel6, older guests still use level interrupts. So they are already slow for other reasons. Exactly, for example they need to exit to userspace to ack the interrupt. That's far slower than the workqueue. Well, this is not exactly comparable: you might get same irq asserted multiple times and only deasserted once. Are we talking about level interrupts? Why would you assert level triggered interrupt multiple times before deasserting it? User of irqfd has no way to know what current interrupt level is. So it has to keep asserting. Why can't it keep track of current level? This breaks the model: eventfd user is unaware of PCI, levels and such: it just signals the event. Remember that asserts are done from e.g. vhost-net, deasserts need to be handled by qemu. eventfd user implements HW and it knows exactly what type of interrupt this HW generates. We haver two users: qemu does deasserts, vhost-net does asserts. Well this is broken. You want KVM to track level for you and this is wrong. KVM does this anyway because it can't relay on devise model to behave correctly [0], but in your case it is designed to behave incorrectly. Interrupt type is a device property. PCI devices just happen to be level triggered according to PCI spec. What if you want to use vhost-net to implement network device which has active-low interrupt line? [1] If you want to split parts that asserts irq and de-asserts it then we should have irqfd that tracks line status and knows interrupt line polarity. Another application is out of process virtio (sandboxing!). It will still assert and de-assert irq at the same code, so it will be able to track irq line status. Again, pci stuff needs to stay in qemu. Nothing to do with PCI whatsoever. [0] most qemu devices behave incorrectly and trigger level irq more then needed. [1] this is how correct PCI device should behave but we override polarity in ACPI, but now incorrect behaviour is deeply designed into vhost-net. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, Sep 16, 2010 at 07:13:03PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 12:14 +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 06:11:36PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 16:03 +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 09:56 +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 03:52:17PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 08:39 +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 09:55:38AM +0800, Alec Joseph Rivera wrote: That should give us a clue on what's going wrong. I did the steps you mentioned. Here's the gdb transcript: $ gdb k12_1009 -ex 'target remote localhost:1234' What is k12_1009? Are you sure this is vmlinux? Hi Gleb: Yes, k12_1009 is the kernel image that isolinux loads. -- isolinux.cfg DEFAULT 1 TIMEOUT 100 PROMPT 1 DISPLAY isolinux.txt LABEL 1 KERNEL k12_1009 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram LABEL 2 KERNEL k12_1009 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram BOOT_SUPPORT=1 -- file test $ file k12_1009 k12_1009: Linux kernel x86 boot executable bzImage, version 2.6.16.54-0.2.5-bigsmp (ge...@b, RO-rootFS, root_dev 0x301, swap_dev 0x1, Normal VGA This is bzImage, You need corresponding vmlinux bzImage was created from. Oh I see. Thanks for pointing that out. Unfortunately, I don't think I can get that image. Either I ask someone from Novell or IBM. I'll try crawling the system's filesystem, maybe I can find it there... Will keep you updated. No vmlinux/z image found on cd or an installed system :-( It may be hidden somewhere in debug rpm. Sadly, Lotus Foundations doesn't supply the debug rpms :-( Looks like Alexander already found relevant rpm. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, Sep 16, 2010 at 07:11:17PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 13:02 +0200, Joerg Roedel wrote: Will wait for a 2.6.36 kernel then or compile a .36-rc myself on free time :-) Thanks, will update the list when either comes. Just checked, 2.6.36-rc4 contains the fix for the cmpxchg8b problem on 32 bit hosts. Joerg -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
16.09.2010 15:32, Joerg Roedel пишет: On Thu, Sep 16, 2010 at 07:11:17PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 13:02 +0200, Joerg Roedel wrote: Will wait for a 2.6.36 kernel then or compile a .36-rc myself on free time :-) Thanks, will update the list when either comes. Just checked, 2.6.36-rc4 contains the fix for the cmpxchg8b problem on 32 bit hosts. What commit it is? I swear I saw it in 2.6.32-stable somewhere. Is it this one (2.6.32.12): From: Gleb Natapov g...@redhat.com Date: Fri, 19 Mar 2010 15:47:31 +0100 Subject: KVM: x86 emulator: fix memory access during x86 emulation To: sta...@kernel.org Cc: Marcelo Tosatti mtosa...@redhat.com, Avi Kivity a...@redhat.com, Gleb Natapov g...@redhat.com Message-ID: 1269010059-25309-4-git-send-email-stefan.ba...@canonical.com From: Gleb Natapov g...@redhat.com commit 1871c6020d7308afb99127bba51f04548e7ca84e upstream Currently when x86 emulator needs to access memory, page walk is done with broadest permission possible, so if emulated instruction was executed by userspace process it can still access kernel memory. Fix that by providing correct memory access to page walker during emulation. /mjt -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 2/2] kvm/e500v2: mapping guest TLB1 to host TLB0
-Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, September 10, 2010 7:24 AM To: Liu Yu-B13201 Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org Subject: Re: [PATCH 2/2] kvm/e500v2: mapping guest TLB1 to host TLB0 On 08.09.2010, at 11:40, Liu Yu wrote: Current guest TLB1 is mapped to host TLB1. As host kernel only provides 4K uncontinuous pages, we have to break guest large mapping into 4K shadow mappings. These 4K shadow mappings are then mapped into host TLB1 on fly. As host TLB1 only has 13 free entries, there's serious tlb miss. Since e500v2 has a big number of TLB0 entries, it should be help to map those 4K shadow mappings to host TLB0. To achieve this, we need to unlink guest tlb and host tlb, So that guest TLB1 mappings can route to any host TLB0 entries freely. Pages/mappings are considerred in the same kind as host tlb entry. This patch remove the link between pages and guest tlb entry to do the unlink. And keep host_tlb0_ref in each vcpu to trace pages. Then it's easy to map guest TLB1 to host TLB0. In guest ramdisk boot test(guest mainly uses TLB1), with this patch, the tlb miss number get down 90%. Signed-off-by: Liu Yu yu@freescale.com --- arch/powerpc/include/asm/kvm_e500.h |7 +- arch/powerpc/kvm/e500.c |4 + arch/powerpc/kvm/e500_tlb.c | 280 --- arch/powerpc/kvm/e500_tlb.h |1 + 4 files changed, 104 insertions(+), 188 deletions(-) -static unsigned int tlb1_entry_num; +static inline unsigned int get_tlb0_entry_offset(u32 eaddr, u32 esel) +{ + return ((eaddr 0x7F000) (12 - host_tlb0_assoc_bit) | + (esel (host_tlb0_assoc - 1))); +} void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) { @@ -62,11 +68,6 @@ static inline unsigned int tlb0_get_next_victim( return victim; } -static inline unsigned int tlb1_max_shadow_size(void) -{ - return tlb1_entry_num - tlbcam_index; -} - static inline int tlbe_is_writable(struct tlbe *tlbe) { return tlbe-mas3 (MAS3_SW|MAS3_UW); @@ -100,7 +101,7 @@ static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) /* * writing shadow tlb entry to host TLB */ -static inline void __write_host_tlbe(struct tlbe *stlbe) +static inline void __host_tlbe_write(struct tlbe *stlbe) { mtspr(SPRN_MAS1, stlbe-mas1); mtspr(SPRN_MAS2, stlbe-mas2); @@ -109,25 +110,22 @@ static inline void __write_host_tlbe(struct tlbe *stlbe) __asm__ __volatile__ (tlbwe\n : : ); } -static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel, struct tlbe *stlbe) +static inline u32 host_tlb0_write(struct kvmppc_vcpu_e500 *vcpu_e500, + u32 gvaddr, struct tlbe *stlbe) { - local_irq_disable(); - if (tlbsel == 0) { - __write_host_tlbe(stlbe); - } else { - unsigned register mas0; + unsigned register mas0; - mas0 = mfspr(SPRN_MAS0); + local_irq_disable(); Do you have to disable interrupts for a tlb write? If you get preempted before the write, the entry you overwrite could be different. But you don't protect against that either way. And if you get preempted afterwards, you could lose the entry. But since you enable interrupts beyond that, that could happen either way too. So what's the reason for the disable here? Hello Alex, Doesn't local_irq_disable() also disable preempt? The reason to disable interrupts is because it's possible to have tlb misses in interrupt service route. Thanks, Yu -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, 2010-09-16 at 13:32 +0200, Joerg Roedel wrote: On Thu, Sep 16, 2010 at 07:11:17PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 13:02 +0200, Joerg Roedel wrote: Will wait for a 2.6.36 kernel then or compile a .36-rc myself on free time :-) Thanks, will update the list when either comes. Just checked, 2.6.36-rc4 contains the fix for the cmpxchg8b problem on 32 bit hosts. Nice :-) woot woot... Will definite compile a test kernel asap. Joerg -- --- Follow me: http://twitter.com/agirivera Invite as a friend: http://www.facebook.com/agirivera -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, Sep 16, 2010 at 03:35:17PM +0400, Michael Tokarev wrote: 16.09.2010 15:32, Joerg Roedel пишет: On Thu, Sep 16, 2010 at 07:11:17PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 13:02 +0200, Joerg Roedel wrote: Will wait for a 2.6.36 kernel then or compile a .36-rc myself on free time :-) Thanks, will update the list when either comes. Just checked, 2.6.36-rc4 contains the fix for the cmpxchg8b problem on 32 bit hosts. What commit it is? I swear I saw it in 2.6.32-stable somewhere. Is it this one (2.6.32.12): it is 16518d5ada690643453eb0aef3cc7841d3623c2d, but the bug shouldn't be present in 2.6.32. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, Sep 16, 2010 at 07:37:55PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 13:32 +0200, Joerg Roedel wrote: On Thu, Sep 16, 2010 at 07:11:17PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 13:02 +0200, Joerg Roedel wrote: Will wait for a 2.6.36 kernel then or compile a .36-rc myself on free time :-) Thanks, will update the list when either comes. Just checked, 2.6.36-rc4 contains the fix for the cmpxchg8b problem on 32 bit hosts. Nice :-) woot woot... Will definite compile a test kernel asap. So your host is 32bit? -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, Sep 16, 2010 at 03:35:17PM +0400, Michael Tokarev wrote: 16.09.2010 15:32, Joerg Roedel пишет: On Thu, Sep 16, 2010 at 07:11:17PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 13:02 +0200, Joerg Roedel wrote: Will wait for a 2.6.36 kernel then or compile a .36-rc myself on free time :-) Thanks, will update the list when either comes. Just checked, 2.6.36-rc4 contains the fix for the cmpxchg8b problem on 32 bit hosts. What commit it is? I swear I saw it in 2.6.32-stable somewhere. Is it this one (2.6.32.12): It is 16518d5ada690643453eb0aef3cc7841d3623c2d in latest linus/master branch. It was merged between -rc3 and -rc4. Joerg -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] kvm/e500v2: mapping guest TLB1 to host TLB0
Liu Yu-B13201 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, September 10, 2010 7:24 AM To: Liu Yu-B13201 Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org Subject: Re: [PATCH 2/2] kvm/e500v2: mapping guest TLB1 to host TLB0 On 08.09.2010, at 11:40, Liu Yu wrote: Current guest TLB1 is mapped to host TLB1. As host kernel only provides 4K uncontinuous pages, we have to break guest large mapping into 4K shadow mappings. These 4K shadow mappings are then mapped into host TLB1 on fly. As host TLB1 only has 13 free entries, there's serious tlb miss. Since e500v2 has a big number of TLB0 entries, it should be help to map those 4K shadow mappings to host TLB0. To achieve this, we need to unlink guest tlb and host tlb, So that guest TLB1 mappings can route to any host TLB0 entries freely. Pages/mappings are considerred in the same kind as host tlb entry. This patch remove the link between pages and guest tlb entry to do the unlink. And keep host_tlb0_ref in each vcpu to trace pages. Then it's easy to map guest TLB1 to host TLB0. In guest ramdisk boot test(guest mainly uses TLB1), with this patch, the tlb miss number get down 90%. Signed-off-by: Liu Yu yu@freescale.com --- arch/powerpc/include/asm/kvm_e500.h |7 +- arch/powerpc/kvm/e500.c |4 + arch/powerpc/kvm/e500_tlb.c | 280 --- arch/powerpc/kvm/e500_tlb.h |1 + 4 files changed, 104 insertions(+), 188 deletions(-) -static unsigned int tlb1_entry_num; +static inline unsigned int get_tlb0_entry_offset(u32 eaddr, u32 esel) +{ + return ((eaddr 0x7F000) (12 - host_tlb0_assoc_bit) | + (esel (host_tlb0_assoc - 1))); +} void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) { @@ -62,11 +68,6 @@ static inline unsigned int tlb0_get_next_victim( return victim; } -static inline unsigned int tlb1_max_shadow_size(void) -{ - return tlb1_entry_num - tlbcam_index; -} - static inline int tlbe_is_writable(struct tlbe *tlbe) { return tlbe-mas3 (MAS3_SW|MAS3_UW); @@ -100,7 +101,7 @@ static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) /* * writing shadow tlb entry to host TLB */ -static inline void __write_host_tlbe(struct tlbe *stlbe) +static inline void __host_tlbe_write(struct tlbe *stlbe) { mtspr(SPRN_MAS1, stlbe-mas1); mtspr(SPRN_MAS2, stlbe-mas2); @@ -109,25 +110,22 @@ static inline void __write_host_tlbe(struct tlbe *stlbe) __asm__ __volatile__ (tlbwe\n : : ); } -static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel, struct tlbe *stlbe) +static inline u32 host_tlb0_write(struct kvmppc_vcpu_e500 *vcpu_e500, + u32 gvaddr, struct tlbe *stlbe) { - local_irq_disable(); - if (tlbsel == 0) { - __write_host_tlbe(stlbe); - } else { - unsigned register mas0; + unsigned register mas0; - mas0 = mfspr(SPRN_MAS0); + local_irq_disable(); Do you have to disable interrupts for a tlb write? If you get preempted before the write, the entry you overwrite could be different. But you don't protect against that either way. And if you get preempted afterwards, you could lose the entry. But since you enable interrupts beyond that, that could happen either way too. So what's the reason for the disable here? Hello Alex, Doesn't local_irq_disable() also disable preempt? The reason to disable interrupts is because it's possible to have tlb misses in interrupt service route. Yes, local_irq_disable disables preempt. That's exactly what I was referring to :). I don't understand the statement about the service route. A tlb miss still arrives with local_irq_disable. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.16 Guest Hangs on Boot
On Thu, 2010-09-16 at 13:41 +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 07:37:55PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 13:32 +0200, Joerg Roedel wrote: On Thu, Sep 16, 2010 at 07:11:17PM +0800, Alec Joseph Rivera wrote: On Thu, 2010-09-16 at 13:02 +0200, Joerg Roedel wrote: Will wait for a 2.6.36 kernel then or compile a .36-rc myself on free time :-) Thanks, will update the list when either comes. Just checked, 2.6.36-rc4 contains the fix for the cmpxchg8b problem on 32 bit hosts. Nice :-) woot woot... Will definite compile a test kernel asap. So your host is 32bit? Yes. It's Arch-based. I initially thought of installing a 64-bit kernel when I saw the guest host status on kvm's home, but programs are compiled 32-bit. I was put off thinking about binary compatibility. Haven't tested such since my Debian days... -- Gleb. -- --- Follow me: http://twitter.com/agirivera Invite as a friend: http://www.facebook.com/agirivera -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: .img on nfs, relative on ram, consuming mass ram
Ok, thanks for taking time. I'll dig into your answers. So as i run relative.img on diskless systems with original.img on nfs, what are the best practice/tips i can use ? Is ramfs more suitable than tmpfs ? Fred. On Thu, 16 Sep 2010 11:09:49 +0200 Andre Przywara andre.przyw...@amd.com wrote: TOURNIER Frédéric wrote: Hi ! Here's my config : Version : qemu-kvm-0.12.5, qemu-kvm-0.12.4 Hosts : AMD 64X2, Phenom and Core 2 duo OS : Slackware 64 13.0 Kernel : 2.6.35.4 and many previous versions I use a PXE server to boot semi-diskless (swap partitions and some local stuff) stations. This server also serves a read-only nfs folder, with plenty of .img on it. When clients connects, a relative image is created in /tmp, which is a tmpfs, so hosted in ram. And here i go on my 2G stations : qemu-system-x86_64 -m 1024 -vga std -usb -usbdevice tablet -localtime -soundhw es1370 /tmp/relimg.img qemu-system-x86_64 -m 1024 -vga std -usb -usbdevice tablet -localtime -soundhw es1370 /dev/shm/relimg.img I tried both. Always the same result : the ram is consumed quickly, and mass swap occurs. Which is only natural, as tmpfs is promising to never swap. So it will take precedence over other RAM (that's why it is limited to half of the memory by default). As soon as the guest has (re)written more disk sectors than your free RAM can hold, the system will start to swap out your guest RAM (and other host applications). So in general you should avoid putting relative disk images to tmpfs if your host memory is limited. As a workaround you could try to further limit the tmpfs max size (mount -t tmpfs -o size=512M none /dev/shm), but this could lead to data loss in your guest as it possibly cannot back the written sectors anymore. On a 4G system, i see kvm uses more than 1024, maybe 1200. And everytime a launch a program inside the vm, the amount of the host free ram (not cached) diminishes, which is weird, because it should have been reserved. KVM uses on-demand paging like other applications. So it will not reserve memory for your guest (unless you use hugetlbfs's -mempath): $ kvm -cdrom ttylinux_ser.iso -nographic -m 3072M $ top PID USER PR NI VIRT RES SHR S %CPU %MEMTIME+ COMMAND 6015 user 20 0 3205m 128m 3020 S2 2.2 0:04.94 kvm Regards, Andre. So on a 2G system, swap occurs very fast and the machine slow a lot down. An on a total diskless system, this leads fast to a freeze. I have no problem if i use a relative image on disk : qemu-system-x86_64 -m 1024 -vga std -usb -usbdevice tablet -localtime -soundhw es1370 -drive file=/mnt/hd/sda/sda1/tmp/relimg.img,cache=none -- Andre Przywara AMD-Operating System Research Center (OSRC), Dresden, Germany Tel: +49 351 448-3567-12 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 02:13:38PM +0200, Michael S. Tsirkin wrote: We haver two users: qemu does deasserts, vhost-net does asserts. Well this is broken. You want KVM to track level for you and this is wrong. KVM does this anyway because it can't relay on devise model to behave correctly [0], but in your case it is designed to behave incorrectly. Interrupt type is a device property. PCI devices just happen to be level triggered according to PCI spec. What if you want to use vhost-net to implement network device which has active-low interrupt line? [1] The polarity would have to be reversed in gsi (irq line can be shared, all devices must be active high or low consistently). There are gsi dedicated to PCI. They can be shared only between PCI devices. If you want to split parts that asserts irq and de-asserts it then we should have irqfd that tracks line status and knows interrupt line polarity. Yes, it can know about polarity even though I think it's cleaner to do this per gsi. But it can not track line status as line is shared with other devices. It should track only device's line status. Another application is out of process virtio (sandboxing!). It will still assert and de-assert irq at the same code, so it will be able to track irq line status. Again, pci stuff needs to stay in qemu. Nothing to do with PCI whatsoever. [0] most qemu devices behave incorrectly and trigger level irq more then needed. Which devices? Most of them. They just call update_irq_status() or something and re-assert interrupt regardless of what previous status was. pci core tracks line status and will never assert the same line multiple times. That's good if pci core does this, but device shouldn't even try it. [1] this is how correct PCI device should behave but we override polarity in ACPI, but now incorrect behaviour is deeply designed into vhost-net. Not really, vhost net signals an eventfd. What happens then is up to kvm. That is what current broken design does and it works, but if you want to save unneeded calls into kvm fix design. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 02:33:01PM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 02:13:38PM +0200, Michael S. Tsirkin wrote: We haver two users: qemu does deasserts, vhost-net does asserts. Well this is broken. You want KVM to track level for you and this is wrong. KVM does this anyway because it can't relay on devise model to behave correctly [0], but in your case it is designed to behave incorrectly. Interrupt type is a device property. PCI devices just happen to be level triggered according to PCI spec. What if you want to use vhost-net to implement network device which has active-low interrupt line? [1] The polarity would have to be reversed in gsi (irq line can be shared, all devices must be active high or low consistently). There are gsi dedicated to PCI. They can be shared only between PCI devices. If you want to split parts that asserts irq and de-asserts it then we should have irqfd that tracks line status and knows interrupt line polarity. Yes, it can know about polarity even though I think it's cleaner to do this per gsi. But it can not track line status as line is shared with other devices. It should track only device's line status. There is no such thing as device's line status on real hardware, either. Devices do not drive INT# high: they drive it low (all the time) or do not drive it at all. Or consider express, the spec explicitly says: Note: Duplicate Assert_INTx/Deassert_INTx Messages have no effect, but are not errors. Another application is out of process virtio (sandboxing!). It will still assert and de-assert irq at the same code, so it will be able to track irq line status. Again, pci stuff needs to stay in qemu. Nothing to do with PCI whatsoever. [0] most qemu devices behave incorrectly and trigger level irq more then needed. Which devices? Most of them. They just call update_irq_status() or something and re-assert interrupt regardless of what previous status was. At least for PCI devices, these calls do nothing if status does not change. pci core tracks line status and will never assert the same line multiple times. That's good if pci core does this, but device shouldn't even try it. I disagree. We don't want to duplicate a ton of code all over the codebase. [1] this is how correct PCI device should behave but we override polarity in ACPI, but now incorrect behaviour is deeply designed into vhost-net. Not really, vhost net signals an eventfd. What happens then is up to kvm. That is what current broken design does and it works, but if you want to save unneeded calls into kvm fix design. The interface seems clean enough: vhost handles virtio ring, qemu/kvm handle pci. Making vhost aware of pci breaks this, I would not call that fixing the design. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On 09/16/2010 02:57 PM, Michael S. Tsirkin wrote: If you want to split parts that asserts irq and de-asserts it then we should have irqfd that tracks line status and knows interrupt line polarity. Yes, it can know about polarity even though I think it's cleaner to do this per gsi. But it can not track line status as line is shared with other devices. It should track only device's line status. There is no such thing as device's line status on real hardware, either. Devices do not drive INT# high: they drive it low (all the time) or do not drive it at all. That's just an implementation detail. Devices either assert INT# or they do not. Tying the wires together constitutes an AND gate. This gate has to be modelled somewhere, currently it's in qemu's pci emulation. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 02:57:17PM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 02:33:01PM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 02:13:38PM +0200, Michael S. Tsirkin wrote: We haver two users: qemu does deasserts, vhost-net does asserts. Well this is broken. You want KVM to track level for you and this is wrong. KVM does this anyway because it can't relay on devise model to behave correctly [0], but in your case it is designed to behave incorrectly. Interrupt type is a device property. PCI devices just happen to be level triggered according to PCI spec. What if you want to use vhost-net to implement network device which has active-low interrupt line? [1] The polarity would have to be reversed in gsi (irq line can be shared, all devices must be active high or low consistently). There are gsi dedicated to PCI. They can be shared only between PCI devices. If you want to split parts that asserts irq and de-asserts it then we should have irqfd that tracks line status and knows interrupt line polarity. Yes, it can know about polarity even though I think it's cleaner to do this per gsi. But it can not track line status as line is shared with other devices. It should track only device's line status. There is no such thing as device's line status on real hardware, either. Devices do not drive INT# high: they drive it low (all the time) or do not drive it at all. Same thing, other naming. Device either drive it low (irq_set(1)) or not (irq_set(0)). Or consider express, the spec explicitly says: Note: Duplicate Assert_INTx/Deassert_INTx Messages have no effect, but are not errors. Another application is out of process virtio (sandboxing!). It will still assert and de-assert irq at the same code, so it will be able to track irq line status. Again, pci stuff needs to stay in qemu. Nothing to do with PCI whatsoever. [0] most qemu devices behave incorrectly and trigger level irq more then needed. Which devices? Most of them. They just call update_irq_status() or something and re-assert interrupt regardless of what previous status was. At least for PCI devices, these calls do nothing if status does not change. I am not sure what code are you locking at. e1000 device emulation doesn't check previous line status before calling qemu_set_irq(). pci core tracks line status and will never assert the same line multiple times. That's good if pci core does this, but device shouldn't even try it. I disagree. We don't want to duplicate a ton of code all over the codebase. So abstract it into a function. It shouldn't be part of PCI emulation. [1] this is how correct PCI device should behave but we override polarity in ACPI, but now incorrect behaviour is deeply designed into vhost-net. Not really, vhost net signals an eventfd. What happens then is up to kvm. That is what current broken design does and it works, but if you want to save unneeded calls into kvm fix design. The interface seems clean enough: vhost handles virtio ring, qemu/kvm handle pci. Making vhost aware of pci breaks this, I would not call that fixing the design. Once again. Nothing to do with PCI, everything to do with device emulation. And I propose to abstract interrupt assertion part into irqfd, not into vhost. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: .img on nfs, relative on ram, consuming mass ram
2010/9/16 Andre Przywara andre.przyw...@amd.com: TOURNIER Frédéric wrote: Ok, thanks for taking time. I'll dig into your answers. So as i run relative.img on diskless systems with original.img on nfs, what are the best practice/tips i can use ? I thinks it is -snapshot you are looking for. This will put the backing store into normal RAM, and you can later commit it to the original image if needed. See the qemu manpage for more details. In a nutshell you just specify the original image and add -snapshot to the command line. -snapshot creates a temporary qcow2 image in /tmp whose backing file is your original image. I'm not sure what you mean by This will put the backing store into normal RAM? Stefan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 0/6] Nonatomic interrupt injection
On 08/30/2010 02:36 PM, Avi Kivity wrote: This patchset changes interrupt injection to be done from normal process context instead of interrupts disabled context. This is useful for real mode interrupt injection on Intel without the current hacks (injecting as a software interrupt of a vm86 task), reducing latencies, and later, for allowing nested virtualization code to use kvm_read_guest()/kvm_write_guest() instead of kmap() to access the guest vmcb/vmcs. Seems to survive a hack that cancels every 16th entry, after injection has already taken place. With the PIC reset fix posted earlier, this passes autotest on both AMD and Intel, with in-kernel irqchip. I'll run -no-kvm-irqchip tests shortly. Please review carefully, esp. the first patch. Any missing kvm_make_request() there may result in a hung guest. This is now merged, with the change pointed out by Marcelo. Windows XP x64 fails installation without (vmx.c handle_cr()) case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); u8 cr8 = kvm_register_read(vcpu, reg); kvm_set_cr8(vcpu, cr8); skip_emulated_instruction(vcpu); if (irqchip_in_kernel(vcpu-kvm)) return 1; -if (cr8_prev = cr8) -return 1; vcpu-run-exit_reason = KVM_EXIT_SET_TPR; return 0; } Which doesn't make any sense (anyone?). The failure is present even without the patchset, and is fixed by the same hack, so a regression was not introduced. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 03:14:11PM +0200, Avi Kivity wrote: On 09/16/2010 02:57 PM, Michael S. Tsirkin wrote: If you want to split parts that asserts irq and de-asserts it then we should have irqfd that tracks line status and knows interrupt line polarity. Yes, it can know about polarity even though I think it's cleaner to do this per gsi. But it can not track line status as line is shared with other devices. It should track only device's line status. There is no such thing as device's line status on real hardware, either. Devices do not drive INT# high: they drive it low (all the time) or do not drive it at all. That's just an implementation detail. Devices either assert INT# or they do not. Tying the wires together constitutes an AND gate. This gate has to be modelled somewhere, currently it's in qemu's pci emulation. Right. kvm in kernel has this as well, we need to keep this in kvm kernel if we want to support level with irqfd. Where it does not belong is individual devices: these should be able to assert INTx multiple times and it should have no effect, as per spec. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On 09/16/2010 03:38 PM, Michael S. Tsirkin wrote: That's just an implementation detail. Devices either assert INT# or they do not. Tying the wires together constitutes an AND gate. This gate has to be modelled somewhere, currently it's in qemu's pci emulation. Right. kvm in kernel has this as well, we need to keep this in kvm kernel if we want to support level with irqfd. Right, we added the irq_source_id hack for device assignment. What's wrong with letting userspace mediate this, though? Where it does not belong is individual devices: these should be able to assert INTx multiple times and it should have no effect, as per spec. Yes. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: .img on nfs, relative on ram, consuming mass ram
Stefan Hajnoczi wrote: 2010/9/16 Andre Przywara andre.przyw...@amd.com: TOURNIER Frédéric wrote: Ok, thanks for taking time. I'll dig into your answers. So as i run relative.img on diskless systems with original.img on nfs, what are the best practice/tips i can use ? I thinks it is -snapshot you are looking for. This will put the backing store into normal RAM, and you can later commit it to the original image if needed. See the qemu manpage for more details. In a nutshell you just specify the original image and add -snapshot to the command line. -snapshot creates a temporary qcow2 image in /tmp whose backing file is your original image. I'm not sure what you mean by This will put the backing store into normal RAM? Stefan, you are right. I never looked into the code and because the file in /tmp is deleted just after creation there wasn't a sign of it. For some reason I thought that the buffer would just be allocated in memory. Sorry, my mistake and thanks for pointing this out. So Fred, unfortunately this does not solve your problem. I guess you run into a general problem. If the guest actually changes so much of the disk that this cannot be backed by RAM in the host, you have lost. One solution could be to just make (at least parts of) the disk read-only (a write protected /usr partition works quite well). If you are sure that writes are not that frequent, you could think of putting the overlay file also on the remote storage (NFS). Although this is rather slow, it shouldn't matter if there aren't many writes and the local page cache should catch most of the accesses (while still being nice to other RAM users). Regards, Andre. Stefan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- Andre Przywara AMD-Operating System Research Center (OSRC), Dresden, Germany Tel: +49 351 448-3567-12 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 03:38:28PM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 03:14:11PM +0200, Avi Kivity wrote: On 09/16/2010 02:57 PM, Michael S. Tsirkin wrote: If you want to split parts that asserts irq and de-asserts it then we should have irqfd that tracks line status and knows interrupt line polarity. Yes, it can know about polarity even though I think it's cleaner to do this per gsi. But it can not track line status as line is shared with other devices. It should track only device's line status. There is no such thing as device's line status on real hardware, either. Devices do not drive INT# high: they drive it low (all the time) or do not drive it at all. That's just an implementation detail. Devices either assert INT# or they do not. Tying the wires together constitutes an AND gate. This gate has to be modelled somewhere, currently it's in qemu's pci emulation. Right. kvm in kernel has this as well, we need to keep this in kvm kernel if we want to support level with irqfd. Where it does not belong is individual devices: these should be able to assert INTx multiple times and it should have no effect, as per spec. Assert_INTx/Deassert_INTx you mentioned are internal PCI thing. What KVM sees logically is status of the line between pci controller and irq chip. We do not emulate PCI inside kernel, but I agree that kernel should handle multiple asserts without de-assert in the middle and, in fact, it does. But the thread started with you trying to optimize this non-optimal device behaviour and I am saying that the fix should be elsewhere. Namely in irqfd. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 03:18:23PM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 02:57:17PM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 02:33:01PM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 02:13:38PM +0200, Michael S. Tsirkin wrote: We haver two users: qemu does deasserts, vhost-net does asserts. Well this is broken. You want KVM to track level for you and this is wrong. KVM does this anyway because it can't relay on devise model to behave correctly [0], but in your case it is designed to behave incorrectly. Interrupt type is a device property. PCI devices just happen to be level triggered according to PCI spec. What if you want to use vhost-net to implement network device which has active-low interrupt line? [1] The polarity would have to be reversed in gsi (irq line can be shared, all devices must be active high or low consistently). There are gsi dedicated to PCI. They can be shared only between PCI devices. If you want to split parts that asserts irq and de-asserts it then we should have irqfd that tracks line status and knows interrupt line polarity. Yes, it can know about polarity even though I think it's cleaner to do this per gsi. But it can not track line status as line is shared with other devices. It should track only device's line status. There is no such thing as device's line status on real hardware, either. Devices do not drive INT# high: they drive it low (all the time) or do not drive it at all. Same thing, other naming. Device either drive it low (irq_set(1)) or not (irq_set(0)). Well, if I drive it low any number of times it should hae no effect. Or consider express, the spec explicitly says: Note: Duplicate Assert_INTx/Deassert_INTx Messages have no effect, but are not errors. Another application is out of process virtio (sandboxing!). It will still assert and de-assert irq at the same code, so it will be able to track irq line status. Again, pci stuff needs to stay in qemu. Nothing to do with PCI whatsoever. [0] most qemu devices behave incorrectly and trigger level irq more then needed. Which devices? Most of them. They just call update_irq_status() or something and re-assert interrupt regardless of what previous status was. At least for PCI devices, these calls do nothing if status does not change. I am not sure what code are you locking at. e1000 device emulation doesn't check previous line status before calling qemu_set_irq(). Right. If you dig through useless levels of indirection, you will see that each PCI device has 4 pin levels, when one of these changes this makes it up level to the parent bus, and so on. pci core tracks line status and will never assert the same line multiple times. That's good if pci core does this, but device shouldn't even try it. I disagree. We don't want to duplicate a ton of code all over the codebase. So abstract it into a function. It shouldn't be part of PCI emulation. I don't know what you mean by this, send a patch and we can discuss? Note that when I patches PCI interrupt handling for compliance I made it mimic hardware as closely as possible: devices can send any # of assert/deassert messages, bus discards duplicates. [1] this is how correct PCI device should behave but we override polarity in ACPI, but now incorrect behaviour is deeply designed into vhost-net. Not really, vhost net signals an eventfd. What happens then is up to kvm. That is what current broken design does and it works, but if you want to save unneeded calls into kvm fix design. The interface seems clean enough: vhost handles virtio ring, qemu/kvm handle pci. Making vhost aware of pci breaks this, I would not call that fixing the design. Once again. Nothing to do with PCI, everything to do with device emulation. And I propose to abstract interrupt assertion part into irqfd, not into vhost. -- Gleb. This could work. KVM would need to find all irqfd objects mapped to gsi and notify them on deassert. No idea how hard this is. -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] KVM: MMU: Use base_role.nxe for mmu.nx
On 09/15/2010 10:48 AM, Roedel, Joerg wrote: On Tue, Sep 14, 2010 at 06:08:37PM -0400, Marcelo Tosatti wrote: For tdp better set base_role.nxe to zero, otherwise duplicate tdp pagetables can be created if the guest switches between nx/non-nx. This does not work because bit 63 is marked as reserved if base_role.nxe is 0. If the walk_addr_generic function then runs with tdp enabled it would report a set nx bit as a rsvd fault. We also can't use is_nx() in those path because it does not distinguish between l1 and l2 nx. Are there guests that switch the efer.nx bit regularly enough so that it matters? If so I would suggest to drop this patch and keep mmu.nx. I'll do that then. I'm still unhappy about the duplication. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] KVM: MMU: Don't track nested fault info in error-code
On 09/14/2010 05:46 PM, Joerg Roedel wrote: This patch moves the detection whether a page-fault was nested or not out of the error code and moves it into a separate variable in the fault struct. Applied, thanks. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/3] Make kvm64 the default cpu model when kvm_enabled()
On 09/14/2010 05:52 PM, Joerg Roedel wrote: As requested by Alex this patch makes kvm64 the default CPU model when qemu is started with -enable-kvm. This breaks compatiblity; if started with -M 0.13 or prior we should default to qemu64. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 03:51:37PM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 03:18:23PM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 02:57:17PM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 02:33:01PM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 02:13:38PM +0200, Michael S. Tsirkin wrote: We haver two users: qemu does deasserts, vhost-net does asserts. Well this is broken. You want KVM to track level for you and this is wrong. KVM does this anyway because it can't relay on devise model to behave correctly [0], but in your case it is designed to behave incorrectly. Interrupt type is a device property. PCI devices just happen to be level triggered according to PCI spec. What if you want to use vhost-net to implement network device which has active-low interrupt line? [1] The polarity would have to be reversed in gsi (irq line can be shared, all devices must be active high or low consistently). There are gsi dedicated to PCI. They can be shared only between PCI devices. If you want to split parts that asserts irq and de-asserts it then we should have irqfd that tracks line status and knows interrupt line polarity. Yes, it can know about polarity even though I think it's cleaner to do this per gsi. But it can not track line status as line is shared with other devices. It should track only device's line status. There is no such thing as device's line status on real hardware, either. Devices do not drive INT# high: they drive it low (all the time) or do not drive it at all. Same thing, other naming. Device either drive it low (irq_set(1)) or not (irq_set(0)). Well, if I drive it low any number of times it should hae no effect. There is no meaning of driving low the line multiple times on real HW. You either drive it low or not. We try to emulate this with individual drive low/high events in software. Or consider express, the spec explicitly says: Note: Duplicate Assert_INTx/Deassert_INTx Messages have no effect, but are not errors. Another application is out of process virtio (sandboxing!). It will still assert and de-assert irq at the same code, so it will be able to track irq line status. Again, pci stuff needs to stay in qemu. Nothing to do with PCI whatsoever. [0] most qemu devices behave incorrectly and trigger level irq more then needed. Which devices? Most of them. They just call update_irq_status() or something and re-assert interrupt regardless of what previous status was. At least for PCI devices, these calls do nothing if status does not change. I am not sure what code are you locking at. e1000 device emulation doesn't check previous line status before calling qemu_set_irq(). Right. If you dig through useless levels of indirection, you will see that each PCI device has 4 pin levels, when one of these changes this makes it up level to the parent bus, and so on. Yes. Qemu PCI level does it right. Ideally device would not even invoke this logic if interrupt status haven't changed. pci core tracks line status and will never assert the same line multiple times. That's good if pci core does this, but device shouldn't even try it. I disagree. We don't want to duplicate a ton of code all over the codebase. So abstract it into a function. It shouldn't be part of PCI emulation. I don't know what you mean by this, send a patch and we can discuss? I don't care enough to send patch. Just remember previous irq status and do not call qemu_set_irq() if it doesn't change. Three lines of code. Note that when I patches PCI interrupt handling for compliance I made it mimic hardware as closely as possible: devices can send any # of assert/deassert messages, bus discards duplicates. Qemu PCI code is correct as far as I can see. Not all devices are connected via PCI and there is not need to go through couple of layer of indirection to figure out that nothing should be done. [1] this is how correct PCI device should behave but we override polarity in ACPI, but now incorrect behaviour is deeply designed into vhost-net. Not really, vhost net signals an eventfd. What happens then is up to kvm. That is what current broken design does and it works, but if you want to save unneeded calls into kvm fix design. The interface seems clean enough: vhost handles virtio ring, qemu/kvm handle pci. Making vhost aware of pci breaks this, I would not call that fixing the design. Once again. Nothing to do with PCI, everything to do with device emulation. And I propose to abstract interrupt assertion part into irqfd, not into
Re: [PATCH 3/3] Add svm cpuid features
Roedel, Joerg wrote: On Thu, Sep 16, 2010 at 10:06:48AM -0400, Avi Kivity wrote: On 09/14/2010 05:52 PM, Joerg Roedel wrote: This patch adds the svm cpuid feature flags to the qemu intialization path. It also adds the svm features available on phenom to its cpu-definition and extends the host cpu type to support all svm features KVM can provide. Does phenom really have vmcbclean? I thought it was a really new feature. No, but we could emulate it on almost all hardware that has SVM. Its basically the same as with x2apic. -cpu non-host is not about what we could emulate, but what the hardware really is. Features not supported by the particular CPU do not belong there. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/3] Add svm cpuid features
Roedel, Joerg wrote: On Thu, Sep 16, 2010 at 10:17:09AM -0400, Alexander Graf wrote: Roedel, Joerg wrote: On Thu, Sep 16, 2010 at 10:06:48AM -0400, Avi Kivity wrote: On 09/14/2010 05:52 PM, Joerg Roedel wrote: This patch adds the svm cpuid feature flags to the qemu intialization path. It also adds the svm features available on phenom to its cpu-definition and extends the host cpu type to support all svm features KVM can provide. Does phenom really have vmcbclean? I thought it was a really new feature. No, but we could emulate it on almost all hardware that has SVM. Its basically the same as with x2apic. -cpu non-host is not about what we could emulate, but what the hardware really is. Features not supported by the particular CPU do not belong there. I'll remove it then. It would have been nice to have it there because it will improve performance of svm emulation. But if it is about emulating the real cpu then I'll just drop it. speed == -cpu host :). If the performance increase is significant, we can always add a new cpu type for a cpu that does support those flags so people can use it there and still be migration safe. In fact, that would even benefit you guys as you'd sell more new machines for speed ;). Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 04:23:35PM +0200, Michael S. Tsirkin wrote: There is no such thing as device's line status on real hardware, either. Devices do not drive INT# high: they drive it low (all the time) or do not drive it at all. Same thing, other naming. Device either drive it low (irq_set(1)) or not (irq_set(0)). Well, if I drive it low any number of times it should hae no effect. There is no meaning of driving low the line multiple times on real HW. You either drive it low or not. We try to emulate this with individual drive low/high events in software. Or consider express, the spec explicitly says: Note: Duplicate Assert_INTx/Deassert_INTx Messages have no effect, but Express has assert and deassert messages. It might be easier for you to think in these terms. level 1: assert, level 0: deassert. Seems a simple model and what we do models this pretty well. They are between device and pci controller. I am talking about what happens between pci controller and irq chip. We are talking about different things really. are not errors. Another application is out of process virtio (sandboxing!). It will still assert and de-assert irq at the same code, so it will be able to track irq line status. Again, pci stuff needs to stay in qemu. Nothing to do with PCI whatsoever. [0] most qemu devices behave incorrectly and trigger level irq more then needed. Which devices? Most of them. They just call update_irq_status() or something and re-assert interrupt regardless of what previous status was. At least for PCI devices, these calls do nothing if status does not change. I am not sure what code are you locking at. e1000 device emulation doesn't check previous line status before calling qemu_set_irq(). Right. If you dig through useless levels of indirection, you will see that each PCI device has 4 pin levels, when one of these changes this makes it up level to the parent bus, and so on. Yes. Qemu PCI level does it right. Ideally device would not even invoke this logic if interrupt status haven't changed. It needs to call *some* function to check status and assert, right? qemu_set_irq is that function. qemu_set_irq does not check previous level and calls a callback unconditionally. pci core tracks line status and will never assert the same line multiple times. That's good if pci core does this, but device shouldn't even try it. I disagree. We don't want to duplicate a ton of code all over the codebase. So abstract it into a function. It shouldn't be part of PCI emulation. I don't know what you mean by this, send a patch and we can discuss? I don't care enough to send patch. Just remember previous irq status and do not call qemu_set_irq() if it doesn't change. Three lines of code. Heh, we have a ton of devices to support. So? And then we need to migrate this extra status, and make sure it's in sync with PCI code. We'll end up with much more more than 3 lines all of it in a very sensitive and hard to test parts code. You should be able to reconstruct it from device state. What should be in sync with PCI code? Note that when I patches PCI interrupt handling for compliance I made it mimic hardware as closely as possible: devices can send any # of assert/deassert messages, bus discards duplicates. Qemu PCI code is correct as far as I can see. Not all devices are connected via PCI and there is not need to go through couple of layer of indirection to figure out that nothing should be done. If we want to remove the indirection I would be much more interested to remove it for all cases, not just when nothing should be done. I don't care. This indirection may be justified for all I know. You try to shift this discussion to areas I am not interested to look into :) All I am saying is that each device is capable of knowing its current irq line state and optimize out function call + additional logic. Whether upper layer should handle two asserts without de-assert in between is different point and I think we agree on it. [1] this is how correct PCI device should behave but we override polarity in ACPI, but now incorrect behaviour is deeply designed into vhost-net. Not really, vhost net signals an eventfd. What happens then is up to kvm. That is what current broken design does and it works, but if you want to save unneeded calls into kvm fix design. The interface seems clean enough: vhost handles virtio ring, qemu/kvm handle pci. Making vhost aware of pci breaks this, I would not call that fixing the design.
Re: [PATCH 19/24] Deciding if L0 or L1 should handle an L2 exit
On Mon, Jun 14, 2010, Avi Kivity wrote about Re: [PATCH 19/24] Deciding if L0 or L1 should handle an L2 exit: On 06/13/2010 03:32 PM, Nadav Har'El wrote: This patch contains the logic of whether an L2 exit should be handled by L0 and then L2 should be resumed, or whether L1 should be run to handle this exit (using the nested_vmx_vmexit() function of the previous patch). The basic idea is to let L1 handle the exit only if it actually asked to trap this sort of event. For example, when L2 exits on a change to CR0, ... @@ -3819,6 +3841,8 @@ static int handle_exception(struct kvm_v if (is_no_device(intr_info)) { vmx_fpu_activate(vcpu); +if (vmx-nested.nested_mode) +vmx-nested.nested_run_pending = 1; return 1; } Isn't this true for many other exceptions? #UD which we emulate (but the guest doesn't trap), page faults which we handle completely... I was trying to think why nested_run_pending=1 (forcing us to run L2 next) is necessary in the specific case of #NM, and couldn't think of any convincing reason. Sure, in most cases we would like to continue running L2 after L0 serviced this exception that L1 didn't care about, but in the rare cases where L1 should run next (especially, user-space injecting an interrupt), what's so wrong with L1 running next? And, like you said, if it's important for #NM, why not for #PF or other things? Anyway, the code appears to run correctly also without this setting, so I'm guessing that it's an historic artifact, from older code which was written before the days of the lazy fpu loading. So I'm removing it. Good catch. I was aware of this peculiar case in the code (and even documented it in the patch's description), but should have stopped to think if it is really such a special case, or simply an error. And now I believe it was nothing more than an error. +/* Return 1 if we should exit from L2 to L1 to handle a CR access exit, + * rather than handle it ourselves in L0. I.e., check if L1 wanted to + * intercept (via guest_host_mask etc.) the current event. + */ +static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, +struct shadow_vmcs *l2svmcs) +{ ... +case 8: +if (l2svmcs-cpu_based_vm_exec_control +CPU_BASED_CR8_LOAD_EXITING) +return 1; Should check TPR threshold here too if enabled. I'll return to this issue in another mail. This one is getting long enough. +case 3: /* lmsw */ +if (l2svmcs-cr0_guest_host_mask +(val ^ l2svmcs-cr0_read_shadow)) +return 1; Need to mask off bit 0 (cr0.pe) of val, since lmsw can't clear it. Right. Also, lmsw only works on the first 4 bits of cr0: The first bit, it can only turn on (like you said), and the next 3 bits, it can change at will. Any other attempted changes to cr0 through lmsw should be ignored, and not cause exits. So I changed the code to this: if (vmcs12-cr0_guest_host_mask 0xe (val ^ vmcs12-cr0_read_shadow)) return 1; if ((vmcs12-cr0_guest_host_mask 0x1) !(vmcs12-cr0_read_shadow 0x1) (val 0x1)) return 1; I wonder if there's a less ugly way to write the same thing... This LMSW is so 80s :( I wonder who's using it these days, and specifically if it would bother anyone if lmsw suddenly acquired new abilities to change bits it never could... Oh, the things that we do for backward compatibility :-) +/* Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we + * should handle it ourselves in L0. Only call this when in nested_mode (L2). + */ +static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool afterexit) ... +case EXIT_REASON_EXCEPTION_NMI: +if (is_external_interrupt(intr_info) +(l2svmcs-pin_based_vm_exec_control + PIN_BASED_EXT_INTR_MASK)) +r = 1; A real external interrupt should never be handled by the guest, only a virtual external interrupt. You've opened a whole can of worms ;-) But it's very good that you did. It appears that this nested_vmx_exit_handled() was called for two completely different reasons, with different afterexit parameter (if you remember, this flag had the puzzling name kvm_override in a previous version). On normal exits, it is called with afterexit=1 and did exactly what you wanted, i.e., never handle in the guest: case EXIT_REASON_EXTERNAL_INTERRUPT: return 0; The case which you saw was only relevant for the other place this function is called - in exception injection. But most of the code in this function was irrelevant and/or plain wrong in this case. This part of the code was just terrible, and I couldn't leave it like this, even
Re: Exceed 1GB/s with virtio-net ?
On Tue, 2010-09-14 at 18:14 +0200, Thibault VINCENT wrote: On 13/09/2010 19:34, Alex Williamson wrote: On Mon, Sep 13, 2010 at 4:32 AM, Thibault VINCENT thibault.vinc...@smartjog.com wrote: Hello I'm trying to achieve higher than gigabit transferts over a virtio NIC with no success, and I can't find a recent bug or discussion about such an issue. The simpler test consist of two VM running on a high-end blade server with 4 cores and 4GB RAM each, and a virtio NIC dedicated to the inter-VM communication. On the host, the two vnet interfaces are enslaved into a bridge. I use a combination of 2.6.35 on the host and 2.6.32 in the VMs. Running iperf or netperf on these VMs, with TCP or UDP, result in ~900Mbits/s transferts. This is what could be expected of a 1G interface, and indeed the e1000 emulation performs similar. Changing the txqueuelen, MTU, and offloading settings on every interface (bridge/tap/virtio_net) didn't improve the speed, nor did the installation of irqbalance and the increase in CPU and RAM. Is this normal ? Is the multiple queue patch intended to address this ? It's quite possible I missed something :) I'm able to achieve quite a bit more than 1Gbps using virtio-net between 2 guests on the same host connected via an internal bridge. With the virtio-net TX bottom half handler I can easily hit 7Gbps TCP and 10+Gbps UDP using netperf (TCP_STREAM/UDP_STREAM tests). Even without the bottom half patches (not yet in qemu-kvm.git), I can get ~5Gbps. Maybe you could describe your setup further, host details, bridge setup, guests, specific tests, etc... Thanks, Thanks Alex, I don't use the bottom half patches but anything between 3Gbps and 5Gbps would be fine. Here are some more details: Host - Dell M610 ; 2 x Xeon X5650 ; 6 x 8GB Debian Squeeze amd64 qemu-kvm 0.12.5+dfsg-1 kernel 2.6.35-1 amd64 (Debian Experimental) Guests --- Debian Squeeze amd64 kernel 2.6.35-1 amd64 (Debian Experimental) To measure the throughput between the guests, I do the following. On the host: * create a bridge # brctl addbr br_test # ifconfig br_test 1.1.1.1 up * start two guests # kvm -enable-kvm -m 4096 -smp 4 -drive file=/dev/vg/deb0,id=0,boot=on,format=raw -device virtio-blk-pci,drive=0,id=0 -device virtio-net-pci,vlan=0,id=1,mac=52:54:00:cf:6a:b0 -net tap,vlan=0,name=hostnet0 # kvm -enable-kvm -m 4096 -smp 4 -drive file=/dev/vg/deb1,id=0,boot=on,format=raw -device virtio-blk-pci,drive=0,id=0 -device virtio-net-pci,vlan=0,id=1,mac=52:54:00:cf:6a:b1 -net tap,vlan=0,name=hostnet0 * add guests to the bridge # brctl addif br_test tap0 # brctl addif br_test tap1 On the first guest: # ifconfig eth0 1.1.1.2 up # iperf -s -i 1 On the second guest: # ifconfig eth0 1.1.1.3 up # iperf -c 1.1.1.2 -i 1 Client connecting to 1.1.1.2, TCP port 5001 TCP window size: 16.0 KByte (default) [ 3] local 1.1.1.3 port 43510 connected with 1.1.1.2 port 5001 [ ID] Interval Transfer Bandwidth [ 3] 0.0- 1.0 sec 80.7 MBytes677 Mbits/sec [ 3] 1.0- 2.0 sec102 MBytes855 Mbits/sec [ 3] 2.0- 3.0 sec101 MBytes847 Mbits/sec [ 3] 3.0- 4.0 sec104 MBytes873 Mbits/sec [ 3] 4.0- 5.0 sec104 MBytes874 Mbits/sec [ 3] 5.0- 6.0 sec105 MBytes881 Mbits/sec [ 3] 6.0- 7.0 sec103 MBytes862 Mbits/sec [ 3] 7.0- 8.0 sec101 MBytes848 Mbits/sec [ 3] 8.0- 9.0 sec105 MBytes878 Mbits/sec [ 3] 9.0-10.0 sec105 MBytes882 Mbits/sec [ 3] 0.0-10.0 sec 1011 MBytes848 Mbits/sec On the host again: # iperf -c 1.1.1.1 -i 1 Client connecting to 1.1.1.1, TCP port 5001 TCP window size: 16.0 KByte (default) [ 3] local 1.1.1.3 port 60456 connected with 1.1.1.1 port 5001 [ ID] Interval Transfer Bandwidth [ 3] 0.0- 1.0 sec 97.9 MBytes821 Mbits/sec [ 3] 1.0- 2.0 sec136 MBytes 1.14 Gbits/sec [ 3] 2.0- 3.0 sec153 MBytes 1.28 Gbits/sec [ 3] 3.0- 4.0 sec160 MBytes 1.34 Gbits/sec [ 3] 4.0- 5.0 sec156 MBytes 1.31 Gbits/sec [ 3] 5.0- 6.0 sec122 MBytes 1.02 Gbits/sec [ 3] 6.0- 7.0 sec121 MBytes 1.02 Gbits/sec [ 3] 7.0- 8.0 sec137 MBytes 1.15 Gbits/sec [ 3] 8.0- 9.0 sec139 MBytes 1.17 Gbits/sec [ 3] 9.0-10.0 sec140 MBytes 1.17 Gbits/sec [ 3] 0.0-10.0 sec 1.33 GBytes 1.14 Gbits/sec You can see it's quite slow compared to your figures, between the guests and with the host too. And there is no specific load on any of the three systems, htop in a guest only report one of the four cores going up to 70% (sys+user+wait) during the test. The other tests I mentioned are: * iperf
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 04:51:17PM +0200, Gleb Natapov wrote: What for? Device emulation should do de-assert. Sorry, but at this point I have no idea what you call device emulation. The same thing everyone calls device emulation. In case of virtio-net it is in hw/virtio-net.c. If vhost-net is in use device emulation is split between userspace and kernel, but it is still just device emulation. case in point, virtio net does not know about pci at all, so it can not deassert. qemu has code to de-assert. vhost has code to assert. Good. So qemu will de-assert. So what do you mean by KVM would need to find all irqfd objects mapped to gsi and notify them on deassert I would like to optimize level interrupts and stop driving scheduler insane if at all possible. Worthy goal. Do it in irqfd. Irqfd shouldn't call kvm_set_irq() if irq level hasn't changed. Right. Then it needs to know about deasserts. It does not get this information, so when kvm gets deassert ioctl it should locate irqfd object and set level to 0. See? -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] device-assignment: register a reset function
On Tue, Sep 14, 2010 at 9:04 AM, Bernhard Kohl bernhard.k...@nsn.com wrote: This is necessary because during reboot of a VM the assigned devices continue DMA transfers which causes memory corruption. Signed-off-by: Thomas Ostler thomas.ost...@nsn.com Signed-off-by: Bernhard Kohl bernhard.k...@nsn.com --- hw/device-assignment.c | 14 ++ 1 files changed, 14 insertions(+), 0 deletions(-) diff --git a/hw/device-assignment.c b/hw/device-assignment.c index 87f7418..001aee8 100644 --- a/hw/device-assignment.c +++ b/hw/device-assignment.c @@ -1450,6 +1450,17 @@ static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev) dev-msix_table_page = NULL; } +static void reset_assigned_device(void *opaque) +{ + PCIDevice *d = (PCIDevice *)opaque; + uint32_t conf; + + /* reset the bus master bit to avoid further DMA transfers */ + conf = assigned_dev_pci_read_config(d, 0x04, 0x02); + conf = ~0x04; + assigned_dev_pci_write_config(d, 0x04, conf, 0x02); This should use defined macros, PCI_COMMAND PCI_COMMAND_MASTER. Otherwise seems ok. Thanks, Alex +} + static int assigned_initfn(struct PCIDevice *pci_dev) { AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev); @@ -1499,6 +1510,9 @@ static int assigned_initfn(struct PCIDevice *pci_dev) if (r 0) goto assigned_out; + /* register reset function for the device */ + qemu_register_reset(reset_assigned_device, pci_dev); + /* intercept MSI-X entry page in the MMIO */ if (dev-cap.available ASSIGNED_DEVICE_CAP_MSIX) if (assigned_dev_register_msix_mmio(dev)) -- 1.7.2.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 20/24] Correct handling of interrupt injection
On Mon, Jun 14, 2010, Avi Kivity wrote about Re: [PATCH 20/24] Correct handling of interrupt injection: .. However, properly doing what is described above requires invasive changes to the flow of the existing code, which we elected not to do in this stage. Instead we do something more simplistic and less efficient: we modify ... That's a little sad. I agree. I'd like to change this code to do the proper thing (as I explained in the patch's description), but as I said this will require some invasive changes to existing KVM code outside vmx.c. So seeing that this existing code also works, and despite hurting performance a bit there are much more pressing performance issues (namely, the need for nested EPT) - with your permission I'd like to postpone fixing this issue. static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { +if (to_vmx(vcpu)-nested.nested_mode nested_exit_on_intr(vcpu)) { +if (to_vmx(vcpu)-nested.nested_run_pending) +return 0; +nested_vmx_vmexit(vcpu, true); +/* fall through to normal code, but now in L1, not L2 */ +} + What exit is reported here? When nested_vmx_vmexit is called with the second parameter true, as above, it modifies the (vmcs12) exit reason to be EXIT_REASON_EXTERNAL_INTERRUPT. A hack, but it does the right thing in this case because L1 doesn't even get a chance to care about this exit reason before it exits again (as I tried to explain in the patch's description). -- Nadav Har'El| Thursday, Sep 16 2010, 8 Tishri 5771 n...@math.technion.ac.il |- Phone +972-523-790466, ICQ 13349191 |Debugging: the art of removing bugs. http://nadav.harel.org.il |Programming: the art of inserting them. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: .img on nfs, relative on ram, consuming mass ram
I can't do this because i need performance. I'm currently doing some tests. Will post soon. My config's map : NFS PXE server qemu-kvm host --- | img.img |__|relimg.img| | readonly| net| | --- Keep in touch and thx for time. Fred. On Thu, 16 Sep 2010 15:48:18 +0200 Andre Przywara andre.przyw...@amd.com wrote: Stefan Hajnoczi wrote: 2010/9/16 Andre Przywara andre.przyw...@amd.com: TOURNIER Frédéric wrote: Ok, thanks for taking time. I'll dig into your answers. So as i run relative.img on diskless systems with original.img on nfs, what are the best practice/tips i can use ? I thinks it is -snapshot you are looking for. This will put the backing store into normal RAM, and you can later commit it to the original image if needed. See the qemu manpage for more details. In a nutshell you just specify the original image and add -snapshot to the command line. -snapshot creates a temporary qcow2 image in /tmp whose backing file is your original image. I'm not sure what you mean by This will put the backing store into normal RAM? Stefan, you are right. I never looked into the code and because the file in /tmp is deleted just after creation there wasn't a sign of it. For some reason I thought that the buffer would just be allocated in memory. Sorry, my mistake and thanks for pointing this out. So Fred, unfortunately this does not solve your problem. I guess you run into a general problem. If the guest actually changes so much of the disk that this cannot be backed by RAM in the host, you have lost. One solution could be to just make (at least parts of) the disk read-only (a write protected /usr partition works quite well). If you are sure that writes are not that frequent, you could think of putting the overlay file also on the remote storage (NFS). Although this is rather slow, it shouldn't matter if there aren't many writes and the local page cache should catch most of the accesses (while still being nice to other RAM users). Regards, Andre. Stefan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- Andre Przywara AMD-Operating System Research Center (OSRC), Dresden, Germany Tel: +49 351 448-3567-12 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 16/24] Implement VMLAUNCH and VMRESUME
On Thu, Jun 17, 2010, Gleb Natapov wrote about Re: [PATCH 16/24] Implement VMLAUNCH and VMRESUME: +static int handle_launch_or_resume(struct kvm_vcpu *vcpu, bool launch) +{ + if (!nested_vmx_check_permission(vcpu)) ... Should also check MOV SS blocking. Why Intel decided that vm entry should fail in this case? How knows, but spec says so. Thanks. Added the check: if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) GUEST_INTR_STATE_MOV_SS){ nested_vmx_failValid(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); skip_emulated_instruction(vcpu); return 1; } Like you, I don't understand why this test is at all necessary... -- Nadav Har'El| Thursday, Sep 16 2010, 8 Tishri 5771 n...@math.technion.ac.il |- Phone +972-523-790466, ICQ 13349191 |Help Microsoft stamp out piracy. Give http://nadav.harel.org.il |Linux to a friend today! -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: VNC console CTRL key locked
16.09.2010 18:01, Peter Lieven wrote: Hi, with Ubuntu 10.04 Desktop guest in Qemu-KVM 0.12.4 I sometimes see a condition where the CTRL key is locked and cannot be released anymore. The only way I have found to solve this is to restart the VM. Has anyone else experienced this so far? Do you use any -k $lang switch? I've seen something similar with -k, but it was long time ago... /mjt -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: VNC console CTRL key locked
Am 16.09.2010 um 19:13 schrieb Michael Tokarev: 16.09.2010 18:01, Peter Lieven wrote: Hi, with Ubuntu 10.04 Desktop guest in Qemu-KVM 0.12.4 I sometimes see a condition where the CTRL key is locked and cannot be released anymore. The only way I have found to solve this is to restart the VM. Has anyone else experienced this so far? Do you use any -k $lang switch? I've seen something similar with -k, but it was long time ago... hi michael, i use -k de br, peter /mjt -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] kvm: enable irq injection from interrupt context
On Thu, Sep 16, 2010 at 05:43:26PM +0200, Gleb Natapov wrote: On Thu, Sep 16, 2010 at 05:24:11PM +0200, Michael S. Tsirkin wrote: On Thu, Sep 16, 2010 at 04:51:17PM +0200, Gleb Natapov wrote: What for? Device emulation should do de-assert. Sorry, but at this point I have no idea what you call device emulation. The same thing everyone calls device emulation. In case of virtio-net it is in hw/virtio-net.c. If vhost-net is in use device emulation is split between userspace and kernel, but it is still just device emulation. case in point, virtio net does not know about pci at all, so it can not deassert. I don't see what PCI has to do with it. virtio-net device implementation is split between several files. virtio-net/virtio-ring/virtio-pci. All of them still implement one emulated device. Whoever implemented virtio-net initially decided to put irq ack register into PCI config space, so code that does de-assert is in virtio-pci, but it is still part of emulated device. qemu has code to de-assert. vhost has code to assert. Good. So qemu will de-assert. So what do you mean by KVM would need to find all irqfd objects mapped to gsi and notify them on deassert I would like to optimize level interrupts and stop driving scheduler insane if at all possible. Worthy goal. Do it in irqfd. Irqfd shouldn't call kvm_set_irq() if irq level hasn't changed. Right. Then it needs to know about deasserts. It does not get this Dessert should be done by qemu writing 0 into irqfd. writing 0 to eventfd does nothing. The way to deassert irq is currently through ioctl and it seems entirely sensible to me to keep it that way. Also - which irqfd :) We need multiple irqfds to map to a single gsi, with kvm doing OR on them. Assertion and de-assertion should be done through irqfd. information, so when kvm gets deassert ioctl it should locate irqfd object and set level to 0. See? No. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
PCI passthru issue
Hi All, I noticed that the PCI registers are not all mapped to the guest for a PCI pass through device. For example, I was trying to pass through a PCI function (NIC) to a guest and in the guest the MSI-X table offset register is not mapped properly. It was showing all 0s. This is in RHEL 6.0 beta and the guest was a SLES11 64 bit. In the RHEL 6.0 host I see following for the PCI function: - 05:00.0 Ethernet controller: QLogic Corp. cLOM8214 1/10GbE Controller (rev 54) Subsystem: QLogic Corp. 8200 Series Dual Port 10GbE Converged Network Adapter (TCP/IP Networking) Control: I/O- Mem+ BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr+ Stepping- SERR+ FastB2B- DisINTx+ Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- MAbort- SERR- PERR- INTx- Interrupt: pin A routed to IRQ 74 Region 0: Memory at df20 (64-bit, non-prefetchable) [size=2M] Region 4: Memory at df1b (64-bit, non-prefetchable) [size=64K] Expansion ROM at df1c [disabled] [size=256K] Capabilities: [40] MSI-X: Enable- Count=32 Masked- Vector table: BAR=0 offset=0009 PBA: BAR=0 offset=00090800 ... Kernel driver in use: pci-stub 00: 77 10 20 80 42 05 10 00 54 00 00 02 40 00 80 00 10: 04 00 20 df 00 00 00 00 00 00 00 00 00 00 00 00 20: 04 00 1b df 00 00 00 00 00 00 00 00 77 10 07 02 30: 00 00 1c df 40 00 00 00 00 00 00 00 0a 01 00 00 40: 11 80 1f 00 00 00 09 00 00 08 09 00 00 00 00 00 -- And in the SLES11 guest, I see following for the same function: 00:07.0 Ethernet controller: QLogic Corp. Device 8020 (rev 54) Subsystem: QLogic Corp. Device 0207 Control: I/O- Mem+ BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr+ Stepping- SERR+ FastB2B- DisINTx+ Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- MAbort- SERR- PERR- INTx- Interrupt: pin A routed to IRQ 11 Region 0: Memory at f220 (32-bit, non-prefetchable) [size=2M] Region 4: Memory at f240 (32-bit, non-prefetchable) [size=64K] Expansion ROM at f244 [disabled] [size=256K] Capabilities: [40] Message Signalled Interrupts: Mask- 64bit- Count=1/1 Enable- Address: Data: Capabilities: [50] MSI-X: Enable- Mask- TabSize=32 Vector table: BAR=0 offset=0009 PBA: BAR=0 offset= 00: 77 10 20 80 42 05 10 00 54 00 00 02 40 00 80 00 10: 00 00 20 f2 00 00 00 00 00 00 00 00 00 00 00 00 20: 00 00 40 f2 00 00 00 00 00 00 00 00 77 10 07 02 30: 08 00 44 f2 40 00 00 00 00 00 00 00 0b 01 00 00 40: 05 50 00 00 00 00 00 00 00 00 00 00 00 00 00 00 Note that the data at byte offset 40h is different for the host and the guest. Is this expected or is it a bug in KVM? Is there any spec out there that specifies what PCI registers are accessible from guest and how much of the PCI config space is mapped to the guest? This is helpful to know, especially in the context of SR-IOV. If a VF is pass throughed to a guest, are all the VF registers visible in host system will be mapped (and accessible) to the guest OS? thanks much, Anirban-- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: PCI passthru issue
(2010/09/17 8:01), Anirban Chakraborty wrote: Hi All, I noticed that the PCI registers are not all mapped to the guest for a PCI pass through device. For example, I was trying to pass through a PCI function (NIC) to a guest and in the guest the MSI-X table offset register is not mapped properly. It was showing all 0s. This is in RHEL 6.0 beta and the guest was a SLES11 64 bit. In the RHEL 6.0 host I see following for the PCI function: - 05:00.0 Ethernet controller: QLogic Corp. cLOM8214 1/10GbE Controller (rev 54) Subsystem: QLogic Corp. 8200 Series Dual Port 10GbE Converged Network Adapter (TCP/IP Networking) Control: I/O- Mem+ BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr+ Stepping- SERR+ FastB2B- DisINTx+ Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- MAbort- SERR- PERR- INTx- Interrupt: pin A routed to IRQ 74 Region 0: Memory at df20 (64-bit, non-prefetchable) [size=2M] Region 4: Memory at df1b (64-bit, non-prefetchable) [size=64K] Expansion ROM at df1c [disabled] [size=256K] Capabilities: [40] MSI-X: Enable- Count=32 Masked- Vector table: BAR=0 offset=0009 PBA: BAR=0 offset=00090800 ... Kernel driver in use: pci-stub 00: 77 10 20 80 42 05 10 00 54 00 00 02 40 00 80 00 10: 04 00 20 df 00 00 00 00 00 00 00 00 00 00 00 00 20: 04 00 1b df 00 00 00 00 00 00 00 00 77 10 07 02 30: 00 00 1c df 40 00 00 00 00 00 00 00 0a 01 00 00 40: 11 80 1f 00 00 00 09 00 00 08 09 00 00 00 00 00 -- And in the SLES11 guest, I see following for the same function: 00:07.0 Ethernet controller: QLogic Corp. Device 8020 (rev 54) Subsystem: QLogic Corp. Device 0207 Control: I/O- Mem+ BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr+ Stepping- SERR+ FastB2B- DisINTx+ Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- MAbort- SERR- PERR- INTx- Interrupt: pin A routed to IRQ 11 Region 0: Memory at f220 (32-bit, non-prefetchable) [size=2M] Region 4: Memory at f240 (32-bit, non-prefetchable) [size=64K] Expansion ROM at f244 [disabled] [size=256K] Capabilities: [40] Message Signalled Interrupts: Mask- 64bit- Count=1/1 Enable- Address: Data: Capabilities: [50] MSI-X: Enable- Mask- TabSize=32 Vector table: BAR=0 offset=0009 PBA: BAR=0 offset= 00: 77 10 20 80 42 05 10 00 54 00 00 02 40 00 80 00 10: 00 00 20 f2 00 00 00 00 00 00 00 00 00 00 00 00 20: 00 00 40 f2 00 00 00 00 00 00 00 00 77 10 07 02 30: 08 00 44 f2 40 00 00 00 00 00 00 00 0b 01 00 00 40: 05 50 00 00 00 00 00 00 00 00 00 00 00 00 00 00 Note that the data at byte offset 40h is different for the host and the guest. Is this expected or is it a bug in KVM? This is expected, since MSIs of PCI pass through device need some emulation to route them to the guest instead of the host. So what we can see on the guest's PCI pass through device is modified capability list that contains emulated capabilities. Is there any spec out there that specifies what PCI registers are accessible from guest and how much of the PCI config space is mapped to the guest? This is helpful to know, especially in the context of SR-IOV. If a VF is pass throughed to a guest, are all the VF registers visible in host system will be mapped (and accessible) to the guest OS? AFAIK the QEMU source code, hw/device-assignment.c, will help you to know what are going on there. I'm not sure what interesting registers can the VF have. But I think you should be careful because the register in the PCI config space might not work like that on the host even the register is mapped and visible from the guest. Thanks, H.Seto -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [RFC PATCH v9 12/16] Add mp(mediate passthru) device.
From: Michael S. Tsirkin [mailto:m...@redhat.com] Sent: Wednesday, September 15, 2010 7:28 PM To: Xin, Xiaohui Cc: net...@vger.kernel.org; kvm@vger.kernel.org; linux-ker...@vger.kernel.org; mi...@elte.hu; da...@davemloft.net; herb...@gondor.hengli.com.au; jd...@linux.intel.com Subject: Re: [RFC PATCH v9 12/16] Add mp(mediate passthru) device. On Wed, Sep 15, 2010 at 11:13:44AM +0800, Xin, Xiaohui wrote: From: Michael S. Tsirkin [mailto:m...@redhat.com] Sent: Sunday, September 12, 2010 9:37 PM To: Xin, Xiaohui Cc: net...@vger.kernel.org; kvm@vger.kernel.org; linux-ker...@vger.kernel.org; mi...@elte.hu; da...@davemloft.net; herb...@gondor.hengli.com.au; jd...@linux.intel.com Subject: Re: [RFC PATCH v9 12/16] Add mp(mediate passthru) device. On Sat, Sep 11, 2010 at 03:41:14PM +0800, Xin, Xiaohui wrote: Playing with rlimit on data path, transparently to the application in this way looks strange to me, I suspect this has unexpected security implications. Further, applications may have other uses for locked memory besides mpassthru - you should not just take it because it's there. Can we have an ioctl that lets userspace configure how much memory to lock? This ioctl will decrement the rlimit and store the data in the device structure so we can do accounting internally. Put it back on close or on another ioctl. Yes, we can decrement the rlimit in ioctl in one time to avoid data path. Need to be careful for when this operation gets called again with 0 or another small value while we have locked memory - maybe just fail with EBUSY? or wait until it gets unlocked? Maybe 0 can be special-cased and deactivate zero-copy?. How about we don't use a new ioctl, but just check the rlimit in one MPASSTHRU_BINDDEV ioctl? If we find mp device break the rlimit, then we fail the bind ioctl, and thus can't do zero copy any more. Yes, and not just check, but decrement as well. I think we should give userspace control over how much memory we can lock and subtract from the rlimit. It's OK to add this as a parameter to MPASSTHRU_BINDDEV. Then increment the rlimit back on unbind and on close? This opens up an interesting condition: process 1 calls bind, process 2 calls unbind or close. This will increment rlimit for process 2. Not sure how to fix this properly. I can't too, can we do any synchronous operations on rlimit stuff? I quite suspect in it. -- MST Here's what infiniband does: simply pass the amount of memory userspace wants you to lock on an ioctl, and verify that either you have CAP_IPC_LOCK or this number does not exceed the current rlimit. (must be on ioctl, not on open, as we likely want the fd passed around between processes), but do not decrement rlimit. Use this on following operations. Be careful if this can be changed while operations are in progress. This does mean that the effective amount of memory that userspace can lock is doubled, but at least it is not unlimited, and we sidestep all other issues such as userspace running out of lockable memory simply by virtue of using the driver. What I have done in mp device is almost the same as it. The difference is that I do not check the capability, and I use my own parameter ctor-pages instead of mm-locked_vm. So currently, 1) add the capability check 2) use mm-locked_vm 3) add an ioctl for userspace to configure how much memory can lock. -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 2/2] kvm/e500v2: mapping guest TLB1 to host TLB0
-Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, September 10, 2010 7:24 AM To: Liu Yu-B13201 Cc: k...@vger.kernel.org; kvm-ppc@vger.kernel.org Subject: Re: [PATCH 2/2] kvm/e500v2: mapping guest TLB1 to host TLB0 On 08.09.2010, at 11:40, Liu Yu wrote: Current guest TLB1 is mapped to host TLB1. As host kernel only provides 4K uncontinuous pages, we have to break guest large mapping into 4K shadow mappings. These 4K shadow mappings are then mapped into host TLB1 on fly. As host TLB1 only has 13 free entries, there's serious tlb miss. Since e500v2 has a big number of TLB0 entries, it should be help to map those 4K shadow mappings to host TLB0. To achieve this, we need to unlink guest tlb and host tlb, So that guest TLB1 mappings can route to any host TLB0 entries freely. Pages/mappings are considerred in the same kind as host tlb entry. This patch remove the link between pages and guest tlb entry to do the unlink. And keep host_tlb0_ref in each vcpu to trace pages. Then it's easy to map guest TLB1 to host TLB0. In guest ramdisk boot test(guest mainly uses TLB1), with this patch, the tlb miss number get down 90%. Signed-off-by: Liu Yu yu@freescale.com --- arch/powerpc/include/asm/kvm_e500.h |7 +- arch/powerpc/kvm/e500.c |4 + arch/powerpc/kvm/e500_tlb.c | 280 --- arch/powerpc/kvm/e500_tlb.h |1 + 4 files changed, 104 insertions(+), 188 deletions(-) -static unsigned int tlb1_entry_num; +static inline unsigned int get_tlb0_entry_offset(u32 eaddr, u32 esel) +{ + return ((eaddr 0x7F000) (12 - host_tlb0_assoc_bit) | + (esel (host_tlb0_assoc - 1))); +} void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) { @@ -62,11 +68,6 @@ static inline unsigned int tlb0_get_next_victim( return victim; } -static inline unsigned int tlb1_max_shadow_size(void) -{ - return tlb1_entry_num - tlbcam_index; -} - static inline int tlbe_is_writable(struct tlbe *tlbe) { return tlbe-mas3 (MAS3_SW|MAS3_UW); @@ -100,7 +101,7 @@ static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) /* * writing shadow tlb entry to host TLB */ -static inline void __write_host_tlbe(struct tlbe *stlbe) +static inline void __host_tlbe_write(struct tlbe *stlbe) { mtspr(SPRN_MAS1, stlbe-mas1); mtspr(SPRN_MAS2, stlbe-mas2); @@ -109,25 +110,22 @@ static inline void __write_host_tlbe(struct tlbe *stlbe) __asm__ __volatile__ (tlbwe\n : : ); } -static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel, struct tlbe *stlbe) +static inline u32 host_tlb0_write(struct kvmppc_vcpu_e500 *vcpu_e500, + u32 gvaddr, struct tlbe *stlbe) { - local_irq_disable(); - if (tlbsel == 0) { - __write_host_tlbe(stlbe); - } else { - unsigned register mas0; + unsigned register mas0; - mas0 = mfspr(SPRN_MAS0); + local_irq_disable(); Do you have to disable interrupts for a tlb write? If you get preempted before the write, the entry you overwrite could be different. But you don't protect against that either way. And if you get preempted afterwards, you could lose the entry. But since you enable interrupts beyond that, that could happen either way too. So what's the reason for the disable here? Hello Alex, Doesn't local_irq_disable() also disable preempt? The reason to disable interrupts is because it's possible to have tlb misses in interrupt service route. Thanks, Yu -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] kvm/e500v2: mapping guest TLB1 to host TLB0
Liu Yu-B13201 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, September 10, 2010 7:24 AM To: Liu Yu-B13201 Cc: k...@vger.kernel.org; kvm-ppc@vger.kernel.org Subject: Re: [PATCH 2/2] kvm/e500v2: mapping guest TLB1 to host TLB0 On 08.09.2010, at 11:40, Liu Yu wrote: Current guest TLB1 is mapped to host TLB1. As host kernel only provides 4K uncontinuous pages, we have to break guest large mapping into 4K shadow mappings. These 4K shadow mappings are then mapped into host TLB1 on fly. As host TLB1 only has 13 free entries, there's serious tlb miss. Since e500v2 has a big number of TLB0 entries, it should be help to map those 4K shadow mappings to host TLB0. To achieve this, we need to unlink guest tlb and host tlb, So that guest TLB1 mappings can route to any host TLB0 entries freely. Pages/mappings are considerred in the same kind as host tlb entry. This patch remove the link between pages and guest tlb entry to do the unlink. And keep host_tlb0_ref in each vcpu to trace pages. Then it's easy to map guest TLB1 to host TLB0. In guest ramdisk boot test(guest mainly uses TLB1), with this patch, the tlb miss number get down 90%. Signed-off-by: Liu Yu yu@freescale.com --- arch/powerpc/include/asm/kvm_e500.h |7 +- arch/powerpc/kvm/e500.c |4 + arch/powerpc/kvm/e500_tlb.c | 280 --- arch/powerpc/kvm/e500_tlb.h |1 + 4 files changed, 104 insertions(+), 188 deletions(-) -static unsigned int tlb1_entry_num; +static inline unsigned int get_tlb0_entry_offset(u32 eaddr, u32 esel) +{ + return ((eaddr 0x7F000) (12 - host_tlb0_assoc_bit) | + (esel (host_tlb0_assoc - 1))); +} void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) { @@ -62,11 +68,6 @@ static inline unsigned int tlb0_get_next_victim( return victim; } -static inline unsigned int tlb1_max_shadow_size(void) -{ - return tlb1_entry_num - tlbcam_index; -} - static inline int tlbe_is_writable(struct tlbe *tlbe) { return tlbe-mas3 (MAS3_SW|MAS3_UW); @@ -100,7 +101,7 @@ static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) /* * writing shadow tlb entry to host TLB */ -static inline void __write_host_tlbe(struct tlbe *stlbe) +static inline void __host_tlbe_write(struct tlbe *stlbe) { mtspr(SPRN_MAS1, stlbe-mas1); mtspr(SPRN_MAS2, stlbe-mas2); @@ -109,25 +110,22 @@ static inline void __write_host_tlbe(struct tlbe *stlbe) __asm__ __volatile__ (tlbwe\n : : ); } -static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel, struct tlbe *stlbe) +static inline u32 host_tlb0_write(struct kvmppc_vcpu_e500 *vcpu_e500, + u32 gvaddr, struct tlbe *stlbe) { - local_irq_disable(); - if (tlbsel == 0) { - __write_host_tlbe(stlbe); - } else { - unsigned register mas0; + unsigned register mas0; - mas0 = mfspr(SPRN_MAS0); + local_irq_disable(); Do you have to disable interrupts for a tlb write? If you get preempted before the write, the entry you overwrite could be different. But you don't protect against that either way. And if you get preempted afterwards, you could lose the entry. But since you enable interrupts beyond that, that could happen either way too. So what's the reason for the disable here? Hello Alex, Doesn't local_irq_disable() also disable preempt? The reason to disable interrupts is because it's possible to have tlb misses in interrupt service route. Yes, local_irq_disable disables preempt. That's exactly what I was referring to :). I don't understand the statement about the service route. A tlb miss still arrives with local_irq_disable. Alex -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html