[COMMIT master] Rename KVM_UPSTREAM to OBSOLETE_KVM_IMPL

2010-09-16 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

The symbol KVM_UPSTREAM is used to mark sections of code that are part of
the upstream kvm implemetation that is not used in qemu-kvm.  However the
name becomes ambiguous if qemu-kvm is merged upstream.

Rename the symbol to avoid confusion.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/cpus.c b/cpus.c
index c545a62..99c04d1 100644
--- a/cpus.c
+++ b/cpus.c
@@ -299,7 +299,7 @@ void qemu_notify_event(void)
 }
 }
 
-#if defined(KVM_UPSTREAM) || !defined(CONFIG_KVM)
+#if defined(OBSOLETE_KVM_IMPL) || !defined(CONFIG_KVM)
 void qemu_mutex_lock_iothread(void) {}
 void qemu_mutex_unlock_iothread(void) {}
 #endif
diff --git a/kvm-all.c b/kvm-all.c
index 4ff75c4..d4b0861 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -41,7 +41,7 @@
 do { } while (0)
 #endif
 
-#ifdef KVM_UPSTREAM
+#ifdef OBSOLETE_KVM_IMPL
 
 typedef struct KVMSlot
 {
@@ -156,7 +156,7 @@ static int kvm_set_user_memory_region(KVMState *s, KVMSlot 
*slot)
 return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, mem);
 }
 
-#ifdef KVM_UPSTREAM
+#ifdef OBSOLETE_KVM_IMPL
 static void kvm_reset_vcpu(void *opaque)
 {
 CPUState *env = opaque;
@@ -176,7 +176,7 @@ int kvm_pit_in_kernel(void)
 }
 
 
-#ifdef KVM_UPSTREAM
+#ifdef OBSOLETE_KVM_IMPL
 int kvm_init_vcpu(CPUState *env)
 {
 KVMState *s = kvm_state;
@@ -594,7 +594,7 @@ void kvm_cpu_register_phys_memory_client(void)
 cpu_register_phys_memory_client(kvm_cpu_phys_memory_client);
 }
 
-#ifdef KVM_UPSTREAM
+#ifdef OBSOLETE_KVM_IMPL
 
 int kvm_init(int smp_cpus)
 {
@@ -816,7 +816,7 @@ void kvm_flush_coalesced_mmio_buffer(void)
 #endif
 }
 
-#ifdef KVM_UPSTREAM
+#ifdef OBSOLETE_KVM_IMPL
 
 static void do_kvm_cpu_synchronize_state(void *_env)
 {
@@ -1038,7 +1038,7 @@ int kvm_has_debugregs(void)
 return kvm_state-debugregs;
 }
 
-#ifdef KVM_UPSTREAM
+#ifdef OBSOLETE_KVM_IMPL
 int kvm_has_xsave(void)
 {
 return kvm_state-xsave;
@@ -1069,10 +1069,10 @@ void kvm_setup_guest_memory(void *start, size_t size)
 }
 
 #ifdef KVM_CAP_SET_GUEST_DEBUG
-#ifndef KVM_UPSTREAM
+#ifndef OBSOLETE_KVM_IMPL
 #define run_on_cpu on_vcpu
 static void on_vcpu(CPUState *env, void (*func)(void *data), void *data);
-#endif /* !KVM_UPSTREAM */
+#endif /* !OBSOLETE_KVM_IMPL */
 
 struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
  target_ulong pc)
diff --git a/kvm.h b/kvm.h
index d321fce..56236ae 100644
--- a/kvm.h
+++ b/kvm.h
@@ -31,13 +31,13 @@ extern int kvm_allowed;
 #define kvm_enabled() (0)
 #endif
 
-#ifdef KVM_UPSTREAM
+#ifdef OBSOLETE_KVM_IMPL
 struct kvm_run;
 
 /* external API */
 
 int kvm_init(int smp_cpus);
-#endif /* KVM_UPSTREAM */
+#endif /* OBSOLETE_KVM_IMPL */
 
 int kvm_has_sync_mmu(void);
 int kvm_has_vcpu_events(void);
@@ -96,7 +96,7 @@ int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run);
 
 int kvm_arch_pre_run(CPUState *env, struct kvm_run *run);
 
-#ifdef KVM_UPSTREAM
+#ifdef OBSOLETE_KVM_IMPL
 int kvm_arch_process_irqchip_events(CPUState *env);
 #endif
 
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index b00e80d..f4fc063 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -188,7 +188,7 @@ int kvm_arch_init_vcpu(CPUState *env)
 return r;
 }
 
-#ifdef KVM_UPSTREAM
+#ifdef OBSOLETE_KVM_IMPL
 
 env-mp_state = KVM_MP_STATE_RUNNABLE;
 
@@ -304,7 +304,7 @@ void kvm_arch_reset_vcpu(CPUState *env)
 env-mp_state = KVM_MP_STATE_RUNNABLE;
 }
 }
-#ifdef KVM_UPSTREAM
+#ifdef OBSOLETE_KVM_IMPL
 
 static int kvm_has_msr_star(CPUState *env)
 {
@@ -644,7 +644,7 @@ static void kvm_msr_entry_set(struct kvm_msr_entry *entry,
 entry-data = value;
 }
 
-#ifdef KVM_UPSTREAM
+#ifdef OBSOLETE_KVM_IMPL
 static int kvm_put_msrs(CPUState *env, int level)
 {
 struct {
@@ -1104,7 +1104,7 @@ static int kvm_get_debugregs(CPUState *env)
 return 0;
 }
 
-#ifdef KVM_UPSTREAM
+#ifdef OBSOLETE_KVM_IMPL
 int kvm_arch_put_registers(CPUState *env, int level)
 {
 int ret;
@@ -1242,7 +1242,7 @@ int kvm_arch_post_run(CPUState *env, struct kvm_run *run)
 return 0;
 }
 
-#ifdef KVM_UPSTREAM
+#ifdef OBSOLETE_KVM_IMPL
 
 int kvm_arch_process_irqchip_events(CPUState *env)
 {
diff --git a/vl.c b/vl.c
index 22a3616..378a176 100644
--- a/vl.c
+++ b/vl.c
@@ -2466,7 +2466,7 @@ int main(int argc, char **argv, char **envp)
 case QEMU_OPTION_smbios:
 do_smbios_option(optarg);
 break;
-#ifdef KVM_UPSTREAM
+#ifdef OBSOLETE_KVM_IMPL
 case QEMU_OPTION_enable_kvm:
 kvm_allowed = 1;
 #endif
@@ -2803,7 +2803,7 @@ int main(int argc, char **argv, char **envp)
 if (kvm_allowed) {
 int ret = kvm_init(smp_cpus);
 if (ret  0) {
-#if defined(KVM_UPSTREAM) || defined(CONFIG_NO_CPU_EMULATION)
+#if defined(OBSOLETE_KVM_IMPL) || defined(CONFIG_NO_CPU_EMULATION)
 if (!kvm_available()) {
 printf(KVM not supported for this target\n);
 } else {
--
To unsubscribe 

[COMMIT master] KVM: VMX: Split up vmx_complete_interrupts()

2010-09-16 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

vmx_complete_interrupts() does too much, split it up:
 - vmx_vcpu_run() gets the cache important vmcs fields part
 - a new vmx_complete_atomic_exit() gets the parts that must be done atomically
 - a new vmx_recover_nmi_blocking() does what its name says
 - vmx_complete_interrupts() retains the event injection recovery code

This helps in reducing the work done in atomic context.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 291f99c..568f936 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -125,6 +125,7 @@ struct vcpu_vmx {
unsigned long host_rsp;
int   launched;
u8fail;
+   u32   exit_intr_info;
u32   idt_vectoring_info;
struct shared_msr_entry *guest_msrs;
int   nmsrs;
@@ -3781,18 +3782,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, 
int tpr, int irr)
vmcs_write32(TPR_THRESHOLD, irr);
 }
 
-static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 {
-   u32 exit_intr_info;
-   u32 idt_vectoring_info = vmx-idt_vectoring_info;
-   bool unblock_nmi;
-   u8 vector;
-   int type;
-   bool idtv_info_valid;
-
-   exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
-   vmx-exit_reason = vmcs_read32(VM_EXIT_REASON);
+   u32 exit_intr_info = vmx-exit_intr_info;
 
/* Handle machine checks before interrupts are enabled */
if ((vmx-exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
@@ -3807,8 +3799,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
asm(int $2);
kvm_after_handle_nmi(vmx-vcpu);
}
+}
 
-   idtv_info_valid = idt_vectoring_info  VECTORING_INFO_VALID_MASK;
+static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
+{
+   u32 exit_intr_info = vmx-exit_intr_info;
+   bool unblock_nmi;
+   u8 vector;
+   bool idtv_info_valid;
+
+   idtv_info_valid = vmx-idt_vectoring_info  VECTORING_INFO_VALID_MASK;
 
if (cpu_has_virtual_nmis()) {
unblock_nmi = (exit_intr_info  INTR_INFO_UNBLOCK_NMI) != 0;
@@ -3830,6 +3830,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
} else if (unlikely(vmx-soft_vnmi_blocked))
vmx-vnmi_blocked_time +=
ktime_to_ns(ktime_sub(ktime_get(), vmx-entry_time));
+}
+
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+{
+   u32 idt_vectoring_info = vmx-idt_vectoring_info;
+   u8 vector;
+   int type;
+   bool idtv_info_valid;
+
+   idtv_info_valid = idt_vectoring_info  VECTORING_INFO_VALID_MASK;
 
vmx-vcpu.arch.nmi_injected = false;
kvm_clear_exception_queue(vmx-vcpu);
@@ -4042,6 +4052,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
asm(mov %0, %%ds; mov %0, %%es : : r(__USER_DS));
vmx-launched = 1;
 
+   vmx-exit_reason = vmcs_read32(VM_EXIT_REASON);
+   vmx-exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+   vmx_complete_atomic_exit(vmx);
+   vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
 }
 
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] KVM: VMX: Move real-mode interrupt injection fixup to vmx_complete_interrupts()

2010-09-16 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

This allows reuse of vmx_complete_interrupts() for cancelling injections.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 568f936..c10d700 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -182,6 +182,7 @@ static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
+static void fixup_rmode_irq(struct vcpu_vmx *vmx);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -3834,11 +3835,15 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx 
*vmx)
 
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
-   u32 idt_vectoring_info = vmx-idt_vectoring_info;
+   u32 idt_vectoring_info;
u8 vector;
int type;
bool idtv_info_valid;
 
+   if (vmx-rmode.irq.pending)
+   fixup_rmode_irq(vmx);
+
+   idt_vectoring_info = vmx-idt_vectoring_info;
idtv_info_valid = idt_vectoring_info  VECTORING_INFO_VALID_MASK;
 
vmx-vcpu.arch.nmi_injected = false;
@@ -4046,8 +4051,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
vcpu-arch.regs_dirty = 0;
 
vmx-idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-   if (vmx-rmode.irq.pending)
-   fixup_rmode_irq(vmx);
 
asm(mov %0, %%ds; mov %0, %%es : : r(__USER_DS));
vmx-launched = 1;
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] KVM: VMX: Parameterize vmx_complete_interrupts() for both exit and entry

2010-09-16 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

Currently vmx_complete_interrupts() can decode event information from vmx
exit fields into the generic kvm event queues.  Make it able to decode
the information from the entry fields as well by parametrizing it.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c10d700..108dbaf 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -182,7 +182,7 @@ static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static void fixup_rmode_irq(struct vcpu_vmx *vmx);
+static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -3833,17 +3833,18 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx 
*vmx)
ktime_to_ns(ktime_sub(ktime_get(), vmx-entry_time));
 }
 
-static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
+ u32 idt_vectoring_info,
+ int instr_len_field,
+ int error_code_field)
 {
-   u32 idt_vectoring_info;
u8 vector;
int type;
bool idtv_info_valid;
 
if (vmx-rmode.irq.pending)
-   fixup_rmode_irq(vmx);
+   fixup_rmode_irq(vmx, idt_vectoring_info);
 
-   idt_vectoring_info = vmx-idt_vectoring_info;
idtv_info_valid = idt_vectoring_info  VECTORING_INFO_VALID_MASK;
 
vmx-vcpu.arch.nmi_injected = false;
@@ -3871,18 +3872,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx 
*vmx)
break;
case INTR_TYPE_SOFT_EXCEPTION:
vmx-vcpu.arch.event_exit_inst_len =
-   vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+   vmcs_read32(instr_len_field);
/* fall through */
case INTR_TYPE_HARD_EXCEPTION:
if (idt_vectoring_info  VECTORING_INFO_DELIVER_CODE_MASK) {
-   u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+   u32 err = vmcs_read32(error_code_field);
kvm_queue_exception_e(vmx-vcpu, vector, err);
} else
kvm_queue_exception(vmx-vcpu, vector);
break;
case INTR_TYPE_SOFT_INTR:
vmx-vcpu.arch.event_exit_inst_len =
-   vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+   vmcs_read32(instr_len_field);
/* fall through */
case INTR_TYPE_EXT_INTR:
kvm_queue_interrupt(vmx-vcpu, vector,
@@ -3893,24 +3894,31 @@ static void vmx_complete_interrupts(struct vcpu_vmx 
*vmx)
}
 }
 
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+{
+   __vmx_complete_interrupts(vmx, vmx-idt_vectoring_info,
+ VM_EXIT_INSTRUCTION_LEN,
+ IDT_VECTORING_ERROR_CODE);
+}
+
 /*
  * Failure to inject an interrupt should give us the information
  * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
  * when fetching the interrupt redirection bitmap in the real-mode
  * tss, this doesn't happen.  So we do it ourselves.
  */
-static void fixup_rmode_irq(struct vcpu_vmx *vmx)
+static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info)
 {
vmx-rmode.irq.pending = 0;
if (kvm_rip_read(vmx-vcpu) + 1 != vmx-rmode.irq.rip)
return;
kvm_rip_write(vmx-vcpu, vmx-rmode.irq.rip);
-   if (vmx-idt_vectoring_info  VECTORING_INFO_VALID_MASK) {
-   vmx-idt_vectoring_info = ~VECTORING_INFO_TYPE_MASK;
-   vmx-idt_vectoring_info |= INTR_TYPE_EXT_INTR;
+   if (*idt_vectoring_info  VECTORING_INFO_VALID_MASK) {
+   *idt_vectoring_info = ~VECTORING_INFO_TYPE_MASK;
+   *idt_vectoring_info |= INTR_TYPE_EXT_INTR;
return;
}
-   vmx-idt_vectoring_info =
+   *idt_vectoring_info =
VECTORING_INFO_VALID_MASK
| INTR_TYPE_EXT_INTR
| vmx-rmode.irq.vector;
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] KVM: Check for pending events before attempting injection

2010-09-16 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

Instead of blindly attempting to inject an event before each guest entry,
check for a possible event first in vcpu-requests.  Sites that can trigger
event injection are modified to set KVM_REQ_EVENT:

- interrupt, nmi window opening
- ppr updates
- i8259 output changes
- local apic irr changes
- rflags updates
- gif flag set
- event set on exit

This improves non-injecting entry performance, and sets the stage for
non-atomic injection.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 6e77471..ab1bb8f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -67,6 +67,7 @@ static void pic_unlock(struct kvm_pic *s)
if (!found)
return;
 
+   kvm_make_request(KVM_REQ_EVENT, found);
kvm_vcpu_kick(found);
}
 }
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 77d8c0f..c6f2f15 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -259,9 +259,10 @@ static inline int apic_find_highest_isr(struct kvm_lapic 
*apic)
 
 static void apic_update_ppr(struct kvm_lapic *apic)
 {
-   u32 tpr, isrv, ppr;
+   u32 tpr, isrv, ppr, old_ppr;
int isr;
 
+   old_ppr = apic_get_reg(apic, APIC_PROCPRI);
tpr = apic_get_reg(apic, APIC_TASKPRI);
isr = apic_find_highest_isr(apic);
isrv = (isr != -1) ? isr : 0;
@@ -274,7 +275,10 @@ static void apic_update_ppr(struct kvm_lapic *apic)
apic_debug(vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x,
   apic, ppr, isr, isrv);
 
-   apic_set_reg(apic, APIC_PROCPRI, ppr);
+   if (old_ppr != ppr) {
+   apic_set_reg(apic, APIC_PROCPRI, ppr);
+   kvm_make_request(KVM_REQ_EVENT, apic-vcpu);
+   }
 }
 
 static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
@@ -391,6 +395,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int 
delivery_mode,
break;
}
 
+   kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
break;
 
@@ -416,6 +421,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int 
delivery_mode,
   INIT on a runnable vcpu %d\n,
   vcpu-vcpu_id);
vcpu-arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+   kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
} else {
apic_debug(Ignoring de-assert INIT to vcpu %d\n,
@@ -430,6 +436,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int 
delivery_mode,
result = 1;
vcpu-arch.sipi_vector = vector;
vcpu-arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
+   kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
}
break;
@@ -475,6 +482,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)
trigger_mode = IOAPIC_EDGE_TRIG;
if (!(apic_get_reg(apic, APIC_SPIV)  APIC_SPIV_DIRECTED_EOI))
kvm_ioapic_update_eoi(apic-vcpu-kvm, vector, trigger_mode);
+   kvm_make_request(KVM_REQ_EVENT, apic-vcpu);
 }
 
 static void apic_send_ipi(struct kvm_lapic *apic)
@@ -1152,6 +1160,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
update_divide_count(apic);
start_apic_timer(apic);
apic-irr_pending = true;
+   kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index eeb08d6..bc317eb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2371,6 +2371,7 @@ static int stgi_interception(struct vcpu_svm *svm)
 
svm-next_rip = kvm_rip_read(svm-vcpu) + 3;
skip_emulated_instruction(svm-vcpu);
+   kvm_make_request(KVM_REQ_EVENT, svm-vcpu);
 
enable_gif(svm);
 
@@ -2763,6 +2764,7 @@ static int interrupt_window_interception(struct vcpu_svm 
*svm)
 {
struct kvm_run *kvm_run = svm-vcpu.run;
 
+   kvm_make_request(KVM_REQ_EVENT, svm-vcpu);
svm_clear_vintr(svm);
svm-vmcb-control.int_ctl = ~V_IRQ_MASK;
/*
@@ -3209,8 +3211,10 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 
svm-int3_injected = 0;
 
-   if (svm-vcpu.arch.hflags  HF_IRET_MASK)
+   if (svm-vcpu.arch.hflags  HF_IRET_MASK) {
svm-vcpu.arch.hflags = ~(HF_NMI_MASK | HF_IRET_MASK);
+   kvm_make_request(KVM_REQ_EVENT, svm-vcpu);
+   }
 
svm-vcpu.arch.nmi_injected = false;
kvm_clear_exception_queue(svm-vcpu);
@@ -3219,6 +3223,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
if (!(exitintinfo  SVM_EXITINTINFO_VALID))
return;
 
+   

[COMMIT master] KVM: Non-atomic interrupt injection

2010-09-16 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

Change the interrupt injection code to work from preemptible, interrupts
enabled context.  This works by adding a -cancel_injection() operation
that undoes an injection in case we were not able to actually enter the guest
(this condition could never happen with atomic injection).

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3a00741..02f9780 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -552,6 +552,7 @@ struct kvm_x86_ops {
void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code,
bool reinject);
+   void (*cancel_injection)(struct kvm_vcpu *vcpu);
int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
int (*nmi_allowed)(struct kvm_vcpu *vcpu);
bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index bc317eb..43f5558 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3261,6 +3261,17 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
}
 }
 
+static void svm_cancel_injection(struct kvm_vcpu *vcpu)
+{
+   struct vcpu_svm *svm = to_svm(vcpu);
+   struct vmcb_control_area *control = svm-vmcb-control;
+
+   control-exit_int_info = control-event_inj;
+   control-exit_int_info_err = control-event_inj_err;
+   control-event_inj = 0;
+   svm_complete_interrupts(svm);
+}
+
 #ifdef CONFIG_X86_64
 #define R r
 #else
@@ -3626,6 +3637,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_irq = svm_set_irq,
.set_nmi = svm_inject_nmi,
.queue_exception = svm_queue_exception,
+   .cancel_injection = svm_cancel_injection,
.interrupt_allowed = svm_interrupt_allowed,
.nmi_allowed = svm_nmi_allowed,
.get_nmi_mask = svm_get_nmi_mask,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 108dbaf..199fa7e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3901,6 +3901,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
  IDT_VECTORING_ERROR_CODE);
 }
 
+static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+{
+   __vmx_complete_interrupts(to_vmx(vcpu),
+ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+ VM_ENTRY_INSTRUCTION_LEN,
+ VM_ENTRY_EXCEPTION_ERROR_CODE);
+
+   vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+}
+
 /*
  * Failure to inject an interrupt should give us the information
  * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
@@ -4354,6 +4364,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_irq = vmx_inject_irq,
.set_nmi = vmx_inject_nmi,
.queue_exception = vmx_queue_exception,
+   .cancel_injection = vmx_cancel_injection,
.interrupt_allowed = vmx_interrupt_allowed,
.nmi_allowed = vmx_nmi_allowed,
.get_nmi_mask = vmx_get_nmi_mask,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e719803..a465bd2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5005,7 +5005,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
int r;
bool req_int_win = !irqchip_in_kernel(vcpu-kvm) 
vcpu-run-request_interrupt_window;
-   bool req_event;
 
if (vcpu-requests) {
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -5041,6 +5040,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (unlikely(r))
goto out;
 
+   if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+   inject_pending_event(vcpu);
+
+   /* enable NMI/IRQ window open exits if needed */
+   if (vcpu-arch.nmi_pending)
+   kvm_x86_ops-enable_nmi_window(vcpu);
+   else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+   kvm_x86_ops-enable_irq_window(vcpu);
+
+   if (kvm_lapic_enabled(vcpu)) {
+   update_cr8_intercept(vcpu);
+   kvm_lapic_sync_to_vapic(vcpu);
+   }
+   }
+
preempt_disable();
 
kvm_x86_ops-prepare_guest_switch(vcpu);
@@ -5053,35 +5067,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
local_irq_disable();
 
-   req_event = kvm_check_request(KVM_REQ_EVENT, vcpu);
-
if (!atomic_read(vcpu-guest_mode) || vcpu-requests
|| need_resched() || signal_pending(current)) {
-   if (req_event)
-   kvm_make_request(KVM_REQ_EVENT, vcpu);
atomic_set(vcpu-guest_mode, 0);
smp_wmb();
local_irq_enable();
preempt_enable();
+   kvm_x86_ops-cancel_injection(vcpu);
r = 1;
goto 

[COMMIT master] KVM: VMX: Move fixup_rmode_irq() to avoid forward declaration

2010-09-16 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

No code changes.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 199fa7e..8ef6199 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -182,7 +182,6 @@ static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -3833,6 +3832,29 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx 
*vmx)
ktime_to_ns(ktime_sub(ktime_get(), vmx-entry_time));
 }
 
+/*
+ * Failure to inject an interrupt should give us the information
+ * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
+ * when fetching the interrupt redirection bitmap in the real-mode
+ * tss, this doesn't happen.  So we do it ourselves.
+ */
+static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info)
+{
+   vmx-rmode.irq.pending = 0;
+   if (kvm_rip_read(vmx-vcpu) + 1 != vmx-rmode.irq.rip)
+   return;
+   kvm_rip_write(vmx-vcpu, vmx-rmode.irq.rip);
+   if (*idt_vectoring_info  VECTORING_INFO_VALID_MASK) {
+   *idt_vectoring_info = ~VECTORING_INFO_TYPE_MASK;
+   *idt_vectoring_info |= INTR_TYPE_EXT_INTR;
+   return;
+   }
+   *idt_vectoring_info =
+   VECTORING_INFO_VALID_MASK
+   | INTR_TYPE_EXT_INTR
+   | vmx-rmode.irq.vector;
+}
+
 static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
  u32 idt_vectoring_info,
  int instr_len_field,
@@ -3911,29 +3933,6 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
 }
 
-/*
- * Failure to inject an interrupt should give us the information
- * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
- * when fetching the interrupt redirection bitmap in the real-mode
- * tss, this doesn't happen.  So we do it ourselves.
- */
-static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info)
-{
-   vmx-rmode.irq.pending = 0;
-   if (kvm_rip_read(vmx-vcpu) + 1 != vmx-rmode.irq.rip)
-   return;
-   kvm_rip_write(vmx-vcpu, vmx-rmode.irq.rip);
-   if (*idt_vectoring_info  VECTORING_INFO_VALID_MASK) {
-   *idt_vectoring_info = ~VECTORING_INFO_TYPE_MASK;
-   *idt_vectoring_info |= INTR_TYPE_EXT_INTR;
-   return;
-   }
-   *idt_vectoring_info =
-   VECTORING_INFO_VALID_MASK
-   | INTR_TYPE_EXT_INTR
-   | vmx-rmode.irq.vector;
-}
-
 #ifdef CONFIG_X86_64
 #define R r
 #define Q q
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] KVM: MMU: Don't track nested fault info in error-code

2010-09-16 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

This patch moves the detection whether a page-fault was
nested or not out of the error code and moves it into a
separate variable in the fault struct.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 02f9780..8c5779d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -322,6 +322,7 @@ struct kvm_vcpu_arch {
struct {
u64  address;
unsigned error_code;
+   bool nested;
} fault;
 
/* only needed in kvm_pv_mmu_op() path, but it's hot so
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 513abbb..7086ca8 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -47,7 +47,6 @@
 #define PFERR_USER_MASK (1U  2)
 #define PFERR_RSVD_MASK (1U  3)
 #define PFERR_FETCH_MASK (1U  4)
-#define PFERR_NESTED_MASK (1U  31)
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a465bd2..a51635e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -342,18 +342,12 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
 
 void kvm_propagate_fault(struct kvm_vcpu *vcpu)
 {
-   u32 nested, error;
-
-   error   = vcpu-arch.fault.error_code;
-   nested  = error   PFERR_NESTED_MASK;
-   error   = error  ~PFERR_NESTED_MASK;
-
-   vcpu-arch.fault.error_code = error;
-
-   if (mmu_is_nested(vcpu)  !nested)
+   if (mmu_is_nested(vcpu)  !vcpu-arch.fault.nested)
vcpu-arch.nested_mmu.inject_page_fault(vcpu);
else
vcpu-arch.mmu.inject_page_fault(vcpu);
+
+   vcpu-arch.fault.nested = false;
 }
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -3524,7 +3518,7 @@ static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, 
gpa_t gpa, u32 access)
access |= PFERR_USER_MASK;
t_gpa  = vcpu-arch.mmu.gva_to_gpa(vcpu, gpa, access, error);
if (t_gpa == UNMAPPED_GVA)
-   vcpu-arch.fault.error_code |= PFERR_NESTED_MASK;
+   vcpu-arch.fault.nested = true;
 
return t_gpa;
 }
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: PATCH: virtio_console: Fix poll blocking even though there is data to read

2010-09-16 Thread Rusty Russell
On Wed, 15 Sep 2010 11:16:24 pm Amit Shah wrote:
 On (Wed) Sep 15 2010 [15:37:21], Hans de Goede wrote:
  --- linux-2.6.35.x86_64/drivers/char/virtio_console.c~2010-08-02 
  00:11:14.0 +0200
  +++ linux-2.6.35.x86_64/drivers/char/virtio_console.c 2010-09-15 
  13:39:29.043505000 +0200
  @@ -642,7 +642,7 @@ static unsigned int port_fops_poll(struc
poll_wait(filp,port-waitqueue, wait);
  
ret = 0;
  - if (port-inbuf)
  + if (!will_read_block(port))
  
  Looks correct, but this should be
  
 if (port_has_data(port))
  
  instead.
  
  That certainly works for me (as in will still fix the bug I'm hitting), but
  quoting from man 2 select:
  
 Three  independent  sets of file descriptors are watched.  Those 
  listed
 in readfds will be watched to see if characters  become  available  
  for
 reading  (more  precisely, to see if a read will not block; in 
  particu‐
 lar, a file descriptor is also ready on end-of-file)
  
  Notice the a file descriptor is also ready on end-of-file, and
  port_fops_read treats the host not being connected as eof (it returns 0
  in that case). So from an API pov I'm not sure what is correct.
 
 poll(2) says:
 
   POLLIN There is data to read.
 
 That makes it simple.

That's a documentation bug.  On a pipe, POLLIN is set when the other end
closes.  read() then returns 0 immediately.

poll() sets POLLIN when read() won't block, and people count on it.

Let's do that, please.
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] vhost-net: fix range checking in mrg bufs case

2010-09-16 Thread Jason Wang
Tested-by: Jason Wang jasow...@redhat.com

- Michael S. Tsirkin m...@redhat.com wrote:

 In mergeable buffer case, we use headcount, log_num
 and seg as indexes in same-size arrays, and
 we know that headcount = seg and
 log_num equals either 0 or seg.
 
 Therefore, the right thing to do is range-check seg,
 not headcount as we do now: these will be different
 if guest chains s/g descriptors (this does not
 happen now, but we can not trust the guest).
 
 Long term, we should add BUG_ON checks to verify
 two other indexes are what we think they should be.
 
 Reported-by: Jason Wang jasow...@redhat.com
 Signed-off-by: Michael S. Tsirkin m...@redhat.com
 ---
 
 Dave, I'll queue this on my tree, no need to bother.
 
  drivers/vhost/net.c |2 +-
  1 files changed, 1 insertions(+), 1 deletions(-)
 
 diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 index 6400cd5..f095de6 100644
 --- a/drivers/vhost/net.c
 +++ b/drivers/vhost/net.c
 @@ -245,7 +245,7 @@ static int get_rx_bufs(struct vhost_virtqueue
 *vq,
   int r, nlogs = 0;
  
   while (datalen  0) {
 - if (unlikely(headcount = VHOST_NET_MAX_SG)) {
 + if (unlikely(seg = VHOST_NET_MAX_SG)) {
   r = -ENOBUFS;
   goto err;
   }
 -- 
 1.7.3.rc1.5.ge5969
 ___
 Virtualization mailing list
 virtualizat...@lists.linux-foundation.org
 https://lists.linux-foundation.org/mailman/listinfo/virtualization
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: PATCH: virtio_console: Fix poll blocking even though there is data to read

2010-09-16 Thread Amit Shah
On (Thu) Sep 16 2010 [15:32:54], Rusty Russell wrote:
 On Wed, 15 Sep 2010 11:16:24 pm Amit Shah wrote:
  On (Wed) Sep 15 2010 [15:37:21], Hans de Goede wrote:
   --- linux-2.6.35.x86_64/drivers/char/virtio_console.c~  2010-08-02 
   00:11:14.0 +0200
   +++ linux-2.6.35.x86_64/drivers/char/virtio_console.c   2010-09-15 
   13:39:29.043505000 +0200
   @@ -642,7 +642,7 @@ static unsigned int port_fops_poll(struc
   poll_wait(filp,port-waitqueue, wait);
   
   ret = 0;
   -   if (port-inbuf)
   +   if (!will_read_block(port))
   
   Looks correct, but this should be
   
if (port_has_data(port))
   
   instead.
   
   That certainly works for me (as in will still fix the bug I'm hitting), 
   but
   quoting from man 2 select:
   
  Three  independent  sets of file descriptors are watched.  Those 
   listed
  in readfds will be watched to see if characters  become  available 
for
  reading  (more  precisely, to see if a read will not block; in 
   particu‐
  lar, a file descriptor is also ready on end-of-file)
   
   Notice the a file descriptor is also ready on end-of-file, and
   port_fops_read treats the host not being connected as eof (it returns 0
   in that case). So from an API pov I'm not sure what is correct.
  
  poll(2) says:
  
POLLIN There is data to read.
  
  That makes it simple.
 
 That's a documentation bug.  On a pipe, POLLIN is set when the other end
 closes.  read() then returns 0 immediately.

Currently we don't set POLLIN when host goes down.  I'll do a second
patch for that.

 poll() sets POLLIN when read() won't block, and people count on it.

Yes, that's the behaviour with Hans's new patch as well -- that's not
changing.

The will_read_block() function (and the comment on top of it) are
causing this confusion:


/* The condition that must be true for polling to end */
static bool will_read_block(struct port *port)
{
if (!port-guest_connected) {
/* Port got hot-unplugged. Let's exit. */
return false;
}
return !port_has_data(port)  port-host_connected;
}

This function is only called to unblock a blocking read() call.  So the
comment there has to be changed to read 'waiting' to end instead of
'polling' to end.

read() does return 0 immediately when the other end is not connected
(and there's no data to read).

In effect, we need:
- Hans's patch
- a patch to set POLLIN when host goes down (in addition to POLLHUP and
  SIGIO)
- a patch to change the comment for will_read_block.

Thanks,

Amit
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Gleb Natapov
On Thu, Sep 16, 2010 at 09:55:38AM +0800, Alec Joseph Rivera wrote:
  That should give us a clue on what's going wrong.
  
 
 I did the steps you mentioned. Here's the gdb transcript:
 
 $ gdb k12_1009 -ex 'target remote localhost:1234'
What is k12_1009? Are you sure this is vmlinux?

BTW what version on kvm/qemu are you using.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Alec Joseph Rivera
On Thu, 2010-09-16 at 08:39 +0200, Gleb Natapov wrote:
 On Thu, Sep 16, 2010 at 09:55:38AM +0800, Alec Joseph Rivera wrote:
   That should give us a clue on what's going wrong.
   
  
  I did the steps you mentioned. Here's the gdb transcript:
  
  $ gdb k12_1009 -ex 'target remote localhost:1234'
 What is k12_1009? Are you sure this is vmlinux?
 

Hi Gleb:

Yes, k12_1009 is the kernel image that isolinux loads.

-- isolinux.cfg

 DEFAULT 1
 TIMEOUT 100
 PROMPT 1
 DISPLAY isolinux.txt
 LABEL 1
 KERNEL k12_1009
 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram 
 LABEL 2
 KERNEL k12_1009
 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram BOOT_SUPPORT=1 

-- file test
 $ file k12_1009 
 k12_1009: Linux kernel x86 boot executable bzImage, version 
 2.6.16.54-0.2.5-bigsmp (ge...@b, RO-rootFS, root_dev 0x301, swap_dev 0x1, 
 Normal VGA

 BTW what version on kvm/qemu are you using.
 

qemu's version is 0.12.5. modinfo kvm doesn't reveal any version string
though, however the kernel is 2.6.35 from Arch.

Question: from what I've read so far kvm versions are of 'kvm-xx' where
xx is a number, on sourceforge there is kvm-kmod-{kernel-version}, am I
correct in assuming and saying then that my kvm version is 2.6.35?

Thanks...

 --
   Gleb.


-- 
--- 


Follow me: 
http://twitter.com/agirivera
Invite as a
friend: 
http://www.facebook.com/agirivera



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Gleb Natapov
On Thu, Sep 16, 2010 at 03:52:17PM +0800, Alec Joseph Rivera wrote:
 On Thu, 2010-09-16 at 08:39 +0200, Gleb Natapov wrote:
  On Thu, Sep 16, 2010 at 09:55:38AM +0800, Alec Joseph Rivera wrote:
That should give us a clue on what's going wrong.

   
   I did the steps you mentioned. Here's the gdb transcript:
   
   $ gdb k12_1009 -ex 'target remote localhost:1234'
  What is k12_1009? Are you sure this is vmlinux?
  
 
 Hi Gleb:
 
 Yes, k12_1009 is the kernel image that isolinux loads.
 
 -- isolinux.cfg
 
  DEFAULT 1
  TIMEOUT 100
  PROMPT 1
  DISPLAY isolinux.txt
  LABEL 1
  KERNEL k12_1009
  APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram 
  LABEL 2
  KERNEL k12_1009
  APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram 
  BOOT_SUPPORT=1 
 
 -- file test
  $ file k12_1009 
  k12_1009: Linux kernel x86 boot executable bzImage, version 
  2.6.16.54-0.2.5-bigsmp (ge...@b, RO-rootFS, root_dev 0x301, swap_dev 0x1, 
  Normal VGA
 
This is bzImage, You need corresponding vmlinux bzImage was created
from.

  BTW what version on kvm/qemu are you using.
  
 
 qemu's version is 0.12.5. modinfo kvm doesn't reveal any version string
 though, however the kernel is 2.6.35 from Arch.
 
 Question: from what I've read so far kvm versions are of 'kvm-xx' where
 xx is a number, on sourceforge there is kvm-kmod-{kernel-version}, am I
 correct in assuming and saying then that my kvm version is 2.6.35?
 
kvm-xx is deprecated. So when asked what kvm are you using you should
provide your kernel version like you did.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Alec Joseph Rivera
On Thu, 2010-09-16 at 09:56 +0200, Gleb Natapov wrote:
 On Thu, Sep 16, 2010 at 03:52:17PM +0800, Alec Joseph Rivera wrote:
  On Thu, 2010-09-16 at 08:39 +0200, Gleb Natapov wrote:
   On Thu, Sep 16, 2010 at 09:55:38AM +0800, Alec Joseph Rivera wrote:
 That should give us a clue on what's going wrong.
 

I did the steps you mentioned. Here's the gdb transcript:

$ gdb k12_1009 -ex 'target remote localhost:1234'
   What is k12_1009? Are you sure this is vmlinux?
   
  
  Hi Gleb:
  
  Yes, k12_1009 is the kernel image that isolinux loads.
  
  -- isolinux.cfg
  
   DEFAULT 1
   TIMEOUT 100
   PROMPT 1
   DISPLAY isolinux.txt
   LABEL 1
   KERNEL k12_1009
   APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram 
   LABEL 2
   KERNEL k12_1009
   APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram 
   BOOT_SUPPORT=1 
  
  -- file test
   $ file k12_1009 
   k12_1009: Linux kernel x86 boot executable bzImage, version 
   2.6.16.54-0.2.5-bigsmp (ge...@b, RO-rootFS, root_dev 0x301, swap_dev 0x1, 
   Normal VGA
  
 This is bzImage, You need corresponding vmlinux bzImage was created
 from.

Oh I see. Thanks for pointing that out. Unfortunately, I don't think I
can get that image. Either I ask someone from Novell or IBM.

I'll try crawling the system's filesystem, maybe I can find it there...
Will keep you updated.

 
   BTW what version on kvm/qemu are you using.
   
  
  qemu's version is 0.12.5. modinfo kvm doesn't reveal any version string
  though, however the kernel is 2.6.35 from Arch.
  
  Question: from what I've read so far kvm versions are of 'kvm-xx' where
  xx is a number, on sourceforge there is kvm-kmod-{kernel-version}, am I
  correct in assuming and saying then that my kvm version is 2.6.35?
  
 kvm-xx is deprecated. So when asked what kvm are you using you should
 provide your kernel version like you did.
 
 --
   Gleb.


-- 
--- 


Follow me: 
http://twitter.com/agirivera
Invite as a
friend: 
http://www.facebook.com/agirivera



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Alec Joseph Rivera
On Wed, 2010-09-15 at 22:37 +0400, Michael Tokarev wrote:
 Alec Joseph Rivera wrote:
 []
  Inode-cache hash table entries: 65536 (order: 6, 262144 bytes)
  Memory: 1022272k/1048564k available (1671k kernel code, 25488k reserved, 
  890k data, 200k init, 131064k highmem)
  Checking if this processor honours the WP bit even in supervisor mode... 
  Ok.
  -- end
 
 Hm.  There was some error with earlier kernels and
 kvm that resulted in exactly this behavour.  Maybe
 even a (guest) kernel bug.. no?  My memory is vague
 in this area, but it tells there was something...
 

Hi Michael:

I suspect has something to do with loops (re: bogomips calibration) or
timers (re: calibration + pci probing). Will verify this when I get the
time to read the source.

Maybe someone more familiar with this kernel area can shed some light on
the topic too...? :-) only if I'm not stretching too much..

 /mjt


-- 
--- 


Follow me: 
http://twitter.com/agirivera
Invite as a
friend: 
http://www.facebook.com/agirivera



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [RFC PATCH 2/2] macvtap: TX zero copy between guest and host kernel

2010-09-16 Thread Xin, Xiaohui
From: Michael S. Tsirkin [mailto:m...@redhat.com]
Sent: Wednesday, September 15, 2010 5:59 PM
To: Xin, Xiaohui
Cc: Shirley Ma; Arnd Bergmann; Avi Kivity; David Miller; 
net...@vger.kernel.org;
kvm@vger.kernel.org; linux-ker...@vger.kernel.org
Subject: Re: [RFC PATCH 2/2] macvtap: TX zero copy between guest and host 
kernel

On Wed, Sep 15, 2010 at 10:46:02AM +0800, Xin, Xiaohui wrote:
 From: Michael S. Tsirkin [mailto:m...@redhat.com]
 Sent: Wednesday, September 15, 2010 12:30 AM
 To: Shirley Ma
 Cc: Arnd Bergmann; Avi Kivity; Xin, Xiaohui; David Miller; 
 net...@vger.kernel.org;
 kvm@vger.kernel.org; linux-ker...@vger.kernel.org
 Subject: Re: [RFC PATCH 2/2] macvtap: TX zero copy between guest and host 
 kernel
 
 On Tue, Sep 14, 2010 at 09:00:25AM -0700, Shirley Ma wrote:
  On Tue, 2010-09-14 at 17:22 +0200, Michael S. Tsirkin wrote:
   I would expect this to hurt performance significantly.
   We could do this for asynchronous requests only to avoid the
   slowdown.
 
  Is kiocb in sendmsg helpful here? It is not used now.
 
  Shirley
 
 Precisely. This is what the patch from Xin Xiaohui does.  That code
 already seems to do most of what you are trying to do, right?
 
 The main thing missing seems to be macvtap integration, so that we can fall 
 back
 on data copy if zero copy is unavailable?
 How hard would it be to basically link the mp and macvtap modules
 together to get us this functionality? Anyone?
 
 Michael,
 Is to support macvtap with zero-copy through mp device the functionality
 you mentioned above?

I have trouble parsing the above question.  At some point Arnd suggested
that the mp device functionality would fit nicely as part of the macvtap
driver.  It seems to make sense superficially, the advantage if it
worked would be that we would get zero copy (mostly) transparently.

Do you agree with this goal?


I would say yes.

 Before Shirley Ma has suggested to move the zero-copy functionality into
 tun/tap device or macvtap device. How do you think about that?
 I suspect
 there will be a lot of duplicate code in that three drivers except we can 
 extract
 code of zero-copy into kernel APIs and vhost APIs.


tap would be very hard at this point as it does not bind to a device.
macvtap might work, we mainly need to figure out a way to detect that
device can do zero copy so the right mode is used.  I think a first step
could be to simply link mp code into macvtap module, pass necessary
ioctls on, then move some code around as necessary.  This might get rid
of code duplication nicely.

I'll look into this to see how much effort would be.



 Do you think that's worth to do and help current process which is blocked too
 long than I expected?

I think it's nice to have.

And if done hopefully this will get the folk working on the macvtap
driver to review the code, which will help find all issues faster.

I think if you post some performance numbers,
this will also help get people excited and looking at the code.


The performance data I have posted before is compared with raw socket on 
vhost-net.
But currently, the raw socket backend is removed from the qemu side.
So I can only compare with tap on vhost-net. But unfortunately, I missed 
something
that I even can't bring it up. I was blocked by this for a time.

I also don't see the process as completely blocked, each review round points
out more issues: we aren't going back and forth changing
same lines again and again, are we?

One thing that might help is increase the frequency of updates,
try sending them out sooner.
On the other hand 10 new patches each revision is a lot:
if there is a part of patchset that has stabilised you can split it out,
post once and keep posting the changing part separately.

I hope these suggestions help.

Thanks, Michael!


 
 --
 MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Dave Young
On Thu, Sep 16, 2010 at 4:17 PM, Alec Joseph Rivera eij...@gmail.com wrote:
 On Wed, 2010-09-15 at 22:37 +0400, Michael Tokarev wrote:
 Alec Joseph Rivera wrote:
 []
  Inode-cache hash table entries: 65536 (order: 6, 262144 bytes)
  Memory: 1022272k/1048564k available (1671k kernel code, 25488k reserved, 
  890k data, 200k init, 131064k highmem)
  Checking if this processor honours the WP bit even in supervisor mode... 
  Ok.
  -- end

 Hm.  There was some error with earlier kernels and
 kvm that resulted in exactly this behavour.  Maybe
 even a (guest) kernel bug.. no?  My memory is vague
 in this area, but it tells there was something...


 Hi Michael:

 I suspect has something to do with loops (re: bogomips calibration) or
 timers (re: calibration + pci probing). Will verify this when I get the
 time to read the source.

 Maybe someone more familiar with this kernel area can shed some light on
 the topic too...? :-) only if I'm not stretching too much..

boot with param lpj=xxx to avoid calibration

copy xxx value from your normal boot kernel messages

 /mjt


 --
 ---


 Follow me:
 http://twitter.com/agirivera
 Invite as a
 friend:
 http://www.facebook.com/agirivera



 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html




-- 
Regards
dave
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Michael S. Tsirkin
On Thu, Sep 16, 2010 at 11:02:56AM +0200, Gleb Natapov wrote:
 On Wed, Sep 15, 2010 at 08:54:17PM +0200, Michael S. Tsirkin wrote:
  To avoid bouncing irqfd injection out to workqueue context,
  we must support injecting irqs from local interrupt
  context. Doing this seems to only require disabling
  irqs locally.
  
  RFC, completely untested, x86 only.
  
  Thoughts?
  
 We do not want to disable irqs for a long time and some of code paths
 under lock involve looping over all cpus. For MSI injection path is
 lockless and this is the only case that matters,

MSI only appeared in rhel6, older guests still use level interrupts.
Which paths require looping over all cpus? Do PCI interrupts
need this?

 so inject irqs from
 local interrupt context if it is MSI or from workqueue otherwise.

OK, but this would need a rework of irqfd code as it is currently unaware
of interrupt type.

  Signed-off-by: Michael S. Tsirkin m...@redhat.com
  ---
   arch/x86/kvm/i8259.c |   12 
   virt/kvm/eventfd.c   |   30 +-
   virt/kvm/ioapic.c|   34 --
   3 files changed, 41 insertions(+), 35 deletions(-)
  
  diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
  index 8d10c06..294fa36 100644
  --- a/arch/x86/kvm/i8259.c
  +++ b/arch/x86/kvm/i8259.c
  @@ -198,8 +198,10 @@ void kvm_pic_update_irq(struct kvm_pic *s)
   int kvm_pic_set_irq(void *opaque, int irq, int level)
   {
  struct kvm_pic *s = opaque;
  +   unsigned long flags;
  int ret = -1;
   
  +   local_irq_save(flags);
  pic_lock(s);
  if (irq = 0  irq  PIC_NUM_PINS) {
  ret = pic_set_irq1(s-pics[irq  3], irq  7, level);
  @@ -208,6 +210,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
s-pics[irq  3].imr, ret == 0);
  }
  pic_unlock(s);
  +   local_irq_restore(flags);
   
  return ret;
   }
  @@ -235,8 +238,10 @@ static inline void pic_intack(struct kvm_kpic_state 
  *s, int irq)
   int kvm_pic_read_irq(struct kvm *kvm)
   {
  int irq, irq2, intno;
  +   unsigned long flags;
  struct kvm_pic *s = pic_irqchip(kvm);
   
  +   local_irq_save(flags);
  pic_lock(s);
  irq = pic_get_irq(s-pics[0]);
  if (irq = 0) {
  @@ -263,6 +268,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
  }
  pic_update_irq(s);
  pic_unlock(s);
  +   local_irq_restore(flags);
   
  return intno;
   }
  @@ -476,6 +482,7 @@ static int picdev_write(struct kvm_io_device *this,
   {
  struct kvm_pic *s = to_pic(this);
  unsigned char data = *(unsigned char *)val;
  +   unsigned long flags;
  if (!picdev_in_range(addr))
  return -EOPNOTSUPP;
   
  @@ -484,6 +491,7 @@ static int picdev_write(struct kvm_io_device *this,
  printk(KERN_ERR PIC: non byte write\n);
  return 0;
  }
  +   local_irq_save(flags);
  pic_lock(s);
  switch (addr) {
  case 0x20:
  @@ -498,6 +506,7 @@ static int picdev_write(struct kvm_io_device *this,
  break;
  }
  pic_unlock(s);
  +   local_irq_restore(flags);
  return 0;
   }
   
  @@ -506,6 +515,7 @@ static int picdev_read(struct kvm_io_device *this,
   {
  struct kvm_pic *s = to_pic(this);
  unsigned char data = 0;
  +   unsigned long flags;
  if (!picdev_in_range(addr))
  return -EOPNOTSUPP;
   
  @@ -514,6 +524,7 @@ static int picdev_read(struct kvm_io_device *this,
  printk(KERN_ERR PIC: non byte read\n);
  return 0;
  }
  +   local_irq_save(flags);
  pic_lock(s);
  switch (addr) {
  case 0x20:
  @@ -529,6 +540,7 @@ static int picdev_read(struct kvm_io_device *this,
  }
  *(unsigned char *)val = data;
  pic_unlock(s);
  +   local_irq_restore(flags);
  return 0;
   }
   
  diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
  index 66cf65b..30ede90 100644
  --- a/virt/kvm/eventfd.c
  +++ b/virt/kvm/eventfd.c
  @@ -50,22 +50,11 @@ struct _irqfd {
  struct list_head  list;
  poll_tablept;
  wait_queue_t  wait;
  -   struct work_structinject;
  struct work_structshutdown;
   };
   
   static struct workqueue_struct *irqfd_cleanup_wq;
   
  -static void
  -irqfd_inject(struct work_struct *work)
  -{
  -   struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
  -   struct kvm *kvm = irqfd-kvm;
  -
  -   kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 1);
  -   kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 0);
  -}
  -
   /*
* Race-free decouple logic (ordering is critical)
*/
  @@ -82,12 +71,6 @@ irqfd_shutdown(struct work_struct *work)
  eventfd_ctx_remove_wait_queue(irqfd-eventfd, irqfd-wait, cnt);
   
  /*
  -* We know no new events will be scheduled at this point, so block
  -* until all previously outstanding events have completed
  -*/
  -   flush_work(irqfd-inject);
  -
  -  

Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Gleb Natapov
On Thu, Sep 16, 2010 at 11:10:55AM +0200, Michael S. Tsirkin wrote:
 On Thu, Sep 16, 2010 at 11:02:56AM +0200, Gleb Natapov wrote:
  On Wed, Sep 15, 2010 at 08:54:17PM +0200, Michael S. Tsirkin wrote:
   To avoid bouncing irqfd injection out to workqueue context,
   we must support injecting irqs from local interrupt
   context. Doing this seems to only require disabling
   irqs locally.
   
   RFC, completely untested, x86 only.
   
   Thoughts?
   
  We do not want to disable irqs for a long time and some of code paths
  under lock involve looping over all cpus. For MSI injection path is
  lockless and this is the only case that matters,
 
 MSI only appeared in rhel6, older guests still use level interrupts.
So they are already slow for other reasons.

 Which paths require looping over all cpus? Do PCI interrupts
 need this?
 
All interrupts need it. IOAPIC has a loop to find dst cpu for interrupt.
Pic has a loop to find cpu in virtual wire mode.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] dma_rw.h (was Re: [PATCH 0/7] AMD IOMMU emulation patchset v4)

2010-09-16 Thread Michael S. Tsirkin
On Thu, Sep 16, 2010 at 10:06:16AM +0300, Eduard - Gabriel Munteanu wrote:
 On Mon, Sep 13, 2010 at 10:01:20PM +0200, Michael S. Tsirkin wrote:
  So I think the following will give the idea of what an API
  might look like that will let us avoid the scary hacks in
  e.g. the ide layer and other generic layers that need to do DMA,
  without either binding us to pci, adding more complexity with
  callbacks, or losing type safety with casts and void*.
  
  Basically we have DMADevice that we can use container_of on
  to get a PCIDevice from, and DMAMmu that will get instanciated
  in a specific MMU.
  
  This is not complete code - just a header - I might complete
  this later if/when there's interest or hopefully someone interested
  in iommu emulation will.
 
 Hi,
 
 I personally like this approach better. It also seems to make poisoning
 cpu_physical_memory_*() easier if we convert every device to this API.
 We could then ban cpu_physical_memory_*(), perhaps by requiring a
 #define and #ifdef-ing those declarations.
 
  Notes:
  the IOMMU_PERM_RW code seem unused, so I replaced
  this with plain is_write. Is it ever useful?
 
 The original idea made provisions for stuff like full R/W memory maps.
 In that case cpu_physical_memory_map() would call the translation /
 checking function with perms == IOMMU_PERM_RW. That's not there yet so
 it can be removed at the moment, especially since it only affects these
 helpers.
 
 Also, I'm not sure if there are other sorts of accesses besides reads
 and writes we want to check or translate.
 
  It seems that invalidate callback should be able to
  get away with just a device, so I switched to that
  from a void pointer for type safety.
  Seems enough for the users I saw.
 
 I think this makes matters too complicated. Normally, a single DMADevice
 should be embedded within a busDevice,

No, DMADevice is a device that does DMA.
So e.g. a PCI device would embed one.
Remember, traslations are per device, right?
DMAMmu is part of the iommu object.

 so doing this makes it really
 hard to invalidate a specific map when there are more of them. It forces
 device code to act as a bus, provide fake 'DMADevice's for each map and
 dispatch translation to the real DMATranslateFunc. I see no other way.
 
 If you really want more type-safety (although I think this is a case of
 a true opaque identifying something only device code understands), I
 have another proposal: have a DMAMap embedded in the opaque. Example
 from dma-helpers.c:
 
 typedef struct {
   DMADevice *owner;
   [...]
 } DMAMap;
 
 typedef struct {
   [...]
   DMAMap map;
   [...]
 } DMAAIOCB;
 
 /* The callback. */
 static void dma_bdrv_cancel(DMAMap *map)
 {
   DMAAIOCB *dbs = container_of(map, DMAAIOCB, map);
 
   [...]
 }
 
 The upside is we only need to pass the DMAMap. That can also contain
 details of the actual map in case the device wants to release only the
 relevant range and remap the rest.

Fine.
Or maybe DMAAIOCB (just make some letters lower case: DMAIocb?).
Everyone will use it anyway, right?

  I saw devices do stl_le_phys and such, these
  might need to be wrapped as well.
 
 stl_le_phys() is defined and used only by hw/eepro100.c. That's already
 dealt with by converting the device.
 

I see. Need to get around to adding some prefix to it to make this clear.

   Thanks,
   Eduard
 
  Signed-off-by: Michael S. Tsirkin m...@redhat.com
  
  ---
  
  diff --git a/hw/dma_rw.h b/hw/dma_rw.h
  new file mode 100644
  index 000..d63fd17
  --- /dev/null
  +++ b/hw/dma_rw.h
  @@ -0,0 +1,122 @@
  +#ifndef DMA_RW_H
  +#define DMA_RW_H
  +
  +#include qemu-common.h
  +
  +/* We currently only have pci mmus, but using
  +   a generic type makes it possible to use this
  +   e.g. from the generic ide code without callbacks. */
  +typedef uint64_t dma_addr_t;
  +
  +typedef struct DMAMmu DMAMmu;
  +typedef struct DMADevice DMADevice;
  +
  +typedef int DMATranslateFunc(DMAMmu *mmu,
  + DMADevice *dev,
  + dma_addr_t addr,
  + dma_addr_t *paddr,
  + dma_addr_t *len,
  + int is_write);
  +
  +typedef int DMAInvalidateMapFunc(DMADevice *);
  +struct DMAMmu {
  +   /* invalidate, etc. */
  +   DmaTranslateFunc *translate;
  +};
  +
  +struct DMADevice {
  +   DMAMmu *mmu;
  +   DMAInvalidateMapFunc *invalidate;
  +};
  +
  +void dma_device_init(DMADevice *, DMAMmu *, DMAInvalidateMapFunc *);
  +
  +static inline void dma_memory_rw(DMADevice *dev,
  +dma_addr_t addr,
  +void *buf,
  +uint32_t len,
  +int is_write)
  +{
  +uint32_t plen;
  +/* Fast-path non-iommu.
  + * More importantly, makes it obvious what this function does. */
  +if (!dev-mmu) {
  +   cpu_physical_memory_rw(paddr, buf, plen, is_write);
  

Re: [Qemu-devel] [PATCH RFC] dma_rw.h (was Re: [PATCH 0/7] AMD IOMMU emulation patchset v4)

2010-09-16 Thread Michael S. Tsirkin
On Mon, Sep 13, 2010 at 03:45:34PM -0500, Anthony Liguori wrote:
 On 09/13/2010 03:01 PM, Michael S. Tsirkin wrote:
 So I think the following will give the idea of what an API
 might look like that will let us avoid the scary hacks in
 e.g. the ide layer and other generic layers that need to do DMA,
 without either binding us to pci, adding more complexity with
 callbacks, or losing type safety with casts and void*.
 
 Basically we have DMADevice that we can use container_of on
 to get a PCIDevice from, and DMAMmu that will get instanciated
 in a specific MMU.
 
 This is not complete code - just a header - I might complete
 this later if/when there's interest or hopefully someone interested
 in iommu emulation will.
 
 Notes:
 the IOMMU_PERM_RW code seem unused, so I replaced
 this with plain is_write. Is it ever useful?
 
 It seems that invalidate callback should be able to
 get away with just a device, so I switched to that
 from a void pointer for type safety.
 Seems enough for the users I saw.
 
 I saw devices do stl_le_phys and such, these
 might need to be wrapped as well.
 
 Signed-off-by: Michael S. Tsirkinm...@redhat.com
 
 One of the troubles with an interface like this is that I'm not sure
 a generic model universally works.
 
 For instance, I know some PCI busses do transparent byte swapping.
 For this to work, there has to be a notion of generic memory
 reads/writes vs. reads of a 32-bit, 16-bit, and 8-bit value.
 
 With a generic API, we lose the flexibility to do this type of bus
 interface.
 
 Regards,
 
 Anthony Liguori

Surely only PCI root can do such tricks.
Anyway, I suspect what you refer to is byte swapping of config cycles
and similar IO done by driver.  If a bus byteswapped a DMA transaction,
this basically breaks DMA as driver will have to go and fix up all data
before passing it up to the OS. Right?

We'd have to add more wrappers to emulate such insanity,
as MMU intentionally only handles translation.


 ---
 
 diff --git a/hw/dma_rw.h b/hw/dma_rw.h
 new file mode 100644
 index 000..d63fd17
 --- /dev/null
 +++ b/hw/dma_rw.h
 @@ -0,0 +1,122 @@
 +#ifndef DMA_RW_H
 +#define DMA_RW_H
 +
 +#include qemu-common.h
 +
 +/* We currently only have pci mmus, but using
 +   a generic type makes it possible to use this
 +   e.g. from the generic ide code without callbacks. */
 +typedef uint64_t dma_addr_t;
 +
 +typedef struct DMAMmu DMAMmu;
 +typedef struct DMADevice DMADevice;
 +
 +typedef int DMATranslateFunc(DMAMmu *mmu,
 + DMADevice *dev,
 + dma_addr_t addr,
 + dma_addr_t *paddr,
 + dma_addr_t *len,
 + int is_write);
 +
 +typedef int DMAInvalidateMapFunc(DMADevice *);
 +struct DMAMmu {
 +/* invalidate, etc. */
 +DmaTranslateFunc *translate;
 +};
 +
 +struct DMADevice {
 +DMAMmu *mmu;
 +DMAInvalidateMapFunc *invalidate;
 +};
 +
 +void dma_device_init(DMADevice *, DMAMmu *, DMAInvalidateMapFunc *);
 +
 +static inline void dma_memory_rw(DMADevice *dev,
 + dma_addr_t addr,
 + void *buf,
 + uint32_t len,
 + int is_write)
 +{
 +uint32_t plen;
 +/* Fast-path non-iommu.
 + * More importantly, makes it obvious what this function does. */
 +if (!dev-mmu) {
 +cpu_physical_memory_rw(paddr, buf, plen, is_write);
 +return;
 +}
 +while (len) {
 +err = dev-mmu-translate(iommu, dev, addr,paddr,plen, is_write);
 +if (err) {
 +return;
 +}
 +
 +/* The translation might be valid for larger regions. */
 +if (plen  len) {
 +plen = len;
 +}
 +
 +cpu_physical_memory_rw(paddr, buf, plen, is_write);
 +
 +len -= plen;
 +addr += plen;
 +buf += plen;
 +}
 +}
 +
 +void *dma_memory_map(DMADevice *dev,
 +dma_addr_t addr,
 +uint32_t *len,
 +int is_write);
 +void dma_memory_unmap(DMADevice *dev,
 +  void *buffer,
 +  uint32_t len,
 +  int is_write,
 +  uint32_t access_len);
 +
 +
 ++#define DEFINE_DMA_LD(suffix, size)   \
 ++uint##size##_t dma_ld##suffix(DMADevice *dev, dma_addr_t addr)\
 ++{ \
 ++int err;  \
 ++target_phys_addr_t paddr, plen;   \
 ++if (!dev-mmu) {  \
 ++return ld##suffix##_phys(addr, val);  \
 ++} \
 ++

Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Michael S. Tsirkin
On Thu, Sep 16, 2010 at 11:25:53AM +0200, Gleb Natapov wrote:
 On Thu, Sep 16, 2010 at 11:10:55AM +0200, Michael S. Tsirkin wrote:
  On Thu, Sep 16, 2010 at 11:02:56AM +0200, Gleb Natapov wrote:
   On Wed, Sep 15, 2010 at 08:54:17PM +0200, Michael S. Tsirkin wrote:
To avoid bouncing irqfd injection out to workqueue context,
we must support injecting irqs from local interrupt
context. Doing this seems to only require disabling
irqs locally.

RFC, completely untested, x86 only.

Thoughts?

   We do not want to disable irqs for a long time and some of code paths
   under lock involve looping over all cpus. For MSI injection path is
   lockless and this is the only case that matters,
  
  MSI only appeared in rhel6, older guests still use level interrupts.
 So they are already slow for other reasons.

They are slower than MSI, but I do not want irqfd to be slower
than ioctls.

  Which paths require looping over all cpus? Do PCI interrupts
  need this?
  
 All interrupts need it. IOAPIC has a loop to find dst cpu for interrupt.
 Pic has a loop to find cpu in virtual wire mode.
 
 --
   Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Alec Joseph Rivera
On Thu, 2010-09-16 at 16:32 +0800, Dave Young wrote:
 On Thu, Sep 16, 2010 at 4:17 PM, Alec Joseph Rivera eij...@gmail.com wrote:
  On Wed, 2010-09-15 at 22:37 +0400, Michael Tokarev wrote:
  Alec Joseph Rivera wrote:
  []
   Inode-cache hash table entries: 65536 (order: 6, 262144 bytes)
   Memory: 1022272k/1048564k available (1671k kernel code, 25488k 
   reserved, 890k data, 200k init, 131064k highmem)
   Checking if this processor honours the WP bit even in supervisor 
   mode... Ok.
   -- end
 
  Hm.  There was some error with earlier kernels and
  kvm that resulted in exactly this behavour.  Maybe
  even a (guest) kernel bug.. no?  My memory is vague
  in this area, but it tells there was something...
 
 
  Hi Michael:
 
  I suspect has something to do with loops (re: bogomips calibration) or
  timers (re: calibration + pci probing). Will verify this when I get the
  time to read the source.
 
  Maybe someone more familiar with this kernel area can shed some light on
  the topic too...? :-) only if I'm not stretching too much..
 
 boot with param lpj=xxx to avoid calibration
 
 copy xxx value from your normal boot kernel messages

Hi Dave:

Crossing out calibration, added the lpj parameter (and combinations with
-no-hpet and -no-kvm-irqchip) but still hangs after WP bit checking.

 
  /mjt
 
 
  --
  ---
 
 
  Follow me:
  http://twitter.com/agirivera
  Invite as a
  friend:
  http://www.facebook.com/agirivera
 
 
 
  --
  To unsubscribe from this list: send the line unsubscribe kvm in
  the body of a message to majord...@vger.kernel.org
  More majordomo info at  http://vger.kernel.org/majordomo-info.html
 
 
 
 


-- 
--- 


Follow me: 
http://twitter.com/agirivera
Invite as a
friend: 
http://www.facebook.com/agirivera



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Michael S. Tsirkin
On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote:
  On 09/16/2010 11:25 AM, Gleb Natapov wrote:
 
   MSI only appeared in rhel6, older guests still use level interrupts.
 So they are already slow for other reasons.
 
 
 Exactly, for example they need to exit to userspace to ack the
 interrupt.  That's far slower than the workqueue.

Well, this is not exactly comparable: you might get
same irq asserted multiple times and only deasserted once.


 -- 
 error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 2/2] macvtap: TX zero copy between guest and host kernel

2010-09-16 Thread Michael S. Tsirkin
On Thu, Sep 16, 2010 at 04:18:10PM +0800, Xin, Xiaohui wrote:
 From: Michael S. Tsirkin [mailto:m...@redhat.com]
 Sent: Wednesday, September 15, 2010 5:59 PM
 To: Xin, Xiaohui
 Cc: Shirley Ma; Arnd Bergmann; Avi Kivity; David Miller; 
 net...@vger.kernel.org;
 kvm@vger.kernel.org; linux-ker...@vger.kernel.org
 Subject: Re: [RFC PATCH 2/2] macvtap: TX zero copy between guest and host 
 kernel
 
 On Wed, Sep 15, 2010 at 10:46:02AM +0800, Xin, Xiaohui wrote:
  From: Michael S. Tsirkin [mailto:m...@redhat.com]
  Sent: Wednesday, September 15, 2010 12:30 AM
  To: Shirley Ma
  Cc: Arnd Bergmann; Avi Kivity; Xin, Xiaohui; David Miller; 
  net...@vger.kernel.org;
  kvm@vger.kernel.org; linux-ker...@vger.kernel.org
  Subject: Re: [RFC PATCH 2/2] macvtap: TX zero copy between guest and host 
  kernel
  
  On Tue, Sep 14, 2010 at 09:00:25AM -0700, Shirley Ma wrote:
   On Tue, 2010-09-14 at 17:22 +0200, Michael S. Tsirkin wrote:
I would expect this to hurt performance significantly.
We could do this for asynchronous requests only to avoid the
slowdown.
  
   Is kiocb in sendmsg helpful here? It is not used now.
  
   Shirley
  
  Precisely. This is what the patch from Xin Xiaohui does.  That code
  already seems to do most of what you are trying to do, right?
  
  The main thing missing seems to be macvtap integration, so that we can 
  fall back
  on data copy if zero copy is unavailable?
  How hard would it be to basically link the mp and macvtap modules
  together to get us this functionality? Anyone?
  
  Michael,
  Is to support macvtap with zero-copy through mp device the functionality
  you mentioned above?
 
 I have trouble parsing the above question.  At some point Arnd suggested
 that the mp device functionality would fit nicely as part of the macvtap
 driver.  It seems to make sense superficially, the advantage if it
 worked would be that we would get zero copy (mostly) transparently.
 
 Do you agree with this goal?
 
 
 I would say yes.

In that case, it's a blocker for upstream merge because this change
affects userspace.

  Before Shirley Ma has suggested to move the zero-copy functionality into
  tun/tap device or macvtap device. How do you think about that?
  I suspect
  there will be a lot of duplicate code in that three drivers except we can 
  extract
  code of zero-copy into kernel APIs and vhost APIs.
 
 
 tap would be very hard at this point as it does not bind to a device.
 macvtap might work, we mainly need to figure out a way to detect that
 device can do zero copy so the right mode is used.  I think a first step
 could be to simply link mp code into macvtap module, pass necessary
 ioctls on, then move some code around as necessary.  This might get rid
 of code duplication nicely.
 
 I'll look into this to see how much effort would be.
 
 
 
  Do you think that's worth to do and help current process which is blocked 
  too
  long than I expected?
 
 I think it's nice to have.
 
 And if done hopefully this will get the folk working on the macvtap
 driver to review the code, which will help find all issues faster.
 
 I think if you post some performance numbers,
 this will also help get people excited and looking at the code.
 
 
 The performance data I have posted before is compared with raw socket on 
 vhost-net.
 But currently, the raw socket backend is removed from the qemu side.
 So I can only compare with tap on vhost-net. But unfortunately, I missed 
 something
 that I even can't bring it up. I was blocked by this for a time.

Hey, maybe you are seeing the bug that was reported recently.
Could you try tcpdump -i on the tap interface in host and ethX on guest
and tell me what you see?
If you see packet in guest but not in host, could you try
adding printks in vhost handle_tx to see whether it gets called
and if yes where it fails?

 I also don't see the process as completely blocked, each review round points
 out more issues: we aren't going back and forth changing
 same lines again and again, are we?
 
 One thing that might help is increase the frequency of updates,
 try sending them out sooner.
 On the other hand 10 new patches each revision is a lot:
 if there is a part of patchset that has stabilised you can split it out,
 post once and keep posting the changing part separately.
 
 I hope these suggestions help.
 
 Thanks, Michael!
 
 
  
  --
  MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Alec Joseph Rivera
On Thu, 2010-09-16 at 16:03 +0800, Alec Joseph Rivera wrote:
 On Thu, 2010-09-16 at 09:56 +0200, Gleb Natapov wrote:
  On Thu, Sep 16, 2010 at 03:52:17PM +0800, Alec Joseph Rivera wrote:
   On Thu, 2010-09-16 at 08:39 +0200, Gleb Natapov wrote:
On Thu, Sep 16, 2010 at 09:55:38AM +0800, Alec Joseph Rivera wrote:
  That should give us a clue on what's going wrong.
  
 
 I did the steps you mentioned. Here's the gdb transcript:
 
 $ gdb k12_1009 -ex 'target remote localhost:1234'
What is k12_1009? Are you sure this is vmlinux?

   
   Hi Gleb:
   
   Yes, k12_1009 is the kernel image that isolinux loads.
   
   -- isolinux.cfg
   
DEFAULT 1
TIMEOUT 100
PROMPT 1
DISPLAY isolinux.txt
LABEL 1
KERNEL k12_1009
APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram 
LABEL 2
KERNEL k12_1009
APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram 
BOOT_SUPPORT=1 
   
   -- file test
$ file k12_1009 
k12_1009: Linux kernel x86 boot executable bzImage, version 
2.6.16.54-0.2.5-bigsmp (ge...@b, RO-rootFS, root_dev 0x301, swap_dev 
0x1, Normal VGA
   
  This is bzImage, You need corresponding vmlinux bzImage was created
  from.
 
 Oh I see. Thanks for pointing that out. Unfortunately, I don't think I
 can get that image. Either I ask someone from Novell or IBM.
 
 I'll try crawling the system's filesystem, maybe I can find it there...
 Will keep you updated.
 

No vmlinux/z image found on cd or an installed system :-(


  
BTW what version on kvm/qemu are you using.

   
   qemu's version is 0.12.5. modinfo kvm doesn't reveal any version string
   though, however the kernel is 2.6.35 from Arch.
   
   Question: from what I've read so far kvm versions are of 'kvm-xx' where
   xx is a number, on sourceforge there is kvm-kmod-{kernel-version}, am I
   correct in assuming and saying then that my kvm version is 2.6.35?
   
  kvm-xx is deprecated. So when asked what kvm are you using you should
  provide your kernel version like you did.
  
  --
  Gleb.
 
 


-- 
--- 


Follow me: 
http://twitter.com/agirivera
Invite as a
friend: 
http://www.facebook.com/agirivera



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Gleb Natapov
On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote:
 On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote:
   On 09/16/2010 11:25 AM, Gleb Natapov wrote:
  
MSI only appeared in rhel6, older guests still use level interrupts.
  So they are already slow for other reasons.
  
  
  Exactly, for example they need to exit to userspace to ack the
  interrupt.  That's far slower than the workqueue.
 
 Well, this is not exactly comparable: you might get
 same irq asserted multiple times and only deasserted once.
 
Are we talking about level interrupts? Why would you assert level
triggered interrupt multiple times before deasserting it?

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Gleb Natapov
On Thu, Sep 16, 2010 at 06:11:36PM +0800, Alec Joseph Rivera wrote:
 On Thu, 2010-09-16 at 16:03 +0800, Alec Joseph Rivera wrote:
  On Thu, 2010-09-16 at 09:56 +0200, Gleb Natapov wrote:
   On Thu, Sep 16, 2010 at 03:52:17PM +0800, Alec Joseph Rivera wrote:
On Thu, 2010-09-16 at 08:39 +0200, Gleb Natapov wrote:
 On Thu, Sep 16, 2010 at 09:55:38AM +0800, Alec Joseph Rivera wrote:
   That should give us a clue on what's going wrong.
   
  
  I did the steps you mentioned. Here's the gdb transcript:
  
  $ gdb k12_1009 -ex 'target remote localhost:1234'
 What is k12_1009? Are you sure this is vmlinux?
 

Hi Gleb:

Yes, k12_1009 is the kernel image that isolinux loads.

-- isolinux.cfg

 DEFAULT 1
 TIMEOUT 100
 PROMPT 1
 DISPLAY isolinux.txt
 LABEL 1
 KERNEL k12_1009
 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram 
 LABEL 2
 KERNEL k12_1009
 APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram 
 BOOT_SUPPORT=1 

-- file test
 $ file k12_1009 
 k12_1009: Linux kernel x86 boot executable bzImage, version 
 2.6.16.54-0.2.5-bigsmp (ge...@b, RO-rootFS, root_dev 0x301, swap_dev 
 0x1, Normal VGA

   This is bzImage, You need corresponding vmlinux bzImage was created
   from.
  
  Oh I see. Thanks for pointing that out. Unfortunately, I don't think I
  can get that image. Either I ask someone from Novell or IBM.
  
  I'll try crawling the system's filesystem, maybe I can find it there...
  Will keep you updated.
  
 
 No vmlinux/z image found on cd or an installed system :-(
 
 
It may be hidden somewhere in debug rpm.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Michael S. Tsirkin
On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote:
 On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote:
  On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote:
On 09/16/2010 11:25 AM, Gleb Natapov wrote:
   
 MSI only appeared in rhel6, older guests still use level interrupts.
   So they are already slow for other reasons.
   
   
   Exactly, for example they need to exit to userspace to ack the
   interrupt.  That's far slower than the workqueue.
  
  Well, this is not exactly comparable: you might get
  same irq asserted multiple times and only deasserted once.
  
 Are we talking about level interrupts? Why would you assert level
 triggered interrupt multiple times before deasserting it?

User of irqfd has no way to know what current interrupt level is.
So it has to keep asserting.

 --
   Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Gleb Natapov
On Thu, Sep 16, 2010 at 12:13:39PM +0200, Michael S. Tsirkin wrote:
 On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote:
  On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote:
   On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote:
 On 09/16/2010 11:25 AM, Gleb Natapov wrote:

  MSI only appeared in rhel6, older guests still use level interrupts.
So they are already slow for other reasons.


Exactly, for example they need to exit to userspace to ack the
interrupt.  That's far slower than the workqueue.
   
   Well, this is not exactly comparable: you might get
   same irq asserted multiple times and only deasserted once.
   
  Are we talking about level interrupts? Why would you assert level
  triggered interrupt multiple times before deasserting it?
 
 User of irqfd has no way to know what current interrupt level is.
 So it has to keep asserting.
 
Why can't it keep track of current level?

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Alexander Graf
Alec Joseph Rivera wrote:
 On Wed, 2010-09-15 at 20:15 +0200, Alexander Graf wrote:
   
 On 15.09.2010, at 20:07, Alec Joseph Rivera wrote:

 
 On Wed, 2010-09-15 at 19:51 +0200, Alexander Graf wrote:
   
 On 15.09.2010, at 19:48, Alec Joseph Rivera wrote:

 
 On Wed, 2010-09-15 at 19:28 +0200, Alexander Graf wrote:
   
 On 15.09.2010, at 18:53, Alec Joseph Rivera wrote:

 
 Hi all:

 I'm Alec Joseph Rivera, from the Philippines and new on this list.
 Anyway, I'm trying to run Lotus Foundations on kvm and it just hangs on
 bootup. It stops right after:

 Checking if this processor honours the WP bit even in supervisor
 mode... Ok.

 There's no more output after that. My invocation line is:

 $ qemu-kvm -cpu host -m 1G -cdrom lfs.iso
   
 Your host is probably too new for this old guest. -cpu host directly 
 passes through this host cpu's identifiers on which the guest might 
 choke. Please try again without -cpu.


 
 Tried without -cpu, still hangs after the WP bit checking...

 $ qemu-kvm -m 1G -cdrom lfs.iso

 I was scanning the changelogs and read something about a
 -no-kernel-irqchip. The man page doesn't say anything about it but will
 try this one too..
   
 Please try:

 $ qemu-kvm -m 1G -cdrom lfs.iso -serial stdio

 Then when it shows the bootloader, add console=ttyS0 to the kernel 
 command line. That should give all the debugging output necessary.


 
 Already done this one too (I've read it from your conversations with a
 Peter guy if I'm not mistaken).

 It just stops with no panic messages right after the mentioned checking
 part, which made me dig after timers (bogomips calibration should be
 next I believe). But I've gotten nowhere so far...

 Also, there's no -no-kernel-irqchip parameter, must have been a typo for
 -no-kvm-irqchip.

 -- begin kernel messages
   
 Linux version 2.6.16.54-0.2.5-bigsmp (ge...@buildhost) (gcc version 4.1.2 
 20070115 (prerelease) (SUSE Linux)) #1 SMP Mon Jan 21 08:29:51 EST 2008
 

Took me quite a while to find that kernel in the graveyard :). It's a
SLES10 SP1 kernel from the update repo at one specific point in time
(more updates came after that one).

 BIOS-provided physical RAM map:
 BIOS-e820:  - 0009f400 (usable)
 BIOS-e820: 0009f400 - 000a (reserved)
 BIOS-e820: 000f - 0010 (reserved)
 BIOS-e820: 0010 - 3fffd000 (usable)
 BIOS-e820: 3fffd000 - 4000 (reserved)
 BIOS-e820: fffbc000 - 0001 (reserved)
 127MB HIGHMEM available.
 895MB LOWMEM available.
 found SMP MP-table at 000f8990
 DMI 2.4 present.
 Using APIC driver default
 ACPI: PM-Timer IO Port: 0xb008
 ACPI: LAPIC (acpi_id[0x00] lapic_id[0x00] enabled)
 Processor #0 15:11 APIC version 20
 ACPI: LAPIC (acpi_id[0x01] lapic_id[0x01] enabled)
 Processor #1 15:11 APIC version 20
 Overriding APIC driver with bigsmp
 ACPI: IOAPIC (id[0x02] address[0xfec0] gsi_base[0])
 IOAPIC[0]: apic_id 2, version 17, address 0xfec0, GSI 0-23
 ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 dfl dfl)
 ACPI: INT_SRC_OVR (bus 0 bus_irq 5 global_irq 5 high level)
 ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 high level)
 ACPI: INT_SRC_OVR (bus 0 bus_irq 10 global_irq 10 high level)
 ACPI: INT_SRC_OVR (bus 0 bus_irq 11 global_irq 11 high level)
 Enabling APIC mode:  Physflat.  Using 1 I/O APICs
 ACPI: HPET id: 0x8086a201 base: 0xfed0
 Using ACPI (MADT) for SMP configuration information
 Allocating PCI resources starting at 5000 (gap: 4000:bffbc000)
 Built 1 zonelists
 Kernel command line: ramdisk_size=32768 initrd=initrd root=/dev/ram 
 BOOT_IMAGE=k12_1009 console=ttyS0
 Enabling fast FPU save and restore... done.
 Enabling unmasked SIMD FPU exception support... done.
 Initializing CPU#0
 PID hash table entries: 4096 (order: 12, 65536 bytes)
 Console: colour VGA+ 80x25
 Dentry cache hash table entries: 131072 (order: 7, 524288 bytes)
 Inode-cache hash table entries: 65536 (order: 6, 262144 bytes)
 Memory: 1022272k/1048564k available (1671k kernel code, 25488k reserved, 
 890k data, 200k init, 131064k highmem)
 Checking if this processor honours the WP bit even in supervisor mode... 
 Ok.
 
 Interesting. Mind to check if you find a the vmlinux binary for that kernel 
 on the cd? Maybe it's hidden inside an rpm. Then you can use:

 $ qemu-kvm -s -cdrom ...
 $ gdb vmlinux -ex 'target remote localhost:1234'

 (gdb) bt

 That should give us a clue on what's going wrong.

 

 I did the steps you mentioned. Here's the gdb transcript:

 $ gdb k12_1009 -ex 'target remote localhost:1234'
 GNU gdb (GDB) 7.2
 Copyright (C) 2010 Free Software Foundation, Inc.
 License GPLv3+: GNU GPL version 3 or later
 http://gnu.org/licenses/gpl.html
 This is free software: you are free to change and redistribute it.
 There is NO WARRANTY, to the extent permitted by law.  Type show
 copying
 and show 

Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Michael S. Tsirkin
On Thu, Sep 16, 2010 at 12:20:47PM +0200, Gleb Natapov wrote:
 On Thu, Sep 16, 2010 at 12:13:39PM +0200, Michael S. Tsirkin wrote:
  On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote:
   On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote:
On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote:
  On 09/16/2010 11:25 AM, Gleb Natapov wrote:
 
   MSI only appeared in rhel6, older guests still use level 
  interrupts.
 So they are already slow for other reasons.
 
 
 Exactly, for example they need to exit to userspace to ack the
 interrupt.  That's far slower than the workqueue.

Well, this is not exactly comparable: you might get
same irq asserted multiple times and only deasserted once.

   Are we talking about level interrupts? Why would you assert level
   triggered interrupt multiple times before deasserting it?
  
  User of irqfd has no way to know what current interrupt level is.
  So it has to keep asserting.
  
 Why can't it keep track of current level?

This breaks the model: eventfd user is unaware of PCI, levels and such:
it just signals the event.  Remember that asserts are done from e.g. vhost-net,
deasserts need to be handled by qemu.

 --
   Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Gleb Natapov
On Thu, Sep 16, 2010 at 12:44:55PM +0200, Michael S. Tsirkin wrote:
 On Thu, Sep 16, 2010 at 12:20:47PM +0200, Gleb Natapov wrote:
  On Thu, Sep 16, 2010 at 12:13:39PM +0200, Michael S. Tsirkin wrote:
   On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote:
On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote:
 On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote:
   On 09/16/2010 11:25 AM, Gleb Natapov wrote:
  
MSI only appeared in rhel6, older guests still use level 
   interrupts.
  So they are already slow for other reasons.
  
  
  Exactly, for example they need to exit to userspace to ack the
  interrupt.  That's far slower than the workqueue.
 
 Well, this is not exactly comparable: you might get
 same irq asserted multiple times and only deasserted once.
 
Are we talking about level interrupts? Why would you assert level
triggered interrupt multiple times before deasserting it?
   
   User of irqfd has no way to know what current interrupt level is.
   So it has to keep asserting.
   
  Why can't it keep track of current level?
 
 This breaks the model: eventfd user is unaware of PCI, levels and such:
 it just signals the event.  Remember that asserts are done from e.g. 
 vhost-net,
 deasserts need to be handled by qemu.
 
eventfd user implements HW and it knows exactly what type of interrupt
this HW generates.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Alec Joseph Rivera
On Thu, 2010-09-16 at 13:02 +0200, Joerg Roedel wrote:
 On Thu, Sep 16, 2010 at 12:27:29PM +0200, Alexander Graf wrote:
  Alec Joseph Rivera wrote:
   Program received signal SIGINT, Interrupt.
   0xc015405f in ?? ()
   (gdb) bt
   #0  0xc015405f in ?? ()
 
  
  0xc015405a __pte_alloc_kernel+82:mov(%edi),%eax
  0xc015405c __pte_alloc_kernel+84:mov0x4(%edi),%edx
  0xc015405f __pte_alloc_kernel+87:lock cmpxchg8b (%edi)
  0xc0154063 __pte_alloc_kernel+91:jne0xc015405a
  __pte_alloc_kernel+82
 
 This looks like a bug I also have seen recently on 32 bit host-kvm. The
 instruction emulation for 'lock cmpxchg8b' was broken so that the rip
 was not advanced and the guest just iterated over this instruction again
 and again. I thought this was fixed with the latest kvm updates in
 2.6.36.
 

Will wait for a 2.6.36 kernel then or compile a .36-rc myself on free
time :-)

Thanks, will update the list when either comes.

   Joerg
 


-- 
--- 


Follow me: 
http://twitter.com/agirivera
Invite as a
friend: 
http://www.facebook.com/agirivera



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Alec Joseph Rivera
On Thu, 2010-09-16 at 12:14 +0200, Gleb Natapov wrote:
 On Thu, Sep 16, 2010 at 06:11:36PM +0800, Alec Joseph Rivera wrote:
  On Thu, 2010-09-16 at 16:03 +0800, Alec Joseph Rivera wrote:
   On Thu, 2010-09-16 at 09:56 +0200, Gleb Natapov wrote:
On Thu, Sep 16, 2010 at 03:52:17PM +0800, Alec Joseph Rivera wrote:
 On Thu, 2010-09-16 at 08:39 +0200, Gleb Natapov wrote:
  On Thu, Sep 16, 2010 at 09:55:38AM +0800, Alec Joseph Rivera wrote:
That should give us a clue on what's going wrong.

   
   I did the steps you mentioned. Here's the gdb transcript:
   
   $ gdb k12_1009 -ex 'target remote localhost:1234'
  What is k12_1009? Are you sure this is vmlinux?
  
 
 Hi Gleb:
 
 Yes, k12_1009 is the kernel image that isolinux loads.
 
 -- isolinux.cfg
 
  DEFAULT 1
  TIMEOUT 100
  PROMPT 1
  DISPLAY isolinux.txt
  LABEL 1
  KERNEL k12_1009
  APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram 
  LABEL 2
  KERNEL k12_1009
  APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram 
  BOOT_SUPPORT=1 
 
 -- file test
  $ file k12_1009 
  k12_1009: Linux kernel x86 boot executable bzImage, version 
  2.6.16.54-0.2.5-bigsmp (ge...@b, RO-rootFS, root_dev 0x301, 
  swap_dev 0x1, Normal VGA
 
This is bzImage, You need corresponding vmlinux bzImage was created
from.
   
   Oh I see. Thanks for pointing that out. Unfortunately, I don't think I
   can get that image. Either I ask someone from Novell or IBM.
   
   I'll try crawling the system's filesystem, maybe I can find it there...
   Will keep you updated.
   
  
  No vmlinux/z image found on cd or an installed system :-(
  
  
 It may be hidden somewhere in debug rpm.
 

Sadly, Lotus Foundations doesn't supply the debug rpms :-(

 --
   Gleb.


-- 
--- 


Follow me: 
http://twitter.com/agirivera
Invite as a
friend: 
http://www.facebook.com/agirivera



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] dma_rw.h (was Re: [PATCH 0/7] AMD IOMMU emulation patchset v4)

2010-09-16 Thread Eduard - Gabriel Munteanu
On Thu, Sep 16, 2010 at 11:20:43AM +0200, Michael S. Tsirkin wrote:
 On Thu, Sep 16, 2010 at 10:06:16AM +0300, Eduard - Gabriel Munteanu wrote:

[snip]

 
 No, DMADevice is a device that does DMA.
 So e.g. a PCI device would embed one.
 Remember, traslations are per device, right?
 DMAMmu is part of the iommu object.
 

I agree, all I'm saying is a DMADevice is insufficient to invalidate one
of the many maps a given device holds, at least without resorting to
horrible tricks or destroying them all.

  so doing this makes it really
  hard to invalidate a specific map when there are more of them. It forces
  device code to act as a bus, provide fake 'DMADevice's for each map and
  dispatch translation to the real DMATranslateFunc. I see no other way.
  
  If you really want more type-safety (although I think this is a case of
  a true opaque identifying something only device code understands), I
  have another proposal: have a DMAMap embedded in the opaque. Example
  from dma-helpers.c:
  
  typedef struct {
  DMADevice *owner;
  [...]
  } DMAMap;
  
  typedef struct {
  [...]
  DMAMap map;
  [...]
  } DMAAIOCB;
  
  /* The callback. */
  static void dma_bdrv_cancel(DMAMap *map)
  {
  DMAAIOCB *dbs = container_of(map, DMAAIOCB, map);
  
  [...]
  }
  
  The upside is we only need to pass the DMAMap. That can also contain
  details of the actual map in case the device wants to release only the
  relevant range and remap the rest.
 
 Fine.
 Or maybe DMAAIOCB (just make some letters lower case: DMAIocb?).
 Everyone will use it anyway, right?

No, you misunderstood me. DMAAIOCB is already there in dma-helpers.c,
it's the opaque I used and it was already there before my patches.

The idea was to define DMAMap and embed it in DMAAIOCB, so we can
upcast from the former to the latter (which is what we actually need).

IDE DMA uses that, but other devices would use whatever they want, even
nothing except the DMAMap. The only requirement is that the container
struct allows device code to acknowledge the invalidation (in case of
AIO, we simply kill that thread and release resources).

Well, perhaps DMAMap isn't the best name, but you get the idea.

   I saw devices do stl_le_phys and such, these
   might need to be wrapped as well.
  
  stl_le_phys() is defined and used only by hw/eepro100.c. That's already
  dealt with by converting the device.
  
 
 I see. Need to get around to adding some prefix to it to make this clear.
 
  Thanks,
  Eduard
  

[snip]


Eduard
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Gleb Natapov
On Thu, Sep 16, 2010 at 12:53:52PM +0200, Michael S. Tsirkin wrote:
 On Thu, Sep 16, 2010 at 12:54:03PM +0200, Gleb Natapov wrote:
  On Thu, Sep 16, 2010 at 12:44:55PM +0200, Michael S. Tsirkin wrote:
   On Thu, Sep 16, 2010 at 12:20:47PM +0200, Gleb Natapov wrote:
On Thu, Sep 16, 2010 at 12:13:39PM +0200, Michael S. Tsirkin wrote:
 On Thu, Sep 16, 2010 at 12:13:32PM +0200, Gleb Natapov wrote:
  On Thu, Sep 16, 2010 at 11:53:10AM +0200, Michael S. Tsirkin wrote:
   On Thu, Sep 16, 2010 at 11:46:03AM +0200, Avi Kivity wrote:
 On 09/16/2010 11:25 AM, Gleb Natapov wrote:

  MSI only appeared in rhel6, older guests still use level 
 interrupts.
So they are already slow for other reasons.


Exactly, for example they need to exit to userspace to ack the
interrupt.  That's far slower than the workqueue.
   
   Well, this is not exactly comparable: you might get
   same irq asserted multiple times and only deasserted once.
   
  Are we talking about level interrupts? Why would you assert level
  triggered interrupt multiple times before deasserting it?
 
 User of irqfd has no way to know what current interrupt level is.
 So it has to keep asserting.
 
Why can't it keep track of current level?
   
   This breaks the model: eventfd user is unaware of PCI, levels and such:
   it just signals the event.  Remember that asserts are done from e.g. 
   vhost-net,
   deasserts need to be handled by qemu.
   
  eventfd user implements HW and it knows exactly what type of interrupt
  this HW generates.
 
 We haver two users: qemu does deasserts, vhost-net does asserts.
Well this is broken. You want KVM to track level for you and this is
wrong. KVM does this anyway because it can't relay on devise model
to behave correctly [0], but in your case it is designed to behave
incorrectly.

Interrupt type is a device property. PCI devices just happen to be level
triggered according to PCI spec. What if you want to use vhost-net to
implement network device which has active-low interrupt line? [1]

If you want to split parts that asserts irq and de-asserts it then we
should have irqfd that tracks line status and knows interrupt line
polarity.

 Another application is out of process virtio (sandboxing!).
It will still assert and de-assert irq at the same code, so it will be
able to track irq line status.

 Again, pci stuff needs to stay in qemu.
 

Nothing to do with PCI whatsoever.

[0] most qemu devices behave incorrectly and trigger level irq more then
needed.
[1] this is how correct PCI device should behave but we override
polarity in ACPI, but now incorrect behaviour is deeply designed
into vhost-net.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Gleb Natapov
On Thu, Sep 16, 2010 at 07:13:03PM +0800, Alec Joseph Rivera wrote:
 On Thu, 2010-09-16 at 12:14 +0200, Gleb Natapov wrote:
  On Thu, Sep 16, 2010 at 06:11:36PM +0800, Alec Joseph Rivera wrote:
   On Thu, 2010-09-16 at 16:03 +0800, Alec Joseph Rivera wrote:
On Thu, 2010-09-16 at 09:56 +0200, Gleb Natapov wrote:
 On Thu, Sep 16, 2010 at 03:52:17PM +0800, Alec Joseph Rivera wrote:
  On Thu, 2010-09-16 at 08:39 +0200, Gleb Natapov wrote:
   On Thu, Sep 16, 2010 at 09:55:38AM +0800, Alec Joseph Rivera 
   wrote:
 That should give us a clue on what's going wrong.
 

I did the steps you mentioned. Here's the gdb transcript:

$ gdb k12_1009 -ex 'target remote localhost:1234'
   What is k12_1009? Are you sure this is vmlinux?
   
  
  Hi Gleb:
  
  Yes, k12_1009 is the kernel image that isolinux loads.
  
  -- isolinux.cfg
  
   DEFAULT 1
   TIMEOUT 100
   PROMPT 1
   DISPLAY isolinux.txt
   LABEL 1
   KERNEL k12_1009
   APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram 
   LABEL 2
   KERNEL k12_1009
   APPEND ramdisk_size=32768 initrd=initrd root=/dev/ram 
   BOOT_SUPPORT=1 
  
  -- file test
   $ file k12_1009 
   k12_1009: Linux kernel x86 boot executable bzImage, version 
   2.6.16.54-0.2.5-bigsmp (ge...@b, RO-rootFS, root_dev 0x301, 
   swap_dev 0x1, Normal VGA
  
 This is bzImage, You need corresponding vmlinux bzImage was created
 from.

Oh I see. Thanks for pointing that out. Unfortunately, I don't think I
can get that image. Either I ask someone from Novell or IBM.

I'll try crawling the system's filesystem, maybe I can find it there...
Will keep you updated.

   
   No vmlinux/z image found on cd or an installed system :-(
   
   
  It may be hidden somewhere in debug rpm.
  
 
 Sadly, Lotus Foundations doesn't supply the debug rpms :-(
 
Looks like Alexander already found relevant rpm.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Joerg Roedel
On Thu, Sep 16, 2010 at 07:11:17PM +0800, Alec Joseph Rivera wrote:
 On Thu, 2010-09-16 at 13:02 +0200, Joerg Roedel wrote:
 
 Will wait for a 2.6.36 kernel then or compile a .36-rc myself on free
 time :-)
 
 Thanks, will update the list when either comes.

Just checked, 2.6.36-rc4 contains the fix for the cmpxchg8b problem on
32 bit hosts.

Joerg

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Michael Tokarev
16.09.2010 15:32, Joerg Roedel пишет:
 On Thu, Sep 16, 2010 at 07:11:17PM +0800, Alec Joseph Rivera wrote:
 On Thu, 2010-09-16 at 13:02 +0200, Joerg Roedel wrote:

 Will wait for a 2.6.36 kernel then or compile a .36-rc myself on free
 time :-)

 Thanks, will update the list when either comes.
 
 Just checked, 2.6.36-rc4 contains the fix for the cmpxchg8b problem on
 32 bit hosts.

What commit it is?

I swear I saw it in 2.6.32-stable somewhere.  Is it this one (2.6.32.12):

From: Gleb Natapov g...@redhat.com
Date: Fri, 19 Mar 2010 15:47:31 +0100
Subject: KVM: x86 emulator: fix memory access during x86 emulation
To: sta...@kernel.org
Cc: Marcelo Tosatti mtosa...@redhat.com, Avi Kivity a...@redhat.com, Gleb 
Natapov g...@redhat.com
Message-ID: 1269010059-25309-4-git-send-email-stefan.ba...@canonical.com


From: Gleb Natapov g...@redhat.com

commit 1871c6020d7308afb99127bba51f04548e7ca84e upstream

Currently when x86 emulator needs to access memory, page walk is done with
broadest permission possible, so if emulated instruction was executed
by userspace process it can still access kernel memory. Fix that by
providing correct memory access to page walker during emulation.


/mjt
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 2/2] kvm/e500v2: mapping guest TLB1 to host TLB0

2010-09-16 Thread Liu Yu-B13201
 

 -Original Message-
 From: Alexander Graf [mailto:ag...@suse.de] 
 Sent: Friday, September 10, 2010 7:24 AM
 To: Liu Yu-B13201
 Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org
 Subject: Re: [PATCH 2/2] kvm/e500v2: mapping guest TLB1 to host TLB0
 
 
 On 08.09.2010, at 11:40, Liu Yu wrote:
 
  Current guest TLB1 is mapped to host TLB1.
  As host kernel only provides 4K uncontinuous pages,
  we have to break guest large mapping into 4K shadow mappings.
  These 4K shadow mappings are then mapped into host TLB1 on fly.
  As host TLB1 only has 13 free entries, there's serious tlb miss.
  
  Since e500v2 has a big number of TLB0 entries,
  it should be help to map those 4K shadow mappings to host TLB0.
  To achieve this, we need to unlink guest tlb and host tlb,
  So that guest TLB1 mappings can route to any host TLB0 
 entries freely.
  
  Pages/mappings are considerred in the same kind as host tlb entry.
  This patch remove the link between pages and guest tlb 
 entry to do the unlink.
  And keep host_tlb0_ref in each vcpu to trace pages.
  Then it's easy to map guest TLB1 to host TLB0.
  
  In guest ramdisk boot test(guest mainly uses TLB1),
  with this patch, the tlb miss number get down 90%.
  
  Signed-off-by: Liu Yu yu@freescale.com
  ---
  arch/powerpc/include/asm/kvm_e500.h |7 +-
  arch/powerpc/kvm/e500.c |4 +
  arch/powerpc/kvm/e500_tlb.c |  280 
 ---
  arch/powerpc/kvm/e500_tlb.h |1 +
  4 files changed, 104 insertions(+), 188 deletions(-)
  

 
  
  -static unsigned int tlb1_entry_num;
  +static inline unsigned int get_tlb0_entry_offset(u32 
 eaddr, u32 esel)
  +{
  +   return ((eaddr  0x7F000)  (12 - host_tlb0_assoc_bit) |
  +   (esel  (host_tlb0_assoc - 1)));
  +}
  
  void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
  {
  @@ -62,11 +68,6 @@ static inline unsigned int tlb0_get_next_victim(
  return victim;
  }
  
  -static inline unsigned int tlb1_max_shadow_size(void)
  -{
  -   return tlb1_entry_num - tlbcam_index;
  -}
  -
  static inline int tlbe_is_writable(struct tlbe *tlbe)
  {
  return tlbe-mas3  (MAS3_SW|MAS3_UW);
  @@ -100,7 +101,7 @@ static inline u32 
 e500_shadow_mas2_attrib(u32 mas2, int usermode)
  /*
   * writing shadow tlb entry to host TLB
   */
  -static inline void __write_host_tlbe(struct tlbe *stlbe)
  +static inline void __host_tlbe_write(struct tlbe *stlbe)
  {
  mtspr(SPRN_MAS1, stlbe-mas1);
  mtspr(SPRN_MAS2, stlbe-mas2);
  @@ -109,25 +110,22 @@ static inline void 
 __write_host_tlbe(struct tlbe *stlbe)
  __asm__ __volatile__ (tlbwe\n : : );
  }
  
  -static inline void write_host_tlbe(struct kvmppc_vcpu_e500 
 *vcpu_e500,
  -   int tlbsel, int esel, struct tlbe *stlbe)
  +static inline u32 host_tlb0_write(struct kvmppc_vcpu_e500 
 *vcpu_e500,
  +   u32 gvaddr, struct tlbe *stlbe)
  {
  -   local_irq_disable();
  -   if (tlbsel == 0) {
  -   __write_host_tlbe(stlbe);
  -   } else {
  -   unsigned register mas0;
  +   unsigned register mas0;
  
  -   mas0 = mfspr(SPRN_MAS0);
  +   local_irq_disable();
 
 Do you have to disable interrupts for a tlb write? If you get 
 preempted before the write, the entry you overwrite could be 
 different. But you don't protect against that either way. And 
 if you get preempted afterwards, you could lose the entry. 
 But since you enable interrupts beyond that, that could 
 happen either way too.
 
 So what's the reason for the disable here?
 

Hello Alex,

Doesn't local_irq_disable() also disable preempt?
The reason to disable interrupts is because it's possible to have tlb
misses in interrupt service route.


Thanks,
Yu

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Alec Joseph Rivera
On Thu, 2010-09-16 at 13:32 +0200, Joerg Roedel wrote:
 On Thu, Sep 16, 2010 at 07:11:17PM +0800, Alec Joseph Rivera wrote:
  On Thu, 2010-09-16 at 13:02 +0200, Joerg Roedel wrote:
  
  Will wait for a 2.6.36 kernel then or compile a .36-rc myself on free
  time :-)
  
  Thanks, will update the list when either comes.
 
 Just checked, 2.6.36-rc4 contains the fix for the cmpxchg8b problem on
 32 bit hosts.
 

Nice :-) woot woot... Will definite compile a test kernel asap.

   Joerg
 


-- 
--- 


Follow me: 
http://twitter.com/agirivera
Invite as a
friend: 
http://www.facebook.com/agirivera



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Gleb Natapov
On Thu, Sep 16, 2010 at 03:35:17PM +0400, Michael Tokarev wrote:
 16.09.2010 15:32, Joerg Roedel пишет:
  On Thu, Sep 16, 2010 at 07:11:17PM +0800, Alec Joseph Rivera wrote:
  On Thu, 2010-09-16 at 13:02 +0200, Joerg Roedel wrote:
 
  Will wait for a 2.6.36 kernel then or compile a .36-rc myself on free
  time :-)
 
  Thanks, will update the list when either comes.
  
  Just checked, 2.6.36-rc4 contains the fix for the cmpxchg8b problem on
  32 bit hosts.
 
 What commit it is?
 
 I swear I saw it in 2.6.32-stable somewhere.  Is it this one (2.6.32.12):
 
it is 16518d5ada690643453eb0aef3cc7841d3623c2d, but the bug shouldn't be
present in 2.6.32.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Gleb Natapov
On Thu, Sep 16, 2010 at 07:37:55PM +0800, Alec Joseph Rivera wrote:
 On Thu, 2010-09-16 at 13:32 +0200, Joerg Roedel wrote:
  On Thu, Sep 16, 2010 at 07:11:17PM +0800, Alec Joseph Rivera wrote:
   On Thu, 2010-09-16 at 13:02 +0200, Joerg Roedel wrote:
   
   Will wait for a 2.6.36 kernel then or compile a .36-rc myself on free
   time :-)
   
   Thanks, will update the list when either comes.
  
  Just checked, 2.6.36-rc4 contains the fix for the cmpxchg8b problem on
  32 bit hosts.
  
 
 Nice :-) woot woot... Will definite compile a test kernel asap.
 
So your host is 32bit?

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Joerg Roedel
On Thu, Sep 16, 2010 at 03:35:17PM +0400, Michael Tokarev wrote:
 16.09.2010 15:32, Joerg Roedel пишет:
  On Thu, Sep 16, 2010 at 07:11:17PM +0800, Alec Joseph Rivera wrote:
  On Thu, 2010-09-16 at 13:02 +0200, Joerg Roedel wrote:
 
  Will wait for a 2.6.36 kernel then or compile a .36-rc myself on free
  time :-)
 
  Thanks, will update the list when either comes.
  
  Just checked, 2.6.36-rc4 contains the fix for the cmpxchg8b problem on
  32 bit hosts.
 
 What commit it is?
 
 I swear I saw it in 2.6.32-stable somewhere.  Is it this one (2.6.32.12):

It is 16518d5ada690643453eb0aef3cc7841d3623c2d in latest linus/master
branch. It was merged between -rc3 and -rc4.

Joerg

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] kvm/e500v2: mapping guest TLB1 to host TLB0

2010-09-16 Thread Alexander Graf
Liu Yu-B13201 wrote:
  

   
 -Original Message-
 From: Alexander Graf [mailto:ag...@suse.de] 
 Sent: Friday, September 10, 2010 7:24 AM
 To: Liu Yu-B13201
 Cc: kvm@vger.kernel.org; kvm-...@vger.kernel.org
 Subject: Re: [PATCH 2/2] kvm/e500v2: mapping guest TLB1 to host TLB0


 On 08.09.2010, at 11:40, Liu Yu wrote:

 
 Current guest TLB1 is mapped to host TLB1.
 As host kernel only provides 4K uncontinuous pages,
 we have to break guest large mapping into 4K shadow mappings.
 These 4K shadow mappings are then mapped into host TLB1 on fly.
 As host TLB1 only has 13 free entries, there's serious tlb miss.

 Since e500v2 has a big number of TLB0 entries,
 it should be help to map those 4K shadow mappings to host TLB0.
 To achieve this, we need to unlink guest tlb and host tlb,
 So that guest TLB1 mappings can route to any host TLB0 
   
 entries freely.
 
 Pages/mappings are considerred in the same kind as host tlb entry.
 This patch remove the link between pages and guest tlb 
   
 entry to do the unlink.
 
 And keep host_tlb0_ref in each vcpu to trace pages.
 Then it's easy to map guest TLB1 to host TLB0.

 In guest ramdisk boot test(guest mainly uses TLB1),
 with this patch, the tlb miss number get down 90%.

 Signed-off-by: Liu Yu yu@freescale.com
 ---
 arch/powerpc/include/asm/kvm_e500.h |7 +-
 arch/powerpc/kvm/e500.c |4 +
 arch/powerpc/kvm/e500_tlb.c |  280 
   
 ---
 
 arch/powerpc/kvm/e500_tlb.h |1 +
 4 files changed, 104 insertions(+), 188 deletions(-)

   

   
 -static unsigned int tlb1_entry_num;
 +static inline unsigned int get_tlb0_entry_offset(u32 
   
 eaddr, u32 esel)
 
 +{
 +   return ((eaddr  0x7F000)  (12 - host_tlb0_assoc_bit) |
 +   (esel  (host_tlb0_assoc - 1)));
 +}

 void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
 {
 @@ -62,11 +68,6 @@ static inline unsigned int tlb0_get_next_victim(
 return victim;
 }

 -static inline unsigned int tlb1_max_shadow_size(void)
 -{
 -   return tlb1_entry_num - tlbcam_index;
 -}
 -
 static inline int tlbe_is_writable(struct tlbe *tlbe)
 {
 return tlbe-mas3  (MAS3_SW|MAS3_UW);
 @@ -100,7 +101,7 @@ static inline u32 
   
 e500_shadow_mas2_attrib(u32 mas2, int usermode)
 
 /*
  * writing shadow tlb entry to host TLB
  */
 -static inline void __write_host_tlbe(struct tlbe *stlbe)
 +static inline void __host_tlbe_write(struct tlbe *stlbe)
 {
 mtspr(SPRN_MAS1, stlbe-mas1);
 mtspr(SPRN_MAS2, stlbe-mas2);
 @@ -109,25 +110,22 @@ static inline void 
   
 __write_host_tlbe(struct tlbe *stlbe)
 
 __asm__ __volatile__ (tlbwe\n : : );
 }

 -static inline void write_host_tlbe(struct kvmppc_vcpu_e500 
   
 *vcpu_e500,
 
 -   int tlbsel, int esel, struct tlbe *stlbe)
 +static inline u32 host_tlb0_write(struct kvmppc_vcpu_e500 
   
 *vcpu_e500,
 
 +   u32 gvaddr, struct tlbe *stlbe)
 {
 -   local_irq_disable();
 -   if (tlbsel == 0) {
 -   __write_host_tlbe(stlbe);
 -   } else {
 -   unsigned register mas0;
 +   unsigned register mas0;

 -   mas0 = mfspr(SPRN_MAS0);
 +   local_irq_disable();
   
 Do you have to disable interrupts for a tlb write? If you get 
 preempted before the write, the entry you overwrite could be 
 different. But you don't protect against that either way. And 
 if you get preempted afterwards, you could lose the entry. 
 But since you enable interrupts beyond that, that could 
 happen either way too.

 So what's the reason for the disable here?

 

 Hello Alex,

 Doesn't local_irq_disable() also disable preempt?
 The reason to disable interrupts is because it's possible to have tlb
 misses in interrupt service route.
   

Yes, local_irq_disable disables preempt. That's exactly what I was
referring to :).
I don't understand the statement about the service route. A tlb miss
still arrives with local_irq_disable.


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.16 Guest Hangs on Boot

2010-09-16 Thread Alec Joseph Rivera
On Thu, 2010-09-16 at 13:41 +0200, Gleb Natapov wrote:
 On Thu, Sep 16, 2010 at 07:37:55PM +0800, Alec Joseph Rivera wrote:
  On Thu, 2010-09-16 at 13:32 +0200, Joerg Roedel wrote:
   On Thu, Sep 16, 2010 at 07:11:17PM +0800, Alec Joseph Rivera wrote:
On Thu, 2010-09-16 at 13:02 +0200, Joerg Roedel wrote:

Will wait for a 2.6.36 kernel then or compile a .36-rc myself on free
time :-)

Thanks, will update the list when either comes.
   
   Just checked, 2.6.36-rc4 contains the fix for the cmpxchg8b problem on
   32 bit hosts.
   
  
  Nice :-) woot woot... Will definite compile a test kernel asap.
  
 So your host is 32bit?
 

Yes. It's Arch-based. I initially thought of installing a 64-bit kernel
when I saw the guest host status on kvm's home, but programs are
compiled 32-bit. I was put off thinking about binary compatibility.

Haven't tested such since my Debian days...

 --
   Gleb.


-- 
--- 


Follow me: 
http://twitter.com/agirivera
Invite as a
friend: 
http://www.facebook.com/agirivera



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: .img on nfs, relative on ram, consuming mass ram

2010-09-16 Thread TOURNIER Frédéric
Ok, thanks for taking time.
I'll dig into your answers.

So as i run relative.img on diskless systems with original.img on nfs, what are 
the best practice/tips i can use ?

Is ramfs more suitable than tmpfs ?

Fred.

On Thu, 16 Sep 2010 11:09:49 +0200
Andre Przywara andre.przyw...@amd.com wrote:

 TOURNIER Frédéric wrote:
  Hi !
  Here's my config :
  
  Version : qemu-kvm-0.12.5, qemu-kvm-0.12.4
  Hosts : AMD 64X2, Phenom and Core 2 duo
  OS : Slackware 64 13.0
  Kernel : 2.6.35.4 and many previous versions
  
  I use a PXE server to boot semi-diskless (swap partitions and some local 
  stuff) stations.
  This server also serves a read-only nfs folder, with plenty of .img on it.
  When clients connects, a relative image is created in /tmp, which is a 
  tmpfs, so hosted in ram.
  
  And here i go on my 2G stations :
  qemu-system-x86_64 -m 1024 -vga std -usb -usbdevice tablet -localtime 
  -soundhw es1370 /tmp/relimg.img
  qemu-system-x86_64 -m 1024 -vga std -usb -usbdevice tablet -localtime 
  -soundhw es1370 /dev/shm/relimg.img
  
  I tried both. Always the same result : the ram is consumed quickly, and 
  mass swap occurs.
 Which is only natural, as tmpfs is promising to never swap. So it will 
 take precedence over other RAM (that's why it is limited to half of the 
 memory by default). As soon as the guest has (re)written more disk 
 sectors than your free RAM can hold, the system will start to swap out 
 your guest RAM (and other host applications).
 So in general you should avoid putting relative disk images to tmpfs if 
 your host memory is limited. As a workaround you could try to further 
 limit the tmpfs max size (mount -t tmpfs -o size=512M none /dev/shm), 
 but this could lead to data loss in your guest as it possibly cannot 
 back the written sectors anymore.
  On a 4G system, i see kvm uses more than 1024, maybe 1200.
  And everytime a launch a program inside the vm, the amount of the host free 
  ram (not cached) diminishes, which is weird, because it should have been 
  reserved.
 KVM uses on-demand paging like other applications. So it will not 
 reserve memory for your guest (unless you use hugetlbfs's -mempath):
 $ kvm -cdrom ttylinux_ser.iso -nographic -m 3072M
 $ top
PID USER  PR  NI  VIRT  RES  SHR S %CPU %MEMTIME+  COMMAND 
 
   6015 user  20   0 3205m 128m 3020 S2  2.2   0:04.94 kvm 
 
 
 Regards,
 Andre.
 
  
  So on a 2G system, swap occurs very fast and the machine slow a lot down.
  An on a total diskless system, this leads fast to a freeze.
  
  I have no problem if i use a relative image on disk :
  qemu-system-x86_64 -m 1024 -vga std -usb -usbdevice tablet -localtime 
  -soundhw es1370 -drive file=/mnt/hd/sda/sda1/tmp/relimg.img,cache=none
 
 -- 
 Andre Przywara
 AMD-Operating System Research Center (OSRC), Dresden, Germany
 Tel: +49 351 448-3567-12
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Gleb Natapov
On Thu, Sep 16, 2010 at 02:13:38PM +0200, Michael S. Tsirkin wrote:
   We haver two users: qemu does deasserts, vhost-net does asserts.
  Well this is broken. You want KVM to track level for you and this is
  wrong. KVM does this anyway because it can't relay on devise model
  to behave correctly [0], but in your case it is designed to behave
  incorrectly.
  
  Interrupt type is a device property. PCI devices just happen to be level
  triggered according to PCI spec. What if you want to use vhost-net to
  implement network device which has active-low interrupt line? [1]
 
 The polarity would have to be reversed in gsi (irq line can be shared,
 all devices must be active high or low consistently).
 
There are gsi dedicated to PCI. They can be shared only between PCI
devices.

  If you want to split parts that asserts irq and de-asserts it then we
  should have irqfd that tracks line status and knows interrupt line
  polarity.
 
 Yes, it can know about polarity even though I think it's cleaner to do this
 per gsi. But it can not track line status as line is shared with
 other devices.
It should track only device's line status.

 
   Another application is out of process virtio (sandboxing!).
  It will still assert and de-assert irq at the same code, so it will be
  able to track irq line status.
  
   Again, pci stuff needs to stay in qemu.
   
  
  Nothing to do with PCI whatsoever.
  
  [0] most qemu devices behave incorrectly and trigger level irq more then
  needed.
 
 Which devices?
Most of them. They just call update_irq_status() or something and
re-assert interrupt regardless of what previous status was.

 pci core tracks line status and will never assert the same
 line multiple times.
That's good if pci core does this, but device shouldn't even try it.

 
  [1] this is how correct PCI device should behave but we override
  polarity in ACPI, but now incorrect behaviour is deeply designed
  into vhost-net.
 
 Not really, vhost net signals an eventfd. What happens then is
 up to kvm.
 
That is what current broken design does and it works, but if you want to
save unneeded calls into kvm fix design.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Michael S. Tsirkin
On Thu, Sep 16, 2010 at 02:33:01PM +0200, Gleb Natapov wrote:
 On Thu, Sep 16, 2010 at 02:13:38PM +0200, Michael S. Tsirkin wrote:
We haver two users: qemu does deasserts, vhost-net does asserts.
   Well this is broken. You want KVM to track level for you and this is
   wrong. KVM does this anyway because it can't relay on devise model
   to behave correctly [0], but in your case it is designed to behave
   incorrectly.
   
   Interrupt type is a device property. PCI devices just happen to be level
   triggered according to PCI spec. What if you want to use vhost-net to
   implement network device which has active-low interrupt line? [1]
  
  The polarity would have to be reversed in gsi (irq line can be shared,
  all devices must be active high or low consistently).
  
 There are gsi dedicated to PCI. They can be shared only between PCI
 devices.
 
   If you want to split parts that asserts irq and de-asserts it then we
   should have irqfd that tracks line status and knows interrupt line
   polarity.
  
  Yes, it can know about polarity even though I think it's cleaner to do this
  per gsi. But it can not track line status as line is shared with
  other devices.
 It should track only device's line status.

There is no such thing as device's line status on real hardware, either.
Devices do not drive INT# high: they drive it low (all the time)
or do not drive it at all.

Or consider express, the spec explicitly says:
Note: Duplicate Assert_INTx/Deassert_INTx Messages have no effect, but
 are not errors.

  
Another application is out of process virtio (sandboxing!).
   It will still assert and de-assert irq at the same code, so it will be
   able to track irq line status.
   
Again, pci stuff needs to stay in qemu.

   
   Nothing to do with PCI whatsoever.
   
   [0] most qemu devices behave incorrectly and trigger level irq more then
   needed.
  
  Which devices?
 Most of them. They just call update_irq_status() or something and
 re-assert interrupt regardless of what previous status was.

At least for PCI devices, these calls do nothing if status does not change.

  pci core tracks line status and will never assert the same
  line multiple times.
 That's good if pci core does this, but device shouldn't even try it.

I disagree. We don't want to duplicate a ton of code all over
the codebase.

  
   [1] this is how correct PCI device should behave but we override
   polarity in ACPI, but now incorrect behaviour is deeply designed
   into vhost-net.
  
  Not really, vhost net signals an eventfd. What happens then is
  up to kvm.
  
 That is what current broken design does and it works, but if you want to
 save unneeded calls into kvm fix design.

The interface seems clean enough: vhost handles virtio ring, qemu/kvm handle 
pci.
Making vhost aware of pci breaks this, I would not call that fixing the
design.

 --
   Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Avi Kivity

 On 09/16/2010 02:57 PM, Michael S. Tsirkin wrote:


  If you want to split parts that asserts irq and de-asserts it then we
  should have irqfd that tracks line status and knows interrupt line
  polarity.
  
Yes, it can know about polarity even though I think it's cleaner to do 
this
per gsi. But it can not track line status as line is shared with
other devices.
  It should track only device's line status.

There is no such thing as device's line status on real hardware, either.
Devices do not drive INT# high: they drive it low (all the time)
or do not drive it at all.



That's just an implementation detail.  Devices either assert INT# or 
they do not.  Tying the wires together constitutes an AND gate.  This 
gate has to be modelled somewhere, currently it's in qemu's pci emulation.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Gleb Natapov
On Thu, Sep 16, 2010 at 02:57:17PM +0200, Michael S. Tsirkin wrote:
 On Thu, Sep 16, 2010 at 02:33:01PM +0200, Gleb Natapov wrote:
  On Thu, Sep 16, 2010 at 02:13:38PM +0200, Michael S. Tsirkin wrote:
 We haver two users: qemu does deasserts, vhost-net does asserts.
Well this is broken. You want KVM to track level for you and this is
wrong. KVM does this anyway because it can't relay on devise model
to behave correctly [0], but in your case it is designed to behave
incorrectly.

Interrupt type is a device property. PCI devices just happen to be level
triggered according to PCI spec. What if you want to use vhost-net to
implement network device which has active-low interrupt line? [1]
   
   The polarity would have to be reversed in gsi (irq line can be shared,
   all devices must be active high or low consistently).
   
  There are gsi dedicated to PCI. They can be shared only between PCI
  devices.
  
If you want to split parts that asserts irq and de-asserts it then we
should have irqfd that tracks line status and knows interrupt line
polarity.
   
   Yes, it can know about polarity even though I think it's cleaner to do 
   this
   per gsi. But it can not track line status as line is shared with
   other devices.
  It should track only device's line status.
 
 There is no such thing as device's line status on real hardware, either.
 Devices do not drive INT# high: they drive it low (all the time)
 or do not drive it at all.
Same thing, other naming. Device either drive it low (irq_set(1)) or
not (irq_set(0)).

 
 Or consider express, the spec explicitly says:
 Note: Duplicate Assert_INTx/Deassert_INTx Messages have no effect, but
  are not errors.
 
   
 Another application is out of process virtio (sandboxing!).
It will still assert and de-assert irq at the same code, so it will be
able to track irq line status.

 Again, pci stuff needs to stay in qemu.
 

Nothing to do with PCI whatsoever.

[0] most qemu devices behave incorrectly and trigger level irq more then
needed.
   
   Which devices?
  Most of them. They just call update_irq_status() or something and
  re-assert interrupt regardless of what previous status was.
 
 At least for PCI devices, these calls do nothing if status does not change.
I am not sure what code are you locking at. e1000 device emulation
doesn't check previous line status before calling qemu_set_irq().

 
   pci core tracks line status and will never assert the same
   line multiple times.
  That's good if pci core does this, but device shouldn't even try it.
 
 I disagree. We don't want to duplicate a ton of code all over
 the codebase.
 
So abstract it into a function. It shouldn't be part of PCI emulation.

   
[1] this is how correct PCI device should behave but we override
polarity in ACPI, but now incorrect behaviour is deeply designed
into vhost-net.
   
   Not really, vhost net signals an eventfd. What happens then is
   up to kvm.
   
  That is what current broken design does and it works, but if you want to
  save unneeded calls into kvm fix design.
 
 The interface seems clean enough: vhost handles virtio ring, qemu/kvm handle 
 pci.
 Making vhost aware of pci breaks this, I would not call that fixing the
 design.
 
Once again. Nothing to do with PCI, everything to do with device
emulation. And I propose to abstract interrupt assertion part into
irqfd, not into vhost.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: .img on nfs, relative on ram, consuming mass ram

2010-09-16 Thread Stefan Hajnoczi
2010/9/16 Andre Przywara andre.przyw...@amd.com:
 TOURNIER Frédéric wrote:

 Ok, thanks for taking time.
 I'll dig into your answers.

 So as i run relative.img on diskless systems with original.img on nfs,
 what are the best practice/tips i can use ?

 I thinks it is -snapshot you are looking for.
 This will put the backing store into normal RAM, and you can later commit
 it to the original image if needed. See the qemu manpage for more details.
 In a nutshell you just specify the original image and add -snapshot to the
 command line.

-snapshot creates a temporary qcow2 image in /tmp whose backing file
is your original image.  I'm not sure what you mean by This will put
the backing store into normal RAM?

Stefan
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 0/6] Nonatomic interrupt injection

2010-09-16 Thread Avi Kivity

 On 08/30/2010 02:36 PM, Avi Kivity wrote:

This patchset changes interrupt injection to be done from normal process
context instead of interrupts disabled context.  This is useful for real
mode interrupt injection on Intel without the current hacks (injecting as
a software interrupt of a vm86 task), reducing latencies, and later, for
allowing nested virtualization code to use kvm_read_guest()/kvm_write_guest()
instead of kmap() to access the guest vmcb/vmcs.

Seems to survive a hack that cancels every 16th entry, after injection has
already taken place.

With the PIC reset fix posted earlier, this passes autotest on both AMD and
Intel, with in-kernel irqchip.  I'll run -no-kvm-irqchip tests shortly.

Please review carefully, esp. the first patch.  Any missing kvm_make_request()
there may result in a hung guest.



This is now merged, with the change pointed out by Marcelo.  Windows XP 
x64 fails installation without


(vmx.c handle_cr())
 case 8: {
 u8 cr8_prev = kvm_get_cr8(vcpu);
 u8 cr8 = kvm_register_read(vcpu, reg);
 kvm_set_cr8(vcpu, cr8);
 skip_emulated_instruction(vcpu);
 if (irqchip_in_kernel(vcpu-kvm))
 return 1;
-if (cr8_prev = cr8)
-return 1;
 vcpu-run-exit_reason = KVM_EXIT_SET_TPR;
 return 0;
 }

Which doesn't make any sense (anyone?).  The failure is present even 
without the patchset, and is fixed by the same hack, so a regression was 
not introduced.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Michael S. Tsirkin
On Thu, Sep 16, 2010 at 03:14:11PM +0200, Avi Kivity wrote:
  On 09/16/2010 02:57 PM, Michael S. Tsirkin wrote:
 
   If you want to split parts that asserts irq and de-asserts it then 
  we
   should have irqfd that tracks line status and knows interrupt line
   polarity.
   
 Yes, it can know about polarity even though I think it's cleaner to do 
  this
 per gsi. But it can not track line status as line is shared with
 other devices.
   It should track only device's line status.
 
 There is no such thing as device's line status on real hardware, either.
 Devices do not drive INT# high: they drive it low (all the time)
 or do not drive it at all.
 
 
 That's just an implementation detail.  Devices either assert INT# or
 they do not.  Tying the wires together constitutes an AND gate.
 This gate has to be modelled somewhere, currently it's in qemu's pci
 emulation.

Right. kvm in kernel has this as well, we need to keep this in
kvm kernel if we want to support level with irqfd.
Where it does not belong is individual devices: these
should be able to assert INTx multiple times
and it should have no effect, as per spec.

 -- 
 error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Avi Kivity

 On 09/16/2010 03:38 PM, Michael S. Tsirkin wrote:


  That's just an implementation detail.  Devices either assert INT# or
  they do not.  Tying the wires together constitutes an AND gate.
  This gate has to be modelled somewhere, currently it's in qemu's pci
  emulation.

Right. kvm in kernel has this as well, we need to keep this in
kvm kernel if we want to support level with irqfd.


Right, we added the irq_source_id hack for device assignment.

What's wrong with letting userspace mediate this, though?


Where it does not belong is individual devices: these
should be able to assert INTx multiple times
and it should have no effect, as per spec.




Yes.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: .img on nfs, relative on ram, consuming mass ram

2010-09-16 Thread Andre Przywara

Stefan Hajnoczi wrote:

2010/9/16 Andre Przywara andre.przyw...@amd.com:

TOURNIER Frédéric wrote:

Ok, thanks for taking time.
I'll dig into your answers.

So as i run relative.img on diskless systems with original.img on nfs,
what are the best practice/tips i can use ?

I thinks it is -snapshot you are looking for.
This will put the backing store into normal RAM, and you can later commit
it to the original image if needed. See the qemu manpage for more details.
In a nutshell you just specify the original image and add -snapshot to the
command line.


-snapshot creates a temporary qcow2 image in /tmp whose backing file
is your original image.  I'm not sure what you mean by This will put
the backing store into normal RAM?
Stefan, you are right. I never looked into the code and because the file 
in /tmp is deleted just after creation there wasn't a sign of it.
For some reason I thought that the buffer would just be allocated in 
memory. Sorry, my mistake and thanks for pointing this out.


So Fred, unfortunately this does not solve your problem. I guess you run 
into a general problem. If the guest actually changes so much of the 
disk that this cannot be backed by RAM in the host, you have lost.
One solution could be to just make (at least parts of) the disk 
read-only (a write protected /usr partition works quite well).
If you are sure that writes are not that frequent, you could think of 
putting the overlay file also on the remote storage (NFS). Although this 
is rather slow, it shouldn't matter if there aren't many writes and the 
local page cache should catch most of the accesses (while still being 
nice to other RAM users).


Regards,
Andre.


Stefan
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html




--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 448-3567-12

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Gleb Natapov
On Thu, Sep 16, 2010 at 03:38:28PM +0200, Michael S. Tsirkin wrote:
 On Thu, Sep 16, 2010 at 03:14:11PM +0200, Avi Kivity wrote:
   On 09/16/2010 02:57 PM, Michael S. Tsirkin wrote:
  
If you want to split parts that asserts irq and de-asserts it 
   then we
should have irqfd that tracks line status and knows interrupt line
polarity.

  Yes, it can know about polarity even though I think it's cleaner to 
   do this
  per gsi. But it can not track line status as line is shared with
  other devices.
It should track only device's line status.
  
  There is no such thing as device's line status on real hardware, either.
  Devices do not drive INT# high: they drive it low (all the time)
  or do not drive it at all.
  
  
  That's just an implementation detail.  Devices either assert INT# or
  they do not.  Tying the wires together constitutes an AND gate.
  This gate has to be modelled somewhere, currently it's in qemu's pci
  emulation.
 
 Right. kvm in kernel has this as well, we need to keep this in
 kvm kernel if we want to support level with irqfd.
 Where it does not belong is individual devices: these
 should be able to assert INTx multiple times
 and it should have no effect, as per spec.
Assert_INTx/Deassert_INTx you mentioned are internal PCI thing. What KVM
sees logically is status of the line between pci controller and irq
chip. We do not emulate PCI inside kernel, but I agree that kernel should
handle multiple asserts without de-assert in the middle and, in fact, it does.
But the thread started with you trying to optimize this non-optimal
device behaviour and I am saying that the fix should be elsewhere. Namely in
irqfd.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Michael S. Tsirkin
On Thu, Sep 16, 2010 at 03:18:23PM +0200, Gleb Natapov wrote:
 On Thu, Sep 16, 2010 at 02:57:17PM +0200, Michael S. Tsirkin wrote:
  On Thu, Sep 16, 2010 at 02:33:01PM +0200, Gleb Natapov wrote:
   On Thu, Sep 16, 2010 at 02:13:38PM +0200, Michael S. Tsirkin wrote:
  We haver two users: qemu does deasserts, vhost-net does asserts.
 Well this is broken. You want KVM to track level for you and this is
 wrong. KVM does this anyway because it can't relay on devise model
 to behave correctly [0], but in your case it is designed to behave
 incorrectly.
 
 Interrupt type is a device property. PCI devices just happen to be 
 level
 triggered according to PCI spec. What if you want to use vhost-net to
 implement network device which has active-low interrupt line? [1]

The polarity would have to be reversed in gsi (irq line can be shared,
all devices must be active high or low consistently).

   There are gsi dedicated to PCI. They can be shared only between PCI
   devices.
   
 If you want to split parts that asserts irq and de-asserts it then we
 should have irqfd that tracks line status and knows interrupt line
 polarity.

Yes, it can know about polarity even though I think it's cleaner to do 
this
per gsi. But it can not track line status as line is shared with
other devices.
   It should track only device's line status.
  
  There is no such thing as device's line status on real hardware, either.
  Devices do not drive INT# high: they drive it low (all the time)
  or do not drive it at all.
 Same thing, other naming. Device either drive it low (irq_set(1)) or
 not (irq_set(0)).

Well, if I drive it low any number of times it should hae no effect.

  
  Or consider express, the spec explicitly says:
  Note: Duplicate Assert_INTx/Deassert_INTx Messages have no effect, but
   are not errors.
  

  Another application is out of process virtio (sandboxing!).
 It will still assert and de-assert irq at the same code, so it will be
 able to track irq line status.
 
  Again, pci stuff needs to stay in qemu.
  
 
 Nothing to do with PCI whatsoever.
 
 [0] most qemu devices behave incorrectly and trigger level irq more 
 then
 needed.

Which devices?
   Most of them. They just call update_irq_status() or something and
   re-assert interrupt regardless of what previous status was.
  
  At least for PCI devices, these calls do nothing if status does not change.
 I am not sure what code are you locking at. e1000 device emulation
 doesn't check previous line status before calling qemu_set_irq().

Right. If you dig through useless levels of indirection, you will
see that each PCI device has 4 pin levels, when one of these
changes this makes it up level to the parent bus, and so on.

  
pci core tracks line status and will never assert the same
line multiple times.
   That's good if pci core does this, but device shouldn't even try it.
  
  I disagree. We don't want to duplicate a ton of code all over
  the codebase.
  
 So abstract it into a function. It shouldn't be part of PCI emulation.

I don't know what you mean by this, send a patch and we can discuss?
Note that when I patches PCI interrupt handling for compliance
I made it mimic hardware as closely as possible: devices
can send any # of assert/deassert messages, bus discards duplicates.


 [1] this is how correct PCI device should behave but we override
 polarity in ACPI, but now incorrect behaviour is deeply designed
 into vhost-net.

Not really, vhost net signals an eventfd. What happens then is
up to kvm.

   That is what current broken design does and it works, but if you want to
   save unneeded calls into kvm fix design.
  
  The interface seems clean enough: vhost handles virtio ring, qemu/kvm 
  handle pci.
  Making vhost aware of pci breaks this, I would not call that fixing the
  design.
  
 Once again. Nothing to do with PCI, everything to do with device
 emulation. And I propose to abstract interrupt assertion part into
 irqfd, not into vhost.
 
 --
   Gleb.

This could work. KVM would need to find all irqfd
objects mapped to gsi and notify them on deassert.
No idea how hard this is.


-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] KVM: MMU: Use base_role.nxe for mmu.nx

2010-09-16 Thread Avi Kivity

 On 09/15/2010 10:48 AM, Roedel, Joerg wrote:

On Tue, Sep 14, 2010 at 06:08:37PM -0400, Marcelo Tosatti wrote:

  For tdp better set base_role.nxe to zero, otherwise duplicate tdp
  pagetables can be created if the guest switches between nx/non-nx.


This does not work because bit 63 is marked as reserved if base_role.nxe
is 0. If the walk_addr_generic function then runs with tdp enabled it
would report a set nx bit as a rsvd fault.
We also can't use is_nx() in those path because it does not distinguish
between l1 and l2 nx. Are there guests that switch the efer.nx bit
regularly enough so that it matters? If so I would suggest to drop this
patch and keep mmu.nx.




I'll do that then.  I'm still unhappy about the duplication.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] KVM: MMU: Don't track nested fault info in error-code

2010-09-16 Thread Avi Kivity

 On 09/14/2010 05:46 PM, Joerg Roedel wrote:

This patch moves the detection whether a page-fault was
nested or not out of the error code and moves it into a
separate variable in the fault struct.



Applied, thanks.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] Make kvm64 the default cpu model when kvm_enabled()

2010-09-16 Thread Avi Kivity

 On 09/14/2010 05:52 PM, Joerg Roedel wrote:

As requested by Alex this patch makes kvm64 the default CPU
model when qemu is started with -enable-kvm.




This breaks compatiblity; if started with -M 0.13 or prior we should 
default to qemu64.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Gleb Natapov
On Thu, Sep 16, 2010 at 03:51:37PM +0200, Michael S. Tsirkin wrote:
 On Thu, Sep 16, 2010 at 03:18:23PM +0200, Gleb Natapov wrote:
  On Thu, Sep 16, 2010 at 02:57:17PM +0200, Michael S. Tsirkin wrote:
   On Thu, Sep 16, 2010 at 02:33:01PM +0200, Gleb Natapov wrote:
On Thu, Sep 16, 2010 at 02:13:38PM +0200, Michael S. Tsirkin wrote:
   We haver two users: qemu does deasserts, vhost-net does asserts.
  Well this is broken. You want KVM to track level for you and this is
  wrong. KVM does this anyway because it can't relay on devise model
  to behave correctly [0], but in your case it is designed to behave
  incorrectly.
  
  Interrupt type is a device property. PCI devices just happen to be 
  level
  triggered according to PCI spec. What if you want to use vhost-net 
  to
  implement network device which has active-low interrupt line? [1]
 
 The polarity would have to be reversed in gsi (irq line can be shared,
 all devices must be active high or low consistently).
 
There are gsi dedicated to PCI. They can be shared only between PCI
devices.

  If you want to split parts that asserts irq and de-asserts it then 
  we
  should have irqfd that tracks line status and knows interrupt line
  polarity.
 
 Yes, it can know about polarity even though I think it's cleaner to 
 do this
 per gsi. But it can not track line status as line is shared with
 other devices.
It should track only device's line status.
   
   There is no such thing as device's line status on real hardware, either.
   Devices do not drive INT# high: they drive it low (all the time)
   or do not drive it at all.
  Same thing, other naming. Device either drive it low (irq_set(1)) or
  not (irq_set(0)).
 
 Well, if I drive it low any number of times it should hae no effect.
 
There is no meaning of driving low the line multiple times on real HW.
You either drive it low or not. We try to emulate this with individual
drive low/high events in software.

   
   Or consider express, the spec explicitly says:
   Note: Duplicate Assert_INTx/Deassert_INTx Messages have no effect, but
are not errors.
   
 
   Another application is out of process virtio (sandboxing!).
  It will still assert and de-assert irq at the same code, so it will 
  be
  able to track irq line status.
  
   Again, pci stuff needs to stay in qemu.
   
  
  Nothing to do with PCI whatsoever.
  
  [0] most qemu devices behave incorrectly and trigger level irq more 
  then
  needed.
 
 Which devices?
Most of them. They just call update_irq_status() or something and
re-assert interrupt regardless of what previous status was.
   
   At least for PCI devices, these calls do nothing if status does not 
   change.
  I am not sure what code are you locking at. e1000 device emulation
  doesn't check previous line status before calling qemu_set_irq().
 
 Right. If you dig through useless levels of indirection, you will
 see that each PCI device has 4 pin levels, when one of these
 changes this makes it up level to the parent bus, and so on.
Yes. Qemu PCI level does it right. Ideally device would not even invoke 
this logic if interrupt status haven't changed.

 
   
 pci core tracks line status and will never assert the same
 line multiple times.
That's good if pci core does this, but device shouldn't even try it.
   
   I disagree. We don't want to duplicate a ton of code all over
   the codebase.
   
  So abstract it into a function. It shouldn't be part of PCI emulation.
 
 I don't know what you mean by this, send a patch and we can discuss?
I don't care enough to send patch.  Just remember previous irq status
and do not call qemu_set_irq() if it doesn't change. Three lines of
code.

 Note that when I patches PCI interrupt handling for compliance
 I made it mimic hardware as closely as possible: devices
 can send any # of assert/deassert messages, bus discards duplicates.
 
Qemu PCI code is correct as far as I can see. Not all devices are connected
via PCI and there is not need to go through couple of layer of
indirection to figure out that nothing should be done.


 
  [1] this is how correct PCI device should behave but we override
  polarity in ACPI, but now incorrect behaviour is deeply designed
  into vhost-net.
 
 Not really, vhost net signals an eventfd. What happens then is
 up to kvm.
 
That is what current broken design does and it works, but if you want to
save unneeded calls into kvm fix design.
   
   The interface seems clean enough: vhost handles virtio ring, qemu/kvm 
   handle pci.
   Making vhost aware of pci breaks this, I would not call that fixing the
   design.
   
  Once again. Nothing to do with PCI, everything to do with device
  emulation. And I propose to abstract interrupt assertion part into
  irqfd, not into 

Re: [PATCH 3/3] Add svm cpuid features

2010-09-16 Thread Alexander Graf
Roedel, Joerg wrote:
 On Thu, Sep 16, 2010 at 10:06:48AM -0400, Avi Kivity wrote:
   
   On 09/14/2010 05:52 PM, Joerg Roedel wrote:
 
 This patch adds the svm cpuid feature flags to the qemu
 intialization path. It also adds the svm features available
 on phenom to its cpu-definition and extends the host cpu
 type to support all svm features KVM can provide.


   
 Does phenom really have vmcbclean?  I thought it was a really new feature.
 

 No, but we could emulate it on almost all hardware that has SVM. Its
 basically the same as with x2apic.
   

-cpu non-host is not about what we could emulate, but what the
hardware really is. Features not supported by the particular CPU do not
belong there.


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] Add svm cpuid features

2010-09-16 Thread Alexander Graf
Roedel, Joerg wrote:
 On Thu, Sep 16, 2010 at 10:17:09AM -0400, Alexander Graf wrote:
   
 Roedel, Joerg wrote:
 
 On Thu, Sep 16, 2010 at 10:06:48AM -0400, Avi Kivity wrote:
   
   
   On 09/14/2010 05:52 PM, Joerg Roedel wrote:
 
 
 This patch adds the svm cpuid feature flags to the qemu
 intialization path. It also adds the svm features available
 on phenom to its cpu-definition and extends the host cpu
 type to support all svm features KVM can provide.


   
   
 Does phenom really have vmcbclean?  I thought it was a really new feature.
 
 
 No, but we could emulate it on almost all hardware that has SVM. Its
 basically the same as with x2apic.
   
   
 -cpu non-host is not about what we could emulate, but what the
 hardware really is. Features not supported by the particular CPU do not
 belong there.
 

 I'll remove it then. It would have been nice to have it there because it
 will improve performance of svm emulation. But if it is about emulating
 the real cpu then I'll just drop it.
   

speed == -cpu host :). If the performance increase is significant, we
can always add a new cpu type for a cpu that does support those flags so
people can use it there and still be migration safe. In fact, that would
even benefit you guys as you'd sell more new machines for speed ;).


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Gleb Natapov
On Thu, Sep 16, 2010 at 04:23:35PM +0200, Michael S. Tsirkin wrote:
 There is no such thing as device's line status on real hardware, 
 either.
 Devices do not drive INT# high: they drive it low (all the time)
 or do not drive it at all.
Same thing, other naming. Device either drive it low (irq_set(1)) or
not (irq_set(0)).
   
   Well, if I drive it low any number of times it should hae no effect.
   
  There is no meaning of driving low the line multiple times on real HW.
  You either drive it low or not. We try to emulate this with individual
  drive low/high events in software.
 
 Or consider express, the spec explicitly says:
 Note: Duplicate Assert_INTx/Deassert_INTx Messages have no effect, 
 but
 
 Express has assert and deassert messages. It might be easier for
 you to think in these terms. level 1: assert, level 0: deassert.
 Seems a simple model and what we do models this pretty well.
 
They are between device and pci controller. I am talking about what
happens between pci controller and irq chip. We are talking about
different things really.

  are not errors.
 
   
 Another application is out of process virtio (sandboxing!).
It will still assert and de-assert irq at the same code, so it 
will be
able to track irq line status.

 Again, pci stuff needs to stay in qemu.
 

Nothing to do with PCI whatsoever.

[0] most qemu devices behave incorrectly and trigger level irq 
more then
needed.
   
   Which devices?
  Most of them. They just call update_irq_status() or something and
  re-assert interrupt regardless of what previous status was.
 
 At least for PCI devices, these calls do nothing if status does not 
 change.
I am not sure what code are you locking at. e1000 device emulation
doesn't check previous line status before calling qemu_set_irq().
   
   Right. If you dig through useless levels of indirection, you will
   see that each PCI device has 4 pin levels, when one of these
   changes this makes it up level to the parent bus, and so on.
  Yes. Qemu PCI level does it right. Ideally device would not even invoke 
  this logic if interrupt status haven't changed.
 
 It needs to call *some* function to check status
 and assert, right? qemu_set_irq is that function.
qemu_set_irq does not check previous level and calls a callback
unconditionally.

 
   
 
   pci core tracks line status and will never assert the same
   line multiple times.
  That's good if pci core does this, but device shouldn't even try it.
 
 I disagree. We don't want to duplicate a ton of code all over
 the codebase.
 
So abstract it into a function. It shouldn't be part of PCI emulation.
   
   I don't know what you mean by this, send a patch and we can discuss?
  I don't care enough to send patch.  Just remember previous irq status
  and do not call qemu_set_irq() if it doesn't change. Three lines of
  code.
 
 Heh, we have a ton of devices to support.
So?

 And then we need to migrate this extra status, and make sure it's in
 sync with PCI code.  We'll end up with much more more than 3 lines all
 of it in a very sensitive and hard to test parts code.
 
You should be able to reconstruct it from device state. What should be
in sync with PCI code? 

   Note that when I patches PCI interrupt handling for compliance
   I made it mimic hardware as closely as possible: devices
   can send any # of assert/deassert messages, bus discards duplicates.
   
  Qemu PCI code is correct as far as I can see. Not all devices are connected
  via PCI and there is not need to go through couple of layer of
  indirection to figure out that nothing should be done.
  
 
 If we want to remove the indirection I would be much more
 interested to remove it for all cases, not just when
 nothing should be done.
I don't care. This indirection may be justified for all I know. You try to
shift this discussion to areas I am not interested to look into :) All I
am saying is that each device is capable of knowing its current irq line
state and optimize out function call + additional logic. Whether upper
layer should handle two asserts without de-assert in between is different
point and I think we agree on it.

 
   
[1] this is how correct PCI device should behave but we override
polarity in ACPI, but now incorrect behaviour is deeply 
designed
into vhost-net.
   
   Not really, vhost net signals an eventfd. What happens then is
   up to kvm.
   
  That is what current broken design does and it works, but if you 
  want to
  save unneeded calls into kvm fix design.
 
 The interface seems clean enough: vhost handles virtio ring, qemu/kvm 
 handle pci.
 Making vhost aware of pci breaks this, I would not call that fixing 
 the
 design.

Re: [PATCH 19/24] Deciding if L0 or L1 should handle an L2 exit

2010-09-16 Thread Nadav Har'El
On Mon, Jun 14, 2010, Avi Kivity wrote about Re: [PATCH 19/24] Deciding if L0 
or L1 should handle an L2 exit:
 On 06/13/2010 03:32 PM, Nadav Har'El wrote:
 This patch contains the logic of whether an L2 exit should be handled by L0
 and then L2 should be resumed, or whether L1 should be run to handle this
 exit (using the nested_vmx_vmexit() function of the previous patch).
 The basic idea is to let L1 handle the exit only if it actually asked to
 trap this sort of event. For example, when L2 exits on a change to CR0,
...
 @@ -3819,6 +3841,8 @@ static int handle_exception(struct kvm_v
 
  if (is_no_device(intr_info)) {
  vmx_fpu_activate(vcpu);
 +if (vmx-nested.nested_mode)
 +vmx-nested.nested_run_pending = 1;
  return 1;
  }

 
 Isn't this true for many other exceptions?  #UD which we emulate (but 
 the guest doesn't trap), page faults which we handle completely...

I was trying to think why nested_run_pending=1 (forcing us to run L2 next)
is necessary in the specific case of #NM, and couldn't think of any convincing
reason. Sure, in most cases we would like to continue running L2 after L0
serviced this exception that L1 didn't care about, but in the rare cases
where L1 should run next (especially, user-space injecting an interrupt),
what's so wrong with L1 running next?
And, like you said, if it's important for #NM, why not for #PF or other
things?

Anyway, the code appears to run correctly also without this setting,
so I'm guessing that it's an historic artifact, from older code which
was written before the days of the lazy fpu loading. So I'm removing it.

Good catch. I was aware of this peculiar case in the code (and even
documented it in the patch's description), but should have stopped to
think if it is really such a special case, or simply an error. And now
I believe it was nothing more than an error.

 +/* Return 1 if we should exit from L2 to L1 to handle a CR access exit,
 + * rather than handle it ourselves in L0. I.e., check if L1 wanted to
 + * intercept (via guest_host_mask etc.) the current event.
 + */
 +static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 +struct shadow_vmcs *l2svmcs)
 +{
...
 +case 8:
 +if (l2svmcs-cpu_based_vm_exec_control
 +CPU_BASED_CR8_LOAD_EXITING)
 +return 1;

 
 Should check TPR threshold here too if enabled.

I'll return to this issue in another mail. This one is getting long enough.

 +case 3: /* lmsw */
 +if (l2svmcs-cr0_guest_host_mask
 +(val ^ l2svmcs-cr0_read_shadow))
 +return 1;

 
 Need to mask off bit 0 (cr0.pe) of val, since lmsw can't clear it.

Right. Also, lmsw only works on the first 4 bits of cr0: The first bit, it can
only turn on (like you said), and the next 3 bits, it can change at will. Any
other attempted changes to cr0 through lmsw should be ignored, and not cause
exits.  So I changed the code to this:

if (vmcs12-cr0_guest_host_mask  0xe 
(val ^ vmcs12-cr0_read_shadow))
return 1;
if ((vmcs12-cr0_guest_host_mask  0x1) 
!(vmcs12-cr0_read_shadow  0x1) 
(val  0x1))
return 1;

I wonder if there's a less ugly way to write the same thing...

This LMSW is so 80s :( I wonder who's using it these days, and specifically
if it would bother anyone if lmsw suddenly acquired new abilities to
change bits it never could... Oh, the things that we do for backward
compatibility :-)

 +/* Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
 + * should handle it ourselves in L0. Only call this when in nested_mode 
 (L2).
 + */
 +static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool afterexit)
...
 +case EXIT_REASON_EXCEPTION_NMI:
 +if (is_external_interrupt(intr_info)
 +(l2svmcs-pin_based_vm_exec_control
 + PIN_BASED_EXT_INTR_MASK))
 +r = 1;

 
 A real external interrupt should never be handled by the guest, only a 
 virtual external interrupt.

You've opened a whole can of worms ;-) But it's very good that you did.

It appears that this nested_vmx_exit_handled() was called for two completely
different reasons, with different afterexit parameter (if you remember,
this flag had the puzzling name kvm_override in a previous version).
On normal exits, it is called with afterexit=1 and did exactly what you wanted,
i.e., never handle in the guest:

case EXIT_REASON_EXTERNAL_INTERRUPT:
return 0;

The case which you saw was only relevant for the other place this function is
called - in exception injection. But most of the code in this function was
irrelevant and/or plain wrong in this case.

This part of the code was just terrible, and I couldn't leave it like this,
even 

Re: Exceed 1GB/s with virtio-net ?

2010-09-16 Thread Alex Williamson
On Tue, 2010-09-14 at 18:14 +0200, Thibault VINCENT wrote:
 On 13/09/2010 19:34, Alex Williamson wrote:
  On Mon, Sep 13, 2010 at 4:32 AM, Thibault VINCENT
  thibault.vinc...@smartjog.com wrote:
  Hello
 
  I'm trying to achieve higher than gigabit transferts over a virtio NIC
  with no success, and I can't find a recent bug or discussion about such
  an issue.
 
  The simpler test consist of two VM running on a high-end blade server
  with 4 cores and 4GB RAM each, and a virtio NIC dedicated to the
  inter-VM communication. On the host, the two vnet interfaces are
  enslaved into a bridge. I use a combination of 2.6.35 on the host and
  2.6.32 in the VMs.
  Running iperf or netperf on these VMs, with TCP or UDP, result in
  ~900Mbits/s transferts. This is what could be expected of a 1G
  interface, and indeed the e1000 emulation performs similar.
 
  Changing the txqueuelen, MTU, and offloading settings on every interface
  (bridge/tap/virtio_net) didn't improve the speed, nor did the
  installation of irqbalance and the increase in CPU and RAM.
 
  Is this normal ? Is the multiple queue patch intended to address this ?
  It's quite possible I missed something :)
  
  I'm able to achieve quite a bit more than 1Gbps using virtio-net
  between 2 guests on the same host connected via an internal bridge.
  With the virtio-net TX bottom half handler I can easily hit 7Gbps TCP
  and 10+Gbps UDP using netperf (TCP_STREAM/UDP_STREAM tests).  Even
  without the bottom half patches (not yet in qemu-kvm.git), I can get
  ~5Gbps.  Maybe you could describe your setup further, host details,
  bridge setup, guests, specific tests, etc...  Thanks,
 
 Thanks Alex, I don't use the bottom half patches but anything between
 3Gbps and 5Gbps would be fine. Here are some more details:
 
 Host
 -
 Dell M610 ; 2 x Xeon X5650 ; 6 x 8GB
 Debian Squeeze amd64
 qemu-kvm 0.12.5+dfsg-1
 kernel 2.6.35-1 amd64 (Debian Experimental)
 
 Guests
 ---
 Debian Squeeze amd64
 kernel 2.6.35-1 amd64 (Debian Experimental)
 
 To measure the throughput between the guests, I do the following.
 
 On the host:
  * create a bridge
# brctl addbr br_test
# ifconfig br_test 1.1.1.1 up
  * start two guests
# kvm -enable-kvm -m 4096 -smp 4 -drive
 file=/dev/vg/deb0,id=0,boot=on,format=raw -device
 virtio-blk-pci,drive=0,id=0 -device
 virtio-net-pci,vlan=0,id=1,mac=52:54:00:cf:6a:b0 -net
 tap,vlan=0,name=hostnet0
# kvm -enable-kvm -m 4096 -smp 4 -drive
 file=/dev/vg/deb1,id=0,boot=on,format=raw -device
 virtio-blk-pci,drive=0,id=0 -device
 virtio-net-pci,vlan=0,id=1,mac=52:54:00:cf:6a:b1 -net
 tap,vlan=0,name=hostnet0
  * add guests to the bridge
# brctl addif br_test tap0
# brctl addif br_test tap1
 
 On the first guest:
  # ifconfig eth0 1.1.1.2 up
  # iperf -s -i 1
 
 On the second guest:
  # ifconfig eth0 1.1.1.3 up
  # iperf -c 1.1.1.2 -i 1
 
 Client connecting to 1.1.1.2, TCP port 5001
 TCP window size: 16.0 KByte (default)
 
 [  3] local 1.1.1.3 port 43510 connected with 1.1.1.2 port 5001
 [ ID] Interval   Transfer Bandwidth
 [  3]  0.0- 1.0 sec  80.7 MBytes677 Mbits/sec
 [  3]  1.0- 2.0 sec102 MBytes855 Mbits/sec
 [  3]  2.0- 3.0 sec101 MBytes847 Mbits/sec
 [  3]  3.0- 4.0 sec104 MBytes873 Mbits/sec
 [  3]  4.0- 5.0 sec104 MBytes874 Mbits/sec
 [  3]  5.0- 6.0 sec105 MBytes881 Mbits/sec
 [  3]  6.0- 7.0 sec103 MBytes862 Mbits/sec
 [  3]  7.0- 8.0 sec101 MBytes848 Mbits/sec
 [  3]  8.0- 9.0 sec105 MBytes878 Mbits/sec
 [  3]  9.0-10.0 sec105 MBytes882 Mbits/sec
 [  3]  0.0-10.0 sec  1011 MBytes848 Mbits/sec
 
 On the host again:
  # iperf -c 1.1.1.1 -i 1
 
 Client connecting to 1.1.1.1, TCP port 5001
 TCP window size: 16.0 KByte (default)
 
 [  3] local 1.1.1.3 port 60456 connected with 1.1.1.1 port 5001
 [ ID] Interval   Transfer Bandwidth
 [  3]  0.0- 1.0 sec  97.9 MBytes821 Mbits/sec
 [  3]  1.0- 2.0 sec136 MBytes  1.14 Gbits/sec
 [  3]  2.0- 3.0 sec153 MBytes  1.28 Gbits/sec
 [  3]  3.0- 4.0 sec160 MBytes  1.34 Gbits/sec
 [  3]  4.0- 5.0 sec156 MBytes  1.31 Gbits/sec
 [  3]  5.0- 6.0 sec122 MBytes  1.02 Gbits/sec
 [  3]  6.0- 7.0 sec121 MBytes  1.02 Gbits/sec
 [  3]  7.0- 8.0 sec137 MBytes  1.15 Gbits/sec
 [  3]  8.0- 9.0 sec139 MBytes  1.17 Gbits/sec
 [  3]  9.0-10.0 sec140 MBytes  1.17 Gbits/sec
 [  3]  0.0-10.0 sec  1.33 GBytes  1.14 Gbits/sec
 
 
 You can see it's quite slow compared to your figures, between the guests
 and with the host too. And there is no specific load on any of the three
 systems, htop in a guest only report one of the four cores going up to
 70% (sys+user+wait) during the test.
 
 The other tests I mentioned are:
  * iperf 

Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Michael S. Tsirkin
On Thu, Sep 16, 2010 at 04:51:17PM +0200, Gleb Natapov wrote:
   What for? Device emulation should do de-assert.
  
  Sorry, but at this point I have no idea what you call device emulation.
 The same thing everyone calls device emulation. In case of virtio-net it
 is in hw/virtio-net.c. If vhost-net is in use device emulation is split
 between userspace and kernel, but it is still just device emulation.


case in point, virtio net does not know about pci at all,
so it can not deassert.

  qemu has code to de-assert. vhost has code to assert.
 Good. So qemu will de-assert. So what do you mean by
 KVM would need to find all irqfd objects mapped to gsi and notify
 them on deassert
 
  I would like to optimize level interrupts and stop driving
  scheduler insane if at all possible.
  
 Worthy goal. Do it in irqfd. Irqfd shouldn't call kvm_set_irq() if irq
 level hasn't changed.

Right. Then it needs to know about deasserts. It does not get this
information, so when kvm gets deassert ioctl it should
locate irqfd object and set level to 0.
See?

-- 
MST

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] device-assignment: register a reset function

2010-09-16 Thread Alex Williamson
On Tue, Sep 14, 2010 at 9:04 AM, Bernhard Kohl bernhard.k...@nsn.com wrote:
 This is necessary because during reboot of a VM the assigned devices
 continue DMA transfers which causes memory corruption.

 Signed-off-by: Thomas Ostler thomas.ost...@nsn.com
 Signed-off-by: Bernhard Kohl bernhard.k...@nsn.com
 ---
  hw/device-assignment.c |   14 ++
  1 files changed, 14 insertions(+), 0 deletions(-)

 diff --git a/hw/device-assignment.c b/hw/device-assignment.c
 index 87f7418..001aee8 100644
 --- a/hw/device-assignment.c
 +++ b/hw/device-assignment.c
 @@ -1450,6 +1450,17 @@ static void 
 assigned_dev_unregister_msix_mmio(AssignedDevice *dev)
     dev-msix_table_page = NULL;
  }

 +static void reset_assigned_device(void *opaque)
 +{
 +    PCIDevice *d = (PCIDevice *)opaque;
 +    uint32_t conf;
 +
 +    /* reset the bus master bit to avoid further DMA transfers */
 +    conf = assigned_dev_pci_read_config(d, 0x04, 0x02);
 +    conf = ~0x04;
 +    assigned_dev_pci_write_config(d, 0x04, conf, 0x02);

This should use defined macros, PCI_COMMAND  PCI_COMMAND_MASTER.
Otherwise seems ok.  Thanks,

Alex

 +}
 +
  static int assigned_initfn(struct PCIDevice *pci_dev)
  {
     AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
 @@ -1499,6 +1510,9 @@ static int assigned_initfn(struct PCIDevice *pci_dev)
     if (r  0)
         goto assigned_out;

 +    /* register reset function for the device */
 +    qemu_register_reset(reset_assigned_device, pci_dev);
 +
     /* intercept MSI-X entry page in the MMIO */
     if (dev-cap.available  ASSIGNED_DEVICE_CAP_MSIX)
         if (assigned_dev_register_msix_mmio(dev))
 --
 1.7.2.2

 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 20/24] Correct handling of interrupt injection

2010-09-16 Thread Nadav Har'El
On Mon, Jun 14, 2010, Avi Kivity wrote about Re: [PATCH 20/24] Correct 
handling of interrupt injection:
..
 However, properly doing what is described above requires invasive changes 
 to
 the flow of the existing code, which we elected not to do in this stage.
 Instead we do something more simplistic and less efficient: we modify
...
 
 That's a little sad.

I agree. I'd like to change this code to do the proper thing (as I explained
in the patch's description), but as I said this will require some invasive
changes to existing KVM code outside vmx.c.
So seeing that this existing code also works, and despite hurting performance
a bit there are much more pressing performance issues (namely, the need
for nested EPT) - with your permission I'd like to postpone fixing this issue.

   static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
   {
 +if (to_vmx(vcpu)-nested.nested_mode  nested_exit_on_intr(vcpu)) {
 +if (to_vmx(vcpu)-nested.nested_run_pending)
 +return 0;
 +nested_vmx_vmexit(vcpu, true);
 +/* fall through to normal code, but now in L1, not L2 */
 +}
 +

 
 What exit is reported here?

When nested_vmx_vmexit is called with the second parameter true, as above,
it modifies the (vmcs12) exit reason to be EXIT_REASON_EXTERNAL_INTERRUPT.
A hack, but it does the right thing in this case because L1 doesn't even get
a chance to care about this exit reason before it exits again (as I tried to
explain in the patch's description).

-- 
Nadav Har'El| Thursday, Sep 16 2010, 8 Tishri 5771
n...@math.technion.ac.il |-
Phone +972-523-790466, ICQ 13349191 |Debugging: the art of removing bugs.
http://nadav.harel.org.il   |Programming: the art of inserting them.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: .img on nfs, relative on ram, consuming mass ram

2010-09-16 Thread TOURNIER Frédéric
I can't do this because i need performance.

I'm currently doing some tests. Will post soon.

My config's map :

NFS  PXE server  qemu-kvm host
---  
| img.img |__|relimg.img|
| readonly|   net|  |
---  

Keep in touch and thx for time.

Fred.

On Thu, 16 Sep 2010 15:48:18 +0200
Andre Przywara andre.przyw...@amd.com wrote:

 Stefan Hajnoczi wrote:
  2010/9/16 Andre Przywara andre.przyw...@amd.com:
  TOURNIER Frédéric wrote:
  Ok, thanks for taking time.
  I'll dig into your answers.
 
  So as i run relative.img on diskless systems with original.img on nfs,
  what are the best practice/tips i can use ?
  I thinks it is -snapshot you are looking for.
  This will put the backing store into normal RAM, and you can later commit
  it to the original image if needed. See the qemu manpage for more details.
  In a nutshell you just specify the original image and add -snapshot to the
  command line.
  
  -snapshot creates a temporary qcow2 image in /tmp whose backing file
  is your original image.  I'm not sure what you mean by This will put
  the backing store into normal RAM?
 Stefan, you are right. I never looked into the code and because the file 
 in /tmp is deleted just after creation there wasn't a sign of it.
 For some reason I thought that the buffer would just be allocated in 
 memory. Sorry, my mistake and thanks for pointing this out.
 
 So Fred, unfortunately this does not solve your problem. I guess you run 
 into a general problem. If the guest actually changes so much of the 
 disk that this cannot be backed by RAM in the host, you have lost.
 One solution could be to just make (at least parts of) the disk 
 read-only (a write protected /usr partition works quite well).
 If you are sure that writes are not that frequent, you could think of 
 putting the overlay file also on the remote storage (NFS). Although this 
 is rather slow, it shouldn't matter if there aren't many writes and the 
 local page cache should catch most of the accesses (while still being 
 nice to other RAM users).
 
 Regards,
 Andre.
  
  Stefan
  --
  To unsubscribe from this list: send the line unsubscribe kvm in
  the body of a message to majord...@vger.kernel.org
  More majordomo info at  http://vger.kernel.org/majordomo-info.html
  
 
 
 -- 
 Andre Przywara
 AMD-Operating System Research Center (OSRC), Dresden, Germany
 Tel: +49 351 448-3567-12
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 16/24] Implement VMLAUNCH and VMRESUME

2010-09-16 Thread Nadav Har'El
On Thu, Jun 17, 2010, Gleb Natapov wrote about Re: [PATCH 16/24] Implement 
VMLAUNCH and VMRESUME:
  +static int handle_launch_or_resume(struct kvm_vcpu *vcpu, bool launch)
  +{
  +   if (!nested_vmx_check_permission(vcpu))
...
 Should also check MOV SS blocking. Why Intel decided that vm entry
 should fail in this case? How knows, but spec says so.

Thanks. Added the check:

if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)  GUEST_INTR_STATE_MOV_SS){
nested_vmx_failValid(vcpu,
VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
skip_emulated_instruction(vcpu);
return 1;
}

Like you, I don't understand why this test is at all necessary...

-- 
Nadav Har'El| Thursday, Sep 16 2010, 8 Tishri 5771
n...@math.technion.ac.il |-
Phone +972-523-790466, ICQ 13349191 |Help Microsoft stamp out piracy. Give
http://nadav.harel.org.il   |Linux to a friend today!
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: VNC console CTRL key locked

2010-09-16 Thread Michael Tokarev
16.09.2010 18:01, Peter Lieven wrote:
 Hi,
 
 with Ubuntu 10.04 Desktop guest in Qemu-KVM 0.12.4 I sometimes see a
 condition where the CTRL key
 is locked and cannot be released anymore. The only way I have found to
 solve this is to restart
 the VM.
 
 Has anyone else experienced this so far?

Do you use any -k $lang switch?  I've seen something
similar with -k, but it was long time ago...

/mjt
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: VNC console CTRL key locked

2010-09-16 Thread Peter Lieven
Am 16.09.2010 um 19:13 schrieb Michael Tokarev:

 16.09.2010 18:01, Peter Lieven wrote:
 Hi,
 
 with Ubuntu 10.04 Desktop guest in Qemu-KVM 0.12.4 I sometimes see a
 condition where the CTRL key
 is locked and cannot be released anymore. The only way I have found to
 solve this is to restart
 the VM.
 
 Has anyone else experienced this so far?
 
 Do you use any -k $lang switch?  I've seen something
 similar with -k, but it was long time ago...

hi michael,

i use -k de

br,
peter

 
 /mjt

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] kvm: enable irq injection from interrupt context

2010-09-16 Thread Michael S. Tsirkin
On Thu, Sep 16, 2010 at 05:43:26PM +0200, Gleb Natapov wrote:
 On Thu, Sep 16, 2010 at 05:24:11PM +0200, Michael S. Tsirkin wrote:
  On Thu, Sep 16, 2010 at 04:51:17PM +0200, Gleb Natapov wrote:
 What for? Device emulation should do de-assert.

Sorry, but at this point I have no idea what you call device emulation.
   The same thing everyone calls device emulation. In case of virtio-net it
   is in hw/virtio-net.c. If vhost-net is in use device emulation is split
   between userspace and kernel, but it is still just device emulation.
  
  
  case in point, virtio net does not know about pci at all,
  so it can not deassert.
  
 I don't see what PCI has to do with it. virtio-net device implementation is
 split between several files. virtio-net/virtio-ring/virtio-pci. All of
 them still implement one emulated device. Whoever implemented virtio-net
 initially decided to put irq ack register into PCI config space, so code
 that does de-assert is in virtio-pci,
 but it is still part of emulated
 device.
 
qemu has code to de-assert. vhost has code to assert.
   Good. So qemu will de-assert. So what do you mean by
   KVM would need to find all irqfd objects mapped to gsi and notify
   them on deassert
   
I would like to optimize level interrupts and stop driving
scheduler insane if at all possible.

   Worthy goal. Do it in irqfd. Irqfd shouldn't call kvm_set_irq() if irq
   level hasn't changed.
  
  Right. Then it needs to know about deasserts. It does not get this
 Dessert should be done by qemu writing 0 into irqfd.

writing 0 to eventfd does nothing. The way to deassert irq
is currently through ioctl and it seems entirely sensible
to me to keep it that way.

Also - which irqfd :)
We need multiple irqfds to map to a single gsi, with kvm
doing OR on them.

 Assertion and
 de-assertion should be done through irqfd.
  information, so when kvm gets deassert ioctl it should
  locate irqfd object and set level to 0.
  See?
 No.
 
 --
   Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


PCI passthru issue

2010-09-16 Thread Anirban Chakraborty
Hi All,

I noticed that the PCI registers are not all mapped to the guest for a PCI pass 
through device. For example, I was trying to pass through a PCI function (NIC) 
to a guest and in the guest the MSI-X table offset register is not mapped 
properly. It was showing all 0s. This is in RHEL 6.0 beta and the guest was a 
SLES11 64 bit. 
In the RHEL 6.0 host I see following for the PCI function:
-
05:00.0 Ethernet controller: QLogic Corp. cLOM8214 1/10GbE Controller (rev 54)
Subsystem: QLogic Corp. 8200 Series Dual Port 10GbE Converged Network 
Adapter (TCP/IP Networking)
Control: I/O- Mem+ BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr+ 
Stepping- SERR+ FastB2B- DisINTx+
Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- 
MAbort- SERR- PERR- INTx-
Interrupt: pin A routed to IRQ 74
Region 0: Memory at df20 (64-bit, non-prefetchable) [size=2M]
Region 4: Memory at df1b (64-bit, non-prefetchable) [size=64K]
Expansion ROM at df1c [disabled] [size=256K]
Capabilities: [40] MSI-X: Enable- Count=32 Masked-
Vector table: BAR=0 offset=0009
PBA: BAR=0 offset=00090800
...
Kernel driver in use: pci-stub
00: 77 10 20 80 42 05 10 00 54 00 00 02 40 00 80 00
10: 04 00 20 df 00 00 00 00 00 00 00 00 00 00 00 00
20: 04 00 1b df 00 00 00 00 00 00 00 00 77 10 07 02
30: 00 00 1c df 40 00 00 00 00 00 00 00 0a 01 00 00
40: 11 80 1f 00 00 00 09 00 00 08 09 00 00 00 00 00
--
And in the SLES11 guest, I see following for the same function:

00:07.0 Ethernet controller: QLogic Corp. Device 8020 (rev 54)
Subsystem: QLogic Corp. Device 0207
Control: I/O- Mem+ BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr+ 
Stepping- SERR+ FastB2B- DisINTx+
Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- 
MAbort- SERR- PERR- INTx-
Interrupt: pin A routed to IRQ 11
Region 0: Memory at f220 (32-bit, non-prefetchable) [size=2M]
Region 4: Memory at f240 (32-bit, non-prefetchable) [size=64K]
Expansion ROM at f244 [disabled] [size=256K]
Capabilities: [40] Message Signalled Interrupts: Mask- 64bit- Count=1/1 
Enable-
Address:   Data: 
Capabilities: [50] MSI-X: Enable- Mask- TabSize=32
Vector table: BAR=0 offset=0009
PBA: BAR=0 offset=
00: 77 10 20 80 42 05 10 00 54 00 00 02 40 00 80 00
10: 00 00 20 f2 00 00 00 00 00 00 00 00 00 00 00 00
20: 00 00 40 f2 00 00 00 00 00 00 00 00 77 10 07 02
30: 08 00 44 f2 40 00 00 00 00 00 00 00 0b 01 00 00
40: 05 50 00 00 00 00 00 00 00 00 00 00 00 00 00 00


Note that the data at byte offset 40h is different for the host and the guest. 
Is this expected or is it a bug in KVM?

Is there any spec out there that specifies what PCI registers are accessible 
from guest and how much of the PCI config space is mapped to the guest? This is 
helpful to know, especially in the context of SR-IOV. If a VF is pass throughed 
to a guest, are all the VF registers visible in host system will be mapped (and 
accessible) to the guest OS?

thanks much,
Anirban--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: PCI passthru issue

2010-09-16 Thread Hidetoshi Seto
(2010/09/17 8:01), Anirban Chakraborty wrote:
 Hi All,
 
 I noticed that the PCI registers are not all mapped to the
 guest for a PCI pass through device. For example, I was trying
 to pass through a PCI function (NIC) to a guest and in the
 guest the MSI-X table offset register is not mapped properly.
 It was showing all 0s. This is in RHEL 6.0 beta and the guest
 was a SLES11 64 bit. 

 In the RHEL 6.0 host I see following for the PCI function:
 -
 05:00.0 Ethernet controller: QLogic Corp. cLOM8214 1/10GbE Controller (rev 54)
   Subsystem: QLogic Corp. 8200 Series Dual Port 10GbE Converged Network 
 Adapter (TCP/IP Networking)
   Control: I/O- Mem+ BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr+ 
 Stepping- SERR+ FastB2B- DisINTx+
   Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- 
 MAbort- SERR- PERR- INTx-
   Interrupt: pin A routed to IRQ 74
   Region 0: Memory at df20 (64-bit, non-prefetchable) [size=2M]
   Region 4: Memory at df1b (64-bit, non-prefetchable) [size=64K]
   Expansion ROM at df1c [disabled] [size=256K]
   Capabilities: [40] MSI-X: Enable- Count=32 Masked-
   Vector table: BAR=0 offset=0009
   PBA: BAR=0 offset=00090800
 ...
   Kernel driver in use: pci-stub
 00: 77 10 20 80 42 05 10 00 54 00 00 02 40 00 80 00
 10: 04 00 20 df 00 00 00 00 00 00 00 00 00 00 00 00
 20: 04 00 1b df 00 00 00 00 00 00 00 00 77 10 07 02
 30: 00 00 1c df 40 00 00 00 00 00 00 00 0a 01 00 00
 40: 11 80 1f 00 00 00 09 00 00 08 09 00 00 00 00 00
 --
 And in the SLES11 guest, I see following for the same function:
 
 00:07.0 Ethernet controller: QLogic Corp. Device 8020 (rev 54)
   Subsystem: QLogic Corp. Device 0207
   Control: I/O- Mem+ BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr+ 
 Stepping- SERR+ FastB2B- DisINTx+
   Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- 
 MAbort- SERR- PERR- INTx-
   Interrupt: pin A routed to IRQ 11
   Region 0: Memory at f220 (32-bit, non-prefetchable) [size=2M]
   Region 4: Memory at f240 (32-bit, non-prefetchable) [size=64K]
   Expansion ROM at f244 [disabled] [size=256K]
   Capabilities: [40] Message Signalled Interrupts: Mask- 64bit- Count=1/1 
 Enable-
   Address:   Data: 
   Capabilities: [50] MSI-X: Enable- Mask- TabSize=32
   Vector table: BAR=0 offset=0009
   PBA: BAR=0 offset=
 00: 77 10 20 80 42 05 10 00 54 00 00 02 40 00 80 00
 10: 00 00 20 f2 00 00 00 00 00 00 00 00 00 00 00 00
 20: 00 00 40 f2 00 00 00 00 00 00 00 00 77 10 07 02
 30: 08 00 44 f2 40 00 00 00 00 00 00 00 0b 01 00 00
 40: 05 50 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 
 
 Note that the data at byte offset 40h is different for the
 host and the guest. Is this expected or is it a bug in KVM?

This is expected, since MSIs of PCI pass through device need
some emulation to route them to the guest instead of the host.

So what we can see on the guest's PCI pass through device is
modified capability list that contains emulated capabilities.

 
 Is there any spec out there that specifies what PCI registers
 are accessible from guest and how much of the PCI config space
 is mapped to the guest? This is helpful to know, especially
 in the context of SR-IOV. If a VF is pass throughed to a guest,
 are all the VF registers visible in host system will be mapped
 (and accessible) to the guest OS?

AFAIK the QEMU source code, hw/device-assignment.c, will help
you to know what are going on there.

I'm not sure what interesting registers can the VF have.
But I think you should be careful because the register in the
PCI config space might not work like that on the host even the
register is mapped and visible from the guest.


Thanks,
H.Seto

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [RFC PATCH v9 12/16] Add mp(mediate passthru) device.

2010-09-16 Thread Xin, Xiaohui
From: Michael S. Tsirkin [mailto:m...@redhat.com]
Sent: Wednesday, September 15, 2010 7:28 PM
To: Xin, Xiaohui
Cc: net...@vger.kernel.org; kvm@vger.kernel.org; linux-ker...@vger.kernel.org;
mi...@elte.hu; da...@davemloft.net; herb...@gondor.hengli.com.au;
jd...@linux.intel.com
Subject: Re: [RFC PATCH v9 12/16] Add mp(mediate passthru) device.

On Wed, Sep 15, 2010 at 11:13:44AM +0800, Xin, Xiaohui wrote:
 From: Michael S. Tsirkin [mailto:m...@redhat.com]
 Sent: Sunday, September 12, 2010 9:37 PM
 To: Xin, Xiaohui
 Cc: net...@vger.kernel.org; kvm@vger.kernel.org; 
 linux-ker...@vger.kernel.org;
 mi...@elte.hu; da...@davemloft.net; herb...@gondor.hengli.com.au;
 jd...@linux.intel.com
 Subject: Re: [RFC PATCH v9 12/16] Add mp(mediate passthru) device.
 
 On Sat, Sep 11, 2010 at 03:41:14PM +0800, Xin, Xiaohui wrote:
  Playing with rlimit on data path, transparently to the application in 
  this way
  looks strange to me, I suspect this has unexpected security 
  implications.
  Further, applications may have other uses for locked memory
  besides mpassthru - you should not just take it because it's there.
  
  Can we have an ioctl that lets userspace configure how much
  memory to lock? This ioctl will decrement the rlimit and store
  the data in the device structure so we can do accounting
  internally. Put it back on close or on another ioctl.
  Yes, we can decrement the rlimit in ioctl in one time to avoid
  data path.
  
  Need to be careful for when this operation gets called
  again with 0 or another small value while we have locked memory -
  maybe just fail with EBUSY?  or wait until it gets unlocked?
  Maybe 0 can be special-cased and deactivate zero-copy?.
  
 
  How about we don't use a new ioctl, but just check the rlimit
  in one MPASSTHRU_BINDDEV ioctl? If we find mp device
  break the rlimit, then we fail the bind ioctl, and thus can't do
  zero copy any more.
 
 Yes, and not just check, but decrement as well.
 I think we should give userspace control over
 how much memory we can lock and subtract from the rlimit.
 It's OK to add this as a parameter to MPASSTHRU_BINDDEV.
 Then increment the rlimit back on unbind and on close?
 
 This opens up an interesting condition: process 1
 calls bind, process 2 calls unbind or close.
 This will increment rlimit for process 2.
 Not sure how to fix this properly.
 
 I can't too, can we do any synchronous operations on rlimit stuff?
 I quite suspect in it.

 --
 MST

Here's what infiniband does: simply pass the amount of memory userspace
wants you to lock on an ioctl, and verify that either you have
CAP_IPC_LOCK or this number does not exceed the current rlimit.  (must
be on ioctl, not on open, as we likely want the fd passed around between
processes), but do not decrement rlimit.  Use this on following
operations.  Be careful if this can be changed while operations are in
progress.

This does mean that the effective amount of memory that userspace can
lock is doubled, but at least it is not unlimited, and we sidestep all
other issues such as userspace running out of lockable memory simply by
virtue of using the driver.


What I have done in mp device is almost the same as it. The difference is that
I do not check the capability, and I use my own parameter ctor-pages instead
of mm-locked_vm.

So currently, 1) add the capability check 2) use mm-locked_vm 3) add
an ioctl for userspace to configure how much memory can lock.
 
--
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 2/2] kvm/e500v2: mapping guest TLB1 to host TLB0

2010-09-16 Thread Liu Yu-B13201
 

 -Original Message-
 From: Alexander Graf [mailto:ag...@suse.de] 
 Sent: Friday, September 10, 2010 7:24 AM
 To: Liu Yu-B13201
 Cc: k...@vger.kernel.org; kvm-ppc@vger.kernel.org
 Subject: Re: [PATCH 2/2] kvm/e500v2: mapping guest TLB1 to host TLB0
 
 
 On 08.09.2010, at 11:40, Liu Yu wrote:
 
  Current guest TLB1 is mapped to host TLB1.
  As host kernel only provides 4K uncontinuous pages,
  we have to break guest large mapping into 4K shadow mappings.
  These 4K shadow mappings are then mapped into host TLB1 on fly.
  As host TLB1 only has 13 free entries, there's serious tlb miss.
  
  Since e500v2 has a big number of TLB0 entries,
  it should be help to map those 4K shadow mappings to host TLB0.
  To achieve this, we need to unlink guest tlb and host tlb,
  So that guest TLB1 mappings can route to any host TLB0 
 entries freely.
  
  Pages/mappings are considerred in the same kind as host tlb entry.
  This patch remove the link between pages and guest tlb 
 entry to do the unlink.
  And keep host_tlb0_ref in each vcpu to trace pages.
  Then it's easy to map guest TLB1 to host TLB0.
  
  In guest ramdisk boot test(guest mainly uses TLB1),
  with this patch, the tlb miss number get down 90%.
  
  Signed-off-by: Liu Yu yu@freescale.com
  ---
  arch/powerpc/include/asm/kvm_e500.h |7 +-
  arch/powerpc/kvm/e500.c |4 +
  arch/powerpc/kvm/e500_tlb.c |  280 
 ---
  arch/powerpc/kvm/e500_tlb.h |1 +
  4 files changed, 104 insertions(+), 188 deletions(-)
  

 
  
  -static unsigned int tlb1_entry_num;
  +static inline unsigned int get_tlb0_entry_offset(u32 
 eaddr, u32 esel)
  +{
  +   return ((eaddr  0x7F000)  (12 - host_tlb0_assoc_bit) |
  +   (esel  (host_tlb0_assoc - 1)));
  +}
  
  void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
  {
  @@ -62,11 +68,6 @@ static inline unsigned int tlb0_get_next_victim(
  return victim;
  }
  
  -static inline unsigned int tlb1_max_shadow_size(void)
  -{
  -   return tlb1_entry_num - tlbcam_index;
  -}
  -
  static inline int tlbe_is_writable(struct tlbe *tlbe)
  {
  return tlbe-mas3  (MAS3_SW|MAS3_UW);
  @@ -100,7 +101,7 @@ static inline u32 
 e500_shadow_mas2_attrib(u32 mas2, int usermode)
  /*
   * writing shadow tlb entry to host TLB
   */
  -static inline void __write_host_tlbe(struct tlbe *stlbe)
  +static inline void __host_tlbe_write(struct tlbe *stlbe)
  {
  mtspr(SPRN_MAS1, stlbe-mas1);
  mtspr(SPRN_MAS2, stlbe-mas2);
  @@ -109,25 +110,22 @@ static inline void 
 __write_host_tlbe(struct tlbe *stlbe)
  __asm__ __volatile__ (tlbwe\n : : );
  }
  
  -static inline void write_host_tlbe(struct kvmppc_vcpu_e500 
 *vcpu_e500,
  -   int tlbsel, int esel, struct tlbe *stlbe)
  +static inline u32 host_tlb0_write(struct kvmppc_vcpu_e500 
 *vcpu_e500,
  +   u32 gvaddr, struct tlbe *stlbe)
  {
  -   local_irq_disable();
  -   if (tlbsel == 0) {
  -   __write_host_tlbe(stlbe);
  -   } else {
  -   unsigned register mas0;
  +   unsigned register mas0;
  
  -   mas0 = mfspr(SPRN_MAS0);
  +   local_irq_disable();
 
 Do you have to disable interrupts for a tlb write? If you get 
 preempted before the write, the entry you overwrite could be 
 different. But you don't protect against that either way. And 
 if you get preempted afterwards, you could lose the entry. 
 But since you enable interrupts beyond that, that could 
 happen either way too.
 
 So what's the reason for the disable here?
 

Hello Alex,

Doesn't local_irq_disable() also disable preempt?
The reason to disable interrupts is because it's possible to have tlb
misses in interrupt service route.


Thanks,
Yu

--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] kvm/e500v2: mapping guest TLB1 to host TLB0

2010-09-16 Thread Alexander Graf
Liu Yu-B13201 wrote:
  

   
 -Original Message-
 From: Alexander Graf [mailto:ag...@suse.de] 
 Sent: Friday, September 10, 2010 7:24 AM
 To: Liu Yu-B13201
 Cc: k...@vger.kernel.org; kvm-ppc@vger.kernel.org
 Subject: Re: [PATCH 2/2] kvm/e500v2: mapping guest TLB1 to host TLB0


 On 08.09.2010, at 11:40, Liu Yu wrote:

 
 Current guest TLB1 is mapped to host TLB1.
 As host kernel only provides 4K uncontinuous pages,
 we have to break guest large mapping into 4K shadow mappings.
 These 4K shadow mappings are then mapped into host TLB1 on fly.
 As host TLB1 only has 13 free entries, there's serious tlb miss.

 Since e500v2 has a big number of TLB0 entries,
 it should be help to map those 4K shadow mappings to host TLB0.
 To achieve this, we need to unlink guest tlb and host tlb,
 So that guest TLB1 mappings can route to any host TLB0 
   
 entries freely.
 
 Pages/mappings are considerred in the same kind as host tlb entry.
 This patch remove the link between pages and guest tlb 
   
 entry to do the unlink.
 
 And keep host_tlb0_ref in each vcpu to trace pages.
 Then it's easy to map guest TLB1 to host TLB0.

 In guest ramdisk boot test(guest mainly uses TLB1),
 with this patch, the tlb miss number get down 90%.

 Signed-off-by: Liu Yu yu@freescale.com
 ---
 arch/powerpc/include/asm/kvm_e500.h |7 +-
 arch/powerpc/kvm/e500.c |4 +
 arch/powerpc/kvm/e500_tlb.c |  280 
   
 ---
 
 arch/powerpc/kvm/e500_tlb.h |1 +
 4 files changed, 104 insertions(+), 188 deletions(-)

   

   
 -static unsigned int tlb1_entry_num;
 +static inline unsigned int get_tlb0_entry_offset(u32 
   
 eaddr, u32 esel)
 
 +{
 +   return ((eaddr  0x7F000)  (12 - host_tlb0_assoc_bit) |
 +   (esel  (host_tlb0_assoc - 1)));
 +}

 void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
 {
 @@ -62,11 +68,6 @@ static inline unsigned int tlb0_get_next_victim(
 return victim;
 }

 -static inline unsigned int tlb1_max_shadow_size(void)
 -{
 -   return tlb1_entry_num - tlbcam_index;
 -}
 -
 static inline int tlbe_is_writable(struct tlbe *tlbe)
 {
 return tlbe-mas3  (MAS3_SW|MAS3_UW);
 @@ -100,7 +101,7 @@ static inline u32 
   
 e500_shadow_mas2_attrib(u32 mas2, int usermode)
 
 /*
  * writing shadow tlb entry to host TLB
  */
 -static inline void __write_host_tlbe(struct tlbe *stlbe)
 +static inline void __host_tlbe_write(struct tlbe *stlbe)
 {
 mtspr(SPRN_MAS1, stlbe-mas1);
 mtspr(SPRN_MAS2, stlbe-mas2);
 @@ -109,25 +110,22 @@ static inline void 
   
 __write_host_tlbe(struct tlbe *stlbe)
 
 __asm__ __volatile__ (tlbwe\n : : );
 }

 -static inline void write_host_tlbe(struct kvmppc_vcpu_e500 
   
 *vcpu_e500,
 
 -   int tlbsel, int esel, struct tlbe *stlbe)
 +static inline u32 host_tlb0_write(struct kvmppc_vcpu_e500 
   
 *vcpu_e500,
 
 +   u32 gvaddr, struct tlbe *stlbe)
 {
 -   local_irq_disable();
 -   if (tlbsel == 0) {
 -   __write_host_tlbe(stlbe);
 -   } else {
 -   unsigned register mas0;
 +   unsigned register mas0;

 -   mas0 = mfspr(SPRN_MAS0);
 +   local_irq_disable();
   
 Do you have to disable interrupts for a tlb write? If you get 
 preempted before the write, the entry you overwrite could be 
 different. But you don't protect against that either way. And 
 if you get preempted afterwards, you could lose the entry. 
 But since you enable interrupts beyond that, that could 
 happen either way too.

 So what's the reason for the disable here?

 

 Hello Alex,

 Doesn't local_irq_disable() also disable preempt?
 The reason to disable interrupts is because it's possible to have tlb
 misses in interrupt service route.
   

Yes, local_irq_disable disables preempt. That's exactly what I was
referring to :).
I don't understand the statement about the service route. A tlb miss
still arrives with local_irq_disable.


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html