Re: [kvm-devel] [BUG] Oops with KVM-27
Luca Tettamanti wrote: >> Actually we haven't; just before the memcpy(), we can put a memcmp() to >> guard the kvm_mmu_pte_write(), which is the really expensive operation, >> especially with guest smp. >> > > Yup, but it seemed wasteful to map (at least when highmem is in use) a > page just to check for something that we already knew. That was a > preemptive optmization though, I haven't actually benchmarked the cost > of setting up the mapping ;-) > > It's negligible compared to the vmexit cost and to the emulation (which does a kmap_atomic() for every byte of the instruction; this can be easily optimized away). In any case, I expect that performance sensitive uses will use x86_64, whereas i386 is mostly for desktops. >> I think we can simply remove the if (). For the register case, the >> check is more expensive that the write; for mmio, we don't want it; and >> for memory writes, we can put it in emulator_write_phys(). >> > > Ok, this way it's simpler. How does this look: > > --- a/kernel/x86_emulate.c2007-06-15 21:13:51.0 +0200 > +++ b/kernel/x86_emulate.c2007-06-17 16:57:50.0 +0200 > @@ -1057,40 +1057,38 @@ > } > > writeback: > - if ((d & Mov) || (dst.orig_val != dst.val)) { > - switch (dst.type) { > - case OP_REG: > - /* The 4-byte case *is* correct: in 64-bit mode we > zero-extend. */ > - switch (dst.bytes) { > - case 1: > - *(u8 *)dst.ptr = (u8)dst.val; > - break; > - case 2: > - *(u16 *)dst.ptr = (u16)dst.val; > - break; > - case 4: > - *dst.ptr = (u32)dst.val; > - break; /* 64b: zero-ext */ > - case 8: > - *dst.ptr = dst.val; > - break; > - } > + switch (dst.type) { > + case OP_REG: > + /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. > */ > + switch (dst.bytes) { > + case 1: > + *(u8 *)dst.ptr = (u8)dst.val; > break; > - case OP_MEM: > - if (lock_prefix) > - rc = ops->cmpxchg_emulated((unsigned long)dst. > -ptr, &dst.orig_val, > -&dst.val, dst.bytes, > -ctxt); > - else > - rc = ops->write_emulated((unsigned long)dst.ptr, > - &dst.val, dst.bytes, > - ctxt); > - if (rc != 0) > - goto done; > - default: > + case 2: > + *(u16 *)dst.ptr = (u16)dst.val; > + break; > + case 4: > + *dst.ptr = (u32)dst.val; > + break; /* 64b: zero-ext */ > + case 8: > + *dst.ptr = dst.val; > break; > } > + break; > + case OP_MEM: > + if (lock_prefix) > + rc = ops->cmpxchg_emulated((unsigned long)dst. > +ptr, &dst.orig_val, > +&dst.val, dst.bytes, > +ctxt); > + else > + rc = ops->write_emulated((unsigned long)dst.ptr, > + &dst.val, dst.bytes, > + ctxt); > + if (rc != 0) > + goto done; > + default: > + break; > } > > /* Commit shadow register state. */ > > --- a/kernel/kvm_main.c 2007-06-15 21:18:08.0 +0200 > +++ b/kernel/kvm_main.c 2007-06-17 16:59:33.0 +0200 > @@ -1139,8 +1139,10 @@ > return 0; > mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); > virt = kmap_atomic(page, KM_USER0); > - kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes); > - memcpy(virt + offset_in_page(gpa), val, bytes); > + if (memcmp(virt + offset_in_page(gpa), val, bytes)) { > + kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes); > + memcpy(virt + offset_in_page(gpa), val, bytes); > + } > kunmap_atomic(virt, KM_USER0); > return 1; > } > > > Excellent. We win back a precious indentation level and fix a bug at the same time. Please test, send me a changelog and a signoff and I'll commit it. -- error compiling committ
Re: [kvm-devel] [BUG] Oops with KVM-27
Il Sat, Jun 16, 2007 at 10:43:23AM +0300, Avi Kivity ha scritto: > Luca Tettamanti wrote: > > Il Fri, Jun 15, 2007 at 12:06:50PM +0300, Avi Kivity ha scritto: > > > >>> After a bit of thinking: it's correct but removes an optimization; > >>> furthermore it may miss other instructions that write to memory mapped > >>> areas. > >>> A more proper fix should be "force the writeback if dst.ptr is in some > >>> kind of mmio area". > >>> > >>> > >> I think we can just disable the optimization, and (in a separate patch) > >> add it back in emulator_write_phys(), where we know we're modifying > >> memory and not hitting mmio. > >> > > > > Problem is that in emulator_write_phys() we've already lost track of the > > previous (dst.orig_val) value. I moved up the decision in > > > > Actually we haven't; just before the memcpy(), we can put a memcmp() to > guard the kvm_mmu_pte_write(), which is the really expensive operation, > especially with guest smp. Yup, but it seemed wasteful to map (at least when highmem is in use) a page just to check for something that we already knew. That was a preemptive optmization though, I haven't actually benchmarked the cost of setting up the mapping ;-) [...] > I think we can simply remove the if (). For the register case, the > check is more expensive that the write; for mmio, we don't want it; and > for memory writes, we can put it in emulator_write_phys(). Ok, this way it's simpler. How does this look: --- a/kernel/x86_emulate.c 2007-06-15 21:13:51.0 +0200 +++ b/kernel/x86_emulate.c 2007-06-17 16:57:50.0 +0200 @@ -1057,40 +1057,38 @@ } writeback: - if ((d & Mov) || (dst.orig_val != dst.val)) { - switch (dst.type) { - case OP_REG: - /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ - switch (dst.bytes) { - case 1: - *(u8 *)dst.ptr = (u8)dst.val; - break; - case 2: - *(u16 *)dst.ptr = (u16)dst.val; - break; - case 4: - *dst.ptr = (u32)dst.val; - break; /* 64b: zero-ext */ - case 8: - *dst.ptr = dst.val; - break; - } + switch (dst.type) { + case OP_REG: + /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ + switch (dst.bytes) { + case 1: + *(u8 *)dst.ptr = (u8)dst.val; break; - case OP_MEM: - if (lock_prefix) - rc = ops->cmpxchg_emulated((unsigned long)dst. - ptr, &dst.orig_val, - &dst.val, dst.bytes, - ctxt); - else - rc = ops->write_emulated((unsigned long)dst.ptr, -&dst.val, dst.bytes, -ctxt); - if (rc != 0) - goto done; - default: + case 2: + *(u16 *)dst.ptr = (u16)dst.val; + break; + case 4: + *dst.ptr = (u32)dst.val; + break; /* 64b: zero-ext */ + case 8: + *dst.ptr = dst.val; break; } + break; + case OP_MEM: + if (lock_prefix) + rc = ops->cmpxchg_emulated((unsigned long)dst. + ptr, &dst.orig_val, + &dst.val, dst.bytes, + ctxt); + else + rc = ops->write_emulated((unsigned long)dst.ptr, +&dst.val, dst.bytes, +ctxt); + if (rc != 0) + goto done; + default: + break; } /* Commit shadow register state. */ --- a/kernel/kvm_main.c 2007-06-15 21:18:08.0 +0200 +++ b/kernel/kvm_main.c 2007-06-17 16:59:33.0 +0200 @@ -1139,8 +1139,10 @@ return 0; mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); virt = kmap_atomic(page, KM_USER0); - kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes); - memcpy(virt + offset_in_page(gpa), val, bytes); + if (memcmp(virt + offset_i
Re: [kvm-devel] [BUG] Oops with KVM-27
Luca Tettamanti wrote: > Il Fri, Jun 15, 2007 at 12:06:50PM +0300, Avi Kivity ha scritto: > >>> After a bit of thinking: it's correct but removes an optimization; >>> furthermore it may miss other instructions that write to memory mapped >>> areas. >>> A more proper fix should be "force the writeback if dst.ptr is in some >>> kind of mmio area". >>> >>> >> I think we can just disable the optimization, and (in a separate patch) >> add it back in emulator_write_phys(), where we know we're modifying >> memory and not hitting mmio. >> > > Problem is that in emulator_write_phys() we've already lost track of the > previous (dst.orig_val) value. I moved up the decision in > Actually we haven't; just before the memcpy(), we can put a memcmp() to guard the kvm_mmu_pte_write(), which is the really expensive operation, especially with guest smp. > cmpxchg_emulated; unfortunately this means that the non-locked write > path (write_emulated) can't do this optimization, unless I change its > signature to include the old value. > > The first patch makes the writeback step uncoditional whenever we have a > destination operand (the mov check (d & Mov) may be superfluous, yes?). > The write-to-registry path still has the optimization that skips the > write if possible. > The mov check is in done since the destination is not read for moves; so the check for change would read uninitialized memory. I think we can simply remove the if (). For the register case, the check is more expensive that the write; for mmio, we don't want it; and for memory writes, we can put it in emulator_write_phys(). > Next one: I've splitted emulator_write_phys into emulator_write_phys_mem > (for normal RAM) and emulator_write_phys_mmio (for the rest). The > This is in order to properly emulate cmpxchg(), right? I don't think it's necessary since all that memory is write protected and the modifications happen under kvm->lock. > > I'm a bit confused about this test, found in emulator_write_phys > (original code): > > if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) > return 0; > > AFAICT is makes the emulator skip the write if the modified area spans > across two different (physical) pages. When this happens write_emulated > does a MMIO write. I'd expect the function to load the 2 pages and do the > memory write on both instead. > This isn't skipping the write; instead it uses the mmio path to update memory instead of the direct path, as I was too lazy to code split page writes. It's also wrong in case someone uses nonaligned writes to update page tables, but no-one does that. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
Il Fri, Jun 15, 2007 at 12:06:50PM +0300, Avi Kivity ha scritto: > > After a bit of thinking: it's correct but removes an optimization; > > furthermore it may miss other instructions that write to memory mapped > > areas. > > A more proper fix should be "force the writeback if dst.ptr is in some > > kind of mmio area". > > > > I think we can just disable the optimization, and (in a separate patch) > add it back in emulator_write_phys(), where we know we're modifying > memory and not hitting mmio. Problem is that in emulator_write_phys() we've already lost track of the previous (dst.orig_val) value. I moved up the decision in cmpxchg_emulated; unfortunately this means that the non-locked write path (write_emulated) can't do this optimization, unless I change its signature to include the old value. The first patch makes the writeback step uncoditional whenever we have a destination operand (the mov check (d & Mov) may be superfluous, yes?). The write-to-registry path still has the optimization that skips the write if possible. --- a/kernel/x86_emulate.c 2007-06-15 21:13:51.0 +0200 +++ b/kernel/x86_emulate.c 2007-06-15 22:12:15.0 +0200 @@ -1057,9 +1057,8 @@ } writeback: - if ((d & Mov) || (dst.orig_val != dst.val)) { - switch (dst.type) { - case OP_REG: + if ((d & Mov) || (d & DstMask) == DstMem || (d & DstMask) == DstReg ) { + if (dst.type == OP_REG && dst.orig_val != dst.val) { /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ switch (dst.bytes) { case 1: @@ -1075,8 +1074,10 @@ *dst.ptr = dst.val; break; } - break; - case OP_MEM: + } else if (dst.type == OP_MEM) { + /* Always dispatch the write, since it may hit a +* MMIO area. +*/ if (lock_prefix) rc = ops->cmpxchg_emulated((unsigned long)dst. ptr, &dst.orig_val, @@ -1088,8 +1089,6 @@ ctxt); if (rc != 0) goto done; - default: - break; } } Next one: I've splitted emulator_write_phys into emulator_write_phys_mem (for normal RAM) and emulator_write_phys_mmio (for the rest). The cmpxchg code path is roughly: if (!mapped) page_fault(); page = gfn_to_page(...) if (page) { if (!memcmp(old, new)) return X86EMUL_CONTINUE; write_mem(); retrun X86EMUL_CONTINUE; } /* for mmio always do the write */ write_mmio(); I'm a bit confused about this test, found in emulator_write_phys (original code): if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) return 0; AFAICT is makes the emulator skip the write if the modified area spans across two different (physical) pages. When this happens write_emulated does a MMIO write. I'd expect the function to load the 2 pages and do the memory write on both instead. I've preserved this logic in the following patch, but I don't understand why it's correct :| --- a/kernel/kvm_main.c 2007-06-15 21:18:08.0 +0200 +++ b/kernel/kvm_main.c 2007-06-15 23:34:13.0 +0200 @@ -1125,26 +1125,39 @@ } } -static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, - const void *val, int bytes) +static int emulator_write_phys_mem(struct kvm_vcpu *vcpu, gpa_t gpa, + struct page *page, const void *val, + int bytes) { - struct page *page; void *virt; - unsigned offset = offset_in_page(gpa); + unsigned int offset; + + offset = offset_in_page(gpa); if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) return 0; - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - if (!page) - return 0; + mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); virt = kmap_atomic(page, KM_USER0); kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes); memcpy(virt + offset_in_page(gpa), val, bytes); kunmap_atomic(virt, KM_USER0); + return 1; } +static int emulator_write_phys_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, + const void *val, int bytes) +{ + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 1; + memcpy(vcpu->mmio_data, val, bytes); + + return 0; +} + static int emulator_write_emulated(unsigned long addr, const void
Re: [kvm-devel] [BUG] Oops with KVM-27
Luca wrote: >> >> Got it! >> The emulator skips the writeback if the old value is unchanged, so the >> apic doesn't see the write. >> >> Forcing the writeback: >> >> - if ((d & Mov) || (dst.orig_val != dst.val)) { >> - if ((d & Mov) || (dst.orig_val != dst.val) || isxchg) { >> >> seems to fix the issue :D I'm not sure that fix is correct though. > Good detective work! > After a bit of thinking: it's correct but removes an optimization; > furthermore it may miss other instructions that write to memory mapped > areas. > A more proper fix should be "force the writeback if dst.ptr is in some > kind of mmio area". > I think we can just disable the optimization, and (in a separate patch) add it back in emulator_write_phys(), where we know we're modifying memory and not hitting mmio. Incidentally, in Xen, where this code originated from, this emulator is used only for page table updates (which are always to memory). A separate emulator is used for mmio. I guess the optimization targets the vm scanner doing test_and_clear_bit() type of operations against the page tables' dirty bits. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
On 6/15/07, Luca Tettamanti <[EMAIL PROTECTED]> wrote: > Il Fri, Jun 15, 2007 at 12:53:24AM +0200, Luca Tettamanti ha scritto: > > Il Thu, Jun 14, 2007 at 11:26:29AM +0300, Avi Kivity ha scritto: > > > Luca Tettamanti wrote: > > > >With GOOD_APIC apic_read_around is a nop, while apic_write_around is a > > > >normal write. With !GOOD_APIC apic_write_around writes to the APIC reg > > > >using xchg. With !GOOD_APIC and this patch: > > > > > > > >--- include/asm-i386/apic.h~ 2007-04-26 05:08:32.0 +0200 > > > >+++ include/asm-i386/apic.h2007-06-13 22:35:00.0 +0200 > > > >@@ -56,7 +56,8 @@ > > > > static __inline fastcall void native_apic_write_atomic(unsigned long > > > > reg, > > > > unsigned long v) > > > > { > > > >- xchg((volatile unsigned long *)(APIC_BASE+reg), v); > > > >+//xchg((volatile unsigned long *)(APIC_BASE+reg), v); > > > >+ *((volatile unsigned long *)(APIC_BASE+reg)) = v; > > > > } > > > > > > > > static __inline fastcall unsigned long native_apic_read(unsigned long > > > > reg) > > > > > > > >The kernel boots fine. > > > > > > > > > > Looking at the xchg emulation code, it seems fine, but clearly it > > > isn't. > > > > Btw, I've put a printk in x86_emulate.c, where it prepares the operands > > for the xchg operations: all the write_atomic are hitting this point, > > so the write is lost somewhere in cmpxchg_emulated->write_emulated. > > Got it! > The emulator skips the writeback if the old value is unchanged, so the > apic doesn't see the write. > > Forcing the writeback: > > - if ((d & Mov) || (dst.orig_val != dst.val)) { > - if ((d & Mov) || (dst.orig_val != dst.val) || isxchg) { > > seems to fix the issue :D I'm not sure that fix is correct though. After a bit of thinking: it's correct but removes an optimization; furthermore it may miss other instructions that write to memory mapped areas. A more proper fix should be "force the writeback if dst.ptr is in some kind of mmio area". Ok, enough of reply-to-self. I'll go to sleep... Luca - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
Il Fri, Jun 15, 2007 at 12:53:24AM +0200, Luca Tettamanti ha scritto: > Il Thu, Jun 14, 2007 at 11:26:29AM +0300, Avi Kivity ha scritto: > > Luca Tettamanti wrote: > > >With GOOD_APIC apic_read_around is a nop, while apic_write_around is a > > >normal write. With !GOOD_APIC apic_write_around writes to the APIC reg > > >using xchg. With !GOOD_APIC and this patch: > > > > > >--- include/asm-i386/apic.h~ 2007-04-26 05:08:32.0 +0200 > > >+++ include/asm-i386/apic.h2007-06-13 22:35:00.0 +0200 > > >@@ -56,7 +56,8 @@ > > > static __inline fastcall void native_apic_write_atomic(unsigned long reg, > > > unsigned long v) > > > { > > >- xchg((volatile unsigned long *)(APIC_BASE+reg), v); > > >+//xchg((volatile unsigned long *)(APIC_BASE+reg), v); > > >+ *((volatile unsigned long *)(APIC_BASE+reg)) = v; > > > } > > > > > > static __inline fastcall unsigned long native_apic_read(unsigned long reg) > > > > > >The kernel boots fine. > > > > > > > Looking at the xchg emulation code, it seems fine, but clearly it > > isn't. > > Btw, I've put a printk in x86_emulate.c, where it prepares the operands > for the xchg operations: all the write_atomic are hitting this point, > so the write is lost somewhere in cmpxchg_emulated->write_emulated. Got it! The emulator skips the writeback if the old value is unchanged, so the apic doesn't see the write. Forcing the writeback: - if ((d & Mov) || (dst.orig_val != dst.val)) { - if ((d & Mov) || (dst.orig_val != dst.val) || isxchg) { seems to fix the issue :D I'm not sure that fix is correct though. This is the patch I'm using: --- a/kernel/x86_emulate.c 2007-06-03 10:31:15.0 +0200 +++ b/kernel/x86_emulate.c 2007-06-15 01:12:12.0 +0200 @@ -486,6 +486,7 @@ unsigned long _regs[NR_VCPU_REGS]; unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags; unsigned long modrm_val = 0; + int isxchg = 0; memcpy(_regs, ctxt->vcpu->regs, sizeof _regs); @@ -912,6 +913,7 @@ break; case 0x86 ... 0x87: /* xchg */ /* Write back the register source. */ + isxchg = 1; switch (dst.bytes) { case 1: *(u8 *) src.ptr = (u8) dst.val; @@ -1056,7 +1058,7 @@ } writeback: - if ((d & Mov) || (dst.orig_val != dst.val)) { + if ((d & Mov) || (dst.orig_val != dst.val) || isxchg) { switch (dst.type) { case OP_REG: /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ Luca -- "Su cio` di cui non si puo` parlare e` bene tacere". Ludwig Wittgenstein - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
Il Thu, Jun 14, 2007 at 11:26:29AM +0300, Avi Kivity ha scritto: > Luca Tettamanti wrote: > >With GOOD_APIC apic_read_around is a nop, while apic_write_around is a > >normal write. With !GOOD_APIC apic_write_around writes to the APIC reg > >using xchg. With !GOOD_APIC and this patch: > > > >--- include/asm-i386/apic.h~ 2007-04-26 05:08:32.0 +0200 > >+++ include/asm-i386/apic.h 2007-06-13 22:35:00.0 +0200 > >@@ -56,7 +56,8 @@ > > static __inline fastcall void native_apic_write_atomic(unsigned long reg, > >unsigned long v) > > { > >-xchg((volatile unsigned long *)(APIC_BASE+reg), v); > >+// xchg((volatile unsigned long *)(APIC_BASE+reg), v); > >+*((volatile unsigned long *)(APIC_BASE+reg)) = v; > > } > > > > static __inline fastcall unsigned long native_apic_read(unsigned long reg) > > > >The kernel boots fine. > > > > Looking at the xchg emulation code, it seems fine, but clearly it > isn't. Btw, I've put a printk in x86_emulate.c, where it prepares the operands for the xchg operations: all the write_atomic are hitting this point, so the write is lost somewhere in cmpxchg_emulated->write_emulated. Luca -- Il dottore mi ha detto di smettere di fare cene intime per quattro. A meno che non ci siamo altre tre persone. - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
Il Thu, Jun 14, 2007 at 11:26:29AM +0300, Avi Kivity ha scritto: > Luca Tettamanti wrote: > >With GOOD_APIC apic_read_around is a nop, while apic_write_around is a > >normal write. With !GOOD_APIC apic_write_around writes to the APIC reg > >using xchg. With !GOOD_APIC and this patch: > > > >--- include/asm-i386/apic.h~ 2007-04-26 05:08:32.0 +0200 > >+++ include/asm-i386/apic.h 2007-06-13 22:35:00.0 +0200 > >@@ -56,7 +56,8 @@ > > static __inline fastcall void native_apic_write_atomic(unsigned long reg, > >unsigned long v) > > { > >-xchg((volatile unsigned long *)(APIC_BASE+reg), v); > >+// xchg((volatile unsigned long *)(APIC_BASE+reg), v); > >+*((volatile unsigned long *)(APIC_BASE+reg)) = v; > > } > > > > static __inline fastcall unsigned long native_apic_read(unsigned long reg) > > > >The kernel boots fine. > > > > Looking at the xchg emulation code, it seems fine, but clearly it > isn't. Can you add logging to the kernel apic driver and to the qemu > device emulation (qemu/hw/apic.c, apic_mem_readl()/apic_mem_writel()) > and compare the results? Done, but at this point I don't know what I'm looking at ;) I'm attaching logs for working and non working kernels. I've made apic_read_around() a nop in both cases since it doesn't influence the outcome; the only difference is that working kernel writes directly to the mapped APIC registers while the non-working one uses xchg. As expected kernel side logs show zero differences (except for timer calibration). QEMU is a different story. The most obvious difference is that in non-working case each write is preceded by a read at the same register (due to xchg). What is strange is that almost all the writes done using apic_write_atomic are not hitting QEMU (see qemu.diff). In the log there's only the read generated by the xchg. For example, during timer calibration the kernel is doing: write to 0x0b0 read from 0x390 but - in the non-working case - QEMU sees: read from 0x0b0 (probably generated by xchg) read from 0x390 but it doesn't see the write! The write is not always lost: apic_read: 0xd030 = 0x00050011 apic_write_atomic: 0xd370 = 0x000100fe ... apic_read: fee00030 = 00050011 apic_read: fee00370 = 0001 apic_write: fee00370 = 000100fe Luca -- > While we're on all of this, are we going to change "tained" to some > other less alarmist word? "screwed" -- Alexander Viro apic-logs.tar.bz2 Description: Binary data - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
Luca Tettamanti wrote: > With GOOD_APIC apic_read_around is a nop, while apic_write_around is a > normal write. With !GOOD_APIC apic_write_around writes to the APIC reg > using xchg. With !GOOD_APIC and this patch: > > --- include/asm-i386/apic.h~ 2007-04-26 05:08:32.0 +0200 > +++ include/asm-i386/apic.h 2007-06-13 22:35:00.0 +0200 > @@ -56,7 +56,8 @@ > static __inline fastcall void native_apic_write_atomic(unsigned long reg, > unsigned long v) > { > - xchg((volatile unsigned long *)(APIC_BASE+reg), v); > +// xchg((volatile unsigned long *)(APIC_BASE+reg), v); > + *((volatile unsigned long *)(APIC_BASE+reg)) = v; > } > > static __inline fastcall unsigned long native_apic_read(unsigned long reg) > > The kernel boots fine. > Looking at the xchg emulation code, it seems fine, but clearly it isn't. Can you add logging to the kernel apic driver and to the qemu device emulation (qemu/hw/apic.c, apic_mem_readl()/apic_mem_writel()) and compare the results? -- error compiling committee.c: too many arguments to function - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
Il Wed, Jun 13, 2007 at 11:59:25AM +0300, Avi Kivity ha scritto: > Luca Tettamanti wrote: > >Il Mon, Jun 11, 2007 at 10:44:45AM +0300, Avi Kivity ha scritto: > > > >>Luca wrote: > >> > I've managed to reproduce this on kvm-21 (it takes many boots for this > to happen, but it does eventually). > > >>>Hum, any clue on the cause? > >>> > >>From what I've seen, it's the new Linux clocksource code. > >> > >> > >>>Should I test older versions? > >>> > >>They're unlikely to be better. Instead, it would be best to see what > >>the guest is doing. > >> > > > >RCU is not working. Network initialization hangs because it happens to > >be the first RCU user. > >The guest is stuck waiting for RCU syncronization: > > > >[4.992207] [] synchronize_rcu+0x4e/0x80 > >[4.994379] [] wakeme_after_rcu+0x0/0x8 > >[4.996521] [] synchronize_net+0x64/0x8c > >[4.998678] [] inet_register_protosw+0xef/0x151 > >[5.000984] [] inet_init+0x1cd/0x498 > > > >wait_for_completion() in synchronize_rcu() calls schedule() and the > >completion is never signaled (wakeme_after_rcu is never called). > >The completion AFAICS would be signaled via rcu_process_callbacks(), > >which is called in tasklet context. > >Scheduler and completion are working fine since they're used in other > >part of the kernel without problems. > > > >To recap: > > > >i686 F7 kernel: always works. > > > >i586 F7 kernel: sometime hangs due to RCU problems. When it does work > >it's because the LAPIC is disabled on boot: > > > >Using local APIC timer interrupts. > >calibrating APIC timer ... > >... lapic delta = 25745109 > >... PM timer delta = 0 > >. delta 25745109 > >. mult: 1105912110 > >. calibration result: 4119217 > >. CPU clock speed is 8794.0417 MHz. > >. host bus clock speed is 4119.0217 MHz. > >... verify APIC timer > >... jiffies delta = 103 > >APIC timer disabled due to verification failure. > > > >When it doesn't work LAPIC passes the test: > > > >[1.304717] Using local APIC timer interrupts. > >[1.304719] calibrating APIC timer ... > >[1.718823] ... lapic delta = 25251444 > >[1.720582] ... PM timer delta = 0 > >[1.722219] . delta 25251444 > >[1.723827] . mult: 1084706136 > >[1.725470] . calibration result: 4040231 > >[1.727374] . CPU clock speed is 8625.0780 MHz. > >[1.729396] . host bus clock speed is 4040.0231 MHz. > >[1.731540] ... verify APIC timer > >[2.158342] ... jiffies delta = 102 > >[2.160035] ... jiffies result ok > > > >i586 F7 kernel, with 'nolapic': always works. > > > > Can you check which .config option causes it (a special type of > bisecting...)? > > This looks likely based on your findings: > > -CONFIG_X86_ALIGNMENT_16=y > +CONFIG_X86_GOOD_APIC=y > CONFIG_X86_INTEL_USERCOPY=y > +CONFIG_X86_USE_PPRO_CHECKSUM=y > +CONFIG_X86_TSC=y > > I expect it's not directly related to i586 vs i686. And the winner is... CONFIG_X86_GOOD_APIC ;-) I think that !GOOD_APIC is a workaround for "11AP erratum in Pentium Processor Specification Update" aka read-before-write bug. The config symbol is used in include/asm-i386/apic.h: #ifdef CONFIG_X86_GOOD_APIC # define FORCE_READ_AROUND_WRITE 0 # define apic_read_around(x) # define apic_write_around(x,y) apic_write((x),(y)) #else # define FORCE_READ_AROUND_WRITE 1 # define apic_read_around(x) apic_read(x) # define apic_write_around(x,y) apic_write_atomic((x),(y)) #endif With GOOD_APIC apic_read_around is a nop, while apic_write_around is a normal write. With !GOOD_APIC apic_write_around writes to the APIC reg using xchg. With !GOOD_APIC and this patch: --- include/asm-i386/apic.h~2007-04-26 05:08:32.0 +0200 +++ include/asm-i386/apic.h 2007-06-13 22:35:00.0 +0200 @@ -56,7 +56,8 @@ static __inline fastcall void native_apic_write_atomic(unsigned long reg, unsigned long v) { - xchg((volatile unsigned long *)(APIC_BASE+reg), v); +// xchg((volatile unsigned long *)(APIC_BASE+reg), v); + *((volatile unsigned long *)(APIC_BASE+reg)) = v; } static __inline fastcall unsigned long native_apic_read(unsigned long reg) The kernel boots fine. Luca -- "Sei l'unica donna della mia vita". (Adamo) - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
Luca Tettamanti wrote: > Il Mon, Jun 11, 2007 at 10:44:45AM +0300, Avi Kivity ha scritto: > >> Luca wrote: >> I've managed to reproduce this on kvm-21 (it takes many boots for this to happen, but it does eventually). >>> Hum, any clue on the cause? >>> >> From what I've seen, it's the new Linux clocksource code. >> >> >>> Should I test older versions? >>> >> They're unlikely to be better. Instead, it would be best to see what >> the guest is doing. >> > > RCU is not working. Network initialization hangs because it happens to > be the first RCU user. > The guest is stuck waiting for RCU syncronization: > > [4.992207] [] synchronize_rcu+0x4e/0x80 > [4.994379] [] wakeme_after_rcu+0x0/0x8 > [4.996521] [] synchronize_net+0x64/0x8c > [4.998678] [] inet_register_protosw+0xef/0x151 > [5.000984] [] inet_init+0x1cd/0x498 > > wait_for_completion() in synchronize_rcu() calls schedule() and the > completion is never signaled (wakeme_after_rcu is never called). > The completion AFAICS would be signaled via rcu_process_callbacks(), > which is called in tasklet context. > Scheduler and completion are working fine since they're used in other > part of the kernel without problems. > > To recap: > > i686 F7 kernel: always works. > > i586 F7 kernel: sometime hangs due to RCU problems. When it does work > it's because the LAPIC is disabled on boot: > > Using local APIC timer interrupts. > calibrating APIC timer ... > ... lapic delta = 25745109 > ... PM timer delta = 0 > . delta 25745109 > . mult: 1105912110 > . calibration result: 4119217 > . CPU clock speed is 8794.0417 MHz. > . host bus clock speed is 4119.0217 MHz. > ... verify APIC timer > ... jiffies delta = 103 > APIC timer disabled due to verification failure. > > When it doesn't work LAPIC passes the test: > > [1.304717] Using local APIC timer interrupts. > [1.304719] calibrating APIC timer ... > [1.718823] ... lapic delta = 25251444 > [1.720582] ... PM timer delta = 0 > [1.722219] . delta 25251444 > [1.723827] . mult: 1084706136 > [1.725470] . calibration result: 4040231 > [1.727374] . CPU clock speed is 8625.0780 MHz. > [1.729396] . host bus clock speed is 4040.0231 MHz. > [1.731540] ... verify APIC timer > [2.158342] ... jiffies delta = 102 > [2.160035] ... jiffies result ok > > i586 F7 kernel, with 'nolapic': always works. > Can you check which .config option causes it (a special type of bisecting...)? This looks likely based on your findings: -CONFIG_X86_ALIGNMENT_16=y +CONFIG_X86_GOOD_APIC=y CONFIG_X86_INTEL_USERCOPY=y +CONFIG_X86_USE_PPRO_CHECKSUM=y +CONFIG_X86_TSC=y I expect it's not directly related to i586 vs i686. -- error compiling committee.c: too many arguments to function - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
Il Mon, Jun 11, 2007 at 10:44:45AM +0300, Avi Kivity ha scritto: > Luca wrote: > >> > >>I've managed to reproduce this on kvm-21 (it takes many boots for this > >>to happen, but it does eventually). > > > >Hum, any clue on the cause? > > From what I've seen, it's the new Linux clocksource code. > > >Should I test older versions? > > They're unlikely to be better. Instead, it would be best to see what > the guest is doing. RCU is not working. Network initialization hangs because it happens to be the first RCU user. The guest is stuck waiting for RCU syncronization: [4.992207] [] synchronize_rcu+0x4e/0x80 [4.994379] [] wakeme_after_rcu+0x0/0x8 [4.996521] [] synchronize_net+0x64/0x8c [4.998678] [] inet_register_protosw+0xef/0x151 [5.000984] [] inet_init+0x1cd/0x498 wait_for_completion() in synchronize_rcu() calls schedule() and the completion is never signaled (wakeme_after_rcu is never called). The completion AFAICS would be signaled via rcu_process_callbacks(), which is called in tasklet context. Scheduler and completion are working fine since they're used in other part of the kernel without problems. To recap: i686 F7 kernel: always works. i586 F7 kernel: sometime hangs due to RCU problems. When it does work it's because the LAPIC is disabled on boot: Using local APIC timer interrupts. calibrating APIC timer ... ... lapic delta = 25745109 ... PM timer delta = 0 . delta 25745109 . mult: 1105912110 . calibration result: 4119217 . CPU clock speed is 8794.0417 MHz. . host bus clock speed is 4119.0217 MHz. ... verify APIC timer ... jiffies delta = 103 APIC timer disabled due to verification failure. When it doesn't work LAPIC passes the test: [1.304717] Using local APIC timer interrupts. [1.304719] calibrating APIC timer ... [1.718823] ... lapic delta = 25251444 [1.720582] ... PM timer delta = 0 [1.722219] . delta 25251444 [1.723827] . mult: 1084706136 [1.725470] . calibration result: 4040231 [1.727374] . CPU clock speed is 8625.0780 MHz. [1.729396] . host bus clock speed is 4040.0231 MHz. [1.731540] ... verify APIC timer [2.158342] ... jiffies delta = 102 [2.160035] ... jiffies result ok i586 F7 kernel, with 'nolapic': always works. Luca -- Sbagliare รจ umano, ma per incasinare davvero le cose serve un computer. - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
Luca wrote: > On 6/11/07, Avi Kivity <[EMAIL PROTECTED]> wrote: >> Luca wrote: >> >> >> >> I've managed to reproduce this on kvm-21 (it takes many boots for >> this >> >> to happen, but it does eventually). >> > >> > Hum, any clue on the cause? >> >> From what I've seen, it's the new Linux clocksource code. > > Actually I tried forcing the PIT (and any other combination of > tsc,acpi_pm,jiffies) as the clocksource, without success. > Well, there's lots of APIC stuff going on when it hangs. >> > Should I test older versions? >> >> They're unlikely to be better. Instead, it would be best to see what >> the guest is doing. >> >> I suggest downloading the source rpm for the kernel, building it, and >> sprinkling printk()s until we know exactly what source the guest is >> executing at the time of the hang. > > Ok, will do. Meanwhile I discovered that the kernel on the boot cd > (the one that hangs) is compiled for i586, while the one installed on > disk is for i686 (this one works). Ah. > > i686 has this options enabled: > > +CONFIG_X86_GOOD_APIC=y > +CONFIG_X86_USE_PPRO_CHECKSUM=y > +CONFIG_X86_TSC=y > > but disabling tsc on the command line doesn't make any difference. Is > it possible that KVM is choking on some instruction not used by the > i686 kernel? Unlikely, as then the hang would occur all the time instead of randomly. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
On 6/11/07, Avi Kivity <[EMAIL PROTECTED]> wrote: > Luca wrote: > >> > >> I've managed to reproduce this on kvm-21 (it takes many boots for this > >> to happen, but it does eventually). > > > > Hum, any clue on the cause? > > From what I've seen, it's the new Linux clocksource code. Actually I tried forcing the PIT (and any other combination of tsc,acpi_pm,jiffies) as the clocksource, without success. > > Should I test older versions? > > They're unlikely to be better. Instead, it would be best to see what > the guest is doing. > > I suggest downloading the source rpm for the kernel, building it, and > sprinkling printk()s until we know exactly what source the guest is > executing at the time of the hang. Ok, will do. Meanwhile I discovered that the kernel on the boot cd (the one that hangs) is compiled for i586, while the one installed on disk is for i686 (this one works). i686 has this options enabled: +CONFIG_X86_GOOD_APIC=y +CONFIG_X86_USE_PPRO_CHECKSUM=y +CONFIG_X86_TSC=y but disabling tsc on the command line doesn't make any difference. Is it possible that KVM is choking on some instruction not used by the i686 kernel? Luca - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
Luca wrote: >> >> I've managed to reproduce this on kvm-21 (it takes many boots for this >> to happen, but it does eventually). > > Hum, any clue on the cause? From what I've seen, it's the new Linux clocksource code. > Should I test older versions? They're unlikely to be better. Instead, it would be best to see what the guest is doing. I suggest downloading the source rpm for the kernel, building it, and sprinkling printk()s until we know exactly what source the guest is executing at the time of the hang. Use of the qemu -kernel, -append, and -serial can redirect the console to the command line so that it's easy to capture all debug output. -- error compiling committee.c: too many arguments to function - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
On 6/10/07, Avi Kivity <[EMAIL PROTECTED]> wrote: > Luca wrote: > > On 6/5/07, Avi Kivity <[EMAIL PROTECTED]> wrote: > >> Luca Tettamanti wrote: > >> > Il Mon, Jun 04, 2007 at 11:51:10PM +0300, Avi Kivity ha scritto: > >> > > >> >>> While doing repeated tests with the installer I ran into another > >> >>> (unrelated) problem. Sometimes the guest kernel hangs at boot at: > >> >>> > >> >>> NET: Registered protocol family 2 > >> >>> > >> >>> with any kind of networking options (except for -net none, which > >> works). > >> >>> With -no-kvm it boots with any networking option. > >> >>> > >> >>> > >> >>> > >> >> Can you try to pin the guest on a single core with taskset: > >> >> > >> >>taskset 1 qemu ... > >> >> > >> > > >> > Doesn't help. What works is 'nolapic', i.e. disabling the local > >> APIC on > >> > the guest kernel. > >> > I've also tried disabling TSC (notsc) and forcing PIT as the > >> clocksource > >> > (clocksouce=pit clock=pit); neither of them helped. > >> > > >> > > >> > >> Is this a regression relative to a previous kvm version? > > > > Hello, > > sorry for the delay, I was having troubles compiling older KVMs with a > > recent kernel... > > The last version that works is kvm-21; starting from kvm-22 the VM > > hangs during network initialization (now always, but pretty often). > > This only occurs when the guest is Fedora7 setup ISO. The regular boot > > (i.e. from the hd) seems unaffected. > > I've managed to reproduce this on kvm-21 (it takes many boots for this > to happen, but it does eventually). Hum, any clue on the cause? Should I test older versions? Luca - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
Luca wrote: > On 6/5/07, Avi Kivity <[EMAIL PROTECTED]> wrote: >> Luca Tettamanti wrote: >> > Il Mon, Jun 04, 2007 at 11:51:10PM +0300, Avi Kivity ha scritto: >> > >> >>> While doing repeated tests with the installer I ran into another >> >>> (unrelated) problem. Sometimes the guest kernel hangs at boot at: >> >>> >> >>> NET: Registered protocol family 2 >> >>> >> >>> with any kind of networking options (except for -net none, which >> works). >> >>> With -no-kvm it boots with any networking option. >> >>> >> >>> >> >>> >> >> Can you try to pin the guest on a single core with taskset: >> >> >> >>taskset 1 qemu ... >> >> >> > >> > Doesn't help. What works is 'nolapic', i.e. disabling the local >> APIC on >> > the guest kernel. >> > I've also tried disabling TSC (notsc) and forcing PIT as the >> clocksource >> > (clocksouce=pit clock=pit); neither of them helped. >> > >> > >> >> Is this a regression relative to a previous kvm version? > > Hello, > sorry for the delay, I was having troubles compiling older KVMs with a > recent kernel... > The last version that works is kvm-21; starting from kvm-22 the VM > hangs during network initialization (now always, but pretty often). > This only occurs when the guest is Fedora7 setup ISO. The regular boot > (i.e. from the hd) seems unaffected. I've managed to reproduce this on kvm-21 (it takes many boots for this to happen, but it does eventually). -- error compiling committee.c: too many arguments to function - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
On 6/5/07, Avi Kivity <[EMAIL PROTECTED]> wrote: > Luca Tettamanti wrote: > > Il Mon, Jun 04, 2007 at 11:51:10PM +0300, Avi Kivity ha scritto: > > > >>> While doing repeated tests with the installer I ran into another > >>> (unrelated) problem. Sometimes the guest kernel hangs at boot at: > >>> > >>> NET: Registered protocol family 2 > >>> > >>> with any kind of networking options (except for -net none, which works). > >>> With -no-kvm it boots with any networking option. > >>> > >>> > >>> > >> Can you try to pin the guest on a single core with taskset: > >> > >>taskset 1 qemu ... > >> > > > > Doesn't help. What works is 'nolapic', i.e. disabling the local APIC on > > the guest kernel. > > I've also tried disabling TSC (notsc) and forcing PIT as the clocksource > > (clocksouce=pit clock=pit); neither of them helped. > > > > > > Is this a regression relative to a previous kvm version? Hello, sorry for the delay, I was having troubles compiling older KVMs with a recent kernel... The last version that works is kvm-21; starting from kvm-22 the VM hangs during network initialization (now always, but pretty often). This only occurs when the guest is Fedora7 setup ISO. The regular boot (i.e. from the hd) seems unaffected. Luca - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
Luca Tettamanti wrote: > Il Mon, Jun 04, 2007 at 11:51:10PM +0300, Avi Kivity ha scritto: > >>> While doing repeated tests with the installer I ran into another >>> (unrelated) problem. Sometimes the guest kernel hangs at boot at: >>> >>> NET: Registered protocol family 2 >>> >>> with any kind of networking options (except for -net none, which works). >>> With -no-kvm it boots with any networking option. >>> >>> >>> >> Can you try to pin the guest on a single core with taskset: >> >>taskset 1 qemu ... >> > > Doesn't help. What works is 'nolapic', i.e. disabling the local APIC on > the guest kernel. > I've also tried disabling TSC (notsc) and forcing PIT as the clocksource > (clocksouce=pit clock=pit); neither of them helped. > > Is this a regression relative to a previous kvm version? -- error compiling committee.c: too many arguments to function - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
Il Mon, Jun 04, 2007 at 11:51:10PM +0300, Avi Kivity ha scritto: > > While doing repeated tests with the installer I ran into another > > (unrelated) problem. Sometimes the guest kernel hangs at boot at: > > > > NET: Registered protocol family 2 > > > > with any kind of networking options (except for -net none, which works). > > With -no-kvm it boots with any networking option. > > > > > > Can you try to pin the guest on a single core with taskset: > >taskset 1 qemu ... Doesn't help. What works is 'nolapic', i.e. disabling the local APIC on the guest kernel. I've also tried disabling TSC (notsc) and forcing PIT as the clocksource (clocksouce=pit clock=pit); neither of them helped. Luca -- Il dottore mi ha detto di smettere di fare cene intime per quattro. A meno che non ci siamo altre tre persone. - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
Luca Tettamanti wrote: > It turned out that it was somewhat reproducible with fedora installer. > With your patch it doesn't oops anymore. > > Ok, good. > While doing repeated tests with the installer I ran into another > (unrelated) problem. Sometimes the guest kernel hangs at boot at: > > NET: Registered protocol family 2 > > with any kind of networking options (except for -net none, which works). > With -no-kvm it boots with any networking option. > > Can you try to pin the guest on a single core with taskset: taskset 1 qemu ... ? -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. - This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ___ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel
Re: [kvm-devel] [BUG] Oops with KVM-27
Hi, Il Mon, Jun 04, 2007 at 12:35:37PM +0300, Avi Kivity ha scritto: > Luca Tettamanti wrote: > >Hello, > >my kernel just exploded :) > > > >The host is running 2.6-git-current, with KVM modules from KVM-27 > >package. kernel is 32bit, SMP, with PREEMPT enabled, no HIGHMEM (but I'm > >using CONFIG_VMSPLIT_3G_OPT=y). The CPU is a Core2 (hence I'm using > >kvm-intel). > >Guest was a Fedora7 setup DVD, which died somewhere during the > >installation (anaconda was already active at that point). Bad news is > >that I cannot reproduce the bug :| > > > Fortunately the trace clearly shows the problem (out of mmu working > memory on guest context switch). The attached patch should fix it. Let > me know if it works for you. It turned out that it was somewhat reproducible with fedora installer. With your patch it doesn't oops anymore. While doing repeated tests with the installer I ran into another (unrelated) problem. Sometimes the guest kernel hangs at boot at: NET: Registered protocol family 2 with any kind of networking options (except for -net none, which works). With -no-kvm it boots with any networking option. The only difference in dmesg is that when KVM is enable the guest uses the TSC: NetLabel: unlabeled traffic allowed by default -Time: tsc clocksource has been installed. PCI: Ignore bogus resource 6 [0:0] of :00:02.0 For reference this is the command line that I'm using: ./kvm-27/qemu/i386-softmmu/qemu -hda /home/kronos/tmp/fedora.img -cdrom /home/kronos/tmp/boot.iso -boot d -net tap -net nic -m 256 and boot.iso is the fedora7 net install image (you can find it on any mirror: fedora/linux/releases/7/Fedora/arch/os/images/boot.iso). The guest kernel doesn't respond to sysrq, so I don't known exactly where it's hanging. The stack trace on the host seems rather uninteresting: qemu S 0002 2404 18905 7312 (NOTLB) dca4db48 0086 0002 b0478900 eec4a0f0 b02f418b b0478900 000a eec4a0f0 ef31ca70 267db8c3 08c7 3ea3 eec4a1fc b1810980 efcc62a0 b0478900 b0129580 0292 dca4db58 0023935c Call Trace: [] _spin_unlock_irqrestore+0x34/0x58 [] __mod_timer+0x9d/0xa7 [] schedule_timeout+0x70/0x8d [] _spin_unlock_irqrestore+0x34/0x58 [] process_timeout+0x0/0x5 [] schedule_timeout+0x6b/0x8d [] do_select+0x399/0x3e7 [] __pollwait+0x0/0xac [] default_wake_function+0x0/0xc [] free_poll_entry+0xe/0x16 [] poll_freewait+0x18/0x4c [] do_sys_poll+0x302/0x327 [] __pollwait+0x0/0xac [] default_wake_function+0x0/0xc [] task_rq_lock+0x36/0x5d [] _spin_lock+0x33/0x3e [] _spin_unlock_irqrestore+0x40/0x58 [] try_to_wake_up+0x325/0x32f [] mark_held_locks+0x39/0x53 [] _spin_unlock_irqrestore+0x34/0x58 [] restore_nocheck+0x12/0x15 [] trace_hardirqs_on+0x11a/0x13d [] do_IRQ+0xc4/0xde [] restore_nocheck+0x12/0x15 [] core_sys_select+0x2ee/0x30f [] setup_sigcontext+0x105/0x189 [] _spin_unlock_irq+0x20/0x41 [] trace_hardirqs_on+0x11a/0x13d [] do_notify_resume+0x5d1/0x611 [] _spin_unlock_irq+0x2b/0x41 [] do_notify_resume+0x52f/0x611 [] restore_nocheck+0x12/0x15 [] convert_fxsr_from_user+0x26/0xe6 [] sys_select+0xa4/0x187 [] restore_nocheck+0x12/0x15 [] trace_hardirqs_on+0x11a/0x13d [] syscall_call+0x7/0xb === qemu S CF9E5DC0 2996 18911 7312 (NOTLB) cf9e5dd4 0082 0002 cf9e5dc0 cf9e5dbc b013b1ee cf9e5ea0 0007 0001 d252b4f0 b194c030 70a8bf29 08ac a554 d252b5fc b181a980 efcc62a0 00232330 0003 cf9e5ea0 efcc62d4 Call Trace: [] trace_hardirqs_on+0x11a/0x13d [] futex_wait+0x251/0x3ed [] hrtimer_wakeup+0x0/0x18 [] futex_wait+0x242/0x3ed [] default_wake_function+0x0/0xc [] do_futex+0x6c/0xaad [] sys_rt_sigqueueinfo+0x44/0x4e [] getnstimeofday+0x30/0xbe [] ktime_get_ts+0x16/0x44 [] sys_futex+0xc8/0xda [] syscall_call+0x7/0xb === I'm attaching the dmesg for both -kvm and -no-kvm cases. Luca -- "La teoria e` quando sappiamo come funzionano le cose ma non funzionano. La pratica e` quando le cose funzionano ma non sappiamo perche`. Abbiamo unito la teoria e la pratica: le cose non funzionano piu` e non sappiamo il perche`." -- A. Einstein Linux version 2.6.21-1.3194.fc7 ([EMAIL PROTECTED]) (gcc version 4.1.2 20070502 (Red Hat 4.1.2-12)) #1 SMP Wed May 23 22:11:19 EDT 2007 BIOS-provided physical RAM map: sanitize start sanitize end copy_e820_map() start: size: 0009fc00 end: 0009fc00 type: 1 copy_e820_map() type is E820_RAM copy_e820_map() start: 0009fc00 size: 0400 end: 000a type: 2 copy_e820_map() start: 000e8000 size: 00018000 end: 0010 type: 2 copy_e820_map() start: 0010 size: 0ff0 end: 1000 type: 1 copy_e820_map() type is E820_RAM copy_e820_map() start: fffc size: 0004 end: 0001 type: 2
Re: [kvm-devel] [BUG] Oops with KVM-27
Luca Tettamanti wrote: Hello, my kernel just exploded :) The host is running 2.6-git-current, with KVM modules from KVM-27 package. kernel is 32bit, SMP, with PREEMPT enabled, no HIGHMEM (but I'm using CONFIG_VMSPLIT_3G_OPT=y). The CPU is a Core2 (hence I'm using kvm-intel). Guest was a Fedora7 setup DVD, which died somewhere during the installation (anaconda was already active at that point). Bad news is that I cannot reproduce the bug :| Fortunately the trace clearly shows the problem (out of mmu working memory on guest context switch). The attached patch should fix it. Let me know if it works for you. -- error compiling committee.c: too many arguments to function diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 0632d0b..0fdd5a6 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -543,6 +543,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *old, const u8 *new, int bytes); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); +int kvm_mmu_load(struct kvm_vcpu *vcpu); +void kvm_mmu_unload(struct kvm_vcpu *vcpu); int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); @@ -554,6 +556,14 @@ static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, return vcpu->mmu.page_fault(vcpu, gva, error_code); } +static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) +{ + if (likely(vcpu->mmu.root_hpa != INVALID_PAGE)) + return 0; + + return kvm_mmu_load(vcpu); +} + static inline int is_long_mode(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 283df03..5915d7a 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -949,9 +949,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->free = nonpaging_free; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; - mmu_alloc_roots(vcpu); - ASSERT(VALID_PAGE(context->root_hpa)); - kvm_arch_ops->set_cr3(vcpu, context->root_hpa); + context->root_hpa = INVALID_PAGE; return 0; } @@ -965,11 +963,6 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu) { pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); mmu_free_roots(vcpu); - if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) - kvm_mmu_free_some_pages(vcpu); - mmu_alloc_roots(vcpu); - kvm_mmu_flush_tlb(vcpu); - kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); } static void inject_page_fault(struct kvm_vcpu *vcpu, @@ -1003,10 +996,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) context->free = paging_free; context->root_level = level; context->shadow_root_level = level; - mmu_alloc_roots(vcpu); - ASSERT(VALID_PAGE(context->root_hpa)); - kvm_arch_ops->set_cr3(vcpu, context->root_hpa | - (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); + context->root_hpa = INVALID_PAGE; return 0; } @@ -1025,10 +1015,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) context->free = paging_free; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; - mmu_alloc_roots(vcpu); - ASSERT(VALID_PAGE(context->root_hpa)); - kvm_arch_ops->set_cr3(vcpu, context->root_hpa | - (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); + context->root_hpa = INVALID_PAGE; return 0; } @@ -1042,7 +1029,6 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); - mmu_topup_memory_caches(vcpu); if (!is_paging(vcpu)) return nonpaging_init_context(vcpu); else if (is_long_mode(vcpu)) @@ -1064,16 +1050,31 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { + destroy_kvm_mmu(vcpu); + return init_kvm_mmu(vcpu); +} + +int kvm_mmu_load(struct kvm_vcpu *vcpu) +{ int r; - destroy_kvm_mmu(vcpu); - r = init_kvm_mmu(vcpu); - if (r < 0) - goto out; + spin_lock(&vcpu->kvm->lock); r = mmu_topup_memory_caches(vcpu); + if (r) + goto out; + mmu_alloc_roots(vcpu); + kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); + kvm_mmu_flush_tlb(vcpu); out: + spin_unlock(&vcpu->kvm->lock); return r; } +EXPORT_SYMBOL_GPL(kvm_mmu_load); + +void kvm_mmu_unload(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index b621403..ed33f59 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1482,6 +1482,10 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; again: + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + return r; + if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 09e3609..a499989 100644 --- a/drivers/kvm/vmx