Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-17 Thread Avi Kivity
Luca Tettamanti wrote:
>> Actually we haven't; just before the memcpy(), we can put a memcmp() to
>> guard the kvm_mmu_pte_write(), which is the really expensive operation,
>> especially with guest smp.
>> 
>
> Yup, but it seemed wasteful to map (at least when highmem is in use) a
> page just to check for something that we already knew. That was a
> preemptive optmization though, I haven't actually benchmarked the cost
> of setting up the mapping ;-)
>
>   

It's negligible compared to the vmexit cost and to the emulation  (which 
does a kmap_atomic() for every byte of the instruction; this can be 
easily optimized away).

In any case, I expect that performance sensitive uses will use x86_64, 
whereas i386 is mostly for desktops.


>> I think we can simply remove the if ().  For the register case, the
>> check is more expensive that the write; for mmio, we don't want it; and
>> for memory writes, we can put it in emulator_write_phys().
>> 
>
> Ok, this way it's simpler. How does this look:
>
> --- a/kernel/x86_emulate.c2007-06-15 21:13:51.0 +0200
> +++ b/kernel/x86_emulate.c2007-06-17 16:57:50.0 +0200
> @@ -1057,40 +1057,38 @@
>   }
>  
>  writeback:
> - if ((d & Mov) || (dst.orig_val != dst.val)) {
> - switch (dst.type) {
> - case OP_REG:
> - /* The 4-byte case *is* correct: in 64-bit mode we 
> zero-extend. */
> - switch (dst.bytes) {
> - case 1:
> - *(u8 *)dst.ptr = (u8)dst.val;
> - break;
> - case 2:
> - *(u16 *)dst.ptr = (u16)dst.val;
> - break;
> - case 4:
> - *dst.ptr = (u32)dst.val;
> - break;  /* 64b: zero-ext */
> - case 8:
> - *dst.ptr = dst.val;
> - break;
> - }
> + switch (dst.type) {
> + case OP_REG:
> + /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. 
> */
> + switch (dst.bytes) {
> + case 1:
> + *(u8 *)dst.ptr = (u8)dst.val;
>   break;
> - case OP_MEM:
> - if (lock_prefix)
> - rc = ops->cmpxchg_emulated((unsigned long)dst.
> -ptr, &dst.orig_val,
> -&dst.val, dst.bytes,
> -ctxt);
> - else
> - rc = ops->write_emulated((unsigned long)dst.ptr,
> -  &dst.val, dst.bytes,
> -  ctxt);
> - if (rc != 0)
> - goto done;
> - default:
> + case 2:
> + *(u16 *)dst.ptr = (u16)dst.val;
> + break;
> + case 4:
> + *dst.ptr = (u32)dst.val;
> + break;  /* 64b: zero-ext */
> + case 8:
> + *dst.ptr = dst.val;
>   break;
>   }
> + break;
> + case OP_MEM:
> + if (lock_prefix)
> + rc = ops->cmpxchg_emulated((unsigned long)dst.
> +ptr, &dst.orig_val,
> +&dst.val, dst.bytes,
> +ctxt);
> + else
> + rc = ops->write_emulated((unsigned long)dst.ptr,
> +  &dst.val, dst.bytes,
> +  ctxt);
> + if (rc != 0)
> + goto done;
> + default:
> + break;
>   }
>  
>   /* Commit shadow register state. */
>
> --- a/kernel/kvm_main.c   2007-06-15 21:18:08.0 +0200
> +++ b/kernel/kvm_main.c   2007-06-17 16:59:33.0 +0200
> @@ -1139,8 +1139,10 @@
>   return 0;
>   mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
>   virt = kmap_atomic(page, KM_USER0);
> - kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes);
> - memcpy(virt + offset_in_page(gpa), val, bytes);
> + if (memcmp(virt + offset_in_page(gpa), val, bytes)) {
> + kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes);
> + memcpy(virt + offset_in_page(gpa), val, bytes);
> + }
>   kunmap_atomic(virt, KM_USER0);
>   return 1;
>  }
>
>
>   

Excellent.  We win back a precious indentation level and fix a bug at 
the same time.  Please test, send me a changelog and a signoff and I'll 
commit it.


-- 
error compiling committ

Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-17 Thread Luca Tettamanti
Il Sat, Jun 16, 2007 at 10:43:23AM +0300, Avi Kivity ha scritto: 
> Luca Tettamanti wrote:
> > Il Fri, Jun 15, 2007 at 12:06:50PM +0300, Avi Kivity ha scritto: 
> >   
> >>> After a bit of thinking: it's correct but removes an optimization;
> >>> furthermore it may miss other instructions that write to memory mapped
> >>> areas.
> >>> A more proper fix should be "force the writeback if dst.ptr is in some
> >>> kind of mmio area".
> >>>
> >>>   
> >> I think we can just disable the optimization, and (in a separate patch)
> >> add it back in emulator_write_phys(), where we know we're modifying
> >> memory and not hitting mmio.
> >> 
> >
> > Problem is that in emulator_write_phys() we've already lost track of the
> > previous (dst.orig_val) value. I moved up the decision in
> >   
> 
> Actually we haven't; just before the memcpy(), we can put a memcmp() to
> guard the kvm_mmu_pte_write(), which is the really expensive operation,
> especially with guest smp.

Yup, but it seemed wasteful to map (at least when highmem is in use) a
page just to check for something that we already knew. That was a
preemptive optmization though, I haven't actually benchmarked the cost
of setting up the mapping ;-)

[...]
> I think we can simply remove the if ().  For the register case, the
> check is more expensive that the write; for mmio, we don't want it; and
> for memory writes, we can put it in emulator_write_phys().

Ok, this way it's simpler. How does this look:

--- a/kernel/x86_emulate.c  2007-06-15 21:13:51.0 +0200
+++ b/kernel/x86_emulate.c  2007-06-17 16:57:50.0 +0200
@@ -1057,40 +1057,38 @@
}
 
 writeback:
-   if ((d & Mov) || (dst.orig_val != dst.val)) {
-   switch (dst.type) {
-   case OP_REG:
-   /* The 4-byte case *is* correct: in 64-bit mode we 
zero-extend. */
-   switch (dst.bytes) {
-   case 1:
-   *(u8 *)dst.ptr = (u8)dst.val;
-   break;
-   case 2:
-   *(u16 *)dst.ptr = (u16)dst.val;
-   break;
-   case 4:
-   *dst.ptr = (u32)dst.val;
-   break;  /* 64b: zero-ext */
-   case 8:
-   *dst.ptr = dst.val;
-   break;
-   }
+   switch (dst.type) {
+   case OP_REG:
+   /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. 
*/
+   switch (dst.bytes) {
+   case 1:
+   *(u8 *)dst.ptr = (u8)dst.val;
break;
-   case OP_MEM:
-   if (lock_prefix)
-   rc = ops->cmpxchg_emulated((unsigned long)dst.
-  ptr, &dst.orig_val,
-  &dst.val, dst.bytes,
-  ctxt);
-   else
-   rc = ops->write_emulated((unsigned long)dst.ptr,
-&dst.val, dst.bytes,
-ctxt);
-   if (rc != 0)
-   goto done;
-   default:
+   case 2:
+   *(u16 *)dst.ptr = (u16)dst.val;
+   break;
+   case 4:
+   *dst.ptr = (u32)dst.val;
+   break;  /* 64b: zero-ext */
+   case 8:
+   *dst.ptr = dst.val;
break;
}
+   break;
+   case OP_MEM:
+   if (lock_prefix)
+   rc = ops->cmpxchg_emulated((unsigned long)dst.
+  ptr, &dst.orig_val,
+  &dst.val, dst.bytes,
+  ctxt);
+   else
+   rc = ops->write_emulated((unsigned long)dst.ptr,
+&dst.val, dst.bytes,
+ctxt);
+   if (rc != 0)
+   goto done;
+   default:
+   break;
}
 
/* Commit shadow register state. */

--- a/kernel/kvm_main.c 2007-06-15 21:18:08.0 +0200
+++ b/kernel/kvm_main.c 2007-06-17 16:59:33.0 +0200
@@ -1139,8 +1139,10 @@
return 0;
mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
virt = kmap_atomic(page, KM_USER0);
-   kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes);
-   memcpy(virt + offset_in_page(gpa), val, bytes);
+   if (memcmp(virt + offset_i

Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-16 Thread Avi Kivity
Luca Tettamanti wrote:
> Il Fri, Jun 15, 2007 at 12:06:50PM +0300, Avi Kivity ha scritto: 
>   
>>> After a bit of thinking: it's correct but removes an optimization;
>>> furthermore it may miss other instructions that write to memory mapped
>>> areas.
>>> A more proper fix should be "force the writeback if dst.ptr is in some
>>> kind of mmio area".
>>>
>>>   
>> I think we can just disable the optimization, and (in a separate patch)
>> add it back in emulator_write_phys(), where we know we're modifying
>> memory and not hitting mmio.
>> 
>
> Problem is that in emulator_write_phys() we've already lost track of the
> previous (dst.orig_val) value. I moved up the decision in
>   

Actually we haven't; just before the memcpy(), we can put a memcmp() to
guard the kvm_mmu_pte_write(), which is the really expensive operation,
especially with guest smp.

> cmpxchg_emulated; unfortunately this means that the non-locked write
> path (write_emulated) can't do this optimization, unless I change its
> signature to include the old value.
>
> The first patch makes the writeback step uncoditional whenever we have a
> destination operand (the mov check (d & Mov) may be superfluous, yes?).
> The write-to-registry path still has the optimization that skips the
> write if possible.
>   

The mov check is in done since the destination is not read for moves; so
the check for change would read uninitialized memory.

I think we can simply remove the if ().  For the register case, the
check is more expensive that the write; for mmio, we don't want it; and
for memory writes, we can put it in emulator_write_phys().

> Next one: I've splitted emulator_write_phys into emulator_write_phys_mem
> (for normal RAM) and emulator_write_phys_mmio (for the rest). The
>   

This is in order to properly emulate cmpxchg(), right?  I don't think
it's necessary since all that memory is write protected and the
modifications happen under kvm->lock.

>
> I'm a bit confused about this test, found in emulator_write_phys
> (original code):
>
> if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
> return 0;
>
> AFAICT is makes the emulator skip the write if the modified area spans
> across two different (physical) pages. When this happens write_emulated
> does a MMIO write. I'd expect the function to load the 2 pages and do the
> memory write on both instead.
>   

This isn't skipping the write; instead it uses the mmio path to update
memory instead of the direct path, as I was too lazy to code split page
writes.  It's also wrong in case someone uses nonaligned writes to
update page tables, but no-one does that.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-15 Thread Luca Tettamanti
Il Fri, Jun 15, 2007 at 12:06:50PM +0300, Avi Kivity ha scritto: 
> > After a bit of thinking: it's correct but removes an optimization;
> > furthermore it may miss other instructions that write to memory mapped
> > areas.
> > A more proper fix should be "force the writeback if dst.ptr is in some
> > kind of mmio area".
> >
> 
> I think we can just disable the optimization, and (in a separate patch)
> add it back in emulator_write_phys(), where we know we're modifying
> memory and not hitting mmio.

Problem is that in emulator_write_phys() we've already lost track of the
previous (dst.orig_val) value. I moved up the decision in
cmpxchg_emulated; unfortunately this means that the non-locked write
path (write_emulated) can't do this optimization, unless I change its
signature to include the old value.

The first patch makes the writeback step uncoditional whenever we have a
destination operand (the mov check (d & Mov) may be superfluous, yes?).
The write-to-registry path still has the optimization that skips the
write if possible.


--- a/kernel/x86_emulate.c  2007-06-15 21:13:51.0 +0200
+++ b/kernel/x86_emulate.c  2007-06-15 22:12:15.0 +0200
@@ -1057,9 +1057,8 @@
}
 
 writeback:
-   if ((d & Mov) || (dst.orig_val != dst.val)) {
-   switch (dst.type) {
-   case OP_REG:
+   if ((d & Mov) || (d & DstMask) == DstMem || (d & DstMask) == DstReg ) {
+   if (dst.type == OP_REG && dst.orig_val != dst.val) {
/* The 4-byte case *is* correct: in 64-bit mode we 
zero-extend. */
switch (dst.bytes) {
case 1:
@@ -1075,8 +1074,10 @@
*dst.ptr = dst.val;
break;
}
-   break;
-   case OP_MEM:
+   } else if (dst.type == OP_MEM) {
+   /* Always dispatch the write, since it may hit a
+* MMIO area.
+*/
if (lock_prefix)
rc = ops->cmpxchg_emulated((unsigned long)dst.
   ptr, &dst.orig_val,
@@ -1088,8 +1089,6 @@
 ctxt);
if (rc != 0)
goto done;
-   default:
-   break;
}
}
 

Next one: I've splitted emulator_write_phys into emulator_write_phys_mem
(for normal RAM) and emulator_write_phys_mmio (for the rest). The
cmpxchg code path is roughly:

if (!mapped)
page_fault();

page = gfn_to_page(...)
if (page) {
if (!memcmp(old, new))
return X86EMUL_CONTINUE;

write_mem();
retrun X86EMUL_CONTINUE;
}
/* for mmio always do the write */
write_mmio();

I'm a bit confused about this test, found in emulator_write_phys
(original code):

if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
return 0;

AFAICT is makes the emulator skip the write if the modified area spans
across two different (physical) pages. When this happens write_emulated
does a MMIO write. I'd expect the function to load the 2 pages and do the
memory write on both instead.
I've preserved this logic in the following patch, but I don't understand
why it's correct :|

--- a/kernel/kvm_main.c 2007-06-15 21:18:08.0 +0200
+++ b/kernel/kvm_main.c 2007-06-15 23:34:13.0 +0200
@@ -1125,26 +1125,39 @@
}
 }
 
-static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
-  const void *val, int bytes)
+static int emulator_write_phys_mem(struct kvm_vcpu *vcpu, gpa_t gpa,
+  struct page *page, const void *val,
+  int bytes)
 {
-   struct page *page;
void *virt;
-   unsigned offset = offset_in_page(gpa);
+   unsigned int offset;
+
+   offset = offset_in_page(gpa);
 
if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
return 0;
-   page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-   if (!page)
-   return 0;
+
mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
virt = kmap_atomic(page, KM_USER0);
kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes);
memcpy(virt + offset_in_page(gpa), val, bytes);
kunmap_atomic(virt, KM_USER0);
+
return 1;
 }
 
+static int emulator_write_phys_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+   const void *val, int bytes)
+{
+   vcpu->mmio_needed = 1;
+   vcpu->mmio_phys_addr = gpa;
+   vcpu->mmio_size = bytes;
+   vcpu->mmio_is_write = 1;
+   memcpy(vcpu->mmio_data, val, bytes);
+
+   return 0;
+}
+
 static int emulator_write_emulated(unsigned long addr,
   const void 

Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-15 Thread Avi Kivity
Luca wrote:
>>
>> Got it!
>> The emulator skips the writeback if the old value is unchanged, so the
>> apic doesn't see the write.
>>
>> Forcing the writeback:
>>
>> -  if ((d & Mov) || (dst.orig_val != dst.val)) {
>> -  if ((d & Mov) || (dst.orig_val != dst.val) || isxchg) {
>>
>> seems to fix the issue :D I'm not sure that fix is correct though.
>

Good detective work!


> After a bit of thinking: it's correct but removes an optimization;
> furthermore it may miss other instructions that write to memory mapped
> areas.
> A more proper fix should be "force the writeback if dst.ptr is in some
> kind of mmio area".
>

I think we can just disable the optimization, and (in a separate patch)
add it back in emulator_write_phys(), where we know we're modifying
memory and not hitting mmio.

Incidentally, in Xen, where this code originated from, this emulator is
used only for page table updates (which are always to memory).  A
separate emulator is used for mmio.  I guess the optimization targets
the vm scanner doing test_and_clear_bit() type of operations against the
page tables' dirty bits.


-- 
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-14 Thread Luca
On 6/15/07, Luca Tettamanti <[EMAIL PROTECTED]> wrote:
> Il Fri, Jun 15, 2007 at 12:53:24AM +0200, Luca Tettamanti ha scritto:
> > Il Thu, Jun 14, 2007 at 11:26:29AM +0300, Avi Kivity ha scritto:
> > > Luca Tettamanti wrote:
> > > >With GOOD_APIC apic_read_around is a nop, while apic_write_around is a
> > > >normal write. With !GOOD_APIC apic_write_around writes to the APIC reg
> > > >using xchg. With !GOOD_APIC and this patch:
> > > >
> > > >--- include/asm-i386/apic.h~   2007-04-26 05:08:32.0 +0200
> > > >+++ include/asm-i386/apic.h2007-06-13 22:35:00.0 +0200
> > > >@@ -56,7 +56,8 @@
> > > > static __inline fastcall void native_apic_write_atomic(unsigned long 
> > > > reg,
> > > >  unsigned long v)
> > > > {
> > > >-  xchg((volatile unsigned long *)(APIC_BASE+reg), v);
> > > >+//xchg((volatile unsigned long *)(APIC_BASE+reg), v);
> > > >+  *((volatile unsigned long *)(APIC_BASE+reg)) = v;
> > > > }
> > > >
> > > > static __inline fastcall unsigned long native_apic_read(unsigned long 
> > > > reg)
> > > >
> > > >The kernel boots fine.
> > > >
> > >
> > > Looking at the xchg emulation code, it seems fine, but clearly it
> > > isn't.
> >
> > Btw, I've put a printk in x86_emulate.c, where it prepares the operands
> > for the xchg operations: all the write_atomic are hitting this point,
> > so the write is lost somewhere in cmpxchg_emulated->write_emulated.
>
> Got it!
> The emulator skips the writeback if the old value is unchanged, so the
> apic doesn't see the write.
>
> Forcing the writeback:
>
> -  if ((d & Mov) || (dst.orig_val != dst.val)) {
> -  if ((d & Mov) || (dst.orig_val != dst.val) || isxchg) {
>
> seems to fix the issue :D I'm not sure that fix is correct though.

After a bit of thinking: it's correct but removes an optimization;
furthermore it may miss other instructions that write to memory mapped
areas.
A more proper fix should be "force the writeback if dst.ptr is in some
kind of mmio area".

Ok, enough of reply-to-self. I'll go to sleep...
Luca

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-14 Thread Luca Tettamanti
Il Fri, Jun 15, 2007 at 12:53:24AM +0200, Luca Tettamanti ha scritto: 
> Il Thu, Jun 14, 2007 at 11:26:29AM +0300, Avi Kivity ha scritto: 
> > Luca Tettamanti wrote:
> > >With GOOD_APIC apic_read_around is a nop, while apic_write_around is a
> > >normal write. With !GOOD_APIC apic_write_around writes to the APIC reg
> > >using xchg. With !GOOD_APIC and this patch:
> > >
> > >--- include/asm-i386/apic.h~   2007-04-26 05:08:32.0 +0200
> > >+++ include/asm-i386/apic.h2007-06-13 22:35:00.0 +0200
> > >@@ -56,7 +56,8 @@
> > > static __inline fastcall void native_apic_write_atomic(unsigned long reg,
> > >  unsigned long v)
> > > {
> > >-  xchg((volatile unsigned long *)(APIC_BASE+reg), v);
> > >+//xchg((volatile unsigned long *)(APIC_BASE+reg), v);
> > >+  *((volatile unsigned long *)(APIC_BASE+reg)) = v;
> > > }
> > > 
> > > static __inline fastcall unsigned long native_apic_read(unsigned long reg)
> > >
> > >The kernel boots fine. 
> > >  
> > 
> > Looking at the xchg emulation code, it seems fine, but clearly it 
> > isn't.
> 
> Btw, I've put a printk in x86_emulate.c, where it prepares the operands
> for the xchg operations: all the write_atomic are hitting this point,
> so the write is lost somewhere in cmpxchg_emulated->write_emulated.

Got it!
The emulator skips the writeback if the old value is unchanged, so the
apic doesn't see the write.

Forcing the writeback:

-  if ((d & Mov) || (dst.orig_val != dst.val)) {
-  if ((d & Mov) || (dst.orig_val != dst.val) || isxchg) {

seems to fix the issue :D I'm not sure that fix is correct though. This
is the patch I'm using:


--- a/kernel/x86_emulate.c  2007-06-03 10:31:15.0 +0200
+++ b/kernel/x86_emulate.c  2007-06-15 01:12:12.0 +0200
@@ -486,6 +486,7 @@
unsigned long _regs[NR_VCPU_REGS];
unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags;
unsigned long modrm_val = 0;
+   int isxchg = 0;
 
memcpy(_regs, ctxt->vcpu->regs, sizeof _regs);
 
@@ -912,6 +913,7 @@
break;
case 0x86 ... 0x87: /* xchg */
/* Write back the register source. */
+   isxchg = 1;
switch (dst.bytes) {
case 1:
*(u8 *) src.ptr = (u8) dst.val;
@@ -1056,7 +1058,7 @@
}
 
 writeback:
-   if ((d & Mov) || (dst.orig_val != dst.val)) {
+   if ((d & Mov) || (dst.orig_val != dst.val) || isxchg) {
switch (dst.type) {
case OP_REG:
/* The 4-byte case *is* correct: in 64-bit mode we 
zero-extend. */

Luca
-- 
"Su cio` di cui non si puo` parlare e` bene tacere".
 Ludwig Wittgenstein

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-14 Thread Luca Tettamanti
Il Thu, Jun 14, 2007 at 11:26:29AM +0300, Avi Kivity ha scritto: 
> Luca Tettamanti wrote:
> >With GOOD_APIC apic_read_around is a nop, while apic_write_around is a
> >normal write. With !GOOD_APIC apic_write_around writes to the APIC reg
> >using xchg. With !GOOD_APIC and this patch:
> >
> >--- include/asm-i386/apic.h~ 2007-04-26 05:08:32.0 +0200
> >+++ include/asm-i386/apic.h  2007-06-13 22:35:00.0 +0200
> >@@ -56,7 +56,8 @@
> > static __inline fastcall void native_apic_write_atomic(unsigned long reg,
> >unsigned long v)
> > {
> >-xchg((volatile unsigned long *)(APIC_BASE+reg), v);
> >+//  xchg((volatile unsigned long *)(APIC_BASE+reg), v);
> >+*((volatile unsigned long *)(APIC_BASE+reg)) = v;
> > }
> > 
> > static __inline fastcall unsigned long native_apic_read(unsigned long reg)
> >
> >The kernel boots fine. 
> >  
> 
> Looking at the xchg emulation code, it seems fine, but clearly it 
> isn't.

Btw, I've put a printk in x86_emulate.c, where it prepares the operands
for the xchg operations: all the write_atomic are hitting this point,
so the write is lost somewhere in cmpxchg_emulated->write_emulated.

Luca
-- 
Il dottore mi ha detto di smettere di fare cene intime per quattro.
A meno che non ci siamo altre tre persone.

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-14 Thread Luca Tettamanti
Il Thu, Jun 14, 2007 at 11:26:29AM +0300, Avi Kivity ha scritto: 
> Luca Tettamanti wrote:
> >With GOOD_APIC apic_read_around is a nop, while apic_write_around is a
> >normal write. With !GOOD_APIC apic_write_around writes to the APIC reg
> >using xchg. With !GOOD_APIC and this patch:
> >
> >--- include/asm-i386/apic.h~ 2007-04-26 05:08:32.0 +0200
> >+++ include/asm-i386/apic.h  2007-06-13 22:35:00.0 +0200
> >@@ -56,7 +56,8 @@
> > static __inline fastcall void native_apic_write_atomic(unsigned long reg,
> >unsigned long v)
> > {
> >-xchg((volatile unsigned long *)(APIC_BASE+reg), v);
> >+//  xchg((volatile unsigned long *)(APIC_BASE+reg), v);
> >+*((volatile unsigned long *)(APIC_BASE+reg)) = v;
> > }
> > 
> > static __inline fastcall unsigned long native_apic_read(unsigned long reg)
> >
> >The kernel boots fine. 
> >  
> 
> Looking at the xchg emulation code, it seems fine, but clearly it 
> isn't.  Can you add logging to the kernel apic driver and to the qemu 
> device emulation (qemu/hw/apic.c, apic_mem_readl()/apic_mem_writel()) 
> and compare the results?

Done, but at this point I don't know what I'm looking at ;)

I'm attaching logs for working and non working kernels. I've made
apic_read_around() a nop in both cases since it doesn't influence the
outcome; the only difference is that working kernel writes directly to
the mapped APIC registers while the non-working one uses xchg.

As expected kernel side logs show zero differences (except for timer
calibration).

QEMU is a different story. The most obvious difference is that in
non-working case each write is preceded by a read at the same register
(due to xchg).

What is strange is that almost all the writes done using
apic_write_atomic are not hitting QEMU (see qemu.diff). In the log
there's only the read generated by the xchg.

For example, during timer calibration the kernel is doing:

write to 0x0b0
read from 0x390

but - in the non-working case - QEMU sees:

read from 0x0b0 (probably generated by xchg)
read from 0x390

but it doesn't see the write!

The write is not always lost:

apic_read: 0xd030 = 0x00050011
apic_write_atomic: 0xd370 = 0x000100fe
...
apic_read: fee00030 = 00050011
apic_read: fee00370 = 0001
apic_write: fee00370 = 000100fe

Luca
-- 
> While we're on all of this, are we going to change "tained" to some
> other less alarmist word?
"screwed" -- Alexander Viro


apic-logs.tar.bz2
Description: Binary data
-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-14 Thread Avi Kivity
Luca Tettamanti wrote:
> With GOOD_APIC apic_read_around is a nop, while apic_write_around is a
> normal write. With !GOOD_APIC apic_write_around writes to the APIC reg
> using xchg. With !GOOD_APIC and this patch:
>
> --- include/asm-i386/apic.h~  2007-04-26 05:08:32.0 +0200
> +++ include/asm-i386/apic.h   2007-06-13 22:35:00.0 +0200
> @@ -56,7 +56,8 @@
>  static __inline fastcall void native_apic_write_atomic(unsigned long reg,
>  unsigned long v)
>  {
> - xchg((volatile unsigned long *)(APIC_BASE+reg), v);
> +//   xchg((volatile unsigned long *)(APIC_BASE+reg), v);
> + *((volatile unsigned long *)(APIC_BASE+reg)) = v;
>  }
>  
>  static __inline fastcall unsigned long native_apic_read(unsigned long reg)
>
> The kernel boots fine. 
>   

Looking at the xchg emulation code, it seems fine, but clearly it 
isn't.  Can you add logging to the kernel apic driver and to the qemu 
device emulation (qemu/hw/apic.c, apic_mem_readl()/apic_mem_writel()) 
and compare the results?

-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-13 Thread Luca Tettamanti
Il Wed, Jun 13, 2007 at 11:59:25AM +0300, Avi Kivity ha scritto: 
> Luca Tettamanti wrote:
> >Il Mon, Jun 11, 2007 at 10:44:45AM +0300, Avi Kivity ha scritto: 
> >  
> >>Luca wrote:
> >>
> I've managed to reproduce this on kvm-21 (it takes many boots for this
> to happen, but it does eventually).
> 
> >>>Hum, any clue on the cause? 
> >>>  
> >>From what I've seen, it's the new Linux clocksource code.
> >>
> >>
> >>>Should I test older versions?
> >>>  
> >>They're unlikely to be better.  Instead, it would be best to see what 
> >>the guest is doing.
> >>
> >
> >RCU is not working. Network initialization hangs because it happens to
> >be the first RCU user.
> >The guest is stuck waiting for RCU syncronization:
> >
> >[4.992207]  [] synchronize_rcu+0x4e/0x80
> >[4.994379]  [] wakeme_after_rcu+0x0/0x8
> >[4.996521]  [] synchronize_net+0x64/0x8c
> >[4.998678]  [] inet_register_protosw+0xef/0x151
> >[5.000984]  [] inet_init+0x1cd/0x498
> >
> >wait_for_completion() in synchronize_rcu() calls schedule() and the
> >completion is never signaled (wakeme_after_rcu is never called).
> >The completion AFAICS would be signaled via rcu_process_callbacks(),
> >which is called in tasklet context.
> >Scheduler and completion are working fine since they're used in other
> >part of the kernel without problems.
> >
> >To recap:
> >
> >i686 F7 kernel: always works.
> >
> >i586 F7 kernel: sometime hangs due to RCU problems. When it does work
> >it's because the LAPIC is disabled on boot:
> >
> >Using local APIC timer interrupts.
> >calibrating APIC timer ...
> >... lapic delta = 25745109
> >... PM timer delta = 0
> >. delta 25745109
> >. mult: 1105912110
> >. calibration result: 4119217
> >. CPU clock speed is 8794.0417 MHz.
> >. host bus clock speed is 4119.0217 MHz.
> >... verify APIC timer
> >... jiffies delta = 103
> >APIC timer disabled due to verification failure.
> >
> >When it doesn't work LAPIC passes the test:
> >
> >[1.304717] Using local APIC timer interrupts.
> >[1.304719] calibrating APIC timer ...
> >[1.718823] ... lapic delta = 25251444
> >[1.720582] ... PM timer delta = 0
> >[1.722219] . delta 25251444
> >[1.723827] . mult: 1084706136
> >[1.725470] . calibration result: 4040231
> >[1.727374] . CPU clock speed is 8625.0780 MHz.
> >[1.729396] . host bus clock speed is 4040.0231 MHz.
> >[1.731540] ... verify APIC timer
> >[2.158342] ... jiffies delta = 102
> >[2.160035] ... jiffies result ok
> >
> >i586 F7 kernel, with 'nolapic': always works.
> >  
> 
> Can you check which .config option causes it (a special type of 
> bisecting...)?
> 
> This looks likely based on your findings:
> 
> -CONFIG_X86_ALIGNMENT_16=y
> +CONFIG_X86_GOOD_APIC=y
> CONFIG_X86_INTEL_USERCOPY=y
> +CONFIG_X86_USE_PPRO_CHECKSUM=y
> +CONFIG_X86_TSC=y
> 
> I expect it's not directly related to i586 vs i686.

And the winner is... CONFIG_X86_GOOD_APIC ;-)

I think that !GOOD_APIC is a workaround for "11AP erratum in Pentium
Processor Specification Update" aka read-before-write bug.

The config symbol is used in include/asm-i386/apic.h:

#ifdef CONFIG_X86_GOOD_APIC
# define FORCE_READ_AROUND_WRITE 0
# define apic_read_around(x)
# define apic_write_around(x,y) apic_write((x),(y))
#else
# define FORCE_READ_AROUND_WRITE 1
# define apic_read_around(x) apic_read(x)
# define apic_write_around(x,y) apic_write_atomic((x),(y))
#endif

With GOOD_APIC apic_read_around is a nop, while apic_write_around is a
normal write. With !GOOD_APIC apic_write_around writes to the APIC reg
using xchg. With !GOOD_APIC and this patch:

--- include/asm-i386/apic.h~2007-04-26 05:08:32.0 +0200
+++ include/asm-i386/apic.h 2007-06-13 22:35:00.0 +0200
@@ -56,7 +56,8 @@
 static __inline fastcall void native_apic_write_atomic(unsigned long reg,
   unsigned long v)
 {
-   xchg((volatile unsigned long *)(APIC_BASE+reg), v);
+// xchg((volatile unsigned long *)(APIC_BASE+reg), v);
+   *((volatile unsigned long *)(APIC_BASE+reg)) = v;
 }
 
 static __inline fastcall unsigned long native_apic_read(unsigned long reg)

The kernel boots fine. 

Luca
-- 
"Sei l'unica donna della mia vita".
(Adamo)

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-13 Thread Avi Kivity
Luca Tettamanti wrote:
> Il Mon, Jun 11, 2007 at 10:44:45AM +0300, Avi Kivity ha scritto: 
>   
>> Luca wrote:
>> 
 I've managed to reproduce this on kvm-21 (it takes many boots for this
 to happen, but it does eventually).
 
>>> Hum, any clue on the cause? 
>>>   
>> From what I've seen, it's the new Linux clocksource code.
>>
>> 
>>> Should I test older versions?
>>>   
>> They're unlikely to be better.  Instead, it would be best to see what 
>> the guest is doing.
>> 
>
> RCU is not working. Network initialization hangs because it happens to
> be the first RCU user.
> The guest is stuck waiting for RCU syncronization:
>
> [4.992207]  [] synchronize_rcu+0x4e/0x80
> [4.994379]  [] wakeme_after_rcu+0x0/0x8
> [4.996521]  [] synchronize_net+0x64/0x8c
> [4.998678]  [] inet_register_protosw+0xef/0x151
> [5.000984]  [] inet_init+0x1cd/0x498
>
> wait_for_completion() in synchronize_rcu() calls schedule() and the
> completion is never signaled (wakeme_after_rcu is never called).
> The completion AFAICS would be signaled via rcu_process_callbacks(),
> which is called in tasklet context.
> Scheduler and completion are working fine since they're used in other
> part of the kernel without problems.
>
> To recap:
>
> i686 F7 kernel: always works.
>
> i586 F7 kernel: sometime hangs due to RCU problems. When it does work
> it's because the LAPIC is disabled on boot:
>
> Using local APIC timer interrupts.
> calibrating APIC timer ...
> ... lapic delta = 25745109
> ... PM timer delta = 0
> . delta 25745109
> . mult: 1105912110
> . calibration result: 4119217
> . CPU clock speed is 8794.0417 MHz.
> . host bus clock speed is 4119.0217 MHz.
> ... verify APIC timer
> ... jiffies delta = 103
> APIC timer disabled due to verification failure.
>
> When it doesn't work LAPIC passes the test:
>
> [1.304717] Using local APIC timer interrupts.
> [1.304719] calibrating APIC timer ...
> [1.718823] ... lapic delta = 25251444
> [1.720582] ... PM timer delta = 0
> [1.722219] . delta 25251444
> [1.723827] . mult: 1084706136
> [1.725470] . calibration result: 4040231
> [1.727374] . CPU clock speed is 8625.0780 MHz.
> [1.729396] . host bus clock speed is 4040.0231 MHz.
> [1.731540] ... verify APIC timer
> [2.158342] ... jiffies delta = 102
> [2.160035] ... jiffies result ok
>
> i586 F7 kernel, with 'nolapic': always works.
>   

Can you check which .config option causes it (a special type of 
bisecting...)?

This looks likely based on your findings:

-CONFIG_X86_ALIGNMENT_16=y
+CONFIG_X86_GOOD_APIC=y
 CONFIG_X86_INTEL_USERCOPY=y
+CONFIG_X86_USE_PPRO_CHECKSUM=y
+CONFIG_X86_TSC=y

I expect it's not directly related to i586 vs i686.


-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-12 Thread Luca Tettamanti
Il Mon, Jun 11, 2007 at 10:44:45AM +0300, Avi Kivity ha scritto: 
> Luca wrote:
> >>
> >>I've managed to reproduce this on kvm-21 (it takes many boots for this
> >>to happen, but it does eventually).
> >
> >Hum, any clue on the cause? 
> 
> From what I've seen, it's the new Linux clocksource code.
> 
> >Should I test older versions?
> 
> They're unlikely to be better.  Instead, it would be best to see what 
> the guest is doing.

RCU is not working. Network initialization hangs because it happens to
be the first RCU user.
The guest is stuck waiting for RCU syncronization:

[4.992207]  [] synchronize_rcu+0x4e/0x80
[4.994379]  [] wakeme_after_rcu+0x0/0x8
[4.996521]  [] synchronize_net+0x64/0x8c
[4.998678]  [] inet_register_protosw+0xef/0x151
[5.000984]  [] inet_init+0x1cd/0x498

wait_for_completion() in synchronize_rcu() calls schedule() and the
completion is never signaled (wakeme_after_rcu is never called).
The completion AFAICS would be signaled via rcu_process_callbacks(),
which is called in tasklet context.
Scheduler and completion are working fine since they're used in other
part of the kernel without problems.

To recap:

i686 F7 kernel: always works.

i586 F7 kernel: sometime hangs due to RCU problems. When it does work
it's because the LAPIC is disabled on boot:

Using local APIC timer interrupts.
calibrating APIC timer ...
... lapic delta = 25745109
... PM timer delta = 0
. delta 25745109
. mult: 1105912110
. calibration result: 4119217
. CPU clock speed is 8794.0417 MHz.
. host bus clock speed is 4119.0217 MHz.
... verify APIC timer
... jiffies delta = 103
APIC timer disabled due to verification failure.

When it doesn't work LAPIC passes the test:

[1.304717] Using local APIC timer interrupts.
[1.304719] calibrating APIC timer ...
[1.718823] ... lapic delta = 25251444
[1.720582] ... PM timer delta = 0
[1.722219] . delta 25251444
[1.723827] . mult: 1084706136
[1.725470] . calibration result: 4040231
[1.727374] . CPU clock speed is 8625.0780 MHz.
[1.729396] . host bus clock speed is 4040.0231 MHz.
[1.731540] ... verify APIC timer
[2.158342] ... jiffies delta = 102
[2.160035] ... jiffies result ok

i586 F7 kernel, with 'nolapic': always works.

Luca
-- 
Sbagliare รจ umano, ma per incasinare davvero le cose serve un computer.

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-12 Thread Avi Kivity
Luca wrote:
> On 6/11/07, Avi Kivity <[EMAIL PROTECTED]> wrote:
>> Luca wrote:
>> >>
>> >> I've managed to reproduce this on kvm-21 (it takes many boots for
>> this
>> >> to happen, but it does eventually).
>> >
>> > Hum, any clue on the cause?
>>
>>  From what I've seen, it's the new Linux clocksource code.
>
> Actually I tried forcing the PIT (and any other combination of
> tsc,acpi_pm,jiffies) as the clocksource, without success.
>

Well, there's lots of APIC stuff going on when it hangs.

>> > Should I test older versions?
>>
>> They're unlikely to be better.  Instead, it would be best to see what
>> the guest is doing.
>>
>> I suggest downloading the source rpm for the kernel, building it, and
>> sprinkling printk()s until we know exactly what source the guest is
>> executing at the time of the hang.
>
> Ok, will do. Meanwhile I discovered that the kernel on the boot cd
> (the one that hangs) is compiled for i586, while the one installed on
> disk is for i686 (this one works).

Ah.

>
> i686 has this options enabled:
>
> +CONFIG_X86_GOOD_APIC=y
> +CONFIG_X86_USE_PPRO_CHECKSUM=y
> +CONFIG_X86_TSC=y
>
> but disabling tsc on the command line doesn't make any difference. Is
> it possible that KVM is choking on some instruction not used by the
> i686 kernel?

Unlikely, as then the hang would occur all the time instead of randomly.


-- 
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-12 Thread Luca
On 6/11/07, Avi Kivity <[EMAIL PROTECTED]> wrote:
> Luca wrote:
> >>
> >> I've managed to reproduce this on kvm-21 (it takes many boots for this
> >> to happen, but it does eventually).
> >
> > Hum, any clue on the cause?
>
>  From what I've seen, it's the new Linux clocksource code.

Actually I tried forcing the PIT (and any other combination of
tsc,acpi_pm,jiffies) as the clocksource, without success.

> > Should I test older versions?
>
> They're unlikely to be better.  Instead, it would be best to see what
> the guest is doing.
>
> I suggest downloading the source rpm for the kernel, building it, and
> sprinkling printk()s until we know exactly what source the guest is
> executing at the time of the hang.

Ok, will do. Meanwhile I discovered that the kernel on the boot cd
(the one that hangs) is compiled for i586, while the one installed on
disk is for i686 (this one works).

i686 has this options enabled:

+CONFIG_X86_GOOD_APIC=y
+CONFIG_X86_USE_PPRO_CHECKSUM=y
+CONFIG_X86_TSC=y

but disabling tsc on the command line doesn't make any difference. Is
it possible that KVM is choking on some instruction not used by the
i686 kernel?

Luca

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-12 Thread Avi Kivity
Luca wrote:
>>
>> I've managed to reproduce this on kvm-21 (it takes many boots for this
>> to happen, but it does eventually).
>
> Hum, any clue on the cause? 

 From what I've seen, it's the new Linux clocksource code.

> Should I test older versions?

They're unlikely to be better.  Instead, it would be best to see what 
the guest is doing.

I suggest downloading the source rpm for the kernel, building it, and 
sprinkling printk()s until we know exactly what source the guest is 
executing at the time of the hang.

Use of the qemu -kernel, -append, and -serial can redirect the console 
to the command line so that it's easy to capture all debug output.

-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-12 Thread Luca
On 6/10/07, Avi Kivity <[EMAIL PROTECTED]> wrote:
> Luca wrote:
> > On 6/5/07, Avi Kivity <[EMAIL PROTECTED]> wrote:
> >> Luca Tettamanti wrote:
> >> > Il Mon, Jun 04, 2007 at 11:51:10PM +0300, Avi Kivity ha scritto:
> >> >
> >> >>> While doing repeated tests with the installer I ran into another
> >> >>> (unrelated) problem. Sometimes the guest kernel hangs at boot at:
> >> >>>
> >> >>> NET: Registered protocol family 2
> >> >>>
> >> >>> with any kind of networking options (except for -net none, which
> >> works).
> >> >>> With -no-kvm it boots with any networking option.
> >> >>>
> >> >>>
> >> >>>
> >> >> Can you try to pin the guest on a single core with taskset:
> >> >>
> >> >>taskset 1 qemu ...
> >> >>
> >> >
> >> > Doesn't help. What works is 'nolapic', i.e. disabling the local
> >> APIC on
> >> > the guest kernel.
> >> > I've also tried disabling TSC (notsc) and forcing PIT as the
> >> clocksource
> >> > (clocksouce=pit clock=pit); neither of them helped.
> >> >
> >> >
> >>
> >> Is this a regression relative to a previous kvm version?
> >
> > Hello,
> > sorry for the delay, I was having troubles compiling older KVMs with a
> > recent kernel...
> > The last version that works is kvm-21; starting from kvm-22 the VM
> > hangs during network initialization (now always, but pretty often).
> > This only occurs when the guest is Fedora7 setup ISO. The regular boot
> > (i.e. from the hd) seems unaffected.
>
> I've managed to reproduce this on kvm-21 (it takes many boots for this
> to happen, but it does eventually).

Hum, any clue on the cause? Should I test older versions?

Luca

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-12 Thread Avi Kivity
Luca wrote:
> On 6/5/07, Avi Kivity <[EMAIL PROTECTED]> wrote:
>> Luca Tettamanti wrote:
>> > Il Mon, Jun 04, 2007 at 11:51:10PM +0300, Avi Kivity ha scritto:
>> >
>> >>> While doing repeated tests with the installer I ran into another
>> >>> (unrelated) problem. Sometimes the guest kernel hangs at boot at:
>> >>>
>> >>> NET: Registered protocol family 2
>> >>>
>> >>> with any kind of networking options (except for -net none, which 
>> works).
>> >>> With -no-kvm it boots with any networking option.
>> >>>
>> >>>
>> >>>
>> >> Can you try to pin the guest on a single core with taskset:
>> >>
>> >>taskset 1 qemu ...
>> >>
>> >
>> > Doesn't help. What works is 'nolapic', i.e. disabling the local 
>> APIC on
>> > the guest kernel.
>> > I've also tried disabling TSC (notsc) and forcing PIT as the 
>> clocksource
>> > (clocksouce=pit clock=pit); neither of them helped.
>> >
>> >
>>
>> Is this a regression relative to a previous kvm version?
>
> Hello,
> sorry for the delay, I was having troubles compiling older KVMs with a
> recent kernel...
> The last version that works is kvm-21; starting from kvm-22 the VM
> hangs during network initialization (now always, but pretty often).
> This only occurs when the guest is Fedora7 setup ISO. The regular boot
> (i.e. from the hd) seems unaffected.

I've managed to reproduce this on kvm-21 (it takes many boots for this 
to happen, but it does eventually).

-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-07 Thread Luca
On 6/5/07, Avi Kivity <[EMAIL PROTECTED]> wrote:
> Luca Tettamanti wrote:
> > Il Mon, Jun 04, 2007 at 11:51:10PM +0300, Avi Kivity ha scritto:
> >
> >>> While doing repeated tests with the installer I ran into another
> >>> (unrelated) problem. Sometimes the guest kernel hangs at boot at:
> >>>
> >>> NET: Registered protocol family 2
> >>>
> >>> with any kind of networking options (except for -net none, which works).
> >>> With -no-kvm it boots with any networking option.
> >>>
> >>>
> >>>
> >> Can you try to pin the guest on a single core with taskset:
> >>
> >>taskset 1 qemu ...
> >>
> >
> > Doesn't help. What works is 'nolapic', i.e. disabling the local APIC on
> > the guest kernel.
> > I've also tried disabling TSC (notsc) and forcing PIT as the clocksource
> > (clocksouce=pit clock=pit); neither of them helped.
> >
> >
>
> Is this a regression relative to a previous kvm version?

Hello,
sorry for the delay, I was having troubles compiling older KVMs with a
recent kernel...
The last version that works is kvm-21; starting from kvm-22 the VM
hangs during network initialization (now always, but pretty often).
This only occurs when the guest is Fedora7 setup ISO. The regular boot
(i.e. from the hd) seems unaffected.

Luca

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-05 Thread Avi Kivity
Luca Tettamanti wrote:
> Il Mon, Jun 04, 2007 at 11:51:10PM +0300, Avi Kivity ha scritto: 
>   
>>> While doing repeated tests with the installer I ran into another
>>> (unrelated) problem. Sometimes the guest kernel hangs at boot at:
>>>
>>> NET: Registered protocol family 2
>>>
>>> with any kind of networking options (except for -net none, which works).
>>> With -no-kvm it boots with any networking option.
>>>
>>>   
>>>   
>> Can you try to pin the guest on a single core with taskset:
>>
>>taskset 1 qemu ...
>> 
>
> Doesn't help. What works is 'nolapic', i.e. disabling the local APIC on
> the guest kernel.
> I've also tried disabling TSC (notsc) and forcing PIT as the clocksource
> (clocksouce=pit clock=pit); neither of them helped.
>
>   

Is this a regression relative to a previous kvm version?


-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-04 Thread Luca Tettamanti
Il Mon, Jun 04, 2007 at 11:51:10PM +0300, Avi Kivity ha scritto: 
> > While doing repeated tests with the installer I ran into another
> > (unrelated) problem. Sometimes the guest kernel hangs at boot at:
> >
> > NET: Registered protocol family 2
> >
> > with any kind of networking options (except for -net none, which works).
> > With -no-kvm it boots with any networking option.
> >
> >   
> 
> Can you try to pin the guest on a single core with taskset:
> 
>taskset 1 qemu ...

Doesn't help. What works is 'nolapic', i.e. disabling the local APIC on
the guest kernel.
I've also tried disabling TSC (notsc) and forcing PIT as the clocksource
(clocksouce=pit clock=pit); neither of them helped.

Luca
-- 
Il dottore mi ha detto di smettere di fare cene intime per quattro.
A meno che non ci siamo altre tre persone.

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-04 Thread Avi Kivity
Luca Tettamanti wrote:
> It turned out that it was somewhat reproducible with fedora installer.
> With your patch it doesn't oops anymore.
>
>   

Ok, good.

> While doing repeated tests with the installer I ran into another
> (unrelated) problem. Sometimes the guest kernel hangs at boot at:
>
> NET: Registered protocol family 2
>
> with any kind of networking options (except for -net none, which works).
> With -no-kvm it boots with any networking option.
>
>   

Can you try to pin the guest on a single core with taskset:

   taskset 1 qemu ...

?

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-04 Thread Luca Tettamanti
Hi,

Il Mon, Jun 04, 2007 at 12:35:37PM +0300, Avi Kivity ha scritto: 
> Luca Tettamanti wrote:
> >Hello,
> >my kernel just exploded :)
> >
> >The host is running 2.6-git-current, with KVM modules from KVM-27
> >package. kernel is 32bit, SMP, with PREEMPT enabled, no HIGHMEM (but I'm
> >using CONFIG_VMSPLIT_3G_OPT=y). The CPU is a Core2 (hence I'm using
> >kvm-intel).
> >Guest was a Fedora7 setup DVD, which died somewhere during the
> >installation (anaconda was already active at that point). Bad news is
> >that I cannot reproduce the bug :|
> >  
> Fortunately the trace clearly shows the problem (out of mmu working 
> memory on guest context switch).  The attached patch should fix it.  Let 
> me know if it works for you.

It turned out that it was somewhat reproducible with fedora installer.
With your patch it doesn't oops anymore.

While doing repeated tests with the installer I ran into another
(unrelated) problem. Sometimes the guest kernel hangs at boot at:

NET: Registered protocol family 2

with any kind of networking options (except for -net none, which works).
With -no-kvm it boots with any networking option.

The only difference in dmesg is that when KVM is enable the guest uses
the TSC:

 NetLabel:  unlabeled traffic allowed by default
-Time: tsc clocksource has been installed.
 PCI: Ignore bogus resource 6 [0:0] of :00:02.0

For reference this is the command line that I'm using:

./kvm-27/qemu/i386-softmmu/qemu -hda /home/kronos/tmp/fedora.img
  -cdrom /home/kronos/tmp/boot.iso -boot d -net tap -net nic -m 256

and boot.iso is the fedora7 net install image (you can find it on any
mirror: fedora/linux/releases/7/Fedora/arch/os/images/boot.iso).

The guest kernel doesn't respond to sysrq, so I don't known exactly
where it's hanging. The stack trace on the host seems rather
uninteresting:

qemu  S 0002  2404 18905   7312 (NOTLB)
   dca4db48 0086  0002 b0478900 eec4a0f0 b02f418b b0478900 
   000a  eec4a0f0 ef31ca70 267db8c3 08c7 3ea3 eec4a1fc 
   b1810980 efcc62a0 b0478900 b0129580  0292 dca4db58 0023935c 
Call Trace:
 [] _spin_unlock_irqrestore+0x34/0x58
 [] __mod_timer+0x9d/0xa7
 [] schedule_timeout+0x70/0x8d
 [] _spin_unlock_irqrestore+0x34/0x58
 [] process_timeout+0x0/0x5
 [] schedule_timeout+0x6b/0x8d
 [] do_select+0x399/0x3e7
 [] __pollwait+0x0/0xac
 [] default_wake_function+0x0/0xc
 [] free_poll_entry+0xe/0x16
 [] poll_freewait+0x18/0x4c
 [] do_sys_poll+0x302/0x327
 [] __pollwait+0x0/0xac
 [] default_wake_function+0x0/0xc
 [] task_rq_lock+0x36/0x5d
 [] _spin_lock+0x33/0x3e
 [] _spin_unlock_irqrestore+0x40/0x58
 [] try_to_wake_up+0x325/0x32f
 [] mark_held_locks+0x39/0x53
 [] _spin_unlock_irqrestore+0x34/0x58
 [] restore_nocheck+0x12/0x15
 [] trace_hardirqs_on+0x11a/0x13d
 [] do_IRQ+0xc4/0xde
 [] restore_nocheck+0x12/0x15
 [] core_sys_select+0x2ee/0x30f
 [] setup_sigcontext+0x105/0x189
 [] _spin_unlock_irq+0x20/0x41
 [] trace_hardirqs_on+0x11a/0x13d
 [] do_notify_resume+0x5d1/0x611
 [] _spin_unlock_irq+0x2b/0x41
 [] do_notify_resume+0x52f/0x611
 [] restore_nocheck+0x12/0x15
 [] convert_fxsr_from_user+0x26/0xe6
 [] sys_select+0xa4/0x187
 [] restore_nocheck+0x12/0x15
 [] trace_hardirqs_on+0x11a/0x13d
 [] syscall_call+0x7/0xb
 ===
qemu  S CF9E5DC0  2996 18911   7312 (NOTLB)
   cf9e5dd4 0082 0002 cf9e5dc0 cf9e5dbc  b013b1ee cf9e5ea0 
   0007 0001 d252b4f0 b194c030 70a8bf29 08ac a554 d252b5fc 
   b181a980 efcc62a0 00232330 0003   cf9e5ea0 efcc62d4 
Call Trace:
 [] trace_hardirqs_on+0x11a/0x13d
 [] futex_wait+0x251/0x3ed
 [] hrtimer_wakeup+0x0/0x18
 [] futex_wait+0x242/0x3ed
 [] default_wake_function+0x0/0xc
 [] do_futex+0x6c/0xaad
 [] sys_rt_sigqueueinfo+0x44/0x4e
 [] getnstimeofday+0x30/0xbe
 [] ktime_get_ts+0x16/0x44
 [] sys_futex+0xc8/0xda
 [] syscall_call+0x7/0xb
 ===

I'm attaching the dmesg for both -kvm and -no-kvm cases.

Luca
-- 
"La teoria e` quando sappiamo come funzionano le cose ma non funzionano.
 La pratica e` quando le cose funzionano ma non sappiamo perche`.
 Abbiamo unito la teoria e la pratica: le cose non funzionano piu` e non
 sappiamo il perche`." -- A. Einstein
Linux version 2.6.21-1.3194.fc7 ([EMAIL PROTECTED]) (gcc version 4.1.2 20070502 
(Red Hat 4.1.2-12)) #1 SMP Wed May 23 22:11:19 EDT 2007

BIOS-provided physical RAM map:

sanitize start

sanitize end

copy_e820_map() start:  size: 0009fc00 end: 
0009fc00 type: 1

copy_e820_map() type is E820_RAM

copy_e820_map() start: 0009fc00 size: 0400 end: 
000a type: 2

copy_e820_map() start: 000e8000 size: 00018000 end: 
0010 type: 2

copy_e820_map() start: 0010 size: 0ff0 end: 
1000 type: 1

copy_e820_map() type is E820_RAM

copy_e820_map() start: fffc size: 0004 end: 
0001 type: 2


Re: [kvm-devel] [BUG] Oops with KVM-27

2007-06-04 Thread Avi Kivity

Luca Tettamanti wrote:

Hello,
my kernel just exploded :)

The host is running 2.6-git-current, with KVM modules from KVM-27
package. kernel is 32bit, SMP, with PREEMPT enabled, no HIGHMEM (but I'm
using CONFIG_VMSPLIT_3G_OPT=y). The CPU is a Core2 (hence I'm using
kvm-intel).
Guest was a Fedora7 setup DVD, which died somewhere during the
installation (anaconda was already active at that point). Bad news is
that I cannot reproduce the bug :|
  
Fortunately the trace clearly shows the problem (out of mmu working 
memory on guest context switch).  The attached patch should fix it.  Let 
me know if it works for you.



--
error compiling committee.c: too many arguments to function

diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 0632d0b..0fdd5a6 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -543,6 +543,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		   const u8 *old, const u8 *new, int bytes);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 
 int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
 
@@ -554,6 +556,14 @@ static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 	return vcpu->mmu.page_fault(vcpu, gva, error_code);
 }
 
+static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
+{
+	if (likely(vcpu->mmu.root_hpa != INVALID_PAGE))
+		return 0;
+
+	return kvm_mmu_load(vcpu);
+}
+
 static inline int is_long_mode(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_X86_64
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index 283df03..5915d7a 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -949,9 +949,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 	context->free = nonpaging_free;
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
-	mmu_alloc_roots(vcpu);
-	ASSERT(VALID_PAGE(context->root_hpa));
-	kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
+	context->root_hpa = INVALID_PAGE;
 	return 0;
 }
 
@@ -965,11 +963,6 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu)
 {
 	pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
 	mmu_free_roots(vcpu);
-	if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
-		kvm_mmu_free_some_pages(vcpu);
-	mmu_alloc_roots(vcpu);
-	kvm_mmu_flush_tlb(vcpu);
-	kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
 }
 
 static void inject_page_fault(struct kvm_vcpu *vcpu,
@@ -1003,10 +996,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 	context->free = paging_free;
 	context->root_level = level;
 	context->shadow_root_level = level;
-	mmu_alloc_roots(vcpu);
-	ASSERT(VALID_PAGE(context->root_hpa));
-	kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
-		(vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
+	context->root_hpa = INVALID_PAGE;
 	return 0;
 }
 
@@ -1025,10 +1015,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 	context->free = paging_free;
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
-	mmu_alloc_roots(vcpu);
-	ASSERT(VALID_PAGE(context->root_hpa));
-	kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
-		(vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
+	context->root_hpa = INVALID_PAGE;
 	return 0;
 }
 
@@ -1042,7 +1029,6 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 	ASSERT(vcpu);
 	ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
 
-	mmu_topup_memory_caches(vcpu);
 	if (!is_paging(vcpu))
 		return nonpaging_init_context(vcpu);
 	else if (is_long_mode(vcpu))
@@ -1064,16 +1050,31 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
 {
+	destroy_kvm_mmu(vcpu);
+	return init_kvm_mmu(vcpu);
+}
+
+int kvm_mmu_load(struct kvm_vcpu *vcpu)
+{
 	int r;
 
-	destroy_kvm_mmu(vcpu);
-	r = init_kvm_mmu(vcpu);
-	if (r < 0)
-		goto out;
+	spin_lock(&vcpu->kvm->lock);
 	r = mmu_topup_memory_caches(vcpu);
+	if (r)
+		goto out;
+	mmu_alloc_roots(vcpu);
+	kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
+	kvm_mmu_flush_tlb(vcpu);
 out:
+	spin_unlock(&vcpu->kvm->lock);
 	return r;
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_load);
+
+void kvm_mmu_unload(struct kvm_vcpu *vcpu)
+{
+	mmu_free_roots(vcpu);
+}
 
 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
   struct kvm_mmu_page *page,
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index b621403..ed33f59 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -1482,6 +1482,10 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int r;
 
 again:
+	r = kvm_mmu_reload(vcpu);
+	if (unlikely(r))
+		return r;
+
 	if (!vcpu->mmio_read_completed)
 		do_interrupt_requests(vcpu, kvm_run);
 
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 09e3609..a499989 100644
--- a/drivers/kvm/vmx