[PATCH 2/2] qemu-kvm: Replace direct phys_ram_dirty access with wrapper functions.

2010-03-16 Thread Yoshiaki Tamura
Replaces direct phys_ram_dirty access with wrapper functions to prevent
direct access to the phys_ram_dirty bitmap.

Signed-off-by: Yoshiaki Tamura 
Signed-off-by: OHMURA Kei 
---
 exec.c |   45 -
 1 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/exec.c b/exec.c
index 9bcb4de..b607212 100644
--- a/exec.c
+++ b/exec.c
@@ -1944,7 +1944,7 @@ static void tlb_protect_code(ram_addr_t ram_addr)
 static void tlb_unprotect_code_phys(CPUState *env, ram_addr_t ram_addr,
 target_ulong vaddr)
 {
-phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] |= CODE_DIRTY_FLAG;
+cpu_physical_memory_set_dirty_flags(ram_addr, CODE_DIRTY_FLAG);
 }
 
 static inline void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry,
@@ -1965,8 +1965,7 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, 
ram_addr_t end,
 {
 CPUState *env;
 unsigned long length, start1;
-int i, mask, len;
-uint8_t *p;
+int i;
 
 start &= TARGET_PAGE_MASK;
 end = TARGET_PAGE_ALIGN(end);
@@ -1974,11 +1973,7 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, 
ram_addr_t end,
 length = end - start;
 if (length == 0)
 return;
-len = length >> TARGET_PAGE_BITS;
-mask = ~dirty_flags;
-p = phys_ram_dirty + (start >> TARGET_PAGE_BITS);
-for(i = 0; i < len; i++)
-p[i] &= mask;
+cpu_physical_memory_mask_dirty_range(start, length, dirty_flags);
 
 /* we modify the TLB cache so that the dirty bit will be set again
when accessing the range */
@@ -2825,16 +2820,16 @@ static void notdirty_mem_writeb(void *opaque, 
target_phys_addr_t ram_addr,
 uint32_t val)
 {
 int dirty_flags;
-dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
+dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr);
 if (!(dirty_flags & CODE_DIRTY_FLAG)) {
 #if !defined(CONFIG_USER_ONLY)
 tb_invalidate_phys_page_fast(ram_addr, 1);
-dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
+dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr);
 #endif
 }
 stb_p(qemu_get_ram_ptr(ram_addr), val);
 dirty_flags |= (0xff & ~CODE_DIRTY_FLAG);
-phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags;
+cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags);
 /* we remove the notdirty callback only if the code has been
flushed */
 if (dirty_flags == 0xff)
@@ -2845,16 +2840,16 @@ static void notdirty_mem_writew(void *opaque, 
target_phys_addr_t ram_addr,
 uint32_t val)
 {
 int dirty_flags;
-dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
+dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr);
 if (!(dirty_flags & CODE_DIRTY_FLAG)) {
 #if !defined(CONFIG_USER_ONLY)
 tb_invalidate_phys_page_fast(ram_addr, 2);
-dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
+dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr);
 #endif
 }
 stw_p(qemu_get_ram_ptr(ram_addr), val);
 dirty_flags |= (0xff & ~CODE_DIRTY_FLAG);
-phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags;
+cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags);
 /* we remove the notdirty callback only if the code has been
flushed */
 if (dirty_flags == 0xff)
@@ -2865,16 +2860,16 @@ static void notdirty_mem_writel(void *opaque, 
target_phys_addr_t ram_addr,
 uint32_t val)
 {
 int dirty_flags;
-dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
+dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr);
 if (!(dirty_flags & CODE_DIRTY_FLAG)) {
 #if !defined(CONFIG_USER_ONLY)
 tb_invalidate_phys_page_fast(ram_addr, 4);
-dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
+dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr);
 #endif
 }
 stl_p(qemu_get_ram_ptr(ram_addr), val);
 dirty_flags |= (0xff & ~CODE_DIRTY_FLAG);
-phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags;
+cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags);
 /* we remove the notdirty callback only if the code has been
flushed */
 if (dirty_flags == 0xff)
@@ -3325,8 +3320,8 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, 
uint8_t *buf,
 /* invalidate code */
 tb_invalidate_phys_page_range(addr1, addr1 + l, 0);
 /* set dirty bit */
-phys_ram_dirty[addr1 >> TARGET_PAGE_BITS] |=
-(0xff & ~CODE_DIRTY_FLAG);
+cpu_physical_memory_set_dirty_flags(
+addr1, (0xff & ~CODE_DIRTY_FLAG));
 }
/* qemu doesn't execute guest code directly, but kvm does
   therefore flush instruction caches */
@@ -3539,8 

[PATCH 1/2] qemu-kvm: Introduce wrapper functions to access phys_ram_dirty.

2010-03-16 Thread Yoshiaki Tamura
Adds wrapper functions to prevent direct access to the phys_ram_dirty bitmap.

Signed-off-by: Yoshiaki Tamura 
Signed-off-by: OHMURA Kei 
---
 cpu-all.h |   25 +
 1 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/cpu-all.h b/cpu-all.h
index 9bc01b9..c279c0a 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -882,6 +882,11 @@ static inline int cpu_physical_memory_is_dirty(ram_addr_t 
addr)
 return phys_ram_dirty[addr >> TARGET_PAGE_BITS] == 0xff;
 }
 
+static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr)
+{
+return phys_ram_dirty[addr >> TARGET_PAGE_BITS];
+}
+
 static inline int cpu_physical_memory_get_dirty(ram_addr_t addr,
 int dirty_flags)
 {
@@ -893,6 +898,26 @@ static inline void 
cpu_physical_memory_set_dirty(ram_addr_t addr)
 phys_ram_dirty[addr >> TARGET_PAGE_BITS] = 0xff;
 }
 
+static inline int cpu_physical_memory_set_dirty_flags(ram_addr_t addr,
+  int dirty_flags)
+{
+return phys_ram_dirty[addr >> TARGET_PAGE_BITS] |= dirty_flags;
+}
+
+static inline void cpu_physical_memory_mask_dirty_range(ram_addr_t start,
+int length,
+int dirty_flags)
+{
+int i, mask, len;
+uint8_t *p;
+
+len = length >> TARGET_PAGE_BITS;
+mask = ~dirty_flags;
+p = phys_ram_dirty + (start >> TARGET_PAGE_BITS);
+for (i = 0; i < len; i++)
+p[i] &= mask;
+}
+
 void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
  int dirty_flags);
 void cpu_tlb_update_dirty(CPUState *env);
-- 
1.7.0.31.g1df487



[PATCH 0/2] qemu-kvm: Introduce wrapper functions to access phys_ram_dirty, and replace existing direct accesses to it.

2010-03-16 Thread Yoshiaki Tamura

Before replacing the byte-based dirty bitmap with a bit-based dirty bitmap,
clearing out direct accesses to the bitmap first seems to be a good point to
start with.

This patch set is based on the following discussion.

http://www.mail-archive.com/kvm@vger.kernel.org/msg30724.html
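
To make the byte-based vs. bit-based distinction concrete, here is a
standalone toy sketch (illustrative names and constants only, not code from
this series; the per-flag bit arrays mirror the earlier RFC):

/* Sketch: one byte per page today vs. one bit per page per flag later. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_BITS 12
#define NPAGES    64
#define LONG_BITS (8 * (int)sizeof(unsigned long))

static uint8_t       byte_dirty[NPAGES];   /* current scheme: 1 byte/page */
static unsigned long bit_dirty[(NPAGES + LONG_BITS - 1) / LONG_BITS];

static void mark_dirty(uint64_t addr)
{
    uint64_t page = addr >> PAGE_BITS;

    byte_dirty[page] = 0xff;                                  /* all flags */
    bit_dirty[page / LONG_BITS] |= 1UL << (page % LONG_BITS); /* one flag  */
}

int main(void)
{
    mark_dirty(3UL << PAGE_BITS);          /* dirty page 3 */
    printf("byte map: 0x%02x, bit map word: 0x%lx\n",
           byte_dirty[3], bit_dirty[0]);
    return 0;
}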

Thanks,

Yoshi


About KVM Forum 2010

2010-03-16 Thread kazushi takahashi
Hi all

Does anybody know the exact important dates, such as the paper deadline,
for KVM Forum 2010?

I found this blog post
(http://www.linux-kvm.com/content/kvm-forum-2010-scheduled-august-9-10-2010),
but it only gives the dates of the conference itself.

Regards,
Kazushi Takahashi


Re: [PATCH] KVM: cleanup kvm_get_dirty_log()

2010-03-16 Thread Xiao Guangrong


Takuya Yoshikawa wrote:
> Xiao Guangrong wrote:
>> Using bitmap_empty() to see whether memslot->dirty_bitmap is empty
>>
> 
> You can do this for arch specific get_dirty_log() too.

OK, i'll do it in the next version

> 
>> Signed-off-by: Xiao Guangrong 
>> ---
>>  virt/kvm/kvm_main.c |6 ++
>>  1 files changed, 2 insertions(+), 4 deletions(-)
>>
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index bcd08b8..497ae14 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -767,8 +767,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
>>  struct kvm_dirty_log *log, int *is_dirty)
>>  {
>>  struct kvm_memory_slot *memslot;
>> -int r, i;
>> -int n;
>> +int r, n;
>>  unsigned long any = 0;
> 
> any no longer needs to be unsigned long, if you do this?

Yeah, right, thanks for pointing that out.

Xiao


Re: [PATCH] KVM: cleanup kvm_get_dirty_log()

2010-03-16 Thread Takuya Yoshikawa

Xiao Guangrong wrote:

Using bitmap_empty() to see whether memslot->dirty_bitmap is empty



You can do this for arch specific get_dirty_log() too.


Signed-off-by: Xiao Guangrong 
---
 virt/kvm/kvm_main.c |6 ++
 1 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index bcd08b8..497ae14 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -767,8 +767,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
struct kvm_dirty_log *log, int *is_dirty)
 {
struct kvm_memory_slot *memslot;
-   int r, i;
-   int n;
+   int r, n;
unsigned long any = 0;


any no longer needs to be unsigned long, if you do this?

 
 	r = -EINVAL;
 
@@ -782,8 +781,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
 
 	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
 
-	for (i = 0; !any && i < n/sizeof(long); ++i)
-	any = memslot->dirty_bitmap[i];
+	any = !bitmap_empty(memslot->dirty_bitmap, memslot->npages);
 
 	r = -EFAULT;
 	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))




Re: [Qemu-devel] Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.

2010-03-16 Thread Avi Kivity

On 03/16/2010 10:10 PM, Blue Swirl wrote:



  Yes, and that is what tlb_protect_code() does and it's called from
tb_alloc_page(), which is what's called when a TB is created.
 

Just a tangential note: a long time ago, I tried to disable self
modifying code detection for Sparc. On most RISC architectures, SMC
needs explicit flushing so in theory we need not track code memory
writes. However, during exceptions the translator needs to access the
original unmodified code that was used to generate the TB. But maybe
there are other ways to avoid SMC tracking, on x86 it's still needed
   


On x86 you're supposed to execute a serializing instruction (one of 
INVD, INVEPT, INVLPG, INVVPID, LGDT, LIDT, LLDT, LTR, MOV (to control 
register, with the exception of MOV CR8), MOV (to debug register), 
WBINVD, WRMSR, CPUID, IRET, and RSM) before running modified code.
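
A toy demonstration of that rule (a Linux/x86-64 sketch under stated
assumptions only: a hardened host may refuse the PROT_EXEC mapping, and real
code patching has more constraints than shown here):

/* Store new code bytes, execute a serializing instruction (cpuid),
 * then run the freshly written code. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    /* mov eax, 42 ; ret */
    static const unsigned char code[] = { 0xb8, 0x2a, 0x00, 0x00, 0x00, 0xc3 };
    unsigned char *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    unsigned int level = 0;
    int (*fn)(void);

    if (buf == MAP_FAILED)
        return 1;
    memcpy(buf, code, sizeof(code));       /* the "self-modifying" store  */
    __asm__ volatile("cpuid"               /* serialize before execution  */
                     : "+a"(level) : : "ebx", "ecx", "edx", "memory");
    fn = (int (*)(void))buf;
    printf("%d\n", fn());                  /* prints 42 */
    return 0;
}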



but I suppose SMC is pretty rare.
   


Every time you demand load a code page from disk, you're running self 
modifying code (though it usually doesn't exist in the tlb, so there's 
no previous version that can cause trouble).



--
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.



Re: [PATCH 0/5] Fix some mmu/emulator atomicity issues (v2)

2010-03-16 Thread Avi Kivity

On 03/16/2010 09:33 PM, Marcelo Tosatti wrote:



How relevant is this for -stable? Races don't sound good to me :)
 

The race mentioned above is not existant on -stable since prefetch is
disabled for invlpg.

The atomic fixes seem like a candidate, since lack of them can trigger
pagetable corruption. Avi?
   


I would like this to get some use on mainline before queuing the first few 
patches.  They aren't trivial, and I don't know of any failures 
resulting from the races.


--
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.



Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Avi Kivity

On 03/17/2010 02:41 AM, Frank Ch. Eigler wrote:

Hi -

On Tue, Mar 16, 2010 at 06:04:10PM -0500, Anthony Liguori wrote:
   

[...]
The only way to really address this is to change the interaction.
Instead of running perf externally to qemu, we should support a perf
command in the qemu monitor that can then tie directly to the perf
tooling.  That gives us the best possible user experience.
 

To what extent could this be solved with less crossing of
isolation/abstraction layers, if the perfctr facilities were properly
virtualized?
   


That's the more interesting (by far) usage model.  In general guest 
owners don't have access to the host, and host owners can't (and 
shouldn't) change guests.


Monitoring guests from the host is useful for kvm developers, but less 
so for users.


--
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.



[PATCH] KVM: cleanup kvm_get_dirty_log()

2010-03-16 Thread Xiao Guangrong
Using bitmap_empty() to see whether memslot->dirty_bitmap is empty

Signed-off-by: Xiao Guangrong 
---
 virt/kvm/kvm_main.c |6 ++
 1 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index bcd08b8..497ae14 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -767,8 +767,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
struct kvm_dirty_log *log, int *is_dirty)
 {
struct kvm_memory_slot *memslot;
-   int r, i;
-   int n;
+   int r, n;
unsigned long any = 0;
 
r = -EINVAL;
@@ -782,8 +781,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
 
n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
 
-   for (i = 0; !any && i < n/sizeof(long); ++i)
-   any = memslot->dirty_bitmap[i];
+   any = !bitmap_empty(memslot->dirty_bitmap, memslot->npages);
 
r = -EFAULT;
if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
-- 
1.6.1.2
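
For reference, bitmap_empty() from include/linux/bitmap.h returns nonzero iff
none of the first nbits bits are set. A userspace sketch of that contract
(the kernel version is optimized; this is illustrative only):

/* Returns nonzero iff no bit below nbits is set. */
#include <stdio.h>

#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

static int sketch_bitmap_empty(const unsigned long *src, int nbits)
{
    int k, lim = nbits / BITS_PER_LONG;

    for (k = 0; k < lim; ++k)
        if (src[k])
            return 0;
    if (nbits % BITS_PER_LONG)                 /* ignore unused tail bits */
        if (src[k] & ((1UL << (nbits % BITS_PER_LONG)) - 1))
            return 0;
    return 1;
}

int main(void)
{
    unsigned long map[4] = { 0 };

    printf("%d\n", sketch_bitmap_empty(map, 100));  /* 1: empty     */
    map[1] |= 1UL << 3;
    printf("%d\n", sketch_bitmap_empty(map, 100));  /* 0: a bit set */
    return 0;
}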



[PATCH v2] KVM MMU: check reserved bits only when CR4.PSE=1 or CR4.PAE=1

2010-03-16 Thread Xiao Guangrong
- The RSVD bit can only be set in the error code when a #PF occurs
  if CR4.PSE=1 or CR4.PAE=1

- context->rsvd_bits_mask[1][0] is always 0

Changelog:
Moved this operation to reset_rsvds_bits_mask() to address Avi Kivity's
suggestion

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   12 +---
 1 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b137515..c49f8ec 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2288,18 +2288,26 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu 
*vcpu, int level)
 
if (!is_nx(vcpu))
exb_bit_rsvd = rsvd_bits(63, 63);
+
+   context->rsvd_bits_mask[1][0] = 0;
switch (level) {
case PT32_ROOT_LEVEL:
/* no rsvd bits for 2 level 4K page table entries */
context->rsvd_bits_mask[0][1] = 0;
context->rsvd_bits_mask[0][0] = 0;
+
+   /* check rsvd bits only when CR4.PSE=1 or CR4.PAE=1 */
+   if (!is_pse(vcpu)) {
+   context->rsvd_bits_mask[1][1] = 0;
+   break;
+   }
+
if (is_cpuid_PSE36())
/* 36bits PSE 4MB page */
context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
else
/* 32 bits PSE 4MB page */
context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
-   context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
break;
case PT32E_ROOT_LEVEL:
context->rsvd_bits_mask[0][2] =
@@ -2312,7 +2320,6 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, 
int level)
context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 62) |
rsvd_bits(13, 20);  /* large page */
-   context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
break;
case PT64_ROOT_LEVEL:
context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2330,7 +2337,6 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, 
int level)
context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51) |
rsvd_bits(13, 20);  /* large page */
-   context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
break;
}
 }
-- 
1.6.1.2
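
For readers without a tree handy: the rsvd_bits() helper used above builds a
mask with bits s..e (inclusive) set. A standalone sketch of it (check
arch/x86/kvm/mmu.c for the real definition before relying on this):

#include <stdint.h>
#include <stdio.h>

static uint64_t rsvd_bits(int s, int e)
{
    return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
    /* 32-bit (non-PSE36) 4MB page: bits 13..21 of the PDE are reserved */
    printf("rsvd_bits(13, 21) = 0x%llx\n",
           (unsigned long long)rsvd_bits(13, 21));
    return 0;
}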



Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Zhang, Yanmin
On Tue, 2010-03-16 at 11:32 +0200, Avi Kivity wrote:
> On 03/16/2010 09:48 AM, Zhang, Yanmin wrote:
> >
> >> Excellent, support for guest kernel != host kernel is critical (I can't
> >> remember the last time I ran same kernels).
> >>
> >> How would we support multiple guests with different kernels?
> >>  
> > With the patch, 'perf kvm report --sort pid" could show
> > summary statistics for all guest os instances. Then, use
> > parameter --pid of 'perf kvm record' to collect single problematic instance 
> > data.
> >
> 
> That certainly works, though automatic association of guest data with 
> guest symbols is friendlier.
Thanks. Originally, I planned to add a -G parameter to perf, such as
-G 
:/XXX/XXX/guestkallsyms:/XXX/XXX/modules,8889:/XXX/XXX/guestkallsyms:/XXX/XXX/modules
(the leading numbers, such as 8889, are just qemu guest pids).

So we could define multiple guest os symbol files. But it seems ugly,
and 'perf kvm report --sort pid' and 'perf kvm top --pid' could provide
similar functionality.

> 
> >>> diff -Nraup linux-2.6_tipmaster0315/arch/x86/kvm/vmx.c 
> >>> linux-2.6_tipmaster0315_perfkvm/arch/x86/kvm/vmx.c
> >>> --- linux-2.6_tipmaster0315/arch/x86/kvm/vmx.c2010-03-16 
> >>> 08:59:11.825295404 +0800
> >>> +++ linux-2.6_tipmaster0315_perfkvm/arch/x86/kvm/vmx.c2010-03-16 
> >>> 09:01:09.976084492 +0800
> >>> @@ -26,6 +26,7 @@
> >>>#include
> >>>#include
> >>>#include
> >>> +#include
> >>>#include "kvm_cache_regs.h"
> >>>#include "x86.h"
> >>>
> >>> @@ -3632,6 +3633,43 @@ static void update_cr8_intercept(struct
> >>>   vmcs_write32(TPR_THRESHOLD, irr);
> >>>}
> >>>
> >>> +DEFINE_PER_CPU(int, kvm_in_guest) = {0};
> >>> +
> >>> +static void kvm_set_in_guest(void)
> >>> +{
> >>> + percpu_write(kvm_in_guest, 1);
> >>> +}
> >>> +
> >>> +static int kvm_is_in_guest(void)
> >>> +{
> >>> + return percpu_read(kvm_in_guest);
> >>> +}
> >>>
> >>>
> >>  
> >
> >> There is already PF_VCPU for this.
> >>  
> > Right, but there is a scope between kvm_guest_enter and really running
> > in guest os, where a perf event might overflow. Anyway, the scope is very
> > narrow, I will change it to use flag PF_VCPU.
> >
> 
> There is also a window between setting the flag and calling 'int $2' 
> where an NMI might happen and be accounted incorrectly.
> 
> Perhaps separate the 'int $2' into a direct call into perf and another 
> call for the rest of NMI handling.  I don't see how it would work on svm 
> though - AFAICT the NMI is held whereas vmx swallows it. 

>  I guess NMIs 
> will be disabled until the next IRET so it isn't racy, just tricky.
I'm not sure whether vmexit breaks the NMI context or not. The hardware NMI
context isn't reentrant till an IRET. YangSheng would like to double-check it.

> 
> >>> +static struct perf_guest_info_callbacks kvm_guest_cbs = {
> >>> + .is_in_guest= kvm_is_in_guest,
> >>> + .is_user_mode   = kvm_is_user_mode,
> >>> + .get_guest_ip   = kvm_get_guest_ip,
> >>> + .reset_in_guest = kvm_reset_in_guest,
> >>> +};
> >>>
> >>>
> >> Should be in common code, not vmx specific.
> >>  
> > Right. I discussed with Yangsheng. I will move above data structures and
> > callbacks to file arch/x86/kvm/x86.c, and add get_ip, a new callback to
> > kvm_x86_ops.
> >
> 
> You will need access to the vcpu pointer (kvm_rip_read() needs it), you 
> can put it in a percpu variable.
We do so now in a new patch.

>   I guess if it's not null, you know 
> you're in a guest, so no need for PF_VCPU.
Good suggestion.

Thanks.




RE: host panic based on kernel 2.6.34-RC1

2010-03-16 Thread Hao, Xudong
David Miller wrote:
> From: "Hao, Xudong" 
> Date: Wed, 17 Mar 2010 10:14:50 +0800
> 
>> I installed a latest kvm based on kernel 2.6.34-rc1, after I load
>> kvm kvm_intel module, and start /etc/init.d/kvm, a few minutes
>> later, the system will panic. The panic is easy to reproduce when
>> I use tcpdump in host. However, if I stop /etc/init.d/kvm,
>> everything is OK, host works fine. Does anyone meet a similar issue?
>> Any hint?
> 
> This is fixed in Linus's GIT tree already.

Thanks David.

Thanks,
Xudong


Re: host panic based on kernel 2.6.34-RC1

2010-03-16 Thread David Miller
From: "Hao, Xudong" 
Date: Wed, 17 Mar 2010 10:14:50 +0800

> I installed a latest kvm based on kernel 2.6.34-rc1; after I load the kvm and
> kvm_intel modules and start /etc/init.d/kvm, a few minutes later the system
> will panic. The panic is easy to reproduce when I use tcpdump in the host.
> However, if I stop /etc/init.d/kvm, everything is OK, the host works fine.
> Does anyone meet a similar issue? Any hint?

This is fixed in Linus's GIT tree already.




Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Frank Ch. Eigler
Hi -

On Tue, Mar 16, 2010 at 06:04:10PM -0500, Anthony Liguori wrote:
> [...]
> The only way to really address this is to change the interaction.  
> Instead of running perf externally to qemu, we should support a perf 
> command in the qemu monitor that can then tie directly to the perf 
> tooling.  That gives us the best possible user experience.

To what extent could this be solved with less crossing of
isolation/abstraction layers, if the perfctr facilities were properly
virtualized?  That way guests could run perf goo internally.
Optionally virt tools on the host side could aggregate data from
cooperating self-monitoring guests.

- FChE


Re: [Qemu-devel] Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.

2010-03-16 Thread Paul Brook
> Where does the translator need access to this original code?  I was
> just thinking about this problem today, wondering how much overhead
> there is with this SMC page protection thing.

When an MMU fault occurs qemu re-translates the TB with additional annotations 
to determine which guest instruction caused the fault.
See translate-all.c:cpu_restore_state().

Paul


Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Anthony Liguori

On 03/16/2010 12:39 PM, Ingo Molnar wrote:

If we look at the use-case, it's going to be something like, a user is
creating virtual machines and wants to get performance information about
them.

Having to run a separate tool like perf is not going to be what they would
expect they had to do.  Instead, they would either use their existing GUI
tool (like virt-manager) or they would use their management interface
(either QMP or libvirt).

The complexity of interaction is due to the fact that perf shouldn't be a
stand alone tool.  It should be a library or something with a programmatic
interface that another tool can make use of.
 

But ... a GUI interface/integration is of course possible too, and it's being
worked on.

perf is mainly a kernel developer tool, and kernel developers generally dont
use GUIs to do their stuff: which is the (sole) reason why its first ~850
commits of tools/perf/ were done without a GUI. We go where our developers
are.

In any case it's not an excuse to have no proper command-line tooling. In fact
if you cannot get simpler, more atomic command-line tooling right then you'll
probably doubly suck at doing a GUI as well.
   


It's about who owns the user interface.

If qemu owns the user interface, than we can satisfy this in a very 
simple way by adding a perf monitor command.  If we have to support 
third party tools, then it significantly complicates things.


Regards,

Anthony Liguori


Ingo
   




Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Anthony Liguori

On 03/16/2010 01:28 PM, Ingo Molnar wrote:

* Anthony Liguori  wrote:

   

On 03/16/2010 12:52 PM, Ingo Molnar wrote:
 

* Anthony Liguori   wrote:

   

On 03/16/2010 10:52 AM, Ingo Molnar wrote:
 

You are quite mistaken: KVM isnt really a 'random unprivileged application' in
this context, it is clearly an extension of system/kernel services.

( Which can be seen from the simple fact that what started the discussion was
   'how do we get /proc/kallsyms from the guest'. I.e. an extension of the
   existing host-space /proc/kallsyms was desired. )
   

Random tools (like perf) should not be able to do what you describe. It's a
security nightmare.
 

A security nightmare exactly how? Mind to go into details as i dont understand
your point.
   

Assume you're using SELinux to implement mandatory access control.
How do you label this file system?

Generally speaking, we don't know the difference between /proc/kallsyms vs.
/dev/mem if we do generic passthrough.  While it might be safe to have a
relaxed label of kallsyms (since it's read only), it's clearly not safe to
do that for /dev/mem, /etc/shadow, or any file containing sensitive
information.
 

What's your _point_? Please outline a threat model, a vector of attack,
_anything_ that substantiates your "it's a security nightmare" claim.
   


You suggested "to have a (read only) mount of all guest filesystems".

As I described earlier, not all of the information within the guest 
filesystem has the same level of sensitivity.  If you exposed a generic 
interface like this, it makes it very difficult to delegate privileges.


Delegating privileges is important because from in a higher security 
environment, you may want to prevent a management tool from accessing 
the VM's disk directly, but still allow it to do basic operations (in 
particular, to view performance statistics).



Rather, we ought to expose a higher level interface that we have more
confidence in with respect to understanding the ramifications of exposing
that guest data.
 

Exactly, we want something that has a flexible namespace and works well with
Linux tools in general. Preferably that namespace should be human readable,
and it should be hierarchic, and it should have a well-known permission model.

This concept exists in Linux and is generally called a 'filesystem'.
   


If you want to use a synthetic filesystem as the management interface 
for qemu, that's one thing.  But you suggested exposing the guest 
filesystem in its entirely and that's what I disagreed with.



If a user cannot read the image file then the user has no access to its
contents via other namespaces either. That is, of course, a basic security
aspect.

( That is perfectly true with a non-SELinux Unix permission model as well, and
   is true in the SELinux case as well. )
   


I don't think that's reasonable at all.  The guest may encrypt it's disk 
image.  It still ought to be possible to run perf against that guest, no?



Erm. Please explain to me, what exactly is 'not that simple' in a MAC
environment?

Also, i'd like to note that the 'restrictive SELinux setups' usecases are
pretty secondary.

To demonstrate that, i'd like every KVM developer on this list who reads this
mail and who has their home development system where they produce their
patches set up in a restrictive MAC environment, in that you cannot even read
the images you are using, to chime in with a "I'm doing that" reply.
   


My home system doesn't run SELinux but I work daily with systems that 
are using SELinux.


I want to be able to run tools like perf on these systems because 
ultimately, I need to debug these systems on a daily basis.


But that's missing the point.  We want to have an interface that works 
for both cases so that we're not maintaining two separate interfaces.


We've rat holed a bit though.  You want:

1) to run perf kvm list and be able to enumerate KVM guests

2) for this to Just Work with qemu guests launched from the command line

You could achieve (1) by tying perf to libvirt but that won't work for 
(2).  There are a few practical problems with (2).


qemu does not require the user to associate any uniquely identifying 
information with a VM.  We've also optimized the command line use case 
so that if all you want to do is run a disk image, you just execute 
"qemu foo.img".  To satisfy your use case, we would either have to force 
a use to always specify unique information, which would be less 
convenient for our users or we would have to let the name be an optional 
parameter.


As it turns out, we already support "qemu -name Fedora foo.img".  What 
we don't do today, but I've been suggesting we should, is automatically 
create a QMP management socket in a well known location based on the 
-name parameter when it's specified.  That would let a tool like perf 
Just Work provided that a user specified -name.


No one uses -name today though and I'm sure you don't either.

Th

Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Masami Hiramatsu
Joerg Roedel wrote:
> On Tue, Mar 16, 2010 at 12:25:00PM +0100, Ingo Molnar wrote:
>> Hm, that sounds rather messy if we want to use it to basically expose kernel 
>> functionality in a guest/host unified way. Is the qemu process discoverable 
>> in 
>> some secure way? Can we trust it? Is there some proper tooling available to 
>> do 
>> it, or do we have to push it through 2-3 packages to get such a useful 
>> feature 
>> done?
> 
> Since we want to implement a pmu usable for the guest anyway, why don't
> we just use the guest's perf to get all the information we want? If we get a
> pmu-nmi from the guest we just re-inject it to the guest, and perf in the
> guest gives us all the information we want, including kernel and userspace
> symbols, stack traces, and so on.

I guess this aims to get information from old environments running on
kvm for life extension :)

> In the previous thread we discussed a direct trace channel between
> guest and host kernel (which can be used for ftrace events for example).
> This channel could be used to transport this information to the host
> kernel.

Interesting! I know the people who are trying to do that with systemtap.
See, http://vesper.sourceforge.net/

> 
> The only additional feature needed is a way for the host to start a perf
> instance in the guest.

# ssh localguest perf record --host-channel ... ? B-)

Thank you,

> 
> Opinions?
> 
> 
>   Joerg
> 

-- 
Masami Hiramatsu
e-mail: mhira...@redhat.com


Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.

2010-03-16 Thread Yoshiaki Tamura

Anthony Liguori wrote:

On 03/16/2010 07:45 AM, Avi Kivity wrote:

On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote:

Modifies wrapper functions for byte-based phys_ram_dirty bitmap to
bit-based phys_ram_dirty bitmap, and adds more wrapper functions to
prevent
direct access to the phys_ram_dirty bitmap.



+
+static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr)
+{
+ unsigned long mask;
+ int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
+ int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1);
+ int ret = 0;
+
+ mask = 1UL << offset;
+ if (phys_ram_vga_dirty[index] & mask)
+ ret |= VGA_DIRTY_FLAG;
+ if (phys_ram_code_dirty[index] & mask)
+ ret |= CODE_DIRTY_FLAG;
+ if (phys_ram_migration_dirty[index] & mask)
+ ret |= MIGRATION_DIRTY_FLAG;
+
+ return ret;
+}

static inline int cpu_physical_memory_get_dirty(ram_addr_t addr,
int dirty_flags)
{
- return phys_ram_dirty[addr >> TARGET_PAGE_BITS] & dirty_flags;
+ return cpu_physical_memory_get_dirty_flags(addr) & dirty_flags;
}


This turns one cacheline access into three. If the dirty bitmaps were
in an array, you could do

return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS +
BITS_IN_LONG)] & mask;

with one cacheline access.
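
A sketch of that suggestion with made-up names (dirty_bitmaps and the
DIRTY_MEMORY_* indices are illustrative, not from the tree), to make the
single-word lookup concrete:

#include <stdint.h>
#include <stdio.h>

#define TARGET_PAGE_BITS 12
#define BITS_PER_LONG    (8 * (int)sizeof(unsigned long))
#define NPAGES           1024

enum { DIRTY_MEMORY_VGA, DIRTY_MEMORY_CODE, DIRTY_MEMORY_MIGRATION,
       DIRTY_MEMORY_NUM };

static unsigned long dirty_bitmaps[DIRTY_MEMORY_NUM][NPAGES / BITS_PER_LONG];

static int get_dirty(uint64_t addr, int dirty_index)
{
    uint64_t page = addr >> TARGET_PAGE_BITS;
    unsigned long mask = 1UL << (page & (BITS_PER_LONG - 1));

    /* one word of one bitmap: a single cacheline access */
    return (dirty_bitmaps[dirty_index][page / BITS_PER_LONG] & mask) != 0;
}

int main(void)
{
    dirty_bitmaps[DIRTY_MEMORY_VGA][0] |= 1UL << 5;   /* mark page 5 */
    printf("%d\n", get_dirty(5UL << TARGET_PAGE_BITS, DIRTY_MEMORY_VGA));
    return 0;
}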


As far as I can tell, we only ever call with a single flag so your
suggestion makes sense.

I'd suggest introducing these functions before splitting the bitmap up.
It makes review a bit easier.


Thanks for your advice.
I'll post the wrapper functions for the existing byte-based phys_ram_dirty first.

Yoshi


Re: [Qemu-devel] Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.

2010-03-16 Thread Richard Henderson
On 03/16/2010 01:10 PM, Blue Swirl wrote:
> Just a tangential note: a long time ago, I tried to disable self
> modifying code detection for Sparc. On most RISC architectures, SMC
> needs explicit flushing so in theory we need not track code memory
> writes. However, during exceptions the translator needs to access the
> original unmodified code that was used to generate the TB. But maybe
> there are other ways to avoid SMC tracking, on x86 it's still needed
> but I suppose SMC is pretty rare.

True SMC is fairly rare, but the SMC checker triggers fairly often
on the PLT update during dynamic linking.  Nearly all cpus (x86 being
the only exception I recall) needed to re-design their PLT format to
avoid this code update in order to support SELinux.

Where does the translator need access to this original code?  I was
just thinking about this problem today, wondering how much overhead
there is with this SMC page protection thing.


r~


Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Joerg Roedel
On Tue, Mar 16, 2010 at 12:25:00PM +0100, Ingo Molnar wrote:
> Hm, that sounds rather messy if we want to use it to basically expose kernel 
> functionality in a guest/host unified way. Is the qemu process discoverable 
> in 
> some secure way? Can we trust it? Is there some proper tooling available to 
> do 
> it, or do we have to push it through 2-3 packages to get such a useful 
> feature 
> done?

Since we want to implement a pmu usable for the guest anyway, why don't
we just use the guest's perf to get all the information we want? If we get a
pmu-nmi from the guest we just re-inject it to the guest, and perf in the
guest gives us all the information we want, including kernel and userspace
symbols, stack traces, and so on.

In the previous thread we discussed a direct trace channel between
guest and host kernel (which can be used for ftrace events for example).
This channel could be used to transport this information to the host
kernel.

The only additional feature needed is a way for the host to start a perf
instance in the guest.

Opinions?


Joerg



Re: [Qemu-devel] Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.

2010-03-16 Thread Blue Swirl
On 3/16/10, Anthony Liguori  wrote:
> On 03/16/2010 08:57 AM, Avi Kivity wrote:
>
> > On 03/16/2010 03:51 PM, Anthony Liguori wrote:
> >
> > > On 03/16/2010 08:29 AM, Avi Kivity wrote:
> > >
> > > > On 03/16/2010 03:17 PM, Yoshiaki Tamura wrote:
> > > >
> > > > > Avi Kivity wrote:
> > > > >
> > > > > > On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote:
> > > > > >
> > > > > > > Modifies wrapper functions for byte-based phys_ram_dirty bitmap
> to
> > > > > > > bit-based phys_ram_dirty bitmap, and adds more wrapper functions
> to
> > > > > > > prevent
> > > > > > > direct access to the phys_ram_dirty bitmap.
> > > > > > >
> > > > > >
> > > > > >
> > > > > > > +
> > > > > > > +static inline int
> cpu_physical_memory_get_dirty_flags(ram_addr_t addr)
> > > > > > > +{
> > > > > > > + unsigned long mask;
> > > > > > > + int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
> > > > > > > + int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1);
> > > > > > > + int ret = 0;
> > > > > > > +
> > > > > > > + mask = 1UL << offset;
> > > > > > > + if (phys_ram_vga_dirty[index] & mask)
> > > > > > > + ret |= VGA_DIRTY_FLAG;
> > > > > > > + if (phys_ram_code_dirty[index] & mask)
> > > > > > > + ret |= CODE_DIRTY_FLAG;
> > > > > > > + if (phys_ram_migration_dirty[index] & mask)
> > > > > > > + ret |= MIGRATION_DIRTY_FLAG;
> > > > > > > +
> > > > > > > + return ret;
> > > > > > > }
> > > > > > >
> > > > > > > static inline int
> cpu_physical_memory_get_dirty(ram_addr_t addr,
> > > > > > > int dirty_flags)
> > > > > > > {
> > > > > > > - return phys_ram_dirty[addr >> TARGET_PAGE_BITS] & dirty_flags;
> > > > > > > + return
> cpu_physical_memory_get_dirty_flags(addr) & dirty_flags;
> > > > > > > }
> > > > > > >
> > > > > >
> > > > > > This turns one cacheline access into three. If the dirty bitmaps
> were in
> > > > > > an array, you could do
> > > > > >
> > > > > > return dirty_bitmaps[dirty_index][addr >>
> (TARGET_PAGE_BITS +
> > > > > > BITS_IN_LONG)] & mask;
> > > > > >
> > > > > > with one cacheline access.
> > > > > >
> > > > >
> > > > > If I'm understanding the existing code correctly,
> > > > > int dirty_flags can be combined, like VGA + MIGRATION.
> > > > > If we only have to worry about a single dirty flag, I agree with
> your idea.
> > > > >
> > > >
> > > > From a quick grep it seems flags are not combined, except for
> something strange with CODE_DIRTY_FLAG:
> > > >
> > > >
> > > > >  static void notdirty_mem_writel(void *opaque,
> target_phys_addr_t ram_addr,
> > > > >uint32_t val)
> > > > > {
> > > > >int dirty_flags;
> > > > >dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
> > > > >if (!(dirty_flags & CODE_DIRTY_FLAG)) {
> > > > > #if !defined(CONFIG_USER_ONLY)
> > > > >tb_invalidate_phys_page_fast(ram_addr, 4);
> > > > >dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
> > > > > #endif
> > > > >}
> > > > >stl_p(qemu_get_ram_ptr(ram_addr), val);
> > > > >dirty_flags |= (0xff & ~CODE_DIRTY_FLAG);
> > > > >phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags;
> > > > >/* we remove the notdirty callback only if the code has been
> > > > >   flushed */
> > > > >if (dirty_flags == 0xff)
> > > > >tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr);
> > > > > }
> > > > >
> > > >
> > > > I can't say I understand what it does.
> > > >
> > >
> > > The semantics of CODE_DIRTY_FLAG are a little counter intuitive.
> CODE_DIRTY_FLAG means that we know that something isn't code so writes do
> not need checking for self modifying code.
> > >
> >
> > So the hardware equivalent is, when the Instruction TLB loads a page
> address, clear CODE_DIRTY_FLAG?
> >
>
>  Yes, and that is what tlb_protect_code() does and it's called from
> tb_alloc_page(), which is what's called when a TB is created.

Just a tangential note: a long time ago, I tried to disable self
modifying code detection for Sparc. On most RISC architectures, SMC
needs explicit flushing so in theory we need not track code memory
writes. However, during exceptions the translator needs to access the
original unmodified code that was used to generate the TB. But maybe
there are other ways to avoid SMC tracking, on x86 it's still needed
but I suppose SMC is pretty rare.


[ kvm-Bugs-2971166 ] usb passthrough fails with RDP

2010-03-16 Thread SourceForge.net
Bugs item #2971166, was opened at 2010-03-16 05:32
Message generated for change (Comment added) made by jimatjtan
You can respond by visiting: 
https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2971166&group_id=180599

Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: libkvm
Group: v1.0 (example)
Status: Open
Resolution: None
Priority: 5
Private: No
Submitted By: Peter Brooks (theneb)
Assigned to: Nobody/Anonymous (nobody)
Summary: usb passthrough fails with RDP

Initial Comment:
Base OS: Debian Etch 2.6.26-2-amd64
Processor: Intel(R) Xeon(TM) CPU 3.00GHz
QEMU 0.9.1
Virt-manager 0.6.0
kvm 72+dfsg-5

Guest OS: Windows Server 2008 standard

I've enabled USB passthrough by using virsh and editing the xml file for my vhost 
to add the attached usb device.

When using the guest OS from virt-manager, the USB device works perfectly.
When using the guest OS from RDP then the USB device fails to work.

Even when it fails to work, the device does still appear in the device manager.

The device itself is a Safenet Sentinel dongle, sadly used for software copy 
protection. It is used with a Galaxy RS security system; the software fails to 
start unless the dongle is functioning perfectly.

--

Comment By: Jim Paris (jimatjtan)
Date: 2010-03-16 15:39

Message:
Many Windows security systems specifically detect when a remote desktop
session is attached, and refuse to allow access.  Indeed, according to the
Safenet Sentinel End User Guide at
http://www2.safenet-inc.com/support/files/SafeNet_Sentinel_EndUser_Guide.pdf:

"Please Note that if you attempt to run a Sentinel Key protected
application in standalone mode via a remote client (Terminal Server,
VNC, WinXP remote client...), the software protected with Sentinel keys
will not allow this for security reasons if application is protected in
Standalone mode. You will either need to run the software while directly
logged into the machine, or need to get in touch with you software vendor
for software protected in Network mode."

So this is definitely not a libvirt or qemu or kvm problem.  You'll have
to bring this up with your DRM provider.

--

You can respond by visiting: 
https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2971166&group_id=180599


Re: [PATCH 0/5] Fix some mmu/emulator atomicity issues (v2)

2010-03-16 Thread Marcelo Tosatti
On Tue, Mar 16, 2010 at 07:22:55PM +0100, Alexander Graf wrote:
> 
> On 16.03.2010, at 17:36, Marcelo Tosatti wrote:
> 
> > On Mon, Mar 15, 2010 at 01:59:52PM +0200, Avi Kivity wrote:
> >> Currently when we emulate a locked operation into a shadowed guest page
> >> table, we perform a write rather than a true atomic.  This is indicated
> >> by the "emulating exchange as write" message that shows up in dmesg.
> >> 
> >> In addition, the pte prefetch operation during invlpg suffered from a
> >> race.  This was fixed by removing the operation.
> >> 
> >> This patchset fixes both issues and reinstates pte prefetch on invlpg.
> >> 
> >> v3:
> >>   - rebase against next branch (resolves conflicts via hypercall patch)
> >> 
> >> v2:
> >>   - fix truncated description for patch 1
> >>   - add new patch 4, which fixes a bug in patch 5
> > 
> > Applied, thanks.
> 
> How relevant is this for -stable? Races don't sound good to me :)

The race mentioned above is not existant on -stable since prefetch is
disabled for invlpg.

The atomic fixes seem like a candidate, since lack of them can trigger
pagetable corruption. Avi?



Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Ingo Molnar

* Anthony Liguori  wrote:

> On 03/16/2010 12:52 PM, Ingo Molnar wrote:
> >* Anthony Liguori  wrote:
> >
> >>On 03/16/2010 10:52 AM, Ingo Molnar wrote:
> >>>You are quite mistaken: KVM isnt really a 'random unprivileged 
> >>>application' in
> >>>this context, it is clearly an extension of system/kernel services.
> >>>
> >>>( Which can be seen from the simple fact that what started the discussion 
> >>>was
> >>>   'how do we get /proc/kallsyms from the guest'. I.e. an extension of the
> >>>   existing host-space /proc/kallsyms was desired. )
> >>Random tools (like perf) should not be able to do what you describe. It's a
> >>security nightmare.
> >A security nightmare exactly how? Mind to go into details as i dont 
> >understand
> >your point.
> 
> Assume you're using SELinux to implement mandatory access control.
> How do you label this file system?
>
> Generally speaking, we don't know the difference between /proc/kallsyms vs. 
> /dev/mem if we do generic passthrough.  While it might be safe to have a 
> relaxed label of kallsyms (since it's read only), it's clearly not safe to 
> do that for /dev/mem, /etc/shadow, or any file containing sensitive 
> information.

What's your _point_? Please outline a threat model, a vector of attack, 
_anything_ that substantiates your "it's a security nightmare" claim.

> Rather, we ought to expose a higher level interface that we have more 
> confidence in with respect to understanding the ramifications of exposing 
> that guest data.

Exactly, we want something that has a flexible namespace and works well with 
Linux tools in general. Preferably that namespace should be human readable, 
and it should be hierarchic, and it should have a well-known permission model.

This concept exists in Linux and is generally called a 'filesystem'.

> >> No way.  The guest has sensitive data and exposing it widely on the host 
> >> is a bad thing to do. [...]
> >
> > Firstly, you are putting words into my mouth, as i said nothing about 
> > 'exposing it widely'. I suggest exposing it under the privileges of 
> > whoever has access to the guest image.
> 
> That doesn't work as nicely with SELinux.
> 
> It's completely reasonable to have a user that can interact in a read only 
> mode with a VM via libvirt but cannot read the guest's disk images or the 
> guest's memory contents.

If a user cannot read the image file then the user has no access to its 
contents via other namespaces either. That is, of course, a basic security 
aspect.

( That is perfectly true with a non-SELinux Unix permission model as well, and
  is true in the SELinux case as well. )

> > Secondly, regarding confidentiality, and this is guest security 101: whoever
> > can access the image on the host _already_ has access to all the guest data!
> >
> > A Linux image can generally be loopback mounted straight away:
> >
> >   losetup -o 32256 /dev/loop0 ./guest-image.img
> >   mount -o ro /dev/loop0 /mnt-guest
> >
> >(Or, if you are an unprivileged user who cannot mount, it can be read via 
> >ext2
> >tools.)
> >
> > There's nothing the guest can do about that. The host is in total control of
> > guest image data for heaven's sake!
> 
> It's not that simple in a MAC environment.

Erm. Please explain to me, what exactly is 'not that simple' in a MAC 
environment?

Also, i'd like to note that the 'restrictive SELinux setups' usecases are 
pretty secondary.

To demonstrate that, i'd like every KVM developer on this list who reads this 
mail and who has their home development system where they produce their 
patches set up in a restrictive MAC environment, in that you cannot even read 
the images you are using, to chime in with a "I'm doing that" reply.

If there's just a _single_ KVM developer amongst dozens and dozens of 
developers on this list who develops in an environment like that i'd be 
surprised. That result should pretty much tell you where the weight of 
instrumentation focus should lie - and it isnt on restrictive MAC environments 
...

Ingo


Re: [PATCH 0/5] Fix some mmu/emulator atomicity issues (v2)

2010-03-16 Thread Alexander Graf

On 16.03.2010, at 17:36, Marcelo Tosatti wrote:

> On Mon, Mar 15, 2010 at 01:59:52PM +0200, Avi Kivity wrote:
>> Currently when we emulate a locked operation into a shadowed guest page
>> table, we perform a write rather than a true atomic.  This is indicated
>> by the "emulating exchange as write" message that shows up in dmesg.
>> 
>> In addition, the pte prefetch operation during invlpg suffered from a
>> race.  This was fixed by removing the operation.
>> 
>> This patchset fixes both issues and reinstates pte prefetch on invlpg.
>> 
>> v3:
>>   - rebase against next branch (resolves conflicts via hypercall patch)
>> 
>> v2:
>>   - fix truncated description for patch 1
>>   - add new patch 4, which fixes a bug in patch 5
> 
> Applied, thanks.

How relevant is this for -stable? Races don't sound good to me :)


Alex


Corrupt qcow2 image, recovery?

2010-03-16 Thread Christian Nilsson
Hi!

I'm running kvm / qemu-kvm on a couple of production servers, and everything (or at 
least most things) works as it should.
However, today someone thought it was a "good" idea to restart one of the 
servers, and after that the windows 2k3 guest on that server doesn't boot anymore.

kvm on this server is a bit "outdated": "QEMU PC emulator version 0.9.1 
(kvm-83)"
(I guess this is one of the qcow2 corruption bugs, and i can only blame myself 
for not upgrading kvm sooner.)
The guest.qcow2 is a 21GiB file for a 60GiB disk

i have tried a couple of things:

kvm-img convert -f qcow2 -O raw guest.qcow2 guest.raw

this stops and does nothing after creating a guest.raw that is 60GiB but only 
uses 60MiB

so I mounted the fs from another server running "QEMU PC emulator version 0.12.1 
(qemu-kvm-0.12.1.2)"

and ran qemu-img with the same options as above, and after a few secs got 
"qemu-img: error while reading"
and the same 60MiB used by guest.raw

i also tried booting qemu-kvm with a linux guest and this qcow2 image, but only 
got I/O errors (and no partitions found)

# qemu-img check guest.qcow2
ERROR: invalid cluster offset=0x10a000  
ERROR OFLAG_COPIED: l2_offset=ee73 refcount=1   
ERROR l2_offset=ee73: Table is not cluster aligned; L1 entry corrupted
ERROR: invalid cluster offset=0x11d44100080   
ERROR: invalid cluster offset=0x11d61600080   
ERROR: invalid cluster offset=0x11d68600080   
ERROR: invalid cluster offset=0x11d95300080
(and a lot more in this style; the full log can be provided if it 
would be of help to anybody)



is there any possibility to repair this file, or to convert it to a RAW file (even 
with the parts of the qcow2 image that are not "safe" padded), or, as a last 
resort, are there any debug tools for qcow2 images that might be of use?

I have read up on the qcow file format, but right now i'm a bit short of time (i 
need the data in this guest's disk image, or at least the MS SQL datafiles that 
are on this disk). i have also checked the qcow2 file and it does contain an NTLDR 
string and a lot of other NTFS-recognizable strings, so i know that all the data 
is not gone. The question is how i can access it as a filesystem again.


Any help would be appreciated!

Regards
Christian Nilsson


Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Anthony Liguori

On 03/16/2010 12:52 PM, Ingo Molnar wrote:

* Anthony Liguori  wrote:

   

On 03/16/2010 10:52 AM, Ingo Molnar wrote:
 

You are quite mistaken: KVM isnt really a 'random unprivileged application' in
this context, it is clearly an extension of system/kernel services.

( Which can be seen from the simple fact that what started the discussion was
   'how do we get /proc/kallsyms from the guest'. I.e. an extension of the
   existing host-space /proc/kallsyms was desired. )
   

Random tools (like perf) should not be able to do what you describe. It's a
security nightmare.
 

A security nightmare exactly how? Mind to go into details as i dont understand
your point.
   


Assume you're using SELinux to implement mandatory access control.  How 
do you label this file system?


Generally speaking, we don't know the difference between /proc/kallsyms 
vs. /dev/mem if we do generic passthrough.  While it might be safe to 
have a relaxed label of kallsyms (since it's read only), it's clearly 
not safe to do that for /dev/mem, /etc/shadow, or any file containing 
sensitive information.


Rather, we ought to expose a higher level interface that we have more 
confidence in with respect to understanding the ramifications of 
exposing that guest data.




No way.  The guest has sensitive data and exposing it widely on the host is
a bad thing to do. [...]
 

Firstly, you are putting words into my mouth, as i said nothing about
'exposing it widely'. I suggest exposing it under the privileges of whoever
has access to the guest image.
   


That doesn't work as nicely with SELinux.

It's completely reasonable to have a user that can interact in a read 
only mode with a VM via libvirt but cannot read the guest's disk images 
or the guest's memory contents.



Secondly, regarding confidentiality, and this is guest security 101: whoever
can access the image on the host _already_ has access to all the guest data!

A Linux image can generally be loopback mounted straight away:

   losetup -o 32256 /dev/loop0 ./guest-image.img
   mount -o ro /dev/loop0 /mnt-guest

(Or, if you are an unprivileged user who cannot mount, it can be read via ext2
tools.)

There's nothing the guest can do about that. The host is in total control of
guest image data for heaven's sake!
   


It's not that simple in a MAC environment.

Regards,

Anthony Liguori


All i'm suggesting is to make what is already possible more convenient.

Ingo
   




Re: [PATCH 0/5] Fix some mmu/emulator atomicity issues (v2)

2010-03-16 Thread Marcelo Tosatti
On Mon, Mar 15, 2010 at 01:59:52PM +0200, Avi Kivity wrote:
> Currently when we emulate a locked operation into a shadowed guest page
> table, we perform a write rather than a true atomic.  This is indicated
> by the "emulating exchange as write" message that shows up in dmesg.
> 
> In addition, the pte prefetch operation during invlpg suffered from a
> race.  This was fixed by removing the operation.
> 
> This patchset fixes both issues and reinstates pte prefetch on invlpg.
> 
> v3:
>- rebase against next branch (resolves conflicts via hypercall patch)
> 
> v2:
>- fix truncated description for patch 1
>- add new patch 4, which fixes a bug in patch 5

Applied, thanks.



Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Ingo Molnar

* Anthony Liguori  wrote:

> On 03/16/2010 10:52 AM, Ingo Molnar wrote:
> >You are quite mistaken: KVM isnt really a 'random unprivileged application' 
> >in
> >this context, it is clearly an extension of system/kernel services.
> >
> >( Which can be seen from the simple fact that what started the discussion was
> >   'how do we get /proc/kallsyms from the guest'. I.e. an extension of the
> >   existing host-space /proc/kallsyms was desired. )
> 
> Random tools (like perf) should not be able to do what you describe. It's a 
> security nightmare.

A security nightmare exactly how? Mind to go into details as i dont understand 
your point.

> If it's desirable to have /proc/kallsyms available, we can expose an 
> interface in QEMU to provide that.  That can then be plumbed through libvirt 
> and QMP.
> 
> Then a management tool can use libvirt or QMP to obtain that information and 
> interact with the kernel appropriately.
> 
> > In that sense the most natural 'extension' would be the solution i 
> > mentioned a week or two ago: to have a (read only) mount of all guest 
> > filesystems, plus a channel for profiling/tracing data. That would make 
> > symbol parsing easier and it's what extends the existing 'host space' 
> > abstraction in the most natural way.
> >
> > ( It doesnt even have to be done via the kernel - Qemu could implement that
> >   via FUSE for example. )
> 
> No way.  The guest has sensitive data and exposing it widely on the host is 
> a bad thing to do. [...]

Firstly, you are putting words into my mouth, as i said nothing about 
'exposing it widely'. I suggest exposing it under the privileges of whoever 
has access to the guest image.

Secondly, regarding confidentiality, and this is guest security 101: whoever 
can access the image on the host _already_ has access to all the guest data!

A Linux image can generally be loopback mounted straight away:

  losetup -o 32256 /dev/loop0 ./guest-image.img
  mount -o ro /dev/loop0 /mnt-guest

(Or, if you are an unprivileged user who cannot mount, it can be read via ext2 
tools.)

There's nothing the guest can do about that. The host is in total control of 
guest image data for heaven's sake!

All i'm suggesting is to make what is already possible more convenient.

Ingo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Ingo Molnar

* Anthony Liguori  wrote:

> On 03/16/2010 08:08 AM, Ingo Molnar wrote:
> >* Avi Kivity  wrote:
> >
> >>On 03/16/2010 02:29 PM, Ingo Molnar wrote:
> >>>I mean, i can trust a kernel service and i can trust /proc/kallsyms.
> >>>
> >>>Can perf trust a random process claiming to be Qemu? What's the trust
> >>>mechanism here?
> >>Obviously you can't trust anything you get from a guest, no matter how you
> >>get it.
> >I'm not talking about the symbol strings and addresses, and the object
> >contents for allocation (or debuginfo). I'm talking about the basic protocol
> >of establishing which guest is which.
> >
> >I.e. we really want to enable users to:
> >
> >  1) have it all working with a single guest, without having to specify 'which'
> >     guest (qemu PID) to work with. That is the dominant usecase both for
> >     developers and for a fair portion of testers.
> 
> You're making too many assumptions.
> 
> There is no list of guests any more than there is a list of web browsers.
> 
> You can have a multi-tenant scenario where you have distinct groups of 
> virtual machines running as unprivileged users.

"multi-tenant" and groups is not a valid excuse at all for giving crappy 
technology in the simplest case: when there's a single VM. Yes, eventually it 
can be supported and any sane scheme will naturally support it too, but it's 
by no means what we care about primarily when it comes to these tools.

I thought everyone learned the lesson behind SystemTap's failure (and to a 
certain degree this was behind Oprofile's failure as well): when it comes to 
tooling/instrumentation we dont want to concentrate on the fancy complex 
setups and abstract requirements drawn up by CIOs, as development isnt being 
done there. Concentrate on our developers today, and provide no-compromises 
usability to those who contribute stuff.

If we dont help make the simplest (and most common) use-case convenient then 
we are failing on a fundamental level.

> >  2) Have some reasonable symbolic identification for guests. For example a
> > usable approach would be to have 'perf kvm list', which would list all
> > currently active guests:
> >
> >  $ perf kvm list
> >[1] Fedora
> >[2] OpenSuse
> >[3] Windows-XP
> >[4] Windows-7
> >
> > And from that point on 'perf kvm -g OpenSuse record' would do the obvious
> > thing. Users will be able to just use the 'OpenSuse' symbolic name for
> > that guest, even if the guest got restarted and switched its main PID.
> 
> Does "perf kvm list" always run as root?  What if two unprivileged users 
> both have a VM named "Fedora"?

Again, the single-VM case is the most important case, by far. If you have 
multiple VMs running and want to develop the kernel on multiple VMs (sounds 
rather messy if you think it through ...), what would happen is similar to 
what happens when we have two probes for example:

 # perf probe schedule
 Added new event:
   probe:schedule   (on schedule+0)

 You can now use it on all perf tools, such as:

perf record -e probe:schedule -a sleep 1

 # perf probe -f schedule   
 Added new event:
   probe:schedule_1 (on schedule+0)

 You can now use it on all perf tools, such as:

perf record -e probe:schedule_1 -a sleep 1

 # perf probe -f schedule
 Added new event:
   probe:schedule_2 (on schedule+0)

 You can now use it on all perf tools, such as:

perf record -e probe:schedule_2 -a sleep 1

Something similar could be used for KVM/Qemu: whichever got created first is 
named 'Fedora', the second is named 'Fedora-2'.
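
A rough sketch of that first-come naming (illustrative only: the
register_guest_name() helper and its fixed-size table are hypothetical,
not existing perf code):

  #include <stdio.h>
  #include <string.h>

  #define MAX_GUESTS 64

  static char guest_names[MAX_GUESTS][64];
  static int nr_guests;

  /* Return a unique display name: "Fedora", then "Fedora-2", "Fedora-3", ...
     No overflow handling - this is only a sketch. */
  static const char *register_guest_name(const char *base)
  {
      char candidate[64];
      int i, suffix = 1;

      snprintf(candidate, sizeof(candidate), "%s", base);
      for (i = 0; i < nr_guests; i++) {
          if (strcmp(guest_names[i], candidate) == 0) {
              /* name taken: try the next numeric suffix and rescan */
              snprintf(candidate, sizeof(candidate), "%s-%d", base, ++suffix);
              i = -1;
          }
      }
      snprintf(guest_names[nr_guests], sizeof(guest_names[0]), "%s", candidate);
      return guest_names[nr_guests++];
  }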

> If we look at the use-case, it's going to be something like, a user is 
> creating virtual machines and wants to get performance information about 
> them.
> 
> Having to run a separate tool like perf is not going to be what they would 
> expect they had to do.  Instead, they would either use their existing GUI 
> tool (like virt-manager) or they would use their management interface 
> (either QMP or libvirt).
> 
> The complexity of interaction is due to the fact that perf shouldn't be a 
> stand alone tool.  It should be a library or something with a programmatic 
> interface that another tool can make use of.

But ... a GUI interface/integration is of course possible too, and it's being 
worked on.

perf is mainly a kernel developer tool, and kernel developers generally dont 
use GUIs to do their stuff: which is the (sole) reason why its first ~850 
commits of tools/perf/ were done without a GUI. We go where our developers 
are.

In any case it's not an excuse to have no proper command-line tooling. In fact 
if you cannot get simpler, more atomic command-line tooling right then you'll 
probably doubly suck at doing a GUI as well.

Ingo

Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Anthony Liguori

On 03/16/2010 10:52 AM, Ingo Molnar wrote:
> You are quite mistaken: KVM isnt really a 'random unprivileged application' in
> this context, it is clearly an extension of system/kernel services.
>
> ( Which can be seen from the simple fact that what started the discussion was
>   'how do we get /proc/kallsyms from the guest'. I.e. an extension of the
>   existing host-space /proc/kallsyms was desired. )

Random tools (like perf) should not be able to do what you describe.
It's a security nightmare.

If it's desirable to have /proc/kallsyms available, we can expose an
interface in QEMU to provide that.  That can then be plumbed through
libvirt and QMP.

Then a management tool can use libvirt or QMP to obtain that information
and interact with the kernel appropriately.

> In that sense the most natural 'extension' would be the solution i mentioned a
> week or two ago: to have a (read only) mount of all guest filesystems, plus a
> channel for profiling/tracing data. That would make symbol parsing easier and
> it's what extends the existing 'host space' abstraction in the most natural
> way.
>
> ( It doesnt even have to be done via the kernel - Qemu could implement that
>   via FUSE for example. )

No way.  The guest has sensitive data and exposing it widely on the host
is a bad thing to do.  It's a bad interface.  We can expose specific
information about guests but only through our existing channels which
are validated through a security infrastructure.

Ultimately, your goal is to keep perf a simple tool with few
dependencies.  But practically speaking, if you want to add features to
it, it's going to have to interact with other subsystems in the
appropriate way.  That means, it's going to need to interact with
libvirt or QMP.

If you want all applications to expose their data via synthetic file
systems, then there's always plan9 :-)


Regards,

Anthony Liguori


Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Anthony Liguori

On 03/16/2010 08:08 AM, Ingo Molnar wrote:
> * Avi Kivity  wrote:
>
>> On 03/16/2010 02:29 PM, Ingo Molnar wrote:
>>> I mean, i can trust a kernel service and i can trust /proc/kallsyms.
>>>
>>> Can perf trust a random process claiming to be Qemu? What's the trust
>>> mechanism here?
>>
>> Obviously you can't trust anything you get from a guest, no matter how you
>> get it.
>
> I'm not talking about the symbol strings and addresses, and the object
> contents for allocation (or debuginfo). I'm talking about the basic protocol
> of establishing which guest is which.
>
> I.e. we really want to enable users to:
>
>   1) have it all working with a single guest, without having to specify 'which'
>      guest (qemu PID) to work with. That is the dominant usecase both for
>      developers and for a fair portion of testers.

You're making too many assumptions.

There is no list of guests any more than there is a list of web browsers.

You can have a multi-tenant scenario where you have distinct groups of
virtual machines running as unprivileged users.

>   2) Have some reasonable symbolic identification for guests. For example a
>      usable approach would be to have 'perf kvm list', which would list all
>      currently active guests:
>
>       $ perf kvm list
>         [1] Fedora
>         [2] OpenSuse
>         [3] Windows-XP
>         [4] Windows-7
>
>      And from that point on 'perf kvm -g OpenSuse record' would do the obvious
>      thing. Users will be able to just use the 'OpenSuse' symbolic name for
>      that guest, even if the guest got restarted and switched its main PID.

Does "perf kvm list" always run as root?  What if two unprivileged users
both have a VM named "Fedora"?

If we look at the use-case, it's going to be something like, a user is
creating virtual machines and wants to get performance information about
them.

Having to run a separate tool like perf is not going to be what they
would expect they had to do.  Instead, they would either use their
existing GUI tool (like virt-manager) or they would use their management
interface (either QMP or libvirt).

The complexity of interaction is due to the fact that perf shouldn't be
a stand alone tool.  It should be a library or something with a
programmatic interface that another tool can make use of.

Regards,

Anthony Liguori

> Is such a scheme possible/available? I suspect all the KVM configuration tools
> (i havent used them in some time - gui and command-line tools alike) use
> similar methods to ease guest management?
>
> Ingo




Re: [Qemu-devel] Ideas wiki for GSoC 2010

2010-03-16 Thread Avi Kivity

On 03/16/2010 06:33 PM, Jason wrote:
> Avi Kivity  redhat.com>  writes:
>> On 03/16/2010 02:20 AM, Jason wrote:
>>> In comparing KVM 2.6.31.6b to XenServer 5.5.0, it seems KVM has fewer overall
>>> VMREADs and VMWRITEs, but there are a lot of VMWRITEs to Host FS_SEL, Host
>>> GS_SEL, Host FS_BASE, and Host GS_BASE that don't appear in Xen.
>>
>> Ugh, these should definitely be eliminated, they keep writing the same
>> value most of the time.
>
> Sorry, I forgot to mention it would also be nice to reduce the unconditional
> writes to Host CR0 if the TS issue can still be dealt with.

I have some ideas on how to deal with that.

--
error compiling committee.c: too many arguments to function



Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Ingo Molnar

* Frank Ch. Eigler  wrote:

> Hi -
> 
> On Tue, Mar 16, 2010 at 04:52:21PM +0100, Ingo Molnar wrote:
> > [...]
> > > Perhaps the fact that kvm happens to deal with an interesting application 
> > > area (virtualization) is misleading here.  As far as the host kernel or 
> > > other host userspace is concerned, qemu is just some random unprivileged 
> > > userspace program [...]
> 
> > You are quite mistaken: KVM isnt really a 'random unprivileged
> > application' in this context, it is clearly an extension of
> > system/kernel services.
> 
> I don't know what "extension of system/kernel services" means in this 
> context, beyond something running on the system/kernel, like every other 
> process. [...]

It means something like my example of 'extended to guest space' 
/proc/kallsyms:

> > [...]
> >
> > ( Which can be seen from the simple fact that what started the
> >   discussion was 'how do we get /proc/kallsyms from the guest'. I.e. an 
> >   extension of the existing host-space /proc/kallsyms was desired. )
> 
> (Sorry, that smacks of circular reasoning.)

To me it sounds like an example supporting my point. /proc/kallsyms is a 
service by the kernel, and 'perf kvm' desires this to be extended to guest 
space as well.

Thanks,

Ingo


Re: [Qemu-devel] Ideas wiki for GSoC 2010

2010-03-16 Thread Jason
Avi Kivity  redhat.com> writes:

> 
> On 03/16/2010 02:20 AM, Jason wrote:
> >
> > In comparing KVM 2.6.31.6b to XenServer 5.5.0, it seems KVM has fewer 
> > overall
> > VMREADs and VMWRITEs, but there are a lot of VMWRITEs to Host FS_SEL, Host
> > GS_SEL, Host FS_BASE, and Host GS_BASE that don't appear in Xen.
> 
> Ugh, these should definitely be eliminated, they keep writing the same 
> value most of the time.
> 

Sorry, I forgot to mention it would also be nice to reduce the unconditional
writes to Host CR0 if the TS issue can still be dealt with.



Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Frank Ch. Eigler
Hi -

On Tue, Mar 16, 2010 at 04:52:21PM +0100, Ingo Molnar wrote:
> [...]
> > Perhaps the fact that kvm happens to deal with an interesting application 
> > area (virtualization) is misleading here.  As far as the host kernel or 
> > other host userspace is concerned, qemu is just some random unprivileged 
> > userspace program [...]

> You are quite mistaken: KVM isnt really a 'random unprivileged
> application' in this context, it is clearly an extension of
> system/kernel services.

I don't know what "extension of system/kernel services" means in this
context, beyond something running on the system/kernel, like every
other process.  To clarify, to what extent do you consider your
classification similarly clear for a host is running

* multiple kvm instances run as unprivileged users
* non-kvm OS simulators such as vmware or xen or gdb
* kvm instances running something other than linux

> ( Which can be seen from the simple fact that what started the
> discussion was 'how do we get /proc/kallsyms from the
> guest'. I.e. an extension of the existing host-space /proc/kallsyms
> was desired. )

(Sorry, that smacks of circular reasoning.)

It may be a charming convenience function for perf users to give them
shortcuts for certain favoured configurations (kvm running freshest
linux), but that says more about perf than kvm.


- FChE


Re: [PATCH][RF C/T/D] Unmapped page cache control - via boot parameter

2010-03-16 Thread Avi Kivity

On 03/16/2010 04:27 PM, Balbir Singh wrote:
>> Let's assume the guest has virtio (I agree with IDE we need
>> reordering on the host).  The guest sends batches of I/O separated
>> by cache flushes.  If the batches are smaller than the virtio queue
>> length, ideally things look like:
>>
>>   io_submit(..., batch_size_1);
>>   io_getevents(..., batch_size_1);
>>   fdatasync();
>>   io_submit(..., batch_size_2);
>>   io_getevents(..., batch_size_2);
>>   fdatasync();
>>   io_submit(..., batch_size_3);
>>   io_getevents(..., batch_size_3);
>>   fdatasync();
>>
>> (certainly that won't happen today, but it could in principle).
>>
>> How does a write cache give any advantage?  The host kernel sees
>> _exactly_ the same information as it would from a bunch of threaded
>> pwritev()s followed by fdatasync().
>
> Are you suggesting that the model with cache=writeback gives us the
> same I/O pattern as cache=none, so there are no opportunities for
> optimization?

Yes.  The guest also has a large cache with the same optimization algorithm.

>> (wish: IO_CMD_ORDERED_FDATASYNC)
>>
>> If the batch size is larger than the virtio queue size, or if there
>> are no flushes at all, then yes the huge write cache gives more
>> opportunity for reordering.  But we're already talking hundreds of
>> requests here.
>>
>> Let's say the virtio queue size was unlimited.  What
>> merging/reordering opportunity are we missing on the host?  Again we
>> have exactly the same information: either the pagecache lru + radix
>> tree that identifies all dirty pages in disk order, or the block
>> queue with pending requests that contains exactly the same
>> information.
>>
>> Something is wrong.  Maybe it's my understanding, but on the other
>> hand it may be a piece of kernel code.
>
> I assume you are talking of dedicated disk partitions and not
> individual disk images residing on the same partition.

Correct.  Images in files introduce new writes which can be optimized.

--
error compiling committee.c: too many arguments to function

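The batch-then-flush pattern in the exchange above maps onto the Linux
libaio interface roughly as follows. This is a minimal host-side sketch
(error handling elided; write_batch() is an illustrative helper, not qemu
code); the point is that each fdatasync() hands the host kernel the same
ordering information as a guest cache flush:

  #include <libaio.h>     /* io_setup, io_submit, io_getevents */
  #include <unistd.h>     /* fdatasync */

  /* Submit one batch of prepared writes, wait for all completions, then
     flush - equivalent in ordering information to a bunch of threaded
     pwritev()s followed by fdatasync(). */
  static int write_batch(io_context_t ctx, int fd, struct iocb **cbs, int nr)
  {
      struct io_event events[nr];

      if (io_submit(ctx, nr, cbs) != nr)
          return -1;
      if (io_getevents(ctx, nr, nr, events, NULL) != nr)
          return -1;
      return fdatasync(fd);   /* the cache flush separating batches */
  }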


Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Ingo Molnar

* Frank Ch. Eigler  wrote:

> 
> Ingo Molnar  writes:
> 
> > [...]
> >> >I.e. we really want to enable users to:
> >> >
> >> >  1) have it all working with a single guest, without having to specify 'which'
> >> >     guest (qemu PID) to work with. That is the dominant usecase both for
> >> >     developers and for a fair portion of testers.
> >> 
> >> That's reasonable if we can get it working simply.
> >
> > IMO such ease of use is reasonable and required, full stop.
> > If it cannot be gotten simply then that's a bug: either in the code, or in
> > the design, or in the development process that led to the design. Bugs need
> > fixing. [...]
> 
> Perhaps the fact that kvm happens to deal with an interesting application 
> area (virtualization) is misleading here.  As far as the host kernel or 
> other host userspace is concerned, qemu is just some random unprivileged 
> userspace program (with some *optional* /dev/kvm services that might happen 
> to require temporary root).
> 
> As such, perf trying to instrument qemu is no different than perf trying to 
> instrument any other userspace widget.  Therefore, expecting 'trusted 
> enumeration' of instances is just as sensible as using 'trusted ps' and 
> 'trusted /var/run/FOO.pid files'.

You are quite mistaken: KVM isnt really a 'random unprivileged application' in 
this context, it is clearly an extension of system/kernel services.

( Which can be seen from the simple fact that what started the discussion was 
  'how do we get /proc/kallsyms from the guest'. I.e. an extension of the 
  existing host-space /proc/kallsyms was desired. )

In that sense the most natural 'extension' would be the solution i mentioned a 
week or two ago: to have a (read only) mount of all guest filesystems, plus a 
channel for profiling/tracing data. That would make symbol parsing easier and 
it's what extends the existing 'host space' abstraction in the most natural 
way.

( It doesnt even have to be done via the kernel - Qemu could implement that
  via FUSE for example. )

As a second best option a 'symbol server' might be used too.

Thanks,

Ingo


Re: KVM call agenda for Mar 16

2010-03-16 Thread Anthony Liguori

On 03/16/2010 10:23 AM, Daniel P. Berrange wrote:
> In the context of the RHEV management application, iSCSI/SCSI Fibre are
> providing the raw storage, with LVM VGs on top and then carving LVs for
> the guests. In the common case the admin/app would monitor VG usage & LV
> rate of increase to ensure extra space was available in the VG ahead of
> it being needed. eg if the VG comes close to exhaustion then further LUNS
> can be obtained and added as PVs to the LVM volume group. So you can't
> guarantee that a VM won't stop on ENOSPC, but it is very unlikely if the
> system is operating correctly.
>
> As an added complication, since cluster-LVM isn't used, all LVM operations
> have to be performed on a dedicated/exclusive storage host and then metadata
> refreshed/propagated to other hosts running VMs. This last issue implies that
> letting QEMU resize its LV would never be possible, even if it were not for
> the permissions problem.

Sounds like a good argument for polling :-)

Regards,

Anthony Liguori

> Regards,
> Daniel




Re: Broken loadvm ?

2010-03-16 Thread Alpár Török
PS: It just occurred to me that it does indeed freeze and cause 100%
CPU usage. At least i can say for sure that neither network, serial line,
keyboard, nor mouse works if loadvm is given on the command line.
If loaded from the monitor, everything seems to work except the
mouse.  After a -loadvm from the command line, repeating the command
from the monitor doesn't unfreeze it.

i am really stuck with this. Any help is greatly appreciated, as
downgrading is not an option.


Re: KVM call agenda for Mar 16

2010-03-16 Thread Daniel P. Berrange
On Tue, Mar 16, 2010 at 10:05:40AM -0500, Anthony Liguori wrote:
> On 03/16/2010 05:45 AM, Daniel P. Berrange wrote:
> >On Tue, Mar 16, 2010 at 12:38:02PM +0200, Avi Kivity wrote:
> >>On 03/16/2010 12:31 PM, Daniel P. Berrange wrote:
> >>>>Polling loops are an indication that something is wrong.
> >>>Except when people suggest they are the right answer, qcow high
> >>>watermark ;-P
> >>I liked Anthony's suggestion of an lvm2 block format driver.  No polling.
> >Doesn't that require giving QEMU privileges to perform LVM operations which
> >implies QEMU having CAP_SYS_ADMIN  ?
> 
> If QEMU is able to resize an LVM partition, it needs to carry privileges.
> 
> I'm not sure how this can be done safely in a lesser privileged 
> environment.  Presumably, you're over committing storage and there's not 
> much you can do if the guests exhaust their storage all at once.

In the context of the RHEV management application, iSCSI/SCSI Fibre are
providing the raw storage, with LVM VGs on top and then carving LVs for
the guests. In the common case the admin/app would monitor VG usage & LV
rate of increase to ensure extra space was available in the VG ahead of
it being needed. eg if the VG comes close to exhaustion then further LUNS
can be obtained and added as PVs to the LVM volume group. So you can't
guarantee that a VM won't stop on ENOSPC, but it is very unlikely if the
system is operating correctly.

As an added complication, since cluster-LVM isn't used, all LVM operations
have to be performed on a dedicated/exclusive storage host and then metadata
refreshed/propagated to other hosts running VMs. This last issue implies that 
letting QEMU resize its LV would never be possible, even if it were not for
the permissions problem.

Regards,
Daniel
-- 
|: Red Hat, Engineering, London-o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :|
|: http://autobuild.org-o- http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-   F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|


Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Frank Ch. Eigler

Ingo Molnar  writes:

> [...]
>> >I.e. we really want to enable users to:
>> >
>> >  1) have it all working with a single guest, without having to specify 'which'
>> >     guest (qemu PID) to work with. That is the dominant usecase both for
>> >     developers and for a fair portion of testers.
>> 
>> That's reasonable if we can get it working simply.
>
> IMO such ease of use is reasonable and required, full stop.
> If it cannot be gotten simply then that's a bug: either in the code, or in 
> the 
> design, or in the development process that led to the design. Bugs need 
> fixing. [...]

Perhaps the fact that kvm happens to deal with an interesting
application area (virtualization) is misleading here.  As far as the
host kernel or other host userspace is concerned, qemu is just some
random unprivileged userspace program (with some *optional* /dev/kvm
services that might happen to require temporary root).

As such, perf trying to instrument qemu is no different than perf
trying to instrument any other userspace widget.  Therefore, expecting
'trusted enumeration' of instances is just as sensible as using
'trusted ps' and 'trusted /var/run/FOO.pid files'.


- FChE


Re: KVM call agenda for Mar 16

2010-03-16 Thread Anthony Liguori

On 03/16/2010 05:45 AM, Daniel P. Berrange wrote:
> On Tue, Mar 16, 2010 at 12:38:02PM +0200, Avi Kivity wrote:
>> On 03/16/2010 12:31 PM, Daniel P. Berrange wrote:
>>>> Polling loops are an indication that something is wrong.
>>> Except when people suggest they are the right answer, qcow high
>>> watermark ;-P
>> I liked Anthony's suggestion of an lvm2 block format driver.  No polling.
>
> Doesn't that require giving QEMU privileges to perform LVM operations which
> implies QEMU having CAP_SYS_ADMIN  ?

If QEMU is able to resize an LVM partition, it needs to carry privileges.

I'm not sure how this can be done safely in a lesser privileged
environment.  Presumably, you're over committing storage and there's not
much you can do if the guests exhaust their storage all at once.

Regards,

Anthony Liguori

> Daniel




Broken loadvm ?

2010-03-16 Thread Alpár Török
KVM module:

modinfo kvm :
filename:   /lib/modules/2.6.33-30-default/kernel/arch/x86/kvm/kvm.ko
license:GPL
author: Qumranet
srcversion: 611CD56E712705875D0E7BB
depends:
vermagic:   2.6.33-30-default SMP mod_unload modversions
parm:   oos_shadow:bool
parm:   ignore_msrs:bool

qemu-kvm :

(qemu) info version
0.11.0 (qemu-kvm-0.11.0-4.5.2)

command line :

qemu-kvm -drive
file=/srv/pvmd/vm/templates/XP_32_SP3/disk.qcow2,cache=writeback,index=0,format=qcow2,boot=on
-vnc :10 -monitor stdio -net nic,model=e1000,macaddr=52:54:00:12:34:00
  -net tap,ifname=tap0,script=no -cpu core2duo -m 512

I have a freshly installed win xp. I was trying to create a snapshot
with savevm from the monitor. With cache=writeback performance was
good, but it *seemed* that after a loadvm (from monitor or command
line) the VM was frozen. I played with it a couple of hours before
realizing that in fact, it's only the mouse that doesn't work after a
loadvm (through vnc, the system is remote). At least i hope that every
other device works as expected. If the loadvm happens in the same
session as savevm everything works as expected. I also tested on a
2.6.27.39 with kvm-88 and it all works ok.

Is this a bug?


Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.

2010-03-16 Thread Anthony Liguori

On 03/16/2010 08:57 AM, Avi Kivity wrote:
> On 03/16/2010 03:51 PM, Anthony Liguori wrote:
>> On 03/16/2010 08:29 AM, Avi Kivity wrote:
>>> On 03/16/2010 03:17 PM, Yoshiaki Tamura wrote:
>>>> Avi Kivity wrote:
>>>>> On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote:
>>>>>> Modifies wrapper functions for byte-based phys_ram_dirty bitmap to
>>>>>> bit-based phys_ram_dirty bitmap, and adds more wrapper functions to
>>>>>> prevent direct access to the phys_ram_dirty bitmap.
>>>>>>
>>>>>> +
>>>>>> +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr)
>>>>>> +{
>>>>>> +    unsigned long mask;
>>>>>> +    int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
>>>>>> +    int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1);
>>>>>> +    int ret = 0;
>>>>>> +
>>>>>> +    mask = 1UL << offset;
>>>>>> +    if (phys_ram_vga_dirty[index] & mask)
>>>>>> +        ret |= VGA_DIRTY_FLAG;
>>>>>> +    if (phys_ram_code_dirty[index] & mask)
>>>>>> +        ret |= CODE_DIRTY_FLAG;
>>>>>> +    if (phys_ram_migration_dirty[index] & mask)
>>>>>> +        ret |= MIGRATION_DIRTY_FLAG;
>>>>>> +
>>>>>> +    return ret;
>>>>>> +}
>>>>>>
>>>>>>  static inline int cpu_physical_memory_get_dirty(ram_addr_t addr,
>>>>>>                                                  int dirty_flags)
>>>>>>  {
>>>>>> -    return phys_ram_dirty[addr >> TARGET_PAGE_BITS] & dirty_flags;
>>>>>> +    return cpu_physical_memory_get_dirty_flags(addr) & dirty_flags;
>>>>>>  }
>>>>>
>>>>> This turns one cacheline access into three. If the dirty bitmaps were in
>>>>> an array, you could do
>>>>>
>>>>>     return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS +
>>>>>         BITS_IN_LONG)] & mask;
>>>>>
>>>>> with one cacheline access.
>>>>
>>>> If I'm understanding the existing code correctly,
>>>> int dirty_flags can be combined, like VGA + MIGRATION.
>>>> If we only have to worry about a single dirty flag, I agree with
>>>> your idea.
>>>
>>> From a quick grep it seems flags are not combined, except for
>>> something strange with CODE_DIRTY_FLAG:
>>>
>>> static void notdirty_mem_writel(void *opaque, target_phys_addr_t ram_addr,
>>>                                 uint32_t val)
>>> {
>>>     int dirty_flags;
>>>     dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
>>>     if (!(dirty_flags & CODE_DIRTY_FLAG)) {
>>> #if !defined(CONFIG_USER_ONLY)
>>>         tb_invalidate_phys_page_fast(ram_addr, 4);
>>>         dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
>>> #endif
>>>     }
>>>     stl_p(qemu_get_ram_ptr(ram_addr), val);
>>>     dirty_flags |= (0xff & ~CODE_DIRTY_FLAG);
>>>     phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags;
>>>     /* we remove the notdirty callback only if the code has been
>>>        flushed */
>>>     if (dirty_flags == 0xff)
>>>         tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr);
>>> }
>>>
>>> I can't say I understand what it does.
>>
>> The semantics of CODE_DIRTY_FLAG are a little counter intuitive.
>> CODE_DIRTY_FLAG means that we know that something isn't code so
>> writes do not need checking for self modifying code.
>
> So the hardware equivalent is, when the Instruction TLB loads a page
> address, clear CODE_DIRTY_FLAG?

Yes, and that is what tlb_protect_code() does; it's called from
tb_alloc_page(), which is what's called when a TB is created.

Regards,

Anthony Liguori


Re: [PATCH][RF C/T/D] Unmapped page cache control - via boot parameter

2010-03-16 Thread Balbir Singh
* Avi Kivity  [2010-03-16 13:08:28]:

> On 03/16/2010 12:44 PM, Christoph Hellwig wrote:
> >On Tue, Mar 16, 2010 at 12:36:31PM +0200, Avi Kivity wrote:
> >>Are you talking about direct volume access or qcow2?
> >Doesn't matter.
> >
> >>For direct volume access, I still don't get it.  The number of barriers
> >>issues by the host must equal (or exceed, but that's pointless) the
> >>number of barriers issued by the guest.  cache=writeback allows the host
> >>to reorder writes, but so does cache=none.  Where does the difference
> >>come from?
> >>
> >>Put it another way.  In an unvirtualized environment, if you implement a
> >>write cache in a storage driver (not device), and sync it on a barrier
> >>request, would you expect to see a performance improvement?
> >cache=none only allows very limited reordering in the host.  O_DIRECT
> >is synchronous on the host, so there's just some very limited reordering
> >going on in the elevator if we have other I/O going on in parallel.
> 
> Presumably there is lots of I/O going on, or we wouldn't be having
> this conversation.
>

We are speaking of multiple VM's doing I/O in parallel.
 
> >In addition to that the disk writecache can perform limited reordering
> >and caching, but the disk cache has a rather limited size.  The host
> >pagecache gives a much wider opportunity to reorder, especially if
> >the guest workload is not cache flush heavy.  If the guest workload
> >is extremely cache flush heavy the usefulness of the pagecache is rather
> >limited, as we'll only use very little of it, but pay by having to do
> >a data copy.  If the workload is not cache flush heavy, and we have
> >multiple guests doing I/O to the same spindles it will allow the host
> >to do much more efficient data writeout by being able to do better
> >ordered (less seeky) and bigger I/O (especially if the host has real
> >storage compared to ide for the guest).
> 
> Let's assume the guest has virtio (I agree with IDE we need
> reordering on the host).  The guest sends batches of I/O separated
> by cache flushes.  If the batches are smaller than the virtio queue
> length, ideally things look like:
> 
>  io_submit(..., batch_size_1);
>  io_getevents(..., batch_size_1);
>  fdatasync();
>  io_submit(..., batch_size_2);
>   io_getevents(..., batch_size_2);
>   fdatasync();
>   io_submit(..., batch_size_3);
>   io_getevents(..., batch_size_3);
>   fdatasync();
> 
> (certainly that won't happen today, but it could in principle).
>
> How does a write cache give any advantage?  The host kernel sees
> _exactly_ the same information as it would from a bunch of threaded
> pwritev()s followed by fdatasync().
>

Are you suggesting that the model with cache=writeback gives us the
same I/O pattern as cache=none, so there are no opportunities for
optimization?
 
> (wish: IO_CMD_ORDERED_FDATASYNC)
> 
> If the batch size is larger than the virtio queue size, or if there
> are no flushes at all, then yes the huge write cache gives more
> opportunity for reordering.  But we're already talking hundreds of
> requests here.
> 
> Let's say the virtio queue size was unlimited.  What
> merging/reordering opportunity are we missing on the host?  Again we
> have exactly the same information: either the pagecache lru + radix
> tree that identifies all dirty pages in disk order, or the block
> queue with pending requests that contains exactly the same
> information.
> 
> Something is wrong.  Maybe it's my understanding, but on the other
> hand it may be a piece of kernel code.
> 

I assume you are talking of dedicated disk partitions and not
individual disk images residing on the same partition.

-- 
Three Cheers,
Balbir


Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.

2010-03-16 Thread Avi Kivity

On 03/16/2010 03:51 PM, Anthony Liguori wrote:
> On 03/16/2010 08:29 AM, Avi Kivity wrote:
>> On 03/16/2010 03:17 PM, Yoshiaki Tamura wrote:
>>> Avi Kivity wrote:
>>>> On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote:
>>>>> Modifies wrapper functions for byte-based phys_ram_dirty bitmap to
>>>>> bit-based phys_ram_dirty bitmap, and adds more wrapper functions to
>>>>> prevent direct access to the phys_ram_dirty bitmap.
>>>>>
>>>>> +
>>>>> +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr)
>>>>> +{
>>>>> +    unsigned long mask;
>>>>> +    int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
>>>>> +    int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1);
>>>>> +    int ret = 0;
>>>>> +
>>>>> +    mask = 1UL << offset;
>>>>> +    if (phys_ram_vga_dirty[index] & mask)
>>>>> +        ret |= VGA_DIRTY_FLAG;
>>>>> +    if (phys_ram_code_dirty[index] & mask)
>>>>> +        ret |= CODE_DIRTY_FLAG;
>>>>> +    if (phys_ram_migration_dirty[index] & mask)
>>>>> +        ret |= MIGRATION_DIRTY_FLAG;
>>>>> +
>>>>> +    return ret;
>>>>> +}
>>>>>
>>>>>  static inline int cpu_physical_memory_get_dirty(ram_addr_t addr,
>>>>>                                                  int dirty_flags)
>>>>>  {
>>>>> -    return phys_ram_dirty[addr >> TARGET_PAGE_BITS] & dirty_flags;
>>>>> +    return cpu_physical_memory_get_dirty_flags(addr) & dirty_flags;
>>>>>  }
>>>>
>>>> This turns one cacheline access into three. If the dirty bitmaps were in
>>>> an array, you could do
>>>>
>>>>     return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS +
>>>>         BITS_IN_LONG)] & mask;
>>>>
>>>> with one cacheline access.
>>>
>>> If I'm understanding the existing code correctly,
>>> int dirty_flags can be combined, like VGA + MIGRATION.
>>> If we only have to worry about a single dirty flag, I agree with
>>> your idea.
>>
>> From a quick grep it seems flags are not combined, except for
>> something strange with CODE_DIRTY_FLAG:
>>
>> static void notdirty_mem_writel(void *opaque, target_phys_addr_t ram_addr,
>>                                 uint32_t val)
>> {
>>     int dirty_flags;
>>     dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
>>     if (!(dirty_flags & CODE_DIRTY_FLAG)) {
>> #if !defined(CONFIG_USER_ONLY)
>>         tb_invalidate_phys_page_fast(ram_addr, 4);
>>         dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
>> #endif
>>     }
>>     stl_p(qemu_get_ram_ptr(ram_addr), val);
>>     dirty_flags |= (0xff & ~CODE_DIRTY_FLAG);
>>     phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags;
>>     /* we remove the notdirty callback only if the code has been
>>        flushed */
>>     if (dirty_flags == 0xff)
>>         tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr);
>> }
>>
>> I can't say I understand what it does.
>
> The semantics of CODE_DIRTY_FLAG are a little counter intuitive.
> CODE_DIRTY_FLAG means that we know that something isn't code so writes
> do not need checking for self modifying code.

So the hardware equivalent is, when the Instruction TLB loads a page
address, clear CODE_DIRTY_FLAG?

> notdirty_mem_write() is called for any ram that is in the virtual TLB
> that has not been updated yet and once a write has occurred, we can
> switch to faster access functions (provided we've invalidated any
> translation blocks).
>
> That's why the check is if (!(dirty_flags & CODE_DIRTY_FLAG)), if it
> hasn't been set yet, we have to assume that it could be a TB so we
> need to invalidate it.  tb_invalidate_phys_page_fast() will set the
> CODE_DIRTY_FLAG if no code is present in that memory area which is why
> we fetch dirty_flags again.

Ok.

> We do the store, and then set the dirty bits to mark that the page is
> now dirty taking care to not change the CODE_DIRTY_FLAG bit.
>
> At the very end, we check to see if CODE_DIRTY_FLAG which indicates
> that we no longer need to trap writes.  If so, we call tlb_set_dirty()
> which will ultimately remove the notdirty callback in favor of a
> faster access mechanism.
>
> With respect to this patch series, there should be no problem having a
> separate code bitmap that gets updated along with a main bitmap
> provided that the semantics of CODE_DIRTY_FLAG are preserved.
>
>>> Sounds good to me.
>>> So we're going to introduce 4 (VGA, CODE, MIGRATION, master)
>>> bit-based bitmaps in total.
>>
>> Yeah, except CODE doesn't behave like the others.  Would be best to
>> understand what its requirements are before making the change.
>> Maybe CODE will need separate handling (so master will only feed VGA
>> and MIGRATION).
>
> Generally speaking, cpu_physical_memory_set_dirty() is called by the
> device model.  Any writes by the device model that result in
> self-modifying code are not going to have predictable semantics which
> is why it can set CODE_DIRTY_FLAG.
>
> CODE_DIRTY_FLAG doesn't need to get updated from a master bitmap.  It
> should be treated as a separate bitmap that is strictly dealt with by
> the virtual TLB.

Thanks.

--
error compiling committee.c: too many arguments to function



Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.

2010-03-16 Thread Anthony Liguori

On 03/16/2010 08:29 AM, Avi Kivity wrote:
> On 03/16/2010 03:17 PM, Yoshiaki Tamura wrote:
>> Avi Kivity wrote:
>>> On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote:
>>>> Modifies wrapper functions for byte-based phys_ram_dirty bitmap to
>>>> bit-based phys_ram_dirty bitmap, and adds more wrapper functions to
>>>> prevent direct access to the phys_ram_dirty bitmap.
>>>>
>>>> +
>>>> +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr)
>>>> +{
>>>> +    unsigned long mask;
>>>> +    int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
>>>> +    int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1);
>>>> +    int ret = 0;
>>>> +
>>>> +    mask = 1UL << offset;
>>>> +    if (phys_ram_vga_dirty[index] & mask)
>>>> +        ret |= VGA_DIRTY_FLAG;
>>>> +    if (phys_ram_code_dirty[index] & mask)
>>>> +        ret |= CODE_DIRTY_FLAG;
>>>> +    if (phys_ram_migration_dirty[index] & mask)
>>>> +        ret |= MIGRATION_DIRTY_FLAG;
>>>> +
>>>> +    return ret;
>>>> +}
>>>>
>>>>  static inline int cpu_physical_memory_get_dirty(ram_addr_t addr,
>>>>                                                  int dirty_flags)
>>>>  {
>>>> -    return phys_ram_dirty[addr >> TARGET_PAGE_BITS] & dirty_flags;
>>>> +    return cpu_physical_memory_get_dirty_flags(addr) & dirty_flags;
>>>>  }
>>>
>>> This turns one cacheline access into three. If the dirty bitmaps were in
>>> an array, you could do
>>>
>>>     return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS +
>>>         BITS_IN_LONG)] & mask;
>>>
>>> with one cacheline access.
>>
>> If I'm understanding the existing code correctly,
>> int dirty_flags can be combined, like VGA + MIGRATION.
>> If we only have to worry about a single dirty flag, I agree with your
>> idea.
>
> From a quick grep it seems flags are not combined, except for
> something strange with CODE_DIRTY_FLAG:
>
> static void notdirty_mem_writel(void *opaque, target_phys_addr_t ram_addr,
>                                 uint32_t val)
> {
>     int dirty_flags;
>     dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
>     if (!(dirty_flags & CODE_DIRTY_FLAG)) {
> #if !defined(CONFIG_USER_ONLY)
>         tb_invalidate_phys_page_fast(ram_addr, 4);
>         dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
> #endif
>     }
>     stl_p(qemu_get_ram_ptr(ram_addr), val);
>     dirty_flags |= (0xff & ~CODE_DIRTY_FLAG);
>     phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags;
>     /* we remove the notdirty callback only if the code has been
>        flushed */
>     if (dirty_flags == 0xff)
>         tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr);
> }
>
> I can't say I understand what it does.

The semantics of CODE_DIRTY_FLAG are a little counter intuitive.
CODE_DIRTY_FLAG means that we know that something isn't code so writes
do not need checking for self modifying code.

notdirty_mem_write() is called for any ram that is in the virtual TLB
that has not been updated yet and once a write has occurred, we can
switch to faster access functions (provided we've invalidated any
translation blocks).

That's why the check is if (!(dirty_flags & CODE_DIRTY_FLAG)), if it
hasn't been set yet, we have to assume that it could be a TB so we need
to invalidate it.  tb_invalidate_phys_page_fast() will set the
CODE_DIRTY_FLAG if no code is present in that memory area which is why
we fetch dirty_flags again.

We do the store, and then set the dirty bits to mark that the page is
now dirty taking care to not change the CODE_DIRTY_FLAG bit.

At the very end, we check to see if CODE_DIRTY_FLAG which indicates that
we no longer need to trap writes.  If so, we call tlb_set_dirty() which
will ultimately remove the notdirty callback in favor of a faster access
mechanism.

With respect to this patch series, there should be no problem having a
separate code bitmap that gets updated along with a main bitmap provided
that the semantics of CODE_DIRTY_FLAG are preserved.

>> Sounds good to me.
>> So we're going to introduce 4 (VGA, CODE, MIGRATION, master)
>> bit-based bitmaps in total.
>
> Yeah, except CODE doesn't behave like the others.  Would be best to
> understand what its requirements are before making the change.  Maybe
> CODE will need separate handling (so master will only feed VGA and
> MIGRATION).

Generally speaking, cpu_physical_memory_set_dirty() is called by the
device model.  Any writes by the device model that result in
self-modifying code are not going to have predictable semantics which is
why it can set CODE_DIRTY_FLAG.

CODE_DIRTY_FLAG doesn't need to get updated from a master bitmap.  It
should be treated as a separate bitmap that is strictly dealt with by
the virtual TLB.


Regards,

Anthony Liguori


Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.

2010-03-16 Thread Yoshiaki Tamura

Avi Kivity wrote:
> On 03/16/2010 03:17 PM, Yoshiaki Tamura wrote:
>> Avi Kivity wrote:
>>> On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote:
>>>> Modifies wrapper functions for byte-based phys_ram_dirty bitmap to
>>>> bit-based phys_ram_dirty bitmap, and adds more wrapper functions to
>>>> prevent direct access to the phys_ram_dirty bitmap.
>>>>
>>>> +
>>>> +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr)
>>>> +{
>>>> +    unsigned long mask;
>>>> +    int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
>>>> +    int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1);
>>>> +    int ret = 0;
>>>> +
>>>> +    mask = 1UL << offset;
>>>> +    if (phys_ram_vga_dirty[index] & mask)
>>>> +        ret |= VGA_DIRTY_FLAG;
>>>> +    if (phys_ram_code_dirty[index] & mask)
>>>> +        ret |= CODE_DIRTY_FLAG;
>>>> +    if (phys_ram_migration_dirty[index] & mask)
>>>> +        ret |= MIGRATION_DIRTY_FLAG;
>>>> +
>>>> +    return ret;
>>>> +}
>>>>
>>>>  static inline int cpu_physical_memory_get_dirty(ram_addr_t addr,
>>>>                                                  int dirty_flags)
>>>>  {
>>>> -    return phys_ram_dirty[addr >> TARGET_PAGE_BITS] & dirty_flags;
>>>> +    return cpu_physical_memory_get_dirty_flags(addr) & dirty_flags;
>>>>  }
>>>
>>> This turns one cacheline access into three. If the dirty bitmaps were in
>>> an array, you could do
>>>
>>>     return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS +
>>>         BITS_IN_LONG)] & mask;
>>>
>>> with one cacheline access.
>>
>> If I'm understanding the existing code correctly,
>> int dirty_flags can be combined, like VGA + MIGRATION.
>> If we only have to worry about a single dirty flag, I agree with your
>> idea.
>
> From a quick grep it seems flags are not combined, except for something
> strange with CODE_DIRTY_FLAG:

Thanks for checking it out.
But the CODE_DIRTY_FLAG makes me really nervous...

> static void notdirty_mem_writel(void *opaque, target_phys_addr_t ram_addr,
>                                 uint32_t val)
> {
>     int dirty_flags;
>     dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
>     if (!(dirty_flags & CODE_DIRTY_FLAG)) {
> #if !defined(CONFIG_USER_ONLY)
>         tb_invalidate_phys_page_fast(ram_addr, 4);
>         dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
> #endif
>     }
>     stl_p(qemu_get_ram_ptr(ram_addr), val);
>     dirty_flags |= (0xff & ~CODE_DIRTY_FLAG);
>     phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags;
>     /* we remove the notdirty callback only if the code has been
>        flushed */
>     if (dirty_flags == 0xff)
>         tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr);
> }
>
> I can't say I understand what it does.

Me neither.
This is the reason I had to take a naive approach...

>> On the other hand, qemu seems to require getting combined dirty flags.
>> If we introduce dirty bitmaps for each type, we need to access each
>> bitmap to get combined flags. I wasn't sure how to make this more
>> efficient...
>>
>>>>  static inline void cpu_physical_memory_set_dirty(ram_addr_t addr)
>>>>  {
>>>> -    phys_ram_dirty[addr >> TARGET_PAGE_BITS] = 0xff;
>>>> +    unsigned long mask;
>>>> +    int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
>>>> +    int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1);
>>>> +
>>>> +    mask = 1UL << offset;
>>>> +    phys_ram_vga_dirty[index] |= mask;
>>>> +    phys_ram_code_dirty[index] |= mask;
>>>> +    phys_ram_migration_dirty[index] |= mask;
>>>> +}
>>>
>>> This is also three cacheline accesses. I think we should have a master
>>> bitmap which is updated by set_dirty(), and which is or'ed into the
>>> other bitmaps when they are accessed. At least the vga and migration
>>> bitmaps are only read periodically, not randomly, so this would be very
>>> fast. In a way, this is similar to how the qemu bitmap is updated from
>>> the kvm bitmap today.
>>
>> Sounds good to me.
>> So we're going to introduce 4 (VGA, CODE, MIGRATION, master) bit-based
>> bitmaps in total.
>
> Yeah, except CODE doesn't behave like the others. Would be best to
> understand what its requirements are before making the change. Maybe
> CODE will need separate handling (so master will only feed VGA and
> MIGRATION).

After implementing this patch set, I thought separating the wrapper functions
for each dirty flag type might be an option.  Unifying everything is
inefficient here.  But anyway, do you know somebody who has a strong insight
into this CODE_DIRTY_FLAG?




Re: [PATCH 0/6] qemu-kvm: Introduce bit-based phys_ram_dirty, and bit-based dirty page checker.

2010-03-16 Thread Yoshiaki Tamura

Avi Kivity wrote:
> On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote:
>> Experimental results:
>> Cond1: 1.9 ~ 61 times speed up
>> Cond2: 1.9 ~ 56 times speed up
>> Cond3: 1.9 ~ 59 times speed up
>> Cond4: 1.7 ~ 59 times speed up
>
> Impressive results. What's the typical speedup? Closer to 1.9 or 61?

To be honest, I thought the result above was too vague...
The speed up grows as the number of dirty pages decreases.
Let me paste the snipped actual data measured during live migration on Cond1.
This result is measured with cpu_get_real_ticks(), so the values should be in
raw ticks.

135200 dirty pages: orig.2488419, bitbased.1251171, ratio.1.99
...
98346 dirty pages: orig.3580533, bitbased.1386918, ratio.2.58
...
54865 dirty pages: orig.4220865, bitbased.984924, ratio.4.29
...
27883 dirty pages: orig.4088970, bitbased.514602, ratio.7.95
...
11541 dirty pages: orig.3854277, bitbased.220410, ratio.17.49
...
8117 dirty pages: orig.4041765, bitbased.175446, ratio.23.04
3231 dirty pages: orig.3337083, bitbased.105921, ratio.31.51
2401 dirty pages: orig.4103469, bitbased.89406, ratio.45.90
1595 dirty pages: orig.4028949, bitbased.78570, ratio.51.28
756 dirty pages: orig.4036707, bitbased.67662, ratio.59.66
0 dirty pages: orig.3938085, bitbased.23634, ratio.166.63
0 dirty pages: orig.3968163, bitbased.23526, ratio.168.67

We didn't show the data for checking a completely empty bitmap because it was
too fast and we didn't want to give a wrong impression.

> Note the issue with the cache accesses for set_dirty() is only
> applicable to tcg, since kvm always updates the dirty bitmap in a batch
> (well, I/O also updates the bitmap).

I understand.
I'm still concerned about the way of resetting the dirty bitmap.
I was thinking of resetting them in a batch, but it seems difficult because of
the consistency with the tlb.



Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Avi Kivity

On 03/16/2010 03:31 PM, Ingo Molnar wrote:
>> You can do that through libvirt, but that only works for guests started
>> through libvirt.  libvirt provides command-line tools to list and manage
>> guests (for example autostarting them on startup), and tools built on top of
>> libvirt can manage guests graphically.
>>
>> Looks like we have a layer inversion here.  Maybe we need a plugin system -
>> libvirt drops a .so into perf that teaches it how to list guests and get
>> their symbols.
>
> Is libvirt used to start up all KVM guests? If not, if it's only used on some
> distros while other distros have other solutions then there's apparently no
> good way to get to such information, and the kernel bits of KVM do not provide
> it.

Developers tend to start qemu from the command line, but the majority of
users and all distros I know of use libvirt.  Some users cobble up their
own scripts.

> To the user (and to me) this looks like a KVM bug / missing feature. (and the
> user doesnt care where the blame is) If that is true then apparently the
> current KVM design has no technically actionable solution for certain
> categories of features!

A plugin system allows anyone who is interested to provide the
information; they just need to write a plugin for their management tool.

Since we can't prevent people from writing management tools, I don't see
what else we can do.


--
error compiling committee.c: too many arguments to function



Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.

2010-03-16 Thread Anthony Liguori

On 03/16/2010 07:45 AM, Avi Kivity wrote:
> On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote:
>> Modifies wrapper functions for byte-based phys_ram_dirty bitmap to
>> bit-based phys_ram_dirty bitmap, and adds more wrapper functions to
>> prevent direct access to the phys_ram_dirty bitmap.
>>
>> +
>> +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr)
>> +{
>> +    unsigned long mask;
>> +    int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
>> +    int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1);
>> +    int ret = 0;
>> +
>> +    mask = 1UL << offset;
>> +    if (phys_ram_vga_dirty[index] & mask)
>> +        ret |= VGA_DIRTY_FLAG;
>> +    if (phys_ram_code_dirty[index] & mask)
>> +        ret |= CODE_DIRTY_FLAG;
>> +    if (phys_ram_migration_dirty[index] & mask)
>> +        ret |= MIGRATION_DIRTY_FLAG;
>> +
>> +    return ret;
>> +}
>>
>>  static inline int cpu_physical_memory_get_dirty(ram_addr_t addr,
>>                                                  int dirty_flags)
>>  {
>> -    return phys_ram_dirty[addr >> TARGET_PAGE_BITS] & dirty_flags;
>> +    return cpu_physical_memory_get_dirty_flags(addr) & dirty_flags;
>>  }
>
> This turns one cacheline access into three.  If the dirty bitmaps were
> in an array, you could do
>
>     return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS +
>         BITS_IN_LONG)] & mask;
>
> with one cacheline access.

As far as I can tell, we only ever call with a single flag so your
suggestion makes sense.

I'd suggest introducing these functions before splitting the bitmap up.
It makes review a bit easier.

>>  static inline void cpu_physical_memory_set_dirty(ram_addr_t addr)
>>  {
>> -    phys_ram_dirty[addr >> TARGET_PAGE_BITS] = 0xff;
>> +    unsigned long mask;
>> +    int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
>> +    int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1);
>> +
>> +    mask = 1UL << offset;
>> +    phys_ram_vga_dirty[index] |= mask;
>> +    phys_ram_code_dirty[index] |= mask;
>> +    phys_ram_migration_dirty[index] |= mask;
>> +}
>
> This is also three cacheline accesses.  I think we should have a
> master bitmap which is updated by set_dirty(), and which is or'ed into
> the other bitmaps when they are accessed.  At least the vga and
> migration bitmaps are only read periodically, not randomly, so this
> would be very fast.  In a way, this is similar to how the qemu bitmap
> is updated from the kvm bitmap today.
>
> I am not sure about the code bitmap though.

I think your suggestion makes sense and would also work for the code bitmap.

Regards,

Anthony Liguori
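
A minimal sketch of the master-bitmap scheme discussed above (the names are
hypothetical; a full implementation would also clear master bits once they
have been folded into every client bitmap):

static inline void cpu_physical_memory_set_dirty(ram_addr_t addr)
{
    unsigned long page = addr >> TARGET_PAGE_BITS;

    /* the hot write path touches only the master bitmap:
       one cacheline access */
    phys_ram_master_dirty[page / HOST_LONG_BITS] |=
        1UL << (page & (HOST_LONG_BITS - 1));
}

/* called once per scan from e.g. the vga/migration sync paths */
static void cpu_physical_memory_sync_master(unsigned long *client,
                                            unsigned long nb_longs)
{
    unsigned long i;

    for (i = 0; i < nb_longs; i++) {
        client[i] |= phys_ram_master_dirty[i];
    }
}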




Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Ingo Molnar

* Avi Kivity  wrote:

> On 03/16/2010 03:08 PM, Ingo Molnar wrote:
> >
> >>>I mean, i can trust a kernel service and i can trust /proc/kallsyms.
> >>>
> >>>Can perf trust a random process claiming to be Qemu? What's the trust
> >>>mechanism here?
> >>Obviously you can't trust anything you get from a guest, no matter how you
> >>get it.
> >I'm not talking about the symbol strings and addresses, and the object
> >contents for allocation (or debuginfo). I'm talking about the basic protocol
> >of establishing which guest is which.
> 
> There is none.  So far, qemu only dealt with managing just its own
> guest, and left all multiple guest management to higher levels up
> the stack (like libvirt).
> 
> >I.e. we really want users to be able to:
> >
> >  1) have it all working with a single guest, without having to specify 
> > 'which'
> > guest (qemu PID) to work with. That is the dominant usecase both for
> > developers and for a fair portion of testers.
> 
> That's reasonable if we can get it working simply.

IMO such ease of use is reasonable and required, full stop.

If it cannot be gotten simply then that's a bug: either in the code, or in the 
design, or in the development process that led to the design. Bugs need 
fixing.

> >  2) Have some reasonable symbolic identification for guests. For example a
> > usable approach would be to have 'perf kvm list', which would list all
> > currently active guests:
> >
> >  $ perf kvm list
> >[1] Fedora
> >[2] OpenSuse
> >[3] Windows-XP
> >[4] Windows-7
> >
> > And from that point on 'perf kvm -g OpenSuse record' would do the 
> > obvious
> > thing. Users will be able to just use the 'OpenSuse' symbolic name for
> > that guest, even if the guest got restarted and switched its main PID.
> >
> > Any such facility needs trusted enumeration and a protocol where i can 
> > trust that the information i got is authoritative. (I.e. 'OpenSuse' truly 
> > matches to the OpenSuse session - not to some local user starting up a 
> > Qemu instance that claims to be 'OpenSuse'.)
> >
> > Is such a scheme possible/available? I suspect all the KVM configuration 
> > tools (i haven't used them in some time - gui and command-line tools alike) 
> > use similar methods to ease guest management?
> 
> You can do that through libvirt, but that only works for guests started 
> through libvirt.  libvirt provides command-line tools to list and manage 
> guests (for example autostarting them on startup), and tools built on top of 
> libvirt can manage guests graphically.
> 
> Looks like we have a layer inversion here.  Maybe we need a plugin system - 
> libvirt drops a .so into perf that teaches it how to list guests and get 
> their symbols.

Is libvirt used to start up all KVM guests? If not, and it's only used on some 
distros while other distros have other solutions, then there's apparently no 
good way to get at such information, and the kernel bits of KVM do not provide 
it.

To the user (and to me) this looks like a KVM bug / missing feature. (and the 
user doesn't care where the blame is) If that is true then apparently the 
current KVM design has no technically actionable solution for certain 
categories of features!

Ingo


Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.

2010-03-16 Thread Avi Kivity

On 03/16/2010 03:17 PM, Yoshiaki Tamura wrote:

Avi Kivity wrote:

On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote:

Modifies wrapper functions for byte-based phys_ram_dirty bitmap to
bit-based phys_ram_dirty bitmap, and adds more wrapper functions to prevent
direct access to the phys_ram_dirty bitmap.



+
+static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr)
+{
+ unsigned long mask;
+ int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS;
+ int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1);
+ int ret = 0;
+
+ mask = 1UL<< offset;
+ if (phys_ram_vga_dirty[index]& mask)
+ ret |= VGA_DIRTY_FLAG;
+ if (phys_ram_code_dirty[index]& mask)
+ ret |= CODE_DIRTY_FLAG;
+ if (phys_ram_migration_dirty[index]& mask)
+ ret |= MIGRATION_DIRTY_FLAG;
+
+ return ret;
}

static inline int cpu_physical_memory_get_dirty(ram_addr_t addr,
int dirty_flags)
{
- return phys_ram_dirty[addr>> TARGET_PAGE_BITS]& dirty_flags;
+ return cpu_physical_memory_get_dirty_flags(addr)& dirty_flags;
}


This turns one cacheline access into three. If the dirty bitmaps were in
an array, you could do

return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS +
BITS_IN_LONG)] & mask;

with one cacheline access.


If I'm understanding the existing code correctly,
int dirty_flags can be combined, like VGA + MIGRATION.
If we only have to worry about a single dirty flag, I agree with your 
idea.


From a quick grep it seems flags are not combined, except for something 
strange with CODE_DIRTY_FLAG:



static void notdirty_mem_writel(void *opaque, target_phys_addr_t ram_addr,
uint32_t val)
{
int dirty_flags;
dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
if (!(dirty_flags & CODE_DIRTY_FLAG)) {
#if !defined(CONFIG_USER_ONLY)
tb_invalidate_phys_page_fast(ram_addr, 4);
dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
#endif
}
stl_p(qemu_get_ram_ptr(ram_addr), val);
dirty_flags |= (0xff & ~CODE_DIRTY_FLAG);
phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags;
/* we remove the notdirty callback only if the code has been
   flushed */
if (dirty_flags == 0xff)
tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr);
}


I can't say I understand what it does.
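
One plausible reading of the flow, as comments on the same function (this is
an interpretation, not an authoritative description):

static void notdirty_mem_writel(void *opaque, target_phys_addr_t ram_addr,
                                uint32_t val)
{
    int dirty_flags;
    dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
    /* CODE_DIRTY_FLAG clear means the page may still contain translated
       code, so the TBs covering it must be invalidated before the write
       is allowed to land */
    if (!(dirty_flags & CODE_DIRTY_FLAG)) {
#if !defined(CONFIG_USER_ONLY)
        tb_invalidate_phys_page_fast(ram_addr, 4);
        /* the invalidation may have set CODE_DIRTY_FLAG; reload it */
        dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
#endif
    }
    stl_p(qemu_get_ram_ptr(ram_addr), val);
    /* mark the page dirty for every consumer except CODE */
    dirty_flags |= (0xff & ~CODE_DIRTY_FLAG);
    phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags;
    /* 0xff means CODE_DIRTY_FLAG is set as well, i.e. no translated code
       is left on the page, so this slow-path callback can be dropped and
       later writes take the fast path */
    if (dirty_flags == 0xff)
        tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr);
}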



On the other hand, qemu seems to require getting combined dirty flags.
If we introduce dirty bitmaps for each type, we need to access each 
bitmap to get combined flags.  I wasn't sure how to make this more 
efficient...



static inline void cpu_physical_memory_set_dirty(ram_addr_t addr)
{
- phys_ram_dirty[addr>> TARGET_PAGE_BITS] = 0xff;
+ unsigned long mask;
+ int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS;
+ int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1);
+
+ mask = 1UL<< offset;
+ phys_ram_vga_dirty[index] |= mask;
+ phys_ram_code_dirty[index] |= mask;
+ phys_ram_migration_dirty[index] |= mask;
+}


This is also three cacheline accesses. I think we should have a master
bitmap which is updated by set_dirty(), and which is or'ed into the
other bitmaps when they are accessed. At least the vga and migration
bitmaps are only read periodically, not randomly, so this would be very
fast. In a way, this is similar to how the qemu bitmap is updated from
the kvm bitmap today.


Sounds good to me.
So we're going to introduce 4 (VGA, CODE, MIGRATION, master) bit-based 
bitmaps in total.




Yeah, except CODE doesn't behave like the others.  Would be best to 
understand what its requirements are before making the change.  Maybe 
CODE will need separate handling (so master will only feed VGA and 
MIGRATION).




Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.

2010-03-16 Thread Yoshiaki Tamura

Avi Kivity wrote:

On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote:

Modifies wrapper functions for byte-based phys_ram_dirty bitmap to
bit-based phys_ram_dirty bitmap, and adds more wrapper functions to prevent
direct access to the phys_ram_dirty bitmap.



+
+static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr)
+{
+ unsigned long mask;
+ int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS;
+ int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1);
+ int ret = 0;
+
+ mask = 1UL<< offset;
+ if (phys_ram_vga_dirty[index]& mask)
+ ret |= VGA_DIRTY_FLAG;
+ if (phys_ram_code_dirty[index]& mask)
+ ret |= CODE_DIRTY_FLAG;
+ if (phys_ram_migration_dirty[index]& mask)
+ ret |= MIGRATION_DIRTY_FLAG;
+
+ return ret;
}

static inline int cpu_physical_memory_get_dirty(ram_addr_t addr,
int dirty_flags)
{
- return phys_ram_dirty[addr>> TARGET_PAGE_BITS]& dirty_flags;
+ return cpu_physical_memory_get_dirty_flags(addr)& dirty_flags;
}


This turns one cacheline access into three. If the dirty bitmaps were in
an array, you could do

return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS +
BITS_IN_LONG)] & mask;

with one cacheline access.


If I'm understanding the existing code correctly,
int dirty_flags can be combined, like VGA + MIGRATION.
If we only have to worry about a single dirty flag, I agree with your idea.

On the other hand, qemu seems to require getting combined dirty flags.
If we introduce dirty bitmaps for each type, we need to access each bitmap to 
get combined flags.  I wasn't sure how to make this more efficient...



static inline void cpu_physical_memory_set_dirty(ram_addr_t addr)
{
- phys_ram_dirty[addr>> TARGET_PAGE_BITS] = 0xff;
+ unsigned long mask;
+ int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS;
+ int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1);
+
+ mask = 1UL<< offset;
+ phys_ram_vga_dirty[index] |= mask;
+ phys_ram_code_dirty[index] |= mask;
+ phys_ram_migration_dirty[index] |= mask;
+}


This is also three cacheline accesses. I think we should have a master
bitmap which is updated by set_dirty(), and which is or'ed into the
other bitmaps when they are accessed. At least the vga and migration
bitmaps are only read periodically, not randomly, so this would be very
fast. In a way, this is similar to how the qemu bitmap is updated from
the kvm bitmap today.


Sounds good to me.
So we're going to introduce 4 (VGA, CODE, MIGRATION, master) bit-based bitmaps 
in total.





Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Avi Kivity

On 03/16/2010 03:08 PM, Ingo Molnar wrote:



I mean, i can trust a kernel service and i can trust /proc/kallsyms.

Can perf trust a random process claiming to be Qemu? What's the trust
mechanism here?
   

Obviously you can't trust anything you get from a guest, no matter how you
get it.
 

I'm not talking about the symbol strings and addresses, and the object
contents for allocation (or debuginfo). I'm talking about the basic protocol
of establishing which guest is which.
   


There is none.  So far, qemu only dealt with managing just its own 
guest, and left all multiple guest management to higher levels up the 
stack (like libvirt).



I.e. we really want users to be able to:

  1) have it all working with a single guest, without having to specify 'which'
 guest (qemu PID) to work with. That is the dominant usecase both for
 developers and for a fair portion of testers.
   


That's reasonable if we can get it working simply.


  2) Have some reasonable symbolic identification for guests. For example a
 usable approach would be to have 'perf kvm list', which would list all
 currently active guests:

  $ perf kvm list
[1] Fedora
[2] OpenSuse
[3] Windows-XP
[4] Windows-7

 And from that point on 'perf kvm -g OpenSuse record' would do the obvious
 thing. Users will be able to just use the 'OpenSuse' symbolic name for
 that guest, even if the guest got restarted and switched its main PID.

Any such facility needs trusted enumeration and a protocol where i can trust
that the information i got is authoritative. (I.e. 'OpenSuse' truly matches to
the OpenSuse session - not to some local user starting up a Qemu instance that
claims to be 'OpenSuse'.)

Is such a scheme possible/available? I suspect all the KVM configuration tools
(i haven't used them in some time - gui and command-line tools alike) use
similar methods to ease guest management?
   


You can do that through libvirt, but that only works for guests started 
through libvirt.  libvirt provides command-line tools to list and manage 
guests (for example autostarting them on startup), and tools built on 
top of libvirt can manage guests graphically.


Looks like we have a layer inversion here.  Maybe we need a plugin 
system - libvirt drops a .so into perf that teaches it how to list 
guests and get their symbols.




Re: [PATCH 0/6] qemu-kvm: Introduce bit-based phys_ram_dirty, and bit-based dirty page checker.

2010-03-16 Thread Avi Kivity

On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote:

Experimental results:
Cond1: 1.9 ~ 61 times speed up
Cond2: 1.9 ~ 56 times speed up
Cond3: 1.9 ~ 59 times speed up
Cond4: 1.7 ~ 59 times speed up
   


Impressive results.  What's the typical speedup?  Closer to 1.9 or 61?

Note the issue with the cache accesses for set_dirty() is only 
applicable to tcg, since kvm always updates the dirty bitmap in a batch 
(well, I/O also updates the bitmap).




Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Ingo Molnar

* Avi Kivity  wrote:

> On 03/16/2010 02:29 PM, Ingo Molnar wrote:

> > I mean, i can trust a kernel service and i can trust /proc/kallsyms.
> >
> > Can perf trust a random process claiming to be Qemu? What's the trust 
> > mechanism here?
> 
> Obviously you can't trust anything you get from a guest, no matter how you 
> get it.

I'm not talking about the symbol strings and addresses, and the object 
contents for allocation (or debuginfo). I'm talking about the basic protocol 
of establishing which guest is which.

I.e. we really want users to be able to:

 1) have it all working with a single guest, without having to specify 'which' 
guest (qemu PID) to work with. That is the dominant usecase both for 
developers and for a fair portion of testers.

 2) Have some reasonable symbolic identification for guests. For example a 
usable approach would be to have 'perf kvm list', which would list all 
currently active guests:

 $ perf kvm list
   [1] Fedora
   [2] OpenSuse
   [3] Windows-XP
   [4] Windows-7

And from that point on 'perf kvm -g OpenSuse record' would do the obvious 
thing. Users will be able to just use the 'OpenSuse' symbolic name for 
that guest, even if the guest got restarted and switched its main PID.

Any such facility needs trusted enumeration and a protocol where i can trust 
that the information i got is authoritative. (I.e. 'OpenSuse' truly matches to 
the OpenSuse session - not to some local user starting up a Qemu instance that 
claims to be 'OpenSuse'.)

Is such a scheme possible/available? I suspect all the KVM configuration tools 
(i haven't used them in some time - gui and command-line tools alike) use 
similar methods to ease guest management?

Ingo


Re: [PATCH 1/6] qemu-kvm: Introduce bit-based phys_ram_dirty for VGA, CODE and MIGRATION.

2010-03-16 Thread Avi Kivity

On 03/16/2010 03:01 PM, Yoshiaki Tamura wrote:

-uint8_t *phys_ram_dirty;
+unsigned long *phys_ram_vga_dirty;
+unsigned long *phys_ram_code_dirty;
+unsigned long *phys_ram_migration_dirty;


Would be nice to make this an array.



Thanks for pointing this out.
I have a question regarding the index of the array.
From the compatibility perspective, I would prefer using the existing 
macros.


#define VGA_DIRTY_FLAG   0x01
#define CODE_DIRTY_FLAG  0x02
#define MIGRATION_DIRTY_FLAG 0x08

However, if I use them as is, I'll get a sparse array...
Is it acceptable to change these values to 0, 1, 2?


Sure.
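
A sketch of what that dense mapping could look like, keeping the existing
flag constants for the combined-flags API (the index names and helper are
hypothetical):

#include <stdlib.h>   /* abort() */

#define VGA_DIRTY_FLAG       0x01
#define CODE_DIRTY_FLAG      0x02
#define MIGRATION_DIRTY_FLAG 0x08

enum {
    DIRTY_IDX_VGA,        /* 0 */
    DIRTY_IDX_CODE,       /* 1 */
    DIRTY_IDX_MIGRATION,  /* 2 */
    NB_DIRTY_IDX
};

static inline int dirty_flag_to_index(int dirty_flag)
{
    switch (dirty_flag) {
    case VGA_DIRTY_FLAG:       return DIRTY_IDX_VGA;
    case CODE_DIRTY_FLAG:      return DIRTY_IDX_CODE;
    case MIGRATION_DIRTY_FLAG: return DIRTY_IDX_MIGRATION;
    default:                   abort();
    }
}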



Re: [PATCH 1/6] qemu-kvm: Introduce bit-based phys_ram_dirty for VGA, CODE and MIGRATION.

2010-03-16 Thread Yoshiaki Tamura

Avi Kivity wrote:

On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote:

Replaces byte-based phys_ram_dirty bitmap with
three bit-based phys_ram_dirty bitmap.
On allocation, it sets all bits in the bitmap.

Signed-off-by: Yoshiaki Tamura
Signed-off-by: OHMURA Kei
---
exec.c | 22 +-
1 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/exec.c b/exec.c
index 9bcb4de..ba334e7 100644
--- a/exec.c
+++ b/exec.c
@@ -119,7 +119,9 @@ uint8_t *code_gen_ptr;

#if !defined(CONFIG_USER_ONLY)
int phys_ram_fd;
-uint8_t *phys_ram_dirty;
+unsigned long *phys_ram_vga_dirty;
+unsigned long *phys_ram_code_dirty;
+unsigned long *phys_ram_migration_dirty;


Would be nice to make this an array.


Thanks for pointing this out.
I have a question regarding the index of the array.
From the compatibility perspective, I would prefer using the existing macros.

#define VGA_DIRTY_FLAG   0x01
#define CODE_DIRTY_FLAG  0x02
#define MIGRATION_DIRTY_FLAG 0x08

However, if I use them as is, I'll get a sparse array...
Is it acceptable to change these values to 0, 1, 2?


Re: [PATCH] add "xchg ax, reg" emulator test

2010-03-16 Thread Avi Kivity

On 03/16/2010 02:52 PM, Gleb Natapov wrote:

+   MK_INSN(xchg_test1, "xchg %eax,%eax\n\t");
   

AKA 'nop'.

 

Yep, but we still emulate it.

   


And well too.

Note that 'xchg eax, ebx' will clear the top 32-bits of rax and rbx, but 
'xchg eax, eax' will not.
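
A small host-side illustration of the difference (not part of the test
patch; 0x90 is the one-byte encoding of 'xchg eax, eax', which is
architecturally NOP; x86-64 with gcc assumed):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t a = 0xdeadbeef00000001ULL;
    uint64_t b = 2;

    /* 'xchg eax, eax' == NOP: the upper 32 bits of rax survive */
    asm volatile(".byte 0x90" : "+a"(a));
    printf("xchg eax,eax: rax=%#llx\n", (unsigned long long)a);

    /* a real 32-bit xchg zero-extends both destinations */
    asm volatile("xchg %%eax, %%ebx" : "+a"(a), "+b"(b));
    printf("xchg eax,ebx: rax=%#llx rbx=%#llx\n",
           (unsigned long long)a, (unsigned long long)b);
    return 0;
}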




Re: [PATCH] add "xchg ax, reg" emulator test

2010-03-16 Thread Gleb Natapov
On Tue, Mar 16, 2010 at 02:50:58PM +0200, Avi Kivity wrote:
> On 03/16/2010 02:42 PM, Gleb Natapov wrote:
> >Add test for opcodes 0x90-0x9f emulation
> >
> 
> Reviewed-by: Avi Kivity 
> 
> >+MK_INSN(xchg_test1, "xchg %eax,%eax\n\t");
> 
> AKA 'nop'.
> 
Yep, but we still emulate it.

--
Gleb.


Re: [PATCH] add "xchg ax, reg" emulator test

2010-03-16 Thread Avi Kivity

On 03/16/2010 02:42 PM, Gleb Natapov wrote:

Add test for opcodes 0x90-0x9f emulation

   


Reviewed-by: Avi Kivity 


+   MK_INSN(xchg_test1, "xchg %eax,%eax\n\t");
   


AKA 'nop'.



Re: [PATCH 4/6] qemu-kvm: Introduce cpu_physical_memory_get_dirty_range().

2010-03-16 Thread Avi Kivity

On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote:

Introduces cpu_physical_memory_get_dirty_range().
It checks the first row and puts the dirty addresses in the array.
If the first row is empty, it skips to the first non-dirty row
or the end addr, and puts the length in the first entry of the array.



+/* It checks the first row and puts dirty addrs in the array.
+   If the first row is empty, it skips to the first non-dirty row
+   or the end addr, and put the length in the first entry of the array. */
+int cpu_physical_memory_get_dirty_range(ram_addr_t start, ram_addr_t end,
+ram_addr_t *dirty_rams, int length,
+int dirty_flag)
+{
+unsigned long phys_ram_dirty, page_number, *p;
+ram_addr_t addr;
+int s_idx = (start>>  TARGET_PAGE_BITS) / HOST_LONG_BITS;
+int e_idx = (end>>  TARGET_PAGE_BITS) / HOST_LONG_BITS;
+int i, j, offset;
+
+switch (dirty_flag) {
+case VGA_DIRTY_FLAG:
+p = phys_ram_vga_dirty;
+break;
+case CODE_DIRTY_FLAG:
+p = phys_ram_code_dirty;
+break;
+case MIGRATION_DIRTY_FLAG:
+p = phys_ram_migration_dirty;
+break;
+default:
+abort();
+}
   


This bit would be improved by switching to an array of bitmaps.
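
With an array of bitmaps indexed densely (as discussed for patch 2/6), the
switch/abort() collapses to a lookup; a hypothetical sketch, reusing the
dirty_flag_to_index() helper sketched earlier in the thread:

static inline unsigned long *dirty_bitmap_for(int dirty_flag)
{
    return dirty_bitmaps[dirty_flag_to_index(dirty_flag)];
}

    /* in cpu_physical_memory_get_dirty_range(): */
    unsigned long *p = dirty_bitmap_for(dirty_flag);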




Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.

2010-03-16 Thread Avi Kivity

On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote:

Modifies wrapper functions for byte-based phys_ram_dirty bitmap to
bit-based phys_ram_dirty bitmap, and adds more wrapper functions to prevent
direct access to the phys_ram_dirty bitmap.
   



+
+static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr)
+{
+unsigned long mask;
+int index = (addr>>  TARGET_PAGE_BITS) / HOST_LONG_BITS;
+int offset = (addr>>  TARGET_PAGE_BITS)&  (HOST_LONG_BITS - 1);
+int ret = 0;
+
+mask = 1UL<<  offset;
+if (phys_ram_vga_dirty[index]&  mask)
+ret |= VGA_DIRTY_FLAG;
+if (phys_ram_code_dirty[index]&  mask)
+ret |=  CODE_DIRTY_FLAG;
+if (phys_ram_migration_dirty[index]&  mask)
+ret |= MIGRATION_DIRTY_FLAG;
+
+return ret;
  }

  static inline int cpu_physical_memory_get_dirty(ram_addr_t addr,
  int dirty_flags)
  {
-return phys_ram_dirty[addr>>  TARGET_PAGE_BITS]&  dirty_flags;
+return cpu_physical_memory_get_dirty_flags(addr)&  dirty_flags;
  }
   


This turns one cacheline access into three.  If the dirty bitmaps were 
in an array, you could do


  return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS + 
BITS_IN_LONG)] & mask;


with one cacheline access.



  static inline void cpu_physical_memory_set_dirty(ram_addr_t addr)
  {
-phys_ram_dirty[addr>>  TARGET_PAGE_BITS] = 0xff;
+unsigned long mask;
+int index = (addr>>  TARGET_PAGE_BITS) / HOST_LONG_BITS;
+int offset = (addr>>  TARGET_PAGE_BITS)&  (HOST_LONG_BITS - 1);
+
+mask = 1UL<<  offset;
+phys_ram_vga_dirty[index] |= mask;
+phys_ram_code_dirty[index] |= mask;
+phys_ram_migration_dirty[index] |= mask;
+}
   


This is also three cacheline accesses.  I think we should have a master 
bitmap which is updated by set_dirty(), and which is or'ed into the 
other bitmaps when they are accessed.  At least the vga and migration 
bitmaps are only read periodically, not randomly, so this would be very 
fast.  In a way, this is similar to how the qemu bitmap is updated from 
the kvm bitmap today.


I am not sure about the code bitmap though.




[PATCH] add "xchg ax, reg" emulator test

2010-03-16 Thread Gleb Natapov
Add test for opcodes 0x90-0x9f emulation

Signed-off-by: Gleb Natapov 
diff --git a/kvm/user/test/x86/realmode.c b/kvm/user/test/x86/realmode.c
index bc6b27f..bfc2942 100644
--- a/kvm/user/test/x86/realmode.c
+++ b/kvm/user/test/x86/realmode.c
@@ -141,6 +141,90 @@ int regs_equal(const struct regs *r1, const struct regs 
*r2, int ignore)
); \
extern u8 insn_##name[], insn_##name##_end[]
 
+void test_xchg(void)
+{
+   struct regs inregs = { .eax = 0, .ebx = 1, .ecx = 2, .edx = 3, .esi = 
4, .edi = 5, .ebp = 6, .esp = 7}, outregs;
+
+   MK_INSN(xchg_test1, "xchg %eax,%eax\n\t");
+   MK_INSN(xchg_test2, "xchg %eax,%ebx\n\t");
+   MK_INSN(xchg_test3, "xchg %eax,%ecx\n\t");
+   MK_INSN(xchg_test4, "xchg %eax,%edx\n\t");
+   MK_INSN(xchg_test5, "xchg %eax,%esi\n\t");
+   MK_INSN(xchg_test6, "xchg %eax,%edi\n\t");
+   MK_INSN(xchg_test7, "xchg %eax,%ebp\n\t");
+   MK_INSN(xchg_test8, "xchg %eax,%esp\n\t");
+
+   exec_in_big_real_mode(&inregs, &outregs,
+ insn_xchg_test1,
+  insn_xchg_test1_end - insn_xchg_test1);
+
+   if (!regs_equal(&inregs, &outregs, 0))
+   print_serial("xchg test 1: FAIL\n");
+
+   exec_in_big_real_mode(&inregs, &outregs,
+ insn_xchg_test2,
+  insn_xchg_test2_end - insn_xchg_test2);
+
+   if (!regs_equal(&inregs, &outregs, R_AX | R_BX) ||
+outregs.eax != inregs.ebx ||
+outregs.ebx != inregs.eax)
+   print_serial("xchg test 2: FAIL\n");
+
+   exec_in_big_real_mode(&inregs, &outregs,
+ insn_xchg_test3,
+  insn_xchg_test3_end - insn_xchg_test3);
+
+   if (!regs_equal(&inregs, &outregs, R_AX | R_CX) ||
+outregs.eax != inregs.ecx ||
+outregs.ecx != inregs.eax)
+   print_serial("xchg test 3: FAIL\n");
+
+   exec_in_big_real_mode(&inregs, &outregs,
+ insn_xchg_test4,
+  insn_xchg_test4_end - insn_xchg_test4);
+
+   if (!regs_equal(&inregs, &outregs, R_AX | R_DX) ||
+outregs.eax != inregs.edx ||
+outregs.edx != inregs.eax)
+   print_serial("xchg test 4: FAIL\n");
+
+   exec_in_big_real_mode(&inregs, &outregs,
+ insn_xchg_test5,
+  insn_xchg_test5_end - insn_xchg_test5);
+
+   if (!regs_equal(&inregs, &outregs, R_AX | R_SI) ||
+outregs.eax != inregs.esi ||
+outregs.esi != inregs.eax)
+   print_serial("xchg test 5: FAIL\n");
+
+   exec_in_big_real_mode(&inregs, &outregs,
+ insn_xchg_test6,
+  insn_xchg_test6_end - insn_xchg_test6);
+
+   if (!regs_equal(&inregs, &outregs, R_AX | R_DI) ||
+outregs.eax != inregs.edi ||
+outregs.edi != inregs.eax)
+   print_serial("xchg test 6: FAIL\n");
+
+   exec_in_big_real_mode(&inregs, &outregs,
+ insn_xchg_test7,
+  insn_xchg_test7_end - insn_xchg_test7);
+
+   if (!regs_equal(&inregs, &outregs, R_AX | R_BP) ||
+outregs.eax != inregs.ebp ||
+outregs.ebp != inregs.eax)
+   print_serial("xchg test 7: FAIL\n");
+
+   exec_in_big_real_mode(&inregs, &outregs,
+ insn_xchg_test8,
+  insn_xchg_test8_end - insn_xchg_test8);
+
+   if (!regs_equal(&inregs, &outregs, R_AX | R_SP) ||
+outregs.eax != inregs.esp ||
+outregs.esp != inregs.eax)
+   print_serial("xchg test 8: FAIL\n");
+}
+
 void test_shld(void)
 {
struct regs inregs = { .eax = 0xbe, .edx = 0xef00 }, outregs;
@@ -572,6 +656,7 @@ void realmode_start(void)
test_call();
/* long jmp test uses call near so test it after testing call */
test_long_jmp();
+   test_xchg();
 
exit(0);
 }
--
Gleb.


Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Avi Kivity

On 03/16/2010 02:29 PM, Ingo Molnar wrote:

* Avi Kivity  wrote:

   

On 03/16/2010 01:25 PM, Ingo Molnar wrote:
 
   

I haven't followed vmchannel closely, but I think it is.  vmchannel is
terminated in qemu on the host side, not in the host kernel.  So perf would
need to connect to qemu.
 

Hm, that sounds rather messy if we want to use it to basically expose kernel
functionality in a guest/host unified way. Is the qemu process discoverable in
some secure way?
   

We know its pid.
 

How do i get a list of all 'guest instance PIDs',


Libvirt manages all qemus, but this should be implemented independently 
of libvirt.



and what is the way to talk
to Qemu?
   


In general qemu exposes communication channels (such as the monitor) as 
tcp connections, unix-domain sockets, stdio, etc.  It's very flexible.



Can we trust it?
   

No choice, it contains the guest address space.
 

I mean, i can trust a kernel service and i can trust /proc/kallsyms.

Can perf trust a random process claiming to be Qemu? What's the trust
mechanism here?
   


Obviously you can't trust anything you get from a guest, no matter how 
you get it.


How do you trust a userspace program's symbols?  you don't.  How do you 
get them?  they're on a well-known location.



Is there some proper tooling available to do it, or do we have to push it
through 2-3 packages to get such a useful feature done?
   

libvirt manages qemu processes, but I don't think this should go through
libvirt.  qemu can do this directly by opening a unix domain socket in a
well-known place.
 

So Qemu has never run into such problems before?

( Sounds weird - i think Qemu configuration itself should be done via a
   unix domain socket driven configuration protocol as well. )
   


That's exactly what happens.  You invoke qemu with -monitor 
unix:blah,server (or -qmp for a machine-readable format) and have your 
management application connect to that.  You can redirect guest serial 
ports, console, parallel port, etc. to unix-domain or tcp sockets.  
vmchannel is an extension of that mechanism.
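
For example (the socket path here is arbitrary):

  qemu -monitor unix:/tmp/qemu-mon.sock,server,nowait ...
  socat - UNIX-CONNECT:/tmp/qemu-mon.sock    # then type monitor commands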




( That is the general thought process how many cross-discipline useful
   desktop/server features hit the bit bucket before having had any chance of
   being vetted by users, and why Linux sucks so much when it comes to feature
   integration and application usability. )
   

You can't solve everything in the kernel, even with a well populated tools/.
 

Certainly not, but this is a technical problem in the kernel's domain, so it's
a fair (and natural) expectation to be able to solve this within the kernel
project.
   


Someone writing perf-gui outside the kernel would have the same 
problems, no?




Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Ingo Molnar

* Avi Kivity  wrote:

> On 03/16/2010 01:25 PM, Ingo Molnar wrote:
> >
> >>I haven't followed vmchannel closely, but I think it is.  vmchannel is
> >>terminated in qemu on the host side, not in the host kernel.  So perf would
> >>need to connect to qemu.
> >Hm, that sounds rather messy if we want to use it to basically expose kernel
> >functionality in a guest/host unified way. Is the qemu process discoverable 
> >in
> >some secure way?
> 
> We know its pid.

How do i get a list of all 'guest instance PIDs', and what is the way to talk 
to Qemu?

> > Can we trust it?
> 
> No choice, it contains the guest address space.

I mean, i can trust a kernel service and i can trust /proc/kallsyms.

Can perf trust a random process claiming to be Qemu? What's the trust 
mechanism here?

> > Is there some proper tooling available to do it, or do we have to push it 
> > through 2-3 packages to get such a useful feature done?
> 
> libvirt manages qemu processes, but I don't think this should go through 
> libvirt.  qemu can do this directly by opening a unix domain socket in a 
> well-known place.

So Qemu has never run into such problems before?

( Sounds weird - i think Qemu configuration itself should be done via a 
  unix domain socket driven configuration protocol as well. )

> >( That is the general thought process how many cross-discipline useful
> >   desktop/server features hit the bit bucket before having had any chance of
> >   being vetted by users, and why Linux sucks so much when it comes to 
> > feature
> >   integration and application usability. )
> 
> You can't solve everything in the kernel, even with a well populated tools/.

Certainly not, but this is a technical problem in the kernel's domain, so it's 
a fair (and natural) expectation to be able to solve this within the kernel 
project.

Ingo


Re: [PATCH 1/6] qemu-kvm: Introduce bit-based phys_ram_dirty for VGA, CODE and MIGRATION.

2010-03-16 Thread Avi Kivity

On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote:

Replaces byte-based phys_ram_dirty bitmap with
three bit-based phys_ram_dirty bitmap.
On allocation, it sets all bits in the bitmap.

Signed-off-by: Yoshiaki Tamura
Signed-off-by: OHMURA Kei
---
  exec.c |   22 +-
  1 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/exec.c b/exec.c
index 9bcb4de..ba334e7 100644
--- a/exec.c
+++ b/exec.c
@@ -119,7 +119,9 @@ uint8_t *code_gen_ptr;

  #if !defined(CONFIG_USER_ONLY)
  int phys_ram_fd;
-uint8_t *phys_ram_dirty;
+unsigned long *phys_ram_vga_dirty;
+unsigned long *phys_ram_code_dirty;
+unsigned long *phys_ram_migration_dirty;
   


Would be nice to make this an array.



Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Avi Kivity

On 03/16/2010 01:25 PM, Ingo Molnar wrote:



I haven't followed vmchannel closely, but I think it is.  vmchannel is
terminated in qemu on the host side, not in the host kernel.  So perf would
need to connect to qemu.
 

Hm, that sounds rather messy if we want to use it to basically expose kernel
functionality in a guest/host unified way. Is the qemu process discoverable in
some secure way?


We know its pid.


Can we trust it?


No choice, it contains the guest address space.


Is there some proper tooling available to do
it, or do we have to push it through 2-3 packages to get such a useful feature
done?
   


libvirt manages qemu processes, but I don't think this should go through 
libvirt.  qemu can do this directly by opening a unix domain socket in a 
well-known place.



( That is the general thought process how many cross-discipline useful
   desktop/server features hit the bit bucket before having had any chance of
   being vetted by users, and why Linux sucks so much when it comes to feature
   integration and application usability. )
   


You can't solve everything in the kernel, even with a well populated tools/.



Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-16 Thread Michael S. Tsirkin
On Tue, Mar 16, 2010 at 05:32:17PM +0800, Xin Xiaohui wrote:
> The vhost-net backend now only supports synchronous send/recv
> operations. The patch provides multiple submits and asynchronous
> notifications. This is needed for zero-copy case.
> 
> Signed-off-by: Xin Xiaohui 
> ---
> 
> Michael,
> I don't use the kiocb that comes from sendmsg/recvmsg,
> since I have embedded the kiocb in the page_info structure,
> and allocate it when page_info is allocated.

So what I suggested was that vhost allocates and tracks the iocbs, and
passes them to your device with sendmsg/ recvmsg calls. This way your
device won't need to share structures and locking strategy with vhost:
you get an iocb, handle it, invoke a callback to notify vhost about
completion.

This also gets rid of the 'receiver' callback.
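
Roughly, the suggested split would look like this (all names below are made
up for illustration, not actual vhost API; a sketch, not kernel-ready code):

/* vhost side: owns iocb allocation and completion bookkeeping */
struct vhost_async_req {
    struct kiocb iocb;              /* handed to the device via *msg */
    struct vhost_virtqueue *vq;     /* private to vhost */
    unsigned int head;              /* private to vhost */
};

/* device side: gets an iocb, handles it, and on completion invokes a
   callback supplied by vhost; it never touches vhost_virtqueue
   internals or vhost locks */
static void device_complete(struct kiocb *iocb, int err)
{
    struct vhost_async_req *req =
        container_of(iocb, struct vhost_async_req, iocb);

    vhost_async_done(req, err);     /* hypothetical vhost callback */
}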

> Please have a review, and thanks for the instructions
> on replying to email, which helped me a lot.
> 
> Thanks,
> Xiaohui
> 
>  drivers/vhost/net.c   |  159 
> +++--
>  drivers/vhost/vhost.h |   12 
>  2 files changed, 166 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 22d5fef..5483848 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -17,11 +17,13 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  
> @@ -91,6 +93,12 @@ static void tx_poll_start(struct vhost_net *net, struct 
> socket *sock)
>   net->tx_poll_state = VHOST_NET_POLL_STARTED;
>  }
>  
> +static void handle_async_rx_events_notify(struct vhost_net *net,
> + struct vhost_virtqueue *vq);
> +
> +static void handle_async_tx_events_notify(struct vhost_net *net,
> + struct vhost_virtqueue *vq);
> +

A couple of style comments:

- It's better to arrange functions in such order that forward declarations
aren't necessary.  Since we don't have recursion, this should always be
possible.

- continuation lines should be indented at least at the position of '('
on the previous line.

>  /* Expects to be always run from workqueue - which acts as
>   * read-size critical section for our kind of RCU. */
>  static void handle_tx(struct vhost_net *net)
> @@ -124,6 +132,8 @@ static void handle_tx(struct vhost_net *net)
>   tx_poll_stop(net);
>   hdr_size = vq->hdr_size;
>  
> + handle_async_tx_events_notify(net, vq);
> +
>   for (;;) {
>   head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
>ARRAY_SIZE(vq->iov),
> @@ -151,6 +161,12 @@ static void handle_tx(struct vhost_net *net)
>   /* Skip header. TODO: support TSO. */
>   s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
>   msg.msg_iovlen = out;
> +
> + if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> + vq->head = head;
> + msg.msg_control = (void *)vq;

So here a device gets a pointer to vhost_virtqueue structure. If it gets
an iocb and invokes a callback, it would not care about vhost internals.

> + }
> +
>   len = iov_length(vq->iov, out);
>   /* Sanity check */
>   if (!len) {
> @@ -166,6 +182,10 @@ static void handle_tx(struct vhost_net *net)
>   tx_poll_start(net, sock);
>   break;
>   }
> +
> + if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> + continue;
> +
>   if (err != len)
>   pr_err("Truncated TX packet: "
>  " len %d != %zd\n", err, len);
> @@ -177,6 +197,8 @@ static void handle_tx(struct vhost_net *net)
>   }
>   }
>  
> + handle_async_tx_events_notify(net, vq);
> +
>   mutex_unlock(&vq->mutex);
>   unuse_mm(net->dev.mm);
>  }
> @@ -206,7 +228,8 @@ static void handle_rx(struct vhost_net *net)
>   int err;
>   size_t hdr_size;
>   struct socket *sock = rcu_dereference(vq->private_data);
> - if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> + if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
> + vq->link_state == VHOST_VQ_LINK_SYNC))
>   return;
>  
>   use_mm(net->dev.mm);
> @@ -214,9 +237,18 @@ static void handle_rx(struct vhost_net *net)
>   vhost_disable_notify(vq);
>   hdr_size = vq->hdr_size;
>  
> - vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> + /* In async cases, for write logging, the simple way is to get
> +  * the log info always, and really logging is decided later.
> +  * Thus, when logging enabled, we can get log, and when logging
> +  * disabled, we can get log disabled accordingly.
> +  */
> +


This adds overhead and might be one of the reasons
your patch does not perform that well. A better way
would be to flush outsta

Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Ingo Molnar

* Avi Kivity  wrote:

> On 03/16/2010 12:50 PM, Ingo Molnar wrote:
> >
> >>I'm confused - initrd seems to be guest-side.  I was talking about the host
> >>side.
> >host side doesnt need much support - just some client capability in perf
> >itself. I suspect vmchannels are sufficiently flexible and configuration-free
> >for such purposes? (i.e. like a filesystem in essence)
> 
> I haven't followed vmchannel closely, but I think it is.  vmchannel is 
> terminated in qemu on the host side, not in the host kernel.  So perf would 
> need to connect to qemu.

Hm, that sounds rather messy if we want to use it to basically expose kernel 
functionality in a guest/host unified way. Is the qemu process discoverable in 
some secure way? Can we trust it? Is there some proper tooling available to do 
it, or do we have to push it through 2-3 packages to get such a useful feature 
done?

( That is the general thought process how many cross-discipline useful
  desktop/server features hit the bit bucket before having had any chance of
  being vetted by users, and why Linux sucks so much when it comes to feature
  integration and application usability. )

Ingo


Re: KVM call agenda for Mar 16

2010-03-16 Thread Avi Kivity

On 03/16/2010 12:45 PM, Daniel P. Berrange wrote:

On Tue, Mar 16, 2010 at 12:38:02PM +0200, Avi Kivity wrote:
   

On 03/16/2010 12:31 PM, Daniel P. Berrange wrote:
 

Polling loops are an indication that something is wrong.

 

Except when people suggest they are the right answer, qcow high
watermark ;-P

   

I liked Anthony's suggestion of an lvm2 block format driver.  No polling.
 

Doesn't that require giving QEMU privileges to perform LVM operations, which
implies QEMU having CAP_SYS_ADMIN?
   


Ouch.  I expect fd permissions on the volume are insufficient, and fd 
permissions on the group are excessive.




Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Avi Kivity

On 03/16/2010 12:50 PM, Ingo Molnar wrote:



I'm confused - initrd seems to be guest-side.  I was talking about the host
side.
 

host side doesnt need much support - just some client capability in perf
itself. I suspect vmchannels are sufficiently flexible and configuration-free
for such purposes? (i.e. like a filesystem in essence)
   


I haven't followed vmchannel closely, but I think it is.  vmchannel is 
terminated in qemu on the host side, not in the host kernel.  So perf 
would need to connect to qemu.




Re: [PATCH][RF C/T/D] Unmapped page cache control - via boot parameter

2010-03-16 Thread Avi Kivity

On 03/16/2010 12:44 PM, Christoph Hellwig wrote:

On Tue, Mar 16, 2010 at 12:36:31PM +0200, Avi Kivity wrote:
   

Are you talking about direct volume access or qcow2?
 

Doesn't matter.

   

For direct volume access, I still don't get it.  The number of barriers
issues by the host must equal (or exceed, but that's pointless) the
number of barriers issued by the guest.  cache=writeback allows the host
to reorder writes, but so does cache=none.  Where does the difference
come from?

Put it another way.  In an unvirtualized environment, if you implement a
write cache in a storage driver (not device), and sync it on a barrier
request, would you expect to see a performance improvement?
 

cache=none only allows very limited reordering in the host.  O_DIRECT
is synchronous on the host, so there's just some very limited reordering
going on in the elevator if we have other I/O going on in parallel.
   


Presumably there is lots of I/O going on, or we wouldn't be having this 
conversation.



In addition to that the disk writecache can perform limited reordering
and caching, but the disk cache has a rather limited size.  The host
pagecache gives a much wider opportunity to reorder, especially if
the guest workload is not cache flush heavy.  If the guest workload
is extremely cache flush heavy the usefulness of the pagecache is rather
limited, as we'll only use very little of it, but pay by having to do
a data copy.  If the workload is not cache flush heavy, and we have
multiple guests doing I/O to the same spindles, it will allow the host
to do much more efficient data writeout by being able to do better
ordered (less seeky) and bigger I/O (especially if the host has real
storage compared to ide for the guest).
   


Let's assume the guest has virtio (I agree with IDE we need reordering 
on the host).  The guest sends batches of I/O separated by cache 
flushes.  If the batches are smaller than the virtio queue length, 
ideally things look like:


     io_submit(..., batch_size_1);
     io_getevents(..., batch_size_1);
     fdatasync();
     io_submit(..., batch_size_2);
     io_getevents(..., batch_size_2);
     fdatasync();
     io_submit(..., batch_size_3);
     io_getevents(..., batch_size_3);
     fdatasync();

(certainly that won't happen today, but it could in principle).
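
In host terms this is just batched aio followed by a flush; with libaio it
would look something like the following (a sketch, error handling omitted):

#include <libaio.h>
#include <unistd.h>

/* submit one batch of writes, wait for it to complete, then flush:
   the host kernel sees the same ordering information a host-side
   write cache would have */
static void write_batch(io_context_t ctx, int fd,
                        struct iocb **cbs, int nr)
{
    struct io_event events[nr];

    io_submit(ctx, nr, cbs);
    io_getevents(ctx, nr, nr, events, NULL);
    fdatasync(fd);
}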

How does a write cache give any advantage?  The host kernel sees 
_exactly_ the same information as it would from a bunch of threaded 
pwritev()s followed by fdatasync().


(wish: IO_CMD_ORDERED_FDATASYNC)

If the batch size is larger than the virtio queue size, or if there are 
no flushes at all, then yes the huge write cache gives more opportunity 
for reordering.  But we're already talking hundreds of requests here.


Let's say the virtio queue size was unlimited.  What merging/reordering 
opportunity are we missing on the host?  Again we have exactly the same 
information: either the pagecache lru + radix tree that identifies all 
dirty pages in disk order, or the block queue with pending requests that 
contains exactly the same information.


Something is wrong.  Maybe it's my understanding, but on the other hand 
it may be a piece of kernel code.




[PATCH 5/6] qemu-kvm: Use cpu_physical_memory_set_dirty_range() to update phys_ram_dirty.

2010-03-16 Thread Yoshiaki Tamura
Modifies kvm_get_dirty_pages_log_range to use
cpu_physical_memory_set_dirty_range() to update a whole row of
the bit-based phys_ram_dirty bitmap at once.

Signed-off-by: Yoshiaki Tamura 
Signed-off-by: OHMURA Kei 
---
 qemu-kvm.c |   19 ++-
 1 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/qemu-kvm.c b/qemu-kvm.c
index e417f21..75fa9b0 100644
--- a/qemu-kvm.c
+++ b/qemu-kvm.c
@@ -2305,9 +2305,8 @@ static int kvm_get_dirty_pages_log_range(unsigned long 
start_addr,
  unsigned long offset,
  unsigned long mem_size)
 {
-unsigned int i, j;
-unsigned long page_number, addr, addr1, c;
-ram_addr_t ram_addr;
+unsigned int i;
+unsigned long page_number, addr, addr1;
 unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) /
 HOST_LONG_BITS;
 
@@ -2317,16 +2316,10 @@ static int kvm_get_dirty_pages_log_range(unsigned long 
start_addr,
  */
 for (i = 0; i < len; i++) {
 if (bitmap[i] != 0) {
-c = leul_to_cpu(bitmap[i]);
-do {
-j = ffsl(c) - 1;
-c &= ~(1ul << j);
-page_number = i * HOST_LONG_BITS + j;
-addr1 = page_number * TARGET_PAGE_SIZE;
-addr = offset + addr1;
-ram_addr = cpu_get_physical_page_desc(addr);
-cpu_physical_memory_set_dirty(ram_addr);
-} while (c != 0);
+page_number = i * HOST_LONG_BITS;
+addr1 = page_number * TARGET_PAGE_SIZE;
+addr = offset + addr1;
+cpu_physical_memory_set_dirty_range(addr, leul_to_cpu(bitmap[i]));
 }
 }
 return 0;
-- 
1.7.0.31.g1df487
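
cpu_physical_memory_set_dirty_range() itself is introduced by patch 2/6 and
is not shown here; a plausible shape, assuming 'start' is aligned to a
HOST_LONG_BITS-page boundary as in the caller above:

static inline void cpu_physical_memory_set_dirty_range(ram_addr_t start,
                                                       unsigned long dirty)
{
    /* 'dirty' is one host-long worth of page-dirty bits, already
       converted with leul_to_cpu() */
    int index = (start >> TARGET_PAGE_BITS) / HOST_LONG_BITS;

    phys_ram_vga_dirty[index] |= dirty;
    phys_ram_code_dirty[index] |= dirty;
    phys_ram_migration_dirty[index] |= dirty;
}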



[PATCH 1/6] qemu-kvm: Introduce bit-based phys_ram_dirty for VGA, CODE and MIGRATION.

2010-03-16 Thread Yoshiaki Tamura
Replaces byte-based phys_ram_dirty bitmap with 
three bit-based phys_ram_dirty bitmap.
On allocation, it sets all bits in the bitmap.

Signed-off-by: Yoshiaki Tamura 
Signed-off-by: OHMURA Kei 
---
 exec.c |   22 +-
 1 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/exec.c b/exec.c
index 9bcb4de..ba334e7 100644
--- a/exec.c
+++ b/exec.c
@@ -119,7 +119,9 @@ uint8_t *code_gen_ptr;
 
 #if !defined(CONFIG_USER_ONLY)
 int phys_ram_fd;
-uint8_t *phys_ram_dirty;
+unsigned long *phys_ram_vga_dirty;
+unsigned long *phys_ram_code_dirty;
+unsigned long *phys_ram_migration_dirty;
 uint8_t *bios_mem;
 static int in_migration;
 
@@ -2659,10 +2661,20 @@ ram_addr_t qemu_ram_alloc(ram_addr_t size)
 new_block->next = ram_blocks;
 ram_blocks = new_block;
 
-phys_ram_dirty = qemu_realloc(phys_ram_dirty,
-(last_ram_offset + size) >> TARGET_PAGE_BITS);
-memset(phys_ram_dirty + (last_ram_offset >> TARGET_PAGE_BITS),
-   0xff, size >> TARGET_PAGE_BITS);
+if (BITMAP_SIZE(last_ram_offset + size) != BITMAP_SIZE(last_ram_offset)) {
+phys_ram_vga_dirty = qemu_realloc(phys_ram_vga_dirty,
+BITMAP_SIZE(last_ram_offset + size));
+phys_ram_code_dirty = qemu_realloc(phys_ram_code_dirty,
+BITMAP_SIZE(last_ram_offset + size));
+phys_ram_migration_dirty = qemu_realloc(phys_ram_migration_dirty,
+BITMAP_SIZE(last_ram_offset + size));
+memset((uint8_t *)phys_ram_vga_dirty +
+   BITMAP_SIZE(last_ram_offset), 0xff, BITMAP_SIZE(size));
+memset((uint8_t *)phys_ram_code_dirty + 
+   BITMAP_SIZE(last_ram_offset), 0xff, BITMAP_SIZE(size));
+memset((uint8_t *)phys_ram_migration_dirty +
+   BITMAP_SIZE(last_ram_offset), 0xff, BITMAP_SIZE(size));
+}
 
 last_ram_offset += size;
 
-- 
1.7.0.31.g1df487
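
BITMAP_SIZE() is not defined in this hunk; a definition consistent with the
way it is used above would be (a sketch):

/* bytes needed for a one-bit-per-page bitmap covering 'size' bytes
   of RAM, rounded up to whole unsigned longs */
#define BITMAP_SIZE(size) \
    (((((size) >> TARGET_PAGE_BITS) + HOST_LONG_BITS - 1) / \
      HOST_LONG_BITS) * sizeof(unsigned long))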



[PATCH 6/6] qemu-kvm: Use cpu_physical_memory_get_dirty_range() to check multiple dirty pages.

2010-03-16 Thread Yoshiaki Tamura
Modifies ram_save_block() and ram_save_remaining() to use 
cpu_physical_memory_get_dirty_range() to check multiple dirty and non-dirty
pages at once.

Signed-off-by: Yoshiaki Tamura 
Signed-off-by: OHMURA Kei 
---
 vl.c |   55 +++
 1 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/vl.c b/vl.c
index 6e35cc6..e9ad7c9 100644
--- a/vl.c
+++ b/vl.c
@@ -2779,7 +2779,8 @@ static int ram_save_block(QEMUFile *f)
 static ram_addr_t current_addr = 0;
 ram_addr_t saved_addr = current_addr;
 ram_addr_t addr = 0;
-int found = 0;
+ram_addr_t dirty_rams[HOST_LONG_BITS];
+int i, found = 0;
 
 while (addr < last_ram_offset) {
 if (kvm_enabled() && current_addr == 0) {
@@ -2791,28 +2792,35 @@ static int ram_save_block(QEMUFile *f)
 return 0;
 }
 }
-if (cpu_physical_memory_get_dirty(current_addr, MIGRATION_DIRTY_FLAG)) 
{
+if ((found = cpu_physical_memory_get_dirty_range(
+ current_addr, last_ram_offset, dirty_rams, HOST_LONG_BITS,
+ MIGRATION_DIRTY_FLAG))) {
 uint8_t *p;
 
-cpu_physical_memory_reset_dirty(current_addr,
-current_addr + TARGET_PAGE_SIZE,
-MIGRATION_DIRTY_FLAG);
+for (i = 0; i < found; i++) {
+ram_addr_t page_addr = dirty_rams[i];
+cpu_physical_memory_reset_dirty(page_addr,
+page_addr + TARGET_PAGE_SIZE,
+MIGRATION_DIRTY_FLAG);
 
-p = qemu_get_ram_ptr(current_addr);
+p = qemu_get_ram_ptr(page_addr);
 
-if (is_dup_page(p, *p)) {
-qemu_put_be64(f, current_addr | RAM_SAVE_FLAG_COMPRESS);
-qemu_put_byte(f, *p);
-} else {
-qemu_put_be64(f, current_addr | RAM_SAVE_FLAG_PAGE);
-qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
+if (is_dup_page(p, *p)) {
+qemu_put_be64(f, (page_addr) |
+  RAM_SAVE_FLAG_COMPRESS);
+qemu_put_byte(f, *p);
+} else {
+qemu_put_be64(f, (page_addr) |
+  RAM_SAVE_FLAG_PAGE);
+qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
+}
 }
-
-found = 1;
+   
 break;
+} else {
+addr += dirty_rams[0];
+current_addr = (saved_addr + addr) % last_ram_offset;
 }
-addr += TARGET_PAGE_SIZE;
-current_addr = (saved_addr + addr) % last_ram_offset;
 }
 
 return found;
@@ -2822,12 +2830,19 @@ static uint64_t bytes_transferred;
 
 static ram_addr_t ram_save_remaining(void)
 {
-ram_addr_t addr;
+ram_addr_t addr = 0;
 ram_addr_t count = 0;
+ram_addr_t dirty_rams[HOST_LONG_BITS];
+int found = 0;
 
-for (addr = 0; addr < last_ram_offset; addr += TARGET_PAGE_SIZE) {
-if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG))
-count++;
+while (addr < last_ram_offset) {
+if ((found = cpu_physical_memory_get_dirty_range(addr, last_ram_offset,
+dirty_rams, HOST_LONG_BITS, MIGRATION_DIRTY_FLAG))) {
+count += found;
+addr = dirty_rams[found - 1] + TARGET_PAGE_SIZE;
+} else {
+addr += dirty_rams[0];
+}
 }
 
 return count;
-- 
1.7.0.31.g1df487



[PATCH 3/6] qemu-kvm: Replace direct phys_ram_dirty access with wrapper functions.

2010-03-16 Thread Yoshiaki Tamura
Replaces direct phys_ram_dirty access with wrapper functions to prevent
direct access to the phys_ram_dirty bitmap.

Signed-off-by: Yoshiaki Tamura 
Signed-off-by: OHMURA Kei 
---
 exec.c |   45 -
 1 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/exec.c b/exec.c
index ba334e7..b31c349 100644
--- a/exec.c
+++ b/exec.c
@@ -1946,7 +1946,7 @@ static void tlb_protect_code(ram_addr_t ram_addr)
 static void tlb_unprotect_code_phys(CPUState *env, ram_addr_t ram_addr,
 target_ulong vaddr)
 {
-phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] |= CODE_DIRTY_FLAG;
+cpu_physical_memory_set_dirty_flags(ram_addr, CODE_DIRTY_FLAG);
 }
 
 static inline void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry,
@@ -1967,8 +1967,7 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, 
ram_addr_t end,
 {
 CPUState *env;
 unsigned long length, start1;
-int i, mask, len;
-uint8_t *p;
+int i;
 
 start &= TARGET_PAGE_MASK;
 end = TARGET_PAGE_ALIGN(end);
@@ -1976,11 +1975,7 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, 
ram_addr_t end,
 length = end - start;
 if (length == 0)
 return;
-len = length >> TARGET_PAGE_BITS;
-mask = ~dirty_flags;
-p = phys_ram_dirty + (start >> TARGET_PAGE_BITS);
-for(i = 0; i < len; i++)
-p[i] &= mask;
+cpu_physical_memory_mask_dirty_range(start, length, dirty_flags);
 
 /* we modify the TLB cache so that the dirty bit will be set again
when accessing the range */
@@ -2837,16 +2832,16 @@ static void notdirty_mem_writeb(void *opaque, 
target_phys_addr_t ram_addr,
 uint32_t val)
 {
 int dirty_flags;
-dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
+dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr);
 if (!(dirty_flags & CODE_DIRTY_FLAG)) {
 #if !defined(CONFIG_USER_ONLY)
 tb_invalidate_phys_page_fast(ram_addr, 1);
-dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
+dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr);
 #endif
 }
 stb_p(qemu_get_ram_ptr(ram_addr), val);
 dirty_flags |= (0xff & ~CODE_DIRTY_FLAG);
-phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags;
+cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags);
 /* we remove the notdirty callback only if the code has been
flushed */
 if (dirty_flags == 0xff)
@@ -2857,16 +2852,16 @@ static void notdirty_mem_writew(void *opaque, 
target_phys_addr_t ram_addr,
 uint32_t val)
 {
 int dirty_flags;
-dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
+dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr);
 if (!(dirty_flags & CODE_DIRTY_FLAG)) {
 #if !defined(CONFIG_USER_ONLY)
 tb_invalidate_phys_page_fast(ram_addr, 2);
-dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
+dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr);
 #endif
 }
 stw_p(qemu_get_ram_ptr(ram_addr), val);
 dirty_flags |= (0xff & ~CODE_DIRTY_FLAG);
-phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags;
+cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags);
 /* we remove the notdirty callback only if the code has been
flushed */
 if (dirty_flags == 0xff)
@@ -2877,16 +2872,16 @@ static void notdirty_mem_writel(void *opaque, 
target_phys_addr_t ram_addr,
 uint32_t val)
 {
 int dirty_flags;
-dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
+dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr);
 if (!(dirty_flags & CODE_DIRTY_FLAG)) {
 #if !defined(CONFIG_USER_ONLY)
 tb_invalidate_phys_page_fast(ram_addr, 4);
-dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS];
+dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr);
 #endif
 }
 stl_p(qemu_get_ram_ptr(ram_addr), val);
 dirty_flags |= (0xff & ~CODE_DIRTY_FLAG);
-phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags;
+cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags);
 /* we remove the notdirty callback only if the code has been
flushed */
 if (dirty_flags == 0xff)
@@ -3337,8 +3332,8 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, 
uint8_t *buf,
 /* invalidate code */
 tb_invalidate_phys_page_range(addr1, addr1 + l, 0);
 /* set dirty bit */
-phys_ram_dirty[addr1 >> TARGET_PAGE_BITS] |=
-(0xff & ~CODE_DIRTY_FLAG);
+cpu_physical_memory_set_dirty_flags(
+addr1, (0xff & ~CODE_DIRTY_FLAG));
 }
/* qemu doesn't execute guest code directly, but kvm does
   therefore flush instruction caches */
@@ -3551,8 +354

[PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.

2010-03-16 Thread Yoshiaki Tamura
Modifies wrapper functions for byte-based phys_ram_dirty bitmap to
bit-based phys_ram_dirty bitmap, and adds more wrapper functions to prevent
direct access to the phys_ram_dirty bitmap.

Signed-off-by: Yoshiaki Tamura 
Signed-off-by: OHMURA Kei 
---
 cpu-all.h |   94 ++--
 1 files changed, 90 insertions(+), 4 deletions(-)

diff --git a/cpu-all.h b/cpu-all.h
index 9bc01b9..91ec3e5 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -843,7 +843,9 @@ int cpu_str_to_log_mask(const char *str);
 /* memory API */
 
 extern int phys_ram_fd;
-extern uint8_t *phys_ram_dirty;
+extern unsigned long *phys_ram_vga_dirty;
+extern unsigned long *phys_ram_code_dirty;
+extern unsigned long *phys_ram_migration_dirty;
 extern ram_addr_t ram_size;
 extern ram_addr_t last_ram_offset;
 extern uint8_t *bios_mem;
@@ -879,20 +881,104 @@ int cpu_memory_rw_debug(CPUState *env, target_ulong addr,
 /* read dirty bit (return 0 or 1) */
 static inline int cpu_physical_memory_is_dirty(ram_addr_t addr)
 {
-return phys_ram_dirty[addr >> TARGET_PAGE_BITS] == 0xff;
+unsigned long mask;
+int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
+int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1);
+
+mask = 1UL << offset;
+return (phys_ram_vga_dirty[index]  &
+phys_ram_code_dirty[index] &
+phys_ram_migration_dirty[index] & mask) == mask;
+}
+
+static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr)
+{
+unsigned long mask;
+int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
+int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1);
+int ret = 0;
+
+mask = 1UL << offset;
+if (phys_ram_vga_dirty[index] & mask)
+ret |= VGA_DIRTY_FLAG;
+if (phys_ram_code_dirty[index] & mask)
+ret |=  CODE_DIRTY_FLAG;
+if (phys_ram_migration_dirty[index] & mask)
+ret |= MIGRATION_DIRTY_FLAG;
+
+return ret;
 }
 
 static inline int cpu_physical_memory_get_dirty(ram_addr_t addr,
 int dirty_flags)
 {
-return phys_ram_dirty[addr >> TARGET_PAGE_BITS] & dirty_flags;
+return cpu_physical_memory_get_dirty_flags(addr) & dirty_flags;
 }
 
 static inline void cpu_physical_memory_set_dirty(ram_addr_t addr)
 {
-phys_ram_dirty[addr >> TARGET_PAGE_BITS] = 0xff;
+unsigned long mask;
+int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
+int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1);
+
+mask = 1UL << offset;
+phys_ram_vga_dirty[index] |= mask;
+phys_ram_code_dirty[index] |= mask;
+phys_ram_migration_dirty[index] |= mask;
+}
+
+static inline void cpu_physical_memory_set_dirty_range(ram_addr_t addr,
+   unsigned long mask)
+{
+int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
+
+phys_ram_vga_dirty[index] |= mask;
+phys_ram_code_dirty[index] |= mask;
+phys_ram_migration_dirty[index] |= mask;
 }
 
+static inline void cpu_physical_memory_set_dirty_flags(ram_addr_t addr,
+   int dirty_flags)
+{
+unsigned long mask;
+int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
+int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1);
+
+mask = 1UL << offset;
+if (dirty_flags & VGA_DIRTY_FLAG)
+phys_ram_vga_dirty[index] |= mask;
+if (dirty_flags & CODE_DIRTY_FLAG)
+phys_ram_code_dirty[index] |= mask;
+if (dirty_flags & MIGRATION_DIRTY_FLAG)
+phys_ram_migration_dirty[index] |= mask;
+}
+
+static inline void cpu_physical_memory_mask_dirty_range(ram_addr_t start,
+int length,
+int dirty_flags)
+{
+ram_addr_t addr = start;
+unsigned long mask;
+int index, offset, i;
+
+for (i = 0;  i < length; i += TARGET_PAGE_SIZE) {
+index = ((addr + i) >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
+offset = ((addr + i) >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1);
+mask = ~(1UL << offset);
+
+if (dirty_flags & VGA_DIRTY_FLAG)
+phys_ram_vga_dirty[index] &= mask;
+if (dirty_flags & CODE_DIRTY_FLAG)
+phys_ram_code_dirty[index] &= mask;
+if (dirty_flags & MIGRATION_DIRTY_FLAG)
+phys_ram_migration_dirty[index] &= mask;
+ }
+}
+
+int cpu_physical_memory_get_dirty_range(ram_addr_t start, ram_addr_t end, 
+ram_addr_t *dirty_rams, int length,
+int dirty_flags);
+
 void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
  int dirty_flags);
 void cpu_tlb_update_dirty(CPUState *env);
-- 
1.7.0.31.g1df487
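
As an aside for readers of this series: the index/offset arithmetic used
throughout these wrappers is easy to check in isolation. Below is a minimal
standalone sketch (not code from the patch) of how a RAM offset maps to a
word index and bit offset in a bit-per-page bitmap; TARGET_PAGE_BITS = 12
(4KB target pages) is an assumption here.

/* Standalone illustration, not from the patch: mapping a guest RAM
 * offset to (word index, bit offset) in a bit-per-page dirty bitmap. */
#include <stdio.h>

#define TARGET_PAGE_BITS 12                          /* assumed 4KB pages */
#define HOST_LONG_BITS   (8 * (int)sizeof(unsigned long))

int main(void)
{
    unsigned long addr = 0x12345000UL;               /* hypothetical offset */
    unsigned long page = addr >> TARGET_PAGE_BITS;   /* page number */
    unsigned long index = page / HOST_LONG_BITS;     /* which bitmap word */
    unsigned long offset = page & (HOST_LONG_BITS - 1); /* bit within word */
    unsigned long mask = 1UL << offset;

    printf("page %lu -> word %lu, bit %lu (mask %#lx)\n",
           page, index, offset, mask);
    return 0;
}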


[PATCH 4/6] qemu-kvm: Introduce cpu_physical_memory_get_dirty_range().

2010-03-16 Thread Yoshiaki Tamura
Introduces cpu_physical_memory_get_dirty_range().
It checks the first row and puts the dirty page addresses in the array.
If the first row is empty, it skips ahead to the first dirty row or to
the end addr, and puts the skipped length in the first entry of the array.

Signed-off-by: Yoshiaki Tamura 
Signed-off-by: OHMURA Kei 
---
 exec.c |   73 
 1 files changed, 73 insertions(+), 0 deletions(-)

diff --git a/exec.c b/exec.c
index b31c349..87056a6 100644
--- a/exec.c
+++ b/exec.c
@@ -1961,6 +1961,79 @@ static inline void tlb_reset_dirty_range(CPUTLBEntry 
*tlb_entry,
 }
 }
 
+/* It checks the first row and puts the dirty page addresses in the array.
+   If the first row is empty, it skips ahead to the first dirty row or to
+   the end addr, and puts the skipped length in the array's first entry. */
+int cpu_physical_memory_get_dirty_range(ram_addr_t start, ram_addr_t end, 
+ram_addr_t *dirty_rams, int length,
+int dirty_flag)
+{
+unsigned long phys_ram_dirty, page_number, *p;
+ram_addr_t addr;
+int s_idx = (start >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
+int e_idx = (end >> TARGET_PAGE_BITS) / HOST_LONG_BITS;
+int i, j, offset;
+
+switch (dirty_flag) {
+case VGA_DIRTY_FLAG:
+p = phys_ram_vga_dirty;
+break;
+case CODE_DIRTY_FLAG:
+p = phys_ram_code_dirty;
+break;
+case MIGRATION_DIRTY_FLAG:
+p = phys_ram_migration_dirty;
+break;
+default:
+abort();
+}
+
+/* mask bits before the start addr */
+offset = (start >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1);
+phys_ram_dirty = p[s_idx] & ~((1UL << offset) - 1);
+
+if (s_idx == e_idx) {
+/* mask bits after the end addr */
+offset = (end >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1);
+phys_ram_dirty &= (1UL << offset) - 1;
+}
+
+if (phys_ram_dirty == 0) {
+/* when the row is empty */
+ram_addr_t skip;
+if (s_idx == e_idx)
+skip = end;
+else {
+/* skip empty rows */
+while (s_idx < e_idx && p[++s_idx] == 0);
+skip = (s_idx * HOST_LONG_BITS * TARGET_PAGE_SIZE);
+}
+dirty_rams[0] = skip - start;
+i = 0;
+
+} else if (phys_ram_dirty == ~0UL) {
+/* when the row is fully dirtied */
+addr = start;
+for (i = 0; i < length; i++) {
+dirty_rams[i] = addr;
+addr += TARGET_PAGE_SIZE;
+}
+} else {
+/* when the row is partially dirtied */
+i = 0;
+do {
+j = ffsl(phys_ram_dirty) - 1;
+phys_ram_dirty &= ~(1UL << j);
+page_number = s_idx * HOST_LONG_BITS + j;
+addr = page_number * TARGET_PAGE_SIZE;
+dirty_rams[i] = addr;
+i++;
+} while (phys_ram_dirty != 0 && i < length);
+}
+
+return i;
+}
+
 /* Note: start and end must be within the same ram block.  */
 void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
  int dirty_flags)
-- 
1.7.0.31.g1df487
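
To see how the return convention is meant to be consumed (found > 0: the
array holds dirty page addresses; found == 0: dirty_rams[0] holds the clean
length to skip), here is a hedged restatement of the ram_save_remaining()
loop from earlier in this series as a standalone helper. It assumes the
qemu globals and macros the series relies on (last_ram_offset,
TARGET_PAGE_SIZE, HOST_LONG_BITS, MIGRATION_DIRTY_FLAG); the helper name is
invented for illustration.

static ram_addr_t count_migration_dirty_pages(void)
{
    ram_addr_t dirty_rams[HOST_LONG_BITS];
    ram_addr_t addr = 0, count = 0;
    int found;

    while (addr < last_ram_offset) {
        found = cpu_physical_memory_get_dirty_range(addr, last_ram_offset,
                                                    dirty_rams, HOST_LONG_BITS,
                                                    MIGRATION_DIRTY_FLAG);
        if (found) {
            /* dirty_rams[] holds up to HOST_LONG_BITS dirty page addrs */
            count += found;
            addr = dirty_rams[found - 1] + TARGET_PAGE_SIZE;
        } else {
            /* the whole row was clean; dirty_rams[0] is the skip length */
            addr += dirty_rams[0];
        }
    }
    return count;
}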



[PATCH 0/6] qemu-kvm: Introduce bit-based phys_ram_dirty, and bit-based dirty page checker.

2010-03-16 Thread Yoshiaki Tamura

Dirty and non-dirty pages are currently checked one by one in vl.c.
When most of the memory is not dirty, checking multiple pages at a
time (one host long's worth of pages per step) should be much faster
than checking them one by one.  To this end we introduce bit-based
phys_ram_dirty bitmaps for VGA, CODE and MIGRATION, and
cpu_physical_memory_get_dirty_range().
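
To make that concrete, here is a minimal sketch (illustrative only, not
code from the series) of why a bit per page scans faster than a byte of
flags per page when most memory is clean; it assumes pages is a multiple
of HOST_LONG_BITS.

#include <stddef.h>

#define HOST_LONG_BITS (8 * (int)sizeof(unsigned long))

/* Byte-based: one load and test per page, dirty or not. */
static size_t count_dirty_bytes(const unsigned char *flags, size_t pages)
{
    size_t i, count = 0;
    for (i = 0; i < pages; i++)
        count += (flags[i] != 0);
    return count;
}

/* Bit-based: one load covers HOST_LONG_BITS pages; a clean word costs a
 * single compare, and only dirty words pay per-bit work. */
static size_t count_dirty_bits(const unsigned long *bits, size_t pages)
{
    size_t i, count = 0;
    for (i = 0; i < pages / HOST_LONG_BITS; i++) {
        unsigned long w = bits[i];
        while (w) {
            w &= w - 1;   /* clear the lowest set bit */
            count++;
        }
    }
    return count;
}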

This patch is based on the following discussion.

http://www.mail-archive.com/kvm@vger.kernel.org/msg28733.html

To verify this expectation, we evaluated the effect of this patch set.
We compared the runtime of the original ram_save_remaining() against a
version rewritten to use the functions introduced here.

Test Environment:
CPU: 4x Intel Xeon Quad Core 2.66GHz
Mem size: 96GB
kvm version: 2.6.33
qemu-kvm version: commit 2b644fd0e737407133c88054ba498e772ce01f27

Host OS: CentOS (kernel 2.6.33)
Guest OS: Debian/GNU Linux lenny (kernel 2.6.26)
Guest Mem size: 512MB

The experimental conditions are as follows:
Cond1: Guest OS periodically dirties 256MB of contiguous pages.
Cond2: Guest OS periodically dirties 256MB of pages, alternating between
dirty and non-dirty pages.
Cond3: Guest OS reads a 1GB file, which is bigger than guest memory.
Cond4: Guest OS writes a 1GB file, which is bigger than guest memory.

Experimental results:
Cond1: 1.9 ~ 61 times speed up
Cond2: 1.9 ~ 56 times speed up
Cond3: 1.9 ~ 59 times speed up
Cond4: 1.7 ~ 59 times speed up


Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Ingo Molnar

* Avi Kivity  wrote:

> On 03/16/2010 12:20 PM, Ingo Molnar wrote:
> 
> >>>>The symbol server's client can certainly access the bits through
> >>>>vmchannel.
> >>>Ok, that would work i suspect.
> >>>
> >>>Would be nice to have the symbol server in tools/perf/ and also make it 
> >>>easy
> >>>to add it to the initrd via a .config switch or so.
> >>>
> >>>That would have basically all of the advantages of being built into the 
> >>>kernel
> >>>(availability, configurability, transparency, hackability), while having 
> >>>all
> >>>the advantages of a user-space approach as well (flexibility, 
> >>>extensibility,
> >>>robustness, ease of maintenance, etc.).
> >>Note, I am not advocating building the vmchannel client into the host
> >>kernel. [...]
> >Neither am i. What i suggested was a user-space binary/executable built in
> >tools/perf and put into the initrd.
> 
> I'm confused - initrd seems to be guest-side.  I was talking about the host 
> side.

host side doesn't need much support - just some client capability in perf 
itself. I suspect vmchannels are sufficiently flexible and configuration-free 
for such purposes? (i.e. like a filesystem in essence)

Ingo


Re: KVM call agenda for Mar 16

2010-03-16 Thread Daniel P. Berrange
On Tue, Mar 16, 2010 at 12:38:02PM +0200, Avi Kivity wrote:
> On 03/16/2010 12:31 PM, Daniel P. Berrange wrote:
> >>Polling loops are an indication that something is wrong.
> >> 
> >Except when people suggest they are the right answer, qcow high
> >watermark ;-P
> >   
> 
> I liked Anthony's suggestion of an lvm2 block format driver.  No polling.

Doesn't that require giving QEMU privileges to perform LVM operations, which
implies QEMU having CAP_SYS_ADMIN?


Daniel
-- 
|: Red Hat, Engineering, London-o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :|
|: http://autobuild.org-o- http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-   F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|


Re: [PATCH][RF C/T/D] Unmapped page cache control - via boot parameter

2010-03-16 Thread Christoph Hellwig
On Tue, Mar 16, 2010 at 12:36:31PM +0200, Avi Kivity wrote:
> Are you talking about direct volume access or qcow2?

Doesn't matter.

> For direct volume access, I still don't get it.  The number of barriers 
> issued by the host must equal (or exceed, but that's pointless) the 
> number of barriers issued by the guest.  cache=writeback allows the host 
> to reorder writes, but so does cache=none.  Where does the difference 
> come from?
> 
> Put it another way.  In an unvirtualized environment, if you implement a 
> write cache in a storage driver (not device), and sync it on a barrier 
> request, would you expect to see a performance improvement?

cache=none only allows very limited reordering in the host.  O_DIRECT
is synchronous on the host, so there's just some very limited reordering
going on in the elevator if we have other I/O going on in parallel.
In addition to that the disk writecache can perform limited reordering
and caching, but the disk cache has a rather limited size.  The host
pagecache gives a much wider opportunity to reorder, especially if
the guest workload is not cache flush heavy.  If the guest workload
is extremely cache flush heavy the usefulness of the pagecache is rather
limited, as we'll only use very little of it, but pay by having to do
a data copy.  If the workload is not cache flush heavy, and we have
multiple guests doing I/O to the same spindles, it will allow the host
to do much more efficient data writeout by being able to do better
ordered (less seeky) and bigger I/O (especially if the host has real
storage compared to ide for the guest).

> >If you don't have a lot of cache flushes, e.g. due to dumb
> >applications that do not issue fsync, or even run ext3 in its default
> >mode, which never issues cache flushes, the benefit will be enormous,
> >but so will the data loss and possible corruption.
> >   
> 
> Shouldn't the host never issue cache flushes in this case? (for direct 
> volume access; qcow2 still needs flushes for metadata integrity).

If the guest never issues a flush the host will neither, indeed.  Data
will only go to disk by background writeout or memory pressure.



Re: KVM call agenda for Mar 16

2010-03-16 Thread Christoph Hellwig
On Tue, Mar 16, 2010 at 12:38:02PM +0200, Avi Kivity wrote:
> On 03/16/2010 12:31 PM, Daniel P. Berrange wrote:
>>> Polling loops are an indication that something is wrong.
>>>  
>> Except when people suggest they are the right answer, qcow high
>> watermark ;-P
>>
>
> I liked Anthony's suggestion of an lvm2 block format driver.  No polling.

I have done some work on linking the new lvm library to qemu to control
snapshotting.  But introducing a whole new block format seems like a lot
of duplication to me.



Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Avi Kivity

On 03/16/2010 12:20 PM, Ingo Molnar wrote:


>>>> The symbol server's client can certainly access the bits through vmchannel.
>>> Ok, that would work i suspect.
>>>
>>> Would be nice to have the symbol server in tools/perf/ and also make it easy
>>> to add it to the initrd via a .config switch or so.
>>>
>>> That would have basically all of the advantages of being built into the kernel
>>> (availability, configurability, transparency, hackability), while having all
>>> the advantages of a user-space approach as well (flexibility, extensibility,
>>> robustness, ease of maintenance, etc.).
>> Note, I am not advocating building the vmchannel client into the host
>> kernel. [...]
> Neither am i. What i suggested was a user-space binary/executable built in
> tools/perf and put into the initrd.

I'm confused - initrd seems to be guest-side.  I was talking about the
host side.

For the guest, placing the symbol server in tools/ is reasonable.

--
error compiling committee.c: too many arguments to function



Re: KVM call agenda for Mar 16

2010-03-16 Thread Avi Kivity

On 03/16/2010 12:31 PM, Daniel P. Berrange wrote:

>> Polling loops are an indication that something is wrong.
> Except when people suggest they are the right answer, qcow high
> watermark ;-P

I liked Anthony's suggestion of an lvm2 block format driver.  No polling.

--
error compiling committee.c: too many arguments to function



Re: [PATCH][RF C/T/D] Unmapped page cache control - via boot parameter

2010-03-16 Thread Avi Kivity

On 03/16/2010 12:26 PM, Christoph Hellwig wrote:

> Avi,
>
> cache=writeback can be faster than cache=none for the same reasons
> a disk cache speeds up access.  As long as the I/O mix contains more
> asynchronous than synchronous writes it allows the host to do much
> more reordering, only limited by the cache size (which can be quite
> huge when using the host pagecache) and the amount of cache flushes
> coming from the host.  If you have a fsync heavy workload or metadata
> operations with a filesystem like the current XFS you will get lots
> of cache flushes that make the use of the additional cache limited.

Are you talking about direct volume access or qcow2?

For direct volume access, I still don't get it.  The number of barriers
issued by the host must equal (or exceed, but that's pointless) the
number of barriers issued by the guest.  cache=writeback allows the host
to reorder writes, but so does cache=none.  Where does the difference
come from?

Put it another way.  In an unvirtualized environment, if you implement a
write cache in a storage driver (not device), and sync it on a barrier
request, would you expect to see a performance improvement?

> If you don't have a lot of cache flushes, e.g. due to dumb
> applications that do not issue fsync, or even run ext3 in its default
> mode, which never issues cache flushes, the benefit will be enormous,
> but so will the data loss and possible corruption.

Shouldn't the host never issue cache flushes in this case? (for direct
volume access; qcow2 still needs flushes for metadata integrity).

> But even for something like btrfs that does provide data integrity
> but issues cache flushes fairly efficiently, data=writeback may
> provide a quite nice speedup, especially if using multiple guests
> accessing the same spindle(s).
>
> But I wouldn't be surprised if IBM's extreme differences are indeed due
> to the extremely unsafe ext3 default behaviour.



--
error compiling committee.c: too many arguments to function



Re: KVM call agenda for Mar 16

2010-03-16 Thread Daniel P. Berrange
On Tue, Mar 16, 2010 at 11:43:48AM +0200, Avi Kivity wrote:
> On 03/16/2010 11:29 AM, Daniel P. Berrange wrote:
> >On Tue, Mar 16, 2010 at 10:18:03AM +0100, Juan Quintela wrote:
> >   
> >>Chris Wright  wrote:
> >> 
> >>>Please send in any agenda items you are interested in covering.
> >>>   
> >>Migration:
> >>- flexible migration:  I hope to send an RFC patch in time for the
> >>   call.  idea is to use subsections.
> >>
> >>- callbacks.  block migration introduced several callbacks:
> >>   * cancel()
> >>   * get_status()
> >>   * release()
> >>   in spice we now need another two callbacks: on_start() and on_end().
> >>* on_start(): tells spice that migration has started (it will then
> >>  manage certificates, passwords, ... itself)
> >>* on_end(): it is called when migration ends.  spice uses it to
> >>  transparently connect to the new host so the user doesn't have to
> >>  "reconnect"
> >>
> >>- what to do on migration error:
> >>   - target side:  libvirt folks want the program to print a message if
> >> it fails.  Current code spends 100% cpu time doing select on a closed
> >> fd.  (patches already on the list to make it wait without using
> >> cpu).
> >> 
> >No, that is not correct. We want QEMU to exit when incoming migration
> >fails. Printing to stderr is just something that will end up in the
> >logs for admin to further diagnose the problem if required. There is
> >nothing to be gained by leaving QEMU running, and everything to loose
> >since the failed migration may have left it in a dangerous state from
> >which you do not want to attempt incoming migration again.
> >
> >If we really want to leave it running when migration fails, then we're
> >going to have to add yet another QMP event to inform libvirt when
> >migration has finished/failed, and/or make 'query_migrate' work on
> >the destination too.
> >   
> 
> A qmp event seems the logical thing to do?  Exiting can happen for many 
> reasons, a qmp event is unambiguous.

Yes, for the QEMU upstream adding an event is more flexible. I had
originally suggested exiting in the context of the Fedora bug report
which was for QEMU 0.10.x which has no events capability.

> >
> >Incidentally I have a feeling we might need to introduce a migration
> >event in QMP. Currently libvirt polls on the 'query_migrate' command
> >to get the ongoing migration status. This means there can be a delay
> >in detecting completion as long as the polling interval - for this
> >reason we just dropped libvirt's polling time from 1/2 sec to 50ms
> >to ensure prompt detection.
> >   
> 
> Whenever you implement a polling loop, can you send an event to qemu-de...@?

Yep, sure thing. This is the only polling loop that isn't related to I/O
stats collection.

> 
> Polling loops are an indication that something is wrong.

Except when people suggest they are the right answer, qcow high 
watermark ;-P

Regards,
Daniel
-- 
|: Red Hat, Engineering, London-o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :|
|: http://autobuild.org-o- http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-   F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|
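
For readers following the callback discussion in Juan's agenda above, a
hypothetical rendering of the extended callback table might look like the
sketch below. Only cancel/get_status/release correspond to the existing
block-migration callbacks; every name and signature here is an illustrative
stand-in, not actual qemu code.

typedef struct MigrationState MigrationState;

struct MigrationState {
    void (*cancel)(MigrationState *s);      /* abort an ongoing migration */
    int  (*get_status)(MigrationState *s);  /* poll migration progress */
    void (*release)(MigrationState *s);     /* free migration resources */
    /* the two additions spice asks for: */
    void (*on_start)(MigrationState *s);    /* migration has started */
    void (*on_end)(MigrationState *s);      /* migration has completed */
};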


Re: [PATCH][RF C/T/D] Unmapped page cache control - via boot parameter

2010-03-16 Thread Christoph Hellwig
Avi,

cache=writeback can be faster than cache=none for the same reasons
a disk cache speeds up access.  As long as the I/O mix contains more
asynchronous than synchronous writes it allows the host to do much
more reordering, only limited by the cache size (which can be quite
huge when using the host pagecache) and the amount of cache flushes
coming from the host.  If you have a fsync heavy workload or metadata
operations with a filesystem like the current XFS you will get lots
of cache flushes that make the use of the additional cache limited.

If you don't have a lot of cache flushes, e.g. due to dumb
applications that do not issue fsync, or even run ext3 in its default
mode, which never issues cache flushes, the benefit will be enormous,
but so will the data loss and possible corruption.

But even for something like btrfs that does provide data integrity
but issues cache flushes fairly efficiently, data=writeback may
provide a quite nice speedup, especially if using multiple guests
accessing the same spindle(s).

But I wouldn't be surprised if IBM's extreme differences are indeed due
to the extremely unsafe ext3 default behaviour.
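
As background for this thread, a rough sketch of what the cache= modes
under discussion mean at open(2) time is below. It simplifies qemu's actual
block layer considerably, and the helper name is invented for illustration.

#define _GNU_SOURCE   /* for O_DIRECT on Linux */
#include <fcntl.h>
#include <string.h>

static int open_disk_image(const char *path, const char *cache_mode)
{
    int flags = O_RDWR;

    if (strcmp(cache_mode, "none") == 0)
        flags |= O_DIRECT;   /* bypass the host page cache entirely */
    else if (strcmp(cache_mode, "writethrough") == 0)
        flags |= O_DSYNC;    /* every write reaches stable storage */
    /* "writeback": plain cached I/O; a guest FLUSH becomes fsync() */

    return open(path, flags);
}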


Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Ingo Molnar

* Avi Kivity  wrote:

> On 03/16/2010 11:53 AM, Ingo Molnar wrote:
> >* Avi Kivity  wrote:
> >
> >>On 03/16/2010 09:24 AM, Ingo Molnar wrote:
> >>>* Avi Kivity   wrote:
> >>>
> On 03/16/2010 07:27 AM, Zhang, Yanmin wrote:
> >From: Zhang, Yanmin
> >
> >Based on the discussion in KVM community, I worked out the patch to 
> >support
> >perf to collect guest os statistics from host side. This patch is 
> >implemented
> >with Ingo, Peter and some other guys' kind help. Yang Sheng pointed out a
> >critical bug and provided good suggestions with other guys. I really 
> >appreciate
> >their kind help.
> >
> >The patch adds new subcommand kvm to perf.
> >
> >   perf kvm top
> >   perf kvm record
> >   perf kvm report
> >   perf kvm diff
> >
> >The new perf could profile guest os kernel except guest os user space, 
> >but it
> >could summarize guest os user space utilization per guest os.
> >
> >Below are some examples.
> >1) perf kvm top
> >[r...@lkp-ne01 norm]# perf kvm --host --guest 
> >--guestkallsyms=/home/ymzhang/guest/kallsyms
> >--guestmodules=/home/ymzhang/guest/modules top
> >
> Excellent, support for guest kernel != host kernel is critical (I
> can't remember the last time I ran same kernels).
> 
> How would we support multiple guests with different kernels? Perhaps a
> symbol server that perf can connect to (and that would connect to guests 
> in
> turn)?
> >>>The highest quality solution would be if KVM offered a 'guest extension' to
> >>>the guest kernel's /proc/kallsyms that made it easy for user-space to get 
> >>>this
> >>>information from an authoritative source.
> >>>
> >>>That's the main reason why the host side /proc/kallsyms is so popular and 
> >>>so
> >>>useful: while in theory it's mostly redundant information which can be 
> >>>gleaned
> >>>from the System.map and other sources of symbol information, it's easily
> >>>available and is _always_ trustable to come from the host kernel.
> >>>
> >>>Separate System.map's have a tendency to go out of sync (or go missing 
> >>>when a
> >>>devel kernel gets rebuilt, or if a devel package is not installed), and 
> >>>server
> >>>ports (be that a TCP port space server or an UDP port space mount-point) 
> >>>are
> >>>both a configuration hassle and are not guest-transparent.
> >>>
> >>>So for instrumentation infrastructure (such as perf) we have a large and 
> >>>well
> >>>founded preference for intrinsic, built-in, kernel-provided information: 
> >>>i.e.
> >>>a largely 'built-in' and transparent mechanism to get to guest symbols.
> >>The symbol server's client can certainly access the bits through vmchannel.
> >Ok, that would work i suspect.
> >
> >Would be nice to have the symbol server in tools/perf/ and also make it easy
> >to add it to the initrd via a .config switch or so.
> >
> >That would have basically all of the advantages of being built into the 
> >kernel
> >(availability, configurability, transparency, hackability), while having all
> >the advantages of a user-space approach as well (flexibility, extensibility,
> >robustness, ease of maintenance, etc.).
> 
> Note, I am not advocating building the vmchannel client into the host 
> kernel. [...]

Neither am i. What i suggested was a user-space binary/executable built in 
tools/perf and put into the initrd.

That approach has the advantages i listed above, without having the 
disadvantages of in-kernel code you listed.

Thanks,

Ingo


Re: [PATCH][RF C/T/D] Unmapped page cache control - via boot parameter

2010-03-16 Thread Avi Kivity

On 03/16/2010 11:54 AM, Kevin Wolf wrote:



>> Is this with qcow2, raw file, or direct volume access?
>>
>> I can understand it for qcow2, but for direct volume access this
>> shouldn't happen.  The guest schedules as many writes as it can,
>> followed by a sync.  The host (and disk) can then reschedule them
>> whether they are in the writeback cache or in the block layer, and must
>> sync in the same way once completed.
>>
>> Perhaps what we need is bdrv_aio_submit() which can take a number of
>> requests.  For direct volume access, this allows easier reordering
>> (io_submit() should plug the queues before it starts processing and
>> unplug them when done, though I don't see the code for this?).  For
>> qcow2, we can coalesce metadata updates for multiple requests into one
>> RMW (for example, a sequential write split into multiple 64K-256K write
>> requests).
> We already do merge sequential writes back into one larger request. So
> this is in fact a case that wouldn't benefit from such changes.

I'm not happy with that.  It increases overall latency.  With qcow2 it's
fine, but I'd let requests to raw volumes flow unaltered.

> It may
> help for other cases. But even if it did, coalescing metadata writes in
> qcow2 sounds like a good way to mess up, so I'd stay with doing it only
> for the data itself.

I don't see why.

> Apart from that, wouldn't your points apply to writeback as well?

They do, but for writeback the host kernel already does all the
coalescing/merging/blah for us.


--
error compiling committee.c: too many arguments to function



Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Avi Kivity

On 03/16/2010 11:53 AM, Ingo Molnar wrote:

> * Avi Kivity  wrote:
>
>> On 03/16/2010 09:24 AM, Ingo Molnar wrote:
>>> * Avi Kivity   wrote:
>>>
>>>> On 03/16/2010 07:27 AM, Zhang, Yanmin wrote:
>>>>> From: Zhang, Yanmin
>>>>>
>>>>> Based on the discussion in KVM community, I worked out the patch to support
>>>>> perf to collect guest os statistics from host side. This patch is implemented
>>>>> with Ingo, Peter and some other guys' kind help. Yang Sheng pointed out a
>>>>> critical bug and provided good suggestions with other guys. I really appreciate
>>>>> their kind help.
>>>>>
>>>>> The patch adds new subcommand kvm to perf.
>>>>>
>>>>>    perf kvm top
>>>>>    perf kvm record
>>>>>    perf kvm report
>>>>>    perf kvm diff
>>>>>
>>>>> The new perf could profile guest os kernel except guest os user space, but it
>>>>> could summarize guest os user space utilization per guest os.
>>>>>
>>>>> Below are some examples.
>>>>> 1) perf kvm top
>>>>> [r...@lkp-ne01 norm]# perf kvm --host --guest
>>>>> --guestkallsyms=/home/ymzhang/guest/kallsyms
>>>>> --guestmodules=/home/ymzhang/guest/modules top
>>>> Excellent, support for guest kernel != host kernel is critical (I
>>>> can't remember the last time I ran same kernels).
>>>>
>>>> How would we support multiple guests with different kernels? Perhaps a
>>>> symbol server that perf can connect to (and that would connect to guests in
>>>> turn)?
>>> The highest quality solution would be if KVM offered a 'guest extension' to
>>> the guest kernel's /proc/kallsyms that made it easy for user-space to get this
>>> information from an authoritative source.
>>>
>>> That's the main reason why the host side /proc/kallsyms is so popular and so
>>> useful: while in theory it's mostly redundant information which can be gleaned
>>> from the System.map and other sources of symbol information, it's easily
>>> available and is _always_ trustable to come from the host kernel.
>>>
>>> Separate System.map's have a tendency to go out of sync (or go missing when a
>>> devel kernel gets rebuilt, or if a devel package is not installed), and server
>>> ports (be that a TCP port space server or an UDP port space mount-point) are
>>> both a configuration hassle and are not guest-transparent.
>>>
>>> So for instrumentation infrastructure (such as perf) we have a large and well
>>> founded preference for intrinsic, built-in, kernel-provided information: i.e.
>>> a largely 'built-in' and transparent mechanism to get to guest symbols.
>> The symbol server's client can certainly access the bits through vmchannel.
> Ok, that would work i suspect.
>
> Would be nice to have the symbol server in tools/perf/ and also make it easy
> to add it to the initrd via a .config switch or so.
>
> That would have basically all of the advantages of being built into the kernel
> (availability, configurability, transparency, hackability), while having all
> the advantages of a user-space approach as well (flexibility, extensibility,
> robustness, ease of maintenance, etc.).

Note, I am not advocating building the vmchannel client into the host
kernel.  While that makes everything simpler for the user, it increases
the kernel footprint with all the disadvantages that come with that (any
bug is converted into a host DoS or worse).

So, perf would connect to qemu via (say) a well-known unix domain
socket, which would then talk to the guest kernel.

I know you won't like it, we'll continue to disagree on this unfortunately.

--
error compiling committee.c: too many arguments to function
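
The transport Avi describes (perf as a client of a per-VM unix domain
socket exported by qemu) could look roughly like the sketch below. No such
interface existed at the time; the socket path and helper are invented for
illustration.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

static int connect_symbol_server(const char *vm_name)
{
    struct sockaddr_un sa;
    int fd = socket(AF_UNIX, SOCK_STREAM, 0);

    if (fd < 0)
        return -1;
    memset(&sa, 0, sizeof(sa));
    sa.sun_family = AF_UNIX;
    /* hypothetical well-known per-VM path */
    snprintf(sa.sun_path, sizeof(sa.sun_path),
             "/var/run/qemu/%s/symbols.sock", vm_name);
    if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
        close(fd);
        return -1;
    }
    return fd;   /* the caller then speaks the (hypothetical) protocol */
}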



Re: KVM call agenda for Mar 16

2010-03-16 Thread Daniel P. Berrange
On Tue, Mar 16, 2010 at 09:29:44AM +, Daniel P. Berrange wrote:
> On Tue, Mar 16, 2010 at 10:18:03AM +0100, Juan Quintela wrote:
> > Chris Wright  wrote:
> > > Please send in any agenda items you are interested in covering.
> > 
> > Migration:
> > - flexible migration:  I hope to send an RFC patch in time for the
> >   call.  idea is to use subsections.
> > 
> > - callbacks.  block migration introduced several callbacks:
> >   * cancel()
> >   * get_status()
> >   * release()
> >   in spice we now need another two callbacks: on_start() and on_end().
> >* on_start(): tells spice that migration has started (it will then
> >  manage certificates, passwords, ... itself)
> >* on_end(): it is called when migration ends.  spice uses it to
> >  transparently connect to the new host so the user doesn't have to
> > "reconnect"
> > 
> > - what to do on migration error:
> >   - target side:  libvirt folks want the program to print a message if
> > it fails.  Current code spends 100% cpu time doing select on a closed
> > fd.  (patches already on the list to make it wait without using
> > cpu).
> 
> No, that is not correct. We want QEMU to exit when incoming migration
> fails. Printing to stderr is just something that will end up in the
> logs for admin to further diagnose the problem if required. There is 
> nothing to be gained by leaving QEMU running, and everything to loose
> since the failed migration may have left it in a dangerous state from 
> which you do not want to attempt incoming migration again.

Sorry, I forgot to include the original BZ report about this problem from
Fedora. In essence, we just truncated the original save state image and
then tried to restore from it, to check handling in the event of a corrupted
save image.

  https://bugzilla.redhat.com/show_bug.cgi?id=518032


Regards,
Daniel
-- 
|: Red Hat, Engineering, London-o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :|
|: http://autobuild.org-o- http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-   F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|


Re: [PATCH][RF C/T/D] Unmapped page cache control - via boot parameter

2010-03-16 Thread Kevin Wolf
Am 16.03.2010 10:17, schrieb Avi Kivity:
> On 03/15/2010 10:23 PM, Chris Webb wrote:
>> Avi Kivity  writes:
>>
>>
>>> On 03/15/2010 10:07 AM, Balbir Singh wrote:
>>>
>>>> Yes, it is a virtio call away, but is the cost of paying twice in
>>>> terms of memory acceptable?
>>> Usually, it isn't, which is why I recommend cache=off.
>> Hi Avi. One observation about your recommendation for cache=none:
>>
>> We run hosts of VMs accessing drives backed by logical volumes carved out
>> from md RAID1. Each host has 32GB RAM and eight cores, divided between (say)
>> twenty virtual machines, which pretty much fill the available memory on the
>> host. Our qemu-kvm is new enough that IDE and SCSI drives with writeback
>> caching turned on get advertised to the guest as having a write-cache, and
>> FLUSH gets translated to fsync() by qemu. (Consequently cache=writeback
>> isn't acting as cache=neverflush like it would have done a year ago. I know
>> that comparing performance for cache=none against that unsafe behaviour
>> would be somewhat unfair!)
>>
>> Wasteful duplication of page cache between guest and host notwithstanding,
>> turning on cache=writeback is a spectacular performance win for our guests.
>> For example, even IDE with cache=writeback easily beats virtio with
>> cache=none in most of the guest filesystem performance tests I've tried. The
>> anecdotal feedback from clients is also very strongly in favour of
>> cache=writeback.
>>
> 
> Is this with qcow2, raw file, or direct volume access?
> 
> I can understand it for qcow2, but for direct volume access this 
> shouldn't happen.  The guest schedules as many writes as it can, 
> followed by a sync.  The host (and disk) can then reschedule them 
> whether they are in the writeback cache or in the block layer, and must 
> sync in the same way once completed.
> 
> Perhaps what we need is bdrv_aio_submit() which can take a number of 
> requests.  For direct volume access, this allows easier reordering 
> (io_submit() should plug the queues before it starts processing and 
> unplug them when done, though I don't see the code for this?).  For 
> qcow2, we can coalesce metadata updates for multiple requests into one 
> RMW (for example, a sequential write split into multiple 64K-256K write 
> requests).

We already do merge sequential writes back into one larger request. So
this is in fact a case that wouldn't benefit from such changes. It may
help for other cases. But even if it did, coalescing metadata writes in
qcow2 sounds like a good way to mess up, so I'd stay with doing it only
for the data itself.

Apart from that, wouldn't your points apply to writeback as well?

Kevin
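
The bdrv_aio_submit() idea floated in this thread was never more than a
sketch; a purely hypothetical signature might look like the following. All
types here are simplified stand-ins, not the real qemu block-layer
structures.

#include <stdint.h>

typedef struct BlockDriverState BlockDriverState;  /* opaque, as in qemu */

typedef struct BlockRequest {
    int64_t sector;        /* starting sector of this request */
    int     nb_sectors;    /* length in sectors */
    void   *iov;           /* stand-in for the request's QEMUIOVector */
} BlockRequest;

/* Submitting a batch in one call lets raw/direct backends plug the queue,
 * issue everything, then unplug (enabling reordering in the elevator),
 * while qcow2 could coalesce metadata updates for the whole batch. */
int bdrv_aio_submit(BlockDriverState *bs, BlockRequest *reqs, int num_reqs);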


Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Ingo Molnar

* Avi Kivity  wrote:

> On 03/16/2010 09:24 AM, Ingo Molnar wrote:
> >* Avi Kivity  wrote:
> >
> >>On 03/16/2010 07:27 AM, Zhang, Yanmin wrote:
> >>>From: Zhang, Yanmin
> >>>
> >>>Based on the discussion in KVM community, I worked out the patch to support
> >>>perf to collect guest os statistics from host side. This patch is 
> >>>implemented
> >>>with Ingo, Peter and some other guys' kind help. Yang Sheng pointed out a
> >>>critical bug and provided good suggestions with other guys. I really 
> >>>appreciate
> >>>their kind help.
> >>>
> >>>The patch adds new subcommand kvm to perf.
> >>>
> >>>   perf kvm top
> >>>   perf kvm record
> >>>   perf kvm report
> >>>   perf kvm diff
> >>>
> >>>The new perf could profile guest os kernel except guest os user space, but 
> >>>it
> >>>could summarize guest os user space utilization per guest os.
> >>>
> >>>Below are some examples.
> >>>1) perf kvm top
> >>>[r...@lkp-ne01 norm]# perf kvm --host --guest 
> >>>--guestkallsyms=/home/ymzhang/guest/kallsyms
> >>>--guestmodules=/home/ymzhang/guest/modules top
> >>>
> >>Excellent, support for guest kernel != host kernel is critical (I
> >>can't remember the last time I ran same kernels).
> >>
> >>How would we support multiple guests with different kernels? Perhaps a
> >>symbol server that perf can connect to (and that would connect to guests in
> >>turn)?
> >The highest quality solution would be if KVM offered a 'guest extension' to
> >the guest kernel's /proc/kallsyms that made it easy for user-space to get 
> >this
> >information from an authoritative source.
> >
> >That's the main reason why the host side /proc/kallsyms is so popular and so
> >useful: while in theory it's mostly redundant information which can be 
> >gleaned
> >from the System.map and other sources of symbol information, it's easily
> >available and is _always_ trustable to come from the host kernel.
> >
> >Separate System.map's have a tendency to go out of sync (or go missing when a
> >devel kernel gets rebuilt, or if a devel package is not installed), and 
> >server
> >ports (be that a TCP port space server or an UDP port space mount-point) are
> >both a configuration hassle and are not guest-transparent.
> >
> >So for instrumentation infrastructure (such as perf) we have a large and well
> >founded preference for intrinsic, built-in, kernel-provided information: i.e.
> >a largely 'built-in' and transparent mechanism to get to guest symbols.
> 
> The symbol server's client can certainly access the bits through vmchannel.

Ok, that would work i suspect.

Would be nice to have the symbol server in tools/perf/ and also make it easy 
to add it to the initrd via a .config switch or so.

That would have basically all of the advantages of being built into the kernel 
(availability, configurability, transparency, hackability), while having all 
the advantages of a user-space approach as well (flexibility, extensibility, 
robustness, ease of maintenance, etc.).

If only we had tools/xorg/ integrated via the initrd that way ;-)

Thanks,

Ingo


Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side

2010-03-16 Thread Ingo Molnar

* Zhang, Yanmin  wrote:

> On Tue, 2010-03-16 at 15:48 +0800, Zhang, Yanmin wrote:
> > On Tue, 2010-03-16 at 07:41 +0200, Avi Kivity wrote:
> > > On 03/16/2010 07:27 AM, Zhang, Yanmin wrote:
> > > > From: Zhang, Yanmin
> > > >
> > > > Based on the discussion in KVM community, I worked out the patch to 
> > > > support
> > > > perf to collect guest os statistics from host side. This patch is 
> > > > implemented
> > > > with Ingo, Peter and some other guys' kind help. Yang Sheng pointed out 
> > > > a
> > > > critical bug and provided good suggestions with other guys. I really 
> > > > appreciate
> > > > their kind help.
> > > >
> > > > The patch adds new subcommand kvm to perf.
> > > >
> > > >perf kvm top
> > > >perf kvm record
> > > >perf kvm report
> > > >perf kvm diff
> > > >
> > > > The new perf could profile guest os kernel except guest os user space, 
> > > > but it
> > > > could summarize guest os user space utilization per guest os.
> > > >
> > > > Below are some examples.
> > > > 1) perf kvm top
> > > > [r...@lkp-ne01 norm]# perf kvm --host --guest 
> > > > --guestkallsyms=/home/ymzhang/guest/kallsyms
> > > > --guestmodules=/home/ymzhang/guest/modules top
> > > >
> > > >
> > > 
> > Thanks for your kind comments.
> > 
> > > Excellent, support for guest kernel != host kernel is critical (I can't 
> > > remember the last time I ran same kernels).
> > > 
> > > How would we support multiple guests with different kernels?
> > With the patch, 'perf kvm report --sort pid' could show
> > summary statistics for all guest os instances. Then, use
> > parameter --pid of 'perf kvm record' to collect data for a single
> > problematic instance.
> Sorry. I found that currently --pid selects not a whole process but a single thread (the main thread).
> 
> Ingo,
> 
> Is it possible to support a new parameter or extend --inherit, so 'perf 
> record' and 'perf top' could collect data on all threads of a process when 
> the process is running?
> 
> If not, I need to add a new ugly parameter, similar to --pid, to filter 
> out process data in userspace.

Yeah. For maximum utility i'd suggest to extend --pid to include this, and 
introduce --tid for the previous, limited-to-a-single-task functionality.

Most users would expect --pid to work like a 'late attach' - i.e. to work like 
strace -f or like a gdb attach.

Ingo

