[PATCH 2/2] qemu-kvm: Replace direct phys_ram_dirty access with wrapper functions.
Replaces direct phys_ram_dirty access with wrapper functions to prevent direct access to the phys_ram_dirty bitmap. Signed-off-by: Yoshiaki Tamura Signed-off-by: OHMURA Kei --- exec.c | 45 - 1 files changed, 20 insertions(+), 25 deletions(-) diff --git a/exec.c b/exec.c index 9bcb4de..b607212 100644 --- a/exec.c +++ b/exec.c @@ -1944,7 +1944,7 @@ static void tlb_protect_code(ram_addr_t ram_addr) static void tlb_unprotect_code_phys(CPUState *env, ram_addr_t ram_addr, target_ulong vaddr) { -phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] |= CODE_DIRTY_FLAG; +cpu_physical_memory_set_dirty_flags(ram_addr, CODE_DIRTY_FLAG); } static inline void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, @@ -1965,8 +1965,7 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end, { CPUState *env; unsigned long length, start1; -int i, mask, len; -uint8_t *p; +int i; start &= TARGET_PAGE_MASK; end = TARGET_PAGE_ALIGN(end); @@ -1974,11 +1973,7 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end, length = end - start; if (length == 0) return; -len = length >> TARGET_PAGE_BITS; -mask = ~dirty_flags; -p = phys_ram_dirty + (start >> TARGET_PAGE_BITS); -for(i = 0; i < len; i++) -p[i] &= mask; +cpu_physical_memory_mask_dirty_range(start, length, dirty_flags); /* we modify the TLB cache so that the dirty bit will be set again when accessing the range */ @@ -2825,16 +2820,16 @@ static void notdirty_mem_writeb(void *opaque, target_phys_addr_t ram_addr, uint32_t val) { int dirty_flags; -dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; +dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); if (!(dirty_flags & CODE_DIRTY_FLAG)) { #if !defined(CONFIG_USER_ONLY) tb_invalidate_phys_page_fast(ram_addr, 1); -dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; +dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); #endif } stb_p(qemu_get_ram_ptr(ram_addr), val); dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); -phys_ram_dirty[ram_addr >> 
TARGET_PAGE_BITS] = dirty_flags; +cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags); /* we remove the notdirty callback only if the code has been flushed */ if (dirty_flags == 0xff) @@ -2845,16 +2840,16 @@ static void notdirty_mem_writew(void *opaque, target_phys_addr_t ram_addr, uint32_t val) { int dirty_flags; -dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; +dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); if (!(dirty_flags & CODE_DIRTY_FLAG)) { #if !defined(CONFIG_USER_ONLY) tb_invalidate_phys_page_fast(ram_addr, 2); -dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; +dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); #endif } stw_p(qemu_get_ram_ptr(ram_addr), val); dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); -phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags; +cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags); /* we remove the notdirty callback only if the code has been flushed */ if (dirty_flags == 0xff) @@ -2865,16 +2860,16 @@ static void notdirty_mem_writel(void *opaque, target_phys_addr_t ram_addr, uint32_t val) { int dirty_flags; -dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; +dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); if (!(dirty_flags & CODE_DIRTY_FLAG)) { #if !defined(CONFIG_USER_ONLY) tb_invalidate_phys_page_fast(ram_addr, 4); -dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; +dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); #endif } stl_p(qemu_get_ram_ptr(ram_addr), val); dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); -phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags; +cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags); /* we remove the notdirty callback only if the code has been flushed */ if (dirty_flags == 0xff) @@ -3325,8 +3320,8 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf, /* invalidate code */ tb_invalidate_phys_page_range(addr1, addr1 + l, 0); /* set dirty bit */ -phys_ram_dirty[addr1 
>> TARGET_PAGE_BITS] |= -(0xff & ~CODE_DIRTY_FLAG); +cpu_physical_memory_set_dirty_flags( +addr1, (0xff & ~CODE_DIRTY_FLAG)); } /* qemu doesn't execute guest code directly, but kvm does therefore flush instruction caches */ @@ -3539,8
[PATCH 1/2] qemu-kvm: Introduce wrapper functions to access phys_ram_dirty.
Adds wrapper functions to prevent direct access to the phys_ram_dirty bitmap. Signed-off-by: Yoshiaki Tamura Signed-off-by: OHMURA Kei --- cpu-all.h | 25 + 1 files changed, 25 insertions(+), 0 deletions(-) diff --git a/cpu-all.h b/cpu-all.h index 9bc01b9..c279c0a 100644 --- a/cpu-all.h +++ b/cpu-all.h @@ -882,6 +882,11 @@ static inline int cpu_physical_memory_is_dirty(ram_addr_t addr) return phys_ram_dirty[addr >> TARGET_PAGE_BITS] == 0xff; } +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr) +{ +return phys_ram_dirty[addr >> TARGET_PAGE_BITS]; +} + static inline int cpu_physical_memory_get_dirty(ram_addr_t addr, int dirty_flags) { @@ -893,6 +898,26 @@ static inline void cpu_physical_memory_set_dirty(ram_addr_t addr) phys_ram_dirty[addr >> TARGET_PAGE_BITS] = 0xff; } +static inline int cpu_physical_memory_set_dirty_flags(ram_addr_t addr, + int dirty_flags) +{ +return phys_ram_dirty[addr >> TARGET_PAGE_BITS] |= dirty_flags; +} + +static inline void cpu_physical_memory_mask_dirty_range(ram_addr_t start, +int length, +int dirty_flags) +{ +int i, mask, len; +uint8_t *p; + +len = length >> TARGET_PAGE_BITS; +mask = ~dirty_flags; +p = phys_ram_dirty + (start >> TARGET_PAGE_BITS); +for (i = 0; i < len; i++) +p[i] &= mask; +} + void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end, int dirty_flags); void cpu_tlb_update_dirty(CPUState *env); -- 1.7.0.31.g1df487 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/2] qemu-kvm: Introduce wrapper functions to access phys_ram_dirty, and replace existing direct accesses to it.
Before replacing byte-based dirty bitmap with bit-based dirty bitmap, clearing direct accesses to the bitmap first seems to be a good point to start with. This patch set is based on the following discussion. http://www.mail-archive.com/kvm@vger.kernel.org/msg30724.html Thanks, Yoshi -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
About KVM Forum 2010
Hi all Does anybody know the exact important dates, such as the paper deadline, for KVM Forum 2010? I can find this blog(http://www.linux-kvm.com/content/kvm-forum-2010-scheduled-august-9-10-2010) but the blog only says the date of the conference. Regards, Kazushi Takahashi -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: cleanup kvm_get_dirty_log()
Takuya Yoshikawa wrote: > Xiao Guangrong wrote: >> Using bitmap_empty() to see whether memslot->dirty_bitmap is empty >> > > You can do this for arch specific get_dirty_log() too. OK, i'll do it in the next version > >> Signed-off-by: Xiao Guangrong >> --- >> virt/kvm/kvm_main.c |6 ++ >> 1 files changed, 2 insertions(+), 4 deletions(-) >> >> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c >> index bcd08b8..497ae14 100644 >> --- a/virt/kvm/kvm_main.c >> +++ b/virt/kvm/kvm_main.c >> @@ -767,8 +767,7 @@ int kvm_get_dirty_log(struct kvm *kvm, >> struct kvm_dirty_log *log, int *is_dirty) >> { >> struct kvm_memory_slot *memslot; >> -int r, i; >> -int n; >> +int r, n; >> unsigned long any = 0; > > any is no longer need to be unsigned long, if you do this? Yeah, right, thanks for you point out. Xiao -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: cleanup kvm_get_dirty_log()
Xiao Guangrong wrote: Using bitmap_empty() to see whether memslot->dirty_bitmap is empty You can do this for arch specific get_dirty_log() too. Signed-off-by: Xiao Guangrong --- virt/kvm/kvm_main.c |6 ++ 1 files changed, 2 insertions(+), 4 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index bcd08b8..497ae14 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -767,8 +767,7 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, int *is_dirty) { struct kvm_memory_slot *memslot; - int r, i; - int n; + int r, n; unsigned long any = 0; any is no longer need to be unsigned long, if you do this? r = -EINVAL; @@ -782,8 +781,7 @@ int kvm_get_dirty_log(struct kvm *kvm, n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - for (i = 0; !any && i < n/sizeof(long); ++i) - any = memslot->dirty_bitmap[i]; + any = !bitmap_empty(memslot->dirty_bitmap, memslot->npages); r = -EFAULT; if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.
On 03/16/2010 10:10 PM, Blue Swirl wrote: Yes, and is what tlb_protect_code() does and it's called from tb_alloc_page() which is what's code when a TB is created. Just a tangential note: a long time ago, I tried to disable self modifying code detection for Sparc. On most RISC architectures, SMC needs explicit flushing so in theory we need not track code memory writes. However, during exceptions the translator needs to access the original unmodified code that was used to generate the TB. But maybe there are other ways to avoid SMC tracking, on x86 it's still needed On x86 you're supposed to execute a serializing instruction (one of INVD, INVEPT, INVLPG, INVVPID, LGDT, LIDT, LLDT, LTR, MOV (to control register, with the exception of MOV CR8), MOV (to debug register), WBINVD, WRMSR, CPUID, IRET, and RSM) before running modified code. but I suppose SMC is pretty rare. Every time you demand load a code page from disk, you're running self modifying code (though it usually doesn't exist in the tlb, so there's no previous version that can cause trouble). -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/5] Fix some mmu/emulator atomicity issues (v2)
On 03/16/2010 09:33 PM, Marcelo Tosatti wrote: How relevant is this for -stable? Races don't sound good to me :) The race mentioned above is not existant on -stable since prefetch is disabled for invlpg. The atomic fixes seem like a candidate, since lack of them can trigger pagetable corruption. Avi? I would this to get some use on mainline before queuing the first few patches. They aren't trivial, and I don't know of any failures resulting from the races. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
On 03/17/2010 02:41 AM, Frank Ch. Eigler wrote: Hi - On Tue, Mar 16, 2010 at 06:04:10PM -0500, Anthony Liguori wrote: [...] The only way to really address this is to change the interaction. Instead of running perf externally to qemu, we should support a perf command in the qemu monitor that can then tie directly to the perf tooling. That gives us the best possible user experience. To what extent could this be solved with less crossing of isolation/abstraction layers, if the perfctr facilities were properly virtualized? That's the more interesting (by far) usage model. In general guest owners don't have access to the host, and host owners can't (and shouldn't) change guests. Monitoring guests from the host is useful for kvm developers, but less so for users. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM: cleanup kvm_get_dirty_log()
Using bitmap_empty() to see whether memslot->dirty_bitmap is empty Signed-off-by: Xiao Guangrong --- virt/kvm/kvm_main.c |6 ++ 1 files changed, 2 insertions(+), 4 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index bcd08b8..497ae14 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -767,8 +767,7 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, int *is_dirty) { struct kvm_memory_slot *memslot; - int r, i; - int n; + int r, n; unsigned long any = 0; r = -EINVAL; @@ -782,8 +781,7 @@ int kvm_get_dirty_log(struct kvm *kvm, n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - for (i = 0; !any && i < n/sizeof(long); ++i) - any = memslot->dirty_bitmap[i]; + any = !bitmap_empty(memslot->dirty_bitmap, memslot->npages); r = -EFAULT; if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) -- 1.6.1.2 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] KVM MMU: check reserved bits only when CR4.PSE=1 or CR4.PAE=1
- The RSV bit is possibility set in error code when #PF occurred only if CR4.PSE=1 or CR4.PAE=1 - context->rsvd_bits_mask[1][0] is always 0 Changlog: Move this operation to reset_rsvds_bits_mask() address Avi Kivity's suggestion Signed-off-by: Xiao Guangrong --- arch/x86/kvm/mmu.c | 12 +--- 1 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b137515..c49f8ec 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2288,18 +2288,26 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) if (!is_nx(vcpu)) exb_bit_rsvd = rsvd_bits(63, 63); + + context->rsvd_bits_mask[1][0] = 0; switch (level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ context->rsvd_bits_mask[0][1] = 0; context->rsvd_bits_mask[0][0] = 0; + + /* check rsvd bits only when CR4.PSE=1 or CR4.PAE=1 */ + if (!is_pse(vcpu)) { + context->rsvd_bits_mask[1][1] = 0; + break; + } + if (is_cpuid_PSE36()) /* 36bits PSE 4MB page */ context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); else /* 32 bits PSE 4MB page */ context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; break; case PT32E_ROOT_LEVEL: context->rsvd_bits_mask[0][2] = @@ -2312,7 +2320,6 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; break; case PT64_ROOT_LEVEL: context->rsvd_bits_mask[0][3] = exb_bit_rsvd | @@ -2330,7 +2337,6 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; break; } } -- 1.6.1.2 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to 
majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
On Tue, 2010-03-16 at 11:32 +0200, Avi Kivity wrote: > On 03/16/2010 09:48 AM, Zhang, Yanmin wrote: > > > >> Excellent, support for guest kernel != host kernel is critical (I can't > >> remember the last time I ran same kernels). > >> > >> How would we support multiple guests with different kernels? > >> > > With the patch, 'perf kvm report --sort pid" could show > > summary statistics for all guest os instances. Then, use > > parameter --pid of 'perf kvm record' to collect single problematic instance > > data. > > > > That certainly works, though automatic association of guest data with > guest symbols is friendlier. Thanks. Originally, I planed to add a -G parameter to perf. Such like -G :/XXX/XXX/guestkallsyms:/XXX/XXX/modules,8889:/XXX/XXX/guestkallsyms:/XXX/XXX/modules and 8889 are just qemu guest pid. So we could define multiple guest os symbol files. But it seems ugly, and 'perf kvm report --sort pid" and 'perf kvm top --pid' could provide similar functionality. > > >>> diff -Nraup linux-2.6_tipmaster0315/arch/x86/kvm/vmx.c > >>> linux-2.6_tipmaster0315_perfkvm/arch/x86/kvm/vmx.c > >>> --- linux-2.6_tipmaster0315/arch/x86/kvm/vmx.c2010-03-16 > >>> 08:59:11.825295404 +0800 > >>> +++ linux-2.6_tipmaster0315_perfkvm/arch/x86/kvm/vmx.c2010-03-16 > >>> 09:01:09.976084492 +0800 > >>> @@ -26,6 +26,7 @@ > >>>#include > >>>#include > >>>#include > >>> +#include > >>>#include "kvm_cache_regs.h" > >>>#include "x86.h" > >>> > >>> @@ -3632,6 +3633,43 @@ static void update_cr8_intercept(struct > >>> vmcs_write32(TPR_THRESHOLD, irr); > >>>} > >>> > >>> +DEFINE_PER_CPU(int, kvm_in_guest) = {0}; > >>> + > >>> +static void kvm_set_in_guest(void) > >>> +{ > >>> + percpu_write(kvm_in_guest, 1); > >>> +} > >>> + > >>> +static int kvm_is_in_guest(void) > >>> +{ > >>> + return percpu_read(kvm_in_guest); > >>> +} > >>> > >>> > >> > > > >> There is already PF_VCPU for this. 
> >> > > Right, but there is a scope between kvm_guest_enter and really running > > in guest os, where a perf event might overflow. Anyway, the scope is very > > narrow, I will change it to use flag PF_VCPU. > > > > There is also a window between setting the flag and calling 'int $2' > where an NMI might happen and be accounted incorrectly. > > Perhaps separate the 'int $2' into a direct call into perf and another > call for the rest of NMI handling. I don't see how it would work on svm > though - AFAICT the NMI is held whereas vmx swallows it. > I guess NMIs > will be disabled until the next IRET so it isn't racy, just tricky. I'm not sure if vmexit does break NMI context or not. Hardware NMI context isn't reentrant till a IRET. YangSheng would like to double check it. > > >>> +static struct perf_guest_info_callbacks kvm_guest_cbs = { > >>> + .is_in_guest= kvm_is_in_guest, > >>> + .is_user_mode = kvm_is_user_mode, > >>> + .get_guest_ip = kvm_get_guest_ip, > >>> + .reset_in_guest = kvm_reset_in_guest, > >>> +}; > >>> > >>> > >> Should be in common code, not vmx specific. > >> > > Right. I discussed with Yangsheng. I will move above data structures and > > callbacks to file arch/x86/kvm/x86.c, and add get_ip, a new callback to > > kvm_x86_ops. > > > > You will need access to the vcpu pointer (kvm_rip_read() needs it), you > can put it in a percpu variable. We do so now in a new patch. > I guess if it's not null, you know > you're in a guest, so no need for PF_VCPU. Good suggestion. Thanks. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: host panic based on kernel 2.6.34-RC1
David Miller wrote: > From: "Hao, Xudong" > Date: Wed, 17 Mar 2010 10:14:50 +0800 > >> I installed a latest kvm based on kernel 2.6.34-rc1, after I load >> kvm kvm_intel module, and start /etc/init.d/kvm, a few minutes >> later, the system will panic. The panic is easy to reproduce when >> I use tcpdump in host. However, if I stop /etc/init.d/kvm, >> everything is OK, host works fine. Does anyone meet similar issue? >> any hint? > > This is fixed in Linus's GIT tree already. Thanks David. Thanks, Xudong-- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: host panic based on kernel 2.6.34-RC1
From: "Hao, Xudong" Date: Wed, 17 Mar 2010 10:14:50 +0800 > I installed a latest kvm based on kernel 2.6.34-rc1, after I load kvm kvm_i= > ntel module, and start /etc/init.d/kvm, a few minutes later, the system wil= > l panic. The panic is easy to reproduce when I use tcpdump in host. > However, if I stop /etc/init.d/kvm, everything is OK, host works fine. Does= > anyone met similar issue? any hint? This is fixed in Linus's GIT tree already. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
Hi - On Tue, Mar 16, 2010 at 06:04:10PM -0500, Anthony Liguori wrote: > [...] > The only way to really address this is to change the interaction. > Instead of running perf externally to qemu, we should support a perf > command in the qemu monitor that can then tie directly to the perf > tooling. That gives us the best possible user experience. To what extent could this be solved with less crossing of isolation/abstraction layers, if the perfctr facilities were properly virtualized? That way guests could run perf goo internally. Optionally virt tools on the host side could aggregate data from cooperating self-monitoring guests. - FChE -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.
> Where does the translator need access to this original code? I was > just thinking about this problem today, wondering how much overhead > there is with this SMC page protection thing. When an MMU fault occurs qemu re-translates the TB with additional annotations to determine which guest instruction caused the fault. See translate-all.c:cpu_restore_state(). Paul -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
On 03/16/2010 12:39 PM, Ingo Molnar wrote: If we look at the use-case, it's going to be something like, a user is creating virtual machines and wants to get performance information about them. Having to run a separate tool like perf is not going to be what they would expect they had to do. Instead, they would either use their existing GUI tool (like virt-manager) or they would use their management interface (either QMP or libvirt). The complexity of interaction is due to the fact that perf shouldn't be a stand alone tool. It should be a library or something with a programmatic interface that another tool can make use of. But ... a GUI interface/integration is of course possible too, and it's being worked on. perf is mainly a kernel developer tool, and kernel developers generally dont use GUIs to do their stuff: which is the (sole) reason why its first ~850 commits of tools/perf/ were done without a GUI. We go where our developers are. In any case it's not an excuse to have no proper command-line tooling. In fact if you cannot get simpler, more atomic command-line tooling right then you'll probably doubly suck at doing a GUI as well. It's about who owns the user interface. If qemu owns the user interface, than we can satisfy this in a very simple way by adding a perf monitor command. If we have to support third party tools, then it significantly complicates things. Regards, Anthony Liguori Ingo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
On 03/16/2010 01:28 PM, Ingo Molnar wrote: * Anthony Liguori wrote: On 03/16/2010 12:52 PM, Ingo Molnar wrote: * Anthony Liguori wrote: On 03/16/2010 10:52 AM, Ingo Molnar wrote: You are quite mistaken: KVM isnt really a 'random unprivileged application' in this context, it is clearly an extension of system/kernel services. ( Which can be seen from the simple fact that what started the discussion was 'how do we get /proc/kallsyms from the guest'. I.e. an extension of the existing host-space /proc/kallsyms was desired. ) Random tools (like perf) should not be able to do what you describe. It's a security nightmare. A security nightmare exactly how? Mind to go into details as i dont understand your point. Assume you're using SELinux to implement mandatory access control. How do you label this file system? Generally speaking, we don't know the difference between /proc/kallsyms vs. /dev/mem if we do generic passthrough. While it might be safe to have a relaxed label of kallsyms (since it's read only), it's clearly not safe to do that for /dev/mem, /etc/shadow, or any file containing sensitive information. What's your _point_? Please outline a threat model, a vector of attack, _anything_ that substantiates your "it's a security nightmare" claim. You suggested "to have a (read only) mount of all guest filesystems". As I described earlier, not all of the information within the guest filesystem has the same level of sensitivity. If you exposed a generic interface like this, it makes it very difficult to delegate privileges. Delegating privileges is important because from in a higher security environment, you may want to prevent a management tool from accessing the VM's disk directly, but still allow it to do basic operations (in particular, to view performance statistics). Rather, we ought to expose a higher level interface that we have more confidence in with respect to understanding the ramifications of exposing that guest data. 
Exactly, we want something that has a flexible namespace and works well with Linux tools in general. Preferably that namespace should be human readable, and it should be hierarchic, and it should have a well-known permission model. This concept exists in Linux and is generally called a 'filesystem'. If you want to use a synthetic filesystem as the management interface for qemu, that's one thing. But you suggested exposing the guest filesystem in its entirely and that's what I disagreed with. If a user cannot read the image file then the user has no access to its contents via other namespaces either. That is, of course, a basic security aspect. ( That is perfectly true with a non-SELinux Unix permission model as well, and is true in the SELinux case as well. ) I don't think that's reasonable at all. The guest may encrypt it's disk image. It still ought to be possible to run perf against that guest, no? Erm. Please explain to me, what exactly is 'not that simple' in a MAC environment? Also, i'd like to note that the 'restrictive SELinux setups' usecases are pretty secondary. To demonstrate that, i'd like every KVM developer on this list who reads this mail and who has their home development system where they produce their patches set up in a restrictive MAC environment, in that you cannot even read the images you are using, to chime in with a "I'm doing that" reply. My home system doesn't run SELinux but I work daily with systems that are using SELinux. I want to be able to run tools like perf on these systems because ultimately, I need to debug these systems on a daily basis. But that's missing the point. We want to have an interface that works for both cases so that we're not maintaining two separate interfaces. We've rat holed a bit though. You want: 1) to run perf kvm list and be able to enumerate KVM guests 2) for this to Just Work with qemu guests launched from the command line You could achieve (1) by tying perf to libvirt but that won't work for (2). 
There are a few practical problems with (2). qemu does not require the user to associate any uniquely identifying information with a VM. We've also optimized the command line use case so that if all you want to do is run a disk image, you just execute "qemu foo.img". To satisfy your use case, we would either have to force a use to always specify unique information, which would be less convenient for our users or we would have to let the name be an optional parameter. As it turns out, we already support "qemu -name Fedora foo.img". What we don't do today, but I've been suggesting we should, is automatically create a QMP management socket in a well known location based on the -name parameter when it's specified. That would let a tool like perf Just Work provided that a user specified -name. No one uses -name today though and I'm sure you don't either. Th
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
Joerg Roedel wrote: > On Tue, Mar 16, 2010 at 12:25:00PM +0100, Ingo Molnar wrote: >> Hm, that sounds rather messy if we want to use it to basically expose kernel >> functionality in a guest/host unified way. Is the qemu process discoverable >> in >> some secure way? Can we trust it? Is there some proper tooling available to >> do >> it, or do we have to push it through 2-3 packages to get such a useful >> feature >> done? > > Since we want to implement a pmu usable for the guest anyway why we > don't just use a guest's perf to get all information we want? If we get a > pmu-nmi from the guest we just re-inject it to the guest and perf in the > guest gives us all information we want including kernel and userspace > symbols, stack traces, and so on. I guess this aims to get information from old environments running on kvm for life extension :) > In the previous thread we discussed about a direct trace channel between > guest and host kernel (which can be used for ftrace events for example). > This channel could be used to transport this information to the host > kernel. Interesting! I know the people who are trying to do that with systemtap. See, http://vesper.sourceforge.net/ > > The only additional feature needed is a way for the host to start a perf > instance in the guest. # ssh localguest perf record --host-chanel ... ? B-) Thank you, > > Opinions? > > > Joerg > > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ -- Masami Hiramatsu e-mail: mhira...@redhat.com -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.
Anthony Liguori wrote: On 03/16/2010 07:45 AM, Avi Kivity wrote: On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote: Modifies wrapper functions for byte-based phys_ram_dirty bitmap to bit-based phys_ram_dirty bitmap, and adds more wrapper functions to prevent direct access to the phys_ram_dirty bitmap. + +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr) +{ + unsigned long mask; + int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS; + int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1); + int ret = 0; + + mask = 1UL<< offset; + if (phys_ram_vga_dirty[index]& mask) + ret |= VGA_DIRTY_FLAG; + if (phys_ram_code_dirty[index]& mask) + ret |= CODE_DIRTY_FLAG; + if (phys_ram_migration_dirty[index]& mask) + ret |= MIGRATION_DIRTY_FLAG; + + return ret; } static inline int cpu_physical_memory_get_dirty(ram_addr_t addr, int dirty_flags) { - return phys_ram_dirty[addr>> TARGET_PAGE_BITS]& dirty_flags; + return cpu_physical_memory_get_dirty_flags(addr)& dirty_flags; } This turns one cacheline access into three. If the dirty bitmaps were in an array, you could do return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS + BITS_IN_LONG)] & mask; with one cacheline access. As far as I can tell, we only ever call with a single flag so your suggestion makes sense. I'd suggest introducing these functions before splitting the bitmap up. It makes review a bit easier. Thanks for your advise. I'll post the wrapper functions for existing byte-based phys_ram_dirty first. Yoshi -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.
On 03/16/2010 01:10 PM, Blue Swirl wrote: > Just a tangential note: a long time ago, I tried to disable self > modifying code detection for Sparc. On most RISC architectures, SMC > needs explicit flushing so in theory we need not track code memory > writes. However, during exceptions the translator needs to access the > original unmodified code that was used to generate the TB. But maybe > there are other ways to avoid SMC tracking, on x86 it's still needed > but I suppose SMC is pretty rare. True SMC is fairly rare, but the SMC checker triggers fairly often on the PLT update during dynamic linking. Nearly all cpus (x86 being the only exception I recall) needed to re-design their PLT format to avoid this code update in order to support SELinux. Where does the translator need access to this original code? I was just thinking about this problem today, wondering how much overhead there is with this SMC page protection thing. r~ -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
On Tue, Mar 16, 2010 at 12:25:00PM +0100, Ingo Molnar wrote: > Hm, that sounds rather messy if we want to use it to basically expose kernel > functionality in a guest/host unified way. Is the qemu process discoverable > in > some secure way? Can we trust it? Is there some proper tooling available to > do > it, or do we have to push it through 2-3 packages to get such a useful > feature > done? Since we want to implement a pmu usable for the guest anyway why we don't just use a guests perf to get all information we want? If we get a pmu-nmi from the guest we just re-inject it to the guest and perf in the guest gives us all information we want including kernel and userspace symbols, stack traces, and so on. In the previous thread we discussed about a direct trace channel between guest and host kernel (which can be used for ftrace events for example). This channel could be used to transport this information to the host kernel. The only additional feature needed is a way for the host to start a perf instance in the guest. Opinions? Joerg -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.
On 3/16/10, Anthony Liguori wrote: > On 03/16/2010 08:57 AM, Avi Kivity wrote: > > > On 03/16/2010 03:51 PM, Anthony Liguori wrote: > > > > > On 03/16/2010 08:29 AM, Avi Kivity wrote: > > > > > > > On 03/16/2010 03:17 PM, Yoshiaki Tamura wrote: > > > > > > > > > Avi Kivity wrote: > > > > > > > > > > > On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote: > > > > > > > > > > > > > Modifies wrapper functions for byte-based phys_ram_dirty bitmap > to > > > > > > > bit-based phys_ram_dirty bitmap, and adds more wrapper functions > to > > > > > > > prevent > > > > > > > direct access to the phys_ram_dirty bitmap. > > > > > > > > > > > > > > > > > > > > > > > > > > + > > > > > > > +static inline int > cpu_physical_memory_get_dirty_flags(ram_addr_t addr) > > > > > > > +{ > > > > > > > + unsigned long mask; > > > > > > > + int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS; > > > > > > > + int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1); > > > > > > > + int ret = 0; > > > > > > > + > > > > > > > + mask = 1UL<< offset; > > > > > > > + if (phys_ram_vga_dirty[index]& mask) > > > > > > > + ret |= VGA_DIRTY_FLAG; > > > > > > > + if (phys_ram_code_dirty[index]& mask) > > > > > > > + ret |= CODE_DIRTY_FLAG; > > > > > > > + if (phys_ram_migration_dirty[index]& mask) > > > > > > > + ret |= MIGRATION_DIRTY_FLAG; > > > > > > > + > > > > > > > + return ret; > > > > > > > } > > > > > > > > > > > > > > static inline int > cpu_physical_memory_get_dirty(ram_addr_t addr, > > > > > > > int dirty_flags) > > > > > > > { > > > > > > > - return phys_ram_dirty[addr>> TARGET_PAGE_BITS]& dirty_flags; > > > > > > > + return > cpu_physical_memory_get_dirty_flags(addr)& dirty_flags; > > > > > > > } > > > > > > > > > > > > > > > > > > > This turns one cacheline access into three. 
If the dirty bitmaps > were in > > > > > > an array, you could do > > > > > > > > > > > > return dirty_bitmaps[dirty_index][addr >> > (TARGET_PAGE_BITS + > > > > > > BITS_IN_LONG)] & mask; > > > > > > > > > > > > with one cacheline access. > > > > > > > > > > > > > > > > If I'm understanding the existing code correctly, > > > > > int dirty_flags can be combined, like VGA + MIGRATION. > > > > > If we only have to worry about a single dirty flag, I agree with > your idea. > > > > > > > > > > > > > From a quick grep it seems flags are not combined, except for > something strange with CODE_DIRTY_FLAG: > > > > > > > > > > > > > static void notdirty_mem_writel(void *opaque, > target_phys_addr_t ram_addr, > > > > >uint32_t val) > > > > > { > > > > >int dirty_flags; > > > > >dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; > > > > >if (!(dirty_flags & CODE_DIRTY_FLAG)) { > > > > > #if !defined(CONFIG_USER_ONLY) > > > > >tb_invalidate_phys_page_fast(ram_addr, 4); > > > > >dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; > > > > > #endif > > > > >} > > > > >stl_p(qemu_get_ram_ptr(ram_addr), val); > > > > >dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); > > > > >phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags; > > > > >/* we remove the notdirty callback only if the code has been > > > > > flushed */ > > > > >if (dirty_flags == 0xff) > > > > >tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr); > > > > > } > > > > > > > > > > > > > I can't say I understand what it does. > > > > > > > > > > The semantics of CODE_DIRTY_FLAG are a little counter intuitive. > CODE_DIRTY_FLAG means that we know that something isn't code so writes do > not need checking for self modifying code. > > > > > > > So the hardware equivalent is, when the Instruction TLB loads a page > address, clear CODE_DIRTY_FLAG? > > > > Yes, and is what tlb_protect_code() does and it's called from > tb_alloc_page() which is what's code when a TB is created. 
Just a tangential note: a long time ago, I tried to disable self modifying code detection for Sparc. On most RISC architectures, SMC needs explicit flushing so in theory we need not track code memory writes. However, during exceptions the translator needs to access the original unmodified code that was used to generate the TB. But maybe there are other ways to avoid SMC tracking, on x86 it's still needed but I suppose SMC is pretty rare. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[ kvm-Bugs-2971166 ] usb passthrough fails with RDP
Bugs item #2971166, was opened at 2010-03-16 05:32 Message generated for change (Comment added) made by jimatjtan You can respond by visiting: https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2971166&group_id=180599 Please note that this message will contain a full copy of the comment thread, including the initial issue submission, for this request, not just the latest update. Category: libkvm Group: v1.0 (example) Status: Open Resolution: None Priority: 5 Private: No Submitted By: Peter Brooks (theneb) Assigned to: Nobody/Anonymous (nobody) Summary: usb passthrough fails with RDP Initial Comment: Base OS: Debian Etch 2.6.26-2-amd64 Processor: Intel(R) Xeon(TM) CPU 3.00GHz QEMU 0.9.1 Virt-manager 0.6.0 kvm 72+dfsg-5 Guest OS: Windows Server 2008 standard I've enabled USB pass through using virsh and editing the xml file for my vhost to add the usb device attached. When using the guest OS from virt-manager, the USB device works perfectly. When using the guest OS from RDP then the USB device fails to work. The device when failing to work it does still appear in the device manager. The device itself is a Safenet Sentinel dongle, sadly used for software copy protection. It is used with a Galaxy RS security system, the software fails to start without the dongle functioning perfectly. -- Comment By: Jim Paris (jimatjtan) Date: 2010-03-16 15:39 Message: Many Windows security systems specifically detect when a remote desktop session is attached, and refuse to allow access. Indeed, according to the Safenet Sentinel End User Guide at http://www2.safenet-inc.com/support/files/SafeNet_Sentinel_EndUser_Guide.pdf: "Please Note that if you attempt to run a Sentinel Key protected application in standalone mode via a remote client (Terminal Server, VNC, WinXP remote client...), the software protected with Sentinel keys will not allow this for security reasons if application is protected in Standalone mode. 
You will either need to run the software while directly logged into the machine, or need to get in touch with your software vendor for software protected in Network mode." So this is definitely not a libvirt or qemu or kvm problem. You'll have to bring this up with your DRM provider. -- You can respond by visiting: https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2971166&group_id=180599 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/5] Fix some mmu/emulator atomicity issues (v2)
On Tue, Mar 16, 2010 at 07:22:55PM +0100, Alexander Graf wrote: > > On 16.03.2010, at 17:36, Marcelo Tosatti wrote: > > > On Mon, Mar 15, 2010 at 01:59:52PM +0200, Avi Kivity wrote: > >> Currently when we emulate a locked operation into a shadowed guest page > >> table, we perform a write rather than a true atomic. This is indicated > >> by the "emulating exchange as write" message that shows up in dmesg. > >> > >> In addition, the pte prefetch operation during invlpg suffered from a > >> race. This was fixed by removing the operation. > >> > >> This patchset fixes both issues and reinstates pte prefetch on invlpg. > >> > >> v3: > >> - rebase against next branch (resolves conflicts via hypercall patch) > >> > >> v2: > >> - fix truncated description for patch 1 > >> - add new patch 4, which fixes a bug in patch 5 > > > > Applied, thanks. > > How relevant is this for -stable? Races don't sound good to me :) The race mentioned above is not existant on -stable since prefetch is disabled for invlpg. The atomic fixes seem like a candidate, since lack of them can trigger pagetable corruption. Avi? -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
* Anthony Liguori wrote: > On 03/16/2010 12:52 PM, Ingo Molnar wrote: > >* Anthony Liguori wrote: > > > >>On 03/16/2010 10:52 AM, Ingo Molnar wrote: > >>>You are quite mistaken: KVM isnt really a 'random unprivileged > >>>application' in > >>>this context, it is clearly an extension of system/kernel services. > >>> > >>>( Which can be seen from the simple fact that what started the discussion > >>>was > >>> 'how do we get /proc/kallsyms from the guest'. I.e. an extension of the > >>> existing host-space /proc/kallsyms was desired. ) > >>Random tools (like perf) should not be able to do what you describe. It's a > >>security nightmare. > >A security nightmare exactly how? Mind to go into details as i dont > >understand > >your point. > > Assume you're using SELinux to implement mandatory access control. > How do you label this file system? > > Generally speaking, we don't know the difference between /proc/kallsyms vs. > /dev/mem if we do generic passthrough. While it might be safe to have a > relaxed label of kallsyms (since it's read only), it's clearly not safe to > do that for /dev/mem, /etc/shadow, or any file containing sensitive > information. What's your _point_? Please outline a threat model, a vector of attack, _anything_ that substantiates your "it's a security nightmare" claim. > Rather, we ought to expose a higher level interface that we have more > confidence in with respect to understanding the ramifications of exposing > that guest data. Exactly, we want something that has a flexible namespace and works well with Linux tools in general. Preferably that namespace should be human readable, and it should be hierarchic, and it should have a well-known permission model. This concept exists in Linux and is generally called a 'filesystem'. > >> No way. The guest has sensitive data and exposing it widely on the host > >> is a bad thing to do. [...] > > > > Firstly, you are putting words into my mouth, as i said nothing about > > 'exposing it widely'. 
I suggest exposing it under the privileges of > > whoever has access to the guest image. > > That doesn't work as nicely with SELinux. > > It's completely reasonable to have a user that can interact in a read only > mode with a VM via libvirt but cannot read the guest's disk images or the > guest's memory contents. If a user cannot read the image file then the user has no access to its contents via other namespaces either. That is, of course, a basic security aspect. ( That is perfectly true with a non-SELinux Unix permission model as well, and is true in the SELinux case as well. ) > > Secondly, regarding confidentiality, and this is guest security 101: whoever > > can access the image on the host _already_ has access to all the guest data! > > > > A Linux image can generally be loopback mounted straight away: > > > > losetup -o 32256 /dev/loop0 ./guest-image.img > > mount -o ro /dev/loop0 /mnt-guest > > > >(Or, if you are an unprivileged user who cannot mount, it can be read via > >ext2 > >tools.) > > > > There's nothing the guest can do about that. The host is in total control of > > guest image data for heaven's sake! > > It's not that simple in a MAC environment. Erm. Please explain to me, what exactly is 'not that simple' in a MAC environment? Also, i'd like to note that the 'restrictive SELinux setups' usecases are pretty secondary. To demonstrate that, i'd like every KVM developer on this list who reads this mail and who has their home development system where they produce their patches set up in a restrictive MAC environment, in that you cannot even read the images you are using, to chime in with a "I'm doing that" reply. If there's just a _single_ KVM developer amongst dozens and dozens of developers on this list who develops in an environment like that i'd be surprised. That result should pretty much tell you where the weight of instrumentation focus should lie - and it isnt on restrictive MAC environments ... 
Ingo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/5] Fix some mmu/emulator atomicity issues (v2)
On 16.03.2010, at 17:36, Marcelo Tosatti wrote: > On Mon, Mar 15, 2010 at 01:59:52PM +0200, Avi Kivity wrote: >> Currently when we emulate a locked operation into a shadowed guest page >> table, we perform a write rather than a true atomic. This is indicated >> by the "emulating exchange as write" message that shows up in dmesg. >> >> In addition, the pte prefetch operation during invlpg suffered from a >> race. This was fixed by removing the operation. >> >> This patchset fixes both issues and reinstates pte prefetch on invlpg. >> >> v3: >> - rebase against next branch (resolves conflicts via hypercall patch) >> >> v2: >> - fix truncated description for patch 1 >> - add new patch 4, which fixes a bug in patch 5 > > Applied, thanks. How relevant is this for -stable? Races don't sound good to me :) Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Corrupt qcow2 image, recovery?
Hi! I'm running kvm / qemu-kvm on a couple of production servers everything (or at least most things) works as it should. However today someone thought it was a "good" idea to restart one of the servers and after that the windows 2k3 guest on that server don't boot anymore. kvm on this server is a bit "outdated": "QEMU PC emulator version 0.9.1 (kvm-83)" (I guess this is one of the qcow2 corruption bugs, and i can only blame myself for not upgrading kvm sooner.) The guest.qcow2 is a 21GiB file for a 60GiB disk i have tried a couple of things kvm-img convert -f qcow2 -O raw guest.qcow2 guest.raw this stops and does nothing after creating a guest.raw that is 60GiB but only using 60MiB so mounted the fs from another server running: "QEMU PC emulator version 0.12.1 (qemu-kvm-0.12.1.2)" and run qemu-img with the same options as above and after a few secs got "qemu-img: error while reading" and the same 60MiB used by guest.raw i also tried booting qemu-kvm with a linux guest and this qcow2 image but only get I/O Errors (and no partitions found) # qemu-img check guest.qcow2 ERROR: invalid cluster offset=0x10a000 ERROR OFLAG_COPIED: l2_offset=ee73 refcount=1 ERROR l2_offset=ee73: Table is not cluster aligned; L1 entry corrupted ERROR: invalid cluster offset=0x11d44100080 ERROR: invalid cluster offset=0x11d61600080 ERROR: invalid cluster offset=0x11d68600080 ERROR: invalid cluster offset=0x11d95300080 (and a loot more in this style, full log can be provided if it would be of help to anybody) is there any possibility to repair this file, or convert it to a RAW file (even with parts padded that are not "safe" from the qcow2 image), or as a last resort, are there any debug tools for qcow2 images that might be of use? 
I have read up on the qcow fileformat but right now i'm a bit short of time, i need the data in this guests disk image, or at least the MS SQL datafiles that are on this disk) i have also checked the qcow2 file and it does contain a NTLDR string and a lot of other NTFS recognized strings so i know that all data is not gone. the question is how can i access it as a Filesystem again? Any help would be appreciated! Regards Christian Nilsson -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
On 03/16/2010 12:52 PM, Ingo Molnar wrote: * Anthony Liguori wrote: On 03/16/2010 10:52 AM, Ingo Molnar wrote: You are quite mistaken: KVM isnt really a 'random unprivileged application' in this context, it is clearly an extension of system/kernel services. ( Which can be seen from the simple fact that what started the discussion was 'how do we get /proc/kallsyms from the guest'. I.e. an extension of the existing host-space /proc/kallsyms was desired. ) Random tools (like perf) should not be able to do what you describe. It's a security nightmare. A security nightmare exactly how? Mind to go into details as i dont understand your point. Assume you're using SELinux to implement mandatory access control. How do you label this file system? Generally speaking, we don't know the difference between /proc/kallsyms vs. /dev/mem if we do generic passthrough. While it might be safe to have a relaxed label of kallsyms (since it's read only), it's clearly not safe to do that for /dev/mem, /etc/shadow, or any file containing sensitive information. Rather, we ought to expose a higher level interface that we have more confidence in with respect to understanding the ramifications of exposing that guest data. No way. The guest has sensitive data and exposing it widely on the host is a bad thing to do. [...] Firstly, you are putting words into my mouth, as i said nothing about 'exposing it widely'. I suggest exposing it under the privileges of whoever has access to the guest image. That doesn't work as nicely with SELinux. It's completely reasonable to have a user that can interact in a read only mode with a VM via libvirt but cannot read the guest's disk images or the guest's memory contents. Secondly, regarding confidentiality, and this is guest security 101: whoever can access the image on the host _already_ has access to all the guest data! 
A Linux image can generally be loopback mounted straight away: losetup -o 32256 /dev/loop0 ./guest-image.img mount -o ro /dev/loop0 /mnt-guest (Or, if you are an unprivileged user who cannot mount, it can be read via ext2 tools.) There's nothing the guest can do about that. The host is in total control of guest image data for heaven's sake! It's not that simple in a MAC environment. Regards, Anthony Liguori All i'm suggesting is to make what is already possible more convenient. Ingo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/5] Fix some mmu/emulator atomicity issues (v2)
On Mon, Mar 15, 2010 at 01:59:52PM +0200, Avi Kivity wrote: > Currently when we emulate a locked operation into a shadowed guest page > table, we perform a write rather than a true atomic. This is indicated > by the "emulating exchange as write" message that shows up in dmesg. > > In addition, the pte prefetch operation during invlpg suffered from a > race. This was fixed by removing the operation. > > This patchset fixes both issues and reinstates pte prefetch on invlpg. > > v3: >- rebase against next branch (resolves conflicts via hypercall patch) > > v2: >- fix truncated description for patch 1 >- add new patch 4, which fixes a bug in patch 5 Applied, thanks. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
* Anthony Liguori wrote: > On 03/16/2010 10:52 AM, Ingo Molnar wrote: > >You are quite mistaken: KVM isnt really a 'random unprivileged application' > >in > >this context, it is clearly an extension of system/kernel services. > > > >( Which can be seen from the simple fact that what started the discussion was > > 'how do we get /proc/kallsyms from the guest'. I.e. an extension of the > > existing host-space /proc/kallsyms was desired. ) > > Random tools (like perf) should not be able to do what you describe. It's a > security nightmare. A security nightmare exactly how? Mind to go into details as i dont understand your point. > If it's desirable to have /proc/kallsyms available, we can expose an > interface in QEMU to provide that. That can then be plumbed through libvirt > and QMP. > > Then a management tool can use libvirt or QMP to obtain that information and > interact with the kernel appropriately. > > > In that sense the most natural 'extension' would be the solution i > > mentioned a week or two ago: to have a (read only) mount of all guest > > filesystems, plus a channel for profiling/tracing data. That would make > > symbol parsing easier and it's what extends the existing 'host space' > > abstraction in the most natural way. > > > > ( It doesnt even have to be done via the kernel - Qemu could implement that > > via FUSE for example. ) > > No way. The guest has sensitive data and exposing it widely on the host is > a bad thing to do. [...] Firstly, you are putting words into my mouth, as i said nothing about 'exposing it widely'. I suggest exposing it under the privileges of whoever has access to the guest image. Secondly, regarding confidentiality, and this is guest security 101: whoever can access the image on the host _already_ has access to all the guest data! 
A Linux image can generally be loopback mounted straight away: losetup -o 32256 /dev/loop0 ./guest-image.img mount -o ro /dev/loop0 /mnt-guest (Or, if you are an unprivileged user who cannot mount, it can be read via ext2 tools.) There's nothing the guest can do about that. The host is in total control of guest image data for heaven's sake! All i'm suggesting is to make what is already possible more convenient. Ingo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
* Anthony Liguori wrote: > On 03/16/2010 08:08 AM, Ingo Molnar wrote: > >* Avi Kivity wrote: > > > >>On 03/16/2010 02:29 PM, Ingo Molnar wrote: > >>>I mean, i can trust a kernel service and i can trust /proc/kallsyms. > >>> > >>>Can perf trust a random process claiming to be Qemu? What's the trust > >>>mechanism here? > >>Obviously you can't trust anything you get from a guest, no matter how you > >>get it. > >I'm not talking about the symbol strings and addresses, and the object > >contents for allocation (or debuginfo). I'm talking about the basic protocol > >of establishing which guest is which. > > > >I.e. we really want to be able users to: > > > > 1) have it all working with a single guest, without having to specify > > 'which' > > guest (qemu PID) to work with. That is the dominant usecase both for > > developers and for a fair portion of testers. > > You're making too many assumptions. > > There is no list of guests anymore than there is a list of web browsers. > > You can have a multi-tenant scenario where you have distinct groups of > virtual machines running as unprivileged users. "multi-tenant" and groups is not a valid excuse at all for giving crappy technology in the simplest case: when there's a single VM. Yes, eventually it can be supported and any sane scheme will naturally support it too, but it's by no means what we care about primarily when it comes to these tools. I thought everyone learned the lesson behind SystemTap's failure (and to a certain degree this was behind Oprofile's failure as well): when it comes to tooling/instrumentation we dont want to concentrate on the fancy complex setups and abstract requirements drawn up by CIOs, as development isnt being done there. Concentrate on our developers today, and provide no-compromises usability to those who contribute stuff. If we dont help make the simplest (and most common) use-case convenient then we are failing on a fundamental level. 
> > 2) Have some reasonable symbolic identification for guests. For example a > > usable approach would be to have 'perf kvm list', which would list all > > currently active guests: > > > > $ perf kvm list > >[1] Fedora > >[2] OpenSuse > >[3] Windows-XP > >[4] Windows-7 > > > > And from that point on 'perf kvm -g OpenSuse record' would do the > > obvious > > thing. Users will be able to just use the 'OpenSuse' symbolic name for > > that guest, even if the guest got restarted and switched its main PID. > > Does "perf kvm list" always run as root? What if two unprivileged users > both have a VM named "Fedora"? Again, the single-VM case is the most important case, by far. If you have multiple VMs running and want to develop the kernel on multiple VMs (sounds rather messy if you think it through ...), what would happen is similar to what happens when we have two probes for example: # perf probe schedule Added new event: probe:schedule (on schedule+0) You can now use it on all perf tools, such as: perf record -e probe:schedule -a sleep 1 # perf probe -f schedule Added new event: probe:schedule_1 (on schedule+0) You can now use it on all perf tools, such as: perf record -e probe:schedule_1 -a sleep 1 # perf probe -f schedule Added new event: probe:schedule_2 (on schedule+0) You can now use it on all perf tools, such as: perf record -e probe:schedule_2 -a sleep 1 Something similar could be used for KVM/Qemu: whichever got created first is named 'Fedora', the second is named 'Fedora-2'. > If we look at the use-case, it's going to be something like, a user is > creating virtual machines and wants to get performance information about > them. > > Having to run a separate tool like perf is not going to be what they would > expect they had to do. Instead, they would either use their existing GUI > tool (like virt-manager) or they would use their management interface > (either QMP or libvirt). 
> > The complexity of interaction is due to the fact that perf shouldn't be a > stand alone tool. It should be a library or something with a programmatic > interface that another tool can make use of. But ... a GUI interface/integration is of course possible too, and it's being worked on. perf is mainly a kernel developer tool, and kernel developers generally dont use GUIs to do their stuff: which is the (sole) reason why its first ~850 commits of tools/perf/ were done without a GUI. We go where our developers are. In any case it's not an excuse to have no proper command-line tooling. In fact if you cannot get simpler, more atomic command-line tooling right then you'll probably doubly suck at doing a GUI as well. Ingo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.ke
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
On 03/16/2010 10:52 AM, Ingo Molnar wrote: You are quite mistaken: KVM isnt really a 'random unprivileged application' in this context, it is clearly an extension of system/kernel services. ( Which can be seen from the simple fact that what started the discussion was 'how do we get /proc/kallsyms from the guest'. I.e. an extension of the existing host-space /proc/kallsyms was desired. ) Random tools (like perf) should not be able to do what you describe. It's a security nightmare. If it's desirable to have /proc/kallsyms available, we can expose an interface in QEMU to provide that. That can then be plumbed through libvirt and QMP. Then a management tool can use libvirt or QMP to obtain that information and interact with the kernel appropriately. In that sense the most natural 'extension' would be the solution i mentioned a week or two ago: to have a (read only) mount of all guest filesystems, plus a channel for profiling/tracing data. That would make symbol parsing easier and it's what extends the existing 'host space' abstraction in the most natural way. ( It doesnt even have to be done via the kernel - Qemu could implement that via FUSE for example. ) No way. The guest has sensitive data and exposing it widely on the host is a bad thing to do. It's a bad interface. We can expose specific information about guests but only through our existing channels which are validated through a security infrastructure. Ultimately, your goal is to keep perf a simple tool with little dependencies. But practically speaking, if you want to add features to it, it's going to have to interact with other subsystems in the appropriate way. That means, it's going to need to interact with libvirt or QMP. 
If you want all applications to expose their data via synthetic file systems, then there's always plan9 :-) Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
On 03/16/2010 08:08 AM, Ingo Molnar wrote: * Avi Kivity wrote: On 03/16/2010 02:29 PM, Ingo Molnar wrote: I mean, i can trust a kernel service and i can trust /proc/kallsyms. Can perf trust a random process claiming to be Qemu? What's the trust mechanism here? Obviously you can't trust anything you get from a guest, no matter how you get it. I'm not talking about the symbol strings and addresses, and the object contents for allocation (or debuginfo). I'm talking about the basic protocol of establishing which guest is which. I.e. we really want to be able users to: 1) have it all working with a single guest, without having to specify 'which' guest (qemu PID) to work with. That is the dominant usecase both for developers and for a fair portion of testers. You're making too many assumptions. There is no list of guests anymore than there is a list of web browsers. You can have a multi-tenant scenario where you have distinct groups of virtual machines running as unprivileged users. 2) Have some reasonable symbolic identification for guests. For example a usable approach would be to have 'perf kvm list', which would list all currently active guests: $ perf kvm list [1] Fedora [2] OpenSuse [3] Windows-XP [4] Windows-7 And from that point on 'perf kvm -g OpenSuse record' would do the obvious thing. Users will be able to just use the 'OpenSuse' symbolic name for that guest, even if the guest got restarted and switched its main PID. Does "perf kvm list" always run as root? What if two unprivileged users both have a VM named "Fedora"? If we look at the use-case, it's going to be something like, a user is creating virtual machines and wants to get performance information about them. Having to run a separate tool like perf is not going to be what they would expect they had to do. Instead, they would either use their existing GUI tool (like virt-manager) or they would use their management interface (either QMP or libvirt). 
The complexity of interaction is due to the fact that perf shouldn't be a stand alone tool. It should be a library or something with a programmatic interface that another tool can make use of. Regards, Anthony Liguori Is such a scheme possible/available? I suspect all the KVM configuration tools (i havent used them in some time - gui and command-line tools alike) use similar methods to ease guest management? Ingo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Ideas wiki for GSoC 2010
On 03/16/2010 06:33 PM, Jason wrote: Avi Kivity redhat.com> writes: On 03/16/2010 02:20 AM, Jason wrote: In comparing KVM 2.6.31.6b to XenServer 5.5.0, it seems KVM has fewer overall VMREADs and VMWRITEs, but there are a lot of VMWRITEs to Host FS_SEL, Host GS_SEL, Host FS_BASE, and Host GS_BASE that don't appear in Xen. Ugh, these should definitely be eliminated, they keep writing the same value most of the time. Sorry, I forgot to mention it would also be nice to reduce the unconditional writes to Host CR0 if the TS issue can still be dealt with. I have some ideas on how to deal with that. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
* Frank Ch. Eigler wrote: > Hi - > > On Tue, Mar 16, 2010 at 04:52:21PM +0100, Ingo Molnar wrote: > > [...] > > > Perhaps the fact that kvm happens to deal with an interesting application > > > area (virtualization) is misleading here. As far as the host kernel or > > > other host userspace is concerned, qemu is just some random unprivileged > > > userspace program [...] > > > You are quite mistaken: KVM isnt really a 'random unprivileged > > application' in this context, it is clearly an extension of > > system/kernel services. > > I don't know what "extension of system/kernel services" means in this > context, beyond something running on the system/kernel, like every other > process. [...] It means something like my example of 'extended to guest space' /proc/kallsyms: > > [...] > > > > ( Which can be seen from the simple fact that what started the > > discussion was 'how do we get /proc/kallsyms from the guest'. I.e. an > > extension of the existing host-space /proc/kallsyms was desired. ) > > (Sorry, that smacks of circular reasoning.) To me it sounds like an example supporting my point. /proc/kallsyms is a service by the kernel, and 'perf kvm' desires this to be extended to guest space as well. Thanks, Ingo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Ideas wiki for GSoC 2010
Avi Kivity redhat.com> writes: > > On 03/16/2010 02:20 AM, Jason wrote: > > > > In comparing KVM 2.6.31.6b to XenServer 5.5.0, it seems KVM has fewer > > overall > > VMREADs and VMWRITEs, but there are a lot of VMWRITEs to Host FS_SEL, Host > > GS_SEL, Host FS_BASE, and Host GS_BASE that don't appear in Xen. > > Ugh, these should definitely be eliminated, they keep writing the same > value most of the time. > Sorry, I forgot to mention it would also be nice to reduce the unconditional writes to Host CR0 if the TS issue can still be dealt with. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
Hi - On Tue, Mar 16, 2010 at 04:52:21PM +0100, Ingo Molnar wrote: > [...] > > Perhaps the fact that kvm happens to deal with an interesting application > > area (virtualization) is misleading here. As far as the host kernel or > > other host userspace is concerned, qemu is just some random unprivileged > > userspace program [...] > You are quite mistaken: KVM isnt really a 'random unprivileged > application' in this context, it is clearly an extension of > system/kernel services. I don't know what "extension of system/kernel services" means in this context, beyond something running on the system/kernel, like every other process. To clarify, to what extent do you consider your classification similarly clear for a host is running * multiple kvm instances run as unprivileged users * non-kvm OS simulators such as vmware or xen or gdb * kvm instances running something other than linux > ( Which can be seen from the simple fact that what started the > discussion was 'how do we get /proc/kallsyms from the > guest'. I.e. an extension of the existing host-space /proc/kallsyms > was desired. ) (Sorry, that smacks of circular reasoning.) It may be a charming convenience function for perf users to give them shortcuts for certain favoured configurations (kvm running freshest linux), but that says more about perf than kvm. - FChE -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC/T/D] Unmapped page cache control - via boot parameter
On 03/16/2010 04:27 PM, Balbir Singh wrote: Let's assume the guest has virtio (I agree with IDE we need reordering on the host). The guest sends batches of I/O separated by cache flushes. If the batches are smaller than the virtio queue length, ideally things look like: io_submit(..., batch_size_1); io_getevents(..., batch_size_1); fdatasync(); io_submit(..., batch_size_2); io_getevents(..., batch_size_2); fdatasync(); io_submit(..., batch_size_3); io_getevents(..., batch_size_3); fdatasync(); (certainly that won't happen today, but it could in principle). How does a write cache give any advantage? The host kernel sees _exactly_ the same information as it would from a bunch of threaded pwritev()s followed by fdatasync(). Are you suggesting that the model with cache=writeback gives us the same I/O pattern as cache=none, so there are no opportunities for optimization? Yes. The guest also has a large cache with the same optimization algorithm. (wish: IO_CMD_ORDERED_FDATASYNC) If the batch size is larger than the virtio queue size, or if there are no flushes at all, then yes the huge write cache gives more opportunity for reordering. But we're already talking hundreds of requests here. Let's say the virtio queue size was unlimited. What merging/reordering opportunity are we missing on the host? Again we have exactly the same information: either the pagecache lru + radix tree that identifies all dirty pages in disk order, or the block queue with pending requests that contains exactly the same information. Something is wrong. Maybe it's my understanding, but on the other hand it may be a piece of kernel code. I assume you are talking of dedicated disk partitions and not individual disk images residing on the same partition. Correct. Images in files introduce new writes which can be optimized. 
-- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
* Frank Ch. Eigler wrote: > > Ingo Molnar writes: > > > [...] > >> >I.e. we really want to be able users to: > >> > > >> > 1) have it all working with a single guest, without having to specify > >> > 'which' > >> > guest (qemu PID) to work with. That is the dominant usecase both for > >> > developers and for a fair portion of testers. > >> > >> That's reasonable if we can get it working simply. > > > > IMO such ease of use is reasonable and required, full stop. > > If it cannot be gotten simply then that's a bug: either in the code, or in > > the > > design, or in the development process that led to the design. Bugs need > > fixing. [...] > > Perhaps the fact that kvm happens to deal with an interesting application > area (virtualization) is misleading here. As far as the host kernel or > other host userspace is concerned, qemu is just some random unprivileged > userspace program (with some *optional* /dev/kvm services that might happen > to require temporary root). > > As such, perf trying to instrument qemu is no different than perf trying to > instrument any other userspace widget. Therefore, expecting 'trusted > enumeration' of instances is just as sensible as using 'trusted ps' and > 'trusted /var/run/FOO.pid files'. You are quite mistaken: KVM isnt really a 'random unprivileged application' in this context, it is clearly an extension of system/kernel services. ( Which can be seen from the simple fact that what started the discussion was 'how do we get /proc/kallsyms from the guest'. I.e. an extension of the existing host-space /proc/kallsyms was desired. ) In that sense the most natural 'extension' would be the solution i mentioned a week or two ago: to have a (read only) mount of all guest filesystems, plus a channel for profiling/tracing data. That would make symbol parsing easier and it's what extends the existing 'host space' abstraction in the most natural way. 
( It doesn't even have to be done via the kernel - Qemu could implement that via FUSE for example. ) As a second best option a 'symbol server' might be used too. Thanks, Ingo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM call agenda for Mar 16
On 03/16/2010 10:23 AM, Daniel P. Berrange wrote: In the context of the RHEV management application, iSCSI/SCSI Fibre are providing the raw storage, with LVM VGs on top and the carving LVs for the guests. In the common case the admin/app would monitor VG usage & LV rate of increase to ensure extra space was available in the VG ahead of it being needed. eg if the VG comes close to exhaustion then further LUNS can be obtained and added as PVs to the LVM volume group. So you can't guarantee that a VM won't stop on ENOSPC, but it is very unlikely if the system is operating correctly. As an added complication, since cluster-LVM isn't used, all LVM operations have to be performed on a dedicated/exclusive storage host and then metadata refreshed/propagated to other hosts running VMs. This last issue implies that letting QEMU resize its LV would never be possible, even if it were not for the permissions problem. Sounds like a good argument for polling :-) Regards, Anthony Liguori Regards, Daniel -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Broken loadvm ?
PS: It just occurred to me, that it does indeed freeze and cause a 100% CPU usage. At least I can say for sure that network, serial line, keyboard, nor mouse work. If loadvm is loaded from the command line. If loaded from the monitor, everything seems to work, except the mouse. After a -loadvm from the command line, repeating the command from the monitor doesn't unfreeze it. I am really stuck with this. Any help is greatly appreciated, as downgrading is not an option. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM call agenda for Mar 16
On Tue, Mar 16, 2010 at 10:05:40AM -0500, Anthony Liguori wrote: > On 03/16/2010 05:45 AM, Daniel P. Berrange wrote: > >On Tue, Mar 16, 2010 at 12:38:02PM +0200, Avi Kivity wrote: > > > >>On 03/16/2010 12:31 PM, Daniel P. Berrange wrote: > >> > Polling loops are an indication that something is wrong. > > > >>>Except when people suggest they are the right answer, qcow high > >>>watermark ;-P > >>> > >>> > >>I liked Anthony's suggestion of an lvm2 block format driver. No polling. > >> > >Doesn't that require giving QEMU privileges to perform LVM operations which > >implies QEMU having CAP_SYS_ADMIN ? > > > > If QEMU is able to resize an LVM partition, it needs to carry privileges. > > I'm not sure how this can be done safely in a lesser privileged > environment. Presumably, you're over committing storage and there's not > much you can do if the guests exhaust their storage all at once. In the context of the RHEV management application, iSCSI/SCSI Fibre are providing the raw storage, with LVM VGs on top and the carving LVs for the guests. In the common case the admin/app would monitor VG usage & LV rate of increase to ensure extra space was available in the VG ahead of it being needed. eg if the VG comes close to exhaustion then further LUNS can be obtained and added as PVs to the LVM volume group. So you can't guarentee that a VM won't stop on ENOSPC, but it is very unlikely if the system is operating correctly. As an added complication, since cluster-LVM isn't used, all LVM operations have to be performed on a dedicated/exclusive storage host and then metadata refreshed/propagated to other hosts running VMs. This last issue implies that letting QEMU resize its LV would never be possible, even if it were not for the permissions problem. 
Regards, Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :| -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
Ingo Molnar writes: > [...] >> >I.e. we really want to be able users to: >> > >> > 1) have it all working with a single guest, without having to specify >> > 'which' >> > guest (qemu PID) to work with. That is the dominant usecase both for >> > developers and for a fair portion of testers. >> >> That's reasonable if we can get it working simply. > > IMO such ease of use is reasonable and required, full stop. > If it cannot be gotten simply then that's a bug: either in the code, or in > the > design, or in the development process that led to the design. Bugs need > fixing. [...] Perhaps the fact that kvm happens to deal with an interesting application area (virtualization) is misleading here. As far as the host kernel or other host userspace is concerned, qemu is just some random unprivileged userspace program (with some *optional* /dev/kvm services that might happen to require temporary root). As such, perf trying to instrument qemu is no different than perf trying to instrument any other userspace widget. Therefore, expecting 'trusted enumeration' of instances is just as sensible as using 'trusted ps' and 'trusted /var/run/FOO.pid files'. - FChE -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM call agenda for Mar 16
On 03/16/2010 05:45 AM, Daniel P. Berrange wrote: On Tue, Mar 16, 2010 at 12:38:02PM +0200, Avi Kivity wrote: On 03/16/2010 12:31 PM, Daniel P. Berrange wrote: Polling loops are an indication that something is wrong. Except when people suggest they are the right answer, qcow high watermark ;-P I liked Anthony's suggestion of an lvm2 block format driver. No polling. Doesn't that require giving QEMU privileges to perform LVM operations which implies QEMU having CAP_SYS_ADMIN ? If QEMU is able to resize an LVM partition, it needs to carry privileges. I'm not sure how this can be done safely in a lesser privileged environment. Presumably, you're over committing storage and there's not much you can do if the guests exhaust their storage all at once. Regards, Anthony Liguori Daniel -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Broken loadvm ?
KVM module: modinfo kvm : filename: /lib/modules/2.6.33-30-default/kernel/arch/x86/kvm/kvm.ko license:GPL author: Qumranet srcversion: 611CD56E712705875D0E7BB depends: vermagic: 2.6.33-30-default SMP mod_unload modversions parm: oos_shadow:bool parm: ignore_msrs:bool qemu-kvm : (qemu) info version 0.11.0 (qemu-kvm-0.11.0-4.5.2) command line : qemu-kvm -drive file=/srv/pvmd/vm/templates/XP_32_SP3/disk.qcow2,cache=writeback,index=0,format=qcow2,boot=on -vnc :10 -monitor stdio -net nic,model=e1000,macaddr=52:54:00:12:34:00 -net tap,ifname=tap0,script=no -cpu core2duo -m 512 I have a freshly installed win xp. I was trying to create a snapshot with savevm from the monitor. With cache=writeback performance was good, but it *seemed* that after a loadvm (from monitor or command line) the VM was frozen. I played with it a couple of hours before realizing that in fact, it's only the mouse that doesn't work after a loadvm (through vnc, the system is remote). At least I hope that every other device works as expected. If the loadvm happens in the same session as savevm everything works as expected. I also tested on a 2.6.27.39 with kvm-88 it all works ok. Is this a bug? -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.
On 03/16/2010 08:57 AM, Avi Kivity wrote: On 03/16/2010 03:51 PM, Anthony Liguori wrote: On 03/16/2010 08:29 AM, Avi Kivity wrote: On 03/16/2010 03:17 PM, Yoshiaki Tamura wrote: Avi Kivity wrote: On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote: Modifies wrapper functions for byte-based phys_ram_dirty bitmap to bit-based phys_ram_dirty bitmap, and adds more wrapper functions to prevent direct access to the phys_ram_dirty bitmap. + +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr) +{ + unsigned long mask; + int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS; + int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1); + int ret = 0; + + mask = 1UL<< offset; + if (phys_ram_vga_dirty[index]& mask) + ret |= VGA_DIRTY_FLAG; + if (phys_ram_code_dirty[index]& mask) + ret |= CODE_DIRTY_FLAG; + if (phys_ram_migration_dirty[index]& mask) + ret |= MIGRATION_DIRTY_FLAG; + + return ret; } static inline int cpu_physical_memory_get_dirty(ram_addr_t addr, int dirty_flags) { - return phys_ram_dirty[addr>> TARGET_PAGE_BITS]& dirty_flags; + return cpu_physical_memory_get_dirty_flags(addr)& dirty_flags; } This turns one cacheline access into three. If the dirty bitmaps were in an array, you could do return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS + BITS_IN_LONG)] & mask; with one cacheline access. If I'm understanding the existing code correctly, int dirty_flags can be combined, like VGA + MIGRATION. If we only have to worry about a single dirty flag, I agree with your idea. 
From a quick grep it seems flags are not combined, except for something strange with CODE_DIRTY_FLAG: static void notdirty_mem_writel(void *opaque, target_phys_addr_t ram_addr, uint32_t val) { int dirty_flags; dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; if (!(dirty_flags & CODE_DIRTY_FLAG)) { #if !defined(CONFIG_USER_ONLY) tb_invalidate_phys_page_fast(ram_addr, 4); dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; #endif } stl_p(qemu_get_ram_ptr(ram_addr), val); dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags; /* we remove the notdirty callback only if the code has been flushed */ if (dirty_flags == 0xff) tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr); } I can't say I understand what it does. The semantics of CODE_DIRTY_FLAG are a little counter intuitive. CODE_DIRTY_FLAG means that we know that something isn't code so writes do not need checking for self modifying code. So the hardware equivalent is, when the Instruction TLB loads a page address, clear CODE_DIRTY_FLAG? Yes, and is what tlb_protect_code() does and it's called from tb_alloc_page() which is what's code when a TB is created. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC/T/D] Unmapped page cache control - via boot parameter
* Avi Kivity [2010-03-16 13:08:28]: > On 03/16/2010 12:44 PM, Christoph Hellwig wrote: > >On Tue, Mar 16, 2010 at 12:36:31PM +0200, Avi Kivity wrote: > >>Are you talking about direct volume access or qcow2? > >Doesn't matter. > > > >>For direct volume access, I still don't get it. The number of barriers > >>issues by the host must equal (or exceed, but that's pointless) the > >>number of barriers issued by the guest. cache=writeback allows the host > >>to reorder writes, but so does cache=none. Where does the difference > >>come from? > >> > >>Put it another way. In an unvirtualized environment, if you implement a > >>write cache in a storage driver (not device), and sync it on a barrier > >>request, would you expect to see a performance improvement? > >cache=none only allows very limited reorderning in the host. O_DIRECT > >is synchronous on the host, so there's just some very limited reordering > >going on in the elevator if we have other I/O going on in parallel. > > Presumably there is lots of I/O going on, or we wouldn't be having > this conversation. > We are speaking of multiple VM's doing I/O in parallel. > >In addition to that the disk writecache can perform limited reodering > >and caching, but the disk cache has a rather limited size. The host > >pagecache gives a much wieder opportunity to reorder, especially if > >the guest workload is not cache flush heavy. If the guest workload > >is extremly cache flush heavy the usefulness of the pagecache is rather > >limited, as we'll only use very little of it, but pay by having to do > >a data copy. If the workload is not cache flush heavy, and we have > >multiple guests doing I/O to the same spindles it will allow the host > >do do much more efficient data writeout by beeing able to do better > >ordered (less seeky) and bigger I/O (especially if the host has real > >storage compared to ide for the guest). > > Let's assume the guest has virtio (I agree with IDE we need > reordering on the host). 
The guest sends batches of I/O separated > by cache flushes. If the batches are smaller than the virtio queue > length, ideally things look like: > > io_submit(..., batch_size_1); > io_getevents(..., batch_size_1); > fdatasync(); > io_submit(..., batch_size_2); > io_getevents(..., batch_size_2); > fdatasync(); > io_submit(..., batch_size_3); > io_getevents(..., batch_size_3); > fdatasync(); > > (certainly that won't happen today, but it could in principle). > > How does a write cache give any advantage? The host kernel sees > _exactly_ the same information as it would from a bunch of threaded > pwritev()s followed by fdatasync(). > Are you suggesting that the model with cache=writeback gives us the same I/O pattern as cache=none, so there are no opportunities for optimization? > (wish: IO_CMD_ORDERED_FDATASYNC) > > If the batch size is larger than the virtio queue size, or if there > are no flushes at all, then yes the huge write cache gives more > opportunity for reordering. But we're already talking hundreds of > requests here. > > Let's say the virtio queue size was unlimited. What > merging/reordering opportunity are we missing on the host? Again we > have exactly the same information: either the pagecache lru + radix > tree that identifies all dirty pages in disk order, or the block > queue with pending requests that contains exactly the same > information. > > Something is wrong. Maybe it's my understanding, but on the other > hand it may be a piece of kernel code. > I assume you are talking of dedicated disk partitions and not individual disk images residing on the same partition. -- Three Cheers, Balbir -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.
On 03/16/2010 03:51 PM, Anthony Liguori wrote: On 03/16/2010 08:29 AM, Avi Kivity wrote: On 03/16/2010 03:17 PM, Yoshiaki Tamura wrote: Avi Kivity wrote: On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote: Modifies wrapper functions for byte-based phys_ram_dirty bitmap to bit-based phys_ram_dirty bitmap, and adds more wrapper functions to prevent direct access to the phys_ram_dirty bitmap. + +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr) +{ + unsigned long mask; + int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS; + int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1); + int ret = 0; + + mask = 1UL<< offset; + if (phys_ram_vga_dirty[index]& mask) + ret |= VGA_DIRTY_FLAG; + if (phys_ram_code_dirty[index]& mask) + ret |= CODE_DIRTY_FLAG; + if (phys_ram_migration_dirty[index]& mask) + ret |= MIGRATION_DIRTY_FLAG; + + return ret; } static inline int cpu_physical_memory_get_dirty(ram_addr_t addr, int dirty_flags) { - return phys_ram_dirty[addr>> TARGET_PAGE_BITS]& dirty_flags; + return cpu_physical_memory_get_dirty_flags(addr)& dirty_flags; } This turns one cacheline access into three. If the dirty bitmaps were in an array, you could do return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS + BITS_IN_LONG)] & mask; with one cacheline access. If I'm understanding the existing code correctly, int dirty_flags can be combined, like VGA + MIGRATION. If we only have to worry about a single dirty flag, I agree with your idea. 
From a quick grep it seems flags are not combined, except for something strange with CODE_DIRTY_FLAG: static void notdirty_mem_writel(void *opaque, target_phys_addr_t ram_addr, uint32_t val) { int dirty_flags; dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; if (!(dirty_flags & CODE_DIRTY_FLAG)) { #if !defined(CONFIG_USER_ONLY) tb_invalidate_phys_page_fast(ram_addr, 4); dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; #endif } stl_p(qemu_get_ram_ptr(ram_addr), val); dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags; /* we remove the notdirty callback only if the code has been flushed */ if (dirty_flags == 0xff) tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr); } I can't say I understand what it does. The semantics of CODE_DIRTY_FLAG are a little counter intuitive. CODE_DIRTY_FLAG means that we know that something isn't code so writes do not need checking for self modifying code. So the hardware equivalent is, when the Instruction TLB loads a page address, clear CODE_DIRTY_FLAG? notdirty_mem_write() is called for any ram that is in the virtual TLB that has not been updated yet and once a write has occurred, we can switch to faster access functions (provided we've invalidated any translation blocks). That's why the check is if (!(dirty_flags & CODE_DIRTY_FLAG)), if it hasn't been set yet, we have to assume that it could be a TB so we need to invalidate it. tb_invalidate_phys_page_fast() will set the CODE_DIRTY_FLAG if no code is present in that memory area which is why we fetch dirty_flags again. Ok. We do the store, and then set the dirty bits to mark that the page is now dirty taking care to not change the CODE_DIRTY_FLAG bit. At the very end, we check to see if CODE_DIRTY_FLAG which indicates that we no longer need to trap writes. If so, we call tlb_set_dirty() which will ultimately remove the notdirty callback in favor of a faster access mechanism. 
With respect patch series, there should be no problem having a separate code bitmap that gets updated along with a main bitmap provided that the semantics of CODE_DIRTY_FLAG are preserved. Sounds good to me. So we're going to introduce 4 (VGA, CODE, MIGRATION, master) bit-based bitmaps in total. Yeah, except CODE doesn't behave like the others. Would be best to understand what it's requirements are before making the change. Maybe CODE will need separate handling (so master will only feed VGA and MIGRATION). Generally speaking, cpu_physical_memory_set_dirty() is called by the device model. Any writes by the device model that results in self-modifying code are not going to have predictable semantics which is why it can set CODE_DIRTY_FLAG. CODE_DIRTY_FLAG doesn't need to get updated from a master bitmap. It should be treated as a separate bitmap that is strictly dealt with by the virtual TLB. Thanks. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.
On 03/16/2010 08:29 AM, Avi Kivity wrote: On 03/16/2010 03:17 PM, Yoshiaki Tamura wrote: Avi Kivity wrote: On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote: Modifies wrapper functions for byte-based phys_ram_dirty bitmap to bit-based phys_ram_dirty bitmap, and adds more wrapper functions to prevent direct access to the phys_ram_dirty bitmap. + +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr) +{ + unsigned long mask; + int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS; + int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1); + int ret = 0; + + mask = 1UL<< offset; + if (phys_ram_vga_dirty[index]& mask) + ret |= VGA_DIRTY_FLAG; + if (phys_ram_code_dirty[index]& mask) + ret |= CODE_DIRTY_FLAG; + if (phys_ram_migration_dirty[index]& mask) + ret |= MIGRATION_DIRTY_FLAG; + + return ret; } static inline int cpu_physical_memory_get_dirty(ram_addr_t addr, int dirty_flags) { - return phys_ram_dirty[addr>> TARGET_PAGE_BITS]& dirty_flags; + return cpu_physical_memory_get_dirty_flags(addr)& dirty_flags; } This turns one cacheline access into three. If the dirty bitmaps were in an array, you could do return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS + BITS_IN_LONG)] & mask; with one cacheline access. If I'm understanding the existing code correctly, int dirty_flags can be combined, like VGA + MIGRATION. If we only have to worry about a single dirty flag, I agree with your idea. 
From a quick grep it seems flags are not combined, except for something strange with CODE_DIRTY_FLAG: static void notdirty_mem_writel(void *opaque, target_phys_addr_t ram_addr, uint32_t val) { int dirty_flags; dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; if (!(dirty_flags & CODE_DIRTY_FLAG)) { #if !defined(CONFIG_USER_ONLY) tb_invalidate_phys_page_fast(ram_addr, 4); dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; #endif } stl_p(qemu_get_ram_ptr(ram_addr), val); dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags; /* we remove the notdirty callback only if the code has been flushed */ if (dirty_flags == 0xff) tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr); } I can't say I understand what it does. The semantics of CODE_DIRTY_FLAG are a little counter intuitive. CODE_DIRTY_FLAG means that we know that something isn't code so writes do not need checking for self modifying code. notdirty_mem_write() is called for any ram that is in the virtual TLB that has not been updated yet and once a write has occurred, we can switch to faster access functions (provided we've invalidated any translation blocks). That's why the check is if (!(dirty_flags & CODE_DIRTY_FLAG)), if it hasn't been set yet, we have to assume that it could be a TB so we need to invalidate it. tb_invalidate_phys_page_fast() will set the CODE_DIRTY_FLAG if no code is present in that memory area which is why we fetch dirty_flags again. We do the store, and then set the dirty bits to mark that the page is now dirty taking care to not change the CODE_DIRTY_FLAG bit. At the very end, we check to see if CODE_DIRTY_FLAG which indicates that we no longer need to trap writes. If so, we call tlb_set_dirty() which will ultimately remove the notdirty callback in favor of a faster access mechanism. 
With respect patch series, there should be no problem having a separate code bitmap that gets updated along with a main bitmap provided that the semantics of CODE_DIRTY_FLAG are preserved. Sounds good to me. So we're going to introduce 4 (VGA, CODE, MIGRATION, master) bit-based bitmaps in total. Yeah, except CODE doesn't behave like the others. Would be best to understand what it's requirements are before making the change. Maybe CODE will need separate handling (so master will only feed VGA and MIGRATION). Generally speaking, cpu_physical_memory_set_dirty() is called by the device model. Any writes by the device model that results in self-modifying code are not going to have predictable semantics which is why it can set CODE_DIRTY_FLAG. CODE_DIRTY_FLAG doesn't need to get updated from a master bitmap. It should be treated as a separate bitmap that is strictly dealt with by the virtual TLB. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.
Avi Kivity wrote: On 03/16/2010 03:17 PM, Yoshiaki Tamura wrote: Avi Kivity wrote: On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote: Modifies wrapper functions for byte-based phys_ram_dirty bitmap to bit-based phys_ram_dirty bitmap, and adds more wrapper functions to prevent direct access to the phys_ram_dirty bitmap. + +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr) +{ + unsigned long mask; + int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS; + int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1); + int ret = 0; + + mask = 1UL<< offset; + if (phys_ram_vga_dirty[index]& mask) + ret |= VGA_DIRTY_FLAG; + if (phys_ram_code_dirty[index]& mask) + ret |= CODE_DIRTY_FLAG; + if (phys_ram_migration_dirty[index]& mask) + ret |= MIGRATION_DIRTY_FLAG; + + return ret; } static inline int cpu_physical_memory_get_dirty(ram_addr_t addr, int dirty_flags) { - return phys_ram_dirty[addr>> TARGET_PAGE_BITS]& dirty_flags; + return cpu_physical_memory_get_dirty_flags(addr)& dirty_flags; } This turns one cacheline access into three. If the dirty bitmaps were in an array, you could do return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS + BITS_IN_LONG)] & mask; with one cacheline access. If I'm understanding the existing code correctly, int dirty_flags can be combined, like VGA + MIGRATION. If we only have to worry about a single dirty flag, I agree with your idea. From a quick grep it seems flags are not combined, except for something strange with CODE_DIRTY_FLAG: Thanks for checking out. But the CODE_DIRTY_FLAG makes me really nervous... 
static void notdirty_mem_writel(void *opaque, target_phys_addr_t ram_addr, uint32_t val) { int dirty_flags; dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; if (!(dirty_flags & CODE_DIRTY_FLAG)) { #if !defined(CONFIG_USER_ONLY) tb_invalidate_phys_page_fast(ram_addr, 4); dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; #endif } stl_p(qemu_get_ram_ptr(ram_addr), val); dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags; /* we remove the notdirty callback only if the code has been flushed */ if (dirty_flags == 0xff) tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr); } I can't say I understand what it does. Me neither. This the reason I had to take naive approach... On the other hand, qemu seems to require getting combined dirty flags. If we introduce dirty bitmaps for each type, we need to access each bitmap to get combined flags. I wasn't sure how to make this more efficient... static inline void cpu_physical_memory_set_dirty(ram_addr_t addr) { - phys_ram_dirty[addr>> TARGET_PAGE_BITS] = 0xff; + unsigned long mask; + int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS; + int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1); + + mask = 1UL<< offset; + phys_ram_vga_dirty[index] |= mask; + phys_ram_code_dirty[index] |= mask; + phys_ram_migration_dirty[index] |= mask; +} This is also three cacheline accesses. I think we should have a master bitmap which is updated by set_dirty(), and which is or'ed into the other bitmaps when they are accessed. At least the vga and migration bitmaps are only read periodically, not randomly, so this would be very fast. In a way, this is similar to how the qemu bitmap is updated from the kvm bitmap today. Sounds good to me. So we're going to introduce 4 (VGA, CODE, MIGRATION, master) bit-based bitmaps in total. Yeah, except CODE doesn't behave like the others. Would be best to understand what it's requirements are before making the change. 
Maybe CODE will need separate handling (so master will only feed VGA and MIGRATION). After implementing this patch set, I thought separating the wrapper functions for each dirty flag type might be an option. Unifying everything is inefficient here. But anyway, do you know somebody who has strong insight into this CODE_DIRTY_FLAG? -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/6] qemu-kvm: Introduce bit-based phys_ram_dirty, and bit-based dirty page checker.
Avi Kivity wrote: On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote: Experimental results: Cond1: 1.9 ~ 61 times speed up Cond2: 1.9 ~ 56 times speed up Cond3: 1.9 ~ 59 times speed up Cond4: 1.7 ~ 59 times speed up Impressive results. What's the typical speedup? Closer to 1.9 or 61? To be honest, I thought the result above was too vague... The speed up grows when the number of dirty pages decreases. Let me paste the snipped actual data measured during live migration on Cond1. This result is measured with cpu_get_real_ticks(), so the values should be in raw ticks. 135200 dirty pages: orig.2488419, bitbased.1251171, ratio.1.99 ... 98346 dirty pages: orig.3580533, bitbased.1386918, ratio.2.58 ... 54865 dirty pages: orig.4220865, bitbased.984924, ratio.4.29 ... 27883 dirty pages: orig.4088970, bitbased.514602, ratio.7.95 ... 11541 dirty pages: orig.3854277, bitbased.220410, ratio.17.49 ... 8117 dirty pages: orig.4041765, bitbased.175446, ratio.23.04 3231 dirty pages: orig.3337083, bitbased.105921, ratio.31.51 2401 dirty pages: orig.4103469, bitbased.89406, ratio.45.90 1595 dirty pages: orig.4028949, bitbased.78570, ratio.51.28 756 dirty pages: orig.4036707, bitbased.67662, ratio.59.66 0 dirty pages: orig.3938085, bitbased.23634, ratio.166.63 0 dirty pages: orig.3968163, bitbased.23526, ratio.168.67 We didn't show the data for checking completely empty bitmap because it was too fast and didn't wan't to get wrong impression. Note the issue with the cache accesses for set_dirty() is only applicable to tcg, since kvm always updates the dirty bitmap in a batch (well, I/O also updates the bitmap). I understand. I'm still concerned regarding the way of reseting the dirty bitmap. I was thinking to reset them in a batch, but it seems difficult because of the consistency with the tlb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
On 03/16/2010 03:31 PM, Ingo Molnar wrote: You can do that through libvirt, but that only works for guests started through libvirt. libvirt provides command-line tools to list and manage guests (for example autostarting them on startup), and tools built on top of libvirt can manage guests graphically. Looks like we have a layer inversion here. Maybe we need a plugin system - libvirt drops a .so into perf that teaches it how to list guests and get their symbols. Is libvirt used to start up all KVM guests? If not, if it's only used on some distros while other distros have other solutions then there's apparently no good way to get to such information, and the kernel bits of KVM do not provide it. Developers tend to start qemu from the command line, but the majority of users and all distros I know of use libvirt. Some users cobble up their own scripts. To the user (and to me) this looks like a KVM bug / missing feature. (and the user doesnt care where the blame is) If that is true then apparently the current KVM design has no technically actionable solution for certain categories of features! A plugin system allows anyone who is interested to provide the information; they just need to write a plugin for their management tool. Since we can't prevent people from writing management tools, I don't see what else we can do. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.
On 03/16/2010 07:45 AM, Avi Kivity wrote: On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote: Modifies wrapper functions for byte-based phys_ram_dirty bitmap to bit-based phys_ram_dirty bitmap, and adds more wrapper functions to prevent direct access to the phys_ram_dirty bitmap. + +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr) +{ +unsigned long mask; +int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS; +int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1); +int ret = 0; + +mask = 1UL<< offset; +if (phys_ram_vga_dirty[index]& mask) +ret |= VGA_DIRTY_FLAG; +if (phys_ram_code_dirty[index]& mask) +ret |= CODE_DIRTY_FLAG; +if (phys_ram_migration_dirty[index]& mask) +ret |= MIGRATION_DIRTY_FLAG; + +return ret; } static inline int cpu_physical_memory_get_dirty(ram_addr_t addr, int dirty_flags) { -return phys_ram_dirty[addr>> TARGET_PAGE_BITS]& dirty_flags; +return cpu_physical_memory_get_dirty_flags(addr)& dirty_flags; } This turns one cacheline access into three. If the dirty bitmaps were in an array, you could do return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS + BITS_IN_LONG)] & mask; with one cacheline access. As far as I can tell, we only ever call with a single flag so your suggestion makes sense. I'd suggest introducing these functions before splitting the bitmap up. It makes review a bit easier. static inline void cpu_physical_memory_set_dirty(ram_addr_t addr) { -phys_ram_dirty[addr>> TARGET_PAGE_BITS] = 0xff; +unsigned long mask; +int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS; +int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1); + +mask = 1UL<< offset; +phys_ram_vga_dirty[index] |= mask; +phys_ram_code_dirty[index] |= mask; +phys_ram_migration_dirty[index] |= mask; +} This is also three cacheline accesses. I think we should have a master bitmap which is updated by set_dirty(), and which is or'ed into the other bitmaps when they are accessed. 
At least the vga and migration bitmaps are only read periodically, not randomly, so this would be very fast. In a way, this is similar to how the qemu bitmap is updated from the kvm bitmap today. I am not sure about the code bitmap though. I think your suggestion makes sense and would also work for the code bitmap. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
* Avi Kivity wrote: > On 03/16/2010 03:08 PM, Ingo Molnar wrote: > > > >>>I mean, i can trust a kernel service and i can trust /proc/kallsyms. > >>> > >>>Can perf trust a random process claiming to be Qemu? What's the trust > >>>mechanism here? > >>Obviously you can't trust anything you get from a guest, no matter how you > >>get it. > >I'm not talking about the symbol strings and addresses, and the object > >contents for allocation (or debuginfo). I'm talking about the basic protocol > >of establishing which guest is which. > > There is none. So far, qemu only dealt with managing just its own > guest, and left all multiple guest management to higher levels up > the stack (like libvirt). > > >I.e. we really want to be able users to: > > > > 1) have it all working with a single guest, without having to specify > > 'which' > > guest (qemu PID) to work with. That is the dominant usecase both for > > developers and for a fair portion of testers. > > That's reasonable if we can get it working simply. IMO such ease of use is reasonable and required, full stop. If it cannot be gotten simply then that's a bug: either in the code, or in the design, or in the development process that led to the design. Bugs need fixing. > > 2) Have some reasonable symbolic identification for guests. For example a > > usable approach would be to have 'perf kvm list', which would list all > > currently active guests: > > > > $ perf kvm list > >[1] Fedora > >[2] OpenSuse > >[3] Windows-XP > >[4] Windows-7 > > > > And from that point on 'perf kvm -g OpenSuse record' would do the > > obvious > > thing. Users will be able to just use the 'OpenSuse' symbolic name for > > that guest, even if the guest got restarted and switched its main PID. > > > > Any such facility needs trusted enumeration and a protocol where i can > > trust that the information i got is authorative. (I.e. 
'OpenSuse' truly > > matches to the OpenSuse session - not to some local user starting up a > > Qemu instance that claims to be 'OpenSuse'.) > > > > Is such a scheme possible/available? I suspect all the KVM configuration > > tools (i havent used them in some time - gui and command-line tools alike) > > use similar methods to ease guest management? > > You can do that through libvirt, but that only works for guests started > through libvirt. libvirt provides command-line tools to list and manage > guests (for example autostarting them on startup), and tools built on top of > libvirt can manage guests graphically. > > Looks like we have a layer inversion here. Maybe we need a plugin system - > libvirt drops a .so into perf that teaches it how to list guests and get > their symbols. Is libvirt used to start up all KVM guests? If not, if it's only used on some distros while other distros have other solutions then there's apparently no good way to get to such information, and the kernel bits of KVM do not provide it. To the user (and to me) this looks like a KVM bug / missing feature. (and the user doesnt care where the blame is) If that is true then apparently the current KVM design has no technically actionable solution for certain categories of features! Ingo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.
On 03/16/2010 03:17 PM, Yoshiaki Tamura wrote: Avi Kivity wrote: On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote: Modifies wrapper functions for byte-based phys_ram_dirty bitmap to bit-based phys_ram_dirty bitmap, and adds more wrapper functions to prevent direct access to the phys_ram_dirty bitmap. + +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr) +{ + unsigned long mask; + int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS; + int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1); + int ret = 0; + + mask = 1UL<< offset; + if (phys_ram_vga_dirty[index]& mask) + ret |= VGA_DIRTY_FLAG; + if (phys_ram_code_dirty[index]& mask) + ret |= CODE_DIRTY_FLAG; + if (phys_ram_migration_dirty[index]& mask) + ret |= MIGRATION_DIRTY_FLAG; + + return ret; } static inline int cpu_physical_memory_get_dirty(ram_addr_t addr, int dirty_flags) { - return phys_ram_dirty[addr>> TARGET_PAGE_BITS]& dirty_flags; + return cpu_physical_memory_get_dirty_flags(addr)& dirty_flags; } This turns one cacheline access into three. If the dirty bitmaps were in an array, you could do return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS + BITS_IN_LONG)] & mask; with one cacheline access. If I'm understanding the existing code correctly, int dirty_flags can be combined, like VGA + MIGRATION. If we only have to worry about a single dirty flag, I agree with your idea. 
From a quick grep it seems flags are not combined, except for something strange with CODE_DIRTY_FLAG: static void notdirty_mem_writel(void *opaque, target_phys_addr_t ram_addr, uint32_t val) { int dirty_flags; dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; if (!(dirty_flags & CODE_DIRTY_FLAG)) { #if !defined(CONFIG_USER_ONLY) tb_invalidate_phys_page_fast(ram_addr, 4); dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; #endif } stl_p(qemu_get_ram_ptr(ram_addr), val); dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags; /* we remove the notdirty callback only if the code has been flushed */ if (dirty_flags == 0xff) tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr); } I can't say I understand what it does. On the other hand, qemu seems to require getting combined dirty flags. If we introduce dirty bitmaps for each type, we need to access each bitmap to get combined flags. I wasn't sure how to make this more efficient... static inline void cpu_physical_memory_set_dirty(ram_addr_t addr) { - phys_ram_dirty[addr>> TARGET_PAGE_BITS] = 0xff; + unsigned long mask; + int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS; + int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1); + + mask = 1UL<< offset; + phys_ram_vga_dirty[index] |= mask; + phys_ram_code_dirty[index] |= mask; + phys_ram_migration_dirty[index] |= mask; +} This is also three cacheline accesses. I think we should have a master bitmap which is updated by set_dirty(), and which is or'ed into the other bitmaps when they are accessed. At least the vga and migration bitmaps are only read periodically, not randomly, so this would be very fast. In a way, this is similar to how the qemu bitmap is updated from the kvm bitmap today. Sounds good to me. So we're going to introduce 4 (VGA, CODE, MIGRATION, master) bit-based bitmaps in total. Yeah, except CODE doesn't behave like the others. 
Would be best to understand what its requirements are before making the change. Maybe CODE will need separate handling (so master will only feed VGA and MIGRATION). -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.
Avi Kivity wrote: On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote: Modifies wrapper functions for byte-based phys_ram_dirty bitmap to bit-based phys_ram_dirty bitmap, and adds more wrapper functions to prevent direct access to the phys_ram_dirty bitmap. + +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr) +{ + unsigned long mask; + int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS; + int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1); + int ret = 0; + + mask = 1UL<< offset; + if (phys_ram_vga_dirty[index]& mask) + ret |= VGA_DIRTY_FLAG; + if (phys_ram_code_dirty[index]& mask) + ret |= CODE_DIRTY_FLAG; + if (phys_ram_migration_dirty[index]& mask) + ret |= MIGRATION_DIRTY_FLAG; + + return ret; } static inline int cpu_physical_memory_get_dirty(ram_addr_t addr, int dirty_flags) { - return phys_ram_dirty[addr>> TARGET_PAGE_BITS]& dirty_flags; + return cpu_physical_memory_get_dirty_flags(addr)& dirty_flags; } This turns one cacheline access into three. If the dirty bitmaps were in an array, you could do return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS + BITS_IN_LONG)] & mask; with one cacheline access. If I'm understanding the existing code correctly, int dirty_flags can be combined, like VGA + MIGRATION. If we only have to worry about a single dirty flag, I agree with your idea. On the other hand, qemu seems to require getting combined dirty flags. If we introduce dirty bitmaps for each type, we need to access each bitmap to get combined flags. I wasn't sure how to make this more efficient... 
static inline void cpu_physical_memory_set_dirty(ram_addr_t addr) { - phys_ram_dirty[addr>> TARGET_PAGE_BITS] = 0xff; + unsigned long mask; + int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS; + int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1); + + mask = 1UL<< offset; + phys_ram_vga_dirty[index] |= mask; + phys_ram_code_dirty[index] |= mask; + phys_ram_migration_dirty[index] |= mask; +} This is also three cacheline accesses. I think we should have a master bitmap which is updated by set_dirty(), and which is or'ed into the other bitmaps when they are accessed. At least the vga and migration bitmaps are only read periodically, not randomly, so this would be very fast. In a way, this is similar to how the qemu bitmap is updated from the kvm bitmap today. Sounds good to me. So we're going to introduce 4 (VGA, CODE, MIGRATION, master) bit-based bitmaps in total. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
On 03/16/2010 03:08 PM, Ingo Molnar wrote: I mean, i can trust a kernel service and i can trust /proc/kallsyms. Can perf trust a random process claiming to be Qemu? What's the trust mechanism here? Obviously you can't trust anything you get from a guest, no matter how you get it. I'm not talking about the symbol strings and addresses, and the object contents for allocation (or debuginfo). I'm talking about the basic protocol of establishing which guest is which. There is none. So far, qemu only dealt with managing just its own guest, and left all multiple guest management to higher levels up the stack (like libvirt). I.e. we really want to be able users to: 1) have it all working with a single guest, without having to specify 'which' guest (qemu PID) to work with. That is the dominant usecase both for developers and for a fair portion of testers. That's reasonable if we can get it working simply. 2) Have some reasonable symbolic identification for guests. For example a usable approach would be to have 'perf kvm list', which would list all currently active guests: $ perf kvm list [1] Fedora [2] OpenSuse [3] Windows-XP [4] Windows-7 And from that point on 'perf kvm -g OpenSuse record' would do the obvious thing. Users will be able to just use the 'OpenSuse' symbolic name for that guest, even if the guest got restarted and switched its main PID. Any such facility needs trusted enumeration and a protocol where i can trust that the information i got is authorative. (I.e. 'OpenSuse' truly matches to the OpenSuse session - not to some local user starting up a Qemu instance that claims to be 'OpenSuse'.) Is such a scheme possible/available? I suspect all the KVM configuration tools (i havent used them in some time - gui and command-line tools alike) use similar methods to ease guest management? You can do that through libvirt, but that only works for guests started through libvirt. 
libvirt provides command-line tools to list and manage guests (for example autostarting them on startup), and tools built on top of libvirt can manage guests graphically. Looks like we have a layer inversion here. Maybe we need a plugin system - libvirt drops a .so into perf that teaches it how to list guests and get their symbols. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/6] qemu-kvm: Introduce bit-based phys_ram_dirty, and bit-based dirty page checker.
On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote: Experimental results: Cond1: 1.9 ~ 61 times speed up Cond2: 1.9 ~ 56 times speed up Cond3: 1.9 ~ 59 times speed up Cond4: 1.7 ~ 59 times speed up Impressive results. What's the typical speedup? Closer to 1.9 or 61? Note the issue with the cache accesses for set_dirty() is only applicable to tcg, since kvm always updates the dirty bitmap in a batch (well, I/O also updates the bitmap). -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
* Avi Kivity wrote: > On 03/16/2010 02:29 PM, Ingo Molnar wrote: > > I mean, i can trust a kernel service and i can trust /proc/kallsyms. > > > > Can perf trust a random process claiming to be Qemu? What's the trust > > mechanism here? > > Obviously you can't trust anything you get from a guest, no matter how you > get it. I'm not talking about the symbol strings and addresses, and the object contents for allocation (or debuginfo). I'm talking about the basic protocol of establishing which guest is which. I.e. we really want to be able users to: 1) have it all working with a single guest, without having to specify 'which' guest (qemu PID) to work with. That is the dominant usecase both for developers and for a fair portion of testers. 2) Have some reasonable symbolic identification for guests. For example a usable approach would be to have 'perf kvm list', which would list all currently active guests: $ perf kvm list [1] Fedora [2] OpenSuse [3] Windows-XP [4] Windows-7 And from that point on 'perf kvm -g OpenSuse record' would do the obvious thing. Users will be able to just use the 'OpenSuse' symbolic name for that guest, even if the guest got restarted and switched its main PID. Any such facility needs trusted enumeration and a protocol where i can trust that the information i got is authorative. (I.e. 'OpenSuse' truly matches to the OpenSuse session - not to some local user starting up a Qemu instance that claims to be 'OpenSuse'.) Is such a scheme possible/available? I suspect all the KVM configuration tools (i havent used them in some time - gui and command-line tools alike) use similar methods to ease guest management? Ingo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/6] qemu-kvm: Introduce bit-based phys_ram_dirty for VGA, CODE and MIGRATION.
On 03/16/2010 03:01 PM, Yoshiaki Tamura wrote: -uint8_t *phys_ram_dirty; +unsigned long *phys_ram_vga_dirty; +unsigned long *phys_ram_code_dirty; +unsigned long *phys_ram_migration_dirty; Would be nice to make this an array. Thanks for pointing that out. I have a question regarding the index of the array. From the compatibility perspective, I would prefer using the existing macros. #define VGA_DIRTY_FLAG 0x01 #define CODE_DIRTY_FLAG 0x02 #define MIGRATION_DIRTY_FLAG 0x08 However, if I use them as is, I'll get a sparse array... Is it acceptable to change these values to 0, 1, 2? Sure. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/6] qemu-kvm: Introduce bit-based phys_ram_dirty for VGA, CODE and MIGRATION.
Avi Kivity wrote: On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote: Replaces byte-based phys_ram_dirty bitmap with three bit-based phys_ram_dirty bitmap. On allocation, it sets all bits in the bitmap. Signed-off-by: Yoshiaki Tamura Signed-off-by: OHMURA Kei --- exec.c | 22 +- 1 files changed, 17 insertions(+), 5 deletions(-) diff --git a/exec.c b/exec.c index 9bcb4de..ba334e7 100644 --- a/exec.c +++ b/exec.c @@ -119,7 +119,9 @@ uint8_t *code_gen_ptr; #if !defined(CONFIG_USER_ONLY) int phys_ram_fd; -uint8_t *phys_ram_dirty; +unsigned long *phys_ram_vga_dirty; +unsigned long *phys_ram_code_dirty; +unsigned long *phys_ram_migration_dirty; Would be nice to make this an array. Thanks for pointing out. I have a question regarding the index of the array. From the compatibility perspective, I would prefer using the existing macros. #define VGA_DIRTY_FLAG 0x01 #define CODE_DIRTY_FLAG 0x02 #define MIGRATION_DIRTY_FLAG 0x08 However, if I use them as is, I'll get a sparse array... Is it acceptable to change these values like 0, 1, 2? -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] add "xchg ax, reg" emulator test
On 03/16/2010 02:52 PM, Gleb Natapov wrote: + MK_INSN(xchg_test1, "xchg %eax,%eax\n\t"); AKA 'nop'. Yep, but we still emulate it. And well too. Note that 'xchg eax, ebx' will clear the top 32-bits of rax and rbx, but 'xchg eax, eax' will not. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] add "xchg ax, reg" emulator test
On Tue, Mar 16, 2010 at 02:50:58PM +0200, Avi Kivity wrote: > On 03/16/2010 02:42 PM, Gleb Natapov wrote: > >Add test for opcodes 0x90-0x9f emulation > > > > Reviewed-by: Avi Kivity > > >+MK_INSN(xchg_test1, "xchg %eax,%eax\n\t"); > > AKA 'nop'. > Yep, but we still emulate it. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] add "xchg ax, reg" emulator test
On 03/16/2010 02:42 PM, Gleb Natapov wrote: Add test for opcodes 0x90-0x9f emulation Reviewed-by: Avi Kivity + MK_INSN(xchg_test1, "xchg %eax,%eax\n\t"); AKA 'nop'. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/6] qemu-kvm: Introduce cpu_physical_memory_get_dirty_range().
On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote: Introduces cpu_physical_memory_get_dirty_range(). It checks the first row and puts dirty addr in the array. If the first row is empty, it skips to the first non-dirty row or the end addr, and put the length in the first entry of the array. +/* It checks the first row and puts dirty addrs in the array. + If the first row is empty, it skips to the first non-dirty row + or the end addr, and put the length in the first entry of the array. */ +int cpu_physical_memory_get_dirty_range(ram_addr_t start, ram_addr_t end, +ram_addr_t *dirty_rams, int length, +int dirty_flag) +{ +unsigned long phys_ram_dirty, page_number, *p; +ram_addr_t addr; +int s_idx = (start>> TARGET_PAGE_BITS) / HOST_LONG_BITS; +int e_idx = (end>> TARGET_PAGE_BITS) / HOST_LONG_BITS; +int i, j, offset; + +switch (dirty_flag) { +case VGA_DIRTY_FLAG: +p = phys_ram_vga_dirty; +break; +case CODE_DIRTY_FLAG: +p = phys_ram_code_dirty; +break; +case MIGRATION_DIRTY_FLAG: +p = phys_ram_migration_dirty; +break; +default: +abort(); +} This bit would be improved by switching to an array of bitmaps. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.
On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote: Modifies wrapper functions for byte-based phys_ram_dirty bitmap to bit-based phys_ram_dirty bitmap, and adds more wrapper functions to prevent direct access to the phys_ram_dirty bitmap. + +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr) +{ +unsigned long mask; +int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS; +int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1); +int ret = 0; + +mask = 1UL<< offset; +if (phys_ram_vga_dirty[index]& mask) +ret |= VGA_DIRTY_FLAG; +if (phys_ram_code_dirty[index]& mask) +ret |= CODE_DIRTY_FLAG; +if (phys_ram_migration_dirty[index]& mask) +ret |= MIGRATION_DIRTY_FLAG; + +return ret; } static inline int cpu_physical_memory_get_dirty(ram_addr_t addr, int dirty_flags) { -return phys_ram_dirty[addr>> TARGET_PAGE_BITS]& dirty_flags; +return cpu_physical_memory_get_dirty_flags(addr)& dirty_flags; } This turns one cacheline access into three. If the dirty bitmaps were in an array, you could do return dirty_bitmaps[dirty_index][addr >> (TARGET_PAGE_BITS + BITS_IN_LONG)] & mask; with one cacheline access. static inline void cpu_physical_memory_set_dirty(ram_addr_t addr) { -phys_ram_dirty[addr>> TARGET_PAGE_BITS] = 0xff; +unsigned long mask; +int index = (addr>> TARGET_PAGE_BITS) / HOST_LONG_BITS; +int offset = (addr>> TARGET_PAGE_BITS)& (HOST_LONG_BITS - 1); + +mask = 1UL<< offset; +phys_ram_vga_dirty[index] |= mask; +phys_ram_code_dirty[index] |= mask; +phys_ram_migration_dirty[index] |= mask; +} This is also three cacheline accesses. I think we should have a master bitmap which is updated by set_dirty(), and which is or'ed into the other bitmaps when they are accessed. At least the vga and migration bitmaps are only read periodically, not randomly, so this would be very fast. In a way, this is similar to how the qemu bitmap is updated from the kvm bitmap today. I am not sure about the code bitmap though. 
-- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] add "xchg ax, reg" emulator test
Add test for opcodes 0x90-0x9f emulation Signed-off-by: Gleb Natapov diff --git a/kvm/user/test/x86/realmode.c b/kvm/user/test/x86/realmode.c index bc6b27f..bfc2942 100644 --- a/kvm/user/test/x86/realmode.c +++ b/kvm/user/test/x86/realmode.c @@ -141,6 +141,90 @@ int regs_equal(const struct regs *r1, const struct regs *r2, int ignore) ); \ extern u8 insn_##name[], insn_##name##_end[] +void test_xchg(void) +{ + struct regs inregs = { .eax = 0, .ebx = 1, .ecx = 2, .edx = 3, .esi = 4, .edi = 5, .ebp = 6, .esp = 7}, outregs; + + MK_INSN(xchg_test1, "xchg %eax,%eax\n\t"); + MK_INSN(xchg_test2, "xchg %eax,%ebx\n\t"); + MK_INSN(xchg_test3, "xchg %eax,%ecx\n\t"); + MK_INSN(xchg_test4, "xchg %eax,%edx\n\t"); + MK_INSN(xchg_test5, "xchg %eax,%esi\n\t"); + MK_INSN(xchg_test6, "xchg %eax,%edi\n\t"); + MK_INSN(xchg_test7, "xchg %eax,%ebp\n\t"); + MK_INSN(xchg_test8, "xchg %eax,%esp\n\t"); + + exec_in_big_real_mode(&inregs, &outregs, + insn_xchg_test1, + insn_xchg_test1_end - insn_xchg_test1); + + if (!regs_equal(&inregs, &outregs, 0)) + print_serial("xchg test 1: FAIL\n"); + + exec_in_big_real_mode(&inregs, &outregs, + insn_xchg_test2, + insn_xchg_test2_end - insn_xchg_test2); + + if (!regs_equal(&inregs, &outregs, R_AX | R_BX) || +outregs.eax != inregs.ebx || +outregs.ebx != inregs.eax) + print_serial("xchg test 2: FAIL\n"); + + exec_in_big_real_mode(&inregs, &outregs, + insn_xchg_test3, + insn_xchg_test3_end - insn_xchg_test3); + + if (!regs_equal(&inregs, &outregs, R_AX | R_CX) || +outregs.eax != inregs.ecx || +outregs.ecx != inregs.eax) + print_serial("xchg test 3: FAIL\n"); + + exec_in_big_real_mode(&inregs, &outregs, + insn_xchg_test4, + insn_xchg_test4_end - insn_xchg_test4); + + if (!regs_equal(&inregs, &outregs, R_AX | R_DX) || +outregs.eax != inregs.edx || +outregs.edx != inregs.eax) + print_serial("xchg test 4: FAIL\n"); + + exec_in_big_real_mode(&inregs, &outregs, + insn_xchg_test5, + insn_xchg_test5_end - insn_xchg_test5); + + if (!regs_equal(&inregs, &outregs, R_AX 
| R_SI) || +outregs.eax != inregs.esi || +outregs.esi != inregs.eax) + print_serial("xchg test 5: FAIL\n"); + + exec_in_big_real_mode(&inregs, &outregs, + insn_xchg_test6, + insn_xchg_test6_end - insn_xchg_test6); + + if (!regs_equal(&inregs, &outregs, R_AX | R_DI) || +outregs.eax != inregs.edi || +outregs.edi != inregs.eax) + print_serial("xchg test 6: FAIL\n"); + + exec_in_big_real_mode(&inregs, &outregs, + insn_xchg_test7, + insn_xchg_test7_end - insn_xchg_test7); + + if (!regs_equal(&inregs, &outregs, R_AX | R_BP) || +outregs.eax != inregs.ebp || +outregs.ebp != inregs.eax) + print_serial("xchg test 7: FAIL\n"); + + exec_in_big_real_mode(&inregs, &outregs, + insn_xchg_test8, + insn_xchg_test8_end - insn_xchg_test8); + + if (!regs_equal(&inregs, &outregs, R_AX | R_SP) || +outregs.eax != inregs.esp || +outregs.esp != inregs.eax) + print_serial("xchg test 8: FAIL\n"); +} + void test_shld(void) { struct regs inregs = { .eax = 0xbe, .edx = 0xef00 }, outregs; @@ -572,6 +656,7 @@ void realmode_start(void) test_call(); /* long jmp test uses call near so test it after testing call */ test_long_jmp(); + test_xchg(); exit(0); } -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
On 03/16/2010 02:29 PM, Ingo Molnar wrote: * Avi Kivity wrote: On 03/16/2010 01:25 PM, Ingo Molnar wrote: I haven't followed vmchannel closely, but I think it is. vmchannel is terminated in qemu on the host side, not in the host kernel. So perf would need to connect to qemu. Hm, that sounds rather messy if we want to use it to basically expose kernel functionality in a guest/host unified way. Is the qemu process discoverable in some secure way? We know its pid. How do i get a list of all 'guest instance PIDs', Libvirt manages all qemus, but this should be implemented independently of libvirt. and what is the way to talk to Qemu? In general qemu exposes communication channels (such as the monitor) as tcp connections, unix-domain sockets, stdio, etc. It's very flexible. Can we trust it? No choice, it contains the guest address space. I mean, i can trust a kernel service and i can trust /proc/kallsyms. Can perf trust a random process claiming to be Qemu? What's the trust mechanism here? Obviously you can't trust anything you get from a guest, no matter how you get it. How do you trust a userspace program's symbols? you don't. How do you get them? they're on a well-known location. Is there some proper tooling available to do it, or do we have to push it through 2-3 packages to get such a useful feature done? libvirt manages qemu processes, but I don't think this should go through libvirt. qemu can do this directly by opening a unix domain socket in a well-known place. So Qemu has never run into such problems before? ( Sounds weird - i think Qemu configuration itself should be done via a unix domain socket driven configuration protocol as well. ) That's exactly what happens. You invoke qemu with -monitor unix:blah,server (or -qmp for a machine-readable format) and have your management application connect to that. You can redirect guest serial ports, console, parallel port, etc. to unix-domain or tcp sockets. vmchannel is an extension of that mechanism. 
( That is the general thought process how many cross-discipline useful desktop/server features hit the bit bucket before having had any chance of being vetted by users, and why Linux sucks so much when it comes to feature integration and application usability. ) You can't solve everything in the kernel, even with a well populated tools/. Certainly not, but this is a technical problem in the kernel's domain, so it's a fair (and natural) expectation to be able to solve this within the kernel project. Someone writing perf-gui outside the kernel would have the same problems, no? -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
* Avi Kivity wrote: > On 03/16/2010 01:25 PM, Ingo Molnar wrote: > > > >>I haven't followed vmchannel closely, but I think it is. vmchannel is > >>terminated in qemu on the host side, not in the host kernel. So perf would > >>need to connect to qemu. > >Hm, that sounds rather messy if we want to use it to basically expose kernel > >functionality in a guest/host unified way. Is the qemu process discoverable > >in > >some secure way? > > We know its pid. How do i get a list of all 'guest instance PIDs', and what is the way to talk to Qemu? > > Can we trust it? > > No choice, it contains the guest address space. I mean, i can trust a kernel service and i can trust /proc/kallsyms. Can perf trust a random process claiming to be Qemu? What's the trust mechanism here? > > Is there some proper tooling available to do it, or do we have to push it > > through 2-3 packages to get such a useful feature done? > > libvirt manages qemu processes, but I don't think this should go through > libvirt. qemu can do this directly by opening a unix domain socket in a > well-known place. So Qemu has never run into such problems before? ( Sounds weird - i think Qemu configuration itself should be done via a unix domain socket driven configuration protocol as well. ) > >( That is the general thought process how many cross-discipline useful > > desktop/server features hit the bit bucket before having had any chance of > > being vetted by users, and why Linux sucks so much when it comes to > > feature > > integration and application usability. ) > > You can't solve everything in the kernel, even with a well populated tools/. Certainly not, but this is a technical problem in the kernel's domain, so it's a fair (and natural) expectation to be able to solve this within the kernel project. Ingo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/6] qemu-kvm: Introduce bit-based phys_ram_dirty for VGA, CODE and MIGRATION.
On 03/16/2010 12:53 PM, Yoshiaki Tamura wrote: Replaces byte-based phys_ram_dirty bitmap with three bit-based phys_ram_dirty bitmap. On allocation, it sets all bits in the bitmap. Signed-off-by: Yoshiaki Tamura Signed-off-by: OHMURA Kei --- exec.c | 22 +- 1 files changed, 17 insertions(+), 5 deletions(-) diff --git a/exec.c b/exec.c index 9bcb4de..ba334e7 100644 --- a/exec.c +++ b/exec.c @@ -119,7 +119,9 @@ uint8_t *code_gen_ptr; #if !defined(CONFIG_USER_ONLY) int phys_ram_fd; -uint8_t *phys_ram_dirty; +unsigned long *phys_ram_vga_dirty; +unsigned long *phys_ram_code_dirty; +unsigned long *phys_ram_migration_dirty; Would be nice to make this an array. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
On 03/16/2010 01:25 PM, Ingo Molnar wrote: I haven't followed vmchannel closely, but I think it is. vmchannel is terminated in qemu on the host side, not in the host kernel. So perf would need to connect to qemu. Hm, that sounds rather messy if we want to use it to basically expose kernel functionality in a guest/host unified way. Is the qemu process discoverable in some secure way? We know its pid. Can we trust it? No choice, it contains the guest address space. Is there some proper tooling available to do it, or do we have to push it through 2-3 packages to get such a useful feature done? libvirt manages qemu processes, but I don't think this should go through libvirt. qemu can do this directly by opening a unix domain socket in a well-known place. ( That is the general thought process how many cross-discipline useful desktop/server features hit the bit bucket before having had any chance of being vetted by users, and why Linux sucks so much when it comes to feature integration and application usability. ) You can't solve everything in the kernel, even with a well populated tools/. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.
On Tue, Mar 16, 2010 at 05:32:17PM +0800, Xin Xiaohui wrote: > The vhost-net backend now only supports synchronous send/recv > operations. The patch provides multiple submits and asynchronous > notifications. This is needed for zero-copy case. > > Signed-off-by: Xin Xiaohui > --- > > Michael, > I don't use the kiocb comes from the sendmsg/recvmsg, > since I have embeded the kiocb in page_info structure, > and allocate it when page_info allocated. So what I suggested was that vhost allocates and tracks the iocbs, and passes them to your device with sendmsg/ recvmsg calls. This way your device won't need to share structures and locking strategy with vhost: you get an iocb, handle it, invoke a callback to notify vhost about completion. This also gets rid of the 'receiver' callback. > Please have a review and thanks for the instruction > for replying email which helps me a lot. > > Thanks, > Xiaohui > > drivers/vhost/net.c | 159 > +++-- > drivers/vhost/vhost.h | 12 > 2 files changed, 166 insertions(+), 5 deletions(-) > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > index 22d5fef..5483848 100644 > --- a/drivers/vhost/net.c > +++ b/drivers/vhost/net.c > @@ -17,11 +17,13 @@ > #include > #include > #include > +#include > > #include > #include > #include > #include > +#include > > #include > > @@ -91,6 +93,12 @@ static void tx_poll_start(struct vhost_net *net, struct > socket *sock) > net->tx_poll_state = VHOST_NET_POLL_STARTED; > } > > +static void handle_async_rx_events_notify(struct vhost_net *net, > + struct vhost_virtqueue *vq); > + > +static void handle_async_tx_events_notify(struct vhost_net *net, > + struct vhost_virtqueue *vq); > + A couple of style comments: - It's better to arrange functions in such order that forward declarations aren't necessary. Since we don't have recursion, this should always be possible. - continuation lines should be idented at least at the position of '(' on the previous line. 
> /* Expects to be always run from workqueue - which acts as > * read-size critical section for our kind of RCU. */ > static void handle_tx(struct vhost_net *net) > @@ -124,6 +132,8 @@ static void handle_tx(struct vhost_net *net) > tx_poll_stop(net); > hdr_size = vq->hdr_size; > > + handle_async_tx_events_notify(net, vq); > + > for (;;) { > head = vhost_get_vq_desc(&net->dev, vq, vq->iov, >ARRAY_SIZE(vq->iov), > @@ -151,6 +161,12 @@ static void handle_tx(struct vhost_net *net) > /* Skip header. TODO: support TSO. */ > s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out); > msg.msg_iovlen = out; > + > + if (vq->link_state == VHOST_VQ_LINK_ASYNC) { > + vq->head = head; > + msg.msg_control = (void *)vq; So here a device gets a pointer to vhost_virtqueue structure. If it gets an iocb and invokes a callback, it would not care about vhost internals. > + } > + > len = iov_length(vq->iov, out); > /* Sanity check */ > if (!len) { > @@ -166,6 +182,10 @@ static void handle_tx(struct vhost_net *net) > tx_poll_start(net, sock); > break; > } > + > + if (vq->link_state == VHOST_VQ_LINK_ASYNC) > + continue; > + > if (err != len) > pr_err("Truncated TX packet: " > " len %d != %zd\n", err, len); > @@ -177,6 +197,8 @@ static void handle_tx(struct vhost_net *net) > } > } > > + handle_async_tx_events_notify(net, vq); > + > mutex_unlock(&vq->mutex); > unuse_mm(net->dev.mm); > } > @@ -206,7 +228,8 @@ static void handle_rx(struct vhost_net *net) > int err; > size_t hdr_size; > struct socket *sock = rcu_dereference(vq->private_data); > - if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue)) > + if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) && > + vq->link_state == VHOST_VQ_LINK_SYNC)) > return; > > use_mm(net->dev.mm); > @@ -214,9 +237,18 @@ static void handle_rx(struct vhost_net *net) > vhost_disable_notify(vq); > hdr_size = vq->hdr_size; > > - vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? 
> + /* In async cases, for write logging, the simple way is to get > + * the log info always, and really logging is decided later. > + * Thus, when logging enabled, we can get log, and when logging > + * disabled, we can get log disabled accordingly. > + */ > + This adds overhead and might be one of the reasons your patch does not perform that well. A better way would be to flush outsta
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
* Avi Kivity wrote: > On 03/16/2010 12:50 PM, Ingo Molnar wrote: > > > >>I'm confused - initrd seems to be guest-side. I was talking about the host > >>side. > >host side doesnt need much support - just some client capability in perf > >itself. I suspect vmchannels are sufficiently flexible and configuration-free > >for such purposes? (i.e. like a filesystem in essence) > > I haven't followed vmchannel closely, but I think it is. vmchannel is > terminated in qemu on the host side, not in the host kernel. So perf would > need to connect to qemu. Hm, that sounds rather messy if we want to use it to basically expose kernel functionality in a guest/host unified way. Is the qemu process discoverable in some secure way? Can we trust it? Is there some proper tooling available to do it, or do we have to push it through 2-3 packages to get such a useful feature done? ( That is the general thought process how many cross-discipline useful desktop/server features hit the bit bucket before having had any chance of being vetted by users, and why Linux sucks so much when it comes to feature integration and application usability. ) Ingo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM call agenda for Mar 16
On 03/16/2010 12:45 PM, Daniel P. Berrange wrote: On Tue, Mar 16, 2010 at 12:38:02PM +0200, Avi Kivity wrote: On 03/16/2010 12:31 PM, Daniel P. Berrange wrote: Polling loops are an indication that something is wrong. Except when people suggest they are the right answer, qcow high watermark ;-P I liked Anthony's suggestion of an lvm2 block format driver. No polling. Doesn't that require giving QEMU privileges to perform LVM operations which implies QEMU having CAP_SYS_ADMIN ? Ouch. I expect fd permissions on the volume are insufficient, and fd permissions on the group are excessive. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
On 03/16/2010 12:50 PM, Ingo Molnar wrote: I'm confused - initrd seems to be guest-side. I was talking about the host side. host side doesnt need much support - just some client capability in perf itself. I suspect vmchannels are sufficiently flexible and configuration-free for such purposes? (i.e. like a filesystem in essence) I haven't followed vmchannel closely, but I think it is. vmchannel is terminated in qemu on the host side, not in the host kernel. So perf would need to connect to qemu. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC/T/D] Unmapped page cache control - via boot parameter
On 03/16/2010 12:44 PM, Christoph Hellwig wrote: On Tue, Mar 16, 2010 at 12:36:31PM +0200, Avi Kivity wrote: Are you talking about direct volume access or qcow2? Doesn't matter. For direct volume access, I still don't get it. The number of barriers issues by the host must equal (or exceed, but that's pointless) the number of barriers issued by the guest. cache=writeback allows the host to reorder writes, but so does cache=none. Where does the difference come from? Put it another way. In an unvirtualized environment, if you implement a write cache in a storage driver (not device), and sync it on a barrier request, would you expect to see a performance improvement? cache=none only allows very limited reorderning in the host. O_DIRECT is synchronous on the host, so there's just some very limited reordering going on in the elevator if we have other I/O going on in parallel. Presumably there is lots of I/O going on, or we wouldn't be having this conversation. In addition to that the disk writecache can perform limited reodering and caching, but the disk cache has a rather limited size. The host pagecache gives a much wieder opportunity to reorder, especially if the guest workload is not cache flush heavy. If the guest workload is extremly cache flush heavy the usefulness of the pagecache is rather limited, as we'll only use very little of it, but pay by having to do a data copy. If the workload is not cache flush heavy, and we have multiple guests doing I/O to the same spindles it will allow the host do do much more efficient data writeout by beeing able to do better ordered (less seeky) and bigger I/O (especially if the host has real storage compared to ide for the guest). Let's assume the guest has virtio (I agree with IDE we need reordering on the host). The guest sends batches of I/O separated by cache flushes. 
If the batches are smaller than the virtio queue length, ideally things look like: io_submit(..., batch_size_1); io_getevents(..., batch_size_1); fdatasync(); io_submit(..., batch_size_2); io_getevents(..., batch_size_2); fdatasync(); io_submit(..., batch_size_3); io_getevents(..., batch_size_3); fdatasync(); (certainly that won't happen today, but it could in principle). How does a write cache give any advantage? The host kernel sees _exactly_ the same information as it would from a bunch of threaded pwritev()s followed by fdatasync(). (wish: IO_CMD_ORDERED_FDATASYNC) If the batch size is larger than the virtio queue size, or if there are no flushes at all, then yes the huge write cache gives more opportunity for reordering. But we're already talking hundreds of requests here. Let's say the virtio queue size was unlimited. What merging/reordering opportunity are we missing on the host? Again we have exactly the same information: either the pagecache lru + radix tree that identifies all dirty pages in disk order, or the block queue with pending requests that contains exactly the same information. Something is wrong. Maybe it's my understanding, but on the other hand it may be a piece of kernel code. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/6] qemu-kvm: Use cpu_physical_memory_set_dirty_range() to update phys_ram_dirty.
Modifies kvm_get_dirty_pages_log_range to use cpu_physical_memory_set_dirty_range() to update the row of the bit-based phys_ram_dirty bitmap at once. Signed-off-by: Yoshiaki Tamura Signed-off-by: OHMURA Kei --- qemu-kvm.c | 19 ++- 1 files changed, 6 insertions(+), 13 deletions(-) diff --git a/qemu-kvm.c b/qemu-kvm.c index e417f21..75fa9b0 100644 --- a/qemu-kvm.c +++ b/qemu-kvm.c @@ -2305,9 +2305,8 @@ static int kvm_get_dirty_pages_log_range(unsigned long start_addr, unsigned long offset, unsigned long mem_size) { -unsigned int i, j; -unsigned long page_number, addr, addr1, c; -ram_addr_t ram_addr; +unsigned int i; +unsigned long page_number, addr, addr1; unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) / HOST_LONG_BITS; @@ -2317,16 +2316,10 @@ static int kvm_get_dirty_pages_log_range(unsigned long start_addr, */ for (i = 0; i < len; i++) { if (bitmap[i] != 0) { -c = leul_to_cpu(bitmap[i]); -do { -j = ffsl(c) - 1; -c &= ~(1ul << j); -page_number = i * HOST_LONG_BITS + j; -addr1 = page_number * TARGET_PAGE_SIZE; -addr = offset + addr1; -ram_addr = cpu_get_physical_page_desc(addr); -cpu_physical_memory_set_dirty(ram_addr); -} while (c != 0); +page_number = i * HOST_LONG_BITS; +addr1 = page_number * TARGET_PAGE_SIZE; +addr = offset + addr1; +cpu_physical_memory_set_dirty_range(addr, leul_to_cpu(bitmap[i])); } } return 0; -- 1.7.0.31.g1df487 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/6] qemu-kvm: Introduce bit-based phys_ram_dirty for VGA, CODE and MIGRATION.
Replaces byte-based phys_ram_dirty bitmap with three bit-based phys_ram_dirty bitmap. On allocation, it sets all bits in the bitmap. Signed-off-by: Yoshiaki Tamura Signed-off-by: OHMURA Kei --- exec.c | 22 +- 1 files changed, 17 insertions(+), 5 deletions(-) diff --git a/exec.c b/exec.c index 9bcb4de..ba334e7 100644 --- a/exec.c +++ b/exec.c @@ -119,7 +119,9 @@ uint8_t *code_gen_ptr; #if !defined(CONFIG_USER_ONLY) int phys_ram_fd; -uint8_t *phys_ram_dirty; +unsigned long *phys_ram_vga_dirty; +unsigned long *phys_ram_code_dirty; +unsigned long *phys_ram_migration_dirty; uint8_t *bios_mem; static int in_migration; @@ -2659,10 +2661,20 @@ ram_addr_t qemu_ram_alloc(ram_addr_t size) new_block->next = ram_blocks; ram_blocks = new_block; -phys_ram_dirty = qemu_realloc(phys_ram_dirty, -(last_ram_offset + size) >> TARGET_PAGE_BITS); -memset(phys_ram_dirty + (last_ram_offset >> TARGET_PAGE_BITS), - 0xff, size >> TARGET_PAGE_BITS); +if (BITMAP_SIZE(last_ram_offset + size) != BITMAP_SIZE(last_ram_offset)) { +phys_ram_vga_dirty = qemu_realloc(phys_ram_vga_dirty, +BITMAP_SIZE(last_ram_offset + size)); +phys_ram_code_dirty = qemu_realloc(phys_ram_code_dirty, +BITMAP_SIZE(last_ram_offset + size)); +phys_ram_migration_dirty = qemu_realloc(phys_ram_migration_dirty, +BITMAP_SIZE(last_ram_offset + size)); +memset((uint8_t *)phys_ram_vga_dirty + + BITMAP_SIZE(last_ram_offset), 0xff, BITMAP_SIZE(size)); +memset((uint8_t *)phys_ram_code_dirty + + BITMAP_SIZE(last_ram_offset), 0xff, BITMAP_SIZE(size)); +memset((uint8_t *)phys_ram_migration_dirty + + BITMAP_SIZE(last_ram_offset), 0xff, BITMAP_SIZE(size)); +} last_ram_offset += size; -- 1.7.0.31.g1df487 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/6] qemu-kvm: Use cpu_physical_memory_get_dirty_range() to check multiple dirty pages.
Modifies ram_save_block() and ram_save_remaining() to use cpu_physical_memory_get_dirty_range() to check multiple dirty and non-dirty pages at once. Signed-off-by: Yoshiaki Tamura Signed-off-by: OHMURA Kei --- vl.c | 55 +++ 1 files changed, 35 insertions(+), 20 deletions(-) diff --git a/vl.c b/vl.c index 6e35cc6..e9ad7c9 100644 --- a/vl.c +++ b/vl.c @@ -2779,7 +2779,8 @@ static int ram_save_block(QEMUFile *f) static ram_addr_t current_addr = 0; ram_addr_t saved_addr = current_addr; ram_addr_t addr = 0; -int found = 0; +ram_addr_t dirty_rams[HOST_LONG_BITS]; +int i, found = 0; while (addr < last_ram_offset) { if (kvm_enabled() && current_addr == 0) { @@ -2791,28 +2792,35 @@ static int ram_save_block(QEMUFile *f) return 0; } } -if (cpu_physical_memory_get_dirty(current_addr, MIGRATION_DIRTY_FLAG)) { +if ((found = cpu_physical_memory_get_dirty_range( + current_addr, last_ram_offset, dirty_rams, HOST_LONG_BITS, + MIGRATION_DIRTY_FLAG))) { uint8_t *p; -cpu_physical_memory_reset_dirty(current_addr, -current_addr + TARGET_PAGE_SIZE, -MIGRATION_DIRTY_FLAG); +for (i = 0; i < found; i++) { +ram_addr_t page_addr = dirty_rams[i]; +cpu_physical_memory_reset_dirty(page_addr, +page_addr + TARGET_PAGE_SIZE, +MIGRATION_DIRTY_FLAG); -p = qemu_get_ram_ptr(current_addr); +p = qemu_get_ram_ptr(page_addr); -if (is_dup_page(p, *p)) { -qemu_put_be64(f, current_addr | RAM_SAVE_FLAG_COMPRESS); -qemu_put_byte(f, *p); -} else { -qemu_put_be64(f, current_addr | RAM_SAVE_FLAG_PAGE); -qemu_put_buffer(f, p, TARGET_PAGE_SIZE); +if (is_dup_page(p, *p)) { +qemu_put_be64(f, (page_addr) | + RAM_SAVE_FLAG_COMPRESS); +qemu_put_byte(f, *p); +} else { +qemu_put_be64(f, (page_addr) | + RAM_SAVE_FLAG_PAGE); +qemu_put_buffer(f, p, TARGET_PAGE_SIZE); +} } - -found = 1; + break; +} else { +addr += dirty_rams[0]; +current_addr = (saved_addr + addr) % last_ram_offset; } -addr += TARGET_PAGE_SIZE; -current_addr = (saved_addr + addr) % last_ram_offset; } return found; @@ -2822,12 +2830,19 @@ static uint64_t 
bytes_transferred; static ram_addr_t ram_save_remaining(void) { -ram_addr_t addr; +ram_addr_t addr = 0; ram_addr_t count = 0; +ram_addr_t dirty_rams[HOST_LONG_BITS]; +int found = 0; -for (addr = 0; addr < last_ram_offset; addr += TARGET_PAGE_SIZE) { -if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) -count++; +while (addr < last_ram_offset) { +if ((found = cpu_physical_memory_get_dirty_range(addr, last_ram_offset, +dirty_rams, HOST_LONG_BITS, MIGRATION_DIRTY_FLAG))) { +count += found; +addr = dirty_rams[found - 1] + TARGET_PAGE_SIZE; +} else { +addr += dirty_rams[0]; +} } return count; -- 1.7.0.31.g1df487 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/6] qemu-kvm: Replace direct phys_ram_dirty access with wrapper functions.
Replaces direct phys_ram_dirty access with wrapper functions to prevent direct access to the phys_ram_dirty bitmap. Signed-off-by: Yoshiaki Tamura Signed-off-by: OHMURA Kei --- exec.c | 45 - 1 files changed, 20 insertions(+), 25 deletions(-) diff --git a/exec.c b/exec.c index ba334e7..b31c349 100644 --- a/exec.c +++ b/exec.c @@ -1946,7 +1946,7 @@ static void tlb_protect_code(ram_addr_t ram_addr) static void tlb_unprotect_code_phys(CPUState *env, ram_addr_t ram_addr, target_ulong vaddr) { -phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] |= CODE_DIRTY_FLAG; +cpu_physical_memory_set_dirty_flags(ram_addr, CODE_DIRTY_FLAG); } static inline void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, @@ -1967,8 +1967,7 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end, { CPUState *env; unsigned long length, start1; -int i, mask, len; -uint8_t *p; +int i; start &= TARGET_PAGE_MASK; end = TARGET_PAGE_ALIGN(end); @@ -1976,11 +1975,7 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end, length = end - start; if (length == 0) return; -len = length >> TARGET_PAGE_BITS; -mask = ~dirty_flags; -p = phys_ram_dirty + (start >> TARGET_PAGE_BITS); -for(i = 0; i < len; i++) -p[i] &= mask; +cpu_physical_memory_mask_dirty_range(start, length, dirty_flags); /* we modify the TLB cache so that the dirty bit will be set again when accessing the range */ @@ -2837,16 +2832,16 @@ static void notdirty_mem_writeb(void *opaque, target_phys_addr_t ram_addr, uint32_t val) { int dirty_flags; -dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; +dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); if (!(dirty_flags & CODE_DIRTY_FLAG)) { #if !defined(CONFIG_USER_ONLY) tb_invalidate_phys_page_fast(ram_addr, 1); -dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; +dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); #endif } stb_p(qemu_get_ram_ptr(ram_addr), val); dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); -phys_ram_dirty[ram_addr >> 
TARGET_PAGE_BITS] = dirty_flags; +cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags); /* we remove the notdirty callback only if the code has been flushed */ if (dirty_flags == 0xff) @@ -2857,16 +2852,16 @@ static void notdirty_mem_writew(void *opaque, target_phys_addr_t ram_addr, uint32_t val) { int dirty_flags; -dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; +dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); if (!(dirty_flags & CODE_DIRTY_FLAG)) { #if !defined(CONFIG_USER_ONLY) tb_invalidate_phys_page_fast(ram_addr, 2); -dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; +dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); #endif } stw_p(qemu_get_ram_ptr(ram_addr), val); dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); -phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags; +cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags); /* we remove the notdirty callback only if the code has been flushed */ if (dirty_flags == 0xff) @@ -2877,16 +2872,16 @@ static void notdirty_mem_writel(void *opaque, target_phys_addr_t ram_addr, uint32_t val) { int dirty_flags; -dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; +dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); if (!(dirty_flags & CODE_DIRTY_FLAG)) { #if !defined(CONFIG_USER_ONLY) tb_invalidate_phys_page_fast(ram_addr, 4); -dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; +dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); #endif } stl_p(qemu_get_ram_ptr(ram_addr), val); dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); -phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags; +cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags); /* we remove the notdirty callback only if the code has been flushed */ if (dirty_flags == 0xff) @@ -3337,8 +3332,8 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf, /* invalidate code */ tb_invalidate_phys_page_range(addr1, addr1 + l, 0); /* set dirty bit */ -phys_ram_dirty[addr1 
>> TARGET_PAGE_BITS] |= -(0xff & ~CODE_DIRTY_FLAG); +cpu_physical_memory_set_dirty_flags( +addr1, (0xff & ~CODE_DIRTY_FLAG)); } /* qemu doesn't execute guest code directly, but kvm does therefore flush instruction caches */ @@ -3551,8 +354
[PATCH 2/6] qemu-kvm: Modify and introduce wrapper functions to access phys_ram_dirty.
Modifies wrapper functions for byte-based phys_ram_dirty bitmap to bit-based phys_ram_dirty bitmap, and adds more wrapper functions to prevent direct access to the phys_ram_dirty bitmap. Signed-off-by: Yoshiaki Tamura Signed-off-by: OHMURA Kei --- cpu-all.h | 94 ++-- 1 files changed, 90 insertions(+), 4 deletions(-) diff --git a/cpu-all.h b/cpu-all.h index 9bc01b9..91ec3e5 100644 --- a/cpu-all.h +++ b/cpu-all.h @@ -843,7 +843,9 @@ int cpu_str_to_log_mask(const char *str); /* memory API */ extern int phys_ram_fd; -extern uint8_t *phys_ram_dirty; +extern unsigned long *phys_ram_vga_dirty; +extern unsigned long *phys_ram_code_dirty; +extern unsigned long *phys_ram_migration_dirty; extern ram_addr_t ram_size; extern ram_addr_t last_ram_offset; extern uint8_t *bios_mem; @@ -879,20 +881,104 @@ int cpu_memory_rw_debug(CPUState *env, target_ulong addr, /* read dirty bit (return 0 or 1) */ static inline int cpu_physical_memory_is_dirty(ram_addr_t addr) { -return phys_ram_dirty[addr >> TARGET_PAGE_BITS] == 0xff; +unsigned long mask; +int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS; +int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1); + +mask = 1UL << offset; +return (phys_ram_vga_dirty[index] & +phys_ram_code_dirty[index] & +phys_ram_migration_dirty[index] & mask) == mask; +} + +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr) +{ +unsigned long mask; +int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS; +int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1); +int ret = 0; + +mask = 1UL << offset; +if (phys_ram_vga_dirty[index] & mask) +ret |= VGA_DIRTY_FLAG; +if (phys_ram_code_dirty[index] & mask) +ret |= CODE_DIRTY_FLAG; +if (phys_ram_migration_dirty[index] & mask) +ret |= MIGRATION_DIRTY_FLAG; + +return ret; } static inline int cpu_physical_memory_get_dirty(ram_addr_t addr, int dirty_flags) { -return phys_ram_dirty[addr >> TARGET_PAGE_BITS] & dirty_flags; +return cpu_physical_memory_get_dirty_flags(addr) & dirty_flags; 
} static inline void cpu_physical_memory_set_dirty(ram_addr_t addr) { -phys_ram_dirty[addr >> TARGET_PAGE_BITS] = 0xff; +unsigned long mask; +int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS; +int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1); + +mask = 1UL << offset; +phys_ram_vga_dirty[index] |= mask; +phys_ram_code_dirty[index] |= mask; +phys_ram_migration_dirty[index] |= mask; +} + +static inline void cpu_physical_memory_set_dirty_range(ram_addr_t addr, + unsigned long mask) +{ +int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS; + +phys_ram_vga_dirty[index] |= mask; +phys_ram_code_dirty[index] |= mask; +phys_ram_migration_dirty[index] |= mask; } +static inline void cpu_physical_memory_set_dirty_flags(ram_addr_t addr, + int dirty_flags) +{ +unsigned long mask; +int index = (addr >> TARGET_PAGE_BITS) / HOST_LONG_BITS; +int offset = (addr >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1); + +mask = 1UL << offset; +if (dirty_flags & VGA_DIRTY_FLAG) +phys_ram_vga_dirty[index] |= mask; +if (dirty_flags & CODE_DIRTY_FLAG) +phys_ram_code_dirty[index] |= mask; +if (dirty_flags & MIGRATION_DIRTY_FLAG) +phys_ram_migration_dirty[index] |= mask; +} + +static inline void cpu_physical_memory_mask_dirty_range(ram_addr_t start, +int length, +int dirty_flags) +{ +ram_addr_t addr = start; +unsigned long mask; +int index, offset, i; + +for (i = 0; i < length; i += TARGET_PAGE_SIZE) { +index = ((addr + i) >> TARGET_PAGE_BITS) / HOST_LONG_BITS; +offset = ((addr + i) >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1); +mask = ~(1UL << offset); + +if (dirty_flags & VGA_DIRTY_FLAG) +phys_ram_vga_dirty[index] &= mask; +if (dirty_flags & CODE_DIRTY_FLAG) +phys_ram_code_dirty[index] &= mask; +if (dirty_flags & MIGRATION_DIRTY_FLAG) +phys_ram_migration_dirty[index] &= mask; + } +} + +int cpu_physical_memory_get_dirty_range(ram_addr_t start, ram_addr_t end, +ram_addr_t *dirty_rams, int length, +int dirty_flags); + void cpu_physical_memory_reset_dirty(ram_addr_t start, 
ram_addr_t end, int dirty_flags); void cpu_tlb_update_dirty(CPUState *env); -- 1.7.0.31.g1df487 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.ke
[PATCH 4/6] qemu-kvm: Introduce cpu_physical_memory_get_dirty_range().
Introduces cpu_physical_memory_get_dirty_range(). It checks the first row and puts dirty addr in the array. If the first row is empty, it skips to the first non-dirty row or the end addr, and put the length in the first entry of the array. Signed-off-by: Yoshiaki Tamura Signed-off-by: OHMURA Kei --- exec.c | 73 1 files changed, 73 insertions(+), 0 deletions(-) diff --git a/exec.c b/exec.c index b31c349..87056a6 100644 --- a/exec.c +++ b/exec.c @@ -1961,6 +1961,79 @@ static inline void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, } } +/* It checks the first row and puts dirty addrs in the array. + If the first row is empty, it skips to the first non-dirty row + or the end addr, and put the length in the first entry of the array. */ +int cpu_physical_memory_get_dirty_range(ram_addr_t start, ram_addr_t end, +ram_addr_t *dirty_rams, int length, +int dirty_flag) +{ +unsigned long phys_ram_dirty, page_number, *p; +ram_addr_t addr; +int s_idx = (start >> TARGET_PAGE_BITS) / HOST_LONG_BITS; +int e_idx = (end >> TARGET_PAGE_BITS) / HOST_LONG_BITS; +int i, j, offset; + +switch (dirty_flag) { +case VGA_DIRTY_FLAG: +p = phys_ram_vga_dirty; +break; +case CODE_DIRTY_FLAG: +p = phys_ram_code_dirty; +break; +case MIGRATION_DIRTY_FLAG: +p = phys_ram_migration_dirty; +break; +default: +abort(); +} + +/* mask bits before the start addr */ +offset = (start >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1); +phys_ram_dirty = p[s_idx] & ~((1UL << offset) - 1); + +if (s_idx == e_idx) { +/* mask bits after the end addr */ +offset = (end >> TARGET_PAGE_BITS) & (HOST_LONG_BITS - 1); +phys_ram_dirty &= (1UL << offset) - 1; +} + +if (phys_ram_dirty == 0) { +/* when the row is empty */ +ram_addr_t skip; +if (s_idx == e_idx) +skip = end; +else { +/* skip empty rows */ +while (s_idx < e_idx && p[++s_idx] == 0); +skip = (s_idx * HOST_LONG_BITS * TARGET_PAGE_SIZE); +} +dirty_rams[0] = skip - start; +i = 0; + +} else if (phys_ram_dirty == ~0UL) { +/* when the row is fully dirtied */ +addr = start; +for 
(i = 0; i < length; i++) { +dirty_rams[i] = addr; +addr += TARGET_PAGE_SIZE; +} +} else { +/* when the row is partially dirtied */ +i = 0; +do { +j = ffsl(phys_ram_dirty) - 1; +phys_ram_dirty &= ~(1UL << j); +page_number = s_idx * HOST_LONG_BITS + j; +addr = page_number * TARGET_PAGE_SIZE; +dirty_rams[i] = addr; +i++; +} while (phys_ram_dirty != 0 && i < length); +} + +return i; +} + /* Note: start and end must be within the same ram block. */ void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end, int dirty_flags) -- 1.7.0.31.g1df487 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/6] qemu-kvm: Introduce bit-based phys_ram_dirty, and bit-based dirty page checker.
The dirty and non-dirty pages are checked one by one in vl.c. When most of the memory is not dirty, checking dirty and non-dirty pages multiple pages at a time should be much faster than checking them one by one. We introduced bit-based phys_ram_dirty for VGA, CODE and MIGRATION, and cpu_physical_memory_get_dirty_range() for this purpose. This patch is based on the following discussion. http://www.mail-archive.com/kvm@vger.kernel.org/msg28733.html To validate our expectation, we evaluated the effect of this patch. We compared the runtime of the original ram_save_remaining() with that of ram_save_remaining() using the functions of this patch. Test Environment: CPU: 4x Intel Xeon Quad Core 2.66GHz Mem size: 96GB kvm version: 2.6.33 qemu-kvm version: commit 2b644fd0e737407133c88054ba498e772ce01f27 Host OS: CentOS (kernel 2.6.33) Guest OS: Debian/GNU Linux lenny (kernel 2.6.26) Guest Mem size: 512MB Conditions of experiments are as follows: Cond1: Guest OS periodically makes the 256MB continuous dirty pages. Cond2: Guest OS periodically makes the 256MB dirty pages and non-dirty pages in turn. Cond3: Guest OS reads a 1GB file, which is bigger than memory. Cond4: Guest OS writes a 1GB file, which is bigger than memory. Experimental results: Cond1: 1.9 ~ 61 times speed up Cond2: 1.9 ~ 56 times speed up Cond3: 1.9 ~ 59 times speed up Cond4: 1.7 ~ 59 times speed up -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
* Avi Kivity wrote: > On 03/16/2010 12:20 PM, Ingo Molnar wrote: > > The symbol server's client can certainly access the bits through > vmchannel. > >>>Ok, that would work i suspect. > >>> > >>>Would be nice to have the symbol server in tools/perf/ and also make it > >>>easy > >>>to add it to the initrd via a .config switch or so. > >>> > >>>That would have basically all of the advantages of being built into the > >>>kernel > >>>(availability, configurability, transparency, hackability), while having > >>>all > >>>the advantages of a user-space approach as well (flexibility, > >>>extensibility, > >>>robustness, ease of maintenance, etc.). > >>Note, I am not advocating building the vmchannel client into the host > >>kernel. [...] > >Neither am i. What i suggested was a user-space binary/executable built in > >tools/perf and put into the initrd. > > I'm confused - initrd seems to be guest-side. I was talking about the host > side. host side doesnt need much support - just some client capability in perf itself. I suspect vmchannels are sufficiently flexible and configuration-free for such purposes? (i.e. like a filesystem in essence) Ingo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM call agenda for Mar 16
On Tue, Mar 16, 2010 at 12:38:02PM +0200, Avi Kivity wrote: > On 03/16/2010 12:31 PM, Daniel P. Berrange wrote: > >>Polling loops are an indication that something is wrong. > >> > >Except when people suggest they are the right answer, qcow high > >watermark ;-P > > > > I liked Anthony's suggestion of an lvm2 block format driver. No polling. Doesn't that require giving QEMU privileges to perform LVM operations which implies QEMU having CAP_SYS_ADMIN ? Daniel -- |: Red Hat, Engineering, London-o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org-o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :| -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC/T/D] Unmapped page cache control - via boot parameter
On Tue, Mar 16, 2010 at 12:36:31PM +0200, Avi Kivity wrote: > Are you talking about direct volume access or qcow2? Doesn't matter. > For direct volume access, I still don't get it. The number of barriers > issued by the host must equal (or exceed, but that's pointless) the > number of barriers issued by the guest. cache=writeback allows the host > to reorder writes, but so does cache=none. Where does the difference > come from? > > Put it another way. In an unvirtualized environment, if you implement a > write cache in a storage driver (not device), and sync it on a barrier > request, would you expect to see a performance improvement? cache=none only allows very limited reordering in the host. O_DIRECT is synchronous on the host, so there's just some very limited reordering going on in the elevator if we have other I/O going on in parallel. In addition to that the disk writecache can perform limited reordering and caching, but the disk cache has a rather limited size. The host pagecache gives a much wider opportunity to reorder, especially if the guest workload is not cache flush heavy. If the guest workload is extremely cache flush heavy the usefulness of the pagecache is rather limited, as we'll only use very little of it, but pay by having to do a data copy. If the workload is not cache flush heavy, and we have multiple guests doing I/O to the same spindles it will allow the host to do much more efficient data writeout by being able to do better ordered (less seeky) and bigger I/O (especially if the host has real storage compared to ide for the guest). > >If you don't have a lot of cache flushes, e.g. due to dumb > >applications that do not issue fsync, or even run ext3 in its default > >mode, which never issues cache flushes, the benefit will be enormous, but the > >data loss and possible corruption will be enormous. > > > > Shouldn't the host never issue cache flushes in this case? (for direct > volume access; qcow2 still needs flushes for metadata integrity).
If the guest never issues a flush the host will neither, indeed. Data will only go to disk by background writeout or memory pressure. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM call agenda for Mar 16
On Tue, Mar 16, 2010 at 12:38:02PM +0200, Avi Kivity wrote: > On 03/16/2010 12:31 PM, Daniel P. Berrange wrote: >>> Polling loops are an indication that something is wrong. >>> >> Except when people suggest they are the right answer, qcow high >> watermark ;-P >> > > I liked Anthony's suggestion of an lvm2 block format driver. No polling. I have done some work on linking the new lvm library to qemu to control snapshotting. But introducing a whole new block format seems like a lot of duplication to me. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
On 03/16/2010 12:20 PM, Ingo Molnar wrote: The symbol server's client can certainly access the bits through vmchannel. Ok, that would work i suspect. Would be nice to have the symbol server in tools/perf/ and also make it easy to add it to the initrd via a .config switch or so. That would have basically all of the advantages of being built into the kernel (availability, configurability, transparency, hackability), while having all the advantages of a user-space approach as well (flexibility, extensibility, robustness, ease of maintenance, etc.). Note, I am not advocating building the vmchannel client into the host kernel. [...] Neither am i. What i suggested was a user-space binary/executable built in tools/perf and put into the initrd. I'm confused - initrd seems to be guest-side. I was talking about the host side. For the guest, placing the symbol server in tools/ is reasonable. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM call agenda for Mar 16
On 03/16/2010 12:31 PM, Daniel P. Berrange wrote: Polling loops are an indication that something is wrong. Except when people suggest they are the right answer, qcow high watermark ;-P I liked Anthony's suggestion of an lvm2 block format driver. No polling. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC/T/D] Unmapped page cache control - via boot parameter
On 03/16/2010 12:26 PM, Christoph Hellwig wrote: Avi, cache=writeback can be faster than cache=none for the same reasons a disk cache speeds up access. As long as the I/O mix contains more asynchronous then synchronous writes it allows the host to do much more reordering, only limited by the cache size (which can be quite huge when using the host pagecache) and the amount of cache flushes coming from the host. If you have a fsync heavy workload or metadata operation with a filesystem like the current XFS you will get lots of cache flushes that make the use of the additional cache limits. Are you talking about direct volume access or qcow2? For direct volume access, I still don't get it. The number of barriers issues by the host must equal (or exceed, but that's pointless) the number of barriers issued by the guest. cache=writeback allows the host to reorder writes, but so does cache=none. Where does the difference come from? Put it another way. In an unvirtualized environment, if you implement a write cache in a storage driver (not device), and sync it on a barrier request, would you expect to see a performance improvement? If you don't have a of lot of cache flushes, e.g. due to dumb applications that do not issue fsync, or even run ext3 in it's default mode never issues cache flushes the benefit will be enormous, but the data loss and possible corruption will be enormous. Shouldn't the host never issue cache flushes in this case? (for direct volume access; qcow2 still needs flushes for metadata integrity). But even for something like btrfs that does provide data integrity but issues cache flushes fairly effeciently data=writeback may provide a quite nice speedup, especially if using multiple guest accessing the same spindle(s). But I wouldn't be surprised if IBM's exteme differences are indeed due to the extremly unsafe ext3 default behaviour. 
-- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM call agenda for Mar 16
On Tue, Mar 16, 2010 at 11:43:48AM +0200, Avi Kivity wrote: > On 03/16/2010 11:29 AM, Daniel P. Berrange wrote: > >On Tue, Mar 16, 2010 at 10:18:03AM +0100, Juan Quintela wrote: > > > >>Chris Wright wrote: > >> > >>>Please send in any agenda items you are interested in covering. > >>> > >>Migration: > >>- flexible migration: I hope to sent an RFC patch on time for the > >> call. idea is to use subsections. > >> > >>- callbacks. block migration introduced several callbacks: > >> * cancel() > >> * get_status() > >> * release() > >> in spice we need now another to callbacks: on_start() and on_end(). > >>* on_start(): tells spice that migration has started (it will then > >> manage certificates, passwords, ... itself) > >>* on_end(): it is called when migration ends. spice use it to > >> transparently connect to the new host and user don't have to > >> "reconnect" > >> > >>- what to do on migration error: > >> - target side: libvirt folks want the program to print a message if > >> it fails. Current code spent 100% cpu time doing select on a closed > >> fd. (patches already on the list to make it wait without using > >> cpu). > >> > >No, that is not correct. We want QEMU to exit when incoming migration > >fails. Printing to stderr is just something that will end up in the > >logs for admin to further diagnose the problem if required. There is > >nothing to be gained by leaving QEMU running, and everything to loose > >since the failed migration may have left it in a dangerous state from > >which you do not want to attempt incoming migration again. > > > >If we really want to leave it running when migration fails, then we're > >going to have to add yet another QMP event to inform libvirt when > >migration has finished/failed, and/or make 'query_migrate' work on > >the destination too. > > > > A qmp event seems the logical thing to do? Exiting can happen for many > reasons, a qmp event is unambiguous. Yes, for the QEMU upstream adding an event is more flexible. 
I had originally suggested exiting in the context of the Fedora bug report which was for QEMU 0.10.x which has no events capability. > > > >Incidentally I have a feeling we might need to introduce a migration > >event in QMP. Currently libvirt polls on the 'query_migrate' command > >to get the ongoing migration status. This means there can be a delay > >in detecting completion as long as the polling interval - for this > >reason we just dropped libvirt's polling time from 1/2 sec to 50ms > >to ensure prompt detection. > > > > Whenever you implement a polling loop, can you send an event to qemu-de...@? Yep, sure thing. This is the only polling loop that isn't related to I/O stats collection. > > Polling loops are an indication that something is wrong. Except when people suggest they are the right answer, qcow high watermark ;-P Regards, Daniel -- |: Red Hat, Engineering, London-o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org-o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :| -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC/T/D] Unmapped page cache control - via boot parameter
Avi, cache=writeback can be faster than cache=none for the same reasons a disk cache speeds up access. As long as the I/O mix contains more asynchronous than synchronous writes it allows the host to do much more reordering, only limited by the cache size (which can be quite huge when using the host pagecache) and the amount of cache flushes coming from the host. If you have an fsync heavy workload or metadata operation with a filesystem like the current XFS you will get lots of cache flushes that limit the use of the additional cache. If you don't have a lot of cache flushes, e.g. due to dumb applications that do not issue fsync, or even run ext3 in its default mode, which never issues cache flushes, the benefit will be enormous, but the data loss and possible corruption will be enormous. But even for something like btrfs that does provide data integrity but issues cache flushes fairly efficiently data=writeback may provide a quite nice speedup, especially if using multiple guests accessing the same spindle(s). But I wouldn't be surprised if IBM's extreme differences are indeed due to the extremely unsafe ext3 default behaviour. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
* Avi Kivity wrote: > On 03/16/2010 11:53 AM, Ingo Molnar wrote: > >* Avi Kivity wrote: > > > >>On 03/16/2010 09:24 AM, Ingo Molnar wrote: > >>>* Avi Kivity wrote: > >>> > On 03/16/2010 07:27 AM, Zhang, Yanmin wrote: > >From: Zhang, Yanmin > > > >Based on the discussion in KVM community, I worked out the patch to > >support > >perf to collect guest os statistics from host side. This patch is > >implemented > >with Ingo, Peter and some other guys' kind help. Yang Sheng pointed out a > >critical bug and provided good suggestions with other guys. I really > >appreciate > >their kind help. > > > >The patch adds new subcommand kvm to perf. > > > > perf kvm top > > perf kvm record > > perf kvm report > > perf kvm diff > > > >The new perf could profile guest os kernel except guest os user space, > >but it > >could summarize guest os user space utilization per guest os. > > > >Below are some examples. > >1) perf kvm top > >[r...@lkp-ne01 norm]# perf kvm --host --guest > >--guestkallsyms=/home/ymzhang/guest/kallsyms > >--guestmodules=/home/ymzhang/guest/modules top > > > Excellent, support for guest kernel != host kernel is critical (I > can't remember the last time I ran same kernels). > > How would we support multiple guests with different kernels? Perhaps a > symbol server that perf can connect to (and that would connect to guests > in > turn)? > >>>The highest quality solution would be if KVM offered a 'guest extension' to > >>>the guest kernel's /proc/kallsyms that made it easy for user-space to get > >>>this > >>>information from an authorative source. > >>> > >>>That's the main reason why the host side /proc/kallsyms is so popular and > >>>so > >>>useful: while in theory it's mostly redundant information which can be > >>>gleaned > >>>from the System.map and other sources of symbol information, it's easily > >>>available and is _always_ trustable to come from the host kernel. 
> >>> > >>>Separate System.map's have a tendency to go out of sync (or go missing > >>>when a > >>>devel kernel gets rebuilt, or if a devel package is not installed), and > >>>server > >>>ports (be that a TCP port space server or an UDP port space mount-point) > >>>are > >>>both a configuration hassle and are not guest-transparent. > >>> > >>>So for instrumentation infrastructure (such as perf) we have a large and > >>>well > >>>founded preference for intrinsic, built-in, kernel-provided information: > >>>i.e. > >>>a largely 'built-in' and transparent mechanism to get to guest symbols. > >>The symbol server's client can certainly access the bits through vmchannel. > >Ok, that would work i suspect. > > > >Would be nice to have the symbol server in tools/perf/ and also make it easy > >to add it to the initrd via a .config switch or so. > > > >That would have basically all of the advantages of being built into the > >kernel > >(availability, configurability, transparency, hackability), while having all > >the advantages of a user-space approach as well (flexibility, extensibility, > >robustness, ease of maintenance, etc.). > > Note, I am not advocating building the vmchannel client into the host > kernel. [...] Neither am i. What i suggested was a user-space binary/executable built in tools/perf and put into the initrd. That approach has the advantages i listed above, without having the disadvantages of in-kernel code you listed. Thanks, Ingo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC/T/D] Unmapped page cache control - via boot parameter
On 03/16/2010 11:54 AM, Kevin Wolf wrote: Is this with qcow2, raw file, or direct volume access? I can understand it for qcow2, but for direct volume access this shouldn't happen. The guest schedules as many writes as it can, followed by a sync. The host (and disk) can then reschedule them whether they are in the writeback cache or in the block layer, and must sync in the same way once completed. Perhaps what we need is bdrv_aio_submit() which can take a number of requests. For direct volume access, this allows easier reordering (io_submit() should plug the queues before it starts processing and unplug them when done, though I don't see the code for this?). For qcow2, we can coalesce metadata updates for multiple requests into one RMW (for example, a sequential write split into multiple 64K-256K write requests). We already do merge sequential writes back into one larger request. So this is in fact a case that wouldn't benefit from such changes. I'm not happy with that. It increases overall latency. With qcow2 it's fine, but I'd let requests to raw volumes flow unaltered. It may help for other cases. But even if it did, coalescing metadata writes in qcow2 sounds like a good way to mess up, so I'd stay with doing it only for the data itself. I don't see why. Apart from that, wouldn't your points apply to writeback as well? They do, but for writeback the host kernel already does all the coalescing/merging/blah for us. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
On 03/16/2010 11:53 AM, Ingo Molnar wrote: * Avi Kivity wrote: On 03/16/2010 09:24 AM, Ingo Molnar wrote: * Avi Kivity wrote: On 03/16/2010 07:27 AM, Zhang, Yanmin wrote: From: Zhang, Yanmin Based on the discussion in KVM community, I worked out the patch to support perf to collect guest os statistics from host side. This patch is implemented with Ingo, Peter and some other guys' kind help. Yang Sheng pointed out a critical bug and provided good suggestions with other guys. I really appreciate their kind help. The patch adds new subcommand kvm to perf. perf kvm top perf kvm record perf kvm report perf kvm diff The new perf could profile guest os kernel except guest os user space, but it could summarize guest os user space utilization per guest os. Below are some examples. 1) perf kvm top [r...@lkp-ne01 norm]# perf kvm --host --guest --guestkallsyms=/home/ymzhang/guest/kallsyms --guestmodules=/home/ymzhang/guest/modules top Excellent, support for guest kernel != host kernel is critical (I can't remember the last time I ran same kernels). How would we support multiple guests with different kernels? Perhaps a symbol server that perf can connect to (and that would connect to guests in turn)? The highest quality solution would be if KVM offered a 'guest extension' to the guest kernel's /proc/kallsyms that made it easy for user-space to get this information from an authorative source. That's the main reason why the host side /proc/kallsyms is so popular and so useful: while in theory it's mostly redundant information which can be gleaned >from the System.map and other sources of symbol information, it's easily available and is _always_ trustable to come from the host kernel. Separate System.map's have a tendency to go out of sync (or go missing when a devel kernel gets rebuilt, or if a devel package is not installed), and server ports (be that a TCP port space server or an UDP port space mount-point) are both a configuration hassle and are not guest-transparent. 
So for instrumentation infrastructure (such as perf) we have a large and well founded preference for intrinsic, built-in, kernel-provided information: i.e. a largely 'built-in' and transparent mechanism to get to guest symbols. The symbol server's client can certainly access the bits through vmchannel. Ok, that would work i suspect. Would be nice to have the symbol server in tools/perf/ and also make it easy to add it to the initrd via a .config switch or so. That would have basically all of the advantages of being built into the kernel (availability, configurability, transparency, hackability), while having all the advantages of a user-space approach as well (flexibility, extensibility, robustness, ease of maintenance, etc.). Note, I am not advocating building the vmchannel client into the host kernel. While that makes everything simpler for the user, it increases the kernel footprint with all the disadvantages that come with that (any bug is converted into a host DoS or worse). So, perf would connect to qemu via (say) a well-known unix domain socket, which would then talk to the guest kernel. I know you won't like it, we'll continue to disagree on this unfortunately. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM call agenda for Mar 16
On Tue, Mar 16, 2010 at 09:29:44AM +, Daniel P. Berrange wrote: > On Tue, Mar 16, 2010 at 10:18:03AM +0100, Juan Quintela wrote: > > Chris Wright wrote: > > > Please send in any agenda items you are interested in covering. > > > > Migration: > > - flexible migration: I hope to sent an RFC patch on time for the > > call. idea is to use subsections. > > > > - callbacks. block migration introduced several callbacks: > > * cancel() > > * get_status() > > * release() > > in spice we need now another to callbacks: on_start() and on_end(). > >* on_start(): tells spice that migration has started (it will then > > manage certificates, passwords, ... itself) > >* on_end(): it is called when migration ends. spice use it to > > transparently connect to the new host and user don't have to > > "reconnect" > > > > - what to do on migration error: > > - target side: libvirt folks want the program to print a message if > > it fails. Current code spent 100% cpu time doing select on a closed > > fd. (patches already on the list to make it wait without using > > cpu). > > No, that is not correct. We want QEMU to exit when incoming migration > fails. Printing to stderr is just something that will end up in the > logs for admin to further diagnose the problem if required. There is > nothing to be gained by leaving QEMU running, and everything to loose > since the failed migration may have left it in a dangerous state from > which you do not want to attempt incoming migration again. Sorry, I forgot to include the original BZ report about this problem from Fedora. In essence, we just truncated the original save state image and then tried to restore from it to check handling in the event of corrupted save image. 
https://bugzilla.redhat.com/show_bug.cgi?id=518032 Regards, Daniel -- |: Red Hat, Engineering, London-o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org-o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :| -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RF C/T/D] Unmapped page cache control - via boot parameter
Am 16.03.2010 10:17, schrieb Avi Kivity: > On 03/15/2010 10:23 PM, Chris Webb wrote: >> Avi Kivity writes: >> >> >>> On 03/15/2010 10:07 AM, Balbir Singh wrote: >>> >>> Yes, it is a virtio call away, but is the cost of paying twice in terms of memory acceptable? >>> Usually, it isn't, which is why I recommend cache=off. >>> >> Hi Avi. One observation about your recommendation for cache=none: >> >> We run hosts of VMs accessing drives backed by logical volumes carved out >> from md RAID1. Each host has 32GB RAM and eight cores, divided between (say) >> twenty virtual machines, which pretty much fill the available memory on the >> host. Our qemu-kvm is new enough that IDE and SCSI drives with writeback >> caching turned on get advertised to the guest as having a write-cache, and >> FLUSH gets translated to fsync() by qemu. (Consequently cache=writeback >> isn't acting as cache=neverflush like it would have done a year ago. I know >> that comparing performance for cache=none against that unsafe behaviour >> would be somewhat unfair!) >> >> Wasteful duplication of page cache between guest and host notwithstanding, >> turning on cache=writeback is a spectacular performance win for our guests. >> For example, even IDE with cache=writeback easily beats virtio with >> cache=none in most of the guest filesystem performance tests I've tried. The >> anecdotal feedback from clients is also very strongly in favour of >> cache=writeback. >> > > Is this with qcow2, raw file, or direct volume access? > > I can understand it for qcow2, but for direct volume access this > shouldn't happen. The guest schedules as many writes as it can, > followed by a sync. The host (and disk) can then reschedule them > whether they are in the writeback cache or in the block layer, and must > sync in the same way once completed. > > Perhaps what we need is bdrv_aio_submit() which can take a number of > requests. 
For direct volume access, this allows easier reordering > (io_submit() should plug the queues before it starts processing and > unplug them when done, though I don't see the code for this?). For > qcow2, we can coalesce metadata updates for multiple requests into one > RMW (for example, a sequential write split into multiple 64K-256K write > requests). We already do merge sequential writes back into one larger request. So this is in fact a case that wouldn't benefit from such changes. It may help for other cases. But even if it did, coalescing metadata writes in qcow2 sounds like a good way to mess up, so I'd stay with doing it only for the data itself. Apart from that, wouldn't your points apply to writeback as well? Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
* Avi Kivity wrote: > On 03/16/2010 09:24 AM, Ingo Molnar wrote: > >* Avi Kivity wrote: > > > >>On 03/16/2010 07:27 AM, Zhang, Yanmin wrote: > >>>From: Zhang, Yanmin > >>> > >>>Based on the discussion in KVM community, I worked out the patch to support > >>>perf to collect guest os statistics from host side. This patch is > >>>implemented > >>>with Ingo, Peter and some other guys' kind help. Yang Sheng pointed out a > >>>critical bug and provided good suggestions with other guys. I really > >>>appreciate > >>>their kind help. > >>> > >>>The patch adds new subcommand kvm to perf. > >>> > >>> perf kvm top > >>> perf kvm record > >>> perf kvm report > >>> perf kvm diff > >>> > >>>The new perf could profile guest os kernel except guest os user space, but > >>>it > >>>could summarize guest os user space utilization per guest os. > >>> > >>>Below are some examples. > >>>1) perf kvm top > >>>[r...@lkp-ne01 norm]# perf kvm --host --guest > >>>--guestkallsyms=/home/ymzhang/guest/kallsyms > >>>--guestmodules=/home/ymzhang/guest/modules top > >>> > >>Excellent, support for guest kernel != host kernel is critical (I > >>can't remember the last time I ran same kernels). > >> > >>How would we support multiple guests with different kernels? Perhaps a > >>symbol server that perf can connect to (and that would connect to guests in > >>turn)? > >The highest quality solution would be if KVM offered a 'guest extension' to > >the guest kernel's /proc/kallsyms that made it easy for user-space to get > >this > >information from an authorative source. > > > >That's the main reason why the host side /proc/kallsyms is so popular and so > >useful: while in theory it's mostly redundant information which can be > >gleaned > >from the System.map and other sources of symbol information, it's easily > >available and is _always_ trustable to come from the host kernel. 
> > > >Separate System.map's have a tendency to go out of sync (or go missing when a > >devel kernel gets rebuilt, or if a devel package is not installed), and > >server > >ports (be that a TCP port space server or an UDP port space mount-point) are > >both a configuration hassle and are not guest-transparent. > > > >So for instrumentation infrastructure (such as perf) we have a large and well > >founded preference for intrinsic, built-in, kernel-provided information: i.e. > >a largely 'built-in' and transparent mechanism to get to guest symbols. > > The symbol server's client can certainly access the bits through vmchannel. Ok, that would work i suspect. Would be nice to have the symbol server in tools/perf/ and also make it easy to add it to the initrd via a .config switch or so. That would have basically all of the advantages of being built into the kernel (availability, configurability, transparency, hackability), while having all the advantages of a user-space approach as well (flexibility, extensibility, robustness, ease of maintenance, etc.). If only we had tools/xorg/ integrated via the initrd that way ;-) Thanks, Ingo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enhance perf to collect KVM guest os statistics from host side
* Zhang, Yanmin wrote: > On Tue, 2010-03-16 at 15:48 +0800, Zhang, Yanmin wrote: > > On Tue, 2010-03-16 at 07:41 +0200, Avi Kivity wrote: > > > On 03/16/2010 07:27 AM, Zhang, Yanmin wrote: > > > > From: Zhang, Yanmin > > > > > > > > Based on the discussion in KVM community, I worked out the patch to > > > > support > > > > perf to collect guest os statistics from host side. This patch is > > > > implemented > > > > with Ingo, Peter and some other guys' kind help. Yang Sheng pointed out > > > > a > > > > critical bug and provided good suggestions with other guys. I really > > > > appreciate > > > > their kind help. > > > > > > > > The patch adds new subcommand kvm to perf. > > > > > > > >perf kvm top > > > >perf kvm record > > > >perf kvm report > > > >perf kvm diff > > > > > > > > The new perf could profile guest os kernel except guest os user space, > > > > but it > > > > could summarize guest os user space utilization per guest os. > > > > > > > > Below are some examples. > > > > 1) perf kvm top > > > > [r...@lkp-ne01 norm]# perf kvm --host --guest > > > > --guestkallsyms=/home/ymzhang/guest/kallsyms > > > > --guestmodules=/home/ymzhang/guest/modules top > > > > > > > > > > > > > Thanks for your kind comments. > > > > > Excellent, support for guest kernel != host kernel is critical (I can't > > > remember the last time I ran same kernels). > > > > > > How would we support multiple guests with different kernels? > > With the patch, 'perf kvm report --sort pid" could show > > summary statistics for all guest os instances. Then, use > > parameter --pid of 'perf kvm record' to collect single problematic instance > > data. > Sorry. I found currently --pid isn't process but a thread (main thread). > > Ingo, > > Is it possible to support a new parameter or extend --inherit, so 'perf > record' and 'perf top' could collect data on all threads of a process when > the process is running? 
> > If not, I need to add a new ugly parameter which is similar to --pid to filter > out process data in userspace. Yeah. For maximum utility I'd suggest to extend --pid to include this, and introduce --tid for the previous, limited-to-a-single-task functionality. Most users would expect --pid to work like a 'late attach' - i.e. to work like strace -f or like a gdb attach. Ingo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html