[PATCH v3 2/6] kvm: Use APIC_DEFAULT_PHYS_BASE macro as the apic access page address.
We have APIC_DEFAULT_PHYS_BASE defined as 0xfee00000, which is also the address of apic access page. So use this macro. Signed-off-by: Tang Chen tangc...@cn.fujitsu.com --- arch/x86/kvm/svm.c | 3 ++- arch/x86/kvm/vmx.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ec8366c..576b525 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1257,7 +1257,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->asid_generation = 0; init_vmcb(svm); - svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 801332e..0e1117c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3982,13 +3982,13 @@ static int alloc_apic_access_page(struct kvm *kvm) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; + kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); if (r) goto out; - page = gfn_to_page(kvm, 0xfee00); + page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (is_error_page(page)) { r = -EFAULT; goto out; @@ -4460,7 +4460,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); - apic_base_msr.data = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&vmx->vcpu)) apic_base_msr.data |= MSR_IA32_APICBASE_BSP; apic_base_msr.host_initiated = true; -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html
[PATCH v3 4/6] kvm: Make init_rmode_identity_map() return 0 on success.
In init_rmode_identity_map(), there two variables indicating the return value, r and ret, and it return 0 on error, 1 on success. The function is only called by vmx_create_vcpu(), and r is redundant. This patch removes the redundant variable r, and make init_rmode_identity_map() return 0 on success, -errno on failure. Signed-off-by: Tang Chen tangc...@cn.fujitsu.com --- arch/x86/kvm/vmx.c | 25 +++-- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b8bf47d..6ab4f87 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3922,45 +3922,42 @@ out: static int init_rmode_identity_map(struct kvm *kvm) { - int i, idx, r, ret = 0; + int i, idx, ret = 0; pfn_t identity_map_pfn; u32 tmp; if (!enable_ept) - return 1; + return 0; /* Protect kvm-arch.ept_identity_pagetable_done. */ mutex_lock(kvm-slots_lock); - if (likely(kvm-arch.ept_identity_pagetable_done)) { - ret = 1; + if (likely(kvm-arch.ept_identity_pagetable_done)) goto out2; - } identity_map_pfn = kvm-arch.ept_identity_map_addr PAGE_SHIFT; - r = alloc_identity_pagetable(kvm); - if (r) + ret = alloc_identity_pagetable(kvm); + if (ret) goto out2; idx = srcu_read_lock(kvm-srcu); - r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); - if (r 0) + ret = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); + if (ret) goto out; /* Set up identity-mapping pagetable for EPT in real mode */ for (i = 0; i PT32_ENT_PER_PAGE; i++) { tmp = (i 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); - r = kvm_write_guest_page(kvm, identity_map_pfn, + ret = kvm_write_guest_page(kvm, identity_map_pfn, tmp, i * sizeof(tmp), sizeof(tmp)); - if (r 0) + if (ret) goto out; } kvm-arch.ept_identity_pagetable_done = true; - ret = 1; + out: srcu_read_unlock(kvm-srcu, idx); - out2: mutex_unlock(kvm-slots_lock); return ret; @@ -7584,7 +7581,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 
kvm-arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; err = -ENOMEM; - if (!init_rmode_identity_map(kvm)) + if (init_rmode_identity_map(kvm)) goto free_vmcs; } -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 3/6] kvm: Remove ept_identity_pagetable from struct kvm_arch.
kvm_arch-ept_identity_pagetable holds the ept identity pagetable page. But it is never used to refer to the page at all. In vcpu initialization, it indicates two things: 1. indicates if ept page is allocated 2. indicates if a memory slot for identity page is initialized Actually, kvm_arch-ept_identity_pagetable_done is enough to tell if the ept identity pagetable is initialized. So we can remove ept_identity_pagetable. NOTE: In the original code, ept identity pagetable page is pinned in memroy. As a result, it cannot be migrated/hot-removed. After this patch, since kvm_arch-ept_identity_pagetable is removed, ept identity pagetable page is no longer pinned in memory. And it can be migrated/hot-removed. Signed-off-by: Tang Chen tangc...@cn.fujitsu.com --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/vmx.c | 50 - arch/x86/kvm/x86.c | 2 -- 3 files changed, 25 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4931415..62f973e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -578,7 +578,6 @@ struct kvm_arch { gpa_t wall_clock; - struct page *ept_identity_pagetable; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0e1117c..b8bf47d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -741,6 +741,7 @@ static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); static bool vmx_mpx_supported(void); +static int alloc_identity_pagetable(struct kvm *kvm); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -3921,21 +3922,27 @@ out: static int init_rmode_identity_map(struct kvm *kvm) { - int i, idx, r, ret; + int i, idx, r, ret = 0; pfn_t identity_map_pfn; u32 tmp; if (!enable_ept) return 1; - if 
(unlikely(!kvm-arch.ept_identity_pagetable)) { - printk(KERN_ERR EPT: identity-mapping pagetable - haven't been allocated!\n); - return 0; + + /* Protect kvm-arch.ept_identity_pagetable_done. */ + mutex_lock(kvm-slots_lock); + + if (likely(kvm-arch.ept_identity_pagetable_done)) { + ret = 1; + goto out2; } - if (likely(kvm-arch.ept_identity_pagetable_done)) - return 1; - ret = 0; + identity_map_pfn = kvm-arch.ept_identity_map_addr PAGE_SHIFT; + + r = alloc_identity_pagetable(kvm); + if (r) + goto out2; + idx = srcu_read_lock(kvm-srcu); r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r 0) @@ -3953,6 +3960,9 @@ static int init_rmode_identity_map(struct kvm *kvm) ret = 1; out: srcu_read_unlock(kvm-srcu, idx); + +out2: + mutex_unlock(kvm-slots_lock); return ret; } @@ -4002,31 +4012,23 @@ out: static int alloc_identity_pagetable(struct kvm *kvm) { - struct page *page; + /* +* In init_rmode_identity_map(), kvm-arch.ept_identity_pagetable_done +* is checked before calling this function and set to true after the +* calling. The access to kvm-arch.ept_identity_pagetable_done should +* be protected by kvm-slots_lock. 
+*/ + struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - mutex_lock(kvm-slots_lock); - if (kvm-arch.ept_identity_pagetable) - goto out; kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; kvm_userspace_mem.guest_phys_addr = kvm-arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, kvm_userspace_mem); - if (r) - goto out; - page = gfn_to_page(kvm, kvm-arch.ept_identity_map_addr PAGE_SHIFT); - if (is_error_page(page)) { - r = -EFAULT; - goto out; - } - - kvm-arch.ept_identity_pagetable = page; -out: - mutex_unlock(kvm-slots_lock); return r; } @@ -7582,8 +7584,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) kvm-arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; err = -ENOMEM; - if (alloc_identity_pagetable(kvm) != 0) - goto free_vmcs; if (!init_rmode_identity_map(kvm)) goto free_vmcs; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f32a025..ffbe557 100644 --- a/arch/x86/kvm/x86.c +++
[PATCH v3 0/6] kvm, mem-hotplug: Do not pin ept identity pagetable and apic access page.
ept identity pagetable and apic access page in kvm are pinned in memory. As a result, they cannot be migrated/hot-removed. But actually they don't need to be pinned in memory. [For ept identity page] Just do not pin it. When it is migrated, guest will be able to find the new page in the next ept violation. [For apic access page] The hpa of apic access page is stored in VMCS APIC_ACCESS_ADDR pointer. When apic access page is migrated, we update VMCS APIC_ACCESS_ADDR pointer for each vcpu in addition. NOTE: Patch 1~5 are tested with -cpu xxx,-x2apic option, and they work well. Patch 6 is not tested yet, not sure if it is right. Change log v2 - v3: 1. Remove original [PATCH 3/6] since ept_identity_pagetable has been removed in new [PATCH 3/6]. 2. In [PATCH 3/6], fix the problem that kvm-slots_lock does not protect kvm-arch.ept_identity_pagetable_done checking. 3. In [PATCH 3/6], drop gfn_to_page() since ept_identity_pagetable has been removed. 4. Add new [PATCH 4/6], remove redundant variable in init_rmode_identity_map(), and make it return 0 on success. 5. In [PATCH 5/6], drop put_page(kvm-arch.apic_access_page) from x86.c . 6. In [PATCH 5/6], update kvm-arch.apic_access_page in vcpu_reload_apic_access_page(). 7. Add new [PATCH 6/6], reload apic access page in L2-L1 exit. Change log v1 - v2: 1. Add [PATCH 4/5] to remove unnecessary kvm_arch-ept_identity_pagetable. 2. In [PATCH 5/5], only introduce KVM_REQ_APIC_PAGE_RELOAD request. 3. In [PATCH 5/5], add set_apic_access_page_addr() for svm. Tang Chen (6): kvm: Add gfn_to_page_no_pin() to translate gfn to page without pinning. kvm: Use APIC_DEFAULT_PHYS_BASE macro as the apic access page address. kvm: Remove ept_identity_pagetable from struct kvm_arch. kvm: Make init_rmode_identity_map() return 0 on success. kvm, mem-hotplug: Do not pin apic access page in memory. kvm, mem-hotplug: Reload L1's apic access page if it is migrated when L2 is running. 
arch/x86/include/asm/kvm_host.h | 3 +- arch/x86/kvm/svm.c | 15 +- arch/x86/kvm/vmx.c | 108 +++- arch/x86/kvm/x86.c | 22 ++-- include/linux/kvm_host.h| 3 ++ virt/kvm/kvm_main.c | 29 ++- 6 files changed, 139 insertions(+), 41 deletions(-) -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 5/6] kvm, mem-hotplug: Do not pin apic access page in memory.
apic access page is pinned in memory. As a result, it cannot be migrated/hot-removed. Actually, it is not necessary to be pinned. The hpa of apic access page is stored in VMCS APIC_ACCESS_ADDR pointer. When the page is migrated, kvm_mmu_notifier_invalidate_page() will invalidate the corresponding ept entry. This patch introduces a new vcpu request named KVM_REQ_APIC_PAGE_RELOAD, and makes this request to all the vcpus at this time, and force all the vcpus exit guest, and re-enter guest till they updates the VMCS APIC_ACCESS_ADDR pointer to the new apic access page address, and updates kvm-arch.apic_access_page to the new page. Signed-off-by: Tang Chen tangc...@cn.fujitsu.com --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 6 ++ arch/x86/kvm/vmx.c | 8 +++- arch/x86/kvm/x86.c | 17 +++-- include/linux/kvm_host.h| 2 ++ virt/kvm/kvm_main.c | 12 6 files changed, 43 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 62f973e..9ce6bfd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -737,6 +737,7 @@ struct kvm_x86_ops { void (*hwapic_isr_update)(struct kvm *kvm, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); + void (*set_apic_access_page_addr)(struct kvm *kvm, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 576b525..dc76f29 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3612,6 +3612,11 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } +static void svm_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa) +{ + return; +} + static int svm_vm_has_apicv(struct kvm *kvm) { return 0; @@ -4365,6 +4370,7 @@ static struct 
kvm_x86_ops svm_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, + .set_apic_access_page_addr = svm_set_apic_access_page_addr, .vm_has_apicv = svm_vm_has_apicv, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_isr_update = svm_hwapic_isr_update, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6ab4f87..c123c1d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3995,7 +3995,7 @@ static int alloc_apic_access_page(struct kvm *kvm) if (r) goto out; - page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE PAGE_SHIFT); + page = gfn_to_page_no_pin(kvm, APIC_DEFAULT_PHYS_BASE PAGE_SHIFT); if (is_error_page(page)) { r = -EFAULT; goto out; @@ -7072,6 +7072,11 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) vmx_set_msr_bitmap(vcpu); } +static void vmx_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa) +{ + vmcs_write64(APIC_ACCESS_ADDR, hpa); +} + static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) { u16 status; @@ -8841,6 +8846,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, + .set_apic_access_page_addr = vmx_set_apic_access_page_addr, .vm_has_apicv = vmx_vm_has_apicv, .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ffbe557..7541a66 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5929,6 +5929,19 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) kvm_apic_update_tmr(vcpu, tmr); } +static void vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) +{ + /* +* apic access page could be migrated. When the page is being migrated, +* GUP will wait till the migrate entry is replaced with the new pte +* entry pointing to the new page. 
+*/ + vcpu-kvm-arch.apic_access_page = gfn_to_page_no_pin(vcpu-kvm, + APIC_DEFAULT_PHYS_BASE PAGE_SHIFT); + kvm_x86_ops-set_apic_access_page_addr(vcpu-kvm, + page_to_phys(vcpu-kvm-arch.apic_access_page)); +} + /* * Returns 1 to let __vcpu_run() continue the guest execution loop without * exiting to the userspace. Otherwise, the value will be returned to the @@ -5989,6 +6002,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_deliver_pmi(vcpu); if
[PATCH v3 1/6] kvm: Add gfn_to_page_no_pin() to translate gfn to page without pinning.
gfn_to_page() will finally call hva_to_pfn() to get the pfn, and pin the page in memory by calling GUP functions. This function unpins the page. Will be used by the followed patches. Signed-off-by: Tang Chen tangc...@cn.fujitsu.com --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 17 - 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ec4e3bd..7c58d9d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -541,6 +541,7 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, int nr_pages); struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); +struct page *gfn_to_page_no_pin(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable); unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4b6c01b..6091849 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1371,9 +1371,24 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) return kvm_pfn_to_page(pfn); } - EXPORT_SYMBOL_GPL(gfn_to_page); +struct page *gfn_to_page_no_pin(struct kvm *kvm, gfn_t gfn) +{ + struct page *page = gfn_to_page(kvm, gfn); + + /* +* gfn_to_page() will finally call hva_to_pfn() to get the pfn, and pin +* the page in memory by calling GUP functions. This function unpins +* the page. +*/ + if (!is_error_page(page)) + put_page(page); + + return page; +} +EXPORT_SYMBOL_GPL(gfn_to_page_no_pin); + void kvm_release_page_clean(struct page *page) { WARN_ON(is_error_page(page)); -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 6/6] kvm, mem-hotplug: Reload L1's apic access page if it is migrated when L2 is running.
This patch only handle L1 and L2 vm share one apic access page situation. When L1 vm is running, if the shared apic access page is migrated, mmu_notifier will request all vcpus to exit to L0, and reload apic access page physical address for all the vcpus' vmcs (which is done by patch 5/6). And when it enters L2 vm, L2's vmcs will be updated in prepare_vmcs02() called by nested_vm_run(). So we need to do nothing. When L2 vm is running, if the shared apic access page is migrated, mmu_notifier will request all vcpus to exit to L0, and reload apic access page physical address for all L2 vmcs. And this patch requests apic access page reload in L2-L1 vmexit. Signed-off-by: Tang Chen tangc...@cn.fujitsu.com --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 6 ++ arch/x86/kvm/vmx.c | 37 + arch/x86/kvm/x86.c | 3 +++ 4 files changed, 47 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9ce6bfd..613ee7f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -738,6 +738,7 @@ struct kvm_x86_ops { void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm *kvm, hpa_t hpa); + void (*set_nested_apic_page_migrated)(struct kvm_vcpu *vcpu, bool set); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index dc76f29..87273ef 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3617,6 +3617,11 @@ static void svm_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa) return; } +static void svm_set_nested_apic_page_migrated(struct kvm_vcpu *vcpu, bool set) +{ + return; +} + static int svm_vm_has_apicv(struct kvm *kvm) { return 0; @@ -4371,6 +4376,7 @@ static struct kvm_x86_ops svm_x86_ops = { 
.update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, .set_apic_access_page_addr = svm_set_apic_access_page_addr, + .set_nested_apic_page_migrated = svm_set_nested_apic_page_migrated, .vm_has_apicv = svm_vm_has_apicv, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_isr_update = svm_hwapic_isr_update, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c123c1d..9231afe 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -379,6 +379,16 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + /* +* L1's apic access page can be migrated. When L1 and L2 are sharing +* the apic access page, after the page is migrated when L2 is running, +* we have to reload it to L1 vmcs before we enter L1. +* +* When the shared apic access page is migrated in L1 mode, we don't +* need to do anything else because we reload apic access page each +* time when entering L2 in prepare_vmcs02(). +*/ + bool apic_access_page_migrated; u64 msr_ia32_feature_control; struct hrtimer preemption_timer; @@ -7077,6 +7087,12 @@ static void vmx_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa) vmcs_write64(APIC_ACCESS_ADDR, hpa); } +static void vmx_set_nested_apic_page_migrated(struct kvm_vcpu *vcpu, bool set) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx-nested.apic_access_page_migrated = set; +} + static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) { u16 status; @@ -8727,6 +8743,26 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, } /* +* When shared (L1 L2) apic access page is migrated during L2 is +* running, mmu_notifier will force to reload the page's hpa for L2 +* vmcs. Need to reload it for L1 before entering L1. +*/ + if (vmx-nested.apic_access_page_migrated) { + /* +* Do not call kvm_reload_apic_access_page() because we are now +* in L2. We should not call make_all_cpus_request() to exit to +* L0, otherwise we will reload for L2 vmcs again. 
+*/ + int i; + + for (i = 0; i atomic_read(vcpu-kvm-online_vcpus); i++) + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, +vcpu-kvm-vcpus[i]); + + vmx-nested.apic_access_page_migrated = false; + } + + /* * Exiting from L2 to L1, we're now back to L1 which thinks it just * finished a VMLAUNCH or VMRESUME instruction, so we need to set the * success or
Re: [PATCH] vhost: Add polling mode
Jason Wang jasow...@redhat.com wrote on 23/07/2014 08:26:36 AM: From: Jason Wang jasow...@redhat.com To: Razya Ladelsky/Haifa/IBM@IBMIL, kvm@vger.kernel.org, Michael S. Tsirkin m...@redhat.com, Cc: abel.gor...@gmail.com, Joel Nider/Haifa/IBM@IBMIL, Yossi Kuperman1/Haifa/IBM@IBMIL, Eran Raichstein/Haifa/IBM@IBMIL, Alex Glikson/Haifa/IBM@IBMIL Date: 23/07/2014 08:26 AM Subject: Re: [PATCH] vhost: Add polling mode On 07/21/2014 09:23 PM, Razya Ladelsky wrote: Hello All, When vhost is waiting for buffers from the guest driver (e.g., more packets to send in vhost-net's transmit queue), it normally goes to sleep and waits for the guest to kick it. This kick involves a PIO in the guest, and therefore an exit (and possibly userspace involvement in translating this PIO exit into a file descriptor event), all of which hurts performance. If the system is under-utilized (has cpu time to spare), vhost can continuously poll the virtqueues for new buffers, and avoid asking the guest to kick us. This patch adds an optional polling mode to vhost, that can be enabled via a kernel module parameter, poll_start_rate. When polling is active for a virtqueue, the guest is asked to disable notification (kicks), and the worker thread continuously checks for new buffers. When it does discover new buffers, it simulates a kick by invoking the underlying backend driver (such as vhost-net), which thinks it got a real kick from the guest, and acts accordingly. If the underlying driver asks not to be kicked, we disable polling on this virtqueue. We start polling on a virtqueue when we notice it has work to do. Polling on this virtqueue is later disabled after 3 seconds of polling turning up no new work, as in this case we are better off returning to the exit-based notification mechanism. The default timeout of 3 seconds can be changed with the poll_stop_idle kernel module parameter. 
This polling approach makes lot of sense for new HW with posted-interrupts for which we have exitless host-to-guest notifications. But even with support for posted interrupts, guest-to-host communication still causes exits. Polling adds the missing part. When systems are overloaded, there won?t be enough cpu time for the various vhost threads to poll their guests' devices. For these scenarios, we plan to add support for vhost threads that can be shared by multiple devices, even of multiple vms. Our ultimate goal is to implement the I/O acceleration features described in: KVM Forum 2013: Efficient and Scalable Virtio (by Abel Gordon) https://www.youtube.com/watch?v=9EyweibHfEs and https://www.mail-archive.com/kvm@vger.kernel.org/msg98179.html Comments are welcome, Thank you, Razya Thanks for the work. Do you have perf numbers for this? Hi Jason, Thanks for reviewing. I ran some experiments with TCP stream netperf and filebench (having 2 threads performing random reads) benchmarks on an IBM System x3650 M4. All runs loaded the guests in a way that they were (cpu) saturated. The system had two cores per guest, as to allow for both the vcpu and the vhost thread to run concurrently for maximum throughput (but I didn't pin the threads to specific cores) I get: Netperf, 1 vm: The polling patch improved throughput by ~33%. Number of exits/sec decreased 6x. The same improvement was shown when I tested with 3 vms running netperf. filebench, 1 vm: ops/sec improved by 13% with the polling patch. Number of exits was reduced by 31%. The same experiment with 3 vms running filebench showed similar numbers. And looks like the patch only poll for virtqueue. In the future, may worth to add callbacks for vhost_net to poll socket. Then it could be used with rx busy polling in host which may speedup the rx also. Did you mean polling the network device to avoid interrupts? 
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index c90f437..678d766 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -24,9 +24,17 @@ #include linux/slab.h #include linux/kthread.h #include linux/cgroup.h +#include linux/jiffies.h #include linux/module.h #include vhost.h +static int poll_start_rate = 0; +module_param(poll_start_rate, int, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(poll_start_rate, Start continuous polling of virtqueue when rate of events is at least this number per jiffy. If 0, never start polling.); + +static int poll_stop_idle = 3*HZ; /* 3 seconds */ +module_param(poll_stop_idle, int, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(poll_stop_idle, Stop continuous polling of virtqueue after this many jiffies of no work.); I'm not sure using jiffy is good enough since user need know HZ value. May worth to look at sk_busy_loop() which use sched_clock() and us. Ok, Will look into it, thanks. +/* Enable or disable virtqueue polling
RE: [PATCH v5 4/5] KVM: PPC: Allow kvmppc_get_last_inst() to fail
-Original Message- From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc- ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Wednesday, July 23, 2014 12:21 AM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; linuxppc-...@lists.ozlabs.org; kvm@vger.kernel.org Subject: Re: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail On 21.07.14 11:59, mihai.cara...@freescale.com wrote: -Original Message- From: Linuxppc-dev [mailto:linuxppc-dev- bounces+mihai.caraman=freescale@lists.ozlabs.org] On Behalf Of mihai.cara...@freescale.com Sent: Friday, July 18, 2014 12:06 PM To: Alexander Graf; kvm-...@vger.kernel.org Cc: linuxppc-...@lists.ozlabs.org; kvm@vger.kernel.org Subject: RE: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, July 17, 2014 5:21 PM To: Caraman Mihai Claudiu-B02008; kvm-...@vger.kernel.org Cc: kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org Subject: Re: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail On 17.07.14 13:22, Mihai Caraman wrote: On book3e, guest last instruction is read on the exit path using load external pid (lwepx) dedicated instruction. This load operation may fail due to TLB eviction and execute-but-not-read entries. This patch lay down the path for an alternative solution to read the guest last instruction, by allowing kvmppc_get_lat_inst() function to fail. Architecture specific implmentations of kvmppc_load_last_inst() may read last guest instruction and instruct the emulation layer to re- execute the guest in case of failure. Make kvmppc_get_last_inst() definition common between architectures. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- ... 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e2fd5a1..7f9c634 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -47,6 +47,11 @@ enum emulation_result { EMULATE_EXIT_USER,/* emulation requires exit to user- space */ }; +enum instruction_type { +INST_GENERIC, +INST_SC,/* system call */ +}; + extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern void kvmppc_handler_highmem(void); @@ -62,6 +67,9 @@ extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian); +extern int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, + enum instruction_type type, u32 *inst); + extern int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu); extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); @@ -234,6 +242,23 @@ struct kvmppc_ops { extern struct kvmppc_ops *kvmppc_hv_ops; extern struct kvmppc_ops *kvmppc_pr_ops; +static inline int kvmppc_get_last_inst(struct kvm_vcpu *vcpu, +enum instruction_type type, u32 *inst) +{ +int ret = EMULATE_DONE; + +/* Load the instruction manually if it failed to do so in the + * exit path */ +if (vcpu-arch.last_inst == KVM_INST_FETCH_FAILED) +ret = kvmppc_load_last_inst(vcpu, type, vcpu- arch.last_inst); + + +*inst = (ret == EMULATE_DONE kvmppc_need_byteswap(vcpu)) ? +swab32(vcpu-arch.last_inst) : vcpu-arch.last_inst; This makes even less sense than the previous version. Either you treat inst as definitely overwritten or as preserves previous data on failure. Both v4 and v5 versions treat inst as definitely overwritten. So either you unconditionally swap like you did before If we make abstraction of its symmetry, KVM_INST_FETCH_FAILED is operated in host endianness, so it doesn't need byte swap. 
I agree with your reasoning if last_inst is initialized and compared with data in guest endianess, which is not the case yet for KVM_INST_FETCH_FAILED. Alex, are you relying on the fact that KVM_INST_FETCH_FAILED value is symmetrical? With a non symmetrical value like 0xDEADBEEF, and considering a little- endian guest on a big-endian host, we need to fix kvm logic to initialize and compare last_inst with 0xEFBEADDE swaped value. Your suggestion to unconditionally swap makes sense only with the above fix, otherwise inst may end up with 0xEFBEADDE swaped value with is wrong. Only for *inst which we would treat as undefined after the function returned EMULATE_AGAIN. Right. With this do you acknowledge that v5
Re: [PATCH v5 4/5] KVM: PPC: Allow kvmppc_get_last_inst() to fail
Am 23.07.2014 um 10:24 schrieb mihai.cara...@freescale.com mihai.cara...@freescale.com: -Original Message- From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc- ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Wednesday, July 23, 2014 12:21 AM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; linuxppc-...@lists.ozlabs.org; kvm@vger.kernel.org Subject: Re: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail On 21.07.14 11:59, mihai.cara...@freescale.com wrote: -Original Message- From: Linuxppc-dev [mailto:linuxppc-dev- bounces+mihai.caraman=freescale@lists.ozlabs.org] On Behalf Of mihai.cara...@freescale.com Sent: Friday, July 18, 2014 12:06 PM To: Alexander Graf; kvm-...@vger.kernel.org Cc: linuxppc-...@lists.ozlabs.org; kvm@vger.kernel.org Subject: RE: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, July 17, 2014 5:21 PM To: Caraman Mihai Claudiu-B02008; kvm-...@vger.kernel.org Cc: kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org Subject: Re: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail On 17.07.14 13:22, Mihai Caraman wrote: On book3e, guest last instruction is read on the exit path using load external pid (lwepx) dedicated instruction. This load operation may fail due to TLB eviction and execute-but-not-read entries. This patch lay down the path for an alternative solution to read the guest last instruction, by allowing kvmppc_get_lat_inst() function to fail. Architecture specific implmentations of kvmppc_load_last_inst() may read last guest instruction and instruct the emulation layer to re- execute the guest in case of failure. Make kvmppc_get_last_inst() definition common between architectures. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- ... 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e2fd5a1..7f9c634 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -47,6 +47,11 @@ enum emulation_result { EMULATE_EXIT_USER,/* emulation requires exit to user- space */ }; +enum instruction_type { +INST_GENERIC, +INST_SC,/* system call */ +}; + extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern void kvmppc_handler_highmem(void); @@ -62,6 +67,9 @@ extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian); +extern int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, + enum instruction_type type, u32 *inst); + extern int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu); extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); @@ -234,6 +242,23 @@ struct kvmppc_ops { extern struct kvmppc_ops *kvmppc_hv_ops; extern struct kvmppc_ops *kvmppc_pr_ops; +static inline int kvmppc_get_last_inst(struct kvm_vcpu *vcpu, +enum instruction_type type, u32 *inst) +{ +int ret = EMULATE_DONE; + +/* Load the instruction manually if it failed to do so in the + * exit path */ +if (vcpu-arch.last_inst == KVM_INST_FETCH_FAILED) +ret = kvmppc_load_last_inst(vcpu, type, vcpu- arch.last_inst); + + +*inst = (ret == EMULATE_DONE kvmppc_need_byteswap(vcpu)) ? +swab32(vcpu-arch.last_inst) : vcpu-arch.last_inst; This makes even less sense than the previous version. Either you treat inst as definitely overwritten or as preserves previous data on failure. Both v4 and v5 versions treat inst as definitely overwritten. So either you unconditionally swap like you did before If we make abstraction of its symmetry, KVM_INST_FETCH_FAILED is operated in host endianness, so it doesn't need byte swap. 
I agree with your reasoning if last_inst is initialized and compared with data in guest endianness, which is not the case yet for KVM_INST_FETCH_FAILED. Alex, are you relying on the fact that KVM_INST_FETCH_FAILED value is symmetrical? With a non symmetrical value like 0xDEADBEEF, and considering a little- endian guest on a big-endian host, we need to fix kvm logic to initialize and compare last_inst with 0xEFBEADDE swapped value. Your suggestion to unconditionally swap makes sense only with the above fix, otherwise inst may end up with 0xEFBEADDE swapped value which is wrong. Only for *inst which we would treat as undefined after the function returned EMULATE_AGAIN. Right. With this do you acknowledge that v5 (definitely overwritten approach) is ok? I think I'm starting to understand your logic of v5. You write fetch_failed into *inst
Re: [PATCH] vhost: Add polling mode
On 07/23/2014 04:12 PM, Razya Ladelsky wrote: Jason Wang jasow...@redhat.com wrote on 23/07/2014 08:26:36 AM: From: Jason Wang jasow...@redhat.com To: Razya Ladelsky/Haifa/IBM@IBMIL, kvm@vger.kernel.org, Michael S. Tsirkin m...@redhat.com, Cc: abel.gor...@gmail.com, Joel Nider/Haifa/IBM@IBMIL, Yossi Kuperman1/Haifa/IBM@IBMIL, Eran Raichstein/Haifa/IBM@IBMIL, Alex Glikson/Haifa/IBM@IBMIL Date: 23/07/2014 08:26 AM Subject: Re: [PATCH] vhost: Add polling mode On 07/21/2014 09:23 PM, Razya Ladelsky wrote: Hello All, When vhost is waiting for buffers from the guest driver (e.g., more packets to send in vhost-net's transmit queue), it normally goes to sleep and waits for the guest to kick it. This kick involves a PIO in the guest, and therefore an exit (and possibly userspace involvement in translating this PIO exit into a file descriptor event), all of which hurts performance. If the system is under-utilized (has cpu time to spare), vhost can continuously poll the virtqueues for new buffers, and avoid asking the guest to kick us. This patch adds an optional polling mode to vhost, that can be enabled via a kernel module parameter, poll_start_rate. When polling is active for a virtqueue, the guest is asked to disable notification (kicks), and the worker thread continuously checks for new buffers. When it does discover new buffers, it simulates a kick by invoking the underlying backend driver (such as vhost-net), which thinks it got a real kick from the guest, and acts accordingly. If the underlying driver asks not to be kicked, we disable polling on this virtqueue. We start polling on a virtqueue when we notice it has work to do. Polling on this virtqueue is later disabled after 3 seconds of polling turning up no new work, as in this case we are better off returning to the exit-based notification mechanism. The default timeout of 3 seconds can be changed with the poll_stop_idle kernel module parameter. 
This polling approach makes lot of sense for new HW with posted-interrupts for which we have exitless host-to-guest notifications. But even with support for posted interrupts, guest-to-host communication still causes exits. Polling adds the missing part. When systems are overloaded, there won?t be enough cpu time for the various vhost threads to poll their guests' devices. For these scenarios, we plan to add support for vhost threads that can be shared by multiple devices, even of multiple vms. Our ultimate goal is to implement the I/O acceleration features described in: KVM Forum 2013: Efficient and Scalable Virtio (by Abel Gordon) https://www.youtube.com/watch?v=9EyweibHfEs and https://www.mail-archive.com/kvm@vger.kernel.org/msg98179.html Comments are welcome, Thank you, Razya Thanks for the work. Do you have perf numbers for this? Hi Jason, Thanks for reviewing. I ran some experiments with TCP stream netperf and filebench (having 2 threads performing random reads) benchmarks on an IBM System x3650 M4. All runs loaded the guests in a way that they were (cpu) saturated. The system had two cores per guest, as to allow for both the vcpu and the vhost thread to run concurrently for maximum throughput (but I didn't pin the threads to specific cores) I get: Netperf, 1 vm: The polling patch improved throughput by ~33%. Number of exits/sec decreased 6x. The same improvement was shown when I tested with 3 vms running netperf. filebench, 1 vm: ops/sec improved by 13% with the polling patch. Number of exits was reduced by 31%. The same experiment with 3 vms running filebench showed similar numbers. Looks good, may worth to add the result in the commit log. And looks like the patch only poll for virtqueue. In the future, may worth to add callbacks for vhost_net to poll socket. Then it could be used with rx busy polling in host which may speedup the rx also. Did you mean polling the network device to avoid interrupts? 
Yes, recent linux host support rx busy polling which can reduce the interrupts. If vhost can utilize this, it can also reduce the latency caused by vhost thread wakeups. And I'm also working on virtio-net busy polling in guest, if vhost can poll socket, it can also help in guest rx polling. diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index c90f437..678d766 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -24,9 +24,17 @@ #include linux/slab.h #include linux/kthread.h #include linux/cgroup.h +#include linux/jiffies.h #include linux/module.h #include vhost.h +static int poll_start_rate = 0; +module_param(poll_start_rate, int, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(poll_start_rate, Start continuous polling of virtqueue when rate of events is at least this number per jiffy. If 0, never start polling.); + +static int poll_stop_idle = 3*HZ; /* 3 seconds */ +module_param(poll_stop_idle, int, S_IRUGO|S_IWUSR);
Re: [PATCH] vhost: Add polling mode
On Wed, Jul 23, 2014 at 11:42 AM, Jason Wang jasow...@redhat.com wrote: On 07/23/2014 04:12 PM, Razya Ladelsky wrote: Jason Wang jasow...@redhat.com wrote on 23/07/2014 08:26:36 AM: From: Jason Wang jasow...@redhat.com To: Razya Ladelsky/Haifa/IBM@IBMIL, kvm@vger.kernel.org, Michael S. Tsirkin m...@redhat.com, Cc: abel.gor...@gmail.com, Joel Nider/Haifa/IBM@IBMIL, Yossi Kuperman1/Haifa/IBM@IBMIL, Eran Raichstein/Haifa/IBM@IBMIL, Alex Glikson/Haifa/IBM@IBMIL Date: 23/07/2014 08:26 AM Subject: Re: [PATCH] vhost: Add polling mode On 07/21/2014 09:23 PM, Razya Ladelsky wrote: Hello All, When vhost is waiting for buffers from the guest driver (e.g., more packets to send in vhost-net's transmit queue), it normally goes to sleep and waits for the guest to kick it. This kick involves a PIO in the guest, and therefore an exit (and possibly userspace involvement in translating this PIO exit into a file descriptor event), all of which hurts performance. If the system is under-utilized (has cpu time to spare), vhost can continuously poll the virtqueues for new buffers, and avoid asking the guest to kick us. This patch adds an optional polling mode to vhost, that can be enabled via a kernel module parameter, poll_start_rate. When polling is active for a virtqueue, the guest is asked to disable notification (kicks), and the worker thread continuously checks for new buffers. When it does discover new buffers, it simulates a kick by invoking the underlying backend driver (such as vhost-net), which thinks it got a real kick from the guest, and acts accordingly. If the underlying driver asks not to be kicked, we disable polling on this virtqueue. We start polling on a virtqueue when we notice it has work to do. Polling on this virtqueue is later disabled after 3 seconds of polling turning up no new work, as in this case we are better off returning to the exit-based notification mechanism. 
The default timeout of 3 seconds can be changed with the poll_stop_idle kernel module parameter. This polling approach makes lot of sense for new HW with posted-interrupts for which we have exitless host-to-guest notifications. But even with support for posted interrupts, guest-to-host communication still causes exits. Polling adds the missing part. When systems are overloaded, there won?t be enough cpu time for the various vhost threads to poll their guests' devices. For these scenarios, we plan to add support for vhost threads that can be shared by multiple devices, even of multiple vms. Our ultimate goal is to implement the I/O acceleration features described in: KVM Forum 2013: Efficient and Scalable Virtio (by Abel Gordon) https://www.youtube.com/watch?v=9EyweibHfEs and https://www.mail-archive.com/kvm@vger.kernel.org/msg98179.html Comments are welcome, Thank you, Razya Thanks for the work. Do you have perf numbers for this? Hi Jason, Thanks for reviewing. I ran some experiments with TCP stream netperf and filebench (having 2 threads performing random reads) benchmarks on an IBM System x3650 M4. All runs loaded the guests in a way that they were (cpu) saturated. The system had two cores per guest, as to allow for both the vcpu and the vhost thread to run concurrently for maximum throughput (but I didn't pin the threads to specific cores) I get: Netperf, 1 vm: The polling patch improved throughput by ~33%. Number of exits/sec decreased 6x. The same improvement was shown when I tested with 3 vms running netperf. filebench, 1 vm: ops/sec improved by 13% with the polling patch. Number of exits was reduced by 31%. The same experiment with 3 vms running filebench showed similar numbers. Looks good, may worth to add the result in the commit log. And looks like the patch only poll for virtqueue. In the future, may worth to add callbacks for vhost_net to poll socket. Then it could be used with rx busy polling in host which may speedup the rx also. 
Did you mean polling the network device to avoid interrupts? Yes, recent linux host support rx busy polling which can reduce the interrupts. If vhost can utilize this, it can also reduce the latency caused by vhost thread wakeups. And I'm also working on virtio-net busy polling in guest, if vhost can poll socket, it can also help in guest rx polling. Nice :) Note that you may want to check if if the processor support posted interrupts. I guess that if CPU supports posted interrupts then benefits of polling in the front-end (from performance perspective) may not worth the cpu cycles wasted in the guest. diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index c90f437..678d766 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -24,9 +24,17 @@ #include linux/slab.h #include linux/kthread.h #include linux/cgroup.h +#include linux/jiffies.h
RE: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail
Right. With this do you acknowledge that v5 (definitely overwritten approach) is ok? I think I'm starting to understand your logic of v5. You write fetch_failed into *inst unswapped if the fetch failed. v5 - don't swap when load fails :) I think that's ok, but I definitely do not like the code flow - it's too hard to understand at a glimpse. Just rewrite it to swab at local variable level, preferably with if()s and comments what this is about and have a single unconditional *inst = fetched_inst; at the end of the function. I will incorporate these change requests into v6. Thanks, -Mike -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM: PPC: fix incorrect way saving SPRN_MMCR2
SPRN_SIER and SPRN_MMCR2 are doubly saved, particularly SPRN_MMCR2 is oversaved with an incorrect value which comes from SPRN_PMC5 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/powerpc/kvm/book3s_hv_interrupts.S | 5 - 1 file changed, 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 8c86422..731be74 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -127,11 +127,6 @@ BEGIN_FTR_SECTION stw r10, HSTATE_PMC + 24(r13) stw r11, HSTATE_PMC + 28(r13) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) -BEGIN_FTR_SECTION - mfspr r9, SPRN_SIER - std r8, HSTATE_MMCR + 40(r13) - std r9, HSTATE_MMCR + 48(r13) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 31: /* -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 01/17] configure: add CONFIG_COLO to switch COLO support
./configure --enable-colo/--disable-colo to switch COLO support on/off. COLO support is off by default. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- configure | 14 ++ 1 file changed, 14 insertions(+) diff --git a/configure b/configure index f7685b5..4071943 100755 --- a/configure +++ b/configure @@ -258,6 +258,7 @@ xfs= vhost_net=no vhost_scsi=no kvm=no +colo=no rdma= gprof=no debug_tcg=no @@ -921,6 +922,10 @@ for opt do ;; --enable-kvm) kvm=yes ;; + --disable-colo) colo=no + ;; + --enable-colo) colo=yes + ;; --disable-tcg-interpreter) tcg_interpreter=no ;; --enable-tcg-interpreter) tcg_interpreter=yes @@ -1314,6 +1319,10 @@ Advanced options (experts only): --disable-slirp disable SLIRP userspace network connectivity --disable-kvmdisable KVM acceleration support --enable-kvm enable KVM acceleration support + --disable-colo disable COarse-grain LOck-stepping Virtual + Machines for Non-stop Service(default) + --enable-coloenable COarse-grain LOck-stepping Virtual + Machines for Non-stop Service --disable-rdma disable RDMA-based migration support --enable-rdmaenable RDMA-based migration support --enable-tcg-interpreter enable TCG with bytecode interpreter (TCI) @@ -4215,6 +4224,7 @@ echo Linux AIO support $linux_aio echo ATTR/XATTR support $attr echo Install blobs $blobs echo KVM support $kvm +echo COLO support $colo echo RDMA support $rdma echo TCG interpreter $tcg_interpreter echo fdt support $fdt @@ -4751,6 +4761,10 @@ if have_backend ftrace; then fi echo CONFIG_TRACE_FILE=$trace_file $config_host_mak +if test $colo = yes; then + echo CONFIG_COLO=y $config_host_mak +fi + if test $rdma = yes ; then echo CONFIG_RDMA=y $config_host_mak fi -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 00/17] COarse-grain LOck-stepping(COLO) Virtual Machines for Non-stop Service
Virtual machine (VM) replication is a well known technique for providing application-agnostic software-implemented hardware fault tolerance non-stop service. COLO is a high availability solution. Both primary VM (PVM) and secondary VM (SVM) run in parallel. They receive the same request from client, and generate response in parallel too. If the response packets from PVM and SVM are identical, they are released immediately. Otherwise, a VM checkpoint (on demand) is conducted. The idea is presented in Xen summit 2012, and 2013, and academia paper in SOCC 2013. It's also presented in KVM forum 2013: http://www.linux-kvm.org/wiki/images/1/1d/Kvm-forum-2013-COLO.pdf Please refer to above document for detailed information. Please also refer to previous posted RFC proposal: http://lists.nongnu.org/archive/html/qemu-devel/2014-06/msg05567.html The patchset is also hosted on github: https://github.com/macrosheep/qemu/tree/colo_v0.1 This patchset is RFC, implements the frame of colo, without failover and nic/disk replication. But it is ready for demo the COLO idea above QEMU-Kvm. Steps using this patchset to get an overview of COLO: 1. configure the source with --enable-colo option 2. compile 3. just like QEMU's normal migration, run 2 QEMU VM: - Primary VM - Secondary VM with -incoming tcp:[IP]:[PORT] option 4. on Primary VM's QEMU monitor, run following command: migrate_set_capability colo on migrate tcp:[IP]:[PORT] 5. done you will see two runing VMs, whenever you make changes to PVM, SVM will be synced to PVM's state. TODO list: 1. failover 2. nic replication 3. disk replication[COLO Disk manager] Any comments/feedbacks are warmly welcomed. 
Thanks, Yang Yang Hongyang (17): configure: add CONFIG_COLO to switch COLO support COLO: introduce an api colo_supported() to indicate COLO support COLO migration: add a migration capability 'colo' COLO info: use colo info to tell migration target colo is enabled COLO save: integrate COLO checkpointed save into qemu migration COLO restore: integrate COLO checkpointed restore into qemu restore COLO buffer: implement colo buffer as well as QEMUFileOps based on it COLO: disable qdev hotplug COLO ctl: implement API's that communicate with colo agent COLO ctl: introduce is_slave() and is_master() COLO ctl: implement colo checkpoint protocol COLO ctl: add a RunState RUN_STATE_COLO COLO ctl: implement colo save COLO ctl: implement colo restore COLO save: reuse migration bitmap under colo checkpoint COLO ram cache: implement colo ram cache on slaver HACK: trigger checkpoint every 500ms Makefile.objs | 2 + arch_init.c| 174 +- configure | 14 + include/exec/cpu-all.h | 1 + include/migration/migration-colo.h | 36 +++ include/migration/migration.h | 13 + include/qapi/qmp/qerror.h | 3 + migration-colo-comm.c | 78 + migration-colo.c | 643 + migration.c| 45 ++- qapi-schema.json | 9 +- stubs/Makefile.objs| 1 + stubs/migration-colo.c | 34 ++ vl.c | 12 + 14 files changed, 1044 insertions(+), 21 deletions(-) create mode 100644 include/migration/migration-colo.h create mode 100644 migration-colo-comm.c create mode 100644 migration-colo.c create mode 100644 stubs/migration-colo.c -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 05/17] COLO save: integrate COLO checkpointed save into qemu migration
Integrate COLO checkpointed save flow into qemu migration. Add a migrate state: MIG_STATE_COLO, enter this migrate state after the first live migration successfully finished. Create a colo thread to do the checkpointed save. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- include/migration/migration-colo.h | 4 include/migration/migration.h | 13 +++ migration-colo-comm.c | 2 +- migration-colo.c | 48 ++ migration.c| 36 stubs/migration-colo.c | 4 6 files changed, 91 insertions(+), 16 deletions(-) diff --git a/include/migration/migration-colo.h b/include/migration/migration-colo.h index e3735d8..24589c0 100644 --- a/include/migration/migration-colo.h +++ b/include/migration/migration-colo.h @@ -18,4 +18,8 @@ void colo_info_mig_init(void); bool colo_supported(void); +/* save */ +bool migrate_use_colo(void); +void colo_init_checkpointer(MigrationState *s); + #endif diff --git a/include/migration/migration.h b/include/migration/migration.h index 3cb5ba8..3e81a27 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -64,6 +64,19 @@ struct MigrationState int64_t dirty_sync_count; }; +enum { +MIG_STATE_ERROR = -1, +MIG_STATE_NONE, +MIG_STATE_SETUP, +MIG_STATE_CANCELLING, +MIG_STATE_CANCELLED, +MIG_STATE_ACTIVE, +MIG_STATE_COLO, +MIG_STATE_COMPLETED, +}; + +void migrate_set_state(MigrationState *s, int old_state, int new_state); + void process_incoming_migration(QEMUFile *f); void qemu_start_incoming_migration(const char *uri, Error **errp); diff --git a/migration-colo-comm.c b/migration-colo-comm.c index ccbc246..4504ceb 100644 --- a/migration-colo-comm.c +++ b/migration-colo-comm.c @@ -25,7 +25,7 @@ static bool colo_requested; /* save */ -static bool migrate_use_colo(void) +bool migrate_use_colo(void) { MigrationState *s = migrate_get_current(); return s-enabled_capabilities[MIGRATION_CAPABILITY_COLO]; diff --git a/migration-colo.c b/migration-colo.c index 1d3bef8..0cef8bd 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -8,9 +8,57 
@@ * the COPYING file in the top-level directory. */ +#include qemu/main-loop.h +#include qemu/thread.h #include migration/migration-colo.h +static QEMUBH *colo_bh; + bool colo_supported(void) { return true; } + +/* save */ + +static void *colo_thread(void *opaque) +{ +MigrationState *s = opaque; + +/*TODO: COLO checkpointed save loop*/ + +if (s-state != MIG_STATE_ERROR) { +migrate_set_state(s, MIG_STATE_COLO, MIG_STATE_COMPLETED); +} + +qemu_mutex_lock_iothread(); +qemu_bh_schedule(s-cleanup_bh); +qemu_mutex_unlock_iothread(); + +return NULL; +} + +static void colo_start_checkpointer(void *opaque) +{ +MigrationState *s = opaque; + +if (colo_bh) { +qemu_bh_delete(colo_bh); +colo_bh = NULL; +} + +qemu_mutex_unlock_iothread(); +qemu_thread_join(s-thread); +qemu_mutex_lock_iothread(); + +migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_COLO); + +qemu_thread_create(s-thread, colo, colo_thread, s, + QEMU_THREAD_JOINABLE); +} + +void colo_init_checkpointer(MigrationState *s) +{ +colo_bh = qemu_bh_new(colo_start_checkpointer, s); +qemu_bh_schedule(colo_bh); +} diff --git a/migration.c b/migration.c index ca83310..b7f8e7e 100644 --- a/migration.c +++ b/migration.c @@ -27,16 +27,6 @@ #include trace.h #include migration/migration-colo.h -enum { -MIG_STATE_ERROR = -1, -MIG_STATE_NONE, -MIG_STATE_SETUP, -MIG_STATE_CANCELLING, -MIG_STATE_CANCELLED, -MIG_STATE_ACTIVE, -MIG_STATE_COMPLETED, -}; - #define MAX_THROTTLE (32 20) /* Migration speed throttling */ /* Amount of time to allocate to each chunk of bandwidth-throttled @@ -229,6 +219,11 @@ MigrationInfo *qmp_query_migrate(Error **errp) get_xbzrle_cache_stats(info); break; +case MIG_STATE_COLO: +info-has_status = true; +info-status = g_strdup(colo); +/* TODO: display COLO specific informations(checkpoint info etc.),*/ +break; case MIG_STATE_COMPLETED: get_xbzrle_cache_stats(info); @@ -272,7 +267,8 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, MigrationState *s = migrate_get_current(); 
MigrationCapabilityStatusList *cap; -if (s-state == MIG_STATE_ACTIVE || s-state == MIG_STATE_SETUP) { +if (s-state == MIG_STATE_ACTIVE || s-state == MIG_STATE_SETUP || +s-state == MIG_STATE_COLO) { error_set(errp, QERR_MIGRATION_ACTIVE); return; } @@ -289,7 +285,7 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, /* shared migration helpers */ -static void migrate_set_state(MigrationState *s, int old_state, int new_state) +void
[RFC PATCH 09/17] COLO ctl: implement API's that communicate with colo agent
We use COLO agent to compare the packets returned by Primary VM and Secondary VM, and decide whether to start a checkpoint according to some rules. It is a linux kernel module for host. COLO controller communicate with the agent through ioctl(). Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 115 +-- 1 file changed, 112 insertions(+), 3 deletions(-) diff --git a/migration-colo.c b/migration-colo.c index f295e56..802f8b0 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -13,7 +13,16 @@ #include block/coroutine.h #include qemu/error-report.h #include hw/qdev-core.h +#include qemu/timer.h #include migration/migration-colo.h +#include sys/ioctl.h + +/* + * checkpoint timer: unit ms + * this is large because COLO checkpoint will mostly depend on + * COLO compare module. + */ +#define CHKPOINT_TIMER 1 static QEMUBH *colo_bh; @@ -22,6 +31,56 @@ bool colo_supported(void) return true; } +/* colo compare */ +#define COMP_IOC_MAGIC 'k' +#define COMP_IOCTWAIT _IO(COMP_IOC_MAGIC, 0) +#define COMP_IOCTFLUSH _IO(COMP_IOC_MAGIC, 1) +#define COMP_IOCTRESUME _IO(COMP_IOC_MAGIC, 2) + +#define COMPARE_DEV /dev/HA_compare +/* COLO compare module FD */ +static int comp_fd = -1; + +static int colo_compare_init(void) +{ +comp_fd = open(COMPARE_DEV, O_RDONLY); +if (comp_fd 0) { +return -1; +} + +return 0; +} + +static void colo_compare_destroy(void) +{ +if (comp_fd = 0) { +close(comp_fd); +comp_fd = -1; +} +} + +/* + * Communicate with COLO Agent through ioctl. 
+ * return: + * 0: start a checkpoint + * other: errno == ETIME or ERESTART, try again + *errno == other, error, quit colo save + */ +static int colo_compare(void) +{ +return ioctl(comp_fd, COMP_IOCTWAIT, 250); +} + +static __attribute__((unused)) int colo_compare_flush(void) +{ +return ioctl(comp_fd, COMP_IOCTFLUSH, 1); +} + +static __attribute__((unused)) int colo_compare_resume(void) +{ +return ioctl(comp_fd, COMP_IOCTRESUME, 1); +} + /* colo buffer */ #define COLO_BUFFER_BASE_SIZE (1000*1000*4ULL) @@ -131,15 +190,48 @@ static const QEMUFileOps colo_read_ops = { static void *colo_thread(void *opaque) { MigrationState *s = opaque; -int dev_hotplug = qdev_hotplug; +int dev_hotplug = qdev_hotplug, wait_cp = 0; +int64_t start_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); +int64_t current_time; + +if (colo_compare_init() 0) { +error_report(Init colo compare error\n); +goto out; +} qdev_hotplug = 0; colo_buffer_init(); -/*TODO: COLO checkpointed save loop*/ +while (s-state == MIG_STATE_COLO) { +/* wait for a colo checkpoint */ +wait_cp = colo_compare(); +if (wait_cp) { +if (errno != ETIME errno != ERESTART) { +error_report(compare module failed(%s), strerror(errno)); +goto out; +} +/* + * no checkpoint is needed, wait for 1ms and then + * check if we need checkpoint + */ +current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); +if (current_time - start_time CHKPOINT_TIMER) { +usleep(1000); +continue; +} +} + +/* start a colo checkpoint */ + +/*TODO: COLO save */ +start_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); +} + +out: colo_buffer_destroy(); +colo_compare_destroy(); if (s-state != MIG_STATE_ERROR) { migrate_set_state(s, MIG_STATE_COLO, MIG_STATE_COMPLETED); @@ -183,6 +275,17 @@ void colo_init_checkpointer(MigrationState *s) static Coroutine *colo; +/* + * return: + * 0: start a checkpoint + * 1: some error happend, exit colo restore + */ +static int slave_wait_new_checkpoint(QEMUFile *f) +{ +/* TODO: wait checkpoint start command from master */ +return 1; +} + void 
colo_process_incoming_checkpoints(QEMUFile *f) { int dev_hotplug = qdev_hotplug; @@ -198,7 +301,13 @@ void colo_process_incoming_checkpoints(QEMUFile *f) colo_buffer_init(); -/* TODO: COLO checkpointed restore loop */ +while (true) { +if (slave_wait_new_checkpoint(f)) { +break; +} + +/* TODO: COLO restore */ +} colo_buffer_destroy(); colo = NULL; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 02/17] COLO: introduce an api colo_supported() to indicate COLO support
introduce an api colo_supported() to indicate COLO support, returns true if colo supported(configured with --enable-colo). Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- Makefile.objs | 1 + include/migration/migration-colo.h | 18 ++ migration-colo.c | 16 stubs/Makefile.objs| 1 + stubs/migration-colo.c | 16 5 files changed, 52 insertions(+) create mode 100644 include/migration/migration-colo.h create mode 100644 migration-colo.c create mode 100644 stubs/migration-colo.c diff --git a/Makefile.objs b/Makefile.objs index 1f76cea..cab5824 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -50,6 +50,7 @@ common-obj-$(CONFIG_POSIX) += os-posix.o common-obj-$(CONFIG_LINUX) += fsdev/ common-obj-y += migration.o migration-tcp.o +common-obj-$(CONFIG_COLO) += migration-colo.o common-obj-y += vmstate.o common-obj-y += qemu-file.o common-obj-$(CONFIG_RDMA) += migration-rdma.o diff --git a/include/migration/migration-colo.h b/include/migration/migration-colo.h new file mode 100644 index 000..35b384c --- /dev/null +++ b/include/migration/migration-colo.h @@ -0,0 +1,18 @@ +/* + * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO) + * (a.k.a. Fault Tolerance or Continuous Replication) + * + * Copyright (C) 2014 FUJITSU LIMITED + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef QEMU_MIGRATION_COLO_H +#define QEMU_MIGRATION_COLO_H + +#include qemu-common.h + +bool colo_supported(void); + +#endif diff --git a/migration-colo.c b/migration-colo.c new file mode 100644 index 000..1d3bef8 --- /dev/null +++ b/migration-colo.c @@ -0,0 +1,16 @@ +/* + * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO) + * (a.k.a. Fault Tolerance or Continuous Replication) + * + * Copyright (C) 2014 FUJITSU LIMITED + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ */ + +#include migration/migration-colo.h + +bool colo_supported(void) +{ +return true; +} diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs index 528e161..6810c89 100644 --- a/stubs/Makefile.objs +++ b/stubs/Makefile.objs @@ -39,3 +39,4 @@ stub-obj-$(CONFIG_WIN32) += fd-register.o stub-obj-y += cpus.o stub-obj-y += kvm.o stub-obj-y += qmp_pc_dimm_device_list.o +stub-obj-y += migration-colo.o diff --git a/stubs/migration-colo.c b/stubs/migration-colo.c new file mode 100644 index 000..b9ee6a0 --- /dev/null +++ b/stubs/migration-colo.c @@ -0,0 +1,16 @@ +/* + * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO) + * (a.k.a. Fault Tolerance or Continuous Replication) + * + * Copyright (C) 2014 FUJITSU LIMITED + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include migration/migration-colo.h + +bool colo_supported(void) +{ +return false; +} -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 07/17] COLO buffer: implement colo buffer as well as QEMUFileOps based on it
We need a buffer to store migration data. On save side: all saved data was write into colo buffer first, so that we can know the total size of the migration data. this can also separate the data transmission from colo control data, we use colo control data over socket fd to synchronous both side's stat. On restore side: all migration data was read into colo buffer first, then load data from the buffer: If network error happens while data transmission, the slaver can still functinal because the migration data are not yet loaded. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 112 +++ 1 file changed, 112 insertions(+) diff --git a/migration-colo.c b/migration-colo.c index d566b9d..b90d9b6 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -11,6 +11,7 @@ #include qemu/main-loop.h #include qemu/thread.h #include block/coroutine.h +#include qemu/error-report.h #include migration/migration-colo.h static QEMUBH *colo_bh; @@ -20,14 +21,122 @@ bool colo_supported(void) return true; } +/* colo buffer */ + +#define COLO_BUFFER_BASE_SIZE (1000*1000*4ULL) +#define COLO_BUFFER_MAX_SIZE (1000*1000*1000*10ULL) + +typedef struct colo_buffer { +uint8_t *data; +uint64_t used; +uint64_t freed; +uint64_t size; +} colo_buffer_t; + +static colo_buffer_t colo_buffer; + +static void colo_buffer_init(void) +{ +if (colo_buffer.size == 0) { +colo_buffer.data = g_malloc(COLO_BUFFER_BASE_SIZE); +colo_buffer.size = COLO_BUFFER_BASE_SIZE; +} +colo_buffer.used = 0; +colo_buffer.freed = 0; +} + +static void colo_buffer_destroy(void) +{ +if (colo_buffer.data) { +g_free(colo_buffer.data); +colo_buffer.data = NULL; +} +colo_buffer.used = 0; +colo_buffer.freed = 0; +colo_buffer.size = 0; +} + +static void colo_buffer_extend(uint64_t len) +{ +if (len colo_buffer.size - colo_buffer.used) { +len = len + colo_buffer.used - colo_buffer.size; +len = ROUND_UP(len, COLO_BUFFER_BASE_SIZE) + COLO_BUFFER_BASE_SIZE; + +colo_buffer.size += len; +if (colo_buffer.size 
COLO_BUFFER_MAX_SIZE) { +error_report(colo_buffer overflow!\n); +exit(EXIT_FAILURE); +} +colo_buffer.data = g_realloc(colo_buffer.data, colo_buffer.size); +} +} + +static int colo_put_buffer(void *opaque, const uint8_t *buf, + int64_t pos, int size) +{ +colo_buffer_extend(size); +memcpy(colo_buffer.data + colo_buffer.used, buf, size); +colo_buffer.used += size; + +return size; +} + +static int colo_get_buffer_internal(uint8_t *buf, int size) +{ +if ((size + colo_buffer.freed) colo_buffer.used) { +size = colo_buffer.used - colo_buffer.freed; +} +memcpy(buf, colo_buffer.data + colo_buffer.freed, size); +colo_buffer.freed += size; + +return size; +} + +static int colo_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) +{ +return colo_get_buffer_internal(buf, size); +} + +static int colo_close(void *opaque) +{ +colo_buffer_t *cb = opaque ; + +cb-used = 0; +cb-freed = 0; + +return 0; +} + +static int colo_get_fd(void *opaque) +{ +/* colo buffer, no fd */ +return -1; +} + +static const QEMUFileOps colo_write_ops = { +.put_buffer = colo_put_buffer, +.get_fd = colo_get_fd, +.close = colo_close, +}; + +static const QEMUFileOps colo_read_ops = { +.get_buffer = colo_get_buffer, +.get_fd = colo_get_fd, +.close = colo_close, +}; + /* save */ static void *colo_thread(void *opaque) { MigrationState *s = opaque; +colo_buffer_init(); + /*TODO: COLO checkpointed save loop*/ +colo_buffer_destroy(); + if (s-state != MIG_STATE_ERROR) { migrate_set_state(s, MIG_STATE_COLO, MIG_STATE_COMPLETED); } @@ -77,8 +186,11 @@ void colo_process_incoming_checkpoints(QEMUFile *f) colo = qemu_coroutine_self(); assert(colo != NULL); +colo_buffer_init(); + /* TODO: COLO checkpointed restore loop */ +colo_buffer_destroy(); colo = NULL; restore_exit_colo(); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 10/17] COLO ctl: introduce is_slave() and is_master()
is_slave is to determine whether the QEMU instance is a slave (migration target) at runtime. is_master is to determine whether the QEMU instance is a master (migration starter) at runtime. These 2 APIs will be used later. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/migration-colo.c b/migration-colo.c index 802f8b0..2699e77 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -187,6 +187,12 @@ static const QEMUFileOps colo_read_ops = { /* save */ +static __attribute__((unused)) bool is_master(void) +{ +MigrationState *s = migrate_get_current(); +return (s->state == MIG_STATE_COLO); +} + static void *colo_thread(void *opaque) { MigrationState *s = opaque; @@ -275,6 +281,11 @@ void colo_init_checkpointer(MigrationState *s) static Coroutine *colo; +static __attribute__((unused)) bool is_slave(void) +{ +return colo != NULL; +} + /* * return: * 0: start a checkpoint -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 12/17] COLO ctl: add a RunState RUN_STATE_COLO
Guest will enter this state when paused to save/resore VM state under colo checkpoint. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- qapi-schema.json | 4 +++- vl.c | 8 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/qapi-schema.json b/qapi-schema.json index 807f5a2..b42171c 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -145,12 +145,14 @@ # @watchdog: the watchdog action is configured to pause and has been triggered # # @guest-panicked: guest has been panicked as a result of guest OS panic +# +# @colo: guest is paused to save/restore VM state under colo checkpoint ## { 'enum': 'RunState', 'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused', 'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm', 'running', 'save-vm', 'shutdown', 'suspended', 'watchdog', -'guest-panicked' ] } +'guest-panicked', 'colo' ] } ## # @StatusInfo: diff --git a/vl.c b/vl.c index 1a282d8..545155d 100644 --- a/vl.c +++ b/vl.c @@ -597,6 +597,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_INMIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_INMIGRATE, RUN_STATE_PAUSED }, +{ RUN_STATE_INMIGRATE, RUN_STATE_COLO }, { RUN_STATE_INTERNAL_ERROR, RUN_STATE_PAUSED }, { RUN_STATE_INTERNAL_ERROR, RUN_STATE_FINISH_MIGRATE }, @@ -606,6 +607,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_PAUSED, RUN_STATE_RUNNING }, { RUN_STATE_PAUSED, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_PAUSED, RUN_STATE_COLO}, { RUN_STATE_POSTMIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_POSTMIGRATE, RUN_STATE_FINISH_MIGRATE }, @@ -616,9 +618,12 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE }, +{ RUN_STATE_FINISH_MIGRATE, RUN_STATE_COLO}, { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING }, +{ RUN_STATE_COLO, RUN_STATE_RUNNING }, + { RUN_STATE_RUNNING, RUN_STATE_DEBUG }, { RUN_STATE_RUNNING, RUN_STATE_INTERNAL_ERROR }, 
{ RUN_STATE_RUNNING, RUN_STATE_IO_ERROR }, @@ -629,6 +634,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_RUNNING, RUN_STATE_SHUTDOWN }, { RUN_STATE_RUNNING, RUN_STATE_WATCHDOG }, { RUN_STATE_RUNNING, RUN_STATE_GUEST_PANICKED }, +{ RUN_STATE_RUNNING, RUN_STATE_COLO}, { RUN_STATE_SAVE_VM, RUN_STATE_RUNNING }, @@ -639,9 +645,11 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_RUNNING, RUN_STATE_SUSPENDED }, { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING }, { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_SUSPENDED, RUN_STATE_COLO}, { RUN_STATE_WATCHDOG, RUN_STATE_RUNNING }, { RUN_STATE_WATCHDOG, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_WATCHDOG, RUN_STATE_COLO}, { RUN_STATE_GUEST_PANICKED, RUN_STATE_RUNNING }, { RUN_STATE_GUEST_PANICKED, RUN_STATE_FINISH_MIGRATE }, -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 16/17] COLO ram cache: implement colo ram cache on slaver
The ram cache was initially the same as PVM's memory. At checkpoint, we cache the dirty memory of PVM into ram cache (so that ram cache always the same as PVM's memory at every checkpoint), flush cached memory to SVM after we received all PVM dirty memory(only needed to flush memory that was both dirty on PVM and SVM since last checkpoint). Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- arch_init.c| 154 - include/exec/cpu-all.h | 1 + include/migration/migration-colo.h | 3 + migration-colo.c | 4 + 4 files changed, 159 insertions(+), 3 deletions(-) diff --git a/arch_init.c b/arch_init.c index c84e6c8..009bcb5 100644 --- a/arch_init.c +++ b/arch_init.c @@ -1013,6 +1013,7 @@ static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) return 0; } +static void *memory_region_get_ram_cache_ptr(MemoryRegion *mr, RAMBlock *block); static inline void *host_from_stream_offset(QEMUFile *f, ram_addr_t offset, int flags) @@ -1027,7 +1028,12 @@ static inline void *host_from_stream_offset(QEMUFile *f, return NULL; } -return memory_region_get_ram_ptr(block-mr) + offset; +if (is_slave()) { +migration_bitmap_set_dirty(block-mr-ram_addr + offset); +return memory_region_get_ram_cache_ptr(block-mr, block) + offset; +} else { +return memory_region_get_ram_ptr(block-mr) + offset; +} } len = qemu_get_byte(f); @@ -1035,8 +1041,15 @@ static inline void *host_from_stream_offset(QEMUFile *f, id[len] = 0; QTAILQ_FOREACH(block, ram_list.blocks, next) { -if (!strncmp(id, block-idstr, sizeof(id))) -return memory_region_get_ram_ptr(block-mr) + offset; +if (!strncmp(id, block-idstr, sizeof(id))) { +if (is_slave()) { +migration_bitmap_set_dirty(block-mr-ram_addr + offset); +return memory_region_get_ram_cache_ptr(block-mr, block) + + offset; +} else { +return memory_region_get_ram_ptr(block-mr) + offset; +} +} } error_report(Can't find block %s!, id); @@ -1054,11 +1067,13 @@ void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) } } +static void ram_flush_cache(void); static 
int ram_load(QEMUFile *f, void *opaque, int version_id) { ram_addr_t addr; int flags, ret = 0; static uint64_t seq_iter; +bool need_flush = false; seq_iter++; @@ -1121,6 +1136,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) break; } +need_flush = true; ch = qemu_get_byte(f); ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); } else if (flags RAM_SAVE_FLAG_PAGE) { @@ -1133,6 +1149,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) break; } +need_flush = true; qemu_get_buffer(f, host, TARGET_PAGE_SIZE); } else if (flags RAM_SAVE_FLAG_XBZRLE) { void *host = host_from_stream_offset(f, addr, flags); @@ -1148,6 +1165,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) ret = -EINVAL; break; } +need_flush = true; } else if (flags RAM_SAVE_FLAG_HOOK) { ram_control_load_hook(f, flags); } else if (flags RAM_SAVE_FLAG_EOS) { @@ -1161,11 +1179,141 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) ret = qemu_file_get_error(f); } +if (!ret is_slave() need_flush) { +ram_flush_cache(); +} + DPRINTF(Completed load of VM with exit code %d seq iteration % PRIu64 \n, ret, seq_iter); return ret; } +/* + * colo cache: this is for secondary VM, we cache the whole + * memory of the secondary VM. 
+ */ +void create_and_init_ram_cache(void) +{ +/* + * called after first migration + */ +RAMBlock *block; +int64_t ram_cache_pages = last_ram_offset() TARGET_PAGE_BITS; + +QTAILQ_FOREACH(block, ram_list.blocks, next) { +block-host_cache = g_malloc(block-length); +memcpy(block-host_cache, block-host, block-length); +} + +migration_bitmap = bitmap_new(ram_cache_pages); +migration_dirty_pages = 0; +memory_global_dirty_log_start(); +} + +void release_ram_cache(void) +{ +RAMBlock *block; + +if (migration_bitmap) { +memory_global_dirty_log_stop(); +g_free(migration_bitmap); +migration_bitmap = NULL; +} + +QTAILQ_FOREACH(block, ram_list.blocks, next) { +g_free(block-host_cache); +} +} + +static void *memory_region_get_ram_cache_ptr(MemoryRegion *mr, RAMBlock *block) +{ + if (mr-alias) { +
[RFC PATCH 15/17] COLO save: reuse migration bitmap under colo checkpoint
reuse migration bitmap under colo checkpoint, only send dirty pages per-checkpoint. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- arch_init.c| 20 +++- include/migration/migration-colo.h | 2 ++ migration-colo.c | 6 ++ stubs/migration-colo.c | 10 ++ 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/arch_init.c b/arch_init.c index 8ddaf35..c84e6c8 100644 --- a/arch_init.c +++ b/arch_init.c @@ -52,6 +52,7 @@ #include exec/ram_addr.h #include hw/acpi/acpi.h #include qemu/host-utils.h +#include migration/migration-colo.h #ifdef DEBUG_ARCH_INIT #define DPRINTF(fmt, ...) \ @@ -769,6 +770,15 @@ static int ram_save_setup(QEMUFile *f, void *opaque) RAMBlock *block; int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */ +/* + * migration has already setup the bitmap, reuse it. + */ +if (is_master()) { +qemu_mutex_lock_ramlist(); +reset_ram_globals(); +goto out_setup; +} + mig_throttle_on = false; dirty_rate_high_cnt = 0; bitmap_sync_count = 0; @@ -828,6 +838,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) migration_bitmap_sync(); qemu_mutex_unlock_iothread(); +out_setup: qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE); QTAILQ_FOREACH(block, ram_list.blocks, next) { @@ -937,7 +948,14 @@ static int ram_save_complete(QEMUFile *f, void *opaque) } ram_control_after_iterate(f, RAM_CONTROL_FINISH); -migration_end(); + +/* + * Since we need to reuse dirty bitmap in colo, + * don't cleanup the bitmap. 
+ */ +if (!migrate_use_colo() || migration_has_failed(migrate_get_current())) { +migration_end(); +} qemu_mutex_unlock_ramlist(); qemu_put_be64(f, RAM_SAVE_FLAG_EOS); diff --git a/include/migration/migration-colo.h b/include/migration/migration-colo.h index 861fa27..c286a60 100644 --- a/include/migration/migration-colo.h +++ b/include/migration/migration-colo.h @@ -21,10 +21,12 @@ bool colo_supported(void); /* save */ bool migrate_use_colo(void); void colo_init_checkpointer(MigrationState *s); +bool is_master(void); /* restore */ bool restore_use_colo(void); void restore_exit_colo(void); +bool is_slave(void); void colo_process_incoming_checkpoints(QEMUFile *f); diff --git a/migration-colo.c b/migration-colo.c index 8596845..13a6a57 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -222,8 +222,6 @@ static const QEMUFileOps colo_read_ops = { }; /* colo checkpoint control helper */ -static bool is_master(void); -static bool is_slave(void); static void ctl_error_handler(void *opaque, int err) { @@ -295,7 +293,7 @@ static int colo_ctl_get(QEMUFile *f, uint64_t require) /* save */ -static bool is_master(void) +bool is_master(void) { MigrationState *s = migrate_get_current(); return (s-state == MIG_STATE_COLO); @@ -499,7 +497,7 @@ void colo_init_checkpointer(MigrationState *s) static Coroutine *colo; -static bool is_slave(void) +bool is_slave(void) { return colo != NULL; } diff --git a/stubs/migration-colo.c b/stubs/migration-colo.c index 55f0d37..ef65be6 100644 --- a/stubs/migration-colo.c +++ b/stubs/migration-colo.c @@ -22,3 +22,13 @@ void colo_init_checkpointer(MigrationState *s) void colo_process_incoming_checkpoints(QEMUFile *f) { } + +bool is_master(void) +{ +return false; +} + +bool is_slave(void) +{ +return false; +} -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 03/17] COLO migration: add a migration capability 'colo'
Add a migration capability 'colo'. If this capability is on, The migration will never end, and the VM will be continuously checkpointed. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- include/qapi/qmp/qerror.h | 3 +++ migration.c | 6 ++ qapi-schema.json | 5 - 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/qapi/qmp/qerror.h b/include/qapi/qmp/qerror.h index 902d1a7..226b805 100644 --- a/include/qapi/qmp/qerror.h +++ b/include/qapi/qmp/qerror.h @@ -166,4 +166,7 @@ void qerror_report_err(Error *err); #define QERR_SOCKET_CREATE_FAILED \ ERROR_CLASS_GENERIC_ERROR, Failed to create socket +#define QERR_COLO_UNSUPPORTED \ +ERROR_CLASS_GENERIC_ERROR, COLO is not currently supported, please rerun configure with --enable-colo option in order to support COLO feature + #endif /* QERROR_H */ diff --git a/migration.c b/migration.c index 8d675b3..ca83310 100644 --- a/migration.c +++ b/migration.c @@ -25,6 +25,7 @@ #include qemu/thread.h #include qmp-commands.h #include trace.h +#include migration/migration-colo.h enum { MIG_STATE_ERROR = -1, @@ -277,6 +278,11 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, } for (cap = params; cap; cap = cap-next) { +if (cap-value-capability == MIGRATION_CAPABILITY_COLO +cap-value-state !colo_supported()) { +error_set(errp, QERR_COLO_UNSUPPORTED); +continue; +} s-enabled_capabilities[cap-value-capability] = cap-value-state; } } diff --git a/qapi-schema.json b/qapi-schema.json index b11aad2..807f5a2 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -491,10 +491,13 @@ # @auto-converge: If enabled, QEMU will automatically throttle down the guest # to speed up convergence of RAM migration. (since 1.6) # +# @colo: The migration will never end, and the VM will instead be continuously +#checkpointed. The feature is disabled by default. 
(since 2.1) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', - 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks'] } + 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks', 'colo'] } ## # @MigrationCapabilityStatus -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 14/17] COLO ctl: implement colo restore
implement colo restore Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 43 +++ 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/migration-colo.c b/migration-colo.c index 03ac157..8596845 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -535,8 +535,9 @@ void colo_process_incoming_checkpoints(QEMUFile *f) { int fd = qemu_get_fd(f); int dev_hotplug = qdev_hotplug; -QEMUFile *ctl = NULL; +QEMUFile *ctl = NULL, *fb = NULL; int ret; +uint64_t total_size; if (!restore_use_colo()) { return; @@ -560,7 +561,8 @@ void colo_process_incoming_checkpoints(QEMUFile *f) goto out; } -/* TODO: in COLO mode, slave is runing, so start the vm */ +/* in COLO mode, slave is runing, so start the vm */ +vm_start(); while (true) { if (slave_wait_new_checkpoint(f)) { @@ -569,43 +571,68 @@ void colo_process_incoming_checkpoints(QEMUFile *f) /* start colo checkpoint */ -/* TODO: suspend guest */ +/* suspend guest */ +vm_stop_force_state(RUN_STATE_COLO); ret = colo_ctl_put(ctl, COLO_CHECKPOINT_SUSPENDED); if (ret) { goto out; } -/* TODO: open colo buffer for read */ +/* open colo buffer for read */ +fb = qemu_fopen_ops(colo_buffer, colo_read_ops); +if (!fb) { +error_report(can't open colo buffer\n); +goto out; +} ret = colo_ctl_get(f, COLO_CHECKPOINT_SEND); if (ret) { goto out; } -/* TODO: read migration data into colo buffer */ +/* read migration data into colo buffer */ + +/* read the vmstate total size first */ +ret = colo_ctl_get_value(f, total_size); +if (ret) { +goto out; +} +colo_buffer_extend(total_size); +qemu_get_buffer(f, colo_buffer.data, total_size); +colo_buffer.used = total_size; ret = colo_ctl_put(ctl, COLO_CHECKPOINT_RECEIVED); if (ret) { goto out; } -/* TODO: load vm state */ +/* load vm state */ +if (qemu_loadvm_state(fb) 0) { +error_report(COLO: loadvm failed\n); +goto out; +} ret = colo_ctl_put(ctl, COLO_CHECKPOINT_LOADED); if (ret) { goto out; } -/* TODO: resume guest */ +/* resume guest */ +vm_start(); -/* TODO: close colo 
buffer */ +qemu_fclose(fb); +fb = NULL; } out: colo_buffer_destroy(); colo = NULL; +if (fb) { +qemu_fclose(fb); +} + if (ctl) { qemu_fclose(ctl); } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 08/17] COLO: disable qdev hotplug
COLO do not support qdev hotplug migration, disable it. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 12 1 file changed, 12 insertions(+) diff --git a/migration-colo.c b/migration-colo.c index b90d9b6..f295e56 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -12,6 +12,7 @@ #include qemu/thread.h #include block/coroutine.h #include qemu/error-report.h +#include hw/qdev-core.h #include migration/migration-colo.h static QEMUBH *colo_bh; @@ -130,6 +131,9 @@ static const QEMUFileOps colo_read_ops = { static void *colo_thread(void *opaque) { MigrationState *s = opaque; +int dev_hotplug = qdev_hotplug; + +qdev_hotplug = 0; colo_buffer_init(); @@ -145,6 +149,8 @@ static void *colo_thread(void *opaque) qemu_bh_schedule(s-cleanup_bh); qemu_mutex_unlock_iothread(); +qdev_hotplug = dev_hotplug; + return NULL; } @@ -179,10 +185,14 @@ static Coroutine *colo; void colo_process_incoming_checkpoints(QEMUFile *f) { +int dev_hotplug = qdev_hotplug; + if (!restore_use_colo()) { return; } +qdev_hotplug = 0; + colo = qemu_coroutine_self(); assert(colo != NULL); @@ -194,5 +204,7 @@ void colo_process_incoming_checkpoints(QEMUFile *f) colo = NULL; restore_exit_colo(); +qdev_hotplug = dev_hotplug; + return; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 13/17] COLO ctl: implement colo save
implement colo save Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 44 ++-- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/migration-colo.c b/migration-colo.c index a708872..03ac157 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -14,6 +14,7 @@ #include qemu/error-report.h #include hw/qdev-core.h #include qemu/timer.h +#include sysemu/sysemu.h #include migration/migration-colo.h #include sys/ioctl.h @@ -106,12 +107,12 @@ static int colo_compare(void) return ioctl(comp_fd, COMP_IOCTWAIT, 250); } -static __attribute__((unused)) int colo_compare_flush(void) +static int colo_compare_flush(void) { return ioctl(comp_fd, COMP_IOCTFLUSH, 1); } -static __attribute__((unused)) int colo_compare_resume(void) +static int colo_compare_resume(void) { return ioctl(comp_fd, COMP_IOCTRESUME, 1); } @@ -315,30 +316,61 @@ static int do_colo_transaction(MigrationState *s, QEMUFile *control, goto out; } -/* TODO: suspend and save vm state to colo buffer */ +/* suspend and save vm state to colo buffer */ + +qemu_mutex_lock_iothread(); +vm_stop_force_state(RUN_STATE_COLO); +qemu_mutex_unlock_iothread(); +/* Disable block migration */ +s-params.blk = 0; +s-params.shared = 0; +qemu_savevm_state_begin(trans, s-params); +qemu_savevm_state_complete(trans); + +qemu_fflush(trans); ret = colo_ctl_put(s-file, COLO_CHECKPOINT_SEND); if (ret) { goto out; } -/* TODO: send vmstate to slave */ +/* send vmstate to slave */ + +/* we send the total size of the vmstate first */ +ret = colo_ctl_put(s-file, colo_buffer.used); +if (ret) { +goto out; +} + +qemu_put_buffer_async(s-file, colo_buffer.data, colo_buffer.used); +ret = qemu_file_get_error(s-file); +if (ret 0) { +goto out; +} +qemu_fflush(s-file); ret = colo_ctl_get(control, COLO_CHECKPOINT_RECEIVED); if (ret) { goto out; } -/* TODO: Flush network etc. */ +/* Flush network etc. 
*/ +colo_compare_flush(); ret = colo_ctl_get(control, COLO_CHECKPOINT_LOADED); if (ret) { goto out; } -/* TODO: resume master */ +colo_compare_resume(); +ret = 0; out: +/* resume master */ +qemu_mutex_lock_iothread(); +vm_start(); +qemu_mutex_unlock_iothread(); + return ret; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 17/17] HACK: trigger checkpoint every 500ms
Because COLO Agent is under development. We add this hack for test purpose. Trigger checkpoint every 500ms so that we can test the process of COLO save/restore. NOTE: This is only a hack, and will be removed at last. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 14 +- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/migration-colo.c b/migration-colo.c index 52156e7..4be037e 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -23,7 +23,7 @@ * this is large because COLO checkpoint will mostly depend on * COLO compare module. */ -#define CHKPOINT_TIMER 1 +#define CHKPOINT_TIMER 500 enum { COLO_READY = 0x46, @@ -79,11 +79,6 @@ static int comp_fd = -1; static int colo_compare_init(void) { -comp_fd = open(COMPARE_DEV, O_RDONLY); -if (comp_fd 0) { -return -1; -} - return 0; } @@ -104,17 +99,18 @@ static void colo_compare_destroy(void) */ static int colo_compare(void) { -return ioctl(comp_fd, COMP_IOCTWAIT, 250); +errno = ERESTART; +return 1; } static int colo_compare_flush(void) { -return ioctl(comp_fd, COMP_IOCTFLUSH, 1); +return 0; } static int colo_compare_resume(void) { -return ioctl(comp_fd, COMP_IOCTRESUME, 1); +return 0; } /* colo buffer */ -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 11/17] COLO ctl: implement colo checkpoint protocol
implement colo checkpoint protocol. Checkpoint synchronzing points. Primary Secondary NEW @ Suspend SUSPENDED @ SuspendSave state SEND@ Send state Receive state RECEIVED@ Flush network Load state LOADED @ Resume Resume Start Comparing NOTE: 1) '@' who sends the message 2) Every sync-point is synchronized by two sides with only one handshake(single direction) for low-latency. If more strict synchronization is required, a opposite direction sync-point should be added. 3) Since sync-points are single direction, the remote side may go forward a lot when this side just receives the sync-point. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 268 +-- 1 file changed, 262 insertions(+), 6 deletions(-) diff --git a/migration-colo.c b/migration-colo.c index 2699e77..a708872 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -24,6 +24,41 @@ */ #define CHKPOINT_TIMER 1 +enum { +COLO_READY = 0x46, + +/* + * Checkpoint synchronzing points. + * + * Primary Secondary + * NEW @ + * Suspend + * SUSPENDED @ + * SuspendSave state + * SEND@ + * Send state Receive state + * RECEIVED@ + * Flush network Load state + * LOADED @ + * Resume Resume + * + * Start Comparing + * NOTE: + * 1) '@' who sends the message + * 2) Every sync-point is synchronized by two sides with only + *one handshake(single direction) for low-latency. + *If more strict synchronization is required, a opposite direction + *sync-point should be added. + * 3) Since sync-points are single direction, the remote side may + *go forward a lot when this side just receives the sync-point. 
+ */ +COLO_CHECKPOINT_NEW, +COLO_CHECKPOINT_SUSPENDED, +COLO_CHECKPOINT_SEND, +COLO_CHECKPOINT_RECEIVED, +COLO_CHECKPOINT_LOADED, +}; + static QEMUBH *colo_bh; bool colo_supported(void) @@ -185,30 +220,161 @@ static const QEMUFileOps colo_read_ops = { .close = colo_close, }; +/* colo checkpoint control helper */ +static bool is_master(void); +static bool is_slave(void); + +static void ctl_error_handler(void *opaque, int err) +{ +if (is_slave()) { +/* TODO: determine whether we need to failover */ +/* FIXME: we will not failover currently, just kill slave */ +error_report(error: colo transmission failed!\n); +exit(1); +} else if (is_master()) { +/* Master still alive, do not failover */ +error_report(error: colo transmission failed!\n); +return; +} else { +error_report(COLO: Unexpected error happend!\n); +exit(EXIT_FAILURE); +} +} + +static int colo_ctl_put(QEMUFile *f, uint64_t request) +{ +int ret = 0; + +qemu_put_be64(f, request); +qemu_fflush(f); + +ret = qemu_file_get_error(f); +if (ret 0) { +ctl_error_handler(f, ret); +return 1; +} + +return ret; +} + +static int colo_ctl_get_value(QEMUFile *f, uint64_t *value) +{ +int ret = 0; +uint64_t temp; + +temp = qemu_get_be64(f); + +ret = qemu_file_get_error(f); +if (ret 0) { +ctl_error_handler(f, ret); +return 1; +} + +*value = temp; +return 0; +} + +static int colo_ctl_get(QEMUFile *f, uint64_t require) +{ +int ret; +uint64_t value; + +ret = colo_ctl_get_value(f, value); +if (ret) { +return ret; +} + +if (value != require) { +error_report(unexpected state received!\n); +exit(1); +} + +return ret; +} + /* save */ -static __attribute__((unused)) bool is_master(void) +static bool is_master(void) { MigrationState *s = migrate_get_current(); return (s-state == MIG_STATE_COLO); } +static int do_colo_transaction(MigrationState *s, QEMUFile *control, + QEMUFile *trans) +{ +int ret; + +ret = colo_ctl_put(s-file, COLO_CHECKPOINT_NEW); +if (ret) { +goto out; +} + +ret = colo_ctl_get(control, COLO_CHECKPOINT_SUSPENDED); +if 
(ret) { +goto out; +} + +/* TODO: suspend and save vm state to colo buffer */ + +ret = colo_ctl_put(s-file, COLO_CHECKPOINT_SEND); +if (ret) { +goto out; +} + +/*
[RFC PATCH 06/17] COLO restore: integrate COLO checkpointed restore into qemu restore
enter colo checkpointed restore loop after live migration. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- include/migration/migration-colo.h | 6 ++ migration-colo-comm.c | 10 ++ migration-colo.c | 22 ++ migration.c| 3 +++ stubs/migration-colo.c | 4 5 files changed, 45 insertions(+) diff --git a/include/migration/migration-colo.h b/include/migration/migration-colo.h index 24589c0..861fa27 100644 --- a/include/migration/migration-colo.h +++ b/include/migration/migration-colo.h @@ -22,4 +22,10 @@ bool colo_supported(void); bool migrate_use_colo(void); void colo_init_checkpointer(MigrationState *s); +/* restore */ +bool restore_use_colo(void); +void restore_exit_colo(void); + +void colo_process_incoming_checkpoints(QEMUFile *f); + #endif diff --git a/migration-colo-comm.c b/migration-colo-comm.c index 4504ceb..b12a57a 100644 --- a/migration-colo-comm.c +++ b/migration-colo-comm.c @@ -38,6 +38,16 @@ static void colo_info_save(QEMUFile *f, void *opaque) /* restore */ +bool restore_use_colo(void) +{ +return colo_requested; +} + +void restore_exit_colo(void) +{ +colo_requested = false; +} + static int colo_info_load(QEMUFile *f, void *opaque, int version_id) { int value = qemu_get_byte(f); diff --git a/migration-colo.c b/migration-colo.c index 0cef8bd..d566b9d 100644 --- a/migration-colo.c +++ b/migration-colo.c @@ -10,6 +10,7 @@ #include qemu/main-loop.h #include qemu/thread.h +#include block/coroutine.h #include migration/migration-colo.h static QEMUBH *colo_bh; @@ -62,3 +63,24 @@ void colo_init_checkpointer(MigrationState *s) colo_bh = qemu_bh_new(colo_start_checkpointer, s); qemu_bh_schedule(colo_bh); } + +/* restore */ + +static Coroutine *colo; + +void colo_process_incoming_checkpoints(QEMUFile *f) +{ +if (!restore_use_colo()) { +return; +} + +colo = qemu_coroutine_self(); +assert(colo != NULL); + +/* TODO: COLO checkpointed restore loop */ + +colo = NULL; +restore_exit_colo(); + +return; +} diff --git a/migration.c b/migration.c index b7f8e7e..190571d 100644 
--- a/migration.c +++ b/migration.c @@ -86,6 +86,9 @@ static void process_incoming_migration_co(void *opaque) int ret; ret = qemu_loadvm_state(f); +if (!ret) { +colo_process_incoming_checkpoints(f); +} qemu_fclose(f); free_xbzrle_decoded_buf(); if (ret 0) { diff --git a/stubs/migration-colo.c b/stubs/migration-colo.c index 9013c40..55f0d37 100644 --- a/stubs/migration-colo.c +++ b/stubs/migration-colo.c @@ -18,3 +18,7 @@ bool colo_supported(void) void colo_init_checkpointer(MigrationState *s) { } + +void colo_process_incoming_checkpoints(QEMUFile *f) +{ +} -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 04/17] COLO info: use colo info to tell migration target colo is enabled
migrate colo info to migration target to tell the target colo is enabled. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- Makefile.objs | 1 + include/migration/migration-colo.h | 3 ++ migration-colo-comm.c | 68 ++ vl.c | 4 +++ 4 files changed, 76 insertions(+) create mode 100644 migration-colo-comm.c diff --git a/Makefile.objs b/Makefile.objs index cab5824..1836a68 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -50,6 +50,7 @@ common-obj-$(CONFIG_POSIX) += os-posix.o common-obj-$(CONFIG_LINUX) += fsdev/ common-obj-y += migration.o migration-tcp.o +common-obj-y += migration-colo-comm.o common-obj-$(CONFIG_COLO) += migration-colo.o common-obj-y += vmstate.o common-obj-y += qemu-file.o diff --git a/include/migration/migration-colo.h b/include/migration/migration-colo.h index 35b384c..e3735d8 100644 --- a/include/migration/migration-colo.h +++ b/include/migration/migration-colo.h @@ -12,6 +12,9 @@ #define QEMU_MIGRATION_COLO_H #include qemu-common.h +#include migration/migration.h + +void colo_info_mig_init(void); bool colo_supported(void); diff --git a/migration-colo-comm.c b/migration-colo-comm.c new file mode 100644 index 000..ccbc246 --- /dev/null +++ b/migration-colo-comm.c @@ -0,0 +1,68 @@ +/* + * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO) + * (a.k.a. Fault Tolerance or Continuous Replication) + * + * Copyright (C) 2014 FUJITSU LIMITED + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + * + */ + +#include migration/migration-colo.h + +#define DEBUG_COLO + +#ifdef DEBUG_COLO +#define DPRINTF(fmt, ...) \ +do { fprintf(stdout, COLO: fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) 
\ +do { } while (0) +#endif + +static bool colo_requested; + +/* save */ + +static bool migrate_use_colo(void) +{ +MigrationState *s = migrate_get_current(); +return s-enabled_capabilities[MIGRATION_CAPABILITY_COLO]; +} + +static void colo_info_save(QEMUFile *f, void *opaque) +{ +qemu_put_byte(f, migrate_use_colo()); +} + +/* restore */ + +static int colo_info_load(QEMUFile *f, void *opaque, int version_id) +{ +int value = qemu_get_byte(f); + +if (value !colo_supported()) { +fprintf(stderr, COLO is not supported\n); +return -EINVAL; +} + +if (value !colo_requested) { +DPRINTF(COLO requested!\n); +} + +colo_requested = value; + +return 0; +} + +static SaveVMHandlers savevm_colo_info_handlers = { +.save_state = colo_info_save, +.load_state = colo_info_load, +}; + +void colo_info_mig_init(void) +{ +register_savevm_live(NULL, colo info, -1, 1, + savevm_colo_info_handlers, NULL); +} diff --git a/vl.c b/vl.c index fe451aa..1a282d8 100644 --- a/vl.c +++ b/vl.c @@ -89,6 +89,7 @@ int main(int argc, char **argv) #include sysemu/dma.h #include audio/audio.h #include migration/migration.h +#include migration/migration-colo.h #include sysemu/kvm.h #include qapi/qmp/qjson.h #include qemu/option.h @@ -4339,6 +4340,9 @@ int main(int argc, char **argv, char **envp) blk_mig_init(); ram_mig_init(); +if (colo_supported()) { +colo_info_mig_init(); +} /* open the virtual block devices */ if (snapshot) -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [RFC PATCH 03/17] COLO migration: add a migration capability 'colo'
On 07/23/2014 08:25 AM, Yang Hongyang wrote: Add a migration capability 'colo'. If this capability is on, The migration will never end, and the VM will be continuously checkpointed. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- include/qapi/qmp/qerror.h | 3 +++ migration.c | 6 ++ qapi-schema.json | 5 - 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/qapi/qmp/qerror.h b/include/qapi/qmp/qerror.h index 902d1a7..226b805 100644 --- a/include/qapi/qmp/qerror.h +++ b/include/qapi/qmp/qerror.h @@ -166,4 +166,7 @@ void qerror_report_err(Error *err); #define QERR_SOCKET_CREATE_FAILED \ ERROR_CLASS_GENERIC_ERROR, Failed to create socket +#define QERR_COLO_UNSUPPORTED \ +ERROR_CLASS_GENERIC_ERROR, COLO is not currently supported, please rerun configure with --enable-colo option in order to support COLO feature Unless you plan on using this message in more than one place, we prefer that you don't add new #defines here. Instead, just use error_setg with the message inline. +++ b/qapi-schema.json @@ -491,10 +491,13 @@ # @auto-converge: If enabled, QEMU will automatically throttle down the guest # to speed up convergence of RAM migration. (since 1.6) # +# @colo: The migration will never end, and the VM will instead be continuously +#checkpointed. The feature is disabled by default. (since 2.1) You missed 2.1. This has to be since 2.2. -- Eric Blake eblake redhat com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
Re: [Qemu-devel] [RFC PATCH 00/17] COarse-grain LOck-stepping(COLO) Virtual Machines for Non-stop Service
On 07/23/2014 08:25 AM, Yang Hongyang wrote: Virtual machine (VM) replication is a well known technique for providing application-agnostic software-implemented hardware fault tolerance non-stop service. COLO is a high availability solution. Both primary VM (PVM) and secondary VM (SVM) run in parallel. They receive the same request from client, and generate response in parallel too. If the response packets from PVM and SVM are identical, they are released immediately. Otherwise, a VM checkpoint (on demand) is conducted. The idea is presented in Xen summit 2012, and 2013, and academia paper in SOCC 2013. It's also presented in KVM forum 2013: http://www.linux-kvm.org/wiki/images/1/1d/Kvm-forum-2013-COLO.pdf Please refer to above document for detailed information. Please also refer to previous posted RFC proposal: http://lists.nongnu.org/archive/html/qemu-devel/2014-06/msg05567.html The patchset is also hosted on github: https://github.com/macrosheep/qemu/tree/colo_v0.1 This patchset is RFC, implements the frame of colo, without failover and nic/disk replication. But it is ready for demo the COLO idea above QEMU-Kvm. Steps using this patchset to get an overview of COLO: 1. configure the source with --enable-colo option Code that has to be opt-in tends to bitrot, because people don't configure their build-bots to opt in. What sort of penalties does opting in cause to the code if colo is not used? I'd much rather make the default to compile colo unless configured --disable-colo. Are there any pre-req libraries required for it to work? That would be the only reason to make the default of on or off conditional, rather than defaulting to on. -- Eric Blake eblake redhat com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
Re: [Qemu-devel] [RFC PATCH 02/17] COLO: introduce an api colo_supported() to indicate COLO support
On 07/23/2014 08:25 AM, Yang Hongyang wrote: introduce an api colo_supported() to indicate COLO support, returns true if colo supported(configured with --enable-colo). Space before () in English sentences: s/supported(configured/supported (configured/ As I mentioned in the cover letter, defaulting to off is probably a bad idea; I'd rather default to on or even make it unconditional if it doesn't negatively affect the code base when not used. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- Makefile.objs | 1 + include/migration/migration-colo.h | 18 ++ migration-colo.c | 16 stubs/Makefile.objs| 1 + stubs/migration-colo.c | 16 5 files changed, 52 insertions(+) create mode 100644 include/migration/migration-colo.h create mode 100644 migration-colo.c create mode 100644 stubs/migration-colo.c -- Eric Blake eblake redhat com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
Re: [Qemu-devel] [RFC PATCH 12/17] COLO ctl: add a RunState RUN_STATE_COLO
On 07/23/2014 08:25 AM, Yang Hongyang wrote: Guest will enter this state when paused to save/resore VM state s/resore/restore/ under colo checkpoint. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- qapi-schema.json | 4 +++- vl.c | 8 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/qapi-schema.json b/qapi-schema.json index 807f5a2..b42171c 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -145,12 +145,14 @@ # @watchdog: the watchdog action is configured to pause and has been triggered # # @guest-panicked: guest has been panicked as a result of guest OS panic +# +# @colo: guest is paused to save/restore VM state under colo checkpoint Missing a '(since 2.2)' designation. -- Eric Blake eblake redhat com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
[PATCH v6 0/5] Read guest last instruction from kvmppc_get_last_inst()
Read guest last instruction from kvmppc_get_last_inst() allowing the function to fail in order to emulate again. On bookehv architecture search for the physical address and kmap it, instead of using Load External PID (lwepx) instruction. This fixes an infinite loop caused by lwepx's data TLB miss exception handled in the host and the TODO for execute-but-not-read entries and TLB eviction. Mihai Caraman (5): KVM: PPC: e500mc: Revert add load inst fixup KVM: PPC: Book3e: Add TLBSEL/TSIZE defines for MAS0/1 KVM: PPC: Book3s: Remove kvmppc_read_inst() function KVM: PPC: Alow kvmppc_get_last_inst() to fail KVM: PPC: Bookehv: Get vcpu's last instruction for emulation arch/powerpc/include/asm/kvm_book3s.h| 26 --- arch/powerpc/include/asm/kvm_booke.h | 5 -- arch/powerpc/include/asm/kvm_ppc.h | 31 + arch/powerpc/include/asm/mmu-book3e.h| 9 ++- arch/powerpc/kvm/book3s.c| 17 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 17 ++--- arch/powerpc/kvm/book3s_paired_singles.c | 38 +++ arch/powerpc/kvm/book3s_pr.c | 114 --- arch/powerpc/kvm/booke.c | 47 + arch/powerpc/kvm/bookehv_interrupts.S| 55 ++- arch/powerpc/kvm/e500_mmu_host.c | 98 ++ arch/powerpc/kvm/emulate.c | 18 +++-- arch/powerpc/kvm/powerpc.c | 11 ++- 13 files changed, 314 insertions(+), 172 deletions(-) -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 1/5] KVM: PPC: e500mc: Revert add load inst fixup
The commit 1d628af7 add load inst fixup made an attempt to handle failures generated by reading the guest current instruction. The fixup code that was added works by chance, hiding the real issue. Load external pid (lwepx) instruction, used by KVM to read guest instructions, is executed in a substituted guest translation context (EPLC[EGS] = 1). In consequence lwepx's TLB error and data storage interrupts need to be handled by KVM, even though these interrupts are generated from host context (MSR[GS] = 0) where lwepx is executed. Currently, KVM hooks only interrupts generated from guest context (MSR[GS] = 1), doing minimal checks on the fast path to avoid host performance degradation. As a result, the host kernel handles lwepx faults searching the faulting guest data address (loaded in DEAR) in its own Logical Partition ID (LPID) 0 context. In case a host translation is found the execution returns to the lwepx instruction instead of the fixup, the host ending up in an infinite loop. Revert the commit add load inst fixup. lwepx issue will be addressed in a subsequent patch without needing fixup code. 
Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- v6-v2: - no change arch/powerpc/kvm/bookehv_interrupts.S | 26 +- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S index a1712b8..6ff4480 100644 --- a/arch/powerpc/kvm/bookehv_interrupts.S +++ b/arch/powerpc/kvm/bookehv_interrupts.S @@ -29,7 +29,6 @@ #include asm/asm-compat.h #include asm/asm-offsets.h #include asm/bitsperlong.h -#include asm/thread_info.h #ifdef CONFIG_64BIT #include asm/exception-64e.h @@ -164,32 +163,9 @@ PPC_STL r30, VCPU_GPR(R30)(r4) PPC_STL r31, VCPU_GPR(R31)(r4) mtspr SPRN_EPLC, r8 - - /* disable preemption, so we are sure we hit the fixup handler */ - CURRENT_THREAD_INFO(r8, r1) - li r7, 1 - stw r7, TI_PREEMPT(r8) - isync - - /* -* In case the read goes wrong, we catch it and write an invalid value -* in LAST_INST instead. -*/ -1: lwepx r9, 0, r5 -2: -.section .fixup, ax -3: li r9, KVM_INST_FETCH_FAILED - b 2b -.previous -.section __ex_table,a - PPC_LONG_ALIGN - PPC_LONG 1b,3b -.previous - + lwepx r9, 0, r5 mtspr SPRN_EPLC, r3 - li r7, 0 - stw r7, TI_PREEMPT(r8) stw r9, VCPU_LAST_INST(r4) .endif -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 2/5] KVM: PPC: Book3e: Add TLBSEL/TSIZE defines for MAS0/1
Add mising defines MAS0_GET_TLBSEL() and MAS1_GET_TSIZE() for Book3E. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- v6-v2: - no change arch/powerpc/include/asm/mmu-book3e.h | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index 8d24f78..cd4f04a 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -40,9 +40,11 @@ /* MAS registers bit definitions */ -#define MAS0_TLBSEL_MASK0x3000 -#define MAS0_TLBSEL_SHIFT 28 -#define MAS0_TLBSEL(x) (((x) MAS0_TLBSEL_SHIFT) MAS0_TLBSEL_MASK) +#define MAS0_TLBSEL_MASK 0x3000 +#define MAS0_TLBSEL_SHIFT 28 +#define MAS0_TLBSEL(x) (((x) MAS0_TLBSEL_SHIFT) MAS0_TLBSEL_MASK) +#define MAS0_GET_TLBSEL(mas0) (((mas0) MAS0_TLBSEL_MASK) \ + MAS0_TLBSEL_SHIFT) #define MAS0_ESEL_MASK 0x0FFF #define MAS0_ESEL_SHIFT16 #define MAS0_ESEL(x) (((x) MAS0_ESEL_SHIFT) MAS0_ESEL_MASK) @@ -60,6 +62,7 @@ #define MAS1_TSIZE_MASK0x0f80 #define MAS1_TSIZE_SHIFT 7 #define MAS1_TSIZE(x) (((x) MAS1_TSIZE_SHIFT) MAS1_TSIZE_MASK) +#define MAS1_GET_TSIZE(mas1) (((mas1) MAS1_TSIZE_MASK) MAS1_TSIZE_SHIFT) #define MAS2_EPN (~0xFFFUL) #define MAS2_X00x0040 -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 5/5] KVM: PPC: Bookehv: Get vcpu's last instruction for emulation
On book3e, KVM uses load external pid (lwepx) dedicated instruction to read guest last instruction on the exit path. lwepx exceptions (DTLB_MISS, DSI and LRAT), generated by loading a guest address, need to be handled by KVM. These exceptions are generated in a substituted guest translation context (EPLC[EGS] = 1) from host context (MSR[GS] = 0). Currently, KVM hooks only interrupts generated from guest context (MSR[GS] = 1), doing minimal checks on the fast path to avoid host performance degradation. lwepx exceptions originate from host state (MSR[GS] = 0) which implies additional checks in DO_KVM macro (beside the current MSR[GS] = 1) by looking at the Exception Syndrome Register (ESR[EPID]) and the External PID Load Context Register (EPLC[EGS]). Doing this on each Data TLB miss exception is obviously too intrusive for the host. Read guest last instruction from kvmppc_load_last_inst() by searching for the physical address and kmap it. This addresses the TODO for TLB eviction and execute-but-not-read entries, and allows us to get rid of lwepx until we are able to handle failures. A simple stress benchmark shows a 1% sys performance degradation compared with previous approach (lwepx without failure handling): time for i in `seq 1 1`; do /bin/echo /dev/null; done real0m 8.85s user0m 4.34s sys 0m 4.48s vs real0m 8.84s user0m 4.36s sys 0m 4.44s A solution to use lwepx and to handle its exceptions in KVM would be to temporarily hijack the interrupt vector from host. This imposes additional synchronizations for cores like FSL e6500 that share host IVOR registers between hardware threads. This optimized solution can be later developed on top of this patch. 
Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- v6: - no change v5: - return ENULATE_AGAIN in case of failure v4: - add switch and new function when getting last inst earlier - use enum instead of prev semnatic - get rid of mas0, optimize mas7_mas3 - give more context in visible messages - check storage attributes mismatch on MMUv2 - get rid of pfn_valid check v3: - reworked patch description - use unaltered kmap addr for kunmap - get last instruction before beeing preempted v2: - reworked patch description - used pr_* functions - addressed cosmetic feedback arch/powerpc/kvm/booke.c | 44 + arch/powerpc/kvm/bookehv_interrupts.S | 37 -- arch/powerpc/kvm/e500_mmu_host.c | 92 +++ 3 files changed, 145 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 34a42b9..843077b 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -869,6 +869,28 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, } } +static int kvmppc_resume_inst_load(struct kvm_run *run, struct kvm_vcpu *vcpu, + enum emulation_result emulated, u32 last_inst) +{ + switch (emulated) { + case EMULATE_AGAIN: + return RESUME_GUEST; + + case EMULATE_FAIL: + pr_debug(%s: load instruction from guest address %lx failed\n, + __func__, vcpu-arch.pc); + /* For debugging, encode the failing instruction and +* report it to userspace. 
*/ + run-hw.hardware_exit_reason = ~0ULL 32; + run-hw.hardware_exit_reason |= last_inst; + kvmppc_core_queue_program(vcpu, ESR_PIL); + return RESUME_HOST; + + default: + BUG(); + } +} + /** * kvmppc_handle_exit * @@ -880,6 +902,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, int r = RESUME_HOST; int s; int idx; + u32 last_inst = KVM_INST_FETCH_FAILED; + enum emulation_result emulated = EMULATE_DONE; /* update before a new last_exit_type is rewritten */ kvmppc_update_timing_stats(vcpu); @@ -887,6 +911,20 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, /* restart interrupts if they were meant for the host */ kvmppc_restart_interrupt(vcpu, exit_nr); + /* +* get last instruction before beeing preempted +* TODO: for e6500 check also BOOKE_INTERRUPT_LRAT_ERROR ESR_DATA +*/ + switch (exit_nr) { + case BOOKE_INTERRUPT_DATA_STORAGE: + case BOOKE_INTERRUPT_DTLB_MISS: + case BOOKE_INTERRUPT_HV_PRIV: + emulated = kvmppc_get_last_inst(vcpu, false, last_inst); + break; + default: + break; + } + local_irq_enable(); trace_kvm_exit(exit_nr, vcpu); @@ -895,6 +933,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, run-exit_reason = KVM_EXIT_UNKNOWN; run-ready_for_interrupt_injection = 1; + if (emulated != EMULATE_DONE) { + r =
[PATCH v6 3/5] KVM: PPC: Book3s: Remove kvmppc_read_inst() function
In the context of replacing kvmppc_ld() function calls with a version of kvmppc_get_last_inst() which allow to fail, Alex Graf suggested this: If we get EMULATE_AGAIN, we just have to make sure we go back into the guest. No need to inject an ISI into the guest - it'll do that all by itself. With an error returning kvmppc_get_last_inst we can just use completely get rid of kvmppc_read_inst() and only use kvmppc_get_last_inst() instead. As a intermediate step get rid of kvmppc_read_inst() and only use kvmppc_ld() instead. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- v6: - add proper comments for VSX interrupt handling v5: - make paired single emulation the unusual v4: - new patch arch/powerpc/kvm/book3s_pr.c | 85 ++-- 1 file changed, 34 insertions(+), 51 deletions(-) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index e40765f..e76aec3 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -710,42 +710,6 @@ static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac) #endif } -static int kvmppc_read_inst(struct kvm_vcpu *vcpu) -{ - ulong srr0 = kvmppc_get_pc(vcpu); - u32 last_inst = kvmppc_get_last_inst(vcpu); - int ret; - - ret = kvmppc_ld(vcpu, srr0, sizeof(u32), last_inst, false); - if (ret == -ENOENT) { - ulong msr = kvmppc_get_msr(vcpu); - - msr = kvmppc_set_field(msr, 33, 33, 1); - msr = kvmppc_set_field(msr, 34, 36, 0); - msr = kvmppc_set_field(msr, 42, 47, 0); - kvmppc_set_msr_fast(vcpu, msr); - kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE); - return EMULATE_AGAIN; - } - - return EMULATE_DONE; -} - -static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr) -{ - - /* Need to do paired single emulation? 
*/ - if (!(vcpu-arch.hflags BOOK3S_HFLAG_PAIRED_SINGLE)) - return EMULATE_DONE; - - /* Read out the instruction */ - if (kvmppc_read_inst(vcpu) == EMULATE_DONE) - /* Need to emulate */ - return EMULATE_FAIL; - - return EMULATE_AGAIN; -} - /* Handle external providers (FPU, Altivec, VSX) */ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, ulong msr) @@ -1149,31 +1113,49 @@ program_interrupt: case BOOK3S_INTERRUPT_VSX: { int ext_msr = 0; + int emul; + ulong pc; + u32 last_inst; + + if (vcpu-arch.hflags BOOK3S_HFLAG_PAIRED_SINGLE) { + /* Do paired single instruction emulation */ + pc = kvmppc_get_pc(vcpu); + last_inst = kvmppc_get_last_inst(vcpu); + emul = kvmppc_ld(vcpu, pc, sizeof(u32), last_inst, +false); + if (emul == EMULATE_DONE) + goto program_interrupt; + else + r = RESUME_GUEST; - switch (exit_nr) { - case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP; break; - case BOOK3S_INTERRUPT_ALTIVEC:ext_msr = MSR_VEC; break; - case BOOK3S_INTERRUPT_VSX:ext_msr = MSR_VSX; break; + break; } - switch (kvmppc_check_ext(vcpu, exit_nr)) { - case EMULATE_DONE: - /* everything ok - let's enable the ext */ - r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr); + /* Enable external provider */ + switch (exit_nr) { + case BOOK3S_INTERRUPT_FP_UNAVAIL: + ext_msr = MSR_FP; break; - case EMULATE_FAIL: - /* we need to emulate this instruction */ - goto program_interrupt; + + case BOOK3S_INTERRUPT_ALTIVEC: + ext_msr = MSR_VEC; break; - default: - /* nothing to worry about - go again */ + + case BOOK3S_INTERRUPT_VSX: + ext_msr = MSR_VSX; break; } + + r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr); break; } case BOOK3S_INTERRUPT_ALIGNMENT: - if (kvmppc_read_inst(vcpu) == EMULATE_DONE) { - u32 last_inst = kvmppc_get_last_inst(vcpu); + { + ulong pc = kvmppc_get_pc(vcpu); + u32 last_inst = kvmppc_get_last_inst(vcpu); + int emul = kvmppc_ld(vcpu, pc, sizeof(u32), last_inst, false); + + if (emul == EMULATE_DONE) { u32 dsisr;
[PATCH v6 4/5] KVM: PPC: Allow kvmppc_get_last_inst() to fail
On book3e, guest last instruction is read on the exit path using load external pid (lwepx) dedicated instruction. This load operation may fail due to TLB eviction and execute-but-not-read entries. This patch lay down the path for an alternative solution to read the guest last instruction, by allowing kvmppc_get_lat_inst() function to fail. Architecture specific implmentations of kvmppc_load_last_inst() may read last guest instruction and instruct the emulation layer to re-execute the guest in case of failure. Make kvmppc_get_last_inst() definition common between architectures. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- v6: - rewrite kvmppc_get_last_inst() swap code to be understood at a glimpse :) - use inst in kvmppc_load_last_inst - these changes compile on book3s, please validate the functionality and do the necessary changes! v5: - don't swap when load fail - convert the return value space of kvmppc_ld() v4: - common declaration and enum for kvmppc_load_last_inst() - remove kvmppc_read_inst() in a preceding patch v3: - rework patch description - add common definition for kvmppc_get_last_inst() - check return values in book3s code v2: - integrated kvmppc_get_last_inst() in book3s code and checked build - addressed cosmetic feedback arch/powerpc/include/asm/kvm_book3s.h| 26 -- arch/powerpc/include/asm/kvm_booke.h | 5 arch/powerpc/include/asm/kvm_ppc.h | 31 ++ arch/powerpc/kvm/book3s.c| 17 arch/powerpc/kvm/book3s_64_mmu_hv.c | 17 arch/powerpc/kvm/book3s_paired_singles.c | 38 +-- arch/powerpc/kvm/book3s_pr.c | 45 +++- arch/powerpc/kvm/booke.c | 3 +++ arch/powerpc/kvm/e500_mmu_host.c | 6 + arch/powerpc/kvm/emulate.c | 18 - arch/powerpc/kvm/powerpc.c | 11 ++-- 11 files changed, 140 insertions(+), 77 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 20fb6f2..a86ca65 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -276,32 +276,6 @@ static 
inline bool kvmppc_need_byteswap(struct kvm_vcpu *vcpu) return (kvmppc_get_msr(vcpu) MSR_LE) != (MSR_KERNEL MSR_LE); } -static inline u32 kvmppc_get_last_inst_internal(struct kvm_vcpu *vcpu, ulong pc) -{ - /* Load the instruction manually if it failed to do so in the -* exit path */ - if (vcpu-arch.last_inst == KVM_INST_FETCH_FAILED) - kvmppc_ld(vcpu, pc, sizeof(u32), vcpu-arch.last_inst, false); - - return kvmppc_need_byteswap(vcpu) ? swab32(vcpu-arch.last_inst) : - vcpu-arch.last_inst; -} - -static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu) -{ - return kvmppc_get_last_inst_internal(vcpu, kvmppc_get_pc(vcpu)); -} - -/* - * Like kvmppc_get_last_inst(), but for fetching a sc instruction. - * Because the sc instruction sets SRR0 to point to the following - * instruction, we have to fetch from pc - 4. - */ -static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu) -{ - return kvmppc_get_last_inst_internal(vcpu, kvmppc_get_pc(vcpu) - 4); -} - static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) { return vcpu-arch.fault_dar; diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h index c7aed61..cbb1990 100644 --- a/arch/powerpc/include/asm/kvm_booke.h +++ b/arch/powerpc/include/asm/kvm_booke.h @@ -69,11 +69,6 @@ static inline bool kvmppc_need_byteswap(struct kvm_vcpu *vcpu) return false; } -static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu) -{ - return vcpu-arch.last_inst; -} - static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val) { vcpu-arch.ctr = val; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e2fd5a1..2da5f547 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -47,6 +47,11 @@ enum emulation_result { EMULATE_EXIT_USER,/* emulation requires exit to user-space */ }; +enum instruction_type { + INST_GENERIC, + INST_SC,/* system call */ +}; + extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct 
kvm_vcpu *vcpu); extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern void kvmppc_handler_highmem(void); @@ -62,6 +67,9 @@ extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian); +extern int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, +enum instruction_type type, u32 *inst); + extern int kvmppc_emulate_instruction(struct kvm_run *run,
Re: [Qemu-devel] [RFC PATCH 07/17] COLO buffer: implement colo buffer as well as QEMUFileOps based on it
On 07/23/2014 08:25 AM, Yang Hongyang wrote: We need a buffer to store migration data. On save side: all saved data was write into colo buffer first, so that we can know s/was write/is written/ the total size of the migration data. this can also separate the data transmission from colo control data, we use colo control data over socket fd to synchronous both side's stat. On restore side: all migration data was read into colo buffer first, then load data from the buffer: If network error happens while data transmission, s/while/during/ the slaver can still functinal because the migration data are not yet s/slaver/slave/ s/functinal/function/ s/are/is/ loaded. Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com --- migration-colo.c | 112 +++ 1 file changed, 112 insertions(+) +/* colo buffer */ + +#define COLO_BUFFER_BASE_SIZE (1000*1000*4ULL) +#define COLO_BUFFER_MAX_SIZE (1000*1000*1000*10ULL) Spaces around binary operators. + +typedef struct colo_buffer { For consistency with the rest of the code base, name this ColoBuffer, not colo_buffer. +uint8_t *data; +uint64_t used; +uint64_t freed; +uint64_t size; +} colo_buffer_t; HACKING says to NOT name types with a trailing _t. Just name the typedef ColoBuffer. +static void colo_buffer_destroy(void) +{ +if (colo_buffer.data) { +g_free(colo_buffer.data); +colo_buffer.data = NULL; g_free(NULL) behaves sanely, just make these two lines unconditional. +static void colo_buffer_extend(uint64_t len) +{ +if (len colo_buffer.size - colo_buffer.used) { +len = len + colo_buffer.used - colo_buffer.size; +len = ROUND_UP(len, COLO_BUFFER_BASE_SIZE) + COLO_BUFFER_BASE_SIZE; + +colo_buffer.size += len; +if (colo_buffer.size COLO_BUFFER_MAX_SIZE) { +error_report(colo_buffer overflow!\n); No trailing \n in error_report(). -- Eric Blake eblake redhat com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
Re: [Qemu-devel] [RFC PATCH 00/17] COarse-grain LOck-stepping(COLO) Virtual Machines for Non-stop Service
On 07/23/2014 11:44 PM, Eric Blake wrote: On 07/23/2014 08:25 AM, Yang Hongyang wrote: Virtual machine (VM) replication is a well known technique for providing application-agnostic software-implemented hardware fault tolerance non-stop service. COLO is a high availability solution. Both primary VM (PVM) and secondary VM (SVM) run in parallel. They receive the same request from client, and generate response in parallel too. If the response packets from PVM and SVM are identical, they are released immediately. Otherwise, a VM checkpoint (on demand) is conducted. The idea is presented in Xen summit 2012, and 2013, and academia paper in SOCC 2013. It's also presented in KVM forum 2013: http://www.linux-kvm.org/wiki/images/1/1d/Kvm-forum-2013-COLO.pdf Please refer to above document for detailed information. Please also refer to previous posted RFC proposal: http://lists.nongnu.org/archive/html/qemu-devel/2014-06/msg05567.html The patchset is also hosted on github: https://github.com/macrosheep/qemu/tree/colo_v0.1 This patchset is RFC, implements the frame of colo, without failover and nic/disk replication. But it is ready for demo the COLO idea above QEMU-Kvm. Steps using this patchset to get an overview of COLO: 1. configure the source with --enable-colo option Code that has to be opt-in tends to bitrot, because people don't configure their build-bots to opt in. What sort of penalties does opting in cause to the code if colo is not used? I'd much rather make the default to compile colo unless configured --disable-colo. Are there any pre-req libraries required for it to work? That would be the only reason to make the default of on or off conditional, rather than defaulting to on. Thanks for all your comments on this patchset, will address them. For this one, it will not affect the rest of the code if COLO is compiled but not used, and it does not require pre-req libraries for now, so we can make COLO support default to on next time. -- Thanks, Yang. 
-- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 0/5] random,x86,kvm: Rework arch RNG seeds and get some from kvm
This introduces and uses a very simple synchronous mechanism to get /dev/urandom-style bits appropriate for initial KVM PV guest RNG seeding. It also re-works the way that architectural random data is fed into random.c's pools. I added a new arch hook called arch_get_rng_seed. The default implementation is more or less the same as the current code, except that random_get_entropy is now called unconditionally. x86 gets a custom arch_get_rng_seed. It will use KVM_GET_RNG_SEED if available, and, if it does anything, it will log the number of bits collected from each available architectural source. If more paravirt seed sources show up, it will be a natural place to add them. I sent the corresponding kvm-unit-tests and qemu changes separately. Changes from v4: - Got rid of the RDRAND behavior change. If this series is accepted, I may resend it separately, but I think it's an unrelated issue. - Fix up the changelog entries -- I misunderstood how the old code worked. - Avoid lots of failed attempts to use KVM_GET_RNG_SEED if it's not available. Changes from v3: - Other than KASLR, the guest pieces are completely rewritten. Patches 2-4 have essentially nothing in common with v2. Changes from v2: - Bisection fix (patch 2 had a misplaced brace). The final state is identical to that of v2. - Improve the 0/5 description a little bit. 
Changes from v1: - Split patches 2 and 3 - Log all arch sources in init_std_data - Fix the 32-bit kaslr build Andy Lutomirski (5): x86,kvm: Add MSR_KVM_GET_RNG_SEED and a matching feature bit random: Add and use arch_get_rng_seed x86,random: Add an x86 implementation of arch_get_rng_seed x86,random,kvm: Use KVM_GET_RNG_SEED in arch_get_rng_seed x86,kaslr: Use MSR_KVM_GET_RNG_SEED for KASLR if available Documentation/virtual/kvm/cpuid.txt | 3 ++ arch/x86/Kconfig | 4 ++ arch/x86/boot/compressed/aslr.c | 27 + arch/x86/include/asm/archrandom.h| 6 +++ arch/x86/include/asm/kvm_guest.h | 9 + arch/x86/include/asm/processor.h | 21 -- arch/x86/include/uapi/asm/kvm_para.h | 2 + arch/x86/kernel/Makefile | 2 + arch/x86/kernel/archrandom.c | 74 arch/x86/kernel/kvm.c| 10 + arch/x86/kvm/cpuid.c | 3 +- arch/x86/kvm/x86.c | 4 ++ drivers/char/random.c| 14 +-- include/linux/random.h | 40 +++ 14 files changed, 212 insertions(+), 7 deletions(-) create mode 100644 arch/x86/kernel/archrandom.c -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 2/5] random: Add and use arch_get_rng_seed
Currently, init_std_data contains its own logic for using arch random sources. This replaces that logic with a generic function arch_get_rng_seed that allows arch code to supply its own logic. The default implementation tries arch_get_random_seed_long and arch_get_random_long individually. The only functional change here is that random_get_entropy() is used unconditionally instead of being used only when the arch sources fail. This may add a tiny amount of security. Signed-off-by: Andy Lutomirski l...@amacapital.net --- drivers/char/random.c | 14 +++--- include/linux/random.h | 40 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 0a7ac0a..be7a94e 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1236,6 +1236,10 @@ void get_random_bytes_arch(void *buf, int nbytes) } EXPORT_SYMBOL(get_random_bytes_arch); +static void seed_entropy_store(void *ctx, u32 data) +{ + mix_pool_bytes((struct entropy_store *)ctx, data, sizeof(data), NULL); +} /* * init_std_data - initialize pool with system data @@ -1251,15 +1255,19 @@ static void init_std_data(struct entropy_store *r) int i; ktime_t now = ktime_get_real(); unsigned long rv; + char log_prefix[128]; r-last_pulled = jiffies; mix_pool_bytes(r, now, sizeof(now), NULL); for (i = r-poolinfo-poolbytes; i 0; i -= sizeof(rv)) { - if (!arch_get_random_seed_long(rv) - !arch_get_random_long(rv)) - rv = random_get_entropy(); + rv = random_get_entropy(); mix_pool_bytes(r, rv, sizeof(rv), NULL); } + + sprintf(log_prefix, random: seeded %s pool, r-name); + arch_get_rng_seed(r, seed_entropy_store, 8 * r-poolinfo-poolbytes, + log_prefix); + mix_pool_bytes(r, utsname(), sizeof(*(utsname())), NULL); } diff --git a/include/linux/random.h b/include/linux/random.h index 57fbbff..81a6145 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -106,6 +106,46 @@ static inline int arch_has_random_seed(void) } #endif +#ifndef __HAVE_ARCH_GET_RNG_SEED + +/** + * 
arch_get_rng_seed() - get architectural rng seed data + * @ctx: context for the seed function + * @seed: function to call for each u32 obtained + * @bits_per_source: number of bits from each source to try to use + * @log_prefix: beginning of log output (may be NULL) + * + * Synchronously load some architectural entropy or other best-effort + * random seed data. An arch-specific implementation should be no worse + * than this generic implementation. If the arch code does something + * interesting, it may log something of the form log_prefix with + * 8 bits of stuff. + * + * No arch-specific implementation should be any worse than the generic + * implementation. + */ +static inline void arch_get_rng_seed(void *ctx, +void (*seed)(void *ctx, u32 data), +int bits_per_source, +const char *log_prefix) +{ + int i; + + for (i = 0; i bits_per_source; i += 8 * sizeof(long)) { + unsigned long rv; + + if (arch_get_random_seed_long(rv) || + arch_get_random_long(rv)) { + seed(ctx, (u32)rv); +#if BITS_PER_LONG 32 + seed(ctx, (u32)(rv 32)); +#endif + } + } +} + +#endif /* __HAVE_ARCH_GET_RNG_SEED */ + /* Pseudo random number generator from numerical recipes. */ static inline u32 next_pseudo_random32(u32 seed) { -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 5/5] x86,kaslr: Use MSR_KVM_GET_RNG_SEED for KASLR if available
It's considerably better than any of the alternatives on KVM. Rather than reinventing all of the cpu feature query code, this fixes native_cpuid to work in PIC objects. I haven't combined it with boot/cpuflags.c's cpuid implementation: including asm/processor.h from boot/cpuflags.c results in a flood of unrelated errors, and fixing it might be messy. Reviewed-by: Kees Cook keesc...@chromium.org Signed-off-by: Andy Lutomirski l...@amacapital.net --- arch/x86/boot/compressed/aslr.c | 27 +++ arch/x86/include/asm/processor.h | 21 ++--- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index fc6091a..8583f0e 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -5,6 +5,8 @@ #include asm/archrandom.h #include asm/e820.h +#include uapi/asm/kvm_para.h + #include generated/compile.h #include linux/module.h #include linux/uts.h @@ -15,6 +17,22 @@ static const char build_str[] = UTS_RELEASE ( LINUX_COMPILE_BY @ LINUX_COMPILE_HOST ) ( LINUX_COMPILER ) UTS_VERSION; +static bool kvm_para_has_feature(unsigned int feature) +{ + u32 kvm_base; + u32 features; + + if (!has_cpuflag(X86_FEATURE_HYPERVISOR)) + return false; + + kvm_base = hypervisor_cpuid_base(KVMKVMKVM\0\0\0, KVM_CPUID_FEATURES); + if (!kvm_base) + return false; + + features = cpuid_eax(kvm_base | KVM_CPUID_FEATURES); + return features (1UL feature); +} + #define I8254_PORT_CONTROL 0x43 #define I8254_PORT_COUNTER00x40 #define I8254_CMD_READBACK 0xC0 @@ -81,6 +99,15 @@ static unsigned long get_random_long(void) } } + if (kvm_para_has_feature(KVM_FEATURE_GET_RNG_SEED)) { + u64 seed; + + debug_putstr( MSR_KVM_GET_RNG_SEED); + rdmsrl(MSR_KVM_GET_RNG_SEED, seed); + random ^= (unsigned long)seed; + use_i8254 = false; + } + if (has_cpuflag(X86_FEATURE_TSC)) { debug_putstr( RDTSC); rdtscll(raw); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a4ea023..6096f3c 100644 --- 
a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -189,10 +189,25 @@ static inline int have_cpuid_p(void) static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { - /* ecx is often an input as well as an output. */ - asm volatile(cpuid + /* +* This function can be used from the boot code, so it needs +* to avoid using EBX in constraints in PIC mode. +* +* ecx is often an input as well as an output. +*/ + asm volatile(.ifnc %%ebx,%1 ; .ifnc %%rbx,%1 \n\t +movl %%ebx,%1\n\t +.endif ; .endif \n\t +cpuid \n\t +.ifnc %%ebx,%1 ; .ifnc %%rbx,%1 \n\t +xchgl %%ebx,%1\n\t +.endif ; .endif : =a (*eax), - =b (*ebx), +#if defined(__i386__) defined(__PIC__) + =r (*ebx), /* gcc won't let us use ebx */ +#else + =b (*ebx), /* ebx is okay */ +#endif =c (*ecx), =d (*edx) : 0 (*eax), 2 (*ecx) -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 3/5] x86,random: Add an x86 implementation of arch_get_rng_seed
This does the same thing as the generic implementation, except that it logs how many bits of each type it collected. I want to know whether the initial seeding is working and, if so, whether the RNG is fast enough. (I know that hpa assures me that the hardware RNG is more than fast enough, but I'd still like a direct way to verify this.) Arguably, arch_get_random_seed could be removed now: I'm having some trouble imagining a sensible non-architecture-specific use of it that wouldn't be better served by arch_get_rng_seed. Signed-off-by: Andy Lutomirski l...@amacapital.net --- arch/x86/include/asm/archrandom.h | 6 + arch/x86/kernel/Makefile | 2 ++ arch/x86/kernel/archrandom.c | 51 +++ 3 files changed, 59 insertions(+) create mode 100644 arch/x86/kernel/archrandom.c diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index 69f1366..88f9c5a 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -117,6 +117,12 @@ GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4); #define arch_has_random() static_cpu_has(X86_FEATURE_RDRAND) #define arch_has_random_seed() static_cpu_has(X86_FEATURE_RDSEED) +#define __HAVE_ARCH_GET_RNG_SEED +extern void arch_get_rng_seed(void *ctx, + void (*seed)(void *ctx, u32 data), + int bits_per_source, + const char *log_prefix); + #else static inline int rdrand_long(unsigned long *v) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 047f9ff..0718bae 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -92,6 +92,8 @@ obj-$(CONFIG_PARAVIRT)+= paravirt.o paravirt_patch_$(BITS).o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o +obj-$(CONFIG_ARCH_RANDOM) += archrandom.o + obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o diff --git a/arch/x86/kernel/archrandom.c b/arch/x86/kernel/archrandom.c new file mode 100644 index 000..47d13b0 --- 
/dev/null +++ b/arch/x86/kernel/archrandom.c @@ -0,0 +1,51 @@ +/* + * This file is part of the Linux kernel. + * + * Copyright (c) 2014 Andy Lutomirski + * Authors: Andy Lutomirski l...@amacapital.net + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include asm/archrandom.h + +void arch_get_rng_seed(void *ctx, + void (*seed)(void *ctx, u32 data), + int bits_per_source, + const char *log_prefix) +{ + int i; + int rdseed_bits = 0, rdrand_bits = 0; + char buf[128] = ; + char *msgptr = buf; + + for (i = 0; i bits_per_source; i += 8 * sizeof(long)) { + unsigned long rv; + + if (arch_get_random_seed_long(rv)) + rdseed_bits += 8 * sizeof(rv); + else if (arch_get_random_long(rv)) + rdrand_bits += 8 * sizeof(rv); + else + continue; /* Don't waste time mixing. */ + + seed(ctx, (u32)rv); +#if BITS_PER_LONG 32 + seed(ctx, (u32)(rv 32)); +#endif + } + + if (rdseed_bits) + msgptr += sprintf(msgptr, , %d bits from RDSEED, rdseed_bits); + if (rdrand_bits) + msgptr += sprintf(msgptr, , %d bits from RDRAND, rdrand_bits); + if (buf[0]) + pr_info(%s with %s\n, log_prefix, buf + 2); +} -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 4/5] x86,random,kvm: Use KVM_GET_RNG_SEED in arch_get_rng_seed
This is a straightforward implementation: for each bit of internal RNG state, request one bit from KVM_GET_RNG_SEED. This is done even if RDSEED/RDRAND worked, since KVM_GET_RNG_SEED is likely to provide cryptographically secure output even if the CPU's RNG is weak or compromised. Signed-off-by: Andy Lutomirski l...@amacapital.net --- arch/x86/Kconfig | 4 arch/x86/include/asm/kvm_guest.h | 9 + arch/x86/kernel/archrandom.c | 25 - arch/x86/kernel/kvm.c| 10 ++ 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a8f749e..adfa09c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -593,6 +593,7 @@ config KVM_GUEST bool KVM Guest support (including kvmclock) depends on PARAVIRT select PARAVIRT_CLOCK + select ARCH_RANDOM default y ---help--- This option enables various optimizations for running under the KVM @@ -1507,6 +1508,9 @@ config ARCH_RANDOM If supported, this is a high bandwidth, cryptographically secure hardware random number generator. + This also enables paravirt RNGs such as KVM's if the relevant + PV guest support is enabled. 
+ config X86_SMAP def_bool y prompt Supervisor Mode Access Prevention if EXPERT diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h index a92b176..8c4dbd5 100644 --- a/arch/x86/include/asm/kvm_guest.h +++ b/arch/x86/include/asm/kvm_guest.h @@ -3,4 +3,13 @@ int kvm_setup_vsyscall_timeinfo(void); +#if defined(CONFIG_KVM_GUEST) defined(CONFIG_ARCH_RANDOM) +extern bool kvm_get_rng_seed(u64 *rv); +#else +static inline bool kvm_get_rng_seed(u64 *rv) +{ + return false; +} +#endif + #endif /* _ASM_X86_KVM_GUEST_H */ diff --git a/arch/x86/kernel/archrandom.c b/arch/x86/kernel/archrandom.c index 47d13b0..8c8d021 100644 --- a/arch/x86/kernel/archrandom.c +++ b/arch/x86/kernel/archrandom.c @@ -15,6 +15,7 @@ */ #include asm/archrandom.h +#include asm/kvm_guest.h void arch_get_rng_seed(void *ctx, void (*seed)(void *ctx, u32 data), @@ -22,7 +23,7 @@ void arch_get_rng_seed(void *ctx, const char *log_prefix) { int i; - int rdseed_bits = 0, rdrand_bits = 0; + int rdseed_bits = 0, rdrand_bits = 0, kvm_bits = 0; char buf[128] = ; char *msgptr = buf; @@ -42,10 +43,32 @@ void arch_get_rng_seed(void *ctx, #endif } + /* +* Use KVM_GET_RNG_SEED regardless of whether the CPU RNG +* worked, since it incorporates entropy unavailable to the CPU, +* and we shouldn't trust the hardware RNG more than we need to. +* We request enough bits for the entire internal RNG state, +* because there's no good reason not to. +*/ + for (i = 0; i bits_per_source; i += 64) { + u64 rv; + + if (kvm_get_rng_seed(rv)) { + seed(ctx, (u32)rv); + seed(ctx, (u32)(rv 32)); + kvm_bits += 8 * sizeof(rv); + } else { + break; /* If it fails once, it will keep failing. 
*/ + } + } + if (rdseed_bits) msgptr += sprintf(msgptr, , %d bits from RDSEED, rdseed_bits); if (rdrand_bits) msgptr += sprintf(msgptr, , %d bits from RDRAND, rdrand_bits); + if (kvm_bits) + msgptr += sprintf(msgptr, , %d bits from KVM_GET_RNG_BITS, + kvm_bits); if (buf[0]) pr_info(%s with %s\n, log_prefix, buf + 2); } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 3dd8e2c..bd8783a 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -416,6 +416,16 @@ void kvm_disable_steal_time(void) wrmsr(MSR_KVM_STEAL_TIME, 0, 0); } +bool kvm_get_rng_seed(u64 *v) +{ + /* +* Allow migration from a hypervisor with the GET_RNG_SEED +* feature to a hypervisor without it. +*/ + return (kvm_para_has_feature(KVM_FEATURE_GET_RNG_SEED) + rdmsrl_safe(MSR_KVM_GET_RNG_SEED, v) == 0); +} + #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 1/5] x86,kvm: Add MSR_KVM_GET_RNG_SEED and a matching feature bit
This adds a simple interface to allow a guest to request 64 bits of host nonblocking entropy. This is independent of virtio-rng for a couple of reasons: - It's intended to be usable during early boot, when a trivial synchronous interface is needed. - virtio-rng gives blocking entropy, and making guest boot wait for the host's /dev/random will cause problems. MSR_KVM_GET_RNG_SEED is intended to provide 64 bits of best-effort cryptographically secure data for use as a seed. It provides no guarantee that the result contains any actual entropy. Signed-off-by: Andy Lutomirski l...@amacapital.net --- Documentation/virtual/kvm/cpuid.txt | 3 +++ arch/x86/include/uapi/asm/kvm_para.h | 2 ++ arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/x86.c | 4 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt index 3c65feb..0ab043b 100644 --- a/Documentation/virtual/kvm/cpuid.txt +++ b/Documentation/virtual/kvm/cpuid.txt @@ -54,6 +54,9 @@ KVM_FEATURE_PV_UNHALT || 7 || guest checks this feature bit || || before enabling paravirtualized || || spinlock support. -- +KVM_FEATURE_GET_RNG_SEED || 8 || host provides rng seed data via + || || MSR_KVM_GET_RNG_SEED. +-- KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||24 || host will warn if no guest-side || || per-cpu warps are expected in || || kvmclock. diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 94dc8ca..e2eaf93 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -24,6 +24,7 @@ #define KVM_FEATURE_STEAL_TIME 5 #define KVM_FEATURE_PV_EOI 6 #define KVM_FEATURE_PV_UNHALT 7 +#define KVM_FEATURE_GET_RNG_SEED 8 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. 
@@ -40,6 +41,7 @@ #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 #define MSR_KVM_STEAL_TIME 0x4b564d03 #define MSR_KVM_PV_EOI_EN 0x4b564d04 +#define MSR_KVM_GET_RNG_SEED 0x4b564d05 struct kvm_steal_time { __u64 steal; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 38a0afe..40d6763 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -479,7 +479,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 KVM_FEATURE_ASYNC_PF) | (1 KVM_FEATURE_PV_EOI) | (1 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | -(1 KVM_FEATURE_PV_UNHALT); +(1 KVM_FEATURE_PV_UNHALT) | +(1 KVM_FEATURE_GET_RNG_SEED); if (sched_info_on()) entry-eax |= (1 KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f644933..4e81853 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -48,6 +48,7 @@ #include linux/pci.h #include linux/timekeeper_internal.h #include linux/pvclock_gtod.h +#include linux/random.h #include trace/events/kvm.h #define CREATE_TRACE_POINTS @@ -2480,6 +2481,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_PV_EOI_EN: data = vcpu-arch.pv_eoi.msr_val; break; + case MSR_KVM_GET_RNG_SEED: + get_random_bytes(data, sizeof(data)); + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] vhost: Add polling mode
On 07/23/2014 04:48 PM, Abel Gordon wrote: On Wed, Jul 23, 2014 at 11:42 AM, Jason Wang jasow...@redhat.com wrote: On 07/23/2014 04:12 PM, Razya Ladelsky wrote: Jason Wang jasow...@redhat.com wrote on 23/07/2014 08:26:36 AM: From: Jason Wang jasow...@redhat.com To: Razya Ladelsky/Haifa/IBM@IBMIL, kvm@vger.kernel.org, Michael S. Tsirkin m...@redhat.com, Cc: abel.gor...@gmail.com, Joel Nider/Haifa/IBM@IBMIL, Yossi Kuperman1/Haifa/IBM@IBMIL, Eran Raichstein/Haifa/IBM@IBMIL, Alex Glikson/Haifa/IBM@IBMIL Date: 23/07/2014 08:26 AM Subject: Re: [PATCH] vhost: Add polling mode On 07/21/2014 09:23 PM, Razya Ladelsky wrote: Hello All, When vhost is waiting for buffers from the guest driver (e.g., more packets to send in vhost-net's transmit queue), it normally goes to sleep and waits for the guest to kick it. This kick involves a PIO in the guest, and therefore an exit (and possibly userspace involvement in translating this PIO exit into a file descriptor event), all of which hurts performance. If the system is under-utilized (has cpu time to spare), vhost can continuously poll the virtqueues for new buffers, and avoid asking the guest to kick us. This patch adds an optional polling mode to vhost, that can be enabled via a kernel module parameter, poll_start_rate. When polling is active for a virtqueue, the guest is asked to disable notification (kicks), and the worker thread continuously checks for new buffers. When it does discover new buffers, it simulates a kick by invoking the underlying backend driver (such as vhost-net), which thinks it got a real kick from the guest, and acts accordingly. If the underlying driver asks not to be kicked, we disable polling on this virtqueue. We start polling on a virtqueue when we notice it has work to do. Polling on this virtqueue is later disabled after 3 seconds of polling turning up no new work, as in this case we are better off returning to the exit-based notification mechanism. 
The default timeout of 3 seconds can be changed with the poll_stop_idle kernel module parameter. This polling approach makes lot of sense for new HW with posted-interrupts for which we have exitless host-to-guest notifications. But even with support for posted interrupts, guest-to-host communication still causes exits. Polling adds the missing part. When systems are overloaded, there won?t be enough cpu time for the various vhost threads to poll their guests' devices. For these scenarios, we plan to add support for vhost threads that can be shared by multiple devices, even of multiple vms. Our ultimate goal is to implement the I/O acceleration features described in: KVM Forum 2013: Efficient and Scalable Virtio (by Abel Gordon) https://www.youtube.com/watch?v=9EyweibHfEs and https://www.mail-archive.com/kvm@vger.kernel.org/msg98179.html Comments are welcome, Thank you, Razya Thanks for the work. Do you have perf numbers for this? Hi Jason, Thanks for reviewing. I ran some experiments with TCP stream netperf and filebench (having 2 threads performing random reads) benchmarks on an IBM System x3650 M4. All runs loaded the guests in a way that they were (cpu) saturated. The system had two cores per guest, as to allow for both the vcpu and the vhost thread to run concurrently for maximum throughput (but I didn't pin the threads to specific cores) I get: Netperf, 1 vm: The polling patch improved throughput by ~33%. Number of exits/sec decreased 6x. The same improvement was shown when I tested with 3 vms running netperf. filebench, 1 vm: ops/sec improved by 13% with the polling patch. Number of exits was reduced by 31%. The same experiment with 3 vms running filebench showed similar numbers. Looks good, may worth to add the result in the commit log. And looks like the patch only poll for virtqueue. In the future, may worth to add callbacks for vhost_net to poll socket. Then it could be used with rx busy polling in host which may speedup the rx also. 
Did you mean polling the network device to avoid interrupts? Yes, recent linux host support rx busy polling which can reduce the interrupts. If vhost can utilize this, it can also reduce the latency caused by vhost thread wakeups. And I'm also working on virtio-net busy polling in guest, if vhost can poll socket, it can also help in guest rx polling. Nice :) Note that you may want to check if if the processor support posted interrupts. I guess that if CPU supports posted interrupts then benefits of polling in the front-end (from performance perspective) may not worth the cpu cycles wasted in the guest. Yes it's worth to check. But I think busy polling in guest may still help since it may
RE: [PATCH v5 4/5] KVM: PPC: Allow kvmppc_get_last_inst() to fail
-Original Message- From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc- ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Wednesday, July 23, 2014 12:21 AM To: Caraman Mihai Claudiu-B02008 Cc: kvm-ppc@vger.kernel.org; linuxppc-...@lists.ozlabs.org; k...@vger.kernel.org Subject: Re: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail On 21.07.14 11:59, mihai.cara...@freescale.com wrote: -Original Message- From: Linuxppc-dev [mailto:linuxppc-dev- bounces+mihai.caraman=freescale@lists.ozlabs.org] On Behalf Of mihai.cara...@freescale.com Sent: Friday, July 18, 2014 12:06 PM To: Alexander Graf; kvm-ppc@vger.kernel.org Cc: linuxppc-...@lists.ozlabs.org; k...@vger.kernel.org Subject: RE: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, July 17, 2014 5:21 PM To: Caraman Mihai Claudiu-B02008; kvm-ppc@vger.kernel.org Cc: k...@vger.kernel.org; linuxppc-...@lists.ozlabs.org Subject: Re: [PATCH v5 4/5] KVM: PPC: Alow kvmppc_get_last_inst() to fail On 17.07.14 13:22, Mihai Caraman wrote: On book3e, guest last instruction is read on the exit path using load external pid (lwepx) dedicated instruction. This load operation may fail due to TLB eviction and execute-but-not-read entries. This patch lay down the path for an alternative solution to read the guest last instruction, by allowing kvmppc_get_lat_inst() function to fail. Architecture specific implmentations of kvmppc_load_last_inst() may read last guest instruction and instruct the emulation layer to re- execute the guest in case of failure. Make kvmppc_get_last_inst() definition common between architectures. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- ... 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e2fd5a1..7f9c634 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -47,6 +47,11 @@ enum emulation_result { EMULATE_EXIT_USER,/* emulation requires exit to user- space */ }; +enum instruction_type { +INST_GENERIC, +INST_SC,/* system call */ +}; + extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern void kvmppc_handler_highmem(void); @@ -62,6 +67,9 @@ extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian); +extern int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, + enum instruction_type type, u32 *inst); + extern int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu); extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); @@ -234,6 +242,23 @@ struct kvmppc_ops { extern struct kvmppc_ops *kvmppc_hv_ops; extern struct kvmppc_ops *kvmppc_pr_ops; +static inline int kvmppc_get_last_inst(struct kvm_vcpu *vcpu, +enum instruction_type type, u32 *inst) +{ +int ret = EMULATE_DONE; + +/* Load the instruction manually if it failed to do so in the + * exit path */ +if (vcpu-arch.last_inst == KVM_INST_FETCH_FAILED) +ret = kvmppc_load_last_inst(vcpu, type, vcpu- arch.last_inst); + + +*inst = (ret == EMULATE_DONE kvmppc_need_byteswap(vcpu)) ? +swab32(vcpu-arch.last_inst) : vcpu-arch.last_inst; This makes even less sense than the previous version. Either you treat inst as definitely overwritten or as preserves previous data on failure. Both v4 and v5 versions treat inst as definitely overwritten. So either you unconditionally swap like you did before If we make abstraction of its symmetry, KVM_INST_FETCH_FAILED is operated in host endianness, so it doesn't need byte swap. 
I agree with your reasoning if last_inst is initialized and compared with data in guest endianess, which is not the case yet for KVM_INST_FETCH_FAILED. Alex, are you relying on the fact that KVM_INST_FETCH_FAILED value is symmetrical? With a non symmetrical value like 0xDEADBEEF, and considering a little- endian guest on a big-endian host, we need to fix kvm logic to initialize and compare last_inst with 0xEFBEADDE swaped value. Your suggestion to unconditionally swap makes sense only with the above fix, otherwise inst may end up with 0xEFBEADDE swaped value with is wrong. Only for *inst which we would treat as undefined after the function returned EMULATE_AGAIN. Right. With this do you acknowledge that v5
RE: [PATCH v5 4/5] KVM: PPC: Allow kvmppc_get_last_inst() to fail
Right. With this do you acknowledge that v5 (definitely overwritten approach) is ok? I think I'm starting to understand your logic of v5. You write fetch_failed into *inst unswapped if the fetch failed. v5 - don't swap when load fails :) I think that's ok, but I definitely do not like the code flow - it's too hard to understand at a glimpse. Just rewrite it to swab at local variable level, preferably with if()s and comments what this is about and have a single unconditional *inst = fetched_inst; at the end of the function. I will incorporate these change requests into v6. Thanks, -Mike -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 0/5] Read guest last instruction from kvmppc_get_last_inst()
Read guest last instruction from kvmppc_get_last_inst() allowing the function to fail in order to emulate again. On bookehv architecture search for the physical address and kmap it, instead of using Load External PID (lwepx) instruction. This fixes an infinite loop caused by lwepx's data TLB miss exception handled in the host and the TODO for execute-but-not-read entries and TLB eviction. Mihai Caraman (5): KVM: PPC: e500mc: Revert add load inst fixup KVM: PPC: Book3e: Add TLBSEL/TSIZE defines for MAS0/1 KVM: PPC: Book3s: Remove kvmppc_read_inst() function KVM: PPC: Alow kvmppc_get_last_inst() to fail KVM: PPC: Bookehv: Get vcpu's last instruction for emulation arch/powerpc/include/asm/kvm_book3s.h| 26 --- arch/powerpc/include/asm/kvm_booke.h | 5 -- arch/powerpc/include/asm/kvm_ppc.h | 31 + arch/powerpc/include/asm/mmu-book3e.h| 9 ++- arch/powerpc/kvm/book3s.c| 17 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 17 ++--- arch/powerpc/kvm/book3s_paired_singles.c | 38 +++ arch/powerpc/kvm/book3s_pr.c | 114 --- arch/powerpc/kvm/booke.c | 47 + arch/powerpc/kvm/bookehv_interrupts.S| 55 ++- arch/powerpc/kvm/e500_mmu_host.c | 98 ++ arch/powerpc/kvm/emulate.c | 18 +++-- arch/powerpc/kvm/powerpc.c | 11 ++- 13 files changed, 314 insertions(+), 172 deletions(-) -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 4/5] KVM: PPC: Allow kvmppc_get_last_inst() to fail
On book3e, guest last instruction is read on the exit path using load external pid (lwepx) dedicated instruction. This load operation may fail due to TLB eviction and execute-but-not-read entries. This patch lay down the path for an alternative solution to read the guest last instruction, by allowing kvmppc_get_lat_inst() function to fail. Architecture specific implmentations of kvmppc_load_last_inst() may read last guest instruction and instruct the emulation layer to re-execute the guest in case of failure. Make kvmppc_get_last_inst() definition common between architectures. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- v6: - rewrite kvmppc_get_last_inst() swap code to be understood at a glimpse :) - use inst in kvmppc_load_last_inst - these changes compile on book3s, please validate the functionality and do the necessary changes! v5: - don't swap when load fail - convert the return value space of kvmppc_ld() v4: - common declaration and enum for kvmppc_load_last_inst() - remove kvmppc_read_inst() in a preceding patch v3: - rework patch description - add common definition for kvmppc_get_last_inst() - check return values in book3s code v2: - integrated kvmppc_get_last_inst() in book3s code and checked build - addressed cosmetic feedback arch/powerpc/include/asm/kvm_book3s.h| 26 -- arch/powerpc/include/asm/kvm_booke.h | 5 arch/powerpc/include/asm/kvm_ppc.h | 31 ++ arch/powerpc/kvm/book3s.c| 17 arch/powerpc/kvm/book3s_64_mmu_hv.c | 17 arch/powerpc/kvm/book3s_paired_singles.c | 38 +-- arch/powerpc/kvm/book3s_pr.c | 45 +++- arch/powerpc/kvm/booke.c | 3 +++ arch/powerpc/kvm/e500_mmu_host.c | 6 + arch/powerpc/kvm/emulate.c | 18 - arch/powerpc/kvm/powerpc.c | 11 ++-- 11 files changed, 140 insertions(+), 77 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 20fb6f2..a86ca65 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -276,32 +276,6 @@ static 
inline bool kvmppc_need_byteswap(struct kvm_vcpu *vcpu) return (kvmppc_get_msr(vcpu) MSR_LE) != (MSR_KERNEL MSR_LE); } -static inline u32 kvmppc_get_last_inst_internal(struct kvm_vcpu *vcpu, ulong pc) -{ - /* Load the instruction manually if it failed to do so in the -* exit path */ - if (vcpu-arch.last_inst == KVM_INST_FETCH_FAILED) - kvmppc_ld(vcpu, pc, sizeof(u32), vcpu-arch.last_inst, false); - - return kvmppc_need_byteswap(vcpu) ? swab32(vcpu-arch.last_inst) : - vcpu-arch.last_inst; -} - -static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu) -{ - return kvmppc_get_last_inst_internal(vcpu, kvmppc_get_pc(vcpu)); -} - -/* - * Like kvmppc_get_last_inst(), but for fetching a sc instruction. - * Because the sc instruction sets SRR0 to point to the following - * instruction, we have to fetch from pc - 4. - */ -static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu) -{ - return kvmppc_get_last_inst_internal(vcpu, kvmppc_get_pc(vcpu) - 4); -} - static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) { return vcpu-arch.fault_dar; diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h index c7aed61..cbb1990 100644 --- a/arch/powerpc/include/asm/kvm_booke.h +++ b/arch/powerpc/include/asm/kvm_booke.h @@ -69,11 +69,6 @@ static inline bool kvmppc_need_byteswap(struct kvm_vcpu *vcpu) return false; } -static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu) -{ - return vcpu-arch.last_inst; -} - static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val) { vcpu-arch.ctr = val; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e2fd5a1..2da5f547 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -47,6 +47,11 @@ enum emulation_result { EMULATE_EXIT_USER,/* emulation requires exit to user-space */ }; +enum instruction_type { + INST_GENERIC, + INST_SC,/* system call */ +}; + extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct 
kvm_vcpu *vcpu); extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern void kvmppc_handler_highmem(void); @@ -62,6 +67,9 @@ extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian); +extern int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, +enum instruction_type type, u32 *inst); + extern int kvmppc_emulate_instruction(struct kvm_run *run,
[PATCH v6 5/5] KVM: PPC: Bookehv: Get vcpu's last instruction for emulation
On book3e, KVM uses load external pid (lwepx) dedicated instruction to read guest last instruction on the exit path. lwepx exceptions (DTLB_MISS, DSI and LRAT), generated by loading a guest address, need to be handled by KVM. These exceptions are generated in a substituted guest translation context (EPLC[EGS] = 1) from host context (MSR[GS] = 0). Currently, KVM hooks only interrupts generated from guest context (MSR[GS] = 1), doing minimal checks on the fast path to avoid host performance degradation. lwepx exceptions originate from host state (MSR[GS] = 0) which implies additional checks in DO_KVM macro (beside the current MSR[GS] = 1) by looking at the Exception Syndrome Register (ESR[EPID]) and the External PID Load Context Register (EPLC[EGS]). Doing this on each Data TLB miss exception is obviously too intrusive for the host. Read guest last instruction from kvmppc_load_last_inst() by searching for the physical address and kmapping it. This addresses the TODO for TLB eviction and execute-but-not-read entries, and allows us to get rid of lwepx until we are able to handle failures. A simple stress benchmark shows a 1% sys performance degradation compared with previous approach (lwepx without failure handling): time for i in `seq 1 1`; do /bin/echo /dev/null; done real0m 8.85s user0m 4.34s sys 0m 4.48s vs real0m 8.84s user0m 4.36s sys 0m 4.44s A solution to use lwepx and to handle its exceptions in KVM would be to temporarily hijack the interrupt vector from host. This imposes additional synchronizations for cores like FSL e6500 that share host IVOR registers between hardware threads. This optimized solution can be later developed on top of this patch.
Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- v6: - no change v5: - return EMULATE_AGAIN in case of failure v4: - add switch and new function when getting last inst earlier - use enum instead of prev semantic - get rid of mas0, optimize mas7_mas3 - give more context in visible messages - check storage attributes mismatch on MMUv2 - get rid of pfn_valid check v3: - reworked patch description - use unaltered kmap addr for kunmap - get last instruction before being preempted v2: - reworked patch description - used pr_* functions - addressed cosmetic feedback arch/powerpc/kvm/booke.c | 44 + arch/powerpc/kvm/bookehv_interrupts.S | 37 -- arch/powerpc/kvm/e500_mmu_host.c | 92 +++ 3 files changed, 145 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 34a42b9..843077b 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -869,6 +869,28 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, } } +static int kvmppc_resume_inst_load(struct kvm_run *run, struct kvm_vcpu *vcpu, + enum emulation_result emulated, u32 last_inst) +{ + switch (emulated) { + case EMULATE_AGAIN: + return RESUME_GUEST; + + case EMULATE_FAIL: + pr_debug(%s: load instruction from guest address %lx failed\n, + __func__, vcpu-arch.pc); + /* For debugging, encode the failing instruction and +* report it to userspace. 
*/ + run-hw.hardware_exit_reason = ~0ULL 32; + run-hw.hardware_exit_reason |= last_inst; + kvmppc_core_queue_program(vcpu, ESR_PIL); + return RESUME_HOST; + + default: + BUG(); + } +} + /** * kvmppc_handle_exit * @@ -880,6 +902,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, int r = RESUME_HOST; int s; int idx; + u32 last_inst = KVM_INST_FETCH_FAILED; + enum emulation_result emulated = EMULATE_DONE; /* update before a new last_exit_type is rewritten */ kvmppc_update_timing_stats(vcpu); @@ -887,6 +911,20 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, /* restart interrupts if they were meant for the host */ kvmppc_restart_interrupt(vcpu, exit_nr); + /* +* get last instruction before beeing preempted +* TODO: for e6500 check also BOOKE_INTERRUPT_LRAT_ERROR ESR_DATA +*/ + switch (exit_nr) { + case BOOKE_INTERRUPT_DATA_STORAGE: + case BOOKE_INTERRUPT_DTLB_MISS: + case BOOKE_INTERRUPT_HV_PRIV: + emulated = kvmppc_get_last_inst(vcpu, false, last_inst); + break; + default: + break; + } + local_irq_enable(); trace_kvm_exit(exit_nr, vcpu); @@ -895,6 +933,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, run-exit_reason = KVM_EXIT_UNKNOWN; run-ready_for_interrupt_injection = 1; + if (emulated != EMULATE_DONE) { + r =